Is there an easy way (google hasn't delivered...) to allocate per-block shared memory regions from a single input array such that there can be an overlap?
The simple example is string searching; Saw I want to dice up the input text, have each thread in each block search for a pattern starting from text[thread_id], but want the data assigned to each block to overlap by the pattern length so matching cases that fall across the border are still found.
I.e the total memory size allocated to shared memory on each block is
(blocksize+patternlength)*sizeof(char)
I'm probably missing something simple and am currently diving through the CUDA guide, but would appreciate some guidance.
UPDATE: I suspect some people have misunderstood my question (or I mis-explained it).
Say I have a dataset QWERTYUIOP, and i want to search for a 3 character match, and i dice up the dataset (arbitrarily) into 4's for each thread block; QWER TYUI OPxx
This is simple enough to accomplish but the algorithm fails if the 3 character match is actually looking for IOP.
In this case, what I want is for each block to have in shared memory:
QWERTY TYUIOP OPxxxx
ie each block gets assigned the blocksize+patternlength-1 characters so no memory border issues occur.
Hope that explains things better.
Since @jmilloy is being persistent... :P
//VERSION 1: Simple
__global__ void gpuSearchSimple(char *T, int lenT, char *P, int lenP, int *pFound)
{
int startIndex = blockDim.x*blockIdx.x + threadIdx.x;
int fMatch = 1;
for (int i=0; i < lenP; i++)
{
if (T[startIndex+i] != P[i]) fMatch = 0;
}
if (fMatch) atomicMin(pFound, startIndex);
}
//VERSION 2: Texture
__global__ void gpuSearchTexture(int lenT, int lenP, int *pFound)
{
int startIndex = blockDim.x*blockIdx.x + threadIdx.x;
int fMatch = 1;
for (int i=0; i < lenP; i++)
{
if (tex1Dfetch(texT,startIndex+i) != tex1Dfetch(texP,i)) fMatch = 0;
}
if (fMatch) atomicMin(pFound, startIndex);
}
//Version 3: Shared
__global__ void gpuSearchTexSha(int lenT, int lenP, int *pFound)
{
extern __shared__ char shaP[];
for (int i=0;threadIdx.x+i<lenP; i+=blockDim.x){
shaP[threadIdx.x+i]= tex1Dfetch(texP,threadIdx.x+i);
}
__syncthreads();
//At this point shaP is populated with the pattern
int startIndex = blockDim.x*blockIdx.x + threadIdx.x;
// only continue if an earlier instance hasn't already been found
int fMatch = 1;
for (int i=0; i < lenP; i++)
{
if (tex1Dfetch(texT,startIndex+i) != shaP[i]) fMatch = 0;
}
if (fMatch) atomicMin(pFound, startIndex);
}
What I would like to have done is to put the text into shared memory chunks, as described in the rest of the question, instead of keeping the text in texture memory for the later versions.
blocksize? and what istext(as intext[thread_id])? - jmilloy