I am trying to use texture objects in streams. The elements in the buffers are allocated and all set to value 1:
float* aa[nStreams];
for (int i = 0; i < nStreams; ++i) {
checkCuda(cudaMallocHost((void**)&aa[i], streamBytes)); // device
float* ar = aa[i];
for (int k = 0; k < streamSize; k++) {
ar[k]=1;
}
}
I then declare and create an array of textures in the loop over streams:
cudaTextureObject_t tex_ar[nStreams];
for (int i = 0; i < nStreams; ++i) {
int offset = i * streamSize;
// create texture object
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeLinear;
resDesc.res.linear.devPtr = aa[i];
resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
resDesc.res.linear.desc.x = 32; // bits per channel
resDesc.res.linear.sizeInBytes = streamBytes;
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
// create texture object: we only have to do this once!
cudaCreateTextureObject(&tex_ar[i], &resDesc, &texDesc, NULL);
checkCuda(cudaMemcpyAsync(&d_b[offset], &b[offset],
streamBytes, cudaMemcpyHostToDevice,
stream[i]));
kernelTex << <streamSize / blockSize, blockSize, 0, stream[i] >> >(tex_ar[i], d_b, offset);
checkCuda(cudaMemcpyAsync(&b[offset], &d_b[offset],
streamBytes, cudaMemcpyDeviceToHost,
stream[i]));
}
The kernel simply assign back the texture value:
__global__ void kernelTex(cudaTextureObject_t tex, float* b, int offset)
{
int i = offset + threadIdx.x + blockIdx.x*blockDim.x;
b[i] = tex1Dfetch<float>(tex, i);
}
So I expect, after kernels are finished, array b to have all its elements equal to 1. However other than the elements set by the first stream the rest of the elements are zero.