I have an array already initialized that I am trying to use in each thread of the kernel call (each thread uses a different part of the array so there are no dependencies). I create the array and save memory on the device using cudaMalloc and the array is copied from host to device using cudaMemcpy.
I pass the pointer returned by cudaMalloc to the kernel call to be used by each thread.
int SIZE = 100;
int* data = new int[SIZE];
int* d_data = 0;
cutilSafeCall( cudaMalloc(&d_data, SIZE * sizeof(int)) );
for (int i = 0; i < SIZE; i++)
data[i] = i;
cutilSafeCall( cudaMemcpy(d_data, data, SIZE * sizeof(int), cudaMemcpyHostToDevice) );
This code was taken from here. For the kernel call.
kernel<<<blocks, threads>>> (results, d_data);
I keep track of the results from each thread by using the struct Result. The next code works without errors.
__global__ void mainKernel(Result res[], int* data){
int x = data[0];
}
But when I assign that value to res:
__global__ void mainKernel(Result res[], int* data){
int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;
int x = data[0];
res[threadId].x = x;
}
An error is raised:
cudaSafeCall() Runtime API error in file , line 355 : an illegal memory access was encountered.
The same error appears with any operation involving the use of that pointer
__global__ void mainKernel(Result res[], int* data){
int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;
int x = data[0];
if (x > 10)
res[threadId].x = 5;
}
There is no problem with the definition of res. Assigning any other value to res[threadId].x does not give me any error.
This is the output of running cuda-memcheck:
========= Invalid __global__ read of size 4
========= at 0x00000150 in mainKernel(Result*, int*)
========= by thread (86,0,0) in block (49,0,0)
========= Address 0x13024c0000 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x150d6d]
========= Host Frame:./out [0x2cc4b]
========= Host Frame:./out [0x46c23]
========= Host Frame:./out [0x3e37]
========= Host Frame:./out [0x3ca1]
========= Host Frame:./out [0x3cd6]
========= Host Frame:./out [0x39e9]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21ec5]
========= Host Frame:./out [0x31b9]
EDIT:
This is an example of the full code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <iostream>
#include <assert.h>
typedef struct
{
int x,y,z;
} Result;
__global__ void mainKernel(Result pResults[], int* dataimage)
{
int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;
int xVal = dataimage[0];
if (xVal > 10)
pResults[threadId].x = 5;
}
int main (int argc, char** argv)
{
int NUM_THREADS = 5*5;
int SIZE = 100;
int* data = new int[SIZE];
int* d_data = 0;
cutilSafeCall( cudaMalloc(&d_data, SIZE * sizeof(int)) );
for (int i = 0; i < SIZE; i++)
data[i] = i;
cutilSafeCall( cudaMemcpy(d_data, data, SIZE * sizeof(int), cudaMemcpyHostToDevice) );
unsigned int GPU_ID = 1; // not actually :-)
// unsigned int GPU_ID = cutGetMaxGflopsDeviceId() ;
cudaSetDevice(GPU_ID);
Result * results_GPU = 0;
cutilSafeCall( cudaMalloc( &results_GPU, NUM_THREADS * sizeof(Result)) );
Result * results_CPU = 0;
cutilSafeCall( cudaMallocHost( &results_CPU, NUM_THREADS * sizeof(Result)) );
mainKernel<<<5,5>>> ( results_GPU, d_data );
cudaThreadSynchronize();
cutilSafeCall( cudaMemcpy(results_CPU, results_GPU, NUM_THREADS * sizeof(Result),cudaMemcpyDeviceToHost) );
cutilSafeCall(cudaFree(results_GPU));
cutilSafeCall(cudaFreeHost(results_CPU));
cudaThreadExit();
} // ()
results, and yet you have managed to completely omit all the code that shows how you have defined and allocated it. Could you edit your question to include a short, complete code which someone else could compile and run? Without it, it will be very difficult to give you any sort of answer to your question - talonmies