I just have a question about my cuda program that I wrote. It allows me to enter the size of the matrix, col and rows. Say I enter ~1124 and it computes fine. However say I enter 1149 it Seg faults AFTER computing in the device(I think it's seg faulting during the copy back). But say I enter 2000 it seg faults BEFORE computing in the device(I think it seg faults during the copy over). I think my issue is all with memory management. If you guys could point me in the right direction I'd appreciate it.
I udpated the code with how it is called. In the new edit(at the bottom) it contains: sumMatrix(blank matrix with the size of eleCount1, which is the size of the entire matrix), matrixOne(first matrix),matrixTwo(second matrix, allocated same way matrix1 is done),eleCount1(entire size of matrix). Both matrixOne and two are read in from a file.
Wasn't sure if someone needed to see this stuff about my GPU:
- Total amount of constant memory: 65536 bytes
- Total amount of shared memory per block: 49152 bytes
- Total number of registers available per block: 32768
- Warp size: 32
- Maximum number of threads per block: 1024
- Maximum sizes of each dimension of a block: 1024 x 1024 x 64
- Maximum sizes of each dimension of a grid: 65535 x 65535 x 65535
The code is:
void addKernel(float *c, float *a, float *b)
{
int i = threadIdx.x;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
c[idx] = a[idx] + b[idx];
}
cudaError_t addWithCuda(float *c, float *a, float *b, size_t size)
{
float *dev_a = 0;
float *dev_b = 0;
float *dev_c = 0;
cudaError_t cudaStatus;
blocksNeeded=(size/MAXTHREADS)+1;
int threadsPerBlock = MAXTHREADS/blocksNeeded+1;
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
addKernel<<<blocksNeeded, size>>>(dev_c, dev_a, dev_b);
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(float), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
//edit: added how the matrix are allocated
float* matrixOne = (float*)malloc(sizeof(float)*file1size);
int matrixIndex = 0;
readFromFile(fd,byte, matrixOneWidth, matrixOneHeight, matrixOne);
//matrixOneHeight--;
eleCount1 = matrixOneHeight*matrixOneWidth;
matrixOne= (float*)realloc(matrixOne,eleCount1*sizeof(float));
//Edit: Added how the addWithCuda is called.
cudaStatus = addWithCuda(sumMatrix, matrixOne,matrixTwo,eleCount1);
//sumMatrix is created after we know how large the matrices are.
float sumMatrix[eleCount1];
__global__
. You should at least fix that... – harrism