Here's my CUDA code:
#include<stdio.h>
#include<assert.h>
void verify(float * A, float * B, int size);
__global__ void CopyData(float *d_array, float* d_dest_array, size_t pitch, int cols, int rows)
{
for(int i=0; i<rows; i++){
float *rowData = (float*)(((char*)d_array) + (i*pitch));
for(int j=0; j<cols; j++){
d_dest_array[i*cols+j] = *(rowData+j);
}
}
}
int main(int argc, char **argv)
{
int row, col, i, j;
float time1, time2;
float *d_array; // dev arr which mem will be alloc to
float *d_dest_array; // dev arr that will be a copy
size_t pitch; // ensures correct data struc alignm
if(argc != 3)
{
printf("Usage: %s [row] [col]\n", argv[0]);
return 1;
}
row = atoi(argv[1]);
col = atoi(argv[2]);
float *h1_array = new float[col*row];
float *h2_array = new float[col*row];
float *h_ori_array = new float[col*row];
for (i = 0; i<row; i++){
for(j = 0; j<col; j++){
h_ori_array[i*col+j] = i*col + j;
}
}
cudaEvent_t start, stop;
cudaMallocPitch(&d_array, &pitch, col*sizeof(float), row);
cudaMalloc(&d_dest_array, col*row*sizeof(float));
cudaMemcpy2D(d_array, pitch, h_ori_array, col*sizeof(float), col*sizeof(float), row, cudaMemcpyHostToDevice);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//CopyData<<<100, 512>>>(d_array, d_dest_array, pitch, col, row);
for (i = 0; i<row; i++){
for(j = 0; j<col; j++){
h1_array[i*col+j] = h_ori_array[i*col+j];
}
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time1, start, stop);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
CopyData<<<row*col/512, 512>>>(d_array, d_dest_array, pitch, col, row);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time2, start, stop);
cudaMemcpy2D(h2_array, pitch, d_dest_array, col*sizeof(float), col*sizeof(float), row, cudaMemcpyDeviceToHost);
verify(h1_array, h2_array, row*col);
free(h1_array); free(h2_array); free(h_ori_array);
cudaFree(d_array); cudaFree(d_dest_array);
printf("Exec time in ser = %f, par = %f ms with pitch %d", time1, time2, (int)pitch);
return 0;
}
void verify(float * A, float * B, int size)
{
for (int i = 0; i < size; i++)
{
assert(A[i]==B[i]);
}
printf("Correct!");
}
It just makes a copy of a matrix. Both a serial and parallel version are written so that I can compare them.
It gives wrong answer if the array size is 64. For 256 and beyond, it gives correct answer. However it takes too long, 4 seconds for a 512x512 matrix.
I am not comfortable with cudaMemcpy2D. Can someone please pinpoint what I am doing wrong? Any suggestion regarding CUDA coding practices will also be appreciated. Also, while calling a kernel, how do I decide the block and grid dimension?
EDIT 1: The CopyData function that I have used does not use parallelism. I foolishly copied it from VIHARRI's answer at the bottom of the page.
The selected answer over there does not specify how the data was copied from host to device. Can someone show how it can be done using the cudaMallocPitch and cudaMemcpy2D functions? I am looking for the correct way to index inside the kernel as well as the correct way to copy a 2D array from host to device.