It is a simple CUDA code for initializing a big matrix (filling in zeros).
I output the first 1*3 matrix, if the code works. It should be all zeros.
If I set the matrix size to be small, then the program works properly. But when I make the size larger (> 43200 * 2400), what is inside the matrix are all garbage.
I had cudaDeviceSynchronize() append at the end of each CUDA functions already.
I am using NVIDIA Quadro K4200, Xeon E5-2630 with Ubuntu 14.04.
Thanks for anyone helping me here.
Attached below is my full code.
#include <stdio.h>
#include <math.h>
#include <iostream>
#include <cuComplex.h>
#define BLOCK_SIZE 16 // change it to 16 to get maximum performance
// populate the matrix using first row
__global__ void RepmatKernel (cuComplex *Mat, const unsigned int N, const unsigned int Cols)
{
unsigned int i = (unsigned int)blockIdx.x * (unsigned int)blockDim.x + (unsigned int)threadIdx.x;
if (i < N)
{
Mat[i].x = 0;
Mat[i].y = 0;
}
}
// main routine
int main ()
{
const unsigned int Rows = 43200;
const unsigned int Cols = 2400;
const unsigned int Num_thrd = 256; // max threads per block
unsigned int Mat_size = Rows * Cols; // size of array
cuComplex *vec; // supposedly the input
cuComplex *mat_debug; // for debug
vec = new cuComplex [Cols];
mat_debug = new cuComplex [Rows*Cols];
cuComplex *mat_in_d; // device array
//input in host array
for(unsigned int i = 0; i < Cols; i++)
{
vec[i].x = 3*i+4;
vec[i].y = 0.2*i+1;
}
const unsigned int size_mat_d = Rows * Cols * sizeof(cuComplex);
//create device array cudaMalloc ( (void **)&array_name, sizeofmatrixinbytes) ;
if (cudaMalloc((void **) &mat_in_d , size_mat_d) != cudaSuccess) std::cout<<"Error allocating GPU";
cudaDeviceSynchronize() ;
//copy host array to device array; cudaMemcpy ( dest , source , WIDTH , direction )
cudaMemcpy ( mat_in_d , vec , Cols , cudaMemcpyHostToDevice ) ;
cudaDeviceSynchronize() ;
// ========================================================================
cudaMemcpy(mat_debug , mat_in_d , size_mat_d , cudaMemcpyDeviceToHost) ;
cudaDeviceSynchronize() ;
std::cout<<"before repmat="<<std::endl;
std::cout<<"[";
for(unsigned int i = 0; i < 3; i++)
{
std::cout<< mat_debug[i * Cols].x <<"+"<<mat_debug[i * Cols].y <<"i, ";
std::cout<<";"<<std::endl;
}
std::cout<<"]"<<std::endl;
// ==========================================================================
RepmatKernel<<<(unsigned int)ceil((float)(Mat_size)/(float)(Num_thrd)),
(Num_thrd)>>>(mat_in_d,
Mat_size,
Cols);
cudaDeviceSynchronize();
// ========================================================================
cudaMemcpy(mat_debug , mat_in_d , size_mat_d , cudaMemcpyDeviceToHost) ;
cudaDeviceSynchronize() ;
std::cout<<"after repmat="<<std::endl;
std::cout<<"[";
for(unsigned int i = 0; i < 3; i++)
{
std::cout<< mat_debug[i * Cols].x <<"+"<<mat_debug[i * Cols].y <<"i, ";
std::cout<<";"<<std::endl;
}
std::cout<<"]"<<std::endl;
// ==========================================================================
cudaFree(mat_in_d);
delete [] vec;
delete [] mat_debug;
return 0;
}
checkCudaErrors()
if you includehelper_cuda.h
. – mtycuda-memcheck
to get a quick read on any errors. Also, be sure you are compiling with a proper arch switch for your GPU, such asnvcc -arch=sm_30 ...
If you don't,nvcc
will compile for some lower architecture, and your kernel will not launch because the first launch config parameter (ceil((float)(Mat_size)/(float)(Num_thrd))
is large enough to require a cc3.0 compilation target. It will fail to launch if you compile for a lower target, with your 4300x2400 size. – Robert Crovella