I am trying to write a program for matrix calculations using C/CUDA. I have the following program:
In main.cu
#include <cuda.h>
#include <iostream>
#include "teste.cuh"

using std::cout;

int main(void)
{
    const int Ndofs = 2;
    const int Nel = 4;

    double *Gh = new double[Ndofs*Nel*Ndofs*Nel];
    double *Gg;
    cudaMalloc((void**)&Gg, sizeof(double)*Ndofs*Nel*Ndofs*Nel);

    for (int ii = 0; ii < Ndofs*Nel*Ndofs*Nel; ii++)
        Gh[ii] = 0.;

    cudaMemcpy(Gh, Gg, sizeof(double)*Ndofs*Nel*Ndofs*Nel, cudaMemcpyHostToDevice);

    integraG<<<256, 256>>>(Nel, Gg);

    cudaMemcpy(Gg, Gh, sizeof(double)*Ndofs*Nel*Ndofs*Nel, cudaMemcpyDeviceToHost);

    for (int ii = 0; ii < Ndofs*Nel*Ndofs*Nel; ii++)
        cout << ii + 1 << " " << Gh[ii] << "\n";

    return 0;
}
In teste.cuh
#ifndef TESTE_CUH_
#define TESTE_CUH_

__global__ void integraG(const int N, double* G)
{
    const int szmodel = 2*N;

    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    int idy = threadIdx.y + blockIdx.y*blockDim.y;
    int offset = idx + idy*blockDim.x*gridDim.x;
    int posInit = szmodel*offset;

    G[posInit + 0] = 1;
    G[posInit + 1] = 1;
    G[posInit + 2] = 1;
    G[posInit + 3] = 1;
}

#endif
The result (which is supposed to be a matrix filled with 1s) is copied back to the host array. The problem is: nothing happens! Apparently my program is not even calling the GPU kernel, and I am still getting an array full of zeros.
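In case it's relevant, this is roughly how I would add error checking around the runtime calls and the kernel launch. It is only a minimal sketch using the standard cudaGetLastError / cudaGetErrorString / cudaDeviceSynchronize API; the CHECK macro name and the tiny test in main are my own, not from the book:

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    // Minimal error-checking helper (my own naming): wraps a CUDA runtime
    // call and aborts with a readable message if it did not succeed.
    #define CHECK(call)                                                     \
        do {                                                                \
            cudaError_t err = (call);                                       \
            if (err != cudaSuccess) {                                       \
                fprintf(stderr, "CUDA error: %s at %s:%d\n",                \
                        cudaGetErrorString(err), __FILE__, __LINE__);       \
                exit(EXIT_FAILURE);                                         \
            }                                                               \
        } while (0)

    // Around a kernel launch one would use it like this:
    //     integraG<<<grid, block>>>(Nel, Gg);
    //     CHECK(cudaGetLastError());        // reports launch-configuration errors
    //     CHECK(cudaDeviceSynchronize());   // reports errors raised while the kernel runs

    int main()
    {
        double *d = nullptr;
        CHECK(cudaMalloc((void**)&d, 16 * sizeof(double)));  // fails loudly if allocation fails
        CHECK(cudaMemset(d, 0, 16 * sizeof(double)));
        CHECK(cudaFree(d));
        printf("runtime calls succeeded\n");
        return 0;
    }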
I am very new to CUDA programming, and I am using *CUDA by Example* (Jason Sanders) as a reference book.
My questions are:
- What is wrong with my code?
- Is this the best way to deal with matrices on the GPU, i.e. storing them in vectorized (flattened) form? (A sketch of the indexing I mean is shown after this list.)
- Is there another reference that provides more examples of working with matrices on GPUs?
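For context, this is what I mean by "vectorized form": the matrix lives in one flat array and element (i, j) is addressed with row-major arithmetic. The snippet below is just my own illustration of that indexing, not code from the book:

    #include <cstdio>

    int main()
    {
        // A (rows x cols) matrix stored as a single flat array.
        // Element (i, j) lives at A[i*cols + j] (row-major order).
        const int rows = 2, cols = 3;
        double A[rows*cols];

        for (int i = 0; i < rows; i++)
            for (int j = 0; j < cols; j++)
                A[i*cols + j] = 10*i + j;   // store a value derived from (i, j)

        printf("%f\n", A[1*cols + 2]);      // prints 12.000000, i.e. element (1, 2)
        return 0;
    }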