For my first venture into CUDA/cuBLAS, I'm trying to write a simple function that multiplies an MxN matrix (represented with vector-of-vectors, std::vector) with an Nx1 "ones" vector, so as to get rowwise(?) sum of the matrix. This will make use of cublas_gemv() plus other basic CUDA operations, which I see as a good place to start.
After dealing with setup issues and reading/copying sample codes, here's what I have:
std::vector<double> test(std::vector<std::vector<double>> in)
{
std::vector<double> out;
long in_m = in.size();
long in_n = in[0].size();
cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;
// This just converts a vector-of-vectors into a col-first array
double* p_in = vec2d_to_colfirst_array(in);
double* p_ones = new double[in_n];
double* p_out = new double[in_m];
std::fill(p_ones, p_ones + in_n, 1.0);
double* dev_in;
double* dev_ones;
double* dev_out;
cudaStat = cudaMalloc((void**)&dev_in, in_m * in_n * sizeof(double));
cudaStat = cudaMalloc((void**)&dev_ones, in_n * sizeof(double));
cudaStat = cudaMalloc((void**)&dev_out, in_m * sizeof(double));
stat = cublasCreate(&handle);
cudaStat = cudaMemcpy(dev_in, p_in, in_m*in_n * sizeof(double), cudaMemcpyHostToDevice);
cudaStat = cudaMemcpy(dev_ones, p_ones, in_n * sizeof(double), cudaMemcpyHostToDevice);
double alpha = 1.0;
double beta = 0.0;
stat = cublasDgemv(handle, CUBLAS_OP_N, in_m, in_n, &alpha, dev_in, in_m, dev_ones, 1, &beta, dev_ones, 1);
cudaStat = cudaMemcpy(p_out, dev_out, in_m * sizeof(double), cudaMemcpyDeviceToHost);
out.assign(p_out, p_out + in_m);
cudaFree(dev_in);
cudaFree(dev_ones);
cudaFree(dev_out);
cublasDestroy(handle);
free(p_in);
free(p_ones);
free(p_out);
return out;
}
It doesn't look much different from the sample I read, so I expected it to "just work". However, when I inspected p_out, it was all zeros, even though the input matrix definitely contains non-zero values.
I verified that vec2d_to_colfirst_array() does its job just fine, and also that dev_in/dev_ones are properly populated, by copying the data from the device back to the host and then reading it. Maybe the problem is within the call to cublasDgemv(), but since I'm new (and also since the BLAS grammar is much less intuitive compared to e.g. Eigen), after much frustration I just can't see what's wrong.
Any help appreciated!
CUBLAS_POINTER_MODE_HOST. - sgarizvi