Matrix-vector multiplication (cublasDgemv) returns zero

Question

For my first venture into CUDA/cuBLAS, I'm trying to write a simple function that multiplies an MxN matrix (represented with vector-of-vectors, std::vector) with an Nx1 "ones" vector, so as to get rowwise(?) sum of the matrix. This will make use of cublas_gemv() plus other basic CUDA operations, which I see as a good place to start.

After dealing with setup issues and reading/copying sample codes, here's what I have:

// Multiplies the matrix `in` (a vector of rows) by a vector of ones using
// cublasDgemv; the intent is to obtain one sum per row of `in`.
// The return codes stored in cudaStat / stat are never inspected.
std::vector<double> test(std::vector<std::vector<double>> in)
{
    std::vector<double> out;
    // Dimensions: in_m = number of rows, in_n = length of the first row.
    long in_m = in.size();
    long in_n = in[0].size();
    cudaError_t cudaStat;
    cublasStatus_t stat;
    cublasHandle_t handle;
    // This just converts a vector-of-vectors into a col-first array
    double* p_in = vec2d_to_colfirst_array(in);
    // Host-side buffers: a vector of in_n ones and room for in_m results.
    double* p_ones = new double[in_n];
    double* p_out = new double[in_m];
    std::fill(p_ones, p_ones + in_n, 1.0);
    // Device-side counterparts of the three host buffers.
    double* dev_in;
    double* dev_ones;
    double* dev_out;
    cudaStat = cudaMalloc((void**)&dev_in, in_m * in_n * sizeof(double));
    cudaStat = cudaMalloc((void**)&dev_ones, in_n * sizeof(double));
    cudaStat = cudaMalloc((void**)&dev_out, in_m * sizeof(double));
    stat = cublasCreate(&handle);
    // Upload the matrix and the ones vector to the device.
    cudaStat = cudaMemcpy(dev_in, p_in, in_m*in_n * sizeof(double), cudaMemcpyHostToDevice);
    cudaStat = cudaMemcpy(dev_ones, p_ones, in_n * sizeof(double), cudaMemcpyHostToDevice);
    double alpha = 1.0;
    double beta = 0.0;
    // Matrix-vector product: no transpose, in_m x in_n matrix with leading
    // dimension in_m, alpha = 1, beta = 0, unit strides for both vectors.
    stat = cublasDgemv(handle, CUBLAS_OP_N, in_m, in_n, &alpha, dev_in, in_m, dev_ones, 1, &beta, dev_ones, 1);
    // Copy the contents of dev_out to the host buffer p_out.
    cudaStat = cudaMemcpy(p_out, dev_out, in_m * sizeof(double), cudaMemcpyDeviceToHost);
    out.assign(p_out, p_out + in_m);
    // Release device buffers and the cuBLAS handle, then the host buffers.
    cudaFree(dev_in);
    cudaFree(dev_ones);
    cudaFree(dev_out);
    cublasDestroy(handle);
    free(p_in);
    free(p_ones);
    free(p_out);
    return out;
}

It doesn't look much different from the sample I read, so I expected it to "just work". However, when I inspected p_out, it was all zeros, even though the input matrix certainly wasn't all zeros.

I verified that vec2d_to_colfirst_array() does its job just fine, and also that dev_in/dev_ones are properly populated, by copying the data from the device back to the host and reading it. Maybe the problem is in the call to cublasDgemv(), but since I'm new to this (and since the BLAS calling convention is much less intuitive than that of, e.g., Eigen), after much frustration I just can't see what's wrong.

Any help appreciated!

stackoverflow.com/a/47755279/1231073. I guess you have to set pointer mode to CUBLAS_POINTER_MODE_HOST. — sgarizvi
@talonmies.. True indeed..... Andy, can you please provide an MCVE. — sgarizvi

Robert Crovella · Accepted Answer · 2018-11-01T13:54:04

The error appears to be fairly simple. You are expecting to copy the results from dev_out:

cudaStat = cudaMemcpy(p_out, dev_out, in_m * sizeof(double), cudaMemcpyDeviceToHost);

but you never use dev_out in your cublas call:

stat = cublasDgemv(handle, CUBLAS_OP_N, in_m, in_n, &alpha, dev_in, in_m, dev_ones, 1, &beta, dev_ones, 1);

This appears to be just a copy-paste error. If you replace the last instance of dev_ones in your cublas call with dev_out, your code works for me:

stat = cublasDgemv(handle, CUBLAS_OP_N, in_m, in_n, &alpha, dev_in, in_m, dev_ones, 1, &beta, dev_out, 1);

Here is a fully worked example with that change:

$ cat t315.cu
#include <iostream>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>

#include <cublas_v2.h>

// Dimensions of the all-ones test matrix built in main(): idim1 rows, each
// holding idim2 elements.
constexpr long idim1 = 8, idim2 = 8;

// Flattens a rectangular matrix, given as a vector of rows, into a newly
// allocated column-major ("col-first") array: element (i, j) is stored at
// index j * rows + i. This is the layout cuBLAS expects for a non-transposed
// matrix whose leading dimension equals the row count.
//
// Parameters:
//   in - the matrix; every row must have as many elements as row 0.
// Returns:
//   A new[]-allocated array of rows * cols doubles. The caller owns it and
//   must release it with delete[]. An empty matrix yields a zero-length
//   (but still delete[]-able) allocation.
// Throws:
//   std::invalid_argument if the rows do not all have the same length.
double* vec2d_to_colfirst_array(std::vector<std::vector<double>> in){
    const long dim1 = in.size();
    // Guard against dereferencing in[0] when there are no rows at all.
    const long dim2 = in.empty() ? 0 : static_cast<long>(in[0].size());
    // Validate before allocating so that nothing leaks if we throw.
    for (long i = 0; i < dim1; i++)
      if (static_cast<long>(in[i].size()) != dim2)
        throw std::invalid_argument("vec2d_to_colfirst_array: rows have differing lengths");
    double *res = new double[dim1*dim2];
    // Walk column by column so that each column ends up contiguous.
    for (long j = 0; j < dim2; j++)
      for (long i = 0; i < dim1; i++) res[j*dim1 + i] = in[i][j];
    return res;
}


// Computes the row sums of `in` on the GPU by multiplying it with a vector of
// ones via cublasDgemv.
//
// Parameters:
//   in - matrix as a vector of rows; row 0 determines the column count.
// Returns:
//   One sum per row of `in`. A matrix with no rows yields an empty vector and
//   a matrix whose rows are empty yields zeros; neither case touches the GPU.
// Throws:
//   std::length_error  if a dimension does not fit in the `int` taken by
//                      cublasDgemv.
//   std::runtime_error if any CUDA runtime or cuBLAS call reports a failure.
// Notes:
//   vec2d_to_colfirst_array() is expected to return a new[]-allocated
//   in_m x in_n array in column-major order, because the matrix is passed to
//   cublasDgemv untransposed with leading dimension in_m.
std::vector<double> test(std::vector<std::vector<double>> in)
{
    const long in_m = in.size();
    const long in_n = in.empty() ? 0 : static_cast<long>(in[0].size());

    // Degenerate shapes: the sum over zero columns is 0 for every row.
    if (in_m == 0 || in_n == 0)
        return std::vector<double>(static_cast<std::vector<double>::size_type>(in_m), 0.0);

    // cuBLAS takes the dimensions as int; refuse to narrow silently.
    const long int_max = std::numeric_limits<int>::max();
    if (in_m > int_max || in_n > int_max)
        throw std::length_error("test: matrix dimensions exceed the range of int");
    const int m = static_cast<int>(in_m);
    const int n = static_cast<int>(in_n);

    auto check_cuda = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess)
            throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(err));
    };
    auto check_cublas = [](cublasStatus_t status, const char* what) {
        if (status != CUBLAS_STATUS_SUCCESS)
            throw std::runtime_error(std::string(what) + " failed with cuBLAS status "
                                     + std::to_string(static_cast<int>(status)));
    };

    // Owns every raw resource so that all exits, including exceptions,
    // release them exactly once.
    struct Resources {
        double* host_in = nullptr;
        double* dev_in = nullptr;
        double* dev_ones = nullptr;
        double* dev_out = nullptr;
        cublasHandle_t handle = nullptr;
        ~Resources() {
            if (handle) cublasDestroy(handle);
            cudaFree(dev_out);   // cudaFree(nullptr) is a harmless no-op
            cudaFree(dev_ones);
            cudaFree(dev_in);
            delete[] host_in;    // allocated with new[], so delete[] rather than free()
        }
    } res;

    // This just converts a vector-of-vectors into a col-first array
    res.host_in = vec2d_to_colfirst_array(in);
    const std::vector<double> ones(in_n, 1.0);
    std::vector<double> out(in_m);

    // sizeof first keeps the whole product in size_t arithmetic.
    const auto mat_bytes = sizeof(double) * in_m * in_n;
    const auto ones_bytes = sizeof(double) * in_n;
    const auto out_bytes = sizeof(double) * in_m;

    check_cuda(cudaMalloc((void**)&res.dev_in, mat_bytes), "cudaMalloc(dev_in)");
    check_cuda(cudaMalloc((void**)&res.dev_ones, ones_bytes), "cudaMalloc(dev_ones)");
    check_cuda(cudaMalloc((void**)&res.dev_out, out_bytes), "cudaMalloc(dev_out)");
    check_cublas(cublasCreate(&res.handle), "cublasCreate");

    check_cuda(cudaMemcpy(res.dev_in, res.host_in, mat_bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(dev_in)");
    check_cuda(cudaMemcpy(res.dev_ones, ones.data(), ones_bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(dev_ones)");

    // dev_out = 1.0 * dev_in * dev_ones + 0.0 * dev_out
    const double alpha = 1.0;
    const double beta = 0.0;
    check_cublas(cublasDgemv(res.handle, CUBLAS_OP_N, m, n, &alpha, res.dev_in, m,
                             res.dev_ones, 1, &beta, res.dev_out, 1),
                 "cublasDgemv");

    check_cuda(cudaMemcpy(out.data(), res.dev_out, out_bytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy(out)");
    return out;
}

// Builds an idim1 x idim2 matrix of ones, passes it to test() and prints the
// returned values on a single line, each followed by a comma.
int main(){

  const std::vector<double> ones_row(idim2, 1.0);
  std::vector<std::vector<double>> matrix;
  for (long row = 0; row < idim1; ++row) matrix.push_back(ones_row);

  const std::vector<double> sums = test(matrix);
  for (double value : sums) std::cout << value << ",";
  std::cout << std::endl;
}

$ nvcc -std=c++11 -o t315 t315.cu -lcublas
t315.cu(24): warning: variable "cudaStat" was set but never used

t315.cu(25): warning: variable "stat" was set but never used

$ cuda-memcheck ./t315
========= CUDA-MEMCHECK
8,8,8,8,8,8,8,8,
========= ERROR SUMMARY: 0 errors
$

Note that free() is not the correct way to release memory allocated with new[] (delete[] should be used instead), but that doesn't seem to be the crux of your question or issue.

Matrix-vector multiplication (cublasDgemv) returns zero

1 Answer