I am currently trying to implement matrix multiplication using CUBLAS on my GPU.
It works fine for square matrices and for certain sizes of inputs, but for others the last line is not returned (and contains 0 as it is the way I implemented it).
I assume it is a problem with the allocation or the syntax of cublasSgemm, but I could not find where it was.
N.B. : If you are not familiar with CUBLAS: it is column-majored, which is why it looks like the operation are performed the other way.
Any help would be appreciated.
The code:
Note that the gpuErrchk and cublasErrchk are of course irrelevant here.
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <vector>
std::vector<float> CUDA_mult_MAT(const std::vector<float> &data_1 , const uint64_t data_1_rows, const uint64_t data_1_columns,
const std::vector<float> &data_2 , const uint64_t data_2_rows, const uint64_t data_2_columns){
cublasHandle_t handle;
cublasErrchk(cublasCreate(&handle));
std::vector<float> result(data_1_rows * data_2_columns); //Vector holding the result of the multiplication
/*----------------------------------------------------------------------------------------------*/
float* GPU_data_1 = NULL;
gpuErrchk(cudaMalloc((void**)&GPU_data_1 , data_1.size()*sizeof(float))); //Allocate memory on the GPU
gpuErrchk(cudaMemcpy(GPU_data_1, data_1.data(), data_1.size()*sizeof(float), cudaMemcpyHostToDevice)); //Copy data from data_1 to GPU_data_1
float* GPU_data_2 = NULL;
gpuErrchk(cudaMalloc((void**)&GPU_data_2 ,data_2.size()*sizeof(float))); //Allocate memory on the GPU
gpuErrchk(cudaMemcpy(GPU_data_2, data_2.data(), data_2.size()*sizeof(float), cudaMemcpyHostToDevice));//Copy data from data_2 to GPU_data_2
float* GPU_result = NULL;
gpuErrchk(cudaMalloc((void**)&GPU_result , result.size()*sizeof(float))); //Allocate memory on the GPU
/*----------------------------------------------------------------------------------------------*/
const float alpha = 1.f;
const float beta = 0.f;
cublasErrchk(
cublasSgemm(handle , CUBLAS_OP_N , CUBLAS_OP_N,
data_2_columns , data_2_rows ,data_1_columns,
&alpha , GPU_data_2 , data_2_columns,
GPU_data_1 , data_1_columns,
&beta , GPU_result , data_1_rows)
); //Perform multiplication
gpuErrchk(cudaMemcpy(result.data() , GPU_result , result.size() * sizeof(float) , cudaMemcpyDeviceToHost)); //Copy back to the vector 'result'
gpuErrchk(cudaFree(GPU_data_1)); //Free GPU memory
gpuErrchk(cudaFree(GPU_data_2)); //Free GPU memory
gpuErrchk(cudaFree(GPU_result)); //Free GPU memory
cublasErrchk(cublasDestroy_v2(handle));
return result;
}
The inputs:
#include <iostream>
#include <vector>
int main(){
const std::vector<float> r1 = CUDA_mult_MAT({1 , 2 , 3 , 4 , 5 , 6} , 2 , 3 ,
{7 , 8 , 9 , 10 , 11 , 12} , 3 , 2);
/*
Product :
7 8
1 2 3 9 10
4 5 6 11 12
*/
for(auto & value: r1){std::cout << value << " " ;}
std::cout << std::endl;
const std::vector<float> r2 = CUDA_mult_MAT({7 , 8 , 9 , 10 , 11 , 12} , 3 , 2 ,
{1 , 2 , 3 , 4 , 5 , 6} , 2 , 3);
/*
Product :
7 8
9 10 1 2 3
11 12 4 5 6
*/
for(auto & value: r2){std::cout << value << " " ;}
std::cout << std::endl;
return 0;
}
Ouputs:
Printed by the program:
58 64 139 154
39 54 69 49 68 87 0 0 0
^~~~~~~
Expected:
58 64 139 154
39 54 69 49 68 87 59 82 105
^~~~~~~