I'm benchmarking the sparse matrix-matrix multiplication on Nvidia K40 using cuSPARSE
library.
I'm creating my own sparse matrix in CSR
format and I'm using the cusparseXcsrgemmNnz
routine of the cuSPARSE
library.
However, as I increase the data size, an error occurs when calling cusparseXcsrgemmNnz
, i.e CUSPARSE_STATUS_SUCCESS
is not returned.
Also a cudaMemcpy
fails and CUDA_SUCCESS
is not returned.
The code works fine for 8 x 8
and 16 x 16
matrices. However, from 32 x 32
on, this error is observed.
Edit: I'm receiving CUSPARSE_STATUS_EXECUTION_FAILED
from cusparseXcsrgemmNnz
for the third matrix size. The execution happens correctly for the first two matrix sizes.
#include <cusparse_v2.h>
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
// error check macros
#define CUSPARSE_CHECK(x) {cusparseStatus_t _c=x; if (_c != CUSPARSE_STATUS_SUCCESS) {printf("cusparse fail: %d, line: %d\n", (int)_c, __LINE__); exit(-1);}}
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
double timerval()
{
struct timeval st;
gettimeofday(&st, NULL);
return (st.tv_sec+st.tv_usec*1e-6);
}
// perform sparse-matrix multiplication C=AxB
int main(){
double avg_time = 0, s_time, e_time;
cusparseStatus_t stat;
cusparseHandle_t hndl;
cusparseMatDescr_t descrA, descrB, descrC;
int *csrRowPtrA, *csrRowPtrB, *csrRowPtrC, *csrColIndA, *csrColIndB, *csrColIndC;
int *h_csrRowPtrA, *h_csrRowPtrB, *h_csrRowPtrC, *h_csrColIndA, *h_csrColIndB, *h_csrColIndC,*pos;
float *csrValA, *csrValB, *csrValC, *h_csrValA, *h_csrValB, *h_csrValC;
int nnzA, nnzB, nnzC;
int m=4,n,k,loop;
int i,j;
int iterations;
for (iterations=0;iterations<10;iterations++)
{
m *=2;
n = m;
k = m;
//density of the sparse matrix to be created. Assume 5% density.
double dense_const = 0.05;
int temp5, temp6,temp3,temp4;
int density=(m*n)*(dense_const);
nnzA = density;
nnzB = density;
h_csrRowPtrA = (int *)malloc((m+1)*sizeof(int));
h_csrRowPtrB = (int *)malloc((n+1)*sizeof(int));
h_csrColIndA = (int *)malloc(density*sizeof(int));
h_csrColIndB = (int *)malloc(density*sizeof(int));
h_csrValA = (float *)malloc(density*sizeof(float));
h_csrValB = (float *)malloc(density*sizeof(float));
if ((h_csrRowPtrA == NULL) || (h_csrRowPtrB == NULL) || (h_csrColIndA == NULL) || (h_csrColIndB == NULL) || (h_csrValA == NULL) || (h_csrValB == NULL))
{printf("malloc fail\n"); return -1;}
//position array for random initialisation of positions in input matrix
pos= (int *)calloc((m*n), sizeof(int));
int temp,temp1;
// printf("the density is %d\n",density);
// printf("check 1:\n");
//randomly initialise positions
for(i=0;i<density;i++)
{
temp1=rand()%(m*n);
pos[i]=temp1;
}
// printf("check 2:\n");
//sort the 'pos' array
for (i = 0 ; i < density; i++) {
int d = i;
int t;
while ( d > 0 && pos[d] < pos[d-1]) {
t = pos[d];
pos[d] = pos[d-1];
pos[d-1] = t;
d--;
}
}
// initialise with non zero elements and extract column and row ptr vector
j=1;
//ja[0]=1;
int p=0;
int f=0;
for(i = 0; i < density; i++)
{
temp=pos[i];
h_csrValA[f] = rand();
h_csrValB[f] = rand();
h_csrColIndA[f] = temp%m;
h_csrColIndB[f] = temp%m;
f++;
p++;
temp5= pos[i];
temp6=pos[i+1];
temp3=temp5-(temp5%m);
temp4=temp6-(temp6%m);
if(!(temp3== temp4))
{
if((temp3+m==temp6))
{}
else
{
h_csrRowPtrA[j]=p;
h_csrRowPtrB[j]=p;
j++;
}
}
}
// transfer data to device
cudaMalloc(&csrRowPtrA, (m+1)*sizeof(int));
cudaMalloc(&csrRowPtrB, (n+1)*sizeof(int));
cudaMalloc(&csrColIndA, density*sizeof(int));
cudaMalloc(&csrColIndB, density*sizeof(int));
cudaMalloc(&csrValA, density*sizeof(float));
cudaMalloc(&csrValB, density*sizeof(float));
cudaCheckErrors("cudaMalloc fail");
cudaMemcpy(csrRowPtrA, h_csrRowPtrA, (m+1)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(csrRowPtrB, h_csrRowPtrB, (n+1)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(csrColIndA, h_csrColIndA, density*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(csrColIndB, h_csrColIndB, density*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(csrValA, h_csrValA, density*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(csrValB, h_csrValB, density*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy fail");
// set cusparse matrix types
CUSPARSE_CHECK(cusparseCreate(&hndl));
stat = cusparseCreateMatDescr(&descrA);
CUSPARSE_CHECK(stat);
stat = cusparseCreateMatDescr(&descrB);
CUSPARSE_CHECK(stat);
stat = cusparseCreateMatDescr(&descrC);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO);
CUSPARSE_CHECK(stat);
cusparseOperation_t transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseOperation_t transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
// figure out size of C
int baseC;
// nnzTotalDevHostPtr points to host memory
int *nnzTotalDevHostPtr = &nnzC;
stat = cusparseSetPointerMode(hndl, CUSPARSE_POINTER_MODE_HOST);
CUSPARSE_CHECK(stat);
cudaMalloc((void**)&csrRowPtrC, sizeof(int)*(m+1));
cudaCheckErrors("cudaMalloc fail");
s_time=timerval();
stat = cusparseXcsrgemmNnz(hndl, transA, transB, m, n, k,
descrA, nnzA, csrRowPtrA, csrColIndA,
descrB, nnzB, csrRowPtrB, csrColIndB,
descrC, csrRowPtrC, nnzTotalDevHostPtr );
CUSPARSE_CHECK(stat);
if (NULL != nnzTotalDevHostPtr){
nnzC = *nnzTotalDevHostPtr;}
else{
cudaMemcpy(&nnzC, csrRowPtrC+m, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&baseC, csrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy fail");
nnzC -= baseC;}
cudaMalloc((void**)&csrColIndC, sizeof(int)*nnzC);
cudaMalloc((void**)&csrValC, sizeof(float)*nnzC);
cudaCheckErrors("cudaMalloc fail");
// perform multiplication C = A*B
for(loop=0;loop<1000;loop++)
{
stat = cusparseScsrgemm(hndl, transA, transB, m, n, k,
descrA, nnzA,
csrValA, csrRowPtrA, csrColIndA,
descrB, nnzB,
csrValB, csrRowPtrB, csrColIndB,
descrC,
csrValC, csrRowPtrC, csrColIndC);
CUSPARSE_CHECK(stat);
}
e_time=timerval();
avg_time=avg_time/1000;
// copy result (C) back to host
h_csrRowPtrC = (int *)malloc((m+1)*sizeof(int));
h_csrColIndC = (int *)malloc(nnzC *sizeof(int));
h_csrValC = (float *)malloc(nnzC *sizeof(float));
if ((h_csrRowPtrC == NULL) || (h_csrColIndC == NULL) || (h_csrValC == NULL))
{printf("malloc fail\n"); return -1;}
cudaMemcpy(h_csrRowPtrC, csrRowPtrC, (m+1)*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(h_csrColIndC, csrColIndC, nnzC*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(h_csrValC, csrValC, nnzC*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy fail");
printf ("\n Input size: %d x %d ,Time: %lf and density is %d \n", m,n, avg_time, density);
cudaFree(csrRowPtrC);
cudaFree(csrColIndC);
cudaFree(csrValC);
cudaFree(csrRowPtrA);
cudaFree(csrColIndA);
cudaFree(csrValA);
cudaFree(csrRowPtrB);
cudaFree(csrColIndB);
cudaFree(csrValB);
free(h_csrRowPtrC);
free(h_csrColIndC);
free(h_csrValC);
free(h_csrRowPtrA);
free(h_csrColIndA);
free(h_csrValA);
free(h_csrRowPtrB);
free(h_csrColIndB);
free(h_csrValB);
}
return 0;
}