#include <iostream>
#include <assert.h>
#include <cstdlib>    // rand, srand
#include <ctime>      // time
#include <sys/time.h> // gettimeofday

#define BLOCK_SIZE 32 // CUDA block size

__device__ inline int getValFromMatrix(int* matrix, int row, int col,int matSize) {
    if (row<matSize && col<matSize) {return matrix[row*matSize + col];}
    return 0;
}

__device__ inline int getValFromVector(int* vector, int row, int matSize) {
    if (row<matSize) {return vector[row];}
    return 0;
}

__global__ void matVecMultCUDAKernel(int* aOnGPU, int* bOnGPU, int* cOnGPU, int matSize) {
    __shared__ int aRowShared[BLOCK_SIZE];
    __shared__ int bShared[BLOCK_SIZE];
    __shared__ int myRow;
    __shared__ double rowSum;

    int myIndexInBlock = threadIdx.x;
    myRow = blockIdx.x;
    rowSum = 0;

    for (int m = 0; m < (matSize / BLOCK_SIZE + 1);m++) {
        aRowShared[myIndexInBlock] = getValFromMatrix(aOnGPU,myRow,m*BLOCK_SIZE+myIndexInBlock,matSize);
        bShared[myIndexInBlock] = getValFromVector(bOnGPU,m*BLOCK_SIZE+myIndexInBlock,matSize);

        __syncthreads(); // Sync threads to make sure all threads in the block have finished writing aRowShared and bShared before they are read

        if (myIndexInBlock==0) {
            for (int k=0;k<BLOCK_SIZE;k++) {
                rowSum += aRowShared[k] * bShared[k];
            }
        }
    }

    if (myIndexInBlock==0) {cOnGPU[myRow] = rowSum;}
}

static inline void cudaCheckReturn(cudaError_t result) {
    if (result != cudaSuccess) {
        std::cerr <<"CUDA Runtime Error: " << cudaGetErrorString(result) << std::endl;
        assert(result == cudaSuccess);
    }
}

static void matVecMultCUDA(int* aOnGPU,int* bOnGPU, int* cOnGPU, int* c, int sizeOfc, int matSize) {
    matVecMultCUDAKernel<<<matSize,BLOCK_SIZE>>>(aOnGPU,bOnGPU,cOnGPU,matSize); // Launch 1 block per row

    cudaCheckReturn(cudaMemcpy(c,cOnGPU,sizeOfc,cudaMemcpyDeviceToHost));
}


static void matVecMult(int** A,int* b, int* c, int matSize) {
    // Sequential implementation:
    for (int i=0;i<matSize;i++) {
        c[i]=0;
        for (int j=0;j<matSize;j++) {
            c[i]+=(A[i][j] * b[j]);
        }
    }
}

int main() {
    int matSize = 1000;

    int** A,* b,* c;
    int* aOnGPU,* bOnGPU,* cOnGPU;

    A = new int*[matSize];
    for (int i = 0; i < matSize;i++) {A[i] = new int[matSize]();}
    b = new int[matSize]();
    c = new int[matSize]();

    int aSizeOnGPU = matSize * matSize * sizeof(int), bcSizeOnGPU = matSize * sizeof(int);

    cudaCheckReturn(cudaMalloc(&aOnGPU,aSizeOnGPU)); // cudaMallocPitch?
    cudaCheckReturn(cudaMalloc(&bOnGPU,bcSizeOnGPU));
    cudaCheckReturn(cudaMalloc(&cOnGPU,bcSizeOnGPU));

    srand(time(NULL));

    for (int i=0;i<matSize;i++) {
        b[i] = rand()%100;
        for (int j=0;j<matSize;j++) {
            A[i][j] = rand()%100;
        }
    }

    for (int i=0;i<matSize;i++) {cudaCheckReturn(cudaMemcpy((aOnGPU+i*matSize),A[i],bcSizeOnGPU,cudaMemcpyHostToDevice));}
    cudaCheckReturn(cudaMemcpy(bOnGPU,b,bcSizeOnGPU,cudaMemcpyHostToDevice));

    int iters=1;
    timeval start,end;

    // Sequential run:
    gettimeofday(&start,NULL);
    for (int i=0;i<iters;i++) {matVecMult(A,b,c,matSize);}
    gettimeofday(&end,NULL);
    std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;

    // CUDA run:
    gettimeofday(&start,NULL);
    for (int i=0;i<iters;i++) {matVecMultCUDA(aOnGPU,bOnGPU,cOnGPU,c,bcSizeOnGPU,matSize);}
    gettimeofday(&end,NULL);
    std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;

    cudaCheckReturn(cudaFree(aOnGPU));
    cudaCheckReturn(cudaFree(bOnGPU));
    cudaCheckReturn(cudaFree(cOnGPU));


    for (int i = 0; i < matSize; ++i) {
        delete[] A[i];
    }
    delete[] A;
    delete[] b;
    delete[] c;
}

Gives:

267171
580253

I've followed the guide at http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory on how to do a matrix multiplication. I used shared memory for both the matrix (A) and the vector (b), but no matter what matrix size (100*100 to 20000*20000) or block size (32 to 1024) I choose, the sequential implementation always outperforms the CUDA implementation; it is about twice as fast.

Since I'm doing matrix-vector multiplication, the shared arrays and blocks are handled a bit differently: I'm using one block per row of the matrix instead of a 2D block over a part of the matrix.

Is my implementation wrong, or is CUDA simply not faster than the CPU?


1 Answer


First: you perform boundary checks in the CUDA implementation that you don't perform on the CPU. Branching is really expensive on a GPU.
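
For illustration, here is a minimal sketch of the kernel with the boundary checks removed. It assumes matSize has been zero-padded to a multiple of BLOCK_SIZE (an assumption, not something your code currently does), so every load is unconditional:

__global__ void matVecMultNoBranchKernel(const int* aOnGPU, const int* bOnGPU, int* cOnGPU, int matSize) {
    // Sketch only: assumes matSize % BLOCK_SIZE == 0 (e.g. after zero-padding
    // the host data), so no per-element bounds check or branch is needed.
    __shared__ int aRowShared[BLOCK_SIZE];
    __shared__ int bShared[BLOCK_SIZE];

    int tid = threadIdx.x;
    int row = blockIdx.x;
    int rowSum = 0; // per-thread local; only thread 0 accumulates into it

    for (int m = 0; m < matSize / BLOCK_SIZE; m++) {
        int col = m * BLOCK_SIZE + tid;
        aRowShared[tid] = aOnGPU[row * matSize + col]; // unconditional loads
        bShared[tid] = bOnGPU[col];
        __syncthreads(); // all loads done before reading

        if (tid == 0) {
            for (int k = 0; k < BLOCK_SIZE; k++) {
                rowSum += aRowShared[k] * bShared[k];
            }
        }
        __syncthreads(); // keep the next iteration's writes from racing this read
    }

    if (tid == 0) { cOnGPU[row] = rowSum; }
}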

Second: you count the cudaMemcpy in the CUDA timing. It's very uncommon to perform only one multiplication before having to get the result back to the CPU. Usually (in a conjugate gradient solver, for example), you perform several hundred multiplications on the GPU before having to copy back.
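
To see what the kernel alone costs, one option (a sketch using CUDA events, not part of your original code; it reuses your cudaCheckReturn helper and the names from main) is to keep the device-to-host copy outside the timed region:

cudaEvent_t startEvent, stopEvent;
cudaCheckReturn(cudaEventCreate(&startEvent));
cudaCheckReturn(cudaEventCreate(&stopEvent));

cudaCheckReturn(cudaEventRecord(startEvent));
matVecMultCUDAKernel<<<matSize, BLOCK_SIZE>>>(aOnGPU, bOnGPU, cOnGPU, matSize);
cudaCheckReturn(cudaEventRecord(stopEvent));
cudaCheckReturn(cudaEventSynchronize(stopEvent)); // wait until the kernel has finished

float kernelMs = 0.0f;
cudaCheckReturn(cudaEventElapsedTime(&kernelMs, startEvent, stopEvent));
std::cout << "kernel only: " << kernelMs << " ms" << std::endl;

// Copy the result back once, outside the measured region.
cudaCheckReturn(cudaMemcpy(c, cOnGPU, bcSizeOnGPU, cudaMemcpyDeviceToHost));

cudaCheckReturn(cudaEventDestroy(startEvent));
cudaCheckReturn(cudaEventDestroy(stopEvent));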

Third: don't try to implement this yourself (except for educational purposes); use vendor libraries (like cuBLAS, which ships with every CUDA release), which are extremely hard to outperform.
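
For reference, a minimal cuBLAS sketch (assuming float data, since cuBLAS gemv works on floating-point types, and assuming the matrix stays row-major as in your code, which is why CUBLAS_OP_T is passed):

#include <cublas_v2.h>

// Sketch: c = A * b with cuBLAS. A is matSize x matSize floats stored
// row-major on the device; cuBLAS is column-major, so the same buffer is
// read as the transpose and CUBLAS_OP_T restores the intended product.
void matVecMultCublas(cublasHandle_t handle, const float* aOnGPU,
                      const float* bOnGPU, float* cOnGPU, int matSize) {
    const float alpha = 1.0f, beta = 0.0f;
    cublasSgemv(handle, CUBLAS_OP_T,
                matSize, matSize, // dimensions of the column-major view
                &alpha,
                aOnGPU, matSize,  // leading dimension
                bOnGPU, 1,        // x, stride 1
                &beta,
                cOnGPU, 1);       // y, stride 1
}

Create the handle once with cublasCreate(&handle), reuse it for every call, and link with -lcublas.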