Matrix Multiplication using CUDA (how threads are numbered)

Question

I was trying to catch a mistake in my program which multiplies square matrices using CUDA. As long as it's not possible to printf() from kernel (please tell if you know how to do it), I send an additional structure to the kernel to get the numbers of threads, blocks, rows and columns. All the elements of the structure are -1 (to identify them). So, the matrix product are partially incorrect but the the data from kernel totally confused me. Here's the code:

typedef struct {
int tx;
int ty;
int bx;
int by;
int rw;
int cl;
} THREADS; 

__global__ void mult1_kernel (float *a, float *b, float *c, THREADS *d, int n) {
    int k;
    float sum = 0.0f;
    int rw = blockIdx.y * blockDim.y + threadIdx.y;
    int cl = blockIdx.x * blockDim.x + threadIdx.x;

for (k = 0; k < n; k++)
    sum += a[rw*n + k] * b[k*n + cl];

c[rw*n + cl] = sum;
d[rw*n + cl].tx = threadIdx.y;
d[rw*n + cl].ty = threadIdx.x;  
d[rw*n + cl].bx = blockIdx.y;
d[rw*n + cl].by = blockIdx.x;
d[rw*n + cl].rw = rw;
d[rw*n + cl].cl = cl;

}

...

dim3 blockSize = dim3 (16, 16);
dim3 gridSize = dim3 ((n+15)/16, (n+15)/16);

Now what is gives:

Input matrix A:
       1        2        3        4 
       2        3        4        5 
       3        4        5        6 
       4        5        6        7 

Input matrix B:
       1      0.5    0.333     0.25 
     0.5    0.333     0.25      0.2 
   0.333     0.25      0.2    0.167 
    0.25      0.2    0.167    0.143 
Time spent executing on the GPU: 0.05 millseconds

Matrix product of A and B (CUDA):
       4     2.72      2.1     1.72 
    6.08        4     3.05     2.48 
       4     3.05     2.48      2.1 
    3.22      2.6     2.19     1.89 

[ThreadIdx.x][ThreadIdx.y]:
[0][0] [0][1] [0][2] [0][3] 
[1][0] [1][1] [-1][-1] [-1][-1] 
[-1][-1] [-1][-1] [-1][-1] [-1][-1] 
[-1][-1] [-1][-1] [-1][-1] [-1][-1] 

[BlockIdx.x][BlockIdx.y]:
[0][0] [0][0] [0][0] [0][0] 
[0][0] [-1][-1] [-1][-1] [-1][-1] 
[-1][-1] [-1][-1] [-1][-1] [-1][-1] 
[-1][-1] [-1][-1] [-1][-1] [-1][-1] 

    [rw][cl]:
    [0][0] [0][1] [0][2] [0][3] 
    [1][0] [-1][-1] [-1][-1] [-1][-1] 
    [-1][-1] [-1][-1] [-1][-1] [-1][-1] 
    [-1][-1] [-1][-1] [-1][-1] [-1][-1] 

    Matrix product of A and B (Correct result):
       4     2.72      2.1     1.72 
    6.08        4     3.05     2.48 
    8.17     5.28        4     3.24 
    10.2     6.57     4.95        4

So each thread writes its threadIdx, blockIdx, numbers of row and columns but most of the values hadn't been changed since their initialization. Please help me find out how it all works.

Part 2. Each thread outputs correct sum, but after copying data from device memory to host some elements become zeros. Here's the code:

 #include <stdio.h>
#include <stdlib.h>
#include "cuPrintf.cuh"
#include "cuPrintf.cu"

__global__ void mult1_kernel (float *a, float *b, float *c, int n) {
    int k;
    float sum = 0.0f;
    int rw = blockIdx.y * blockDim.y + threadIdx.y;
    int cl = blockIdx.x * blockDim.x + threadIdx.x;

    if ((rw < n) && (cl < n)) {
        rw *= n;     
        for (k = 0; k < n; k++)
            sum += a[rw + k] * b[k*n + cl];

        cuPrintf ("Thread[%d][%d] from block[%d][%d]:\n rw = %d, cl = %d, sum = %f\n\n",

                                    threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, rw/n, cl, sum);
    }

    c[rw + cl] = sum;
}

__host__ int mult1_host (float *a, float *b, float *c, int n) {
    cudaEvent_t start, stop;
    cudaError_t cuerr;
    int i, j;
    int size = n * n * sizeof (float);
    float gpuTime = 0.0f;
    float *aDev = NULL, *bDev = NULL, *cDev = NULL;

    cuerr = cudaMalloc ((void**)&aDev, size);
    if (cuerr != cudaSuccess) {
        fprintf (stderr, "Cannot allocate GPU memory for aDev: %s\n", cudaGetErrorString (cuerr));
        return (-1);
    }
    cuerr = cudaMalloc ((void**)&bDev, size);
    if (cuerr != cudaSuccess) {
        fprintf (stderr, "Cannot allocate GPU memory for bDev: %s\n", cudaGetErrorString (cuerr));
        return (-1);
    }
    cuerr = cudaMalloc ((void**)&cDev, size);
    if (cuerr != cudaSuccess) {
        fprintf (stderr, "Cannot allocate GPU memory for cDev: %s\n", cudaGetErrorString (cuerr));
        return (-1);
    }   

    dim3 blockSize = dim3 (16, 16);
    dim3 gridSize = dim3 ((n+15)/16, (n+15)/16);

    cuerr = cudaMemcpy (aDev, a, size, cudaMemcpyHostToDevice);
    if (cuerr != cudaSuccess) {
        fprintf (stderr, "Cannot copy data from a to aDev: %s\n", cudaGetErrorString (cuerr));
        return (-1);
    }
    cuerr = cudaMemcpy (bDev, b, size, cudaMemcpyHostToDevice);
    if (cuerr != cudaSuccess) {
        fprintf (stderr, "Cannot copy data from a to bDev: %s\n", cudaGetErrorString (cuerr));
        return (-1);
    }

    cudaPrintfInit ();

    cudaEventCreate (&start);
    cudaEventCreate (&stop);

    cudaEventRecord (start, 0);
mult1_kernel <<< gridSize, blockSize >>> (aDev, bDev, cDev, n);

cudaEventRecord (stop, 0);

cudaEventSynchronize (stop);

cudaEventElapsedTime (&gpuTime, start, stop);

cuerr = cudaGetLastError ();
if (cuerr != cudaSuccess) {
    fprintf (stderr, "Cannot launch CUDA kernel: %s\n", cudaGetErrorString (cuerr));
    return (-1);
}

cuerr = cudaDeviceSynchronize ();
if (cuerr != cudaSuccess) {
    fprintf (stderr, "Cannot synchronize CUDA kernel: %s\n", cudaGetErrorString (cuerr));
    return (-1);
}

cudaPrintfDisplay (stdout, true);
cudaPrintfEnd ();

cuerr = cudaMemcpy (c, cDev, size, cudaMemcpyDeviceToHost);
if (cuerr != cudaSuccess) {
    fprintf (stderr, "Cannot copy data from c to cDev: %s\n", cudaGetErrorString (cuerr));
    return (-1);
}

printf ("Time spent executing on the GPU: %.2f millseconds\n", gpuTime);
printf ("\n");

cudaEventDestroy (start);
cudaEventDestroy (stop);

cudaFree (aDev);
cudaFree (bDev);
cudaFree (cDev);

    return (0);
}
int main () {
    int i, j, k;
    int n;
    float *a, *b, *c;

printf ("Enter the matrix size: ");
scanf ("%d", &n);
printf ("Blocks = %d\n", n*n/512 + 1);

if (n < 1) {
    printf ("Invalid matrix size\n");
    return (-1);
}

if (!(a = (float*) malloc (n * n * sizeof (float)))) {
    fprintf (stderr, "Cannot allocate CPU memory for a\n"); 
    return (-1);
    }
if (!(b = (float*) malloc (n * n * sizeof (float)))) {
    fprintf (stderr, "Cannot allocate CPU memory for b\n"); 
    return (-1);
    }
if (!(c = (float*) malloc (n * n * sizeof (float)))) {
    fprintf (stderr, "Cannot allocate CPU memory for c\n"); 
    return (-1);
    }

for (i = 0; i < n; i++)
    for (j = 0; j < n; j++)
        a[i*n + j] = (float)(i + j + 1);

for (i = 0; i < n; i++)
    for (j = 0; j < n; j++)
        b[i*n + j] = 1 / (float)(i + j + 1);

printf ("Input matrix A:\n");
for (i = 0; i < n; i++) { 
    for (j = 0; j < n; j++)
        printf ("%8.3g ", a[i*n + j]);
    printf ("\n");
}
printf ("\n");

printf ("Input matrix B:\n");
for (i = 0; i < n; i++) { 
    for (j = 0; j < n; j++)
        printf ("%8.3g ", b[i*n + j]);
    printf ("\n");
}   
printf ("\n");

if (mult1_host (a, b, c, n) < 0) {
    return (-1);
}

printf ("Matrix product of A and B (CUDA):\n");
for (i = 0; i < n; i++) { 
    for (j = 0; j < n; j++)
        printf ("%8.3g ", c[i*n + j]);
    printf ("\n");
}
printf ("\n");

for (i = 0; i < n; i++)
    for (j = 0; j < n; j++) {
        c[i*n +j] = 0;
        for (k = 0; k < n; k++)
            c[i*n + j] += a[i*n + k] * b[k*n + j];
    }

printf ("Matrix product of A and B (Check):\n");
for (i = 0; i < n; i++) { 
    for (j = 0; j < n; j++)
        printf ("%8.3g ", c[i*n + j]);
    printf ("\n");
}

free (a);
free (b);
free (c);

    return (0);
}

And it gives:

[0, 0]: Thread[0][0] from block[0][0]:
 rw = 0, cl = 0, sum = 4.000000

[0, 1]: Thread[1][0] from block[0][0]:
 rw = 0, cl = 1, sum = 2.716667

[0, 2]: Thread[2][0] from block[0][0]:
 rw = 0, cl = 2, sum = 2.100000

[0, 3]: Thread[3][0] from block[0][0]:
 rw = 0, cl = 3, sum = 1.721429

[0, 16]: Thread[0][1] from block[0][0]:
 rw = 1, cl = 0, sum = 6.083333

[0, 17]: Thread[1][1] from block[0][0]:
 rw = 1, cl = 1, sum = 4.000000
[0, 18]: Thread[2][1] from block[0][0]:
 rw = 1, cl = 2, sum = 3.050000

[0, 19]: Thread[3][1] from block[0][0]:
 rw = 1, cl = 3, sum = 2.480952

[0, 32]: Thread[0][2] from block[0][0]:
 rw = 2, cl = 0, sum = 8.166666
[0, 33]: Thread[1][2] from block[0][0]:
 rw = 2, cl = 1, sum = 5.283333

[0, 34]: Thread[2][2] from block[0][0]:
 rw = 2, cl = 2, sum = 4.000000

[0, 35]: Thread[3][2] from block[0][0]:
 rw = 2, cl = 3, sum = 3.240476
[0, 48]: Thread[0][3] from block[0][0]:
 rw = 3, cl = 0, sum = 10.250000

[0, 49]: Thread[1][3] from block[0][0]:
 rw = 3, cl = 1, sum = 6.566667

[0, 50]: Thread[2][3] from block[0][0]:
 rw = 3, cl = 2, sum = 4.950000

[0, 51]: Thread[3][3] from block[0][0]:
 rw = 3, cl = 3, sum = 4.000000

Matrix product of A and B (CUDA):
       4     2.72      2.1     1.72 
    6.08        0        0        0 
       0        0        0        0 
       0        0        0        0 

Time spent executing on the GPU: 0.06 millseconds

Matrix product of A and B (Check):
       4     2.72      2.1     1.72 
    6.08        4     3.05     2.48 
    8.17     5.28        4     3.24 
    10.2     6.57     4.95        4

Thanks!

If you have a compute capability 2.0 or later device, it's possible to do printf() directly from kernel code. If you have an earlier device, there is a cuPrintf approach that will work similarly. — Robert Crovella
It might be better if you post the entire code. For example, I'd like to know what you are passing for n, as well as what your kernel invocation looks like, as well as the malloc operations for the various data sets. — Robert Crovella
Also, in your kernel code, I think you will want a statement like: if ((rw<n) && (cl<n)){ after your rw and cl definitions, but before the rest of the kernel code, and add an extra } to close the braces at the end of the kernel. You need to make sure only valid threads are manipulating data. Threads outside of your row and column dimensions are not valid, but if you let them execute they will corrupt your data. — Robert Crovella
Thanks for reply and for the one with cuPrintf()! Actually I've tried to restrict the range of threads, but now it turned out that the mistake is not in the kernel (with the help of cuPrintf() it became evident). Now I definately know that each thread calculates its element correctly. But for some reason the data which I recieve after cudaMemcpy is wrong (I will update my post). — Max

Robert Crovella Robert Crovella · Accepted Answer · 2012-11-28T20:58:10

Instead of this at the end of your kernel:

    }

    c[rw + cl] = sum;
}

Do this:

    c[(rw*n) + cl] = sum;
    }
}

Note the differences:

(I told you to put the exra closing brace at the end of your kernel). By leaving that statement outside of limit on threads, you are letting garbage threads write to locations in your array.
You are not indexing into your 1D array correctly. The way you had it written, the final write to c for matrix location (0, 2) and (2, 0) would go to the same location (rw + cl).

Matrix Multiplication using CUDA (how threads are numbered)

1 Answers