I was trying to catch a mistake in my program which multiplies square matrices using CUDA. As long as it's not possible to printf() from kernel (please tell if you know how to do it), I send an additional structure to the kernel to get the numbers of threads, blocks, rows and columns. All the elements of the structure are -1 (to identify them). So, the matrix product are partially incorrect but the the data from kernel totally confused me. Here's the code:
typedef struct {
int tx;
int ty;
int bx;
int by;
int rw;
int cl;
} THREADS;
__global__ void mult1_kernel (float *a, float *b, float *c, THREADS *d, int n) {
int k;
float sum = 0.0f;
int rw = blockIdx.y * blockDim.y + threadIdx.y;
int cl = blockIdx.x * blockDim.x + threadIdx.x;
for (k = 0; k < n; k++)
sum += a[rw*n + k] * b[k*n + cl];
c[rw*n + cl] = sum;
d[rw*n + cl].tx = threadIdx.y;
d[rw*n + cl].ty = threadIdx.x;
d[rw*n + cl].bx = blockIdx.y;
d[rw*n + cl].by = blockIdx.x;
d[rw*n + cl].rw = rw;
d[rw*n + cl].cl = cl;
}
...
dim3 blockSize = dim3 (16, 16);
dim3 gridSize = dim3 ((n+15)/16, (n+15)/16);
Now what is gives:
Input matrix A:
1 2 3 4
2 3 4 5
3 4 5 6
4 5 6 7
Input matrix B:
1 0.5 0.333 0.25
0.5 0.333 0.25 0.2
0.333 0.25 0.2 0.167
0.25 0.2 0.167 0.143
Time spent executing on the GPU: 0.05 millseconds
Matrix product of A and B (CUDA):
4 2.72 2.1 1.72
6.08 4 3.05 2.48
4 3.05 2.48 2.1
3.22 2.6 2.19 1.89
[ThreadIdx.x][ThreadIdx.y]:
[0][0] [0][1] [0][2] [0][3]
[1][0] [1][1] [-1][-1] [-1][-1]
[-1][-1] [-1][-1] [-1][-1] [-1][-1]
[-1][-1] [-1][-1] [-1][-1] [-1][-1]
[BlockIdx.x][BlockIdx.y]:
[0][0] [0][0] [0][0] [0][0]
[0][0] [-1][-1] [-1][-1] [-1][-1]
[-1][-1] [-1][-1] [-1][-1] [-1][-1]
[-1][-1] [-1][-1] [-1][-1] [-1][-1]
[rw][cl]:
[0][0] [0][1] [0][2] [0][3]
[1][0] [-1][-1] [-1][-1] [-1][-1]
[-1][-1] [-1][-1] [-1][-1] [-1][-1]
[-1][-1] [-1][-1] [-1][-1] [-1][-1]
Matrix product of A and B (Correct result):
4 2.72 2.1 1.72
6.08 4 3.05 2.48
8.17 5.28 4 3.24
10.2 6.57 4.95 4
So each thread writes its threadIdx, blockIdx, numbers of row and columns but most of the values hadn't been changed since their initialization. Please help me find out how it all works.
Part 2. Each thread outputs correct sum, but after copying data from device memory to host some elements become zeros. Here's the code:
#include <stdio.h>
#include <stdlib.h>
#include "cuPrintf.cuh"
#include "cuPrintf.cu"
__global__ void mult1_kernel (float *a, float *b, float *c, int n) {
int k;
float sum = 0.0f;
int rw = blockIdx.y * blockDim.y + threadIdx.y;
int cl = blockIdx.x * blockDim.x + threadIdx.x;
if ((rw < n) && (cl < n)) {
rw *= n;
for (k = 0; k < n; k++)
sum += a[rw + k] * b[k*n + cl];
cuPrintf ("Thread[%d][%d] from block[%d][%d]:\n rw = %d, cl = %d, sum = %f\n\n",
threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, rw/n, cl, sum);
}
c[rw + cl] = sum;
}
__host__ int mult1_host (float *a, float *b, float *c, int n) {
cudaEvent_t start, stop;
cudaError_t cuerr;
int i, j;
int size = n * n * sizeof (float);
float gpuTime = 0.0f;
float *aDev = NULL, *bDev = NULL, *cDev = NULL;
cuerr = cudaMalloc ((void**)&aDev, size);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot allocate GPU memory for aDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cuerr = cudaMalloc ((void**)&bDev, size);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot allocate GPU memory for bDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cuerr = cudaMalloc ((void**)&cDev, size);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot allocate GPU memory for cDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
dim3 blockSize = dim3 (16, 16);
dim3 gridSize = dim3 ((n+15)/16, (n+15)/16);
cuerr = cudaMemcpy (aDev, a, size, cudaMemcpyHostToDevice);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot copy data from a to aDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cuerr = cudaMemcpy (bDev, b, size, cudaMemcpyHostToDevice);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot copy data from a to bDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cudaPrintfInit ();
cudaEventCreate (&start);
cudaEventCreate (&stop);
cudaEventRecord (start, 0);
mult1_kernel <<< gridSize, blockSize >>> (aDev, bDev, cDev, n);
cudaEventRecord (stop, 0);
cudaEventSynchronize (stop);
cudaEventElapsedTime (&gpuTime, start, stop);
cuerr = cudaGetLastError ();
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot launch CUDA kernel: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cuerr = cudaDeviceSynchronize ();
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot synchronize CUDA kernel: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
cudaPrintfDisplay (stdout, true);
cudaPrintfEnd ();
cuerr = cudaMemcpy (c, cDev, size, cudaMemcpyDeviceToHost);
if (cuerr != cudaSuccess) {
fprintf (stderr, "Cannot copy data from c to cDev: %s\n", cudaGetErrorString (cuerr));
return (-1);
}
printf ("Time spent executing on the GPU: %.2f millseconds\n", gpuTime);
printf ("\n");
cudaEventDestroy (start);
cudaEventDestroy (stop);
cudaFree (aDev);
cudaFree (bDev);
cudaFree (cDev);
return (0);
}
int main () {
int i, j, k;
int n;
float *a, *b, *c;
printf ("Enter the matrix size: ");
scanf ("%d", &n);
printf ("Blocks = %d\n", n*n/512 + 1);
if (n < 1) {
printf ("Invalid matrix size\n");
return (-1);
}
if (!(a = (float*) malloc (n * n * sizeof (float)))) {
fprintf (stderr, "Cannot allocate CPU memory for a\n");
return (-1);
}
if (!(b = (float*) malloc (n * n * sizeof (float)))) {
fprintf (stderr, "Cannot allocate CPU memory for b\n");
return (-1);
}
if (!(c = (float*) malloc (n * n * sizeof (float)))) {
fprintf (stderr, "Cannot allocate CPU memory for c\n");
return (-1);
}
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
a[i*n + j] = (float)(i + j + 1);
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
b[i*n + j] = 1 / (float)(i + j + 1);
printf ("Input matrix A:\n");
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++)
printf ("%8.3g ", a[i*n + j]);
printf ("\n");
}
printf ("\n");
printf ("Input matrix B:\n");
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++)
printf ("%8.3g ", b[i*n + j]);
printf ("\n");
}
printf ("\n");
if (mult1_host (a, b, c, n) < 0) {
return (-1);
}
printf ("Matrix product of A and B (CUDA):\n");
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++)
printf ("%8.3g ", c[i*n + j]);
printf ("\n");
}
printf ("\n");
for (i = 0; i < n; i++)
for (j = 0; j < n; j++) {
c[i*n +j] = 0;
for (k = 0; k < n; k++)
c[i*n + j] += a[i*n + k] * b[k*n + j];
}
printf ("Matrix product of A and B (Check):\n");
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++)
printf ("%8.3g ", c[i*n + j]);
printf ("\n");
}
free (a);
free (b);
free (c);
return (0);
}
And it gives:
[0, 0]: Thread[0][0] from block[0][0]:
rw = 0, cl = 0, sum = 4.000000
[0, 1]: Thread[1][0] from block[0][0]:
rw = 0, cl = 1, sum = 2.716667
[0, 2]: Thread[2][0] from block[0][0]:
rw = 0, cl = 2, sum = 2.100000
[0, 3]: Thread[3][0] from block[0][0]:
rw = 0, cl = 3, sum = 1.721429
[0, 16]: Thread[0][1] from block[0][0]:
rw = 1, cl = 0, sum = 6.083333
[0, 17]: Thread[1][1] from block[0][0]:
rw = 1, cl = 1, sum = 4.000000
[0, 18]: Thread[2][1] from block[0][0]:
rw = 1, cl = 2, sum = 3.050000
[0, 19]: Thread[3][1] from block[0][0]:
rw = 1, cl = 3, sum = 2.480952
[0, 32]: Thread[0][2] from block[0][0]:
rw = 2, cl = 0, sum = 8.166666
[0, 33]: Thread[1][2] from block[0][0]:
rw = 2, cl = 1, sum = 5.283333
[0, 34]: Thread[2][2] from block[0][0]:
rw = 2, cl = 2, sum = 4.000000
[0, 35]: Thread[3][2] from block[0][0]:
rw = 2, cl = 3, sum = 3.240476
[0, 48]: Thread[0][3] from block[0][0]:
rw = 3, cl = 0, sum = 10.250000
[0, 49]: Thread[1][3] from block[0][0]:
rw = 3, cl = 1, sum = 6.566667
[0, 50]: Thread[2][3] from block[0][0]:
rw = 3, cl = 2, sum = 4.950000
[0, 51]: Thread[3][3] from block[0][0]:
rw = 3, cl = 3, sum = 4.000000
Matrix product of A and B (CUDA):
4 2.72 2.1 1.72
6.08 0 0 0
0 0 0 0
0 0 0 0
Time spent executing on the GPU: 0.06 millseconds
Matrix product of A and B (Check):
4 2.72 2.1 1.72
6.08 4 3.05 2.48
8.17 5.28 4 3.24
10.2 6.57 4.95 4
Thanks!
printf()
directly from kernel code. If you have an earlier device, there is a cuPrintf approach that will work similarly. – Robert Crovellaif ((rw<n) && (cl<n)){
after your rw and cl definitions, but before the rest of the kernel code, and add an extra}
to close the braces at the end of the kernel. You need to make sure only valid threads are manipulating data. Threads outside of your row and column dimensions are not valid, but if you let them execute they will corrupt your data. – Robert Crovella