2
votes

I need to transpose a square matrix. I test the program with matrix: a[i][j] = 0 if i>j, a[i][j] = if i<=j, but the result shows that not all elements are in the right places.

Here's the code (except for main()):

#include <stdio.h> 
#include <stdlib.h>
__global__ void transpose_kernel (float *a, float *b, int n) {
    unsigned int ax = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int ay = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int aIdx = ax + n * ay;
    unsigned int bIdx = ay + n * ax;

    b[bIdx] = a[aIdx];
}

int transpose_host (float *a, float *b, int n) {
    int size = n * n * sizeof (float);
    float *aDev = NULL, *bDev = NULL;

    cudaError_t cuerr = cudaMalloc ((void**)&aDev, size);
    if (cuerr != cudaSuccess) {
        fprintf (stderr, "Cannot allocate GPU memory for aDev: %s\n", cudaGetErrorString (cuerr));
        return (-1);
    }

cuerr = cudaMalloc ((void**)&bDev, size);
if (cuerr != cudaSuccess) {
    fprintf (stderr, "Cannot allocate GPU memory for bDev: %s\n", cudaGetErrorString (cuerr));
    return (-1);
}

dim3 blockSize = dim3 (16, 16, 1);
dim3 gridSize = dim3 (n/16 + 1, n/16 + 1, 1);

cuerr = cudaMemcpy (aDev, a, size, cudaMemcpyHostToDevice);
if (cuerr != cudaSuccess) {
    fprintf (stderr, "Cannot copy data from a to aDev: %s\n", cudaGetErrorString (cuerr));
    return (-1);
}

transpose_kernel <<< gridSize, blockSize >>> (aDev, bDev, n);

cuerr = cudaGetLastError ();
if (cuerr != cudaSuccess) {
    fprintf (stderr, "Cannot launch CUDA kernel: %s\n", cudaGetErrorString (cuerr));
    return (-1);
}

cuerr = cudaDeviceSynchronize ();
if (cuerr != cudaSuccess) {
    fprintf (stderr, "Cannot synchronize CUDA kernel: %s\n", cudaGetErrorString (cuerr));
    return (-1);
}

cuerr = cudaMemcpy (b, bDev, size, cudaMemcpyDeviceToHost);
if (cuerr != cudaSuccess) {
    fprintf (stderr, "Cannot copy data from b to bDev: %s\n", cudaGetErrorString (cuerr));
    return (-1);
}

cudaFree (aDev);
cudaFree (bDev);

    return (0);
}

Why isn't my array transposing correctly?

1

1 Answers

2
votes

The problem is in "extra" threads going outside the allocated array.

When you assign your grid block, you round up (in fact, enforce rounding to the next integer even when things divide evenly:)

dim3 gridSize = dim3 (n/16 + 1, n/16 + 1, 1);

so that there are always threads there whose ax or ay fall outside of [0,n). So when you copy a[aIdx] into b[bIdx] anyway, you are copying random data into memory, and in fact potentially overwriting "real" data depending on scheduling.

You can fix this by changing your kernel to check for this:

if (ax < n && ay < n)
    b[bIdx] = a[aIdx];

and you may want to change your rounding of the grid size to not round up if things divide evenly:

dim3 gridSize = dim3 ((n+15)/16, (n+15)/16, 1);