getting the wrong output with CUDA when using more than one block

Question

I am trying to develop an implementation of the FFT in CUDA using visual studio 2010, so far I've made it work for as far as 1024 points inside one block. The issue is that whenever I use more than one block the results for Block 1 will be ok and the others will return a wrong value (doesn't seem random, they don't change in multiple runs.) Here is my kernel

__device__ void FFT(int idxS,int bfsize, Complex* data1, Complex* data0, int k, int N ){
        Complex alpha;
        if((idxS % bfsize) < (bfsize/2)){
            data1[idxS] = ComplexAdd(data0[idxS],data0[idxS+bfsize/2]);
        }
        else
        {
            float angle = -PI*2*((idxS*(1<<k)%(bfsize/2)))/N;
            alpha.x = cos(angle);
            alpha.y= sin(angle);
            Complex v0;
            v0 = ComplexAdd(data0[idxS-bfsize/2] ,ComplexScale(data0[idxS],-1));
            data1[idxS] = ComplexMul(v0, alpha);
        }
       }

__device__ void Ordenador(int r, int idxS ,Complex* data1, Complex* data0 ){
    int p = 0;
    for(int k = 0;k < r;k++)
       {
          if(idxS & (1<<k))
          p+=1<<(r - k - 1);
        }
    data1[idxS] = data0[p];
    __syncthreads();
}


__global__ void GPU_FFT(int N, int r, Complex* data0, Complex* data1, int k) {
    int idxS = threadIdx.x+ blockIdx.x * blockDim.x;
        __syncthreads;
        int bfsize = 1<<(r - k);
        FFT(idxS, bfsize,  data1,  data0, k, N);
        data0[idxS] = data1[idxS];
   }
int prepFFT(float *Entrada, Complex* saida, int N ){
    if(ceilf(log2((float)N)) == log2((float)N) ){
        for (int i=0; i<N; i++){
            saida[i].x = Entrada[i];
            saida[i].y = 0;
        }
        Complex *d_saida;
        int m = (int)log2((float)N);
        Complex *data1 = new Complex[N];
        Complex *data1_d;
        if (N<1024){
        HANDLE_ERROR (cudaMalloc((void**)&d_saida,   sizeof(Complex) * N));
        HANDLE_ERROR (cudaMemcpy(d_saida,saida, sizeof(Complex)*N, cudaMemcpyHostToDevice));
        HANDLE_ERROR (cudaMalloc((void**)&data1_d,   sizeof(Complex) * N));
        HANDLE_ERROR (cudaMemcpy(data1_d,data1, sizeof(Complex)*N, cudaMemcpyHostToDevice));
        const dim3 numThreads (N,1,1);
        const dim3 numBlocks(1,1,1);
            for(int k = 0 ;k < m ; k++)
    {
        GPU_FFT<<<numBlocks,numThreads, N*2>>>( N, m, d_saida, data1_d, k);
        HANDLE_ERROR (cudaDeviceSynchronize()); 
    }
        HANDLE_ERROR (cudaDeviceSynchronize()); 
        HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
        HANDLE_ERROR (cudaDeviceSynchronize());
        }
        else{
        HANDLE_ERROR (cudaMalloc((void**)&d_saida,   sizeof(Complex) * N));
        HANDLE_ERROR (cudaMemcpy(d_saida,saida, sizeof(Complex)*N, cudaMemcpyHostToDevice));
        HANDLE_ERROR (cudaMalloc((void**)&data1_d,   sizeof(Complex) * N));
        HANDLE_ERROR (cudaMemcpy(data1_d,data1, sizeof(Complex)*N, cudaMemcpyHostToDevice));
        const dim3 numThreads (1024,1,1);
        const dim3 numBlocks(N/1024 +1,1,1);
            for(int k = 0;k < m;k++)
    {
        GPU_FFT<<<numBlocks,numThreads, N*2>>>( N, m, d_saida, data1_d, k);
        HANDLE_ERROR (cudaDeviceSynchronize()); 
    }
        HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
        HANDLE_ERROR (cudaDeviceSynchronize());     
        cudaFree(data1_d);
        cudaFree(d_saida);
        delete data1;

        }
        return 1;
    }
    else
        return 0;
}

I've tried using Shared memory, however it would return all 0s and I figured CUDA wasn't copying from global to shared ( NSight would tell me that the value for that memory position was ????). This code should be just a proof of concept for now, doesn't have to be optimized, just return the right values. If you guys need the full code I'll provide it. I've been searching for a solution for this for over a month now, this is my desperate call.

Thanks, John

------- Update --------

I changed the code for debugging purposes launching 2 threads in each of the 2 blocks.

int prepFFT(float *Entrada, Complex* saida, int N ){
    if(ceilf(log2((float)N)) == log2((float)N) ){
        for (int i=0; i<N; i++){
            saida[i].x = Entrada[i];
            saida[i].y = 0;
        }
        Complex *d_saida;
        int m = (int)log2((float)N);

        Complex *data1 = new Complex[N];
        Complex *data1_d;

        if (N<1024){
        HANDLE_ERROR (cudaMalloc((void**)&d_saida,   sizeof(Complex) * N));
        HANDLE_ERROR (cudaMemcpy(d_saida,saida, sizeof(Complex)*N, cudaMemcpyHostToDevice));
        HANDLE_ERROR (cudaMalloc((void**)&data1_d,   sizeof(Complex) * N));
        HANDLE_ERROR (cudaMemcpy(data1_d,data1, sizeof(Complex)*N, cudaMemcpyHostToDevice));
        const dim3 numThreads (2,1,1);
        const dim3 numBlocks(2,1,1);
            for(int k = 0 ;k < m ; k++)
    {
        GPU_FFT<<<numBlocks,numThreads, N*2>>>( N, m, d_saida, data1_d, k);
        HANDLE_ERROR (cudaDeviceSynchronize()); 
    }
        HANDLE_ERROR (cudaDeviceSynchronize()); 
        HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
        HANDLE_ERROR (cudaDeviceSynchronize());
        }
        else{
        HANDLE_ERROR (cudaMalloc((void**)&d_saida,   sizeof(Complex) * N));
        HANDLE_ERROR (cudaMemcpy(d_saida,saida, sizeof(Complex)*N, cudaMemcpyHostToDevice));
        HANDLE_ERROR (cudaMalloc((void**)&data1_d,   sizeof(Complex) * N));
        HANDLE_ERROR (cudaMemcpy(data1_d,data1, sizeof(Complex)*N, cudaMemcpyHostToDevice));
        const dim3 numThreads (1024,1,1);
        const dim3 numBlocks(N/1024 +1,1,1);
            for(int k = 0;k < m;k++)
    {
        GPU_FFT<<<numBlocks,numThreads, N*2>>>( N, m, d_saida, data1_d, k);
        HANDLE_ERROR (cudaDeviceSynchronize()); 
    }
        HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
        HANDLE_ERROR (cudaDeviceSynchronize());     
        cudaFree(data1_d);
        cudaFree(d_saida);
        delete data1;

        }
        return 1;
    }
    else
        return 0;

}

---------------------Edit 2 ---------------------

What is really weird is that when I use memcheck (in any mode) the program returns the right results.

----Final Edit ---------------

I found out that the problem was in this bit of code

FFT(idxS, bfsize,  data1,  data0, k, N);
data0[idxS] = data1[idxS];

I found out that separating the last line in a new function and calling it with the CPU produced correct results for me. Thank you for the help!! Best Regards!

I imagine you're doing this for learning purposes, but in case not, it seems appropriate to point out that there is a CUDA library (cufft) that will do FFT's for you. — Robert Crovella
yes, I am doing this for learning purposes, I'll use cufft for comparison later. Thanks for the heads up. — JLugao
You try running your code with cuda-memcheck in the failing case and see if it reports any access errors. — Robert Crovella
Perhaps you should work through the indexing using thread 317 in block 2 as an example, and see if you can spot any problems with the indexing in your arrays in global memory. — Robert Crovella
"when I use memcheck it returns the right result, however if I run without memcheck I get weird results" . This now sounds like a race condition. running with memcheck may affect the order in which blocks are executed. You may have a block-order dependency in your code. Try running memcheck with the racecheck tool. run cuda-memcheck --help to learn how. — Robert Crovella

Rodion Rodion · Accepted Answer · 2013-06-10T07:50:33

First of all you should be checking your main kernel function __global__ void GPU_FFTfor the issue

Just change it to this:

__global__ void GPU_FFT(int N, int r, Complex* data0, Complex* data1, int k) {
    int idxS = threadIdx.x+ blockIdx.x * blockDim.x;
        int bfsize = 1<<(r - k);
        //FFT(idxS, bfsize,  data1,  data0, k, N);
        //data0[idxS] = data1[idxS];
        if (idxS  <= N) data0[idxS] = idxS;
   }

What happens in the second block now?

If it's OK uncomment //FFT(idxS, bfsize, data1, data0, k, N);

and change last line to:

if (idxS <= N) data0[idxS] = data1[idxS];

What happens now? Still old error?

p.s. and you don't need __syncthreads; just after retrieving your thread indexes

upd.

if((idxS % bfsize) < (bfsize/2)){
__syncthreads;
...}

getting the wrong output with CUDA when using more than one block

1 Answers