Integral Image or Summed Area Table of 2D matrix using CUDA C

Question

I am trying to compute a Summed Area Table for a 2D matrix where the number of rows and columns are not equal. I have run into a slight problem where my code seems to function okay where the rows and columns are equal, but it fails to compute the last row of the final output when the rows and columns are not equal. The problem is I can't figure out why this is happening.

Basic Algorithm for Integral Image/Summed Area Table:

Basically, in an Integral Sum every pixel or index element computes the sum of all the matrix elements above and behind it. For instance for a 3x2 input array with the following elements:

 [5, 2|
 |5, 2|  
 |5, 2]

The Integral Sum in the output array would be as:

 [5,   7|
 |10, 14|  
 |15, 21]

Basically the following is what I am trying to do in CUDA C:

for(int matrixElement_y_index=0; matrixElement_y_index<=total_rows-1; matrixElement_y_index++)
{
    //matrixElement_x_index and matrixElement_y_index represent (x,y) indices of each matrix element
    for(int matrixElement_x_index=0; matrixElement_x_index<=total_columns-1; matrixElement_x_index++)
    {
        int temp=0; 

        for(int r=0;r<=(matrixElement_y_index);r++)
        {
            for(int c=0; c<=matrixElement_x_index;c++)
            {
                temp=temp+input[c][r];
            }
        }

        output[matrixElement_y_index][matrixElement_x_index]=temp;
    }
}

The CUDA C code that I have come up with so far is as follows:

#include <iostream>
#include <cuda_runtime.h>

using namespace std;

__global__ void image_integral(int *a, int*b, int width_x,int width_y)
{
    // Thread Ids equal to block Ids because the each blocks contains one thread only.
    int gidx = blockIdx.x;
    int gidy = blockIdx.y;
    int temp=0;

    if(gidx>=width_x || gidy>=width_y)
    {
    //Return the threads which exceed the input array's X or Y dimension.
        return;
    }

    else
    //Compute the Integral Image or Summed Area Table
    {   
        // The first loop iterates from zero to the Y index of the thread which represents the corresponding element of the output/input array.  
        for(int counter=0;counter<=gidy;counter++)
        {
            // The first loop iterates from zero to the X index of the thread which represents the corresponding element of the output/input array  
            for(int counter_two=0; counter_two<=gidx; counter_two++)
            {
                temp = temp+a[counter*width_x+counter_two];
            }
        }
    }

    //Transfer the final result to the output array
    b[gidy*width_x+gidx]=temp;
}

void main()
{
    //M is number of rows
    //N is number of columns

    int M=3,N=2, m_e=0;
    int total_e=M*N;
    int widthstep=total_e*sizeof(int);

    int * matrix_a= (int *)malloc(widthstep);
    int * matrix_b= (int *)malloc(widthstep);

    cout<<"Enter elements for "<< M<<"x"<<N<<" matrix";

    for(int r=0;r<=M-1;r++)
    {
        for(int c=0; c<=N-1;c++)
        {
            cout<<"Enter Matrix element [ "<<c<<","<<r<<"]";
            cin>>m_e;
            matrix_a[r*M+c]=m_e;
            matrix_b[r*M+c]=0;
        }
    }

    int * d_matrix_a, * d_matrix_b;

    cout<<"Input:"<<endl;

    for(int kk=0;kk<=M-1;kk++)
    {
        for(int jj=0;jj<=N-1;jj++){
            cout<<matrix_a[kk*M+jj]<<" ";}
        cout<<endl;
    }

    cout<<endl;

    cudaMalloc(&d_matrix_a,widthstep);
    cudaMalloc(&d_matrix_b,widthstep);

    cudaMemcpy(d_matrix_a,matrix_a,widthstep,cudaMemcpyHostToDevice);
    cudaMemcpy(d_matrix_b,matrix_b,widthstep,cudaMemcpyHostToDevice);

    //Creating a grid where the number of blocks are equal to the number of pixels or input matrix elements.

    //Each block contains only one thread.

    dim3 grid(M,N); 

    image_integral<<<grid,1>>>(d_matrix_a, d_matrix_b,M,N);

    cudaThreadSynchronize();

    cudaMemcpy(matrix_b,d_matrix_b,widthstep,cudaMemcpyDeviceToHost);

    cout<<"The Summed Area table is: "<<endl;

    for(int kk=0;kk<=M-1;kk++)
    {
        for(int jj=0;jj<=N-1;jj++)
            cout<<matrix_b[kk*M+jj]<<" ";
        cout<<endl;
    }

    system("pause");

    cudaFree(d_matrix_a);
    cudaFree(d_matrix_b);
    free(matrix_a);
    free(matrix_b);
}

Many thanks!!

hubs hubs · Accepted Answer · 2014-03-06T07:59:25

Your main problem is a wrong memory usage and storage. With your code you also corrupted the heap! I rechanged your code by using row-major ordering, as it's usually used in c/c++.

Your first error occurs when you write the inputs into host memory matrix_a[r*M+c]. Because r range is from 0..M(3) and c range is from 0..N(2) the maximum index is 2*3+1=7. But your matrix only have 6 elements - maximum index is 5! Therefore I rechanged all matrix accesses.

With that changes I have to fit your grid setup, too. Now it's dim3 grid(N,M);.

If it's uncertain for you what a variable represents or how to use it, use good representing names for them, as you did it in the c reference code!

With that changes your code work for me. Be aware, the way of input the matrix has changed, too!

Above the changed complete code: Kernel function:

__global__ void image_integral(int *a, int*b, int rowsTotal,int colsTotal)
{
    // Thread Ids equal to block Ids because the each blocks contains one thread only.
    int col = blockIdx.x;
    int row = blockIdx.y;
    int temp=0;

    if(col < colsTotal && row < rowsTotal)
    {
        // The first loop iterates from zero to the Y index of the thread which represents the corresponding element of the output/input array.  
        for(int r=0;r<=row;r++)
        {
            // The second loop iterates from zero to the X index of the thread which represents the corresponding element of the output/input array  
            for(int c=0; c<=col; c++)
            {
                temp = temp+a[r*colsTotal+c];
            }
        }
    }

    //Transfer the final result to the output array
    b[row*colsTotal+col]=temp;
}

The host implementation:

void main()
{
    //M is number of rows
    //N is number of columns

    int M=3,N=2, m_e=0;
    int total_e=M*N;
    int widthstep=total_e*sizeof(int);

    int * matrix_a= (int *)malloc(widthstep);
    int * matrix_b= (int *)malloc(widthstep);

    cout<<"Enter elements for "<< M<<"x"<<N<<" matrix";

    for(int r=0;r<M;r++)
    {
        for(int c=0; c<N;c++)
        {
            cout<<"Enter Matrix element [ "<<r<<","<<c<<"]";
            cin>>m_e;
            matrix_a[r*N+c]=m_e;
            matrix_b[r*N+c]=0;
        }
    }

    int * d_matrix_a, * d_matrix_b;

    cout<<"Input:"<<endl;

    for(int r=0;r<M;r++)
    {
        for(int c=0; c<N;c++)
        {
            cout << matrix_a[r*N+c]<<" ";
        }
        cout << endl;
    }

    cout<<endl;

    cudaMalloc(&d_matrix_a,widthstep);
    cudaMalloc(&d_matrix_b,widthstep);

    cudaMemcpy(d_matrix_a,matrix_a,widthstep,cudaMemcpyHostToDevice);
    cudaMemcpy(d_matrix_b,matrix_b,widthstep,cudaMemcpyHostToDevice);

    //Creating a grid where the number of blocks are equal to the number of pixels or input matrix elements.

    //Each block contains only one thread.

    dim3 grid(N,M);

    image_integral<<<grid,1>>>(d_matrix_a, d_matrix_b,M,N);

    cudaThreadSynchronize();

    cudaMemcpy(matrix_b,d_matrix_b,widthstep,cudaMemcpyDeviceToHost);

    cout<<"The Summed Area table is: "<<endl;

    for(int r=0;r<M;r++)
    {
        for(int c=0; c<N;c++)
        {
            cout << matrix_b[r*N+c]<<" ";
        }
        cout << endl;
    }

    system("pause");

    cudaFree(d_matrix_a);
    cudaFree(d_matrix_b);
    free(matrix_a);
    free(matrix_b);
}

Integral Image or Summed Area Table of 2D matrix using CUDA C

Basic Algorithm for Integral Image/Summed Area Table:

1 Answers