I am trying to compute a Summed Area Table for a 2D matrix where the number of rows and columns are not equal. I have run into a slight problem where my code seems to function okay where the rows and columns are equal, but it fails to compute the last row of the final output when the rows and columns are not equal. The problem is I can't figure out why this is happening.
Basic Algorithm for Integral Image/Summed Area Table:
Basically, in an Integral Sum every pixel or index element computes the sum of all the matrix elements above and behind it. For instance for a 3x2 input array with the following elements:
[5, 2|
|5, 2|
|5, 2]
The Integral Sum in the output array would be as:
[5, 7|
|10, 14|
|15, 21]
Basically the following is what I am trying to do in CUDA C:
for(int matrixElement_y_index=0; matrixElement_y_index<=total_rows-1; matrixElement_y_index++)
{
//matrixElement_x_index and matrixElement_y_index represent (x,y) indices of each matrix element
for(int matrixElement_x_index=0; matrixElement_x_index<=total_columns-1; matrixElement_x_index++)
{
int temp=0;
for(int r=0;r<=(matrixElement_y_index);r++)
{
for(int c=0; c<=matrixElement_x_index;c++)
{
temp=temp+input[c][r];
}
}
output[matrixElement_y_index][matrixElement_x_index]=temp;
}
}
The CUDA C code that I have come up with so far is as follows:
#include <iostream>
#include <cuda_runtime.h>
using namespace std;
__global__ void image_integral(int *a, int*b, int width_x,int width_y)
{
// Thread Ids equal to block Ids because the each blocks contains one thread only.
int gidx = blockIdx.x;
int gidy = blockIdx.y;
int temp=0;
if(gidx>=width_x || gidy>=width_y)
{
//Return the threads which exceed the input array's X or Y dimension.
return;
}
else
//Compute the Integral Image or Summed Area Table
{
// The first loop iterates from zero to the Y index of the thread which represents the corresponding element of the output/input array.
for(int counter=0;counter<=gidy;counter++)
{
// The first loop iterates from zero to the X index of the thread which represents the corresponding element of the output/input array
for(int counter_two=0; counter_two<=gidx; counter_two++)
{
temp = temp+a[counter*width_x+counter_two];
}
}
}
//Transfer the final result to the output array
b[gidy*width_x+gidx]=temp;
}
void main()
{
//M is number of rows
//N is number of columns
int M=3,N=2, m_e=0;
int total_e=M*N;
int widthstep=total_e*sizeof(int);
int * matrix_a= (int *)malloc(widthstep);
int * matrix_b= (int *)malloc(widthstep);
cout<<"Enter elements for "<< M<<"x"<<N<<" matrix";
for(int r=0;r<=M-1;r++)
{
for(int c=0; c<=N-1;c++)
{
cout<<"Enter Matrix element [ "<<c<<","<<r<<"]";
cin>>m_e;
matrix_a[r*M+c]=m_e;
matrix_b[r*M+c]=0;
}
}
int * d_matrix_a, * d_matrix_b;
cout<<"Input:"<<endl;
for(int kk=0;kk<=M-1;kk++)
{
for(int jj=0;jj<=N-1;jj++){
cout<<matrix_a[kk*M+jj]<<" ";}
cout<<endl;
}
cout<<endl;
cudaMalloc(&d_matrix_a,widthstep);
cudaMalloc(&d_matrix_b,widthstep);
cudaMemcpy(d_matrix_a,matrix_a,widthstep,cudaMemcpyHostToDevice);
cudaMemcpy(d_matrix_b,matrix_b,widthstep,cudaMemcpyHostToDevice);
//Creating a grid where the number of blocks are equal to the number of pixels or input matrix elements.
//Each block contains only one thread.
dim3 grid(M,N);
image_integral<<<grid,1>>>(d_matrix_a, d_matrix_b,M,N);
cudaThreadSynchronize();
cudaMemcpy(matrix_b,d_matrix_b,widthstep,cudaMemcpyDeviceToHost);
cout<<"The Summed Area table is: "<<endl;
for(int kk=0;kk<=M-1;kk++)
{
for(int jj=0;jj<=N-1;jj++)
cout<<matrix_b[kk*M+jj]<<" ";
cout<<endl;
}
system("pause");
cudaFree(d_matrix_a);
cudaFree(d_matrix_b);
free(matrix_a);
free(matrix_b);
}
Many thanks!!