I'm trying to implement a reduction along the row direction of a 2D matrix. I'm starting from a code I found on stackoverflow (thanks a lot Robert!)
thrust::max_element slow in comparison cublasIsamax - More efficient implementation?
The above link shows a custom kernel that performs reduction on a single row. It divides the input row into many rows and each row has 1024 threads. Works very well.
For the 2D case, everything's the same except that now there's a y grid dimension. So each block's y dimension is still 1. The problem is that when I try to write data onto the shared memory within each block (within the "max_idx_kernel_reduction_within_block" kernel in the code), It takes very long (More than (# of Rows) * (Time it takes to perform reduction on 1 Row. I would rather run a for loop). I know I have a lot of elements but I was expecting something faster than that.
I don't think the memory access pattern is an issue, but I heard that the TOTAL amount of shared memory might be the limitation?? : CUDA: Is coalesced global memory access faster than shared memory? Also, does allocating a large shared memory array slow down the program?
Any suggestions to make my code faster (the first kernel is the bottleneck)? Thank you very much, very much appreciated!!
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <cuda_runtime.h>
#define NCOLS 163317 // number of columns
#define NROWS 8 // number of rows
#define nTPB 1024 // Threads per Block. nTPB should be a power-of-2
#define MAX_BLOCKS_X ((NCOLS/nTPB)+1) // # of blocks I will launch
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f // lowest anticipated number of the data. Values in array will be compared with this and updated with the larger one
#define IDX2F(i,j,ld) ((j-1) * (ld) + ((i) - 1)) // 1 based indexing
#define IDX2C(i,j,ld) ((j) * (ld) + i) // 0 based indexing
__device__ volatile float blk_vals[NROWS][MAX_BLOCKS_X];
__device__ volatile int blk_idxs[NROWS][MAX_BLOCKS_X];
// blk_vals and blk_idxs are the results obtained from reduction within each block.
// after 1st reduction, each row will have blk_vals[MAX_BLOCKS_X] array and blk_idxs[MAX_BLOCKS_X]
// these will be passed to the 2nd kernel
__global__ void max_idx_kernel_reduction_within_block(const float *data, const int xSize, const int ySize){ // first kernel. Reduction within blocks
__shared__ volatile float vals[nTPB]; // Total amount of shared memory per block: 49152 bytes (50 KB). 1024 gives ~ 4KB for single.
__shared__ volatile int idxs[nTPB]; // ~ 4 KB for single, when nTPB is 1024. each block will have both indices and values
int idx = threadIdx.x+blockDim.x * blockIdx.x; // idx in the x direction
float my_val = FLOAT_MIN; // lowest possible number
int my_idx = -1; // to check whether you DID perform the kernel. Again, it's the idx in the x dir.
// sweep from global memory
while (idx < xSize){ // this ensures you don't go out the size of the array's x direction
if (data[IDX2C(blockIdx.y,idx,ySize)] > my_val) {my_val = data[IDX2C(blockIdx.y,idx,ySize)]; my_idx = idx;}
// compare with my_val, and put the bigger value into my_val for next comparison. my_idx is 0 index based
idx += blockDim.x*gridDim.x;}
// until here takes about 6 ms !! very fast!!
// populate shared memory: takes ~ 270 ms
vals[threadIdx.x] = my_val; // put the computed max value for each thread into the shared memory. -> this is the bottleneck!!
idxs[threadIdx.x] = my_idx; // do this for index as well -> this is also slow!!
__syncthreads();
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1){
if (threadIdx.x < i) // the first half threads of the block
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
// the above is comparing shared memory of threadIdx.x with shared memory of threadIdx.x + i.
// then puts the larger value into shared memory of threadIdx.x
__syncthreads();} // so now in each block, shared memory's first element (index 0) is the max value and max value index
// perform block-level reduction
if (!threadIdx.x){ // at the shared memory, only the first element (index 0) (actually 2 elements in the first index. max value, and max value index) is what we need.
blk_vals[blockIdx.y][blockIdx.x] = vals[0]; // For each window (single x row), the first elements of the blocks are stored into the blk_vals[windowNumber][:]
// remember, this is a global variable.
blk_idxs[blockIdx.y][blockIdx.x] = idxs[0]; // and the max value index
__syncthreads();
}
}
// originally the following kernel was in the 1st kernel, performed by the last block. So just use one block for this.
__global__ void max_idx_kernel_final(int *result_maxInd, float *result_maxVal){
__shared__ volatile float vals[nTPB]; // Total amount of shared memory per block: 49152 bytes (50 KB). 1024 gives ~ 4KB for single.
__shared__ volatile int idxs[nTPB]; // ~ 4 KB for single, when nTPB is 1024. each block will have these variables!! (vals and idxs)
int idx = threadIdx.x;
float my_val = FLOAT_MIN;
int my_idx = -1; // remember, these are local variables, so each thread has this variable. This local variable is independent from other thread's local variable
while (idx < MAX_BLOCKS_X ){ // ?? confused whether it should be gridDim.x (actual # of blocks launched) or MAX_BLOCKS_X (# of elements in x dir of the global array blk_vals)
if (blk_vals[blockIdx.y][idx] > my_val)
{my_val = blk_vals[blockIdx.y][idx]; my_idx = blk_idxs[blockIdx.y][idx]; }
idx += blockDim.x;} // all threads in this single block (single in the x dir) are working, so you should loop over blockDim.x.
// Imagine where gridDim.x (# of blocks) is huge so that you need to loop over to get the max value and index
// After this, each thread in the block has a local variable (max value and max value index).
// So far it was sort of a reduction, but instead of pairing values we just looped over the blk_vals and blk_idxs
// populate shared memory
vals[threadIdx.x] = my_val; // This is now shared memory. This is because reduction requires comparison between different elements
idxs[threadIdx.x] = my_idx; // my_idx value is 0 based. This is done for all blocks (in the y direction)
__syncthreads();
// Now the final task is to do reduction for all threads in our single block (single block in the x dir, NROWS blocks in the y dir)!
// sweep in shared memory
for (int i = (nTPB>>1); i > 0; i>>=1) {
if (threadIdx.x < i) // the first half threads of the block
if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
__syncthreads();} // now all the results are in threadIdx.x == 0 for each block (there are NROWS blocks in the y dir)
// 0th thread. the results are in shared memory, not the local memory, so any thread could do the following. We just selected the 0th thread for no reason. If several threads try to do this, that would be a problem, since we'll have to wait for them
if(!threadIdx.x){
result_maxInd[blockIdx.y] = idxs[0]; // the final result for each row goes into the corresponding position (blockIdx.y)
result_maxVal[blockIdx.y] = vals[0];
}
}
int main(){
dim3 grids(MAX_BLOCKS_X, NROWS);
dim3 threads(nTPB,1);
dim3 grids2(1,NROWS);
dim3 threads2(nTPB);
float *d_vector, *h_vector;
h_vector = (float*)malloc(NROWS * NCOLS * sizeof(float));
for (int j = 1; j <= NCOLS; j++) {
for (int i = 1; i <= NROWS; i++) {
h_vector[IDX2F(i,j,NROWS)] = (float) (rand()/(float)RAND_MAX);
}
}
h_vector[IDX2F(2,5,NROWS)] = 10; // create definite max element
cudaMalloc(&d_vector, NROWS * NCOLS * sizeof(float));
cudaMemcpy(d_vector, h_vector, NROWS * NCOLS * sizeof(float), cudaMemcpyHostToDevice);
//d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
int *max_index;
float *max_val;
int *d_max_index;
float *d_max_val;
max_index = (int*)malloc(NROWS * sizeof(int));
max_val = (float*)malloc(NROWS * sizeof(float));
cudaMalloc((void**)&d_max_index, NROWS * sizeof(int));
cudaMalloc((void**)&d_max_val, NROWS * sizeof(float));
max_idx_kernel_reduction_within_block<<<grids, threads>>>(d_vector, NCOLS, NROWS);
max_idx_kernel_final<<<grids2,threads2>>>(d_max_index, d_max_val);
cudaMemcpy(max_index, d_max_index, NROWS * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(max_val, d_max_val, NROWS * sizeof(float), cudaMemcpyDeviceToHost);
for(int z=0;z<20;z++)
printf("%d ",max_index[z]);
printf("\n\n\n");
for(int z=0;z<20;z++)
printf("%f ",max_val[z]);
return 0;
}