My kernel's FLOP efficiency is very low

Question

I implemented a kernel for pressure gradients computations. Based on used algorithm and previous parts which I did, the ngb_list[] is random and I do not have coalesced memory access. However, the double precision FLOP efficiency of the kernel is 0.2% of the peak performance on TESLA K40. It seems very low ...!

Also:

Global Memory Load Efficiency: 45.05%
Global Memory Store Efficiency: 100.00%

Is there a way to improve the DP FLOP efficiency and Global Memory Load Efficiency?

Here you can find the code:

#include <cuda.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/sequence.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/execution_policy.h>
#include <iostream>
#include <time.h>
#include <cmath>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <numeric>

    typedef double  Float;

__global__ void pGrad_calculator(Float* pressure, Float* pressure_list,
                                 Float* interactionVectors_x, Float* interactionVectors_y, Float* interactionVectors_z,
                                 int* ngb_offset, int* ngb_size, int* ngb_list,
                                 Float* pressureGrad_x, Float* pressureGrad_y, Float* pressureGrad_z,
                                 int num){

    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < num){

        for (int j = ngb_offset[idx]; j < (ngb_offset[idx] + ngb_size[idx]); j++){
            Float pij = (pressure[idx] + pressure[ngb_list[j]]);
            pressureGrad_x[idx] += interactionVectors_x[j] * pij; 
            pressureGrad_y[idx] += interactionVectors_y[j] * pij;
            pressureGrad_z[idx] += interactionVectors_z[j] * pij;
        }
        pressureGrad_x[idx] *= 0.5;
        pressureGrad_y[idx] *= 0.5;
        pressureGrad_z[idx] *= 0.5;
    }
}

int main(){

    const int num = 1 << 20;
    const int tb = 1024;
    int bg = (num + tb - 1) / tb;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    //ngb_size
    thrust::device_vector<int> ngb_size(num,27);

    //ngb_offset
    thrust::device_vector<int> ngb_offset(num);
    thrust::exclusive_scan(ngb_size.begin(),ngb_size.end(), ngb_offset.begin());

    //ngh list
    int ngbSize = thrust::reduce(ngb_size.begin(),ngb_size.end());

    std::cout << "ngbSize" << ngbSize << std::endl;

    thrust::device_vector<int> ngb_list(ngbSize);

    srand((unsigned)time(NULL));
    for (int i = 0; i < num; i++){

        int R = (rand()%(num - 0)) + 0;

        ngb_list[i] = R;
    }

    //pressure
    thrust::device_vector<Float> d_pressure(num);
    thrust::sequence(d_pressure.begin(),d_pressure.end(),1);

    //interaction vectors
    thrust::device_vector<Float> d_xInteractionVectors(ngbSize,1);
    thrust::device_vector<Float> d_yInteractionVectors(ngbSize,0);
    thrust::device_vector<Float> d_zInteractionVectors(ngbSize,0);

    //pressure gradients
    thrust::device_vector<Float> pGradx(num);
    thrust::device_vector<Float> pGrady(num);
    thrust::device_vector<Float> pGradz(num);

    //Pressure list
    thrust::device_vector<Float> pressure_list(ngbSize,0);

    cudaEventRecord(start);
    pGrad_calculator<<<bg,tb>>>(thrust::raw_pointer_cast(&d_pressure[0]),
                                thrust::raw_pointer_cast(&pressure_list[0]),
                                thrust::raw_pointer_cast(&d_xInteractionVectors[0]),
                                thrust::raw_pointer_cast(&d_yInteractionVectors[0]),
                                thrust::raw_pointer_cast(&d_zInteractionVectors[0]),
                                thrust::raw_pointer_cast(&ngb_offset[0]),
                                thrust::raw_pointer_cast(&ngb_size[0]),
                                thrust::raw_pointer_cast(&ngb_list[0]),
                                thrust::raw_pointer_cast(&pGradx[0]),
                                thrust::raw_pointer_cast(&pGrady[0]),
                                thrust::raw_pointer_cast(&pGradz[0]),
                                num);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    std::cout << "KERNEL TIME =  " << milliseconds << "  milliseconds" << std::endl;

    return 0;
}

By the looks of your kernel, it has low computation complexity and lots of global memory loads and stores, which is what you see in the profiling results. In addition, depending on size vector, the kernel may be very divergent. — void_ptr
I think the one word answer to this question is no - you have a very low ratio of FLOPs to memory transactions, and non-coalesced loads. — talonmies
As others noted, you're going to be most heavily influenced by bandwidth in this kernel. You might try adding const __restrict__ to interactionVectors though to see if that helps with read performance. See docs.nvidia.com/cuda/kepler-tuning-guide/… for more information. — jefflarkin
@talonmies doing the loop Unroll will increase the performance dramatically. Do you have any other comment? — Siamak
@jefflarkin Thank you for comment. I tested it and seems it is not effective. — Siamak

Jan Lucas Jan Lucas · Accepted Answer · 2016-01-27T09:31:17

Likely the compiler can not optimize your code properly and many extra loads and writes are used.

Try writing the code like this:

__global__ void pGrad_calculator(Float* pressure, Float* pressure_list,
                                 Float* interactionVectors_x, 
                                 Float* interactionVectors_y, 
                                 Float* interactionVectors_z,
                                 int* ngb_offset, int* ngb_size, 
                                 int* ngb_list, Float* pressureGrad_x, 
                                 Float* pressureGrad_y, Float* pressureGrad_z,
                                 int num)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num){
        Float x=pressureGrad_x[idx];
        Float y=pressureGrad_y[idx];
        Float z=pressureGrad_z[idx];
        Float pressure_local=pressure[idx];
        int offset=ngb_offset[idx];
        int end=offset+ngb_size[idx];
        for (int j = offset; j < end; j++){
            Float pij = (pressure_local + pressure[ngb_list[j]]);
            x += interactionVectors_x[j] * pij; 
            y += interactionVectors_y[j] * pij;
            z += interactionVectors_z[j] * pij;
        }
        pressureGrad_x[idx] = 0.5*x;
        pressureGrad_y[idx] = 0.5*y;
        pressureGrad_z[idx] = 0.5*z;
    }
}

But even with this optimization, you will most likely get nowhere near peak flops as you will be limited by memory bandwidth. You are doing less than two floating point ops per double/ 8 bytes. That gives you upper bound on peak flops of 288 GB/s * 2 FOps / 8 bytes = 72 GFlop/s or about 5% peak.

My kernel's FLOP efficiency is very low

1 Answers