1
votes

Dear Scholars,
I am unable to implement mean and var in single kernel call.

Goal: I need to find mean and var of sub matrixs in a matrix. so I wrote following kernels

functions:


    global void kernelMean(d_a, d_mean, ...);
    global void kernelVar(d_a, d_var, ...);
    global void kernelMeanVar(d_a, d_mean, d_var,...);

Issue: a) if I compute the kernelMean and kernelVar individually it works fine b) if I want to compute kernelMeanVar in single kernel is does not work.

Below is my code, kindly let me know on possible any errors.

Thanking you in advance.

Regards,
Nagaraju


global void kernelMean(float* device_array, float* device_mean, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads)
{
      int block_id = blockIdx.x + gridDim.x * blockIdx.y;
      int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x;
      int my_rowIdx = thread_id/globalCols ;
      int my_colIdx = thread_id%globalCols ;
      int i,j;
      float temp;
      float sum = 0;
      float sumsq = 0.0;
      float mean;
      float ltotal_elts = (float) (localRows*localCols);

  device_mean[thread_id] = 0;
  if(thread_id <total_num_threads)
  {
    for(i=0; i < localRows ; i++)
    {
      for(j=0 ; j < localCols; j++)
      {
        temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)];
        sumsq = sumsq + (temp*temp);
        sum = sum + temp;
      }
    }
    mean = sum/ltotal_elts;
    device_mean[thread_id] = mean;

} } global void kernelVar(float* device_array,float* device_var, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads) { int block_id = blockIdx.x + gridDim.x * blockIdx.y; int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x; int my_rowIdx = thread_id/globalCols ; int my_colIdx = thread_id%globalCols ; int i,j; float temp; float sum = 0; float sumsq = 0; float mean = 0; float var = 0; float ltotal_elts = (float) localRows*localCols;

  device_var[thread_id] = 0;
  if(thread_id < total_num_threads)
  {
    for(i=0; i < localRows ; i++)
    {
      for(j=0 ; j < localCols; j++)
      {
        temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)];
        sum = sum + temp;
        sumsq = sumsq + (temp*temp);
      }
    }
    mean = sum/ltotal_elts;
    device_var[thread_id]  = (sumsq/ltotal_elts) - (mean*mean);
  }
}

global void kernelMeanVar(float* device_array, float* device_mean,float* device_var, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads) { int block_id = blockIdx.x + gridDim.x * blockIdx.y; int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x; int my_rowIdx = thread_id/globalCols ; int my_colIdx = thread_id%globalCols ; int i,j; float temp; float sum = 0; float sumsq = 0.0; float mean; float ltotal_elts = (float) (localRows*localCols); device_mean[thread_id] = 0; device_var[thread_id] = 0; if(thread_id < total_num_threads) { for(i=0; i < localRows ; i++) { for(j=0 ; j < localCols; j++) { temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)]; sumsq = sumsq + (temp*temp); sum = sum + temp; } } mean = sum/ltotal_elts; device_mean[thread_id] = mean; device_var[thread_id] = (sumsq/ltotal_elts) - (mean*mean); } }

Kernel Call Functions


void convertToFloat(float** float_ary, double* double_ary, int num_elts)
{
  for(int i = 0; i < num_elts ; i++)
  {
    (*float_ary)[i] = (float) double_ary[i];
     //printf("float_ary[%d] : %f \n", i, (*float_ary)[i]);
  }
  return;
}
void convertToDouble(double** double_ary, float* float_ary, int num_elts)
{
  for(int i = 0; i < num_elts ; i++)
  {
    (*double_ary)[i] = (double) float_ary[i];
  }
  return;
}
void computeMeanAndVarArray(double* host_array, int num_elts, int globalRows, int globalCols, int localRows, int localCols, int numRowElts, double** mean_ary, double** var_ary)
{
  float* host_array_float;
  float* device_array;
  float* host_mean;
  float* host_var;
  float* device_mean;
  float* device_var;
  double total_bytes =0;
  host_array_float = (float*) malloc (num_elts*sizeof(float));
  convertToFloat(&host_array_float, host_array, num_elts);
  //printf("num_elts %d \n", num_elts);
  cudaMalloc((void**) &device_array, sizeof(float)* num_elts);
  cudaMemset(device_array, 0, sizeof(float)* num_elts);
  cudaMemcpy(device_array, host_array_float,sizeof(float)* num_elts, cudaMemcpyHostToDevice);
  int numBlockThreads = MAX_THREADS_PER_BLOCK;
  int num_blocks = 0;
  int remain_elts = 0;
  int total_num_threads = globalRows * globalCols;
  cudaMalloc((void**) &device_mean, sizeof(float)* total_num_threads);
  cudaMemset(device_mean, 0, sizeof(float)* total_num_threads);
  cudaMalloc((void**) &device_var, sizeof(float)* total_num_threads);
  cudaMemset(device_var, 0, sizeof(float)* total_num_threads);
  num_blocks  =  total_num_threads/numBlockThreads;
  remain_elts =  total_num_threads%numBlockThreads;
  if(remain_elts > 0)
  {
    num_blocks++;
  }
  dim3 gridDim(num_blocks,1);
  dim3 blockDim(numBlockThreads,1);
  //kernelMean<<< gridDim,blockDim >>>(device_array, device_mean,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
  //kernelVar<<< gridDim,blockDim >>>(device_array, device_var,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
  kernelMeanVar<<< gridDim,blockDim >>>(device_array, device_mean, device_var,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
  host_mean = (float*) malloc( sizeof(float) * total_num_threads);
  memset(host_mean, 0, sizeof(float) * total_num_threads);
  host_var = (float*) malloc( sizeof(float) * total_num_threads);
  memset(host_var, 0, sizeof(float) * total_num_threads);
  cudaThreadSynchronize();
  cudaError_t error = cudaGetLastError();
   //if(error!=cudaSuccess) {
      printf("ERROR: %s\n", cudaGetErrorString(error) );
   //}
  cudaMemcpy(host_mean, device_mean, sizeof(float)*total_num_threads, cudaMemcpyDeviceToHost);
  convertToDouble(mean_ary, host_mean, total_num_threads);
  cudaMemcpy(host_var, device_var, sizeof(float)*total_num_threads,     cudaMemcpyDeviceToHost);
  for(int i = 0 ; i < 300 ; i++)
    printf("host_var[%d] %f \n",i, host_var[i]);
  convertToDouble(var_ary, host_var, total_num_threads);
  cudaFree(device_array);
  cudaFree(device_mean);
  cudaFree(device_var);
  free(host_mean);
  free(host_var);
  free(host_array_float);
}

Results with enabling


 global void kernelMean(d_a, d_mean, ...);
 global void kernelVar(d_a, d_var, ...);

ERROR: no error host_var[0] 4.497070 host_var[1] 5.061768 host_var[2] 5.687500 host_var[3] 6.347534 host_var[4] 6.829102 host_var[5] 12.940308 host_var[6] 14.309937 host_var[7] 15.141113 host_var[8] 18.741577 host_var[9] 21.323608 host_var[10] 21.727417 host_var[11] 192.348389 host_var[12] 579.911621 host_var[13] 800.821045 host_var[14] 1071.960938 host_var[15] 1077.261719 host_var[16] 993.262207 host_var[17] 924.379883 host_var[18] 839.437012 host_var[19] 810.847656 host_var[20] 835.007813 host_var[21] 1124.365723 host_var[22] 1241.685547 host_var[23] 1376.504150 host_var[24] 1196.745850 host_var[25] 1097.473877 host_var[26] 1008.840088 host_var[27] 867.585083 host_var[28] 794.241699 host_var[29] 1322.409790 host_var[30] 1556.029785 host_var[31] 1564.997803 host_var[32] 1870.985840 host_var[33] 1929.829590 host_var[34] 1822.189453 host_var[35] 1662.321777 host_var[36] 1372.886719 host_var[37] 1074.727539 host_var[38] 833.003906 host_var[39] 632.514648 host_var[40] 380.227539 host_var[41] 87.345703 host_var[42] 82.544922 host_var[43] 78.756836 host_var[44] 68.541016 host_var[45] 61.981445 host_var[46] 60.413086 host_var[47] 60.128906 host_var[48] 59.767578 host_var[49] 59.223633 host_var[50] 56.569336 host_var[51] 53.866211 host_var[52] 51.186523 host_var[53] 55.270508 host_var[54] 59.956055 host_var[55] 66.516602 host_var[56] 70.348633 host_var[57] 71.706055 host_var[58] 70.494141 host_var[59] 69.897461 host_var[60] 66.286133 host_var[61] 67.926758 host_var[62] 160.753906 host_var[63] 447.221191 host_var[64] 831.740723 host_var[65] 1076.513672 host_var[66] 1193.666992 host_var[67] 1208.239746 host_var[68] 1126.845947 host_var[69] 948.397461 host_var[70] 669.399414 host_var[71] 340.465576 host_var[72] 67.161865 host_var[73] 7.421082 host_var[74] 5.485626 host_var[75] 5.135620 host_var[76] 3.460419 host_var[77] 3.853577 host_var[78] 5.221100 host_var[79] 5.890381 host_var[80] 7.139618 host_var[81] 7.517609 host_var[82] 6.865875 host_var[83] 5.053909 host_var[84] 2.781616 host_var[85] 2.021912 host_var[86] 2.130417 host_var[87] 3.113586 host_var[88] 4.024399 host_var[89] 4.582413 host_var[90] 4.077118 host_var[91] 3.024384 host_var[92] 2.287506 host_var[93] 1.793579 host_var[94] 1.567474 host_var[95] 1.829895 host_var[96] 2.325928 host_var[97] 3.429993 host_var[98] 3.885559 host_var[99] 3.835602 host_var[100] 5.566406 host_var[101] 8.065582 host_var[102] 18.767456 host_var[103] 35.395599 host_var[104] 64.148407 host_var[105] 125.937866 host_var[106] 176.445618 host_var[107] 216.073059 host_var[108] 272.109985 host_var[109] 307.972412 host_var[110] 289.652344 host_var[111] 238.253662 host_var[112] 178.304932 host_var[113] 116.925049 host_var[114] 74.773926 host_var[115] 61.227295 host_var[116] 55.238525 host_var[117] 55.387451 host_var[118] 49.241699 host_var[119] 38.396240 host_var[120] 28.304932 host_var[121] 20.225342 host_var[122] 18.043457 host_var[123] 21.418457 host_var[124] 26.120117 host_var[125] 25.899414 host_var[126] 26.641602 host_var[127] 23.747437 host_var[128] 18.927368 host_var[129] 21.664307 host_var[130] 142.432373 host_var[131] 1575.141602 host_var[132] 2901.855957 host_var[133] 4195.149902 host_var[134] 5047.758789 host_var[135] 5450.164063 host_var[136] 5249.767578 host_var[137] 4577.365234 host_var[138] 3352.496094 host_var[139] 1641.593750 host_var[140] 352.242188 host_var[141] 224.824219 host_var[142] 194.578125 host_var[143] 178.875000 host_var[144] 175.148438 host_var[145] 174.117188 host_var[146] 172.707031 host_var[147] 169.578125 host_var[148] 176.308594 host_var[149] 181.968750 host_var[150] 191.507813 host_var[151] 198.500000 host_var[152] 206.824219 host_var[153] 213.273438 host_var[154] 220.312500 host_var[155] 218.859375 host_var[156] 213.941406 host_var[157] 205.474609 host_var[158] 190.722656 host_var[159] 178.414063 host_var[160] 169.302734 host_var[161] 3.750366 host_var[162] 4.333252 host_var[163] 4.901855 host_var[164] 5.527466 host_var[165] 6.201782 host_var[166] 11.921631 host_var[167] 14.135376 host_var[168] 14.885864 host_var[169] 19.083618 host_var[170] 21.290283 host_var[171] 21.415649 host_var[172] 209.747559 host_var[173] 580.304932 host_var[174] 800.949951 host_var[175] 1119.857422 host_var[176] 1129.382324 host_var[177] 1032.616211 host_var[178] 972.797363 host_var[179] 915.440918 host_var[180] 905.890137 host_var[181] 943.649902 host_var[182] 1207.445801 host_var[183] 1345.912109 host_var[184] 1478.704590 host_var[185] 1224.895508 host_var[186] 1105.403564 host_var[187] 1031.981201 host_var[188] 914.456421 host_var[189] 835.127441 host_var[190] 1320.454102 host_var[191] 1561.439941 host_var[192] 1599.149902 host_var[193] 1912.232910 host_var[194] 1993.473145 host_var[195] 1913.377441 host_var[196] 1784.035645 host_var[197] 1554.712891 host_var[198] 1244.698242 host_var[199] 926.668945

Reusults with enabling


global void kernelMeanVar(d_a, d_mean, d_var,...);

ERROR: no error host_var[0] 0.000000 host_var[1] 0.000000 host_var[2] 0.000000 host_var[3] 0.000000 host_var[4] 0.000000 host_var[5] 0.000000 host_var[6] 0.000000 host_var[7] 0.000000 host_var[8] 0.000000 host_var[9] 0.000000 host_var[10] 0.000000 host_var[11] 0.000000 host_var[12] 0.000000 host_var[13] 0.000000 host_var[14] 0.000000 host_var[15] 0.000000 host_var[16] 0.000000 host_var[17] 0.000000 host_var[18] 0.000000 host_var[19] 0.000000 host_var[20] 0.000000 host_var[21] 0.000000 host_var[22] 0.000000 host_var[23] 0.000000 host_var[24] 0.000000 host_var[25] 0.000000 host_var[26] 0.000000 host_var[27] 0.000000 host_var[28] 0.000000 host_var[29] 0.000000 host_var[30] 0.000000 host_var[31] 0.000000 host_var[32] 0.000000 host_var[33] 0.000000 host_var[34] 0.000000 host_var[35] 0.000000 host_var[36] 0.000000 host_var[37] 0.000000 host_var[38] 0.000000 host_var[39] 0.000000 host_var[40] 0.000000 host_var[41] 0.000000 host_var[42] 0.000000 host_var[43] 0.000000 host_var[44] 0.000000 host_var[45] 0.000000 host_var[46] 0.000000 host_var[47] 0.000000 host_var[48] 0.000000 host_var[49] 0.000000 host_var[50] 0.000000 host_var[51] 0.000000 host_var[52] 0.000000 host_var[53] 0.000000 host_var[54] 0.000000 host_var[55] 0.000000 host_var[56] 0.000000 host_var[57] 0.000000 host_var[58] 0.000000 host_var[59] 0.000000 host_var[60] 0.000000 host_var[61] 0.000000 host_var[62] 0.000000 host_var[63] 0.000000 host_var[64] 0.000000 host_var[65] 0.000000 host_var[66] 0.000000 host_var[67] 0.000000 host_var[68] 0.000000 host_var[69] 0.000000 host_var[70] 0.000000 host_var[71] 0.000000 host_var[72] 0.000000 host_var[73] 0.000000 host_var[74] 0.000000 host_var[75] 0.000000 host_var[76] 0.000000 host_var[77] 0.000000 host_var[78] 0.000000 host_var[79] 0.000000 host_var[80] 0.000000 host_var[81] 0.000000 host_var[82] 0.000000 host_var[83] 0.000000 host_var[84] 0.000000 host_var[85] 0.000000 host_var[86] 0.000000 host_var[87] 0.000000 host_var[88] 0.000000 host_var[89] 0.000000 host_var[90] 0.000000 host_var[91] 0.000000 host_var[92] 0.000000 host_var[93] 0.000000 host_var[94] 0.000000 host_var[95] 0.000000 host_var[96] 0.000000 host_var[97] 0.000000 host_var[98] 0.000000 host_var[99] 0.000000 host_var[100] 0.000000 host_var[101] 0.000000 host_var[102] 0.000000 host_var[103] 0.000000 host_var[104] 0.000000 host_var[105] 0.000000 host_var[106] 0.000000 host_var[107] 0.000000 host_var[108] 0.000000 host_var[109] 0.000000 host_var[110] 0.000000 host_var[111] 0.000000 host_var[112] 0.000000 host_var[113] 0.000000 host_var[114] 0.000000 host_var[115] 0.000000 host_var[116] 0.000000 host_var[117] 0.000000 host_var[118] 0.000000 host_var[119] 0.000000 host_var[120] 0.000000 host_var[121] 0.000000 host_var[122] 0.000000 host_var[123] 0.000000 host_var[124] 0.000000 host_var[125] 0.000000 host_var[126] 0.000000 host_var[127] 0.000000 host_var[128] 18.927368 host_var[129] 21.664307 host_var[130] 142.432373 host_var[131] 1575.141602 host_var[132] 2901.855957 host_var[133] 4195.149902 host_var[134] 5047.758789 host_var[135] 5450.164063 host_var[136] 5249.767578 host_var[137] 4577.365234 host_var[138] 3352.496094 host_var[139] 1641.593750 host_var[140] 352.242188 host_var[141] 224.824219 host_var[142] 194.578125 host_var[143] 178.875000 host_var[144] 175.148438 host_var[145] 174.117188 host_var[146] 172.707031 host_var[147] 169.578125 host_var[148] 176.308594 host_var[149] 181.968750 host_var[150] 191.507813 host_var[151] 198.500000 host_var[152] 206.824219 host_var[153] 213.273438 host_var[154] 220.312500 host_var[155] 218.859375 host_var[156] 213.941406 host_var[157] 205.474609 host_var[158] 190.722656 host_var[159] 178.414063 host_var[160] 169.302734 host_var[161] 3.750366 host_var[162] 4.333252 host_var[163] 4.901855 host_var[164] 5.527466 host_var[165] 6.201782 host_var[166] 11.921631 host_var[167] 14.135376 host_var[168] 14.885864 host_var[169] 19.083618 host_var[170] 21.290283 host_var[171] 21.415649 host_var[172] 209.747559 host_var[173] 580.304932 host_var[174] 800.949951 host_var[175] 1119.857422 host_var[176] 1129.382324 host_var[177] 1032.616211 host_var[178] 972.797363 host_var[179] 915.440918 host_var[180] 905.890137 host_var[181] 943.649902 host_var[182] 1207.445801 host_var[183] 1345.912109 host_var[184] 1478.704590 host_var[185] 1224.895508 host_var[186] 1105.403564 host_var[187] 1031.981201 host_var[188] 914.456421 host_var[189] 835.127441 host_var[190] 1320.454102 host_var[191] 1561.439941 host_var[192] 1599.149902 host_var[193] 1912.232910 host_var[194] 1993.473145 host_var[195] 1913.377441 host_var[196] 1784.035645 host_var[197] 1554.712891 host_var[198] 1244.698242 host_var[199] 926.668945

End of Results

1
In what way does it "not work"?Tom
In kernelMeanVar Call : device_Mean is correct but results of device_var is not correct.Naga
Does not seem to be anything wrong with the code you posted. Post the code that you use to spawn the kernel. Which elements of device_var are incorrect? All of them or only some of them? Are they just slightly off or completely wrong?Eugene Smith
I have added the kernel call function. device_var there are many of them, size could be 1000 or more. As for the comparison (not shown here) results are incorrect for device_var for most of them (looks like all, could not verify manually). Note: if I call individually all are correct!!!Naga

1 Answers

3
votes

Suggestions:

  • [First priority] Check return values from CUDA functions to see whether any errors are reported.
  • Run this through cuda-memcheck. I'm not sure what the relationship is between globalRows, globalCols, localRows, localCols, num_elts etc. is but reading out-of-bounds seems like a candadite for problems.
  • Remember that summing the squares can lead to rounding errors fairly quickly if you don't take care. Consider using a running mean/variance or doing a tree-based reduction.