My CUDA kernel is using thrust, sort and reduce by key. when i use an array more than 460 it start showing incorrect results.
could any one explain this behavior? or it is something related to my machine ?
the sort is working correctly despite the size, however, the REDUCE_BY_KEY is not working good. and return improper results.
more details about the code, i have 4 arrays 1) input keys which is defined as wholeSequenceArray. 2) input values which is defined in the kernel with initial value of 1. 3) output keys is to save the distinct values of the input keys 4) output values is to save the sum of input values corresponding to the same input key .
for more description about the reduce_by_key please visit this page: https://thrust.github.io/doc/group__reductions.html#gad5623f203f9b3fdcab72481c3913f0e0
here is my code:
#include <cstdlib>
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <vector>
#include <fstream>
#include <string>
#include <cuda.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>
using namespace std;
#define size 461
__global__ void calculateOccurances(unsigned int *input_keys,
unsigned int *output_Values) {
int tid = threadIdx.x;
const int N = size;
__shared__ unsigned int input_values[N];
unsigned int outputKeys[N];
int i = tid;
while (i < N) {
if (tid < N) {
input_values[tid] = 1;
}
i += blockDim.x;
}
__syncthreads();
thrust::sort(thrust::device, input_keys, input_keys + N);
thrust::reduce_by_key(thrust::device, input_keys, input_keys + N,
input_values, outputKeys, output_Values);
if (tid == 0) {
for (int i = 0; i < N; ++i) {
printf("%d,", output_Values[i]);
}
}
}
int main(int argc, char** argv) {
unsigned int wholeSequenceArray[size] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,1 };
cout << "wholeSequenceArray:" << endl;
for (int i = 0; i < size; i++) {
cout << wholeSequenceArray[i] << ",";
}
cout << "\nStart C++ Array New" << endl;
cout << "Size of Input:" << size << endl;
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
unsigned int counts[size];
unsigned int *d_whole;
unsigned int *d_counts;
cudaMalloc((void**) &d_whole, size * sizeof(unsigned int));
cudaMalloc((void**) &d_counts, size * sizeof(unsigned int));
cudaMemcpy(d_whole, wholeSequenceArray, size * sizeof(unsigned int),
cudaMemcpyHostToDevice);
calculateOccurances<<<1, size>>>(d_whole, d_counts);
cudaMemcpy(counts, d_counts, size * sizeof(unsigned int),
cudaMemcpyDeviceToHost);
cout << endl << "Counts" << endl << endl;
for (int i = 0; i < size; ++i) {
cout << counts[i] << ",";
}
cout << endl;
cudaFree(d_whole);
}
thrustin device code works. You have 461 threads each of which is doing its own, separate sort on the same data, in the same place. That could not possibly be a useful algorithm. Those 461 threads will each be stepping on each other as they are moving the data around to be sorted. It's not clear to me that you need a CUDA kernel at all here. Your described algorithm could be accomplished just using thrust in an ordinary fashion (i.e. from host code). The work will still get done on the device. - Robert Crovella