I try to implement character frequency program in C using CUDA and I have an issue with the results. I think it's something about thread synchronization but I need help.
Output for 1 block and 1 thread per block:
>./char_freq.exe test.txt 1 1
file size is 115
0 = 0
1 = 0
2 = 0
3 = 0
4 = 0
5 = 0
6 = 0
7 = 0
8 = 0
9 = 0
10 = 0
11 = 0
12 = 0
13 = 0
14 = 0
15 = 0
16 = 0
17 = 0
18 = 0
19 = 0
20 = 0
21 = 0
22 = 0
23 = 0
24 = 0
25 = 0
26 = 0
27 = 0
28 = 0
29 = 0
30 = 0
31 = 0
32 = 0
33 = 0
34 = 0
35 = 0
36 = 0
37 = 0
38 = 0
39 = 0
40 = 0
41 = 0
42 = 0
43 = 0
44 = 0
45 = 0
46 = 0
47 = 0
48 = 0
49 = 0
50 = 0
51 = 0
52 = 1
53 = 1
54 = 1
55 = 0
56 = 0
57 = 0
58 = 0
59 = 0
60 = 0
61 = 0
62 = 0
63 = 0
64 = 0
65 = 0
66 = 0
67 = 0
68 = 0
69 = 0
70 = 0
71 = 0
72 = 0
73 = 0
74 = 0
75 = 0
76 = 0
77 = 0
78 = 0
79 = 0
80 = 0
81 = 0
82 = 0
83 = 0
84 = 0
85 = 0
86 = 0
87 = 0
88 = 0
89 = 0
90 = 0
91 = 0
92 = 0
93 = 0
94 = 0
95 = 0
96 = 0
97 = 0
98 = 2
99 = 2
100 = 9
101 = 1
102 = 14
103 = 7
104 = 18
105 = 1
106 = 14
107 = 20
108 = 0
109 = 0
110 = 1
111 = 1
112 = 0
113 = 0
114 = 5
115 = 8
116 = 3
117 = 0
118 = 0
119 = 0
120 = 0
121 = 6
122 = 0
123 = 0
124 = 0
125 = 0
126 = 0
127 = 0
N: 128, Blocks: 1, Threads: 1
Total time (ms): 0.143
Kernel time (ms): 0.046
Data transfer time(ms): 0.097
Output for 1 block and 5 threads per block:
>./char_freq.exe test.txt 1 5
file size is 115
0 = 0
1 = 0
2 = 0
3 = 0
4 = 0
5 = 0
6 = 0
7 = 0
8 = 0
9 = 0
10 = 0
11 = 0
12 = 0
13 = 0
14 = 0
15 = 0
16 = 0
17 = 0
18 = 0
19 = 0
20 = 0
21 = 0
22 = 0
23 = 0
24 = 0
25 = 0
26 = 0
27 = 0
28 = 0
29 = 0
30 = 0
31 = 0
32 = 0
33 = 0
34 = 0
35 = 0
36 = 0
37 = 0
38 = 0
39 = 0
40 = 0
41 = 0
42 = 0
43 = 0
44 = 0
45 = 0
46 = 0
47 = 0
48 = 0
49 = 0
50 = 0
51 = 0
52 = 1
53 = 1
54 = 1
55 = 0
56 = 0
57 = 0
58 = 0
59 = 0
60 = 0
61 = 0
62 = 0
63 = 0
64 = 0
65 = 0
66 = 0
67 = 0
68 = 0
69 = 0
70 = 0
71 = 0
72 = 0
73 = 0
74 = 0
75 = 0
76 = 0
77 = 0
78 = 0
79 = 0
80 = 0
81 = 0
82 = 0
83 = 0
84 = 0
85 = 0
86 = 0
87 = 0
88 = 0
89 = 0
90 = 0
91 = 0
92 = 0
93 = 0
94 = 0
95 = 0
96 = 0
97 = 0
98 = 2
99 = 2
100 = 9
101 = 1
102 = 12
103 = 7
104 = 13
105 = 1
106 = 11
107 = 12
108 = 0
109 = 0
111 = 1
112 = 0
113 = 0
114 = 5
115 = 7
116 = 3
117 = 0
118 = 0
119 = 0
120 = 0
121 = 6
122 = 0
123 = 0
124 = 0
125 = 0
126 = 0
127 = 0
N: 128, Blocks: 1, Threads: 5
Total time (ms): 0.157
Kernel time (ms): 0.048
Data transfer time(ms): 0.109
Why the results are different?
This is my code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#define N 128
#define base 0
__global__ void char_freq(char *buffer, int *freq, int slice, int extra, int total_threads){
int index = threadIdx.x + blockIdx.x * blockDim.x ;
int start = index * slice;
int stop = start + slice;
int i;
if (index == (total_threads-1))
stop += extra;
__shared__ int local_freq[N];
//initialize local_freq
if(threadIdx.x == 0){
memset(local_freq, 0, N*sizeof(int));
}
__syncthreads();
for(i=start; i<stop; i++){
local_freq[buffer[i] - base]++;
}
__syncthreads();
for(i=0; i<N; i++){
freq[i] += local_freq[i];
}
__syncthreads();
}
int main(int argc, char *argv[]){
FILE *pFile;
long file_size;
char * buffer;
char * filename;
size_t result;
int j, freq[N];
int slice, extra;
int total_blocks, threads_per_block, total_threads;
float total_time, comp_time;
cudaEvent_t total_start, total_stop, comp_start, comp_stop;
cudaEventCreate(&total_start);
cudaEventCreate(&total_stop);
cudaEventCreate(&comp_start);
cudaEventCreate(&comp_stop);
if (argc != 4) {
printf ("Usage : %s <file_name> <blocks> <threads_per_block>\n", argv[0]);
return 1;
}
total_blocks = strtol(argv[2], NULL, 10);
threads_per_block = strtol(argv[3], NULL, 10);
total_threads = total_blocks*threads_per_block;
filename = argv[1];
pFile = fopen ( filename , "rb" );
if (pFile==NULL) {printf ("File error\n"); return 2;}
fseek (pFile , 0 , SEEK_END);
file_size = ftell (pFile);
rewind (pFile);
printf("file size is %ld\n", file_size);
buffer = (char*) malloc (sizeof(char)*file_size);
if (buffer == NULL) {printf ("Memory error\n"); return 3;}
result = fread (buffer,1,file_size,pFile);
if (result != file_size) {printf ("Reading error\n"); return 4;}
char *buffer_dev;
int *freq_dev;
cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
cudaMalloc((void **)&freq_dev, N*sizeof(int));
cudaMemset(freq_dev,0,N*sizeof(int));
cudaEventRecord(total_start);
cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
cudaEventRecord(comp_start);
slice = file_size / total_threads;
extra = file_size % total_threads;
char_freq<<<total_blocks, threads_per_block>>>(buffer_dev, freq_dev, slice, extra, total_threads);
cudaEventRecord(comp_stop);
cudaEventSynchronize(comp_stop);
cudaEventElapsedTime(&comp_time, comp_start, comp_stop);
cudaMemcpy(freq, freq_dev, N*sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(total_stop);
cudaEventSynchronize(total_stop);
cudaEventElapsedTime(&total_time, total_start, total_stop);
cudaFree(buffer_dev);
cudaFree(freq_dev);
for (j=0; j<N; j++){
printf("%d = %d\n", j+base, freq[j]);
}
fclose (pFile);
free (buffer);
//GPU Timing
printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", N, total_blocks, total_threads);
printf("Total time (ms): %.3f\n", total_time);
printf("Kernel time (ms): %.3f\n", comp_time);
printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);
return 0;
}
__syncthreads
only synchronize threads belonging to the same block, not between blocks. Besides this, there is no bound check forindex
which is suspicious. – Jérôme Richardfreq
are a race condition, as all threads will read/write from/to the same address at the same time. The easiest fix will be to use atomics for the accesses tofreq
. For performant solutions take a look at the CUDA samples. There is a histogram sample. – paleonix