0
votes

I try to implement character frequency program in C using CUDA and I have an issue with the results. I think it's something about thread synchronization but I need help.

Output for 1 block and 1 thread per block:

>./char_freq.exe test.txt 1 1
file size is 115
0 = 0 
1 = 0 
2 = 0 
3 = 0 
4 = 0 
5 = 0 
6 = 0 
7 = 0 
8 = 0 
9 = 0 
10 = 0
11 = 0
12 = 0
13 = 0
14 = 0
15 = 0
16 = 0
17 = 0
18 = 0
19 = 0
20 = 0
21 = 0
22 = 0
23 = 0
24 = 0
25 = 0
26 = 0
27 = 0
28 = 0
29 = 0
30 = 0
31 = 0
32 = 0
33 = 0
34 = 0
35 = 0
36 = 0
37 = 0
38 = 0
39 = 0
40 = 0
41 = 0
42 = 0
43 = 0
44 = 0
45 = 0
46 = 0
47 = 0
48 = 0
49 = 0
50 = 0
51 = 0
52 = 1
53 = 1
54 = 1
55 = 0
56 = 0
57 = 0
58 = 0
59 = 0
60 = 0
61 = 0
62 = 0
63 = 0
64 = 0
65 = 0
66 = 0
67 = 0
68 = 0
69 = 0
70 = 0
71 = 0
72 = 0
73 = 0
74 = 0
75 = 0
76 = 0
77 = 0
78 = 0
79 = 0
80 = 0
81 = 0
82 = 0
83 = 0
84 = 0
85 = 0
86 = 0
87 = 0
88 = 0
89 = 0
90 = 0
91 = 0
92 = 0
93 = 0
94 = 0
95 = 0
96 = 0
97 = 0
98 = 2
99 = 2
100 = 9
101 = 1
102 = 14
103 = 7
104 = 18
105 = 1
106 = 14
107 = 20
108 = 0
109 = 0
110 = 1
111 = 1
112 = 0
113 = 0
114 = 5
115 = 8
116 = 3
117 = 0
118 = 0
119 = 0
120 = 0
121 = 6
122 = 0
123 = 0
124 = 0
125 = 0
126 = 0
127 = 0



N: 128, Blocks: 1, Threads: 1
Total time (ms): 0.143
Kernel time (ms): 0.046
Data transfer time(ms): 0.097

Output for 1 block and 5 threads per block:

>./char_freq.exe test.txt 1 5
file size is 115
0 = 0 
1 = 0 
2 = 0 
3 = 0 
4 = 0 
5 = 0 
6 = 0 
7 = 0 
8 = 0 
9 = 0 
10 = 0
11 = 0
12 = 0
13 = 0
14 = 0
15 = 0
16 = 0
17 = 0
18 = 0
19 = 0
20 = 0
21 = 0
22 = 0
23 = 0
24 = 0
25 = 0
26 = 0
27 = 0
28 = 0
29 = 0
30 = 0
31 = 0
32 = 0
33 = 0
34 = 0
35 = 0
36 = 0
37 = 0
38 = 0
39 = 0
40 = 0
41 = 0
42 = 0
43 = 0
44 = 0
45 = 0
46 = 0
47 = 0
48 = 0
49 = 0
50 = 0
51 = 0
52 = 1
53 = 1
54 = 1
55 = 0
56 = 0
57 = 0
58 = 0
59 = 0
60 = 0
61 = 0
62 = 0
63 = 0
64 = 0
65 = 0
66 = 0
67 = 0
68 = 0
69 = 0
70 = 0
71 = 0
72 = 0
73 = 0
74 = 0
75 = 0
76 = 0
77 = 0
78 = 0
79 = 0
80 = 0
81 = 0
82 = 0
83 = 0
84 = 0
85 = 0
86 = 0
87 = 0
88 = 0
89 = 0
90 = 0
91 = 0
92 = 0
93 = 0
94 = 0
95 = 0
96 = 0
97 = 0
98 = 2
99 = 2
100 = 9
101 = 1
102 = 12
103 = 7
104 = 13
105 = 1
106 = 11
107 = 12
108 = 0
109 = 0
111 = 1
112 = 0
113 = 0
114 = 5
115 = 7
116 = 3
117 = 0
118 = 0
119 = 0
120 = 0
121 = 6
122 = 0
123 = 0
124 = 0
125 = 0
126 = 0
127 = 0



N: 128, Blocks: 1, Threads: 5
Total time (ms): 0.157
Kernel time (ms): 0.048
Data transfer time(ms): 0.109

Why the results are different?

This is my code:

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define N 128
#define base 0

__global__ void char_freq(char *buffer, int *freq, int slice, int extra, int total_threads){
    int index = threadIdx.x + blockIdx.x * blockDim.x ;
    int start = index * slice; 
    int stop = start + slice;
    int i;

    if (index == (total_threads-1))
    stop += extra;

    __shared__ int local_freq[N];

    //initialize local_freq
    if(threadIdx.x == 0){
        memset(local_freq, 0, N*sizeof(int));
    }

    __syncthreads();

    for(i=start; i<stop; i++){
        local_freq[buffer[i] - base]++;
    }

    __syncthreads();

    for(i=0; i<N; i++){
        freq[i] += local_freq[i];
    }

    __syncthreads();
}


int main(int argc, char *argv[]){
    FILE *pFile;
    long file_size;
    char * buffer;
    char * filename;
    size_t result;
    int j, freq[N];
    int slice, extra;
    int total_blocks, threads_per_block, total_threads;

    float total_time, comp_time;
    cudaEvent_t total_start, total_stop, comp_start, comp_stop;
    cudaEventCreate(&total_start);
    cudaEventCreate(&total_stop);
    cudaEventCreate(&comp_start);
    cudaEventCreate(&comp_stop);


    if (argc != 4) {
        printf ("Usage : %s <file_name> <blocks> <threads_per_block>\n", argv[0]);
        return 1;
    }

    total_blocks = strtol(argv[2], NULL, 10);
    threads_per_block = strtol(argv[3], NULL, 10);
    total_threads = total_blocks*threads_per_block;

    filename = argv[1];
    pFile = fopen ( filename , "rb" );
    if (pFile==NULL) {printf ("File error\n"); return 2;}

    fseek (pFile , 0 , SEEK_END);
    file_size = ftell (pFile);
    rewind (pFile);
    printf("file size is %ld\n", file_size);
    
    buffer = (char*) malloc (sizeof(char)*file_size);
    if (buffer == NULL) {printf ("Memory error\n"); return 3;}

    result = fread (buffer,1,file_size,pFile);
    if (result != file_size) {printf ("Reading error\n"); return 4;}

    char *buffer_dev;
    int *freq_dev;

    cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
    cudaMalloc((void **)&freq_dev, N*sizeof(int));
    cudaMemset(freq_dev,0,N*sizeof(int));

    cudaEventRecord(total_start);

    cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);

    cudaEventRecord(comp_start);

    slice = file_size / total_threads;
    extra = file_size % total_threads;

    char_freq<<<total_blocks, threads_per_block>>>(buffer_dev, freq_dev, slice, extra, total_threads);

    cudaEventRecord(comp_stop);
    cudaEventSynchronize(comp_stop);
    cudaEventElapsedTime(&comp_time, comp_start, comp_stop);

    cudaMemcpy(freq, freq_dev, N*sizeof(int), cudaMemcpyDeviceToHost);

    cudaEventRecord(total_stop);
    cudaEventSynchronize(total_stop);
    cudaEventElapsedTime(&total_time, total_start, total_stop);

    cudaFree(buffer_dev);
    cudaFree(freq_dev);

    for (j=0; j<N; j++){
        printf("%d = %d\n", j+base, freq[j]);
    }
    
    fclose (pFile);
    free (buffer);

    //GPU Timing
    printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", N, total_blocks, total_threads);
    printf("Total time (ms): %.3f\n", total_time);
    printf("Kernel time (ms): %.3f\n", comp_time);
    printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);

    return 0;
}
Note __syncthreads only synchronize threads belonging to the same block, not between blocks. Besides this, there is no bound check for index which is suspicious.Jérôme Richard
Your synchronizations are useless as you are accessing thread-local memory, not shared memory. Your accesses to freq are a race condition, as all threads will read/write from/to the same address at the same time. The easiest fix will be to use atomics for the accesses to freq. For performant solutions take a look at the CUDA samples. There is a histogram sample.paleonix