I ran nvprof.exe on a function that initializes data, calls three kernels, and frees the data. Everything was profiled as expected, and I got a result like this:
==7956== Profiling application: .\a.exe
==7956== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 52.34% 25.375us 1 25.375us 25.375us 25.375us th_single_row_add(float*, float*, float*)
43.57% 21.120us 1 21.120us 21.120us 21.120us th_single_col_add(float*, float*, float*)
4.09% 1.9840us 1 1.9840us 1.9840us 1.9840us th_single_elem_add(float*, float*, float*)
API calls: 86.77% 238.31ms 9 26.479ms 14.600us 210.39ms cudaMallocManaged
12.24% 33.621ms 1 33.621ms 33.621ms 33.621ms cuDevicePrimaryCtxRelease
0.27% 730.80us 3 243.60us 242.10us 245.60us cudaLaunchKernel
0.15% 406.90us 3 135.63us 65.400us 170.80us cudaDeviceSynchronize
0.08% 229.70us 97 2.3680us 100ns 112.10us cuDeviceGetAttribute
0.08% 206.60us 1 206.60us 206.60us 206.60us cuModuleUnload
0.01% 19.700us 1 19.700us 19.700us 19.700us cuDeviceTotalMem
0.00% 6.8000us 1 6.8000us 6.8000us 6.8000us cuDeviceGetPCIBusId
0.00% 1.9000us 2 950ns 400ns 1.5000us cuDeviceGet
0.00% 1.8000us 3 600ns 400ns 800ns cuDeviceGetCount
0.00% 700ns 1 700ns 700ns 700ns cuDeviceGetName
0.00% 200ns 1 200ns 200ns 200ns cuDeviceGetUuid
0.00% 200ns 1 200ns 200ns 200ns cuDeviceGetLuid
==7956== Unified Memory profiling result:
Device "GeForce RTX 2060 SUPER (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
18 20.000KB 8.0000KB 32.000KB 360.0000KB 300.7000us Host To Device
24 20.000KB 8.0000KB 32.000KB 480.0000KB 2.647400ms Device To Host
As you can see, there are three kernels in the GPU activities section. And here is the source code:
void add_elem(int n) {
    float *a, *b, *c1, *c2, *c3;
    cudaMallocManaged(&a, n * n * sizeof(float));
    cudaMallocManaged(&b, n * n * sizeof(float));
    cudaMallocManaged(&c1, n * n * sizeof(float));
    cudaMallocManaged(&c2, n * n * sizeof(float));
    cudaMallocManaged(&c3, n * n * sizeof(float));

    for (int i = 0; i < n * n; i++) {
        a[i] = 1.0f;
        b[i] = 2.0f;
        c1[i] = 0.0f;
        c2[i] = 0.0f;
        c3[i] = 0.0f;
    }

    int blockSize = 256;
    int numBlocks = (n * n + blockSize - 1) / blockSize;
    th_single_elem_add<<<numBlocks, blockSize>>>(a, b, c1);
    th_single_row_add<<<numBlocks, blockSize>>>(a, b, c2);
    th_single_col_add<<<numBlocks, blockSize>>>(a, b, c3);
    cudaDeviceSynchronize();

    cudaFree(a);
    cudaFree(b);
    cudaFree(c1);
    cudaFree(c2);
    cudaFree(c3);
}
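For context, th_single_elem_add takes the three float* arguments seen in the profiler output and performs an element-wise addition, roughly along these lines (a sketch, not the exact kernel body, which isn't important for the profiling question):

__global__ void th_single_elem_add(float *a, float *b, float *c) {
    // One thread per element of the flattened n*n matrix.
    // Note: with no n parameter there is no bounds check, so threads beyond
    // n*n (from the rounded-up grid) would index out of range.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    c[i] = a[i] + b[i];
}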
After that, I extracted the data initialization, kernel call, and data freeing into separate host functions and ran nvprof again. This time I got only information about API calls, like this:
==18460== Profiling application: .\a.exe
==18460== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
API calls: 81.86% 158.78ms 9 17.643ms 1.4000us 158.76ms cudaMallocManaged
0.17% 322.80us 97 3.3270us 100ns 158.00us cuDeviceGetAttribute
0.11% 214.50us 1 214.50us 214.50us 214.50us cuModuleUnload
0.04% 68.600us 3 22.866us 7.3000us 39.400us cudaDeviceSynchronize
0.01% 12.100us 9 1.3440us 400ns 7.9000us cudaFree
0.00% 7.7000us 1 7.7000us 7.7000us 7.7000us cuDeviceGetPCIBusId
0.00% 2.1000us 3 700ns 300ns 1.0000us cuDeviceGetCount
0.00% 2.0000us 2 1.0000us 300ns 1.7000us cuDeviceGet
0.00% 1.2000us 3 400ns 300ns 500ns cudaLaunchKernel
0.00% 700ns 1 700ns 700ns 700ns cuDeviceGetName
0.00% 300ns 1 300ns 300ns 300ns cuDeviceGetUuid
0.00% 300ns 1 300ns 300ns 300ns cuDeviceGetLuid
As you can see, there is also no Unified Memory profiling result section, so I tried running nvprof like this: nvprof.exe --unified-memory-profiling off .\a.exe, but got the same result. Source code for that:
void add_elem(int n) {
    float *a, *b, *c1;
    cudaMallocManaged(&a, n * n * sizeof(float));
    cudaMallocManaged(&b, n * n * sizeof(float));
    cudaMallocManaged(&c1, n * n * sizeof(float));

    for (int i = 0; i < n * n; i++) {
        a[i] = 1.0f;
        b[i] = 2.0f;
        c1[i] = 0.0f;
    }

    int blockSize = 256;
    int numBlocks = (n * n + blockSize - 1) / blockSize;
    th_single_elem_add<<<numBlocks, blockSize>>>(a, b, c1);
    cudaDeviceSynchronize();

    cudaFree(a);
    cudaFree(b);
    cudaFree(c1);
}

void add_row(int n) {
    float *a, *b, *c1;
    cudaMallocManaged(&a, n * n * sizeof(float));
    cudaMallocManaged(&b, n * n * sizeof(float));
    cudaMallocManaged(&c1, n * n * sizeof(float));

    for (int i = 0; i < n * n; i++) {
        a[i] = 1.0f;
        b[i] = 2.0f;
        c1[i] = 0.0f;
    }

    int blockSize = 256;
    int numBlocks = (n + blockSize - 1) / blockSize;
    th_single_row_add<<<numBlocks, blockSize>>>(a, b, c1, n);
    cudaDeviceSynchronize();

    cudaFree(a);
    cudaFree(b);
    cudaFree(c1);
}

void add_col(int n) {
    float *a, *b, *c1;
    cudaMallocManaged(&a, n * n * sizeof(float));
    cudaMallocManaged(&b, n * n * sizeof(float));
    cudaMallocManaged(&c1, n * n * sizeof(float));

    for (int i = 0; i < n * n; i++) {
        a[i] = 1.0f;
        b[i] = 2.0f;
        c1[i] = 0.0f;
    }

    int blockSize = 256;
    int numBlocks = (n + blockSize - 1) / blockSize;
    th_single_col_add<<<numBlocks, blockSize>>>(a, b, c1, n);
    cudaDeviceSynchronize();

    cudaFree(a);
    cudaFree(b);
    cudaFree(c1);
}
UPDATE: I found the problem. I was running the code with 10000000000 elements in the array, and it seems the kernels were not even called: with 10000000 (10^8) elements it took nearly 3 seconds to finish, while with 10000000000 (10^10) elements it finished immediately. But there is no error or anything. How should I catch such cases?
cuda-memcheck. – Robert Crovella
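In addition to running the binary under cuda-memcheck, such failures can be caught in code by checking the return status of every CUDA runtime call and of the kernel launch itself. A minimal sketch (the CHECK_CUDA helper name is just illustrative):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort with a message if a CUDA runtime call does not return cudaSuccess.
#define CHECK_CUDA(call)                                              \
    do {                                                              \
        cudaError_t err__ = (call);                                   \
        if (err__ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                    cudaGetErrorString(err__), __FILE__, __LINE__);   \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Usage around the allocation and the launch:
//   CHECK_CUDA(cudaMallocManaged(&a, n * n * sizeof(float)));
//   th_single_elem_add<<<numBlocks, blockSize>>>(a, b, c1);
//   CHECK_CUDA(cudaGetLastError());       // reports invalid launch configurations
//   CHECK_CUDA(cudaDeviceSynchronize());  // reports errors raised during kernel execution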