I am trying to profile my CUDA program, using the nvprof tool.
Here is my code:
#include <iostream>
#include <math.h>
#include <cuda_profiler_api.h>
// Kernel function to add the elements of two arrays
__global__
void add(int n, float *x, float *y)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride)
y[i] = x[i] + y[i];
}
int main(void)
{
int N = 1<<10;
float *x, *y;
// Allocate Unified Memory – accessible from CPU or GPU
cudaMallocManaged(&x, N*sizeof(float));
cudaMallocManaged(&y, N*sizeof(float));
// initialize x and y arrays on the host
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
// Run kernel on 1M elements on the GPU
int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize;
add<<<numBlocks, blockSize>>>(N, x, y);
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
// Check for errors (all values should be 3.0f)
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(y[i]-3.0f));
std::cout << "Max error: " << maxError << std::endl;
// Free memory
cudaFree(x);
cudaFree(y);
cudaProfilerStop();
cudaDeviceReset();
return 0;
}
I compiled it using the command nvcc add.cu -o add_cuda.
I then run it (as root) using nvprof ./add_cuda --unified-memory-profiling off or nvprof and get the following output:
==15318== NVPROF is profiling process 15318, command: ./add_cuda
Max error: 0
==15318== Profiling application: ./add_cuda
==15318== Profiling result:
No kernels were profiled.
No API activities were profiled.
==15318== Warning: Some profiling data are not recorded. Make sure cudaProfilerStop() or cuProfilerStop() is called before application exit to flush profile data.
======== Error: Application received signal 139
I searched online for a solution nvprof not picking up any API calls or kernels, https://devtalk.nvidia.com/default/topic/1010691/visual-profiler/nvprof-error-code-139-but-memcheck-ok/, but really nothing helped.
How can i get nvprof to work?
Thanks!
Fedora 29 64-bit
nvprof: NVIDIA (R) Cuda command line profiler
Copyright (c) 2012 - 2019 NVIDIA Corporation
Release version 10.1.168 (21)
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Wed_Apr_24_19:10:27_PDT_2019
Cuda compilation tools, release 10.1, V10.1.168
nvidia-smi
Mon Jul 1 13:24:54 2019
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.26 Driver Version: 430.26 CUDA Version: 10.2 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce GTX 108... Off | 00000000:03:00.0 On | N/A |
| 0% 37C P8 20W / 250W | 253MiB / 11175MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 1741 G /usr/libexec/Xorg 154MiB |
| 0 2161 G cinnamon 96MiB |
+-----------------------------------------------------------------------------+