Below is the output of my nvprof run. I'm trying to understand what the
"API calls" section means. The first entry under "API calls" (cudaMalloc)
takes 4.67456 s, which is much longer than the first entry under
"GPU activities" (nms_forward_kernel, 97.765 ms in total).
Why is that?
==25972== Profiling application: python view.py
==25972== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 98.62% 97.765ms 16999 5.7510us 2.6560us 11.744us _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::nms_forward_kernel(float*, float const *, float, int, int)
1.09% 1.0835ms 90 12.039us 992ns 48.799us [CUDA memcpy HtoD]
0.06% 58.240us 5 11.648us 11.392us 12.256us void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, int>(int*, int)
0.06% 56.352us 2 28.176us 26.720us 29.632us void thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>*, bool=0 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**>)
0.05% 52.672us 3 17.557us 16.576us 19.136us void thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>*, bool=1 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**>)
0.03% 27.136us 1 27.136us 27.136us 27.136us _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::data_preprocess_kernel(float const *, float*, int, int*)
0.03% 26.527us 2 13.263us 13.216us 13.311us void thrust::cuda_cub::cub::DeviceRadixSortUpsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, bool=0*, thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *>)
0.02% 19.744us 3 6.5810us 5.4720us 8.5120us void thrust::cuda_cub::cub::DeviceRadixSortUpsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *>)
0.02% 18.528us 2 9.2640us 9.0880us 9.4400us [CUDA memcpy DtoH]
0.01% 8.2240us 1 8.2240us 8.2240us 8.2240us _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::data_postprocess_kernel(float const *, float*, int, int*)
0.00% 3.7120us 1 3.7120us 3.7120us 3.7120us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>(thrust::device_ptr<int>, int)
0.00% 3.3600us 1 3.3600us 3.3600us 3.3600us void kernelPointwiseApply1<TensorFillOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorFillOp<float>, float, unsigned int>, float, float)
0.00% 2.9760us 1 2.9760us 2.9760us 2.9760us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<int*, int*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<int>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<int*, int*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<int>, thrust::cuda_cub::__transform::always_true_predicate>, long>(int*, thrust::cuda_cub::__transform::no_stencil_tag)
0.00% 2.5600us 1 2.5600us 2.5600us 2.5600us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<float*, float*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<float*, float*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>(float*, thrust::cuda_cub::__transform::no_stencil_tag)
0.00% 2.3680us 1 2.3680us 2.3680us 2.3680us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__tabulate::functor<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::system::detail::generic::sequence_detail::sequence_functor<int>, long>, long>, thrust::cuda_cub::__tabulate::functor<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::system::detail::generic::sequence_detail::sequence_functor<int>, long>, long>(thrust::device_ptr<int>, thrust::detail::normal_iterator<thrust::device_ptr<int>>)
API calls: 69.38% 4.67456s 8 584.32ms 21.948us 4.66813s cudaMalloc
19.85% 1.33738s 1 1.33738s 1.33738s 1.33738s cudaDeviceReset
6.85% 461.19ms 16999 27.130us 4.3450us 2.3428ms cudaStreamCreate
2.18% 146.78ms 17019 8.6240us 5.5850us 590.15us cudaLaunchKernel
0.78% 52.472ms 16998 3.0860us 2.3880us 491.82us cudaEventRecord
0.48% 32.347ms 16998 1.9030us 1.6020us 579.51us cudaStreamWaitEvent
0.41% 27.471ms 16998 1.6160us 1.0150us 501.06us cudaEventCreate
0.02% 1.0187ms 47 21.674us 8.9530us 82.099us cudaMemcpyAsync
0.01% 859.57us 45 19.101us 6.6610us 60.919us cudaMemcpy
0.01% 737.22us 47 15.685us 3.5030us 54.214us cudaStreamSynchronize
0.01% 513.43us 278 1.8460us 427ns 69.612us cuDeviceGetAttribute
0.01% 391.43us 430 910ns 571ns 12.840us cudaGetDevice
0.01% 353.59us 3 117.86us 116.03us 120.19us cuDeviceTotalMem
0.00% 258.63us 2 129.32us 128.63us 130.00us cudaFree
0.00% 223.59us 2 111.79us 95.946us 127.64us cudaGetDeviceProperties
0.00% 139.32us 147 947ns 715ns 7.0800us cudaSetDevice
0.00% 130.12us 240 542ns 390ns 2.9830us cudaGetDeviceCount
0.00% 113.01us 3 37.669us 23.669us 49.539us cuDeviceGetName
0.00% 101.80us 1 101.80us 101.80us 101.80us cudaDeviceSynchronize
0.00% 67.069us 2 33.534us 27.864us 39.205us cudaLaunch
0.00% 22.799us 6 3.7990us 2.7200us 6.9700us cudaFuncGetAttributes
0.00% 12.063us 12 1.0050us 822ns 1.9320us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
0.00% 11.027us 23 479ns 403ns 754ns cudaPeekAtLastError
0.00% 5.5760us 5 1.1150us 493ns 2.9760us cuDeviceGetCount
0.00% 4.6710us 2 2.3350us 1.3820us 3.2890us cuInit
0.00% 4.6090us 6 768ns 683ns 1.0360us cudaDeviceGetAttribute
0.00% 3.9340us 1 3.9340us 3.9340us 3.9340us cuDeviceGetPCIBusId
0.00% 3.5570us 5 711ns 463ns 1.1720us cudaSetupArgument
0.00% 3.0960us 4 774ns 446ns 1.2680us cuDeviceGet
0.00% 3.0570us 2 1.5280us 1.2220us 1.8350us cudaConfigureCall
0.00% 2.2150us 2 1.1070us 975ns 1.2400us cuDriverGetVersion
0.00% 624ns 1 624ns 624ns 624ns cudaGetLastError
0.00% 526ns 1 526ns 526ns 526ns cuDeviceGetUuid