I'm porting a C++ program to CUDA; the calculations are all matrix/vector operations. The first function I ported is the matrix FFT. After porting it to CUDA, I found that data transfer between the CPU and GPU takes almost all of the time.
// Interface: FFT-shift then inverse-FFT an nRows x nCols matrix in place.
//
// ptrDest: in/out host buffer holding the matrix (column-major std::complex<double>).
// Returns 0 unconditionally (errors from the helper stages are not yet propagated —
// NOTE(review): consider returning a status or checking cudaGetLastError() per stage).
//
// Pipeline: H2D copy + column-major -> row-major transpose, fft-shift on the GPU
// buffer, cuFFT inverse transform, then row-major -> column-major + D2H copy.
extern "C" int cu_inv_fft_shift(std::complex<double>* ptrDest, int nRows, int nCols) {
#ifdef ENABLE_DEBUG_TIME_MEASURE
    // BUG FIX: the per-stage timing used to be guarded by a mix of
    // ENABLE_DEBUG_TIME_MEASURE and ENABLE_DEBUG_TIME_MEASURE2. With only
    // ENABLE_DEBUG_TIME_MEASURE defined, stopEvent was never recorded for
    // step 1 (so ms1 was read uninitialized — it was also the only one of the
    // four not zero-initialized) and ms2/ms3 were never measured, while ms4
    // measured from a stale startEvent. A single macro now guards every stage.
    float ms1 = 0.f, ms2 = 0.f, ms3 = 0.f, ms4 = 0.f;
    cudaEvent_t startEvent, stopEvent;
    cudaEventCreate(&startEvent);
    cudaEventCreate(&stopEvent);
#endif

    // step1: cpu -> gpu, and column-major -> row-major
#ifdef ENABLE_DEBUG_TIME_MEASURE
    cudaEventRecord(startEvent, 0);
#endif
    cufftDoubleComplex* ptr_data = matrix_to_cu_data(ptrDest, nRows, nCols);
#ifdef ENABLE_DEBUG_TIME_MEASURE
    cudaEventRecord(stopEvent, 0);
    // Synchronizing on stopEvent alone is sufficient: startEvent was recorded
    // earlier on the same stream, so it has necessarily completed first.
    cudaEventSynchronize(stopEvent);
    cudaEventElapsedTime(&ms1, startEvent, stopEvent);
#endif

    // step2: do shift on gpu buffer
#ifdef ENABLE_DEBUG_TIME_MEASURE
    cudaEventRecord(startEvent, 0);
#endif
    ptr_data = fft_shift_cd(ptr_data, nRows, nCols);
#ifdef ENABLE_DEBUG_TIME_MEASURE
    cudaEventRecord(stopEvent, 0);
    cudaEventSynchronize(stopEvent);
    cudaEventElapsedTime(&ms2, startEvent, stopEvent);
#endif

    // step3: do inverse FFT on gpu buffer
#ifdef ENABLE_DEBUG_TIME_MEASURE
    cudaEventRecord(startEvent, 0);
#endif
    ptr_data = do_fft_cd(ptr_data, nRows, nCols, CUFFT_INVERSE);
#ifdef ENABLE_DEBUG_TIME_MEASURE
    cudaEventRecord(stopEvent, 0);
    cudaEventSynchronize(stopEvent);
    cudaEventElapsedTime(&ms3, startEvent, stopEvent);
#endif

    // step4: row-major -> column-major, and gpu -> cpu
#ifdef ENABLE_DEBUG_TIME_MEASURE
    cudaEventRecord(startEvent, 0);
#endif
    ptr_data = cu_data_to_matrix_inv(ptrDest, nRows, nCols, ptr_data);
#ifdef ENABLE_DEBUG_TIME_MEASURE
    cudaEventRecord(stopEvent, 0);
    cudaEventSynchronize(stopEvent);
    cudaEventElapsedTime(&ms4, startEvent, stopEvent);

    cudaEventDestroy(startEvent);
    cudaEventDestroy(stopEvent);
    // Stage timings: H2D+transpose, shift, inverse FFT, transpose+D2H.
    printf("%s: %.4fms, %.4fms, %.4fms, %.4fms\n", __func__, ms1, ms2, ms3, ms4);
#endif

    // Release the device buffer (the helpers above return the live pointer).
    cudaFree(ptr_data);
    return 0;
}
The measured result when the matrix is 8192x8192:
cu_fwd_fft_shift: 4.2841ms, 0.7394ms, 0.0492ms, 4.2857ms
That is (verified):
- CPU->GPU transfer: 4.2 ms
- forward FFT: 0.7 ms
- FFT shift: 0.05 ms
- GPU->CPU transfer: 4.2 ms
The problem I encountered is this: in a CPU function there are some code snippets (like the FFT) that could be ported to CUDA, but between them there is "logic code" — if/else branches and intermediate memory allocations.
I want to reduce CPU<->GPU data transfer. My idea is to port a whole CPU function to the GPU side, but it contains a lot of this logic code (if/else, intermediate mallocs).
So my questions are:
- Is it possible to designate one core as a master (acting like a CPU) to handle these mallocs / "logic code" and dispatch the subsequent calculations to all the other cores?
- Are there any other CUDA projects I can study? Or
- Is this approach simply impossible?