I have CUDA/C++ code that returns host-side C++ arrays. I wanted to manipulate these arrays in MATLAB, so I rewrote the code as a MEX function and compiled it with mex.
I got it to work by passing preallocated arrays from MATLAB into the MEX function, but this slowed things down dramatically: 54 seconds with MEX versus 14 seconds without.
Here's the slow solution for a simplified, no-input, one-output version of my code:
#include "mex.h"
#include "gpu/mxGPUArray.h"
#include "matrix.h"
#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include "curand.h"
#include <cuda_runtime.h>
#include "math.h"
#include <curand_kernel.h>
#include <time.h>
#include <algorithm>
#include <iostream>
#define iterations 159744
#define transMatrixSize 2592 // For clarity only; no need to adjust this value for this simulation.
#define reps 1024            // Equal to the block size. Do not change without corresponding source code adjustments.
#define integralStep 13125   // Number of time steps averaged at the tail of the force-time curves to get the steady-state force.
__global__ void kern(float *masterForces, ...)
{
    // Flatten the 2-D grid / 2-D block coordinates into a single global thread index
    int globalIdx = ((blockIdx.x + (blockIdx.y * gridDim.x)) * (blockDim.x * blockDim.y))
                  + (threadIdx.x + (threadIdx.y * blockDim.x));
    ...
    ...
    {
        ...
        {
            masterForces[i] = buffer[0] / 24576.0;
        }
    }
}
...
}
}
void mexFunction(int nlhs, mxArray *plhs[],
                 int nrhs, mxArray const *prhs[])
{
    ...
    // Output: iterations-by-1 single-precision vector; MATLAB owns this memory
    plhs[0] = mxCreateNumericMatrix(iterations, 1, mxSINGLE_CLASS, mxREAL);
    float *h_F0 = (float*) mxGetData(plhs[0]);

    // Device output vector
    float *d_F0;
    ...

    // Allocate memory for each vector on the GPU
    cudaMalloc((void**)&d_F0, iterations * sizeof(float));
    ...

    // -------------------- Launch --------------------
    kern<<<1, 1024>>>(d_F0);

    // -------------------- Retrieve data --------------------
    cudaMemcpyAsync(h_F0, d_F0, iterations * sizeof(float), cudaMemcpyDeviceToHost);

    // -------------------- Free memory --------------------
    cudaDeviceReset();
}
Why so slow?
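For reference, here is a minimal self-contained sketch of the same no-input, one-output pattern, but with a synchronous cudaMemcpy, basic error checking, and an explicit cudaFree instead of cudaDeviceReset. The fillForces kernel and checkCuda helper are placeholders made up for the sketch, not my real code:

#include "mex.h"
#include <cuda_runtime.h>

#define iterations 159744

// Placeholder kernel: one block of threads strides over the output and writes a dummy value.
__global__ void fillForces(float *masterForces, int n)
{
    for (int i = threadIdx.x; i < n; i += blockDim.x)
        masterForces[i] = 0.0f;
}

// Minimal error check so CUDA failures surface as MATLAB errors instead of silent garbage.
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
        mexErrMsgIdAndTxt("kern:cuda", "%s failed: %s", what, cudaGetErrorString(err));
}

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[])
{
    // MATLAB owns this buffer; do not free or delete it here.
    plhs[0] = mxCreateNumericMatrix(iterations, 1, mxSINGLE_CLASS, mxREAL);
    float *h_F0 = (float *) mxGetData(plhs[0]);

    float *d_F0 = NULL;
    checkCuda(cudaMalloc((void **)&d_F0, iterations * sizeof(float)), "cudaMalloc");

    fillForces<<<1, 1024>>>(d_F0, iterations);
    checkCuda(cudaGetLastError(), "kernel launch");

    // Synchronous copy: blocks until the kernel and the transfer are done.
    checkCuda(cudaMemcpy(h_F0, d_F0, iterations * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy");

    cudaFree(d_F0);
}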
EDIT: mex was compiling for an older GPU architecture (sm_13) instead of sm_35. Now the timing makes sense: 16 s with MEX versus 14 s with plain CUDA/C++.
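A quick way to confirm which compute capability the card actually supports (and therefore which -arch=sm_XX flag nvcc should get wherever mex picks up its compiler settings) is a small standalone query along these lines; this is just a sketch, not part of the MEX file:

#include <cstdio>
#include <cuda_runtime.h>

// Prints the compute capability of each visible GPU so the matching
// -arch=sm_XX value can be used when compiling the MEX file.
int main()
{
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        std::printf("No CUDA devices found.\n");
        return 1;
    }
    for (int d = 0; d < count; ++d) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, d);
        std::printf("Device %d: %s, compute capability %d.%d\n",
                    d, prop.name, prop.major, prop.minor);
    }
    return 0;
}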
Comments:
… gpuArray input and returns a gpuArray output. You want to take regular arrays in/out, right? – chappjc
… delete h_F0; when using mxCreateNumericMatrix. – chappjc
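If the output should instead come back to MATLAB as a gpuArray, as chappjc's comment suggests, the pattern looks roughly like the sketch below, using the mxGPUArray API from gpu/mxGPUArray.h; the placeholder kernel just zero-fills the buffer and stands in for the real force computation:

#include "mex.h"
#include "gpu/mxGPUArray.h"

#define iterations 159744

// Placeholder for the real kernel: zero-fill the output from one block of threads.
__global__ void kern(float *masterForces, int n)
{
    for (int i = threadIdx.x; i < n; i += blockDim.x)
        masterForces[i] = 0.0f;
}

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[])
{
    // Initialize the MathWorks GPU API before any mxGPUArray calls.
    mxInitGPU();

    // Allocate the output directly on the GPU.
    mwSize dims[2] = {iterations, 1};
    mxGPUArray *F0 = mxGPUCreateGPUArray(2, dims, mxSINGLE_CLASS, mxREAL,
                                         MX_GPU_DO_NOT_INITIALIZE);
    float *d_F0 = (float *) mxGPUGetData(F0);

    kern<<<1, 1024>>>(d_F0, iterations);

    // Hand the device buffer back to MATLAB as a gpuArray; no device-to-host copy needed.
    plhs[0] = mxGPUCreateMxArrayOnGPU(F0);
    mxGPUDestroyGPUArray(F0);
}

The main benefit of this variant is that the data stays on the GPU until MATLAB gathers it, so the device-to-host copy disappears entirely.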