When I run cuda-memcheck on my ODE integrator for a certain parameter (the number of timesteps, "NUMPOINTS") is less than a certain value (10240), it runs fine. When I increase that value, it terminates with "GPUassert: unspecified launch failure." Further, when I run cuda-memcheck on it, it doesn't seem to terminate at all. I'm running MSVS2012, Windows 7, GTX Titan, 332.21 Drivers.
Searching around, I found http://www.linkedin.com/groups/Unspecified-launch-failure-How-can-1618517.S.69436913 , which suggests that it is caused by either:
1.) Segfault
2.) Kernel being "too long"
3.) Trying to use too much shared memory
I shouldn't be close to the global memory ceiling for my Titan, right? The 2D array that lives on the GPU in which I store all the data that I calculate in the integrator is 11264 x 8960 in size, where each element is a double. That would be 100,925,440 doubles for a total of 807,403,520 bytes, or 788MBytes and the Titan has 6GB.
Does my kernel length matter if I'm doing the integration on a headless GPU and have already run kernels of length ~1000s?
This parameter is not linked to my usage of shared memory.
EDIT
It's not minimal, but here it is.
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <fstream>
#include <iomanip>
#include <math.h>
using namespace std;
#define NUMBLOCKSPERGRID 35
#define NUMTHREADSPERBLOCK 256
#define MAXLENGTH NUMTHREADSPERBLOCK*NUMBLOCKSPERGRID
#define NUMPOINTS 1024*11 //TROUBLEMAKER
double concStorage[NUMPOINTS][MAXLENGTH] = {};
__device__ __constant__ int numThreads = NUMTHREADSPERBLOCK;
__device__ __constant__ int numBlocks = NUMBLOCKSPERGRID;
__device__ __constant__ int numpoints = NUMPOINTS;
__device__ __constant__ int maxlength = MAXLENGTH;
__device__ __constant__ double localError = 1E-12;
__device__ __constant__ int nc = 2;
__device__ __constant__ int n2 = 0;
__device__ __constant__ double ka = 5E4;
__device__ __constant__ double kb = 0;
__device__ __constant__ double kp = 0;
__device__ __constant__ double km = 2E-8;
__device__ __constant__ double kn = 2E-5;
__device__ __constant__ double kn2 = 0;
__global__ void arrAdd(double*, double*, double*);
__global__ void arrSub(double*, double*, double*);
__global__ void arrMult(double*, double*, double*);
__global__ void arrDiv(double*, double*, double*);
__global__ void arrAbs(double*);
__global__ void arrInit(double*, double);
__global__ void arrInitToLengths(double*);
__global__ void arrCopy(double*, double*);
__global__ void arrMaxKernel(double*, double*, double*, int);
__device__ double arrSum(double*);
__device__ void arrMultAddStore(double , double*, double*, double*, double*);
__device__ int arrLength(double*);
__device__ void arrMax(double*, double*, int*);
__device__ __constant__ double a21 = static_cast<double>(.25);
__device__ __constant__ double a31 = static_cast<double>(3)/static_cast<double>(32);
__device__ __constant__ double a32 = static_cast<double>(9)/static_cast<double>(32);
__device__ __constant__ double a41 = static_cast<double>(1932)/static_cast<double>(2197);
__device__ __constant__ double a42 = static_cast<double>(-7200)/static_cast<double>(2197);
__device__ __constant__ double a43 = static_cast<double>(7296)/static_cast<double>(2197);
__device__ __constant__ double a51 = static_cast<double>(439)/static_cast<double>(216);
__device__ __constant__ double a52 = static_cast<double>(-8);
__device__ __constant__ double a53 = static_cast<double>(3680)/static_cast<double>(513);
__device__ __constant__ double a54 = static_cast<double>(-845)/static_cast<double>(4104);
__device__ __constant__ double a61 = static_cast<double>(-8)/static_cast<double>(27);
__device__ __constant__ double a62 = static_cast<double>(2);
__device__ __constant__ double a63 = static_cast<double>(-3544)/static_cast<double>(2565);
__device__ __constant__ double a64 = static_cast<double>(1859)/static_cast<double>(4104);
__device__ __constant__ double a65 = static_cast<double>(-11)/static_cast<double>(40);
__device__ double temp1[MAXLENGTH];
__device__ double temp2[MAXLENGTH];
__device__ double temp3[MAXLENGTH];
__device__ double temp4[MAXLENGTH];
__device__ double tempsum[MAXLENGTH];
__device__ double k1s[MAXLENGTH];
__device__ double k2s[MAXLENGTH];
__device__ double k3s[MAXLENGTH];
__device__ double k4s[MAXLENGTH];
__device__ double k5s[MAXLENGTH];
__device__ double k6s[MAXLENGTH];
void printColumnText(string , double*, double [NUMPOINTS][MAXLENGTH]);
__global__ void rkf5(size_t, double*, double* , double*, double*);
__global__ void calcK(double*, double*, double*);
__device__ void calcKs(double*, double*);
__global__ void calcFlux(double*, double*, double*);
__device__ void calcMonomerFlux(double*, double*, double*);
__device__ void calcStepSize(double*, double*, double*, int*);
__global__ void takeFourthOrderStep(double*, double*, double*, double*, double*, double*, double*);
__global__ void takeFifthOrderStep(double*, double*, double*, double*, double*, double*, double*, double*);
__device__ double flux(int, double*);
__device__ double knowles_flux(int, double*);
__device__ void zeroTemps();
__global__ void storeConcs(double*, size_t, double*, int);
__device__ void storeTime(double*, double, int);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
//Error checking
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
int main(int argc, char** argv)
{
//Main program.
cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
cudaSetDevice(0);
std::cout << std::fixed;
std::cout << std::setprecision(16);
const int numpoints = NUMPOINTS;
const int maxlength = MAXLENGTH;
double mo = 5E-6;
double to = 0;
double tf = 7200;
double dt = (tf-to)/static_cast<double>(numpoints);
string filename = "ItWorks.dat";
double concs[maxlength] = {};
double ts[numpoints]= {};
std::cout<<dt;
std::cout<<"\n";
concs[0]=mo;
std::cout<<concs[0];
std::cout<<" ";
concs[0]=mo;
std::cout<<"\n";
double *d_concStorage;
double *d_concs;
double *d_dt;
double *d_to;
double *d_tf;
double *d_ts;
size_t size_concs = sizeof(concs);
size_t size_dt = sizeof(dt);
size_t size_to = sizeof(to);
size_t size_tf = sizeof(tf);
size_t size_ts = sizeof(ts);
size_t h_pitch = maxlength*sizeof(double);
size_t d_pitch;
gpuErrchk(cudaMallocPitch( (void**)&d_concStorage, &d_pitch, maxlength * sizeof(double), numpoints));
gpuErrchk(cudaMalloc((void**)&d_concs, size_concs));
gpuErrchk(cudaMalloc((void**)&d_dt, size_dt));
gpuErrchk(cudaMalloc((void**)&d_to, size_to));
gpuErrchk(cudaMalloc((void**)&d_tf, size_tf));
gpuErrchk(cudaMalloc((void**)&d_ts, size_ts));
gpuErrchk(cudaMemcpy2D(d_concStorage, d_pitch, concStorage, h_pitch, maxlength*sizeof(double), numpoints, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_concs, &concs, size_concs, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_dt, &dt, size_dt, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_to, &to, size_to, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_tf, &tf, size_tf, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_ts, &ts, size_ts, cudaMemcpyHostToDevice));
rkf5<<<1,1>>>(d_pitch, d_concStorage, d_concs, d_dt, d_ts);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
//Copy 2D array of concentrations vs. time from GPU to Host.
gpuErrchk( cudaMemcpy2D(concStorage, h_pitch, d_concStorage, d_pitch, maxlength*sizeof(double), numpoints, cudaMemcpyDeviceToHost) );
gpuErrchk( cudaMemcpy(&ts, d_ts, size_ts, cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
printColumnText(filename, ts, concStorage);
cudaDeviceReset();
return 0;
}
void printColumnText(string filename, double ts[NUMPOINTS], double concStorage[NUMPOINTS][MAXLENGTH])
{
ofstream myfile2;
myfile2.open (filename);
myfile2 << std::fixed << std::setprecision(16);
for(int j=0; j < NUMPOINTS; j++)
{
for (int i=0; i < (MAXLENGTH+1); i++)
{
if (i == 0)
{
myfile2 << std::fixed << std::setprecision(16) << ts[j];
//std::cout<<ts[j];
myfile2 << "\t";
}
else
{
myfile2 << std::fixed << std::setprecision(16) << concStorage[j][i-1];
//std::cout<<concStorage[j][i-1];
myfile2 << "\t";
}
}
myfile2 <<"\n";
}
myfile2.close();
}
__global__ void rkf5(size_t pitch, double* concStorage, double* concs, double* dt, double* d_ts)
{
zeroTemps();
double currentTime = 0; //This can be generalized for a different start time.
for(int k = 0; k < numpoints; k++)
{
double internalCounter = 0;
double error = localError + 1; //Ensure adaptive step size loop happens at least once per timestep.
int errorIdx = -1; //Used to do something.
zeroTemps();
storeConcs<<< numBlocks, numThreads >>>(concStorage, pitch, concs, k); //Store this step's concentrations in 2D array
cudaDeviceSynchronize();
while (error > localError)
{
internalCounter++;
calcKs(concs, dt);
cudaDeviceSynchronize();
calcStepSize(concs, dt, &error, &errorIdx); //temp1 = 4th Order guess, temp2 = 5th Order guess
cudaDeviceSynchronize();
if (error > localError)
{
//*dt = .5* (*dt);
*dt = pow((localError/error),(.2))*(*dt);
}
else if (error < localError)
{
//if (error < .75 * localError)
*dt = pow((localError/error),(.2))*(*dt);
//*dt = 1.25*(*dt);
}
//*/
}
currentTime += (*dt);
storeTime(d_ts, currentTime, k);
cudaDeviceSynchronize();
arrCopy<<< numBlocks, numThreads >>>(concs, temp2); //Probably not necessary if I find way to handle storing IC's better.
cudaDeviceSynchronize();
}
}
__device__ void calcStepSize(double* concs, double* dt, double* error, int* errorIdx)
{
takeFourthOrderStep<<< numBlocks, numThreads >>>(temp1, concs, k1s, k2s, k3s, k4s, k5s);
takeFifthOrderStep<<< numBlocks, numThreads >>>(temp2, concs, k1s, k2s, k3s, k4s, k5s, k6s);
cudaDeviceSynchronize();
arrSub<<< numBlocks, numThreads >>>(temp1, temp2, temp3);
cudaDeviceSynchronize();
arrAbs<<< numBlocks, numThreads >>>(temp3);
cudaDeviceSynchronize();
arrMax(temp3, error, errorIdx);
cudaDeviceSynchronize();
}
__device__ void calcKs(double* concs, double *dt)
{
zeroTemps();
calcFlux<<< numBlocks, numThreads >>>(concs, temp2, dt);
cudaDeviceSynchronize();
calcMonomerFlux(temp2, temp1, dt);
cudaDeviceSynchronize();
calcK<<< numBlocks, numThreads >>>(k1s, temp2, dt);
cudaDeviceSynchronize();
zeroTemps(); //temp1 = temp2 = tempsum = 0
arrMultAddStore(a21, temp1, tempsum, k1s, concs); //tempsum = a21*k1
arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum); //tempsum = concs + a21*k1
cudaDeviceSynchronize();
calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt); //temp2 = fluxes
cudaDeviceSynchronize();
calcMonomerFlux(temp2, temp1, dt); //temp1 = r * fluxes, temp2 = fluxes (complete)
cudaDeviceSynchronize();
calcK<<< numBlocks, numThreads >>>(k2s, temp2, dt); //k2s = fluxes*dt
cudaDeviceSynchronize();
zeroTemps();
arrMultAddStore(a31, temp1, tempsum, k1s, concs);
arrMultAddStore(a32, temp1, tempsum, k2s, concs);
arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum);
cudaDeviceSynchronize();
calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt);
cudaDeviceSynchronize();
calcMonomerFlux(temp2, temp1, dt);
cudaDeviceSynchronize();
calcK<<< numBlocks, numThreads >>>(k3s, temp2, dt);
cudaDeviceSynchronize();
zeroTemps();
arrMultAddStore(a41, temp1, tempsum, k1s, concs);
arrMultAddStore(a42, temp1, tempsum, k2s, concs);
arrMultAddStore(a43, temp1, tempsum, k3s, concs);
arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum);
cudaDeviceSynchronize();
calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt);
cudaDeviceSynchronize();
calcMonomerFlux(temp2, temp1, dt);
cudaDeviceSynchronize();
calcK<<< numBlocks, numThreads >>>(k4s, temp2, dt);
cudaDeviceSynchronize();
zeroTemps();
arrMultAddStore(a51, temp1, tempsum, k1s, concs);
arrMultAddStore(a52, temp1, tempsum, k2s, concs);
arrMultAddStore(a53, temp1, tempsum, k3s, concs);
arrMultAddStore(a54, temp1, tempsum, k4s, concs);
arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum);
cudaDeviceSynchronize();
calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt);
cudaDeviceSynchronize();
calcMonomerFlux(temp2, temp1, dt); //temp1 = r * fluxes, temp2 = fluxes (complete)
cudaDeviceSynchronize();
calcK<<< numBlocks, numThreads >>>(k5s, temp2, dt); //k4s = fluxes*dt
cudaDeviceSynchronize();
zeroTemps();
arrMultAddStore(a61, temp1, tempsum, k1s, concs);
arrMultAddStore(a62, temp1, tempsum, k2s, concs);
arrMultAddStore(a63, temp1, tempsum, k3s, concs);
arrMultAddStore(a64, temp1, tempsum, k4s, concs);
arrMultAddStore(a65, temp1, tempsum, k5s, concs);
arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum);
cudaDeviceSynchronize();
calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt); //k6 = dt * flux (concs + a61*k1 + a62*k2 + a63*k3 + a64*k4 + a65*k5)
cudaDeviceSynchronize();
calcMonomerFlux(temp2, temp1, dt); //temp1 = r * fluxes, temp2 = fluxes (complete)
cudaDeviceSynchronize();
calcK<<< numBlocks, numThreads >>>(k6s, temp2, dt); //k4s = fluxes*dt
cudaDeviceSynchronize(); //Sync here because kernel continues onto next line before k1 finished
//At this point, temp1 and tempsum are maxlength dimension arrays that are able to be used for other things.
}
__global__ void takeFourthOrderStep(double* y4, double* concs, double* k1s, double* k2s, double* k3s, double* k4s, double* k5s)
{
//takeFourthOrderStep is going to overwrite the old temp1 array with the new array of concentrations that result from a 4th order step. This kernel is meant to be launched
// with as many threads as there are discrete concentrations to be tracked.
double b41 = static_cast<double>(25)/static_cast<double>(216);
double b42 = static_cast<double>(0);
double b43 = static_cast<double>(1408)/static_cast<double>(2565);
double b44 = static_cast<double>(2197)/static_cast<double>(4104);
double b45 = static_cast<double>(-1)/static_cast<double>(5);
int idx = blockIdx.x * blockDim.x + threadIdx.x;
y4[idx] = concs[idx] + b41 * k1s[idx] + b42 * k2s[idx] + b43 * k3s[idx] + b44 * k4s[idx] + b45 * k5s[idx];
}
__global__ void takeFifthOrderStep(double* y5, double* concs, double* k1s, double* k2s, double* k3s, double* k4s, double* k5s, double* k6s)
{
//takeFifthOrderStep is going to overwrite the old array of concentrations with the new array of concentrations. As of now, this will be the 5th order step. Another function can be d
//defined that will take a fourth order step if that is interesting for any reason. This kernel is meant to be launched with as many threads as there are discrete concentrations
//to be tracked.
//Store b values in register? Constants?
double b51 = static_cast<double>(16)/static_cast<double>(135);
double b52 = static_cast<double>(0);
double b53 = static_cast<double>(6656)/static_cast<double>(12825);
double b54 = static_cast<double>(28561)/static_cast<double>(56430);
double b55 = static_cast<double>(-9)/static_cast<double>(50);
double b56 = static_cast<double>(2)/static_cast<double>(55);
int idx = blockIdx.x * blockDim.x + threadIdx.x;
y5[idx] = concs[idx] + b51 * k1s[idx] + b52 * k2s[idx] + b53 * k3s[idx] + b54 * k4s[idx] + b55 * k5s[idx] + b56 * k6s[idx];
}
__device__ void zeroTemps()
{
//Initializes all the temporary storage arrays to 0.
//Tested, works.
arrInit<<< numBlocks, numThreads >>>(temp1, 0);
arrInit<<< numBlocks, numThreads >>>(temp2, 0);
arrInit<<< numBlocks, numThreads >>>(temp3, 0);
arrInit<<< numBlocks, numThreads >>>(temp4, 0);
arrInit<<< numBlocks, numThreads >>>(tempsum, 0);
cudaDeviceSynchronize();
}
//storeConcs takes the current array of concentrations and stores it in the cId'th column of the 2D concStorage array
//pitch = memory size of a row
//cId = the row of cS I want to store concs in.
__global__ void storeConcs(double* cS, size_t pitch, double* concs, int cId)
{
//int bIdx = blockIdx.x;
int tIdx = blockDim.x * blockIdx.x + threadIdx.x;
//cS is basically the memory address of the first element of the flattened (1D) 2D array.
double* row = (double*)((char*)cS + cId * pitch);
row[tIdx] = concs[tIdx];
}
__device__ void storeTime(double* timeArray, double timeValue, int k)
{
timeArray[k] = timeValue;
}
//Perhaps I can optimize by using shared memory to hold conc values.
__global__ void calcFlux(double* concs, double* fluxes, double* dt)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
fluxes[idx]=knowles_flux(idx, concs);
//fluxes[idx]=flux(idx, concs);
}
__global__ void calcK(double* ks, double* fluxes, double* dt)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
ks[idx]=(*dt)*fluxes[idx];
}
//This function calculates separatemely the flux of the monomer species.
//Tested, works.
__device__ void calcMonomerFlux(double* fluxes, double* lengths, double* dt)
{
arrInitToLengths<<< numBlocks, numThreads >>>(lengths); //lengths = 1,2,3,4,5...maxlength
cudaDeviceSynchronize();
arrMult<<< numBlocks, numThreads >>>(fluxes, lengths, lengths); //lengths = r * fluxes[r]
cudaDeviceSynchronize();
fluxes[0]=-static_cast<double>(1)*arrSum(lengths); //fluxes[0] = -1*sum (r* fluxes[r])
}
//Placeholder function for the flux calculation. It will take the size of the oligomer and current concentrations as inputs.
__device__ double flux(int r, double *concs)
{
return -concs[r];
}
//I need to use constants and replace these for loops with dynamic reductions.
__device__ double knowles_flux(int r, double *conc)
{
double frag_term = 0;
double flux = 0;
if (r == ((maxlength)-1))
{
flux = -km*(r)*conc[r]+2*(ka)*conc[r-1]*conc[0];
}
else if (r > ((nc)-1))
{
for (int s = r+1; s < (maxlength); s++)
{
frag_term += conc[s];
}
flux = -(km)*(r)*conc[r] + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0] + 2*(ka)*conc[r-1]*conc[0];
}
else if (r == ((nc)-1))
{
for (int s = r+1; s < (maxlength); s++)
{
frag_term += conc[s];
}
flux = (kn)*pow(conc[0],(nc)) + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0];
}
else if (r < ((nc)-1))
{
flux = 0;
}
return flux;
}
//Adds two arrays (a + b) element by element and stores the result in array c.
__global__ void arrAdd(double* a, double* b, double* c)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
c[idx]=a[idx]+b[idx];
}
//Subtracts two arrays (a - b) element by element and stores the result in array c.
__global__ void arrSub(double* a, double* b, double* c)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
c[idx]=a[idx]-b[idx];
}
//Multiplies two arrays (a * b) element by element and stores the result in array c.
__global__ void arrMult(double* a, double* b, double* c)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
c[idx]=a[idx]*b[idx];
}
//Divides two arrays (a / b) element by element and stores the result in array c.
__global__ void arrDiv(double* a, double* b, double* c)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
c[idx]=a[idx]/b[idx];
}
__global__ void arrAbs(double* a)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
a[idx] = abs(a[idx]);
}
//Initializes an array a to double value b.
__global__ void arrInit(double* a, double b)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
a[idx]=b;
}
//Initializes an array a to the values of counting numbers. Tested, works.
__global__ void arrInitToLengths(double* a)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
a[idx]=idx+1;
}
//__global__ void arr2DInit(double* a, )
__global__ void arrReverseInitToLengths(double* a)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
a[idx]=1000-idx;
}
//Copies array b onto array a.
__global__ void arrCopy(double* a, double* b)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
a[idx]=b[idx];
}
//Sums all elements of array. Tested, works.
__device__ double arrSum(double* a)
{
double sum = 0;
for (int i = 0; i < maxlength; i++)
{
sum += a[i];
}
return sum;
}
//This function multiplies a tableau value by the corresponding k array and adds the result to tempsum. Used to
//add all the a*k terms. concs not necessary
//e.g. arrMultAddStore(a21, temp1, tempsum, k1s, concs, maxlength) => tempsum = a21 * k1
__device__ void arrMultAddStore(double tableauValue, double *temp1, double *tempsum, double *ks, double *concs)
{
//Sets tempsum to tabVal * k
arrInit<<< numBlocks, numThreads >>>(temp1, tableauValue); //Set [temp1] to tableau value, temp1 = a
cudaDeviceSynchronize();
arrMult<<< numBlocks, numThreads >>>(ks, temp1, temp1); //Multiply tableau value by appropriate [k], temp1 = a*k
cudaDeviceSynchronize();
arrAdd<<< numBlocks, numThreads >>>(tempsum, temp1, tempsum); //Move tabVal*k to [tempsum], tempsum = tempsum+temp1
cudaDeviceSynchronize();
//temp1 = tableauValue * kArray
//tempsum = current sum (tableauValue * kArray)
}
__device__ int arrLength(double* arr)
{
return sizeof(arr)/sizeof(arr[0]);
}
__device__ void arrMax(double* arr, double* maxVal, int* maxIdx )
{
//int maxIdxID = 0;
int maxThreads = 1024;
int blocks = int(maxlength/maxThreads)+1; //works
double* kernelMaxes= new double[blocks];
double* blockMaxes= new double[1];
double* kernelIdxs= new double[blocks];
double* blockIdxs= new double[1];
double* temp= new double[blocks];
arrInit<<< 1, blocks >>>(kernelMaxes, 0); //works
arrInit<<< 1, 1 >>>(blockMaxes, 0); //works
arrInitToLengths<<< 1, blocks >>>(kernelIdxs); //works
arrInit<<< 1, 1 >>>(blockIdxs, 0); //works
arrInit<<< 1, blocks >>>(temp, 1);
cudaDeviceSynchronize();
arrSub<<< 1, blocks >>>(kernelIdxs, temp, kernelIdxs); //kernel Idxs now initted to index
cudaDeviceSynchronize();
arrMaxKernel<<< blocks, maxThreads, maxThreads*sizeof(double) >>>(arr, kernelMaxes, kernelIdxs, maxlength);
cudaDeviceSynchronize();
arrMaxKernel<<< 1, blocks, blocks*sizeof(double) >>>(kernelMaxes, blockMaxes, blockIdxs, blocks);
cudaDeviceSynchronize();
*maxVal = blockMaxes[0];
*maxIdx = blockIdxs[0];
}
__global__ void arrMaxKernel(double* arr, double* maxes, double* idxs, int length)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
//int maxIdx = 0;
extern __shared__ double blockMemory[];
if (idx < length)
{
blockMemory[threadIdx.x] = arr[idx];
//blockMemory2[threadIdx.x] = idxs[idx];
}
else
{
blockMemory[threadIdx.x] = 0;
//blockMemory2[threadIdx.x] = -1;
}
__syncthreads();
int stage = 0;
int maxStage = static_cast<int>(logf(blockDim.x)/logf(2)); //logf needed for CUDA
while (stage <= maxStage)
{
int left = threadIdx.x;
int right = (threadIdx.x) + powf(2, (stage));
if (( right < blockDim.x ) && ( left % int(powf(2, stage)) == 0 ))
{
if ( (blockMemory[right] > blockMemory[left]) )
{
blockMemory[left] = blockMemory[right];
//blockMemory2[left] = blockMemory2[right];
}
}
stage++;
__syncthreads();
}
maxes[blockIdx.x] = blockMemory[0];
//idxs[blockIdx.x] = blockMemory2[0];
}
EDIT
So this is what a 13hr run of CUDA memcheck told me. So now I can see which of my functions is writing illegally...but this almost makes the problem more mysterious to me. The parameter that I'm changing only modifies the size of a 2D global array and the number of times my integrator iterates. What I'm wondering now, is if I can just run NSIGHT CUDA debugger until it finds the error and then I can look at the values that are causing it? I have memcheck enabled, so I assume it will take ANOTHER 13 hrs if not longer, haha.
cuda-memcheck
can make it take very much longer than an ordinary run, like 20x longer, in some cases. If it doesn't seem to terminate undercuda-memcheck
, you probably haven't waited long enough. I assume theGPUAssert
message is coming from your application (cuda error checking) so you should start debugging that by focusing on what is going on at that point in the code. – Robert Crovellaprintf
? It's tedious, but not impossible. – Robert Crovellaprintf
statements anywhere you want, in host code, device code, parent kernels, child kernels. – Robert Crovella