cuda-memcheck does not seem to terminate

Question

When I run cuda-memcheck on my ODE integrator for a certain parameter (the number of timesteps, "NUMPOINTS") is less than a certain value (10240), it runs fine. When I increase that value, it terminates with "GPUassert: unspecified launch failure." Further, when I run cuda-memcheck on it, it doesn't seem to terminate at all. I'm running MSVS2012, Windows 7, GTX Titan, 332.21 Drivers.

Searching around, I found http://www.linkedin.com/groups/Unspecified-launch-failure-How-can-1618517.S.69436913 , which suggests that it is caused by either:

1.) Segfault

2.) Kernel being "too long"

3.) Trying to use too much shared memory

I shouldn't be close to the global memory ceiling for my Titan, right? The 2D array that lives on the GPU in which I store all the data that I calculate in the integrator is 11264 x 8960 in size, where each element is a double. That would be 100,925,440 doubles for a total of 807,403,520 bytes, or 788MBytes and the Titan has 6GB.

Does my kernel length matter if I'm doing the integration on a headless GPU and have already run kernels of length ~1000s?

This parameter is not linked to my usage of shared memory.

EDIT

It's not minimal, but here it is.

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <fstream>
#include <iomanip>                      
#include <math.h>
using namespace std;

#define NUMBLOCKSPERGRID 35                             
#define NUMTHREADSPERBLOCK 256                          
#define MAXLENGTH NUMTHREADSPERBLOCK*NUMBLOCKSPERGRID   
#define NUMPOINTS 1024*11                //TROUBLEMAKER
double concStorage[NUMPOINTS][MAXLENGTH] = {};  

__device__ __constant__ int numThreads = NUMTHREADSPERBLOCK;    
__device__ __constant__ int numBlocks = NUMBLOCKSPERGRID;       
__device__ __constant__ int numpoints = NUMPOINTS;              
__device__ __constant__ int maxlength = MAXLENGTH;              
__device__ __constant__ double localError = 1E-12;              
__device__ __constant__ int nc = 2;                             
__device__ __constant__ int n2 = 0;                             
__device__ __constant__ double ka = 5E4;
__device__ __constant__ double kb = 0;
__device__ __constant__ double kp = 0;
__device__ __constant__ double km = 2E-8;
__device__ __constant__ double kn = 2E-5;
__device__ __constant__ double kn2 = 0;

__global__ void arrAdd(double*, double*, double*);
__global__ void arrSub(double*, double*, double*);
__global__ void arrMult(double*, double*, double*);
__global__ void arrDiv(double*, double*, double*);
__global__ void arrAbs(double*);
__global__ void arrInit(double*, double);
__global__ void arrInitToLengths(double*);
__global__ void arrCopy(double*, double*);
__global__ void arrMaxKernel(double*, double*, double*, int);
__device__ double arrSum(double*);
__device__ void arrMultAddStore(double , double*, double*, double*, double*);
__device__ int arrLength(double*);
__device__ void arrMax(double*, double*, int*);

__device__ __constant__ double a21 = static_cast<double>(.25);
__device__ __constant__ double a31 = static_cast<double>(3)/static_cast<double>(32);
__device__ __constant__ double a32 = static_cast<double>(9)/static_cast<double>(32);
__device__ __constant__ double a41 = static_cast<double>(1932)/static_cast<double>(2197);
__device__ __constant__ double a42 = static_cast<double>(-7200)/static_cast<double>(2197);
__device__ __constant__ double a43 = static_cast<double>(7296)/static_cast<double>(2197);
__device__ __constant__ double a51 = static_cast<double>(439)/static_cast<double>(216);
__device__ __constant__ double a52 = static_cast<double>(-8);
__device__ __constant__ double a53 = static_cast<double>(3680)/static_cast<double>(513);
__device__ __constant__ double a54 = static_cast<double>(-845)/static_cast<double>(4104);
__device__ __constant__ double a61 = static_cast<double>(-8)/static_cast<double>(27);
__device__ __constant__ double a62 = static_cast<double>(2);
__device__ __constant__ double a63 = static_cast<double>(-3544)/static_cast<double>(2565);
__device__ __constant__ double a64 = static_cast<double>(1859)/static_cast<double>(4104);
__device__ __constant__ double a65 = static_cast<double>(-11)/static_cast<double>(40);

__device__ double temp1[MAXLENGTH];
__device__ double temp2[MAXLENGTH];
__device__ double temp3[MAXLENGTH];
__device__ double temp4[MAXLENGTH];
__device__ double tempsum[MAXLENGTH];
__device__ double k1s[MAXLENGTH];
__device__ double k2s[MAXLENGTH];
__device__ double k3s[MAXLENGTH];
__device__ double k4s[MAXLENGTH];
__device__ double k5s[MAXLENGTH];
__device__ double k6s[MAXLENGTH];

void printColumnText(string , double*, double [NUMPOINTS][MAXLENGTH]);
__global__ void rkf5(size_t, double*, double* , double*, double*);
__global__ void calcK(double*, double*, double*);
__device__ void calcKs(double*, double*);
__global__ void calcFlux(double*, double*, double*);
__device__ void calcMonomerFlux(double*, double*, double*);
__device__ void calcStepSize(double*, double*, double*, int*);
__global__ void takeFourthOrderStep(double*, double*, double*, double*, double*, double*, double*);
__global__ void takeFifthOrderStep(double*, double*, double*, double*, double*, double*, double*, double*);
__device__ double flux(int, double*);
__device__ double knowles_flux(int, double*);
__device__ void zeroTemps();
__global__ void storeConcs(double*, size_t, double*, int);
__device__ void storeTime(double*, double, int);



#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    //Error checking
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
int main(int argc, char** argv)
{
    //Main program.
    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
    cudaSetDevice(0);
    std::cout << std::fixed;
    std::cout << std::setprecision(16);

    const int numpoints = NUMPOINTS;
    const int maxlength = MAXLENGTH;
    double mo = 5E-6;
    double to = 0;
    double tf = 7200;
    double dt = (tf-to)/static_cast<double>(numpoints);
    string filename = "ItWorks.dat";

    double concs[maxlength] = {};
    double ts[numpoints]= {};

    std::cout<<dt;
    std::cout<<"\n";
    concs[0]=mo;
    std::cout<<concs[0];
    std::cout<<" ";

    concs[0]=mo;
    std::cout<<"\n";

    double *d_concStorage;
    double *d_concs;
    double *d_dt;
    double *d_to;
    double *d_tf;
    double *d_ts;

    size_t size_concs = sizeof(concs);
    size_t size_dt = sizeof(dt);
    size_t size_to = sizeof(to);
    size_t size_tf = sizeof(tf);
    size_t size_ts = sizeof(ts);
    size_t h_pitch = maxlength*sizeof(double);
    size_t d_pitch;

    gpuErrchk(cudaMallocPitch( (void**)&d_concStorage, &d_pitch, maxlength * sizeof(double), numpoints)); 

    gpuErrchk(cudaMalloc((void**)&d_concs, size_concs));
    gpuErrchk(cudaMalloc((void**)&d_dt, size_dt));
    gpuErrchk(cudaMalloc((void**)&d_to, size_to));
    gpuErrchk(cudaMalloc((void**)&d_tf, size_tf));
    gpuErrchk(cudaMalloc((void**)&d_ts, size_ts));

    gpuErrchk(cudaMemcpy2D(d_concStorage, d_pitch, concStorage, h_pitch, maxlength*sizeof(double), numpoints, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_concs, &concs, size_concs, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_dt, &dt, size_dt, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_to, &to, size_to, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_tf, &tf, size_tf, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_ts, &ts, size_ts, cudaMemcpyHostToDevice));

    rkf5<<<1,1>>>(d_pitch, d_concStorage, d_concs, d_dt, d_ts);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    //Copy 2D array of concentrations vs. time from GPU to Host.
    gpuErrchk( cudaMemcpy2D(concStorage, h_pitch, d_concStorage, d_pitch, maxlength*sizeof(double), numpoints, cudaMemcpyDeviceToHost) );   
    gpuErrchk( cudaMemcpy(&ts, d_ts, size_ts, cudaMemcpyDeviceToHost));
    cudaDeviceSynchronize();

    printColumnText(filename, ts, concStorage);

    cudaDeviceReset();
    return 0;
}
void printColumnText(string filename, double ts[NUMPOINTS], double concStorage[NUMPOINTS][MAXLENGTH])
{
    ofstream myfile2;
    myfile2.open (filename);
    myfile2 << std::fixed << std::setprecision(16);
    for(int j=0; j < NUMPOINTS; j++)
    {
        for (int i=0; i < (MAXLENGTH+1); i++)
        {
            if (i == 0)
            {
                myfile2 << std::fixed << std::setprecision(16) << ts[j];
                //std::cout<<ts[j];
                myfile2 << "\t";
            }
            else
            {
                myfile2 << std::fixed << std::setprecision(16) << concStorage[j][i-1];
                //std::cout<<concStorage[j][i-1];
                myfile2 << "\t";
            }
        }
        myfile2 <<"\n";
    }
    myfile2.close();

}
__global__ void rkf5(size_t pitch, double* concStorage, double* concs, double* dt, double* d_ts)
{
    zeroTemps();
    double currentTime = 0;         //This can be generalized for a different start time.

    for(int k = 0; k < numpoints; k++)
    {
        double internalCounter = 0;
        double error = localError + 1;  //Ensure adaptive step size loop happens at least once per timestep.
        int errorIdx = -1;              //Used to do something.
        zeroTemps();

        storeConcs<<< numBlocks, numThreads >>>(concStorage, pitch, concs, k);  //Store this step's concentrations in 2D array
        cudaDeviceSynchronize();

        while (error > localError)
        {
            internalCounter++;
            calcKs(concs, dt);
            cudaDeviceSynchronize();

            calcStepSize(concs, dt, &error, &errorIdx); //temp1 = 4th Order guess, temp2 = 5th Order guess
            cudaDeviceSynchronize();

            if (error > localError)
            {
                //*dt = .5* (*dt);
                *dt = pow((localError/error),(.2))*(*dt);
            }
            else if (error < localError)
            {
                //if (error < .75 * localError)
                    *dt = pow((localError/error),(.2))*(*dt);
                    //*dt = 1.25*(*dt); 
            }
            //*/
        }
        currentTime += (*dt);
        storeTime(d_ts, currentTime, k);

        cudaDeviceSynchronize();
        arrCopy<<< numBlocks, numThreads >>>(concs, temp2);  //Probably not necessary if I find way to handle storing IC's better.
        cudaDeviceSynchronize();
    }
}

__device__ void calcStepSize(double* concs, double* dt, double* error, int* errorIdx)
{
    takeFourthOrderStep<<< numBlocks, numThreads >>>(temp1, concs, k1s, k2s, k3s, k4s, k5s);
    takeFifthOrderStep<<< numBlocks, numThreads >>>(temp2, concs, k1s, k2s, k3s, k4s, k5s, k6s);
    cudaDeviceSynchronize();
    arrSub<<< numBlocks, numThreads >>>(temp1, temp2, temp3);
    cudaDeviceSynchronize();
    arrAbs<<< numBlocks, numThreads >>>(temp3);
    cudaDeviceSynchronize();
    arrMax(temp3, error, errorIdx);
    cudaDeviceSynchronize();
}
__device__ void calcKs(double* concs, double *dt)
{
    zeroTemps();
    calcFlux<<< numBlocks, numThreads >>>(concs, temp2, dt);
    cudaDeviceSynchronize();
    calcMonomerFlux(temp2, temp1, dt);
    cudaDeviceSynchronize();
    calcK<<< numBlocks, numThreads >>>(k1s, temp2, dt);
    cudaDeviceSynchronize();

    zeroTemps();                                                        //temp1 = temp2 = tempsum = 0
    arrMultAddStore(a21, temp1, tempsum, k1s, concs);                                                   //tempsum = a21*k1
    arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum);                                               //tempsum = concs + a21*k1    
    cudaDeviceSynchronize();
    calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt);      //temp2 = fluxes
    cudaDeviceSynchronize();
    calcMonomerFlux(temp2, temp1, dt);                                                      //temp1 = r * fluxes, temp2 = fluxes (complete)
    cudaDeviceSynchronize();
    calcK<<< numBlocks, numThreads >>>(k2s, temp2, dt);                                                     //k2s = fluxes*dt
    cudaDeviceSynchronize();

    zeroTemps();
    arrMultAddStore(a31, temp1, tempsum, k1s, concs);
    arrMultAddStore(a32, temp1, tempsum, k2s, concs);
    arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum);
    cudaDeviceSynchronize();

    calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt);
    cudaDeviceSynchronize();
    calcMonomerFlux(temp2, temp1, dt);
    cudaDeviceSynchronize();
    calcK<<< numBlocks, numThreads >>>(k3s, temp2, dt);
    cudaDeviceSynchronize();

    zeroTemps();
    arrMultAddStore(a41, temp1, tempsum, k1s, concs);
    arrMultAddStore(a42, temp1, tempsum, k2s, concs);
    arrMultAddStore(a43, temp1, tempsum, k3s, concs);
    arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum);
    cudaDeviceSynchronize();

    calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt);
    cudaDeviceSynchronize();
    calcMonomerFlux(temp2, temp1, dt);
    cudaDeviceSynchronize();
    calcK<<< numBlocks, numThreads >>>(k4s, temp2, dt);
    cudaDeviceSynchronize(); 

    zeroTemps();
    arrMultAddStore(a51, temp1, tempsum, k1s, concs);
    arrMultAddStore(a52, temp1, tempsum, k2s, concs);
    arrMultAddStore(a53, temp1, tempsum, k3s, concs);
    arrMultAddStore(a54, temp1, tempsum, k4s, concs);
    arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum);
    cudaDeviceSynchronize();

    calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt);
    cudaDeviceSynchronize();
    calcMonomerFlux(temp2, temp1, dt);                                                      //temp1 = r * fluxes, temp2 = fluxes (complete)
    cudaDeviceSynchronize();
    calcK<<< numBlocks, numThreads >>>(k5s, temp2, dt);                                                         //k4s = fluxes*dt
    cudaDeviceSynchronize();

    zeroTemps();
    arrMultAddStore(a61, temp1, tempsum, k1s, concs);
    arrMultAddStore(a62, temp1, tempsum, k2s, concs);
    arrMultAddStore(a63, temp1, tempsum, k3s, concs);
    arrMultAddStore(a64, temp1, tempsum, k4s, concs);
    arrMultAddStore(a65, temp1, tempsum, k5s, concs);
    arrAdd<<< numBlocks, numThreads >>>(concs, tempsum, tempsum);
    cudaDeviceSynchronize();

    calcFlux<<< numBlocks, numThreads >>>(tempsum, temp2, dt);          //k6 = dt * flux (concs + a61*k1 + a62*k2 + a63*k3 + a64*k4 + a65*k5)
    cudaDeviceSynchronize();
    calcMonomerFlux(temp2, temp1, dt);                                                      //temp1 = r * fluxes, temp2 = fluxes (complete)
    cudaDeviceSynchronize();
    calcK<<< numBlocks, numThreads >>>(k6s, temp2, dt);                                                         //k4s = fluxes*dt
    cudaDeviceSynchronize(); //Sync here because kernel continues onto next line before k1 finished
    //At this point, temp1 and tempsum are maxlength dimension arrays that are able to be used for other things.
}
__global__ void takeFourthOrderStep(double* y4, double* concs, double* k1s, double* k2s, double* k3s, double* k4s, double* k5s)
{
    //takeFourthOrderStep is going to overwrite the old temp1 array with the new array of concentrations that result from a 4th order step.  This kernel is meant to be launched 
    // with as many threads as there are discrete concentrations to be tracked.
    double b41 = static_cast<double>(25)/static_cast<double>(216);
    double b42 = static_cast<double>(0);
    double b43 = static_cast<double>(1408)/static_cast<double>(2565);
    double b44 = static_cast<double>(2197)/static_cast<double>(4104);
    double b45 = static_cast<double>(-1)/static_cast<double>(5);
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    y4[idx] = concs[idx] + b41 * k1s[idx] + b42 * k2s[idx] + b43 * k3s[idx] + b44 * k4s[idx] + b45 * k5s[idx];
}
__global__ void takeFifthOrderStep(double* y5, double* concs, double* k1s, double* k2s, double* k3s, double* k4s, double* k5s, double* k6s)
{
    //takeFifthOrderStep is going to overwrite the old array of concentrations with the new array of concentrations.  As of now, this will be the 5th order step.  Another function can be d
    //defined that will take a fourth order step if that is interesting for any reason.  This kernel is meant to be launched with as many threads as there are discrete concentrations
    //to be tracked.
    //Store b values in register? Constants?
    double b51 = static_cast<double>(16)/static_cast<double>(135);
    double b52 = static_cast<double>(0);
    double b53 = static_cast<double>(6656)/static_cast<double>(12825);
    double b54 = static_cast<double>(28561)/static_cast<double>(56430);
    double b55 = static_cast<double>(-9)/static_cast<double>(50);
    double b56 = static_cast<double>(2)/static_cast<double>(55);
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    y5[idx] = concs[idx] + b51 * k1s[idx] + b52 * k2s[idx] + b53 * k3s[idx] + b54 * k4s[idx] + b55 * k5s[idx] + b56 * k6s[idx];
}
__device__ void zeroTemps()
{
    //Initializes all the temporary storage arrays to 0.
    //Tested, works.
    arrInit<<< numBlocks, numThreads >>>(temp1, 0);
    arrInit<<< numBlocks, numThreads >>>(temp2, 0);
    arrInit<<< numBlocks, numThreads >>>(temp3, 0);
    arrInit<<< numBlocks, numThreads >>>(temp4, 0);
    arrInit<<< numBlocks, numThreads >>>(tempsum, 0);
    cudaDeviceSynchronize();
}
//storeConcs takes the current array of concentrations and stores it in the cId'th column of the 2D concStorage array
//pitch = memory size of a row
//cId = the row of cS I want to store concs in.
__global__ void storeConcs(double* cS, size_t pitch, double* concs, int cId)
{
    //int bIdx = blockIdx.x;
    int tIdx = blockDim.x * blockIdx.x + threadIdx.x;
    //cS is basically the memory address of the first element of the flattened (1D) 2D array.
    double* row = (double*)((char*)cS + cId * pitch);
    row[tIdx] = concs[tIdx];
}
__device__ void storeTime(double* timeArray, double timeValue, int k)
{
    timeArray[k] = timeValue;
}
//Perhaps I can optimize by using shared memory to hold conc values.
__global__ void calcFlux(double* concs, double* fluxes, double* dt)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    fluxes[idx]=knowles_flux(idx, concs);
    //fluxes[idx]=flux(idx, concs);
}
__global__ void calcK(double* ks, double* fluxes, double* dt)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    ks[idx]=(*dt)*fluxes[idx];
}
//This function calculates separatemely the flux of the monomer species.
//Tested, works.
__device__ void calcMonomerFlux(double* fluxes, double* lengths, double* dt)
{
    arrInitToLengths<<< numBlocks, numThreads >>>(lengths);         //lengths = 1,2,3,4,5...maxlength
    cudaDeviceSynchronize();
    arrMult<<< numBlocks, numThreads >>>(fluxes, lengths, lengths); //lengths = r * fluxes[r]
    cudaDeviceSynchronize();
    fluxes[0]=-static_cast<double>(1)*arrSum(lengths);              //fluxes[0] = -1*sum (r* fluxes[r])
}
//Placeholder function for the flux calculation.  It will take the size of the oligomer and current concentrations as inputs.
__device__ double flux(int r, double *concs) 
{
    return -concs[r];
}
//I need to use constants and replace these for loops with dynamic reductions.
__device__ double knowles_flux(int r, double *conc)
{
    double frag_term = 0;
    double flux = 0;
    if (r == ((maxlength)-1))
        {
        flux = -km*(r)*conc[r]+2*(ka)*conc[r-1]*conc[0];
        }
    else if (r > ((nc)-1))
        {
        for (int s = r+1; s < (maxlength); s++)
            {
            frag_term += conc[s];
            }
        flux = -(km)*(r)*conc[r] + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0] + 2*(ka)*conc[r-1]*conc[0];
        }
    else if (r == ((nc)-1))
        {
        for (int s = r+1; s < (maxlength); s++)
            {
            frag_term += conc[s];
            }
        flux = (kn)*pow(conc[0],(nc)) + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0];
        }
    else if (r < ((nc)-1))
        {
        flux = 0;
        }
    return flux;
}
//Adds two arrays (a + b) element by element and stores the result in array c.
__global__ void arrAdd(double* a, double* b, double* c)
{                                                 
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    c[idx]=a[idx]+b[idx];
}
//Subtracts two arrays (a - b) element by element and stores the result in array c.
__global__ void arrSub(double* a, double* b, double* c)
{                                                 
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    c[idx]=a[idx]-b[idx];
}
//Multiplies two arrays (a * b) element by element and stores the result in array c.
__global__ void arrMult(double* a, double* b, double* c)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    c[idx]=a[idx]*b[idx];
}
//Divides two arrays (a / b) element by element and stores the result in array c.
__global__ void arrDiv(double* a, double* b, double* c)
{                                                 
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    c[idx]=a[idx]/b[idx];
}
__global__ void arrAbs(double* a)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    a[idx] = abs(a[idx]);
}
//Initializes an array a to double value b.
__global__ void arrInit(double* a, double b)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    a[idx]=b;
}
//Initializes an array a to the values of counting numbers.  Tested, works.
__global__ void arrInitToLengths(double* a)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    a[idx]=idx+1;
}
//__global__ void arr2DInit(double* a, )
__global__ void arrReverseInitToLengths(double* a)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    a[idx]=1000-idx;
}
//Copies array b onto array a.
__global__ void arrCopy(double* a, double* b)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    a[idx]=b[idx];
}
//Sums all elements of array. Tested, works.
__device__ double arrSum(double* a)
{
    double sum = 0;
    for (int i = 0; i < maxlength; i++) 
    {
        sum += a[i];
    }
    return sum;
}
//This function multiplies a tableau value by the corresponding k array and adds the result to tempsum.  Used to
//add all the a*k terms. concs not necessary
//e.g. arrMultAddStore(a21, temp1, tempsum, k1s, concs, maxlength) => tempsum = a21 * k1
__device__ void arrMultAddStore(double tableauValue, double *temp1, double *tempsum, double *ks, double *concs) 
{
    //Sets tempsum to tabVal * k
    arrInit<<< numBlocks, numThreads >>>(temp1, tableauValue);      //Set [temp1] to tableau value, temp1 = a
    cudaDeviceSynchronize();
    arrMult<<< numBlocks, numThreads >>>(ks, temp1, temp1);         //Multiply tableau value by appropriate [k], temp1 = a*k
    cudaDeviceSynchronize();
    arrAdd<<< numBlocks, numThreads >>>(tempsum, temp1, tempsum);   //Move tabVal*k to [tempsum], tempsum = tempsum+temp1
    cudaDeviceSynchronize();
    //temp1 = tableauValue * kArray
    //tempsum = current sum (tableauValue * kArray)
}
__device__ int arrLength(double* arr)
{
    return sizeof(arr)/sizeof(arr[0]);
}
__device__ void arrMax(double* arr, double* maxVal, int* maxIdx )
{
    //int maxIdxID = 0;
    int maxThreads = 1024;
    int blocks = int(maxlength/maxThreads)+1;   //works

    double* kernelMaxes= new double[blocks];
    double* blockMaxes= new double[1];
    double* kernelIdxs= new double[blocks];
    double* blockIdxs= new double[1];
    double* temp= new double[blocks];

    arrInit<<< 1, blocks >>>(kernelMaxes, 0);   //works
    arrInit<<< 1, 1 >>>(blockMaxes, 0); //works
    arrInitToLengths<<< 1, blocks >>>(kernelIdxs);  //works
    arrInit<<< 1, 1 >>>(blockIdxs, 0);  //works
    arrInit<<< 1, blocks >>>(temp, 1);
    cudaDeviceSynchronize();
    arrSub<<< 1, blocks >>>(kernelIdxs, temp, kernelIdxs);  //kernel Idxs now initted to index
    cudaDeviceSynchronize();

    arrMaxKernel<<< blocks, maxThreads, maxThreads*sizeof(double) >>>(arr, kernelMaxes, kernelIdxs, maxlength);
    cudaDeviceSynchronize();
    arrMaxKernel<<< 1, blocks, blocks*sizeof(double) >>>(kernelMaxes, blockMaxes, blockIdxs, blocks);
    cudaDeviceSynchronize();
    *maxVal = blockMaxes[0];
    *maxIdx = blockIdxs[0];
}
__global__ void arrMaxKernel(double* arr, double* maxes, double* idxs, int length)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    //int maxIdx = 0;
    extern __shared__ double blockMemory[];

    if (idx < length)
    {
        blockMemory[threadIdx.x] = arr[idx];
        //blockMemory2[threadIdx.x] = idxs[idx];
    }
    else
    {
        blockMemory[threadIdx.x] = 0;
        //blockMemory2[threadIdx.x] = -1;
    }

    __syncthreads();

    int stage = 0;
    int maxStage = static_cast<int>(logf(blockDim.x)/logf(2));  //logf needed for CUDA

    while (stage <= maxStage)
    {
        int left = threadIdx.x;         
        int right = (threadIdx.x) + powf(2, (stage));

        if (( right < blockDim.x ) && ( left % int(powf(2, stage)) == 0 ))
        {
            if ( (blockMemory[right] > blockMemory[left]) )
            {
                blockMemory[left] = blockMemory[right];
                //blockMemory2[left] = blockMemory2[right];
            }
        }
        stage++;
        __syncthreads();
    }

    maxes[blockIdx.x] = blockMemory[0];
    //idxs[blockIdx.x] = blockMemory2[0];
}

EDIT

So this is what a 13hr run of CUDA memcheck told me. So now I can see which of my functions is writing illegally...but this almost makes the problem more mysterious to me. The parameter that I'm changing only modifies the size of a 2D global array and the number of times my integrator iterates. What I'm wondering now, is if I can just run NSIGHT CUDA debugger until it finds the error and then I can look at the values that are causing it? I have memcheck enabled, so I assume it will take ANOTHER 13 hrs if not longer, haha.

cuda-memcheck output

Running an application under cuda-memcheck can make it take very much longer than an ordinary run, like 20x longer, in some cases. If it doesn't seem to terminate under cuda-memcheck, you probably haven't waited long enough. I assume the GPUAssert message is coming from your application (cuda error checking) so you should start debugging that by focusing on what is going on at that point in the code. — Robert Crovella
Is there a way to do that if the cuda error checking is pointing to the launch of a parent kernel in which 98% of my entire application takes place? I've been trying to use the NSIGHT CUDA debugger without much success. — Hair of Slytherin
How about using printf ? It's tedious, but not impossible. — Robert Crovella
Like the CUDA version of printf? Anything I'd be debugging would be inside the master thread. — Hair of Slytherin
You can put printf statements anywhere you want, in host code, device code, parent kernels, child kernels. — Robert Crovella

Hair of Slytherin Hair of Slytherin · Accepted Answer · 2014-01-19T23:11:55

If anyone comes across this with the same question, Robert Crovella is indeed right. You just need to run cuda-memcheck longer. It took me 13 hours and it did help me figure out the bug.

cuda-memcheck does not seem to terminate

1 Answers