I have a program that does a lot of single-precision math. It produces correct results if I compile for the 1.0 architecture (sm_10), but the results are broken for the 2.x and 3.x architectures. What would cause this?
Included below: a long code sample, then the compile command with good output, and the compile command with bad output.
If I run the same routine on the CPU using gcc, I get results that match the sm_10 build.
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
/*
* svdcomp - SVD decomposition routine.
* Takes an mxn matrix a and decomposes it into udv, where u,v are
* left and right orthogonal transformation matrices, and d is a
* diagonal matrix of singular values.
*
* This routine is adapted from svdecomp.c in XLISP-STAT 2.1 which is
* code from Numerical Recipes adapted by Luke Tierney and David Betz.
 * Originally from: "Numerical Recipes in C: The Art of Scientific Computing",
 * Press, Flannery, Teukolsky, Vetterling. 1992.
*
 * Input to svd is as follows:
* a = mxn matrix to be decomposed, gets overwritten with u
* m = row dimension of a
* n = column dimension of a
* w = returns the vector of singular values of a
* v = returns the right orthogonal transformation matrix
*/
#define SIGN(a, b) ((b) >= 0.0f ? fabsf(a) : -fabsf(a))
#define MIN(x,y) ( (x) < (y) ? (x) : (y) )
#define MAX(x,y) ((x)>(y)?(x):(y))
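/* Minimal CUDA error checking: print the failing call or status and abort. */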
#define PERR(call) \
if (call) {\
fprintf(stderr, "%s:%d Error [%s] on "#call"\n", __FILE__, __LINE__,\
cudaGetErrorString(cudaGetLastError()));\
exit(1);\
}
#define ERRCHECK \
if (cudaPeekAtLastError()) { \
fprintf(stderr, "%s:%d Error [%s]\n", __FILE__, __LINE__,\
cudaGetErrorString(cudaGetLastError()));\
exit(1);\
}
__device__ int
svd(float *a, int m, int n, float *w, float *v, int skip_u)
{
int flag, i, its, j, jj, k, l, nm;
float c, f, h, s, x, y, z;
float anorm = 0.0f, g = 0.0f, scale = 0.0f;
float rv1[3];
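    /* NOTE: workspace sized for the 3x3 case exercised below;
       a general-n version would need rv1[n] */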
/* Householder reduction to bidiagonal form */
for (i = 0; i < n; i++)
{
/* left-hand reduction */
l = i + 1;
rv1[i] = scale * g;
g = s = scale = 0.0f;
if (i < m)
{
for (k = i; k < m; k++)
scale += fabsf(a[k*n+i]);
if (scale)
{
for (k = i; k < m; k++)
{
a[k*n+i] /= scale;
s += powf(a[k*n+i], 2);
}
f = a[i*n+i];
g = -SIGN(sqrtf(s), f);
h = f * g - s;
a[i*n+i] = f - g;
if (i != n - 1)
{
for (j = l; j < n; j++)
{
for (s = 0.0f, k = i; k < m; k++)
s += a[k*n+i] * a[k*n+j];
f = s / h;
for (k = i; k < m; k++)
a[k*n+j] += f * a[k*n+i];
}
}
for (k = i; k < m; k++)
a[k*n+i] *= scale;
}
}
w[i] = scale * g;
/* right-hand reduction */
g = s = scale = 0.0f;
if (i < m && i != n - 1)
{
for (k = l; k < n; k++)
scale += fabsf(a[i*n+k]);
if (scale)
{
for (k = l; k < n; k++)
{
a[i*n+k] /= scale;
s += powf(a[i*n+k], 2);
}
f = a[i*n+l];
g = -SIGN(sqrtf(s), f);
h = f * g - s;
a[i*n+l] = f - g;
for (k = l; k < n; k++)
rv1[k] = a[i*n+k] / h;
if (i != m - 1)
{
for (j = l; j < m; j++)
{
for (s = 0.0f, k = l; k < n; k++)
s += a[j*n+k] * a[i*n+k];
for (k = l; k < n; k++)
a[j*n+k] += s * rv1[k];
}
}
for (k = l; k < n; k++)
a[i*n+k] *= scale;
}
}
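        /* track the largest |w[i]| + |rv1[i]|; used below as the
           magnitude reference for the convergence tests */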
anorm = MAX(anorm, fabsf(w[i]) + fabsf(rv1[i]));
}
/* accumulate the right-hand transformation */
for (i = n - 1; i >= 0; i--)
{
if (i < n - 1)
{
if (g)
{
for (j = l; j < n; j++)
v[j*n+i] = (a[i*n+j] / a[i*n+l]) / g;
/* float division to avoid underflow */
for (j = l; j < n; j++)
{
for (s = 0.0f, k = l; k < n; k++)
s += a[i*n+k] * v[k*n+j];
for (k = l; k < n; k++)
v[k*n+j] += s * v[k*n+i];
}
}
for (j = l; j < n; j++)
v[i*n+j] = v[j*n+i] = 0.0f;
}
v[i*n+i] = 1.0f;
g = rv1[i];
l = i;
}
/* accumulate the left-hand transformation */
if (!skip_u) {
for (i = n - 1; i >= 0; i--)
{
l = i + 1;
g = w[i];
if (i < n - 1)
for (j = l; j < n; j++)
a[i*n+j] = 0.0f;
if (g)
{
g = 1.0f / g;
if (i != n - 1)
{
for (j = l; j < n; j++)
{
for (s = 0.0f, k = l; k < m; k++)
s += a[k*n+i] * a[k*n+j];
f = (s / a[i*n+i]) * g;
for (k = i; k < m; k++)
a[k*n+j] += f * a[k*n+i];
}
}
for (j = i; j < m; j++)
a[j*n+i] = a[j*n+i]*g;
}
else
{
for (j = i; j < m; j++)
a[j*n+i] = 0.0f;
}
++a[i*n+i];
}
}
/* diagonalize the bidiagonal form */
for (k = n - 1; k >= 0; k--)
{ /* loop over singular values */
for (its = 0; its < 30; its++)
{ /* loop over allowed iterations */
flag = 1;
for (l = k; l >= 0; l--)
{ /* test for splitting */
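            /* rv1[l] counts as negligible when adding it to anorm
               leaves anorm unchanged in float arithmetic */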
nm = l - 1;
if (fabsf(rv1[l]) + anorm == anorm)
{
flag = 0;
break;
}
if (fabsf(w[nm]) + anorm == anorm)
break;
}
if (flag)
{
c = 0.0f;
s = 1.0f;
for (i = l; i <= k; i++)
{
f = s * rv1[i];
if (fabsf(f) + anorm != anorm)
{
g = w[i];
h = hypotf(f, g);
w[i] = h;
h = 1.0f / h;
c = g * h;
s = (- f * h);
if (!skip_u) {
for (j = 0; j < m; j++)
{
y = a[j*n+nm];
z = a[j*n+i];
a[j*n+nm] = y * c + z * s;
a[j*n+i] = z * c - y * s;
}
}
}
}
}
z = w[k];
if (l == k)
{ /* convergence */
if (z < 0.0f)
{ /* make singular value nonnegative */
w[k] = -z;
for (j = 0; j < n; j++)
v[j*n+k] = -v[j*n+k];
}
break;
}
            if (its >= 29) {
                /* no convergence in 30 iterations; the original
                   Numerical Recipes code raises an error here */
                return 1;
            }
/* shift from bottom 2 x 2 minor */
x = w[l];
nm = k - 1;
y = w[nm];
g = rv1[nm];
h = rv1[k];
f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0f * h * y);
g = hypotf(f, 1.0f);
f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x;
/* next QR transformation */
c = s = 1.0f;
for (j = l; j <= nm; j++)
{
i = j + 1;
g = rv1[i];
y = w[i];
h = s * g;
g = c * g;
z = hypotf(f, h);
rv1[j] = z;
c = f / z;
s = h / z;
f = x * c + g * s;
g = g * c - x * s;
h = y * s;
y = y * c;
for (jj = 0; jj < n; jj++)
{
x = v[jj*n+j];
z = v[jj*n+i];
v[jj*n+j] = x * c + z * s;
v[jj*n+i] = z * c - x * s;
}
z = hypotf(f, h);
w[j] = z;
if (z)
{
z = 1.0f / z;
c = f * z;
s = h * z;
}
f = (c * g) + (s * y);
x = (c * y) - (s * g);
if (!skip_u) {
for (jj = 0; jj < m; jj++)
{
y = a[jj*n+j];
z = a[jj*n+i];
a[jj*n+j] = y * c + z * s;
a[jj*n+i] = z * c - y * s;
}
}
}
rv1[l] = 0.0f;
rv1[k] = f;
w[k] = x;
}
}
return(0);
}
__global__ void
svd_kernel(float *v)
{
float a[9], w[3];
a[0] = 8.0f;
a[1] = 3.0f;
a[2] = 7.0f;
a[3] = 7.0f;
a[4] = 9.0f;
a[5] = 1.0f;
a[6] = 3.0f;
a[7] = 7.0f;
a[8] = 2.0f;
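    /* row-major 3x3 test matrix; skip_u = 1, so only the singular
       values w and the right transformation v are computed */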
svd(a, 3, 3, w, v, 1);
}
int main()
{
int i, j;
float *v_d, v[9];
PERR(cudaMalloc(&v_d, 9*sizeof(float)));
svd_kernel<<<1,1>>>(v_d);
cudaDeviceSynchronize();
ERRCHECK;
PERR(cudaMemcpy(v, v_d, 9*sizeof(float), cudaMemcpyDeviceToHost));
for (i = 0; i < 3; i++) {
for (j = 0; j < 3; j++) {
printf("%6.3f\t", v[i*3+j]);
}
printf("\n");
}
return 0;
}
Correct Results:
$ nvcc -arch=sm_10 -o svd svd.cu
$ ./svd
-0.657 -0.685 0.314
-0.668 0.337 -0.664
-0.349 0.646 0.679
Broken Results:
$ nvcc -arch=sm_20 -o svd svd.cu
$ ./svd
-0.661 -0.660 0.356
-0.642 0.253 -0.724
0.019 0.460 0.888
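One known behavioral difference between these targets (an assumption as to relevance, not a confirmed diagnosis for this program): for sm_20 and later, nvcc contracts a*b + c into fused multiply-add (FFMA) instructions by default, and single-precision division and square root are IEEE-rounded, whereas sm_1x uses a truncating multiply-add (FMAD) and approximate division/square root. Bit-exact agreement between the two builds, or with gcc on the host, is therefore not expected in general. A minimal sketch of the contraction effect, with values chosen so the single rounding of the fused form is visible:
#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>
__global__ void fma_demo(float a, float b, float c, float *out)
{
    /* With -fmad=true (the default for sm_20+) the compiler may
       contract this into one FFMA, which rounds only once. */
    out[0] = a * b + c;
    /* Forced fused form: a*b kept exact, a single rounding at the end. */
    out[1] = __fmaf_rn(a, b, c);
    /* Forced unfused form: __fmul_rn/__fadd_rn are never contracted. */
    out[2] = __fadd_rn(__fmul_rn(a, b), c);
}
int main()
{
    /* a = b = 1 + 2^-12 and c = -(1 + 2^-11): the exact product
       1 + 2^-11 + 2^-24 needs one more bit than a float mantissa holds. */
    float a = 1.0f + exp2f(-12.0f);
    float c = -(1.0f + exp2f(-11.0f));
    float *d, h[3];
    cudaMalloc(&d, 3 * sizeof(float));
    fma_demo<<<1,1>>>(a, a, c, d);
    cudaMemcpy(h, d, 3 * sizeof(float), cudaMemcpyDeviceToHost);
    printf("contractable: %g  fused: %g  unfused: %g\n", h[0], h[1], h[2]);
    return 0;
}
The fused result is 2^-24 and the unfused result is 0; which one the contractable line matches depends on -fmad, and the truncating FMAD of sm_1x can differ from both in general.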
Comments:
"[…] sm_10 or sm_20. I tried fiddling with various nvcc floating point compile switches but didn't have any luck there narrowing down the behavior; I see the issue on CUDA 5.5." – Robert Crovella
"There is no mechanism for a write to a[] to not be reflected in a read from the same location, some time later. Even if that write has not percolated all the way out to global memory (e.g. it is in L1 or L2 cache), the subsequent read will pick up the most recently modified value. It does not bypass the cache and then go to global memory and get a stale value. I'm not sure what you're getting at, but many aspects of language support would be broken if that were not the case." – Robert Crovella
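For reference, the floating point switches mentioned in the first comment would presumably be variations along these lines (all documented nvcc options; per that comment, none of them changed the sm_20 result on CUDA 5.5):
$ nvcc -arch=sm_20 --fmad=false -o svd svd.cu
$ nvcc -arch=sm_20 --prec-div=true --prec-sqrt=true -o svd svd.cu
$ nvcc -arch=sm_20 --ftz=true -o svd svd.cu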