We are experiencing problems while using cuSOLVER
's cusolverSpScsrlsvchol
function, probably due to misunderstanding of the cuSOLVER
library.
Motivation: we are solving the Poisson equation -divgrad x = b
on a rectangular grid. In 2
dimensions with a 5
-stencil (1, 1, -4, 1, 1)
, the Laplacian on the grid provides a (quite sparse) matrix A
. Moreover, the charge distribution on the grid gives a (dense) vector b
. A
is positive definite and symmetric.
Now we solve A * x = b
for x
using nvidia's new cuSOLVER
library that comes with CUDA 7.0 . It provides a function cusolverSpScsrlsvchol
which should do the sparse Cholesky factorisation for floats.
Note: we are able to correctly solve the system with the alternative sparse QR factorisation function cusolverSpScsrlsvqr
. For a 4 x 4
grid with all b
entries on the edge being 1
and the rest 0
, we get for x
:
1 1 0.999999 1 1 1 0.999999 1 1 1 1 1 1 1 1 1
Our problems:
cusolverSpScsrlsvchol
returns wrong results forx
:1 3.33333 2.33333 1 3.33333 2.33333 1.33333 1 2.33333 1.33333 0.666667 1 1 1 1 1
(solved, see answer below) Converting the CSR matrix
A
to a dense matrix and showing the output gives weird numbers (10^-44
and the like). The respective data from the CSR format are correct and validated with python numpy.(solved, see answer below) The alternative sparse
LU
and partial pivoting withcusolverSpScsrlsvlu
cannot even be found:$ nvcc -std=c++11 cusparse_test3.cu -o cusparse_test3 -lcusparse -lcusolver cusparse_test3.cu(208): error: identifier "cusolverSpScsrlsvlu" is undefined
What are we doing wrong? Thanks for your help!
Our C++ CUDA code:
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cusolverSp.h>
#include <cusparse.h>
#include <vector>
#include <cassert>
// create poisson matrix with Dirichlet bc. of a rectangular grid with
// dimension NxN
void assemble_poisson_matrix_coo(std::vector<float>& vals, std::vector<int>& row, std::vector<int>& col,
std::vector<float>& rhs, int Nrows, int Ncols) {
//nnz: 5 entries per row (node) for nodes in the interior
// 1 entry per row (node) for nodes on the boundary, since we set them explicitly to 1.
int nnz = 5*Nrows*Ncols - (2*(Ncols-1) + 2*(Nrows-1))*4;
vals.resize(nnz);
row.resize(nnz);
col.resize(nnz);
rhs.resize(Nrows*Ncols);
int counter = 0;
for(int i = 0; i < Nrows; ++i) {
for (int j = 0; j < Ncols; ++j) {
int idx = j + Ncols*i;
if (i == 0 || j == 0 || j == Ncols-1 || i == Nrows-1) {
vals[counter] = 1.;
row[counter] = idx;
col[counter] = idx;
counter++;
rhs[idx] = 1.;
// if (i == 0) {
// rhs[idx] = 3.;
// }
} else { // -laplace stencil
// above
vals[counter] = -1.;
row[counter] = idx;
col[counter] = idx-Ncols;
counter++;
// left
vals[counter] = -1.;
row[counter] = idx;
col[counter] = idx-1;
counter++;
// center
vals[counter] = 4.;
row[counter] = idx;
col[counter] = idx;
counter++;
// right
vals[counter] = -1.;
row[counter] = idx;
col[counter] = idx+1;
counter++;
// below
vals[counter] = -1.;
row[counter] = idx;
col[counter] = idx+Ncols;
counter++;
rhs[idx] = 0;
}
}
}
assert(counter == nnz);
}
int main() {
// --- create library handles:
cusolverSpHandle_t cusolver_handle;
cusolverStatus_t cusolver_status;
cusolver_status = cusolverSpCreate(&cusolver_handle);
std::cout << "status create cusolver handle: " << cusolver_status << std::endl;
cusparseHandle_t cusparse_handle;
cusparseStatus_t cusparse_status;
cusparse_status = cusparseCreate(&cusparse_handle);
std::cout << "status create cusparse handle: " << cusparse_status << std::endl;
// --- prepare matrix:
int Nrows = 4;
int Ncols = 4;
std::vector<float> csrVal;
std::vector<int> cooRow;
std::vector<int> csrColInd;
std::vector<float> b;
assemble_poisson_matrix_coo(csrVal, cooRow, csrColInd, b, Nrows, Ncols);
int nnz = csrVal.size();
int m = Nrows * Ncols;
std::vector<int> csrRowPtr(m+1);
// --- prepare solving and copy to GPU:
std::vector<float> x(m);
float tol = 1e-5;
int reorder = 0;
int singularity = 0;
float *db, *dcsrVal, *dx;
int *dcsrColInd, *dcsrRowPtr, *dcooRow;
cudaMalloc((void**)&db, m*sizeof(float));
cudaMalloc((void**)&dx, m*sizeof(float));
cudaMalloc((void**)&dcsrVal, nnz*sizeof(float));
cudaMalloc((void**)&dcsrColInd, nnz*sizeof(int));
cudaMalloc((void**)&dcsrRowPtr, (m+1)*sizeof(int));
cudaMalloc((void**)&dcooRow, nnz*sizeof(int));
cudaMemcpy(db, b.data(), b.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dcsrVal, csrVal.data(), csrVal.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dcsrColInd, csrColInd.data(), csrColInd.size()*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dcooRow, cooRow.data(), cooRow.size()*sizeof(int), cudaMemcpyHostToDevice);
cusparse_status = cusparseXcoo2csr(cusparse_handle, dcooRow, nnz, m,
dcsrRowPtr, CUSPARSE_INDEX_BASE_ZERO);
std::cout << "status cusparse coo2csr conversion: " << cusparse_status << std::endl;
cudaDeviceSynchronize(); // matrix format conversion has to be finished!
// --- everything ready for computation:
cusparseMatDescr_t descrA;
cusparse_status = cusparseCreateMatDescr(&descrA);
std::cout << "status cusparse createMatDescr: " << cusparse_status << std::endl;
// optional: print dense matrix that has been allocated on GPU
std::vector<float> A(m*m, 0);
float *dA;
cudaMalloc((void**)&dA, A.size()*sizeof(float));
cusparseScsr2dense(cusparse_handle, m, m, descrA, dcsrVal,
dcsrRowPtr, dcsrColInd, dA, m);
cudaMemcpy(A.data(), dA, A.size()*sizeof(float), cudaMemcpyDeviceToHost);
std::cout << "A: \n";
for (int i = 0; i < m; ++i) {
for (int j = 0; j < m; ++j) {
std::cout << A[i*m + j] << " ";
}
std::cout << std::endl;
}
cudaFree(dA);
std::cout << "b: \n";
cudaMemcpy(b.data(), db, (m)*sizeof(int), cudaMemcpyDeviceToHost);
for (auto a : b) {
std::cout << a << ",";
}
std::cout << std::endl;
// --- solving!!!!
// cusolver_status = cusolverSpScsrlsvchol(cusolver_handle, m, nnz, descrA, dcsrVal,
// dcsrRowPtr, dcsrColInd, db, tol, reorder, dx,
// &singularity);
cusolver_status = cusolverSpScsrlsvqr(cusolver_handle, m, nnz, descrA, dcsrVal,
dcsrRowPtr, dcsrColInd, db, tol, reorder, dx,
&singularity);
cudaDeviceSynchronize();
std::cout << "singularity (should be -1): " << singularity << std::endl;
std::cout << "status cusolver solving (!): " << cusolver_status << std::endl;
cudaMemcpy(x.data(), dx, m*sizeof(float), cudaMemcpyDeviceToHost);
// relocated these 2 lines from above to solve (2):
cusparse_status = cusparseDestroy(cusparse_handle);
std::cout << "status destroy cusparse handle: " << cusparse_status << std::endl;
cusolver_status = cusolverSpDestroy(cusolver_handle);
std::cout << "status destroy cusolver handle: " << cusolver_status << std::endl;
for (auto a : x) {
std::cout << a << " ";
}
std::cout << std::endl;
cudaFree(db);
cudaFree(dx);
cudaFree(dcsrVal);
cudaFree(dcsrColInd);
cudaFree(dcsrRowPtr);
cudaFree(dcooRow);
return 0;
}