3
votes

I have a strange problem that I can't solve. It is related to code that combines Boost and Thrust.

Code:

#include <boost/config/compiler/nvcc.hpp>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <thrust/sequence.h>
#include <thrust/random.h>
#include <thrust/generate.h>
#include <thrust/detail/type_traits.h>

#include <cuda_runtime.h>

#include <cublas_v2.h>
#include <common/inc/helper_cuda.h>

#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/operation.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_int_distribution.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/generate.hpp>
#include <boost/compute/algorithm/generate_n.hpp>


#include <algorithm>
#include <time.h>
#include <limits.h>
#include <algorithm>

using namespace boost::numeric::ublas;
using namespace boost::random;
using namespace boost::compute;


// Test program: builds two N x N float matrices filled by rand(), multiplies
// them once with uBLAS prod() and twice with cublasSgemm (first on the uBLAS
// storage, then on Thrust device vectors), prints both cuBLAS status values
// and counts the elements that differ from the uBLAS product.
//
// argc/argv are only forwarded to findCudaDevice. Always returns 0.
int main(int argc, char **argv)
{
    int N = 100000;

    // NOTE(review): N*N is evaluated in `int`; 100000 * 100000 = 10^10 does not
    // fit in a 32-bit int, so the element count below is not the intended one.
    unbounded_array<float> lineMatrix1(N*N);
    unbounded_array<float> lineMatrix2(N*N);    

    // NOTE(review): `10 * rand() / RAND_MAX` is pure integer arithmetic, so the
    // lambdas return whole numbers only. generate_n is called unqualified while
    // <algorithm> and boost::compute are both visible; confirm which overload
    // is actually selected.
    generate_n(lineMatrix1.begin(), N*N, []() { return (10 * rand() / RAND_MAX); });
    generate_n(lineMatrix2.begin(), N*N, []() { return (10 * rand() / RAND_MAX); });    

    // Inputs are built from the flat arrays; the two result matrices start at 0.
    matrix<float> matrix1(N, N, lineMatrix1);
    matrix<float> matrix2(N, N, lineMatrix2);
    matrix<float> zeroMatrix(N, N, 0);  
    matrix<float> zeroMatrix2(N, N, 0);

    //boost single core computation start

    // NOTE(review): with `auto` this presumably holds a uBLAS expression object
    // rather than an evaluated matrix; verify against the uBLAS documentation.
    auto matrix3 = prod(matrix1, matrix2);

    //boost single core computation finish

    //thrust computation start

    findCudaDevice(argc, (const char **)argv);

    // No cublasDestroy(handle) call appears anywhere in this function.
    cublasHandle_t handle;
    cublasCreate(&handle);

    float alpha = 1.0f;
    float beta = 0.0f;

    // First GEMM: operands and output are the storage of the uBLAS matrices
    // declared above, not memory allocated through Thrust/CUDA.
    // NOTE(review): presumably cuBLAS expects device pointers here; confirm.
    auto result = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, matrix1.data().cbegin(), N, matrix2.data().cbegin(), N, &beta, zeroMatrix.data().begin(), N);
    cudaDeviceSynchronize();

    // Second GEMM: same inputs, but copied into Thrust device vectors first.
    thrust::device_vector<float> deviceMatrix1(N*N);
    thrust::device_vector<float> deviceMatrix2(N*N);
    thrust::device_vector<float> deviceZeroMatrix(N*N, 0);

    thrust::copy(matrix1.data().cbegin(), matrix1.data().cend(), deviceMatrix1.begin());
    thrust::copy(matrix2.data().cbegin(), matrix2.data().cend(), deviceMatrix2.begin());

    auto result2 = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, deviceMatrix1.data().get(), N, deviceMatrix2.data().get(), N, &beta, deviceZeroMatrix.data().get(), N);
    cudaDeviceSynchronize();

    // Bring the device result back into the second uBLAS result matrix.
    thrust::copy(deviceZeroMatrix.cbegin(), deviceZeroMatrix.cend(), zeroMatrix2.data().begin());

    // The values returned by the two cublasSgemm calls are printed as-is.
    std::cout << result << std::endl;
    std::cout << result2 << std::endl;

    //thrust computation finish    

    float eps = 0.00001;
    int differCount1 = 0;
    int differCount2 = 0;

    // Element-wise comparison of the uBLAS product with both cuBLAS outputs.
    for (int i = 0; i < matrix3.size1(); i++)
    {
        for (int j = 0; j < matrix3.size2(); j++)
        {
            // NOTE(review): `a != b` yields a bool, so std::abs receives 0 or 1
            // and eps plays no role; a difference `a - b` was presumably meant.
            if (std::abs(matrix3(i, j) != zeroMatrix(i, j)) > eps)
                differCount1++;

            if (std::abs(matrix3(i, j) != zeroMatrix2(i, j)) > eps)
                differCount2++;
        }
    }

    std::cout << differCount1 << std::endl;
    std::cout << differCount2 << std::endl;

    // Wait for one character on stdin before returning.
    char c;
    std::cin >> c;

    return 0;
}

This file is named 'myFirstMatrixTest.cu'.

When I build it, I get the following compiler errors:

MSB3721 exit from command ""C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\bin\nvcc.exe" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -gencode=arch=compute_37,code=\"sm_37,compute_37\" -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_52,code=\"sm_52,compute_52\" -gencode=arch=compute_60,code=\"sm_60,compute_60\" -gencode=arch=compute_61,code=\"sm_61,compute_61\" -gencode=arch=compute_70,code=\"sm_70,compute_70\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.14.26428\bin\HostX86\x64" -x cu -rdc=true -I./ -I../common/inc -I../../common/inc -I/common/inc -I../ -I./ -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2/include" -I../../common/inc -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include" -G --keep-dir x64\Debug -maxrregcount=0 --machine 64 --compile -cudart static -Xcompiler "/wd 4819" -g -DWIN32 -DWIN32 -D_MBCS -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /FS /Zi /RTC1 /MTd " -o x64/Debug/MyFirstMatrixTest.cu.obj "C:\User Root\Repository\CUDA Projects\MatrixMultiplicationThrust\MyFirstMatrixTest.cu"" with code "2". MyFirstMatrixTest C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\IDE\VC\VCTargets\BuildCustomizations\CUDA 9.2.targets 707

and this:

Fatal Error C1012 unmatched parenthesis : missing character ")" MyFirstMatrixTest c:\local\boost\preprocessor\slot\detail\shared.hpp 27

Why could this error occur?

Thank you.

2
For Thrust + Boost to work you need to install a very specific combination of CUDA, Visual Studio and Boost versions. Have you checked that? – JHBonarius
@JHBonarius I've installed the CUDA 9.2 SDK and Boost, and I'm using Visual Studio 2017. Boost code (without any Thrust code) works fine, and the Thrust examples work fine too. – Dmitriy
To test your code, I've updated my CUDA to v9.2 and now everything broke :'( So I cannot help you until I fix it again. I had many issues trying to get the parts to work together in the past. I've heard it's a common problem. – JHBonarius
I read up on this and found that nvcc has issues compiling Boost code. You should probably separate your host and device code. – JHBonarius

2 Answers

1
votes

Well, the first problem is

int N = 100000;

So N^2 = 10,000,000,000, which will never fit in an int. That is 10 G elements × 4 bytes (float) = 40 GB of data. For me that throws a memory exception.

The next problem I had was with the combination of unbounded_array and generate_n: it just didn't work. But since you're using Thrust, use the Thrust types and algorithms (I'm not sure why Thrust has its own types to replace the STL ones, but whatever).

I'm using Visual Studio 2017 v15.7 in 2015 mode (else I get a not supported error) with Cuda v9.2 and Boost 1.67.0.

I modified your code until it compiled correctly. (Note the correction in the randomizer functor: originally it only generated integers, which were then converted to floats.)

#include <boost/config/compiler/nvcc.hpp>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/generate.h>
#include <thrust/inner_product.h>

#include <cuda_runtime.h>

#include <cublas_v2.h>
#pragma comment(lib,"cublas.lib")
#include <helper_cuda.h>

#include <boost/numeric/ublas/matrix.hpp>
//#include <boost/numeric/ublas/io.hpp>
using boost::numeric::ublas::matrix;

#include <random>

int main(int argc, char **argv)
{
    constexpr size_t N = 100;
    constexpr size_t NN = N * N;

    thrust::host_vector<float> lineMatrix1; lineMatrix1.reserve(NN);
    thrust::host_vector<float> lineMatrix2; lineMatrix2.reserve(NN);
    {
        std::random_device rd;  //Will be used to obtain a seed for the random number engine
        std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd()
        std::uniform_real_distribution<float> dis(0.0f, 10.0f);
        auto genRnd = [&]() { return dis(gen); };
        thrust::generate_n(std::back_inserter(lineMatrix1), NN, genRnd);
        thrust::generate_n(std::back_inserter(lineMatrix2), NN, genRnd);
    }

    matrix<float> matrix1(N, N);
    thrust::copy_n(std::cbegin(lineMatrix1), NN, std::begin(matrix1.begin1()));
    //std::cout << "Matrix 1:\n" << matrix1 << std::endl;

    matrix<float> matrix2(N, N);
    thrust::copy_n(std::cbegin(lineMatrix2), NN, std::begin(matrix2.begin1()));
    //std::cout << "Matrix 2:\n" << matrix2 << std::endl;

    //auto matrix3 = prod(matrix1, matrix2);
    auto matrix3 = trans(prod(trans(matrix1), trans(matrix2)));
    //std::cout << "Matrix 3:\n" << matrix3 << std::endl;

    thrust::host_vector<float> hostResult; hostResult.reserve(NN);
    for (auto rowIt = matrix3.cbegin1(); rowIt != matrix3.cend1(); rowIt++)
        for (const auto& element : rowIt)
            hostResult.push_back(element);
    std::cout << "Host Result:\n";
    for (const auto& el : hostResult) std::cout << el << " ";
    std::cout << std::endl;
    //////boost single core computation finish

    //////thrust computation start
    findCudaDevice(argc, (const char **)argv);
    cublasHandle_t handle;
    cublasCreate(&handle);

    const float alpha = 1.0f;
    const float beta = 0.0f;

    thrust::device_vector<float> deviceMatrix1; deviceMatrix1.reserve(NN);
    thrust::copy_n(std::cbegin(lineMatrix1), NN, std::back_inserter(deviceMatrix1));

    thrust::device_vector<float> deviceMatrix2; deviceMatrix2.reserve(NN);
    thrust::copy_n(std::cbegin(lineMatrix2), NN, std::back_inserter(deviceMatrix2));

    thrust::device_vector<float> deviceZeroMatrix(NN,0);
    auto result2 = cublasSgemm(handle,
        CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
        &alpha,
        deviceMatrix1.data().get(), N,
        deviceMatrix2.data().get(), N,
        &beta,
        deviceZeroMatrix.data().get(), N);
    cudaDeviceSynchronize();

    cublasDestroy(handle);

    thrust::host_vector<float> deviceResult; deviceResult.reserve(NN);
    thrust::copy_n(std::cbegin(deviceZeroMatrix), NN, std::back_inserter(deviceResult));
    std::cout << "Device Result:\n";
    for (const auto& el : deviceResult) std::cout << el << " ";
    std::cout << std::endl;
    //////thrust computation finish    

    auto accError = thrust::inner_product(std::cbegin(hostResult), std::cend(hostResult), std::cbegin(deviceResult), 0.0f, std::plus<float>(),
        [](auto val1, auto val2) { return std::abs(val1 - val2); });

    std::cout << "Accumulated error: " << accError << std::endl;
    std::cout << "Average error: " << accError/NN << std::endl;

    std::cin.ignore();

    return 0;
}

edit: Fixed the code. A uBLAS matrix stores its elements differently than a vector, so I had to transpose the matrices and the result. Furthermore, it turned out to be difficult to copy the uBLAS matrix back to a vector.

edit2: compilation parameters

"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\bin\nvcc.exe" -gencode=arch=compute_30,code=\"sm_30,compute_30\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\x86_amd64" -x cu  -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include"  -G   --keep-dir x64\Debug -maxrregcount=0  --machine 64 --compile -cudart static  -g   -DWIN32 -DWIN64 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /FS /Zi /RTC1 /MDd " -o x64\Debug\kernel.cu.obj "C:\Cpp\Cuda\SoHelp2\kernel.cu"
0
votes

You're using lambdas - feed the '--std=c++11' option to nvcc.