thrust::raw_pointer_cast and multiple GPU, weird behaviour

Question

I use thrust a lot in my code, because it is a great wrapper and provide very useful utilities, I am even more convinced since the support of asynchronous behaviour has been added.

My code was working well using cuda thrust until I recently tried to add multi-GPU support in my application. I experienced annoying

CUDA Runtime API error 77 : an illegal memory access was encountered

over part of my code that never showed any bounds problems before.

I added verbosity to my code and it appeared that my thrust::device_vector pointer address were changing along the execution, for no apparent reason, generating error 77 in handwritten kernels.

I may have misunderstood the UVA concept and its eventual "side effects", but still, I am interested in the understanding of the process that lead to pointer update.

I was not able to reproduce exactly my problem, in which I do not use temporary host variable to store cuda memory pointer, but only thrust::raw_pointer_cast on the fly when needed in kernel wrapper call.

But I have written a small program that shows what kind error I may have trouble with, note that this is not robust and you need to have at least 2 gpu on your system to run it:

/********************************************************************************************
** Compile using nvcc ./test.cu -gencode arch=compute_35,code=sm_35 -std=c++11 -o test.exe **
********************************************************************************************/

//Standard Library
#include <iostream>
#include <vector>

//Cuda
#include "cuda_runtime.h"

//Thrust
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform.h>

inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
    if( err != cudaSuccess )
    {
        printf("%s(%i) : CUDA Runtime API error %i : %s \n",file ,line, (int)err, cudaGetErrorString(err) );
    }
};

#define checkCudaErrors(err)    __checkCudaErrors (err, __FILE__, __LINE__)

__global__ void write_memory( float* buf, float value )
{
    printf("GPU TALK: Raw pointer is %p \n",buf);
    buf[0] = value;
}

int main( int argc, char* argv[] )
{
    //declare a vector of vector
    std::vector<thrust::device_vector<float> > v;
    float test;
    float* tmp;

    //Initialize first vector on GPU 0
    cudaSetDevice( 0 );
    v.emplace_back( 65536, 1.0f );
    tmp = thrust::raw_pointer_cast( v.at(0).data() );
    std::cout << " Host TALK: Raw pointer of vector 0 at step 0 " << (void*)tmp << std::endl;

    //Try to use it raw pointer
    write_memory<<<1,1,0,0>>>( tmp, 2.0f );
    checkCudaErrors( cudaStreamSynchronize( NULL ) );
    test = v.at(0)[0];
    std::cout << " Host TALK: After first kernel launch, value is " << test << std::endl;

    //Initialize second vector on GPU 1, but we do not use it
    cudaSetDevice( 1 );
    v.emplace_back( 65536, 1.0f );
    std::cout << " Host TALK: Raw pointer of vector 0 at step 1 is now " << (void*)thrust::raw_pointer_cast( v.at(0).data() ) << " != " << (void*)tmp << std::endl;
    std::cout << " Host TALK: Raw pointer of vector 1 at step 1 is " << (void*)thrust::raw_pointer_cast( v.at(1).data() ) << std::endl; 

    //Try to use the first vector : No segmentation fault ?
    test = v.at(0)[0];
    std::cout << " Host TALK: Before second kernel launch, value is " << test << std::endl;
    write_memory<<<1,1,0,0>>>( thrust::raw_pointer_cast( v.at(0).data() ), 3.0f );
    checkCudaErrors( cudaStreamSynchronize( NULL ) );
    test = v.at(0)[0];
    std::cout << " Host TALK: After second kernel launch, value is " << test << std::endl;

    //Raw pointer stored elsewhere: generates a segmentation fault
    write_memory<<<1,1,0,0>>>( tmp, 4.0f );
    checkCudaErrors( cudaStreamSynchronize( NULL ) );
    test = v.at(0)[0];
    std::cout << " Host TALK: After third kernel launch, value is " << test << std::endl;

    return 0;
}

Here is and example of the output it produces on my machine:

Host TALK: Raw pointer of vector 0 at step 0 0xb043c0000
GPU TALK: Raw pointer is 0xb043c0000
Host TALK: After first kernel launch, value is 2
Host TALK: Raw pointer of vector 0 at step 1 is now 0xb08000000 != 0xb043c0000
Host TALK: Raw pointer of vector 1 at step 1 is 0xb07fc0000
Host TALK: Before second kernel launch, value is 2
GPU TALK: Raw pointer is 0xb08000000
Host TALK: After second kernel launch, value is 3
GPU TALK: Raw pointer is 0xb043c0000
./test.cu(68) : CUDA Runtime API error 77 : an illegal memory access was encountered terminate called after throwing an instance of 'thrust::system::system_error' what(): an illegal memory access was encountered

Thank you in advance for your help, I may also ask this question on thrust's github.

EDIT: Thanks to m.s and Hiura, here is a code that works as expected:

/********************************************************************************************
** Compile using nvcc ./test.cu -gencode arch=compute_35,code=sm_35 -std=c++11 -o test.exe **
********************************************************************************************/

//Standard Library
#include <iostream>
#include <vector>

//Cuda
#include "cuda_runtime.h"

//Thrust
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform.h>

inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
    if( err != cudaSuccess )
    {
        printf("%s(%i) : CUDA Runtime API error %i : %s \n",file ,line, (int)err, cudaGetErrorString(err) );
    }
};

#define checkCudaErrors(err)    __checkCudaErrors (err, __FILE__, __LINE__)

__global__ void write_memory( float* buf, float value )
{
    printf("GPU TALK: Raw pointer is %p \n",buf);
    buf[0] = value;
}

int main( int argc, char* argv[] )
{
    //declare a vector of vector
    std::vector<thrust::device_vector<float> > v;
    v.reserve(2);
    float test;
    float* tmp;

    //Initialize first vector on GPU 0
    cudaSetDevice( 0 );
    v.emplace_back( 65536, 1.0f );
    tmp = thrust::raw_pointer_cast( v.at(0).data() );
    std::cout << " Host TALK: Raw pointer of vector 0 at step 0 " << (void*)tmp << std::endl;

    //Try to use it raw pointer
    write_memory<<<1,1,0,0>>>( tmp, 2.0f );
    checkCudaErrors( cudaStreamSynchronize( NULL ) );
    test = v.at(0)[0];
    std::cout << " Host TALK: After first kernel launch, value is " << test << std::endl;

    //Initialize second vector on GPU 1, but we do not use it
    cudaSetDevice( 1 );
    v.emplace_back( 65536, 1.0f );
    std::cout << " Host TALK: Raw pointer of vector 0 at step 1 is now " << (void*)thrust::raw_pointer_cast( v.at(0).data() ) << " != " << (void*)tmp << std::endl;
    std::cout << " Host TALK: Raw pointer of vector 1 at step 1 is " << (void*)thrust::raw_pointer_cast( v.at(1).data() ) << std::endl; 

    //Try to use the first vector : No segmentation fault ?
    cudaSetDevice( 0 );
    test = v.at(0)[0];
    std::cout << " Host TALK: Before second kernel launch, value is " << test << std::endl;
    write_memory<<<1,1,0,0>>>( thrust::raw_pointer_cast( v.at(0).data() ), 3.0f );
    checkCudaErrors( cudaStreamSynchronize( NULL ) );
    test = v.at(0)[0];
    std::cout << " Host TALK: After second kernel launch, value is " << test << std::endl;

    //Raw pointer stored elsewhere: generates a segmentation fault
    write_memory<<<1,1,0,0>>>( tmp, 4.0f );
    checkCudaErrors( cudaStreamSynchronize( NULL ) );
    test = v.at(0)[0];
    std::cout << " Host TALK: After third kernel launch, value is " << test << std::endl;

    return 0;
}

It was one of the last place in my code where I did not used vector of pointer to objects instead of vector of objects for simplicity, but I see that I should have to avoid these annoying move/copy problems ...

Output now is:

Host TALK: Raw pointer of vector 0 at step 0 0xb043c0000
GPU TALK: Raw pointer is 0xb043c0000
Host TALK: After first kernel launch, value is 2
Host TALK: Raw pointer of vector 0 at step 1 is now 0xb043c0000 != xb043c0000
Host TALK: Raw pointer of vector 1 at step 1 is 0xb07fc0000
Host TALK: Before second kernel launch, value is 2
GPU TALK: Raw pointer is 0xb043c0000
Host TALK: After second kernel launch, value is 3
GPU TALK: Raw pointer is 0xb043c0000
Host TALK: After third kernel launch, value is 4

Print the data pointers of both device_vectors, not just the first one. — user1084944
I printed the other device_vector raw pointer, which shows that it as been allocated at an address different from that of the first device_vector — Tobbey
does it work if you use cudaSetDevice(0); before write_memory<<<1,1,0,0>>>( tmp, 4.0f ); ? — m.s.
I suspect that the first vector is moved/copied around when you emplace the second one. What happens if you reserve the space in v before adding any of the vectors? (Or just add both of them at the same time.) — Hiura
Are they still similar looking address, though? My next best guess is that the problem is that both vectors are located at "reasonable" addresses, so the wrong pointer is still pointing to a valid memory address and so you're just corrupting random memory. But passing in the uninitialized pointer gives a totally unreasonable memory address. — user1084944

Hiura Hiura · Accepted Answer · 2015-05-28T08:32:49

So I installed CUDA quickly to test my hypothesis: adding a reserve statement preserves the addresses.

//declare a vector of vector
std::vector<thrust::device_vector<float> > v;
v.reserve(2); // <<-- HERE
float test;
float* tmp;

And the outputs, first without the patch.

 $ nvcc thrust.cu  -std=c++11 -o test
 $ ./test 
  Host TALK: Raw pointer of vector 0 at step 0 0x700ca0000
 GPU TALK: Raw pointer is 0x700ca0000 
  Host TALK: After first kernel launch, value is 2
  Host TALK: Raw pointer of vector 0 at step 1 is now 0x700d20000 != 0x700ca0000
  Host TALK: Raw pointer of vector 1 at step 1 is 0x700ce0000
  Host TALK: Before second kernel launch, value is 2
 GPU TALK: Raw pointer is 0x700d20000 
  Host TALK: After second kernel launch, value is 3
 GPU TALK: Raw pointer is 0x700ca0000 
  Host TALK: After third kernel launch, value is 3

with the patch:

 $ nvcc thrust.cu  -std=c++11 -o test
 $ ./test 
  Host TALK: Raw pointer of vector 0 at step 0 0x700ca0000
 GPU TALK: Raw pointer is 0x700ca0000 
  Host TALK: After first kernel launch, value is 2
  Host TALK: Raw pointer of vector 0 at step 1 is now 0x700ca0000 != 0x700ca0000
  Host TALK: Raw pointer of vector 1 at step 1 is 0x700ce0000
  Host TALK: Before second kernel launch, value is 2
 GPU TALK: Raw pointer is 0x700ca0000 
  Host TALK: After second kernel launch, value is 3
 GPU TALK: Raw pointer is 0x700ca0000 
  Host TALK: After third kernel launch, value is 4

thrust::raw_pointer_cast and multiple GPU, weird behaviour

1 Answers