I have the following simple piece of code, consisting of 4 files:
//Data.h:
#ifndef DATA_H
#define DATA_H
// Number of elements in each of the global arrays below.
constexpr int N=10000000;
// Global arrays used by the host code and by the OpenACC device routines.
namespace data{
float particles[N];
float outputArray[N];
// Global variables that are referenced inside an `acc routine` need a device
// copy with static lifetime. Without this directive PGI reports
// PGCC-W-0155 ("External and Static variables are not supported in acc
// routine") for routines that touch these arrays.
#pragma acc declare create(particles, outputArray)
}
#endif
//Process.h:
#ifndef PROCESS_H
#define PROCESS_H
// Wrapper that applies the per-index operations of a ProcessImpl object to an
// index range inside OpenACC parallel loops.
//
// ProcessImpl is expected to provide
//   float Get1(int ind, float * outputArray) const;
//   void  Get2(int ind) const;
// Both are invoked from within `acc parallel loop` regions.
template <typename ProcessImpl>
class Process{
public:
    // The wrapped implementation type.
    using Base_t = ProcessImpl;

    // Stores a copy of `arg` as the implementation object.
    Process(ProcessImpl arg):fProcessImpl(arg){}

    // For every ind in [0, N): outputArray[ind] = fProcessImpl.Get1(ind, outputArray).
    // outputArray must already be present on the device.
    void Get1(int N, float * outputArray) const;

    // For every ind in [0, N): calls fProcessImpl.Get2(ind).
    void Get2(int N) const;

private:
    ProcessImpl fProcessImpl;
};

template <class ProcessImpl>
void Process<ProcessImpl>::Get1(int N, float * outputArray) const
{
#pragma acc parallel loop gang vector present(outputArray)
    for(int ind=0; ind < N; ++ind){
        // The implementation receives the output buffer as well, matching the
        // two-argument Get1 declared by ProcessImpl; its return value is then
        // stored in the slot for this index.
        outputArray[ind]=fProcessImpl.Get1(ind, outputArray);
    }
}

template <class ProcessImpl>
void Process<ProcessImpl>::Get2(int N) const
{
#pragma acc parallel loop gang vector
    for (int ind = 0; ind < N; ++ind){
        fProcessImpl.Get2(ind);
    }
}
#endif
//ProcessImpl.h:
#ifndef PROCESSIMPL_H
#define PROCESSIMPL_H
#include "Data.h"
using namespace data;
// Per-index operations on the global `particles` array.
// Both members are called from inside OpenACC compute regions, so they are
// marked explicitly as sequential device routines instead of relying on the
// compiler generating an implicit `acc routine seq`.
class ProcessImpl
{
public:
    // Copies particles[ind] into outputArray[ind] and returns particles[ind+1].
    #pragma acc routine seq
    inline float Get1(int ind, float * outputArray) const;

    // Doubles particles[ind] in place.
    #pragma acc routine seq
    inline void Get2(int ind) const;
};
// Stores particles[ind] in outputArray[ind], then returns the following
// element, particles[ind+1].
// NOTE(review): for ind == N-1 the returned element lies one past the end of
// `particles` (which has N elements) - confirm callers never pass the last index.
float ProcessImpl::Get1(int ind, float * outputArray) const
{
    const float current = particles[ind];
    outputArray[ind] = current;
    return particles[ind + 1];
}
// Replaces particles[ind] with twice its current value.
void ProcessImpl::Get2(int ind) const
{
    float & value = particles[ind];
    value = 2 * value;
}
#endif
//main.cpp:
#include <iostream>
#include "Data.h"
#include "Process.h"
#include "ProcessImpl.h"
#include <accelmath.h>
#include <openacc.h>
// Makes particles and outputArray from namespace data visible unqualified.
using namespace data;
// Process instantiated for the concrete ProcessImpl class.
using Process_t = Process<ProcessImpl>;
// Global process object, built from a default-constructed implementation
// (Process_t::Base_t is ProcessImpl).
Process_t process = Process_t(typename Process_t::Base_t());
// Initialises `particles` inside an OpenACC data region, prints the first 100
// values, doubles every element via process.Get2 and prints the first 100
// values again.
int main(int argc, char **argv)
{
    // Writes particles[0..99], each followed by a space, then ends the line.
    const auto printHead = []
    {
        for(int k=0; k<100; ++k)
        {
            std::cout<<particles[k]<<" ";
        }
        std::cout<<std::endl;
    };

#pragma acc data create(particles,outputArray)
    {
#pragma acc parallel loop gang vector present(particles)
        for(int idx=0; idx<N; ++idx)
        {
            particles[idx]=static_cast<float>(idx);
        }

        // Refresh the host copy before it is printed.
#pragma acc update host(particles)
        printHead();

        process.Get2(N);

        // Refresh the host copy again after the in-place update.
#pragma acc update host(particles)
        printHead();
    }
    return 0;
}
It works correctly on the CPU with the PGI 19.4 compiler, but my task is to run the code on a GPU. I use PGI 19.4 with OpenACC, a simple CMakeLists.txt file, and the following configure line (GPU: Nvidia GeForce 650 Ti, compute capability 3.0):
cmake . -DCMAKE_C_COMPILER=pgcc -DCMAKE_CXX_COMPILER=pgc++
-DCMAKE_C_FLAGS="-acc -Minfo=acc -mcmodel=medium -ta=tesla:cc30"
-DCMAKE_CXX_FLAGS="-acc -Minfo=acc -mcmodel=medium -ta=tesla:cc30"
The compilation fails with:
> Scanning dependencies of target Test
[ 50%] Building CXX object CMakeFiles/Test.dir/main.cpp.o
main:
16, Generating create(_ZN4data11outputArrayE[:])
Generating present(_ZN4data9particlesE[:])
Generating create(_ZN4data9particlesE[:])
Generating Tesla code
18, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
18, Generating update self(_ZN4data9particlesE[:])
23, Generating update self(_ZN4data9particlesE[:])
PGCC-W-0155-External and Static variables are not supported in acc routine -
_ZN4data9particlesE (/home/70-gaa/source/13OpenAccTest/main.cpp: 19)
ProcessImpl::Get2(int) const:
4, include "ProcessImpl.h"
18, Generating implicit acc routine seq
Process<ProcessImpl>::Get2(int) const:
3, include "Process.h"
25, Generating Tesla code
27, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
25, Generating implicit copyin(this[:])
PGCC/x86-64 Linux 19.4-0: compilation completed with warnings
[100%] Linking CXX executable Test
nvlink error : Undefined reference to '_ZNK11ProcessImpl4Get2Ei' in
'CMakeFiles/Test.dir/main.cpp.o'
pgacclnk: child process exit status 2: /opt/pgi/linux86-64-llvm/19.4/bin/pgnvd
CMakeFiles/Test.dir/build.make:83: recipe for target 'Test' failed
make[2]: *** [Test] Error 2
CMakeFiles/Makefile2:72: recipe for target 'CMakeFiles/Test.dir/all' failed
make[1]: *** [CMakeFiles/Test.dir/all] Error 2
Makefile:83: recipe for target 'all' failed
make: *** [all] Error 2
Using pggdecode, I found that "_ZNK11ProcessImpl4Get2Ei" is the mangled name of ProcessImpl::Get2(int) const. I removed the inline keyword from ProcessImpl.h and tried adding copyin(process) to the #pragma acc data create(particles,outputArray) directive in main(), but this did not help. I am working on Fedora 23 with gcc 5.3.1.
In the full code I avoid the problem of multiple definitions of the arrays particles and outputArray by including them in only a single .cpp file, because OpenACC does not allow the use of the extern keyword. This may not be a good approach (if you know a better one, please advise), but it works.
The questions are:
How do I properly pass the GPU versions of the arrays particles and outputArray to Get1() and Get2() in Process.h, and make Get1() and Get2() in ProcessImpl.h work with the arrays allocated on the GPU? And how should this code be compiled?
How does OpenACC allow code inside compute regions to directly access globally allocated arrays that have been copied to the GPU, without passing a pointer to them as a parameter of the called function?
Thank You.