
Part of the code I am working on requires performing a matrix-vector multiplication as fast as possible, i.e., using an optimized third-party library like cuBLAS (although the same principle applies to any CPU BLAS).

The problem is that there is a kind of stride between elements in the vector like so:

The matrix is stored as a 3Nx3N 1D array of floats.

The vector is stored as an N-element 1D array of float4s, but only the first three components of each float4 are to be used; the fourth should be ignored.

N is on the order of millions.
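For reference, the computation I need is just the following (a naive CPU sketch; the helper name is mine, and I am assuming M is stored row-major, which matches the CUBLAS_OP_T usage in the example below):

#include <vector_types.h> // float4

// Naive CPU reference (sketch): Y = M * X, where the logical vector X is taken
// from a float4 array and the .w component of every element is ignored.
// M is assumed to be a dense, row-major 3N x 3N float array; Y holds 3N floats.
void reference_gemv(const float *M, const float4 *X4, float *Y, int N){
  for(int i = 0; i < 3*N; i++){
    float acc = 0.0f;
    for(int j = 0; j < 3*N; j++){
      // logical element j lives in X4[j/3].{x,y,z}; X4[j/3].w is never read
      const float *xj = reinterpret_cast<const float*>(&X4[j/3]);
      acc += M[i*3*N + j] * xj[j % 3];
    }
    Y[i] = acc;
  }
}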

If the vector were stored as float3 instead of float4, I could just cast the pointer to float, as in this working example:

//Compile with nvcc test.cu -O3 -lcublas -o test

/*
Multiply a 3Nx3N float matrix, M,  by a vector, X, of N float3 elements 

The result, Y, is a 3N float vector
-----------------------

What if X is a vector of N float4?

How can I tell cublas to skip the fourth element?

*/

#include<iostream>
#include<thrust/device_vector.h>
#include<cuda_runtime.h>
#include<cublas_v2.h>

using namespace std;

int main(){

  int N = 3;

  thrust::device_vector<float3> X(N);

  thrust::device_vector<float> Y(3*N);

  for(int i=0; i<N; i++) 
     X[i] = make_float3(1,1,1); // in the float4 case this would be make_float4(1,1,1,0); the result should be the same

  thrust::device_vector<float> M(3*N*3*N, 1);


  cublasHandle_t handle;

  cublasCreate(&handle);

  float beta = 0.0f;
  float alpha = 1.0f;
  cublasSgemv(handle, CUBLAS_OP_T,
          3*N, 3*N,
          &alpha,
          thrust::raw_pointer_cast(&M[0]), 3*N,
          (float*) thrust::raw_pointer_cast(&X[0]), 1,
          &beta,
          thrust::raw_pointer_cast(&Y[0]), 1);

  cout<<"Performed Y = M·X\n\tX = ";
  for(int i=0; i<N; i++){
    float3 Xi = X[i];
    cout<<Xi.x<<" "<<Xi.y<<" "<<Xi.z<<" ";
  }  
  cout<<"\n\tY = ";
  for(int i=0; i<3*N; i++){
    cout<<Y[i]<<" ";
  }
  cout<<endl;

  return 0;
}

But how can I perform this operation if the X vector is stored as float4s?

Given that a float4* can be interpreted as a float* with four times as many elements, the question could be stated more generally (although I am only interested in the float4 case): what if there is a stride between each group of 3 "useful" elements? I would like to tell cuBLAS that the array is not contiguous in memory, but rather that there are 3 useful elements at the start, the next 3 begin "stride" elements later, and so on, similar to what you can do with vertex array objects in OpenGL.
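To make the general version concrete, what I would like cuBLAS to accept is essentially the gather below (a host-side sketch with names of my own; group is the number of useful elements and stride the distance, in floats, between the starts of consecutive groups; for the float4 case group = 3 and stride = 4):

// Gather the "useful" elements of a strided source into a packed destination.
void gather_strided(const float *src, float *dst, int ngroups, int group, int stride){
  for(int g = 0; g < ngroups; g++)
    for(int k = 0; k < group; k++)
      dst[g*group + k] = src[g*stride + k];
}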

EDIT:

The answers suggested that the most viable method is to simply copy the strided array into a temporary, packed float3 array that cuBLAS understands.

The three options at the moment to do so are:

1. Use cudaMemcpy2D
2. Use a thrust transformation
3. Use a custom copy kernel

I wrote this code to test the three cases:

//Compile with: nvcc test.cu -O3 -lcublas -o test
#include<iostream>
#include<thrust/device_vector.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<cublas_v2.h>

using namespace std;


struct Timer{
  cudaEvent_t start, stop;
  float time;

  void tic(){
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
  }
  float toc(){
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return time;
  }

};



struct copy_functor{
  copy_functor(){}
  __device__ float3 operator() (const float4& X4){
    return make_float3(X4.x, X4.y, X4.z);
  }
};


__global__ void copy_kernel(const float4* __restrict__ X4, float3* __restrict__ X3, int N){
  int id = blockIdx.x*blockDim.x + threadIdx.x;
  if(id < N){
    float4 x4 = X4[id];
    X3[id] = make_float3(x4.x, x4.y, x4.z);
  }
}

int main(){

  int N = 1000000;
  int Ntest = 1000;

  Timer t;

  thrust::device_vector<float3> X3(N, make_float3(0,0,0));
  thrust::device_vector<float4> X4(N, make_float4(1,1,1,10));


  /*************************CUDAMEMCPY2D*******************/
  t.tic();

  for(int i= 0; i<Ntest; i++){
    cudaMemcpy2DAsync(thrust::raw_pointer_cast(&X3[0]),
              3*sizeof(float),
              thrust::raw_pointer_cast(&X4[0]),
              4*sizeof(float),
              3*sizeof(float),
              N,
              cudaMemcpyDeviceToDevice);
     cudaDeviceSynchronize();
   }
   printf ("Time for cudaMemcpy2DAsync: %f ms\n", t.toc()/(float)Ntest);


   /************************THRUST***********************/
   t.tic();

   for(int i= 0; i<Ntest; i++){
     thrust::transform(X4.begin(), X4.end(), X3.begin(), copy_functor());
     cudaDeviceSynchronize();
   }

   printf ("Time for thrust transformation: %f ms\n", t.toc()/(float)Ntest);

   /*********************COPY KERNEL*****************************/

   t.tic();
   for(int i= 0; i<Ntest; i++){
     copy_kernel<<< N/128 + 1, 128 >>>(thrust::raw_pointer_cast(&X4[0]),
                       thrust::raw_pointer_cast(&X3[0]), N);
     cudaDeviceSynchronize();
   }
   printf ("Time for copy kernel: %f ms\n", t.toc()/(float)Ntest);


return 0;
}

Note that I am reporting the mean over 1000 copies.

The output of this code on a GTX 980 is the following:

Time for cudaMemcpy2DAsync: 1.465522 ms
Time for thrust transformation: 0.178745 ms
Time for copy kernel: 0.168507 ms

cudaMemcpy2D is an order of magnitude slower than the rest.

The thrust transformation and the copy kernel are very similar, and both are the fastest option.

This behavior appears to hold for any number of elements.

EDIT2:

Other answers suggest that GEMM could be used to express the stride, without the need for a temporary array.

Interpreting the matrix-vector multiplication as a matrix-matrix multiplication would be done like so:

 cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T,
              3*N, 1 /*n*/, 3*N,
              &alpha,
              thrust::raw_pointer_cast(&M[0]), 3*N,
              (float*) thrust::raw_pointer_cast(&X3[0]), 1 /*ldb*/,
              &beta,
              thrust::raw_pointer_cast(&Y[0]), 3*N);

However, at this point I do not know how to pass X4 instead of X3. The solution appears to be in the n and ldb parameters.

1 Answer


You could treat your 1-D float4 vector as an Nx3 2-D float matrix with a row stride of 4, and use cudaMemcpy2DAsync to change the stride from 4 to 3 with

cudaMemcpy2DAsync(dst,
                  3*sizeof(float),
                  src,
                  4*sizeof(float),
                  3*sizeof(float),
                  N,
                  cudaMemcpyDeviceToDevice);

Then dst can be treated as a 3N-element 1-D float vector and passed to gemv() directly.
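Putting it together with the gemv call from the question (just a sketch; dst is the packed 3N-float device buffer from the copy above, and the remaining names are those of the question's first listing):

// dst now holds the 3N contiguous floats x0 y0 z0 x1 y1 z1 ... that gemv expects
cublasSgemv(handle, CUBLAS_OP_T,
            3*N, 3*N,
            &alpha,
            thrust::raw_pointer_cast(&M[0]), 3*N,
            dst, 1,
            &beta,
            thrust::raw_pointer_cast(&Y[0]), 1);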

Given the scale of your N, the copy time is negligible compared to that of gemv().


EDIT

The benchmark results from @Apo show that it is better to use a copy kernel (or a thrust transformation) instead of cudaMemcpy2DAsync. I had overestimated cudaMemcpy2DAsync, expecting it to be well optimized and to have the best performance in all cases.