Reducing the rows of a matrix can be solved by using CUDA Thrust in three ways (they may not be the only ones, but addressing this point is out of scope). As also recognized by the same OP, using CUDA Thrust is preferable for such a kind of problem. Also, an approach using cuBLAS is possible.
APPROACH #1 - reduce_by_key
This is the approach suggested at this Thrust example page. It includes a variant using make_discard_iterator
.
APPROACH #2 - transform
This is the approach suggested by Robert Crovella at CUDA Thrust: reduce_by_key on only some values in an array, based off values in a “key” array.
APPROACH #3 - inclusive_scan_by_key
This is the approach suggested by Eric at How to normalize matrix columns in CUDA with max performance?.
APPROACH #4 - cublas<t>gemv
It uses cuBLAS
gemv
to multiply the relevant matrix by a column of 1
's.
THE FULL CODE
Here is the code condensing the two approaches. The Utilities.cu
and Utilities.cuh
files are mantained here and omitted here. The TimingGPU.cu
and TimingGPU.cuh
are maintained here and are omitted as well.
#include <cublas_v2.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/random.h>
#include <thrust/sequence.h>
#include <stdio.h>
#include <iostream>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
// --- Required for approach #2
__device__ float *vals;
/**************************************************************/
/* CONVERT LINEAR INDEX TO ROW INDEX - NEEDED FOR APPROACH #1 */
/**************************************************************/
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T> {
T Ncols; // --- Number of columns
__host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}
__host__ __device__ T operator()(T i) { return i / Ncols; }
};
/******************************************/
/* ROW_REDUCTION - NEEDED FOR APPROACH #2 */
/******************************************/
struct row_reduction {
const int Ncols; // --- Number of columns
row_reduction(int _Ncols) : Ncols(_Ncols) {}
__device__ float operator()(float& x, int& y ) {
float temp = 0.f;
for (int i = 0; i<Ncols; i++)
temp += vals[i + (y*Ncols)];
return temp;
}
};
/**************************/
/* NEEDED FOR APPROACH #3 */
/**************************/
template<typename T>
struct MulC: public thrust::unary_function<T, T>
{
T C;
__host__ __device__ MulC(T c) : C(c) { }
__host__ __device__ T operator()(T x) { return x * C; }
};
/********/
/* MAIN */
/********/
int main()
{
const int Nrows = 5; // --- Number of rows
const int Ncols = 8; // --- Number of columns
// --- Random uniform integer distribution between 10 and 99
thrust::default_random_engine rng;
thrust::uniform_int_distribution<int> dist(10, 99);
// --- Matrix allocation and initialization
thrust::device_vector<float> d_matrix(Nrows * Ncols);
for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (float)dist(rng);
TimingGPU timerGPU;
/***************/
/* APPROACH #1 */
/***************/
timerGPU.StartCounter();
// --- Allocate space for row sums and indices
thrust::device_vector<float> d_row_sums(Nrows);
thrust::device_vector<int> d_row_indices(Nrows);
// --- Compute row sums by summing values with equal row indices
//thrust::reduce_by_key(thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(Ncols)),
// thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(Ncols)) + (Nrows*Ncols),
// d_matrix.begin(),
// d_row_indices.begin(),
// d_row_sums.begin(),
// thrust::equal_to<int>(),
// thrust::plus<float>());
thrust::reduce_by_key(
thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Ncols)),
thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Ncols)) + (Nrows*Ncols),
d_matrix.begin(),
thrust::make_discard_iterator(),
d_row_sums.begin());
printf("Timing for approach #1 = %f\n", timerGPU.GetCounter());
// --- Print result
for(int i = 0; i < Nrows; i++) {
std::cout << "[ ";
for(int j = 0; j < Ncols; j++)
std::cout << d_matrix[i * Ncols + j] << " ";
std::cout << "] = " << d_row_sums[i] << "\n";
}
/***************/
/* APPROACH #2 */
/***************/
timerGPU.StartCounter();
thrust::device_vector<float> d_row_sums_2(Nrows, 0);
float *s_vals = thrust::raw_pointer_cast(&d_matrix[0]);
gpuErrchk(cudaMemcpyToSymbol(vals, &s_vals, sizeof(float *)));
thrust::transform(d_row_sums_2.begin(), d_row_sums_2.end(), thrust::counting_iterator<int>(0), d_row_sums_2.begin(), row_reduction(Ncols));
printf("Timing for approach #2 = %f\n", timerGPU.GetCounter());
for(int i = 0; i < Nrows; i++) {
std::cout << "[ ";
for(int j = 0; j < Ncols; j++)
std::cout << d_matrix[i * Ncols + j] << " ";
std::cout << "] = " << d_row_sums_2[i] << "\n";
}
/***************/
/* APPROACH #3 */
/***************/
timerGPU.StartCounter();
thrust::device_vector<float> d_row_sums_3(Nrows, 0);
thrust::device_vector<float> d_temp(Nrows * Ncols);
thrust::inclusive_scan_by_key(
thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Ncols)),
thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Ncols)) + (Nrows*Ncols),
d_matrix.begin(),
d_temp.begin());
thrust::copy(
thrust::make_permutation_iterator(
d_temp.begin() + Ncols - 1,
thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(Ncols))),
thrust::make_permutation_iterator(
d_temp.begin() + Ncols - 1,
thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(Ncols))) + Nrows,
d_row_sums_3.begin());
printf("Timing for approach #3 = %f\n", timerGPU.GetCounter());
for(int i = 0; i < Nrows; i++) {
std::cout << "[ ";
for(int j = 0; j < Ncols; j++)
std::cout << d_matrix[i * Ncols + j] << " ";
std::cout << "] = " << d_row_sums_3[i] << "\n";
}
/***************/
/* APPROACH #4 */
/***************/
cublasHandle_t handle;
timerGPU.StartCounter();
cublasSafeCall(cublasCreate(&handle));
thrust::device_vector<float> d_row_sums_4(Nrows);
thrust::device_vector<float> d_ones(Ncols, 1.f);
float alpha = 1.f;
float beta = 0.f;
cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_T, Ncols, Nrows, &alpha, thrust::raw_pointer_cast(d_matrix.data()), Ncols,
thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_row_sums_4.data()), 1));
printf("Timing for approach #4 = %f\n", timerGPU.GetCounter());
for(int i = 0; i < Nrows; i++) {
std::cout << "[ ";
for(int j = 0; j < Ncols; j++)
std::cout << d_matrix[i * Ncols + j] << " ";
std::cout << "] = " << d_row_sums_4[i] << "\n";
}
return 0;
}
TIMING RESULTS (tested on a Kepler K20c)
Matrix size #1 #1-v2 #2 #3 #4 #4 (no plan)
100 x 100 0.63 1.00 0.10 0.18 139.4 0.098
1000 x 1000 1.25 1.12 3.25 1.04 101.3 0.12
5000 x 5000 8.38 15.3 16.05 13.8 111.3 1.14
100 x 5000 1.25 1.52 2.92 1.75 101.2 0.40
5000 x 100 1.35 1.99 0.37 1.74 139.2 0.14
It seems that approaches #1 and #3 outperform approach #2, except in the cases of small numbers of columns. The best approach, however, is approach #4, which is significantly more convenient than the others, provided that the time needed to create the plan can be amortized during the computation.