Problem setting
The problem consists of sampling out of a year of 365 days n days, in such a way that
- the days are drawn by uniform probability distribution
- the days comply to have a minimum distance given by
min_dist - the result is given as numeric vector
Example
With n= 12 and min_dist= 20 a proper result might be the vector
[1] 4 43 69 97 129 161 192 215 243 285 309 343 as diff of this vector is [1] 39 26 28 32 32 31 23 28 42 24 34, all values larger or equal to min_dist= 20.
Question
I have solved this problem with
- function
sample_r()in nativeR - function
sample_cpp()inc++using the fantasticRcppinterface package
The c++ solution turns out to much slower (on my Mac factor 60x). I am a Rccp newbie, hence my own research capabilities are limited - please forgive.
What can I do to refactor the c++ code to be faster than native R code ?
Reproducible code (.cpp file)
#include <Rcpp.h>
using namespace Rcpp;
using namespace std;
// [[Rcpp::export]]
IntegerVector sample_cpp(int n, int min_dist= 5L, int seed= 42L) {
IntegerVector res_empty= Rcpp::rep(NA_INTEGER, n);
IntegerVector res;
IntegerVector available_days_full= Rcpp::seq(1, 365);
IntegerVector available_days;
IntegerVector forbidden_days;
IntegerVector forbidden_space = Rcpp::seq(-(min_dist-1), (min_dist-1));
bool fail;
Environment base("package:base");
Function set_seed = base["set.seed"];
set_seed(seed);
do {
res= res_empty;
available_days = available_days_full;
fail= FALSE;
for(int i= 0; i < n; ++i) {
res[i]= sample(available_days, 1, FALSE)[0];
forbidden_days= res[i]+forbidden_space;
available_days= setdiff(available_days, forbidden_days);
if(available_days.size() <= 1){
fail= TRUE;
break;
}
}
}
while(fail== TRUE);
std::sort(res.begin(), res.end());
return res;
}
/*** R
# c++ function
(r= sample_cpp(n= 12, min_dist= 20, seed=1))
diff(r)
# R function
sample_r= function(n= 12, min_dist=5, seed= 42){
if(n*min_dist>= 365) stop("Infeasible.")
set.seed(seed)
repeat{
res= numeric(n)
fail= FALSE
available_days= seq(365)
for(i in seq(n)){
if(length(available_days) <= 1){
fail= TRUE
break()
}
res[i]= sample(available_days, 1)
forbidden_days= res[i]+(-(min_dist-1):(min_dist-1))
available_days= setdiff(available_days, forbidden_days)
}
if(fail== FALSE) return(sort(res))
}
}
(r= sample_r(n= 12, min_dist= 20, seed= 40))
diff(r)
# Benchmark
library(rbenchmark)
benchmark(cpp= sample_cpp(n= 12, min_dist = 28),
r= sample_r(n= 12, min_dist = 28),
replications = 50)[,1:4]
*/
Benchmark:
test replications elapsed relative
1 cpp 50 28.005 63.217
2 r 50 0.443 1.000
Edit:
OK, I tried to optimize (as far as I am capable of c++), still the c++ implementation is behind, but now only marginally.
#include <Rcpp.h>
using namespace Rcpp;
using namespace std;
// [[Rcpp::export]]
IntegerVector sample_cpp(int n, int min_dist= 5L, int seed= 42L) {
IntegerVector res;
IntegerVector available_days;
IntegerVector forbidden_days;
IntegerVector forbidden_space = Rcpp::seq(-(min_dist-1), (min_dist-1));
bool fail;
Environment base("package:base");
Function set_seed = base["set.seed"];
set_seed(seed);
do {
res= Rcpp::rep(NA_INTEGER, n);
available_days = Rcpp::seq(1, 365);
fail= FALSE;
for(int i= 0; i < n; ++i) {
if(available_days.size() < n-i){
fail= TRUE;
break;
}
int temp= sample(available_days, 1, FALSE)[0];
res[i]= temp;
forbidden_days= unique(pmax(0, temp + forbidden_space));
available_days= setdiff(available_days, forbidden_days);
}
}
while(fail== TRUE);
std::sort(res.begin(), res.end());
return res;
}
/*** R
# R function
sample_r= function(n= 12, min_dist=5, seed= 42){
if(n*min_dist>= 365) stop("Infeasible.")
set.seed(seed)
repeat{
res= numeric(n)
fail= FALSE
available_days= seq(365)
for(i in seq(n)){
if(length(available_days) <= n-i){
fail= TRUE
break()
}
res[i]= sample(available_days, 1)
forbidden_days= res[i]+(-(min_dist-1):(min_dist-1))
available_days= setdiff(available_days, forbidden_days)
}
if(fail== FALSE) return(sort(res))
}
}
# Benchmark
library(rbenchmark)
benchmark(cpp= sample_cpp(n= 12, min_dist = 28),
r= sample_r(n= 12, min_dist = 28),
replications = 50)[,1:4]
*/
Benchmark:
test replications elapsed relative
1 cpp 50 0.643 1.475
2 r 50 0.436 1.000
forbidden_days= res[i]+forbidden_space;being one. But any answer will be a stab in the dark. You should really get a profiller to measure which part is taking all the CPU time and then look into how you can fix that. - UKMonkey"Mean relative difference: 0.06066634"fromall.equalbut I don't have time to fix this for the OP. - Dirk EddelbuettelRandRccp sugarare not synced, identity is not a test. - user2030503