This is my transaction data. It shows the transactions made from the accounts in from
column to the accounts in to
column with the date and the amount information
data
id from to date amount
<int> <fctr> <fctr> <date> <dbl>
19521 6644 6934 2005-01-01 700.0
19524 6753 8456 2005-01-01 600.0
19523 9242 9333 2005-01-01 1000.0
… … … … …
1056317 7819 7454 2010-12-31 60.2
1056318 6164 7497 2010-12-31 107.5
1056319 7533 7492 2010-12-31 164.1
I want to calculate how much transaction amount the accounts in from
column received in the last 6 month prior to the date that particular transaction was made and want to save this information as a new column.
This following code works very well to accomplish this in a small dataset ,say, with 1000 rows:
library(dplyr)
library(purrr)
data %>%
mutate(total_trx_amount_received_in_last_sixmonth= map2_dbl(from, date,
~sum(amount[to == .x & between(date, .y - 180, .y)])))
However, since my data has over 1 million rows, this code will take more than a couple of hours to complete.
I searched the internet if I can speed up the run time of this code. I tried this suggestion on SO about how to make purrr
map
function run faster. So, I tried the following code and instead of mutate
of dplyr
I used data.table
to speed up the code even faster:
library(future)
library(data.table)
library(furrr)
data[, total_trx_amount_received_in_last_sixmonth:= furrr::future_pmap_dbl(list(from, date),
~mean(amount[to == .x & between(date, .y-180, .y)])) ]
But, the speed hasn't been improved at all.
Is there any suggestion on how can I make the code run faster?
dput() output of the data:
structure(list(id = c(18529L, 13742L, 9913L, 956L, 2557L, 1602L,
18669L, 35900L, 48667L, 51341L, 53713L, 60126L, 60545L, 65113L,
66783L, 83324L, 87614L, 88898L, 89874L, 94765L, 100277L, 101587L,
103444L, 108414L, 113319L, 121516L, 126607L, 130170L, 131771L,
135002L, 149431L, 157403L, 157645L, 158831L, 162597L, 162680L,
163901L, 165044L, 167082L, 168562L, 168940L, 172578L, 173031L,
173267L, 177507L, 179167L, 182612L, 183499L, 188171L, 189625L,
193940L, 198764L, 199342L, 200134L, 203328L, 203763L, 204733L,
205651L, 209672L, 210242L, 210979L, 214532L, 214741L, 215738L,
216709L, 220828L, 222140L, 222905L, 226133L, 226527L, 227160L,
228193L, 231782L, 232454L, 233774L, 237836L, 237837L, 238860L,
240223L, 245032L, 246673L, 247561L, 251611L, 251696L, 252663L,
254410L, 255126L, 255230L, 258484L, 258485L, 259309L, 259910L,
260542L, 262091L, 264462L, 264887L, 264888L, 266125L, 268574L,
272959L), from = c("5370", "5370", "5370", "8605", "5370", "6390",
"5370", "5370", "8934", "5370", "5635", "6046", "5680", "8026",
"9037", "5370", "7816", "8046", "5492", "8756", "5370", "9254",
"5370", "5370", "7078", "6615", "5370", "9817", "8228", "8822",
"5735", "7058", "5370", "8667", "9315", "6053", "7990", "8247",
"8165", "5656", "9261", "5929", "8251", "5370", "6725", "5370",
"6004", "7022", "7442", "5370", "8679", "6491", "7078", "5370",
"5370", "5370", "5658", "5370", "9296", "8386", "5370", "5370",
"5370", "9535", "5370", "7541", "5370", "9621", "5370", "7158",
"8240", "5370", "5370", "8025", "5370", "5370", "5370", "6989",
"5370", "7059", "5370", "5370", "5370", "9121", "5608", "5370",
"5370", "7551", "5370", "5370", "5370", "5370", "9163", "9362",
"6072", "5370", "5370", "5370", "5370", "5370"), to = c("9356",
"5605", "8567", "5370", "5636", "5370", "8933", "8483", "5370",
"7626", "5370", "5370", "5370", "5370", "5370", "9676", "5370",
"5370", "5370", "5370", "9105", "5370", "9772", "6979", "5370",
"5370", "7564", "5370", "5370", "5370", "5370", "5370", "8744",
"5370", "5370", "5370", "5370", "5370", "5370", "5370", "5370",
"5370", "5370", "7318", "5370", "8433", "5370", "5370", "5370",
"7122", "5370", "5370", "5370", "8566", "6728", "9689", "5370",
"8342", "5370", "5370", "5614", "5596", "5953", "5370", "7336",
"5370", "7247", "5370", "7291", "5370", "5370", "6282", "7236",
"5370", "8866", "8613", "9247", "5370", "6767", "5370", "9273",
"7320", "9533", "5370", "5370", "8930", "9343", "5370", "9499",
"7693", "7830", "5392", "5370", "5370", "5370", "7497", "8516",
"9023", "7310", "8939"), date = structure(c(12934, 13000, 13038,
13061, 13099, 13113, 13117, 13179, 13238, 13249, 13268, 13296,
13299, 13309, 13314, 13391, 13400, 13404, 13409, 13428, 13452,
13452, 13460, 13482, 13493, 13518, 13526, 13537, 13542, 13544,
13596, 13616, 13617, 13626, 13633, 13633, 13639, 13642, 13646,
13656, 13660, 13664, 13667, 13669, 13677, 13686, 13694, 13694,
13707, 13716, 13725, 13738, 13739, 13746, 13756, 13756, 13756,
13761, 13769, 13770, 13776, 13786, 13786, 13786, 13791, 13799,
13806, 13813, 13817, 13817, 13817, 13822, 13829, 13830, 13836,
13847, 13847, 13847, 13852, 13860, 13866, 13871, 13878, 13878,
13878, 13882, 13883, 13883, 13887, 13887, 13888, 13889, 13890,
13891, 13895, 13896, 13896, 13899, 13905, 13909), class = "Date"),
amount = c(24.4, 7618, 21971, 5245, 2921, 8000, 169.2, 71.5,
14.6, 4214, 14.6, 13920, 14.6, 24640, 1600, 261.1, 16400,
3500, 2700, 19882, 182, 14.6, 16927, 25653, 3059, 2880, 9658,
4500, 12480, 14.6, 1000, 3679, 34430, 12600, 14.6, 19.2,
4900, 826, 3679, 2100, 38000, 79, 11400, 21495, 3679, 200,
14.6, 100.6, 3679, 5300, 108.9, 3679, 2696, 7500, 171.6,
14.6, 99.2, 2452, 3679, 3218, 700, 69.7, 14.6, 91.5, 2452,
3679, 2900, 17572, 14.6, 14.6, 90.5, 2452, 49752, 3679, 1900,
14.6, 870, 85.2, 2452, 3679, 1600, 540, 14.6, 14.6, 79, 210,
2452, 28400, 720, 180, 420, 44289, 489, 3679, 840, 2900,
150, 870, 420, 14.6)), row.names = c(NA, -100L), class = "data.frame")
dput()
. – sindri_baldur