I notice you asked for the most efficient way. If you are looking at scaling this up to a much larger data set, I would strongly recommend data.table.
library(data.table)
library(RcppRoll)
## Rolling sum of y over 2 consecutive rows within each group of x.
## The windows are left-aligned and padded with NA (fill = NA), seq numbers
## the rows of each group, and the NA-padded rows are dropped at the end.
l[
  ,
  list(
    sum = RcppRoll::roll_sum(y, n = 2L, fill = NA, align = "left"),
    seq = seq_len(.N)
  ),
  keyby = "x"
][!is.na(sum)]
A rough benchmark comparison of this vs an answer using the tidyverse packages with 100,000 rows and 10,000 groups illustrates the significant difference.
(I used Psidom's answer instead of jazzurro's, since jazzurro's did not allow for an arbitrary number of rows to be summed.)
library(tibble)
library(dplyr)
library(RcppRoll)
library(stringi) ## Only included for ability to generate random strings
## Generate data with arbitrary number of groups and rows --------------
rowCount <- 100000 ## Number of rows in the benchmark table
groupCount <- 10000 ## Number of random 3-character labels generated for x
sumRows <- 2L ## Number of consecutive rows added up in each rolling sum
set.seed(1) ## Fixed seed for the random sampling below
## x: group labels drawn with replacement from the random strings
## y: integer values drawn with replacement from 0..10
l <- tibble(
  x = sample(stri_rand_strings(groupCount, 3), rowCount, replace = TRUE),
  y = sample(0:10, rowCount, replace = TRUE)
)
## Using dplyr and tibble -----------------------------------------------
ptm <- proc.time() ## Start the clock
## For every group of x: rolling sum of y over sumRows consecutive rows,
## with seq numbering the resulting windows 1, 2, ...
dplyr_result <- l %>%
  group_by(x) %>%
  do(
    data.frame(
      sum = roll_sum(.$y, sumRows),
      seq = seq_len(length(.$y) - sumRows + 1)
    )
  )
## Console progress output pasted from the original run (not code):
## |========================================================0% ~0 s remaining
dplyr_time <- proc.time() - ptm ## Stop the clock
## Using data.table instead ----------------------------------------------
library(data.table)
ptm <- proc.time() ## Start the clock
setDT(l) ## Convert l to a data.table
## Per group of x: left-aligned rolling sum of y over sumRows rows, padded
## with NA; seq numbers the rows of each group, then the NA rows are removed.
dt_result <- l[
  ,
  list(
    sum = RcppRoll::roll_sum(y, n = sumRows, fill = NA, align = "left"),
    seq = seq_len(.N)
  ),
  keyby = "x"
][!is.na(sum)]
data.table_time <- proc.time() - ptm ## Stop the clock
Results:
> dplyr_time
user system elapsed
10.28 0.04 10.36
> data.table_time
user system elapsed
0.35 0.02 0.36
> all.equal(dplyr_result,as.tibble(dt_result))
[1] TRUE