I would like to have a sliding window to calculate several quantities (min, max, total, mean) on a whole data table that has about 130 columns. The window slides over groups of data set by the first key. The following code works for one column, but I don't know how to modify the code for multiple columns.
# Rolling sums
# ref http://stackoverflow.com/questions/23501262/r-using-data-table-to-efficiently-test-rolling-conditions-across-multiple-rows
#
library(data.table)
#
# Generate sample data
#
set.seed(57)
min_nSamples=5
max_nSamples=20
repN<-function(un) {
rep(un,sample(min_nSamples:max_nSamples,1))
}
nUnits=1
units<- unlist(lapply(1:nUnits,repN))
dt<-data.table(unit = units)
dt <- dt[,{
## if there's just one row in the group of ID's, return nothing
list(
index = as.double(seq(1,.N))
)}, by = c("unit")]
dt[, var1:=runif(nrow(dt), -5, 5)]
if (FALSE) {
dt[, var2:=runif(nrow(dt), 5, 10)]
dt[, var3:=runif(nrow(dt), 10, 15)]
}
setkeyv(dt,c("unit","index"))
# Reference
# http://stackoverflow.com/questions/14937165/using-dynamic-column-names-in-data-table
# Get a complete map of units / indices
keys = key(dt)
key_dt = CJ(event=unique(dt$unit), index=min(dt$index):max(dt$index))
key_map_values = dt[key_dt]
window_size = 5L
window_dt = key_map_values[, list(window_index = seq(index-window_size, index-1L, by=1L)), by=keys]
setkeyv(window_dt, c(keys[1], "window_index")) ## note the join here is on "event, window"
setkeyv(key_map_values, keys)
ans = key_map_values[window_dt]
q_mov_avg = ans[, mean(var1, na.rm=TRUE) * (!any(index < 1) | NA), by="unit,i.index"]
q_mov_tot = ans[, sum(var1, na.rm=TRUE) * (!any(index < 1) | NA), by="unit,i.index"]
q_mov_max = ans[, max(var1, na.rm=TRUE) * (!any(index < 1) | NA), by="unit,i.index"]
q_mov_min = ans[, min(var1, na.rm=TRUE) * (!any(index < 1) | NA), by="unit,i.index"]
q_all_step_1 <- merge(q_mov_avg,q_mov_max,by=c("unit","i.index"))
setnames(q_all_step_1,"V1.x","avg_var1")
setnames(q_all_step_1,"V1.y","max_var1")
q_all_step_2 <- merge(q_all_step_1,q_mov_tot,by=c("unit","i.index"))
setnames(q_all_step_2,"V1","tot_var1")
q_all <- merge(q_all_step_2,q_mov_min,by=c("unit","i.index"))
setnames(q_all,"V1","min_var1")
setnames(q_all,"i.index","index")
setkey(q_all,unit,index)
dt$src="orig"
q_all$src="window"
dt_melt <- melt(dt,id=c(1:2,4),measure=3)
q_all_melt <- melt(q_all,id=c(1:2,7),measure=3:6)
dt_q_melt<-rbind(dt_melt,q_all_melt)
dt_q_melt[src=="window"]$index <- dt_q_melt[src=="window"]$index - window_size/2
ggplot(data=dt_q_melt[unit==1,],aes(x=index,y=value,group=variable,color=variable)) +
geom_line() +
geom_point()
How do I modify the code to run when the data sample has more columns, set FALSE to TRUE above?