0
votes

I would like to have a sliding window to calculate several quantities (min, max, total, mean) on a whole data table that has about 130 columns. The window slides over groups of data set by the first key. The following code works for one column, but I don't know how to modify the code for multiple columns.

# Rolling sums
# ref http://stackoverflow.com/questions/23501262/r-using-data-table-to-efficiently-test-rolling-conditions-across-multiple-rows
#
library(data.table)
#
# Generate sample data
#
set.seed(57)
min_nSamples=5
max_nSamples=20
repN<-function(un) {
  rep(un,sample(min_nSamples:max_nSamples,1))
}
nUnits=1
units<- unlist(lapply(1:nUnits,repN))
dt<-data.table(unit = units)
dt <- dt[,{
  ## if there's just one row in the group of ID's, return nothing
  list(
    index = as.double(seq(1,.N))
  )}, by = c("unit")]
dt[, var1:=runif(nrow(dt), -5, 5)]
if (FALSE) {
  dt[, var2:=runif(nrow(dt), 5, 10)]
  dt[, var3:=runif(nrow(dt), 10, 15)]
}
setkeyv(dt,c("unit","index"))


# Reference
# http://stackoverflow.com/questions/14937165/using-dynamic-column-names-in-data-table

# Get a complete map of units / indices
keys = key(dt)
key_dt = CJ(event=unique(dt$unit), index=min(dt$index):max(dt$index))

key_map_values = dt[key_dt]

window_size = 5L
window_dt = key_map_values[, list(window_index = seq(index-window_size, index-1L, by=1L)), by=keys]

setkeyv(window_dt, c(keys[1], "window_index")) ## note the join here is on "event, window"
setkeyv(key_map_values, keys)

ans = key_map_values[window_dt]

q_mov_avg = ans[, mean(var1, na.rm=TRUE) * (!any(index < 1) | NA), by="unit,i.index"]
q_mov_tot = ans[, sum(var1, na.rm=TRUE) * (!any(index < 1) | NA), by="unit,i.index"]
q_mov_max = ans[, max(var1, na.rm=TRUE) * (!any(index < 1) | NA), by="unit,i.index"]
q_mov_min = ans[, min(var1, na.rm=TRUE) * (!any(index < 1) | NA), by="unit,i.index"]

q_all_step_1 <- merge(q_mov_avg,q_mov_max,by=c("unit","i.index"))
setnames(q_all_step_1,"V1.x","avg_var1")
setnames(q_all_step_1,"V1.y","max_var1")
q_all_step_2 <- merge(q_all_step_1,q_mov_tot,by=c("unit","i.index"))
setnames(q_all_step_2,"V1","tot_var1")
q_all <- merge(q_all_step_2,q_mov_min,by=c("unit","i.index"))
setnames(q_all,"V1","min_var1")
setnames(q_all,"i.index","index")
setkey(q_all,unit,index)
dt$src="orig"
q_all$src="window"

dt_melt <- melt(dt,id=c(1:2,4),measure=3)
q_all_melt <- melt(q_all,id=c(1:2,7),measure=3:6)
dt_q_melt<-rbind(dt_melt,q_all_melt)
dt_q_melt[src=="window"]$index <- dt_q_melt[src=="window"]$index - window_size/2
ggplot(data=dt_q_melt[unit==1,],aes(x=index,y=value,group=variable,color=variable)) +
  geom_line() +
  geom_point()

How do I modify the code to run when the data sample has more columns, set FALSE to TRUE above?

1

1 Answers

0
votes

The following code evaluates user defined functions on multiple columns of a data table using rows from a sliding window. There are two keys, the first key is used to group the data, and the second key is the direction of the sliding window. The results of the moving windows are plotted with the data.

#
# Generate sample data
#
library(data.table)
library(ggplot2)
library(reshape2)
fakeData<-function() {

  set.seed(103)
  # Little dataset
  min_nSamples=30
  max_nSamples=50
  nVar = 10
  nUnits=30
  if(FALSE) {
    # Big dataset
    min_nSamples=80
    max_nSamples=120
    nVar = 130
    nUnits=330
  } 

  repN<-function(un) {
    rep(un,sample(min_nSamples:max_nSamples,1))
  }
  units<- unlist(lapply(1:nUnits,repN))
  dt<-data.table(unit = units)
  dt <- dt[,{
    ## if there's just one row in the group of ID's, return nothing
    list(
      index = as.double(seq(1,.N))
    )}, by = c("unit")]
  for (iv in seq(1,nVar)) {
    vname <- paste("var_",iv,sep="")
    expr <- bquote(.(as.name(vname)):=runif(nrow(dt), -5, 5))
    dt[, eval(expr)]  
  }
  setkeyv(dt,c("unit","index"))
  return(dt)
}
dt<-fakeData()
(dim(dt))

# Get a complete map of units / indices
keys = key(dt)
key_dt = dt[, ..keys]
setkey(key_dt)

key_map_values = dt[key_dt]

window_size = 10L
fmla_txt = parse(text=paste("list(window_index = seq(", keys[2], "-window_size, ",
                            keys[2],"-1L, by=1L))",sep=""))
key_map_values[is.na(get(keys[2])), (keys[2]):=-1 ] # Remove NA's
window_dt = key_map_values[, eval(fmla_txt), by=keys]

setkeyv(window_dt, c(keys[1], "window_index")) ## note the join here is on "event, window"
setkeyv(key_map_values, keys)

ans = key_map_values[window_dt]

mov_mean <- function(v,indx) {
  mean(v, na.rm=TRUE) * (!any(indx < 1) | NA)
}
mov_total <- function(v,indx) {
  sum(v, na.rm=TRUE) * (!any(indx < 1) | NA)
}
mov_min <- function(v,indx) {
  max(v, na.rm=TRUE) * (!any(indx < 1) | NA)
}
mov_max <- function(v,indx) {
  min(v, na.rm=TRUE) * (!any(indx < 1) | NA)
}


# q_mov_avg = ans[, lapply(.SD, mov_mean, index), by="unit,i.index"]
#
fmla_txt = parse(text=paste("lapply(.SD, mov_mean, ", keys[2], ")",sep=""))
ikey2=paste("i.",keys[2],sep="")
klist=c(keys[1],ikey2)
q_mov_avg = ans[, eval(fmla_txt), by=klist]
fmla_txt = parse(text=paste("lapply(.SD, mov_total, ", keys[2], ")",sep=""))
q_mov_tot = ans[, eval(fmla_txt), by=klist]
fmla_txt = parse(text=paste("lapply(.SD, mov_max, ", keys[2], ")",sep=""))
q_mov_max = ans[, eval(fmla_txt), by=klist]
fmla_txt = parse(text=paste("lapply(.SD, mov_min, ", keys[2], ")",sep=""))
q_mov_min = ans[, eval(fmla_txt), by=klist]
# 
change_col_names <- function(dt,cname) {
  namesToChange <- names(dt)[grep(paste(keys,collapse="|"),names(dt),invert=TRUE)]
  chgName <- function(n) {
    setnames(dt,n,paste(cname,n,sep="."))
  }
  lapply(namesToChange,chgName)

  # dt$index<-NULL
  fmla_txt = parse(text=paste(keys[2],":=NULL", sep=""))
  dt[,eval(fmla_txt)]
  setnames(dt,ikey2,keys[2])
  setkeyv(dt,keys)
  return(dt)
}
q_mov_avg <- change_col_names(q_mov_avg,"avg")
q_mov_tot <- change_col_names(q_mov_tot,"sum")
q_mov_max <- change_col_names(q_mov_max,"max")
q_mov_min <- change_col_names(q_mov_min,"min")
dt_all <- Reduce(merge,list(dt,q_mov_avg,q_mov_tot,q_mov_max,q_mov_min))
# dt_all$window_index <- dt_all$index - window_size/2
dt_all$window_index <- dt_all[,keys[2],with=FALSE] - window_size/2
# 
nplots=3
fmla_txt = parse(text=paste("punit = sort(sample(unique(dt_all[,",keys[1],"]),nplots))", sep=""))
eval(fmla_txt)
for (idx_key1 in punit) {
  vname <- sample(names(dt)[!(names(dt) %in% key(dt))],1)
  fmla_txt = parse(text=paste("dt_all[",keys[1],"==idx_key1,]", sep=""))
  pdt <- melt(eval(fmla_txt),id=c("index"),
              measure=c(vname,paste("max",vname,sep="."),
                        paste("min",vname,sep="."),
                        paste("avg",vname,sep="."),
                        paste("sum",vname,sep=".")))
  pdt[variable==vname,"variable"] = "Data"
  pdt[variable==paste("max",vname,sep="."),"variable"] = "Max"
  pdt[variable==paste("min",vname,sep="."),"variable"] = "Min"
  pdt[variable==paste("avg",vname,sep="."),"variable"] = "Avg"
  pdt[variable==paste("sum",vname,sep="."),"variable"] = "Sum"
  fmla_txt = parse(text=paste("dt_all[",keys[1],"==idx_key1,",keys[2],"]", sep=""))
  pdt[variable=="Data","x"]=eval(fmla_txt)
  fmla_txt = parse(text=paste("dt_all[",keys[1],"==idx_key1,window_index]", sep=""))
  pdt[variable!="Data","x"]=eval(fmla_txt)
  gp<-ggplot(data=pdt,aes(x=x,y=value,group=variable,color=variable)) +
    geom_line(size=1.1) +
    labs(title=paste(keys[1],idx_key1),y=vname)+theme(legend.title=element_blank())
  print(gp)
}

enter image description here