0
votes

Sample Data I have.

DF <- data.frame(
ID =c(1,2,3,4,5,6),
YEAR1 =c(2003,2005,2007,2008,2011,NA),
TEST1 =c(0,0,0,0,0,NA),
DROP1 =c(0,0,0,0,0,NA),
YEAR2 =c(2005,2007,2009,2010,2013,2011),
TEST2 =c(1,0,0,0,0,NA),
DROP2 =c(1,0,0,1,0,0),
YEAR3 =c(2007,2009,2011,2012,2015,2014),
TEST3 =c(NA,1,1,NA,1,0),
DROP3 =c(NA,0,0,NA,0,0),
YEAR4 =c(2009,2012,2013,2014,2017,2016),
TEST4 =c(NA,NA,1,NA,0,0),
DROP4 =c(NA,1,0,NA,0,0))

Same Data I want

DF_NEW <- data.frame(
A=c(1,2,3,4,5,6),
B=c(1,1,1,0,1,0),
C=c(1,1,1,1,0,0),
D=c(2003,2005,2007,2008,2011,2011),
E=c(2003,2007,2009,2010,2013,2016),
F=c(2005,2009,2011,2010,2015,2016),
G=c(2005,2012,2013,2010,2017,2016))

For this data:

A = Student ID

B = 1 if Student ever gets a score of 'TEST' that is 1. If not, 0.

C = 1 if Student ever gets a score of 'DROP' that is 1. If not, 0.

D = Student first year reported.

E = If Student gets a first score of 'TEST' = 1 in Year N then E equals to YEAR[N-1]. Not actually subtracting 1 from YEAR but instead taking the Year reported before the Student got a first score of 'TEST' = 1. If a Student never get a score of 'TEST' = 1 then E equals to the most recent (last) YEAR reported.

F = Year Student gets a first score of 'TEST' = 1. If Student never gets a score of 'TEST' = 1 then it is the most recent (last) YEAR.

G = Year Student gets a first score of 'DROP' = 1. If Student never gets a score of 'DROP' = 1 then it is the most recent (last) YEAR.

I made many attempts including the dplyr package but am wondering how to make this work correctly and efficiently. Specifically creating 'E'

This is what I have so far:

DF$A <- DF$ID DF$B <- apply(DF[,c("TEST1","TEST2","TEST3","TEST4")],1,max) 
DF$B[is.na(DF$B)] <- 0 
DF$C <-apply(DF[,c("DROP1","DROP2","DROP3","DROP4")],1,max)
DF$C[is.na(DF$C)] <- 0 
DF$D <- apply(DF[,c("YEAR1","YEAR2","YEAR3","YEAR4")],1,min) 
1

1 Answers

0
votes

Here is a method with data.table package

library(data.table)
library(stringi)
dt <- as.data.table(DF)
# convert from wide to long
ldt <- melt(dt, id.vars = "ID")
# split out variable from time indication
ldt[, time_id := as.integer(stringi::stri_extract_first_regex(variable, "\\d*$"))]
ldt[, variable2 := stringi::stri_replace_all_regex(variable, "\\d*$", "")]

# functions for E,F,G
getE <- function(var, val, time){
  w <- time[var=="TEST"][which(val[var == "TEST"] == 1)]
  if(length(w) > 0){
    t <- max(1, min(w)-1)
  }else{
    t <- max(time[var=="TEST" & !is.na(val)])
  }
  out <- val[time == t & var == "YEAR"]
  out
}
getFG <- function(var, val, col="TEST"){
  x <- val[var=="YEAR"]
  y <- val[var==col]
  w <- which(y==1)
  if(length(w)==0){
    w <- which(x == max(x[!is.na(y)]))
  }else{
    w <- min(w, na.rm=TRUE)
  }
  out <- x[w]
  out
}

# data.table aggreggation method
out <- ldt[, .(
  B = as.integer(any(value[variable2 == "TEST"] == 1, na.rm=TRUE))
  , C = as.integer(any(value[variable2 == "DROP"] == 1, na.rm=TRUE))
  , D = min(value[variable2 == "YEAR"], na.rm=TRUE)
  , E = getE(variable2, value, time_id)
  , F = getFG(variable2, value, "TEST")
  , G = getFG(variable2, value, "DROP")
), by = .(A=ID)]

# back to data.frame
out <- as.data.frame(out)
out

# test
# is C[3] correct?
out == DF_NEW