3
votes

I am trying to construct a group-specific variable that is:

  1. NA for observations until a condition is met.
  2. NA for the first observation when the condition is met.
  3. For subsequent observations, the value of the last prior observation when the condition was met.

Below is a MWE. I am looking for code to construct "last_value". I would much prefer to use data.table (or dplyr) for scalability/performance but am open to other suggestions. Thanks!

library(data.table)

data.table(
    group = c(rep("alpha", 4), rep("beta", 3)),
    time = c(1:4, 1:3),
    condition = c("Yes", "No", "Yes", "No", "No", "Yes", "No"),
    value = 1:7,
    last_value = c(NA, 1, 1, 3, NA, NA, 6))

#    group time condition value last_value
# 1: alpha    1       Yes     1         NA
# 2: alpha    2        No     2          1
# 3: alpha    3       Yes     3          1
# 4: alpha    4        No     4          3
# 5:  beta    1        No     5         NA
# 6:  beta    2       Yes     6         NA
# 7:  beta    3        No     7          6

# last_value is:

#     NA in the 1st row as that is the 1st observation for group "alpha"
#     1 in the 2nd row as the 1st observation is condition = "Yes"
#     1 in the 3rd row as the 1st observation is condition = "Yes"; 2nd observation is condition = "No"
#     3 in the 4rd row as the 3rd observation is condition = "Yes"

#     NA in the 5th row as that is the 1st observation for group "beta"
#     NA in the 6th row as there is no prior observation with condition = "Yes"
#     6 in the 7th row as the 6th observation is condition = "Yes"
4

4 Answers

3
votes

Here is another one in dplyr using cummax.

library(dplyr)
df %>%
  group_by(group) %>%
  mutate(last = cummax(row_number() * (condition == "Yes")),
         last = lag(value[replace(last, last == 0, NA)]))


#  group  time condition value last_value  last
#  <fct> <int> <fct>     <int>      <dbl> <int>
#1 alpha     1 Yes           1         NA    NA
#2 alpha     2 No            2          1     1
#3 alpha     3 Yes           3          1     1
#4 alpha     4 No            4          3     3
#5 beta      1 No            5         NA    NA
#6 beta      2 Yes           6         NA    NA
#7 beta      3 No            7          6     6
2
votes

Took the opportunity to play with some of data.table's newest additions fifelse() and nafill():

DT[order(time), # necessary if data is note in order
   last_value2 := nafill(shift(fifelse(condition == "Yes", value, NA_integer_)), "locf"),
   by = group
   ]

   group time condition value last_value last_value2
1: alpha    1       Yes     1         NA          NA
2: alpha    2        No     2          1           1
3: alpha    3       Yes     3          1           1
4: alpha    4        No     4          3           3
5:  beta    1        No     5         NA          NA
6:  beta    2       Yes     6         NA          NA
7:  beta    3        No     7          6           6

Or with pipes:

DT[, 
   last_value2 := 
     fifelse(condition == "Yes", value, NA_integer_) %>% 
       shift() %>% 
       nafill("locf"),
   by = group
   ]
1
votes

Here's a dplyr approach, where the desired output in last_value is generated faithfully in last_value2.

library(dplyr)
library(tidyr)
df %>%
  group_by(group) %>%
  mutate(value2 = if_else(condition == "Yes", value, NA_integer_)) %>%
  tidyr::fill(value2) %>%
  mutate(last_value2 = lag(value2)) %>%
  ungroup()

## A tibble: 7 x 7
#  group  time condition value last_value value2 last_value2
#  <fct> <int> <fct>     <int>      <dbl>  <int>       <int>
#1 alpha     1 Yes           1         NA      1          NA
#2 alpha     2 No            2          1      1           1
#3 alpha     3 Yes           3          1      3           1
#4 alpha     4 No            4          3      3           3
#5 beta      1 No            5         NA     NA          NA
#6 beta      2 Yes           6         NA      6          NA
#7 beta      3 No            7          6      6           6

Assuming the data loaded here:

df <- data.frame(
  group = c(rep("alpha", 4), rep("beta", 3)),
  time = c(1:4, 1:3),
  condition = c("Yes", "No", "Yes", "No", "No", "Yes", "No"),
  value = 1:7,
  last_value = c(NA, 1, 1, 3, NA, NA, 6))
1
votes

Here are a few options using data.table:

1. using non-equi join

DT[, lv := 
    DT[condition=="Yes"][.SD, on=.(group, time<time), x.value, by=.EACHI, mult="last"]$x.value
]

2. Cut sequences into groups where it starts with TRUE

DT2[, lv := if(condition[1L]) value[1L], .(group, cumsum(condition))][,
    lv := shift(lv), group]

3. Rolling join in data.table (which should be the fastest)

DT3[, c("lv", "t2") := .(NA_integer_, shift(time))]
DT3[group!=shift(group), t2 := NA_integer_]
DT3[, lv := DT3[(condition)][.SD, on=.(group, time=t2), roll=Inf, x.value]]

4. In the dev version of data.table, you should be able to use DT[, lv2 := shift(nafill(replace(value, condition=="No", NA_integer_)), group]. Caveat: cant test since cant upgrade.

data:

library(data.table)
DT <- data.table(
    group = c(rep("alpha", 4), rep("beta", 3)),
    time = c(1:4, 1:3),
    condition = c(T, F, T, F, F, T, F),
    value = 1:7,
    last_value = c(NA, 1, 1, 3, NA, NA, 6))
DT2 <- copy(DT); DT3 <- copy(DT);

output:

   group time condition value last_value lv
1: alpha    1      TRUE     1         NA NA
2: alpha    2     FALSE     2          1  1
3: alpha    3      TRUE     3          1  1
4: alpha    4     FALSE     4          3  3
5:  beta    1     FALSE     5         NA NA
6:  beta    2      TRUE     6         NA NA
7:  beta    3     FALSE     7          6  6

timing code:

library(data.table)

set.seed(0L)
ng <- 1e6
nr <- 1e7
DT <- data.table(group=sort(sample(ng, nr, TRUE)))
DT[, c("time", "condition", "value") := .(rowid(group), sample(c(TRUE, FALSE), nr, TRUE), .I)]
DT2 <- copy(DT)
DT3 <- copy(DT)

mtd0 <- function() {
    DT[, lv :=
        DT[(condition)][.SD, on=.(group, time<time), x.value, by=.EACHI, mult="last"]$x.value
    ]
}

mtd1 <- function() {
    DT2[, lv := if(condition[1L]) value[1L], .(group, cumsum(condition))][,
        lv := shift(lv), group]
}

mtd2 <- function() {
    DT3[, c("lv", "t2") := .(NA_integer_, shift(time))]
    DT3[group!=shift(group), t2 := NA_integer_]
    DT3[, lv := DT3[(condition)][.SD, on=.(group, time=t2), roll=Inf, x.value]]
}

bench::mark(mtd0(), mtd1(), mtd2(), check=FALSE)

timings:

# A tibble: 3 x 13
  expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result     memory   time  gc     
  <bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>     <list>   <lis> <list> 
1 mtd0()       5.24s   5.24s    0.191      485MB    0.191     1     1      5.24s <df[,5] [~ <df[,3]~ <bch~ <tibbl~
2 mtd1()      48.55s  48.55s    0.0206     240MB    9.19      1   446     48.55s <df[,5] [~ <df[,3]~ <bch~ <tibbl~
3 mtd2()       1.28s   1.28s    0.780      618MB    0         1     0      1.28s <df[,6] [~ <df[,3]~ <bch~ <tibbl~