Forward fill a column's values by group after a specific value in another column

Question

I have a data table similar to something like this:

> dt
   FundId Period FundAssets
1       a 200601          0
2       a 200602          0
3       a 200603          0
4       a 200604   40000000
5       a 200605   45000000
6       a 200606   48000000
7       a 200607   52000000
8       a 200608   55000000
9       a 200609   57000000
10      a 200610   49000000
11      a 200611   16000000
12      a 200612    1500000
13      b 200601          0
14      b 200602          0
15      b 200603          0
16      b 200604   58000000
17      b 200605   24000000
18      b 200606   16000000
19      b 200607   57000000
20      b 200608          0
21      b 200609          0
22      b 200610          0
23      b 200611          0
24      b 200612          0
25      c 200601   57000000
26      c 200602   65000000
27      c 200603   70000000
28      c 200604   70000000
29      c 200605   78000000
30      c 200606   43000000
31      c 200607   56000000
32      c 200608   33000000
33      c 200609   23000000
34      c 200610   21000000
35      c 200611   24000000
36      c 200612   23000000

But with many more columns and period values. However, these are the important columns for this question. I am trying to make a new column that will serve as a marker for if the fund is in existence and has ever reached 50,000,000 in assets.

My idea was to do this in 2 columns: check1 and check2. check1 would see what Periods of each fund have 50,000,000 or more in assets. I got this part with

dt[, check1 := dt[,FundAssets]>=50000000]

> dt
    FundId Period FundAssets check1
 1:      a 200601          0  FALSE
 2:      a 200602          0  FALSE
 3:      a 200603          0  FALSE
 4:      a 200604   40000000  FALSE
 5:      a 200605   45000000  FALSE
 6:      a 200606   48000000  FALSE
 7:      a 200607   52000000   TRUE
 8:      a 200608   55000000   TRUE
 9:      a 200609   57000000   TRUE
10:      a 200610   49000000  FALSE
11:      a 200611   16000000  FALSE
12:      a 200612    1500000  FALSE
13:      b 200601          0  FALSE
14:      b 200602          0  FALSE
15:      b 200603          0  FALSE
16:      b 200604   58000000   TRUE
17:      b 200605   24000000  FALSE
18:      b 200606   16000000  FALSE
19:      b 200607   57000000   TRUE
20:      b 200608          0  FALSE
21:      b 200609          0  FALSE
22:      b 200610          0  FALSE
23:      b 200611          0  FALSE
24:      b 200612          0  FALSE
25:      c 200601   57000000   TRUE
26:      c 200602   65000000   TRUE
27:      c 200603   70000000   TRUE
28:      c 200604   70000000   TRUE
29:      c 200605   78000000   TRUE
30:      c 200606   43000000  FALSE
31:      c 200607   56000000   TRUE
32:      c 200608   33000000  FALSE
33:      c 200609   23000000  FALSE
34:      c 200610   21000000  FALSE
35:      c 200611   24000000  FALSE
36:      c 200612   23000000  FALSE

check2 would then be a column with TRUEs after the first TRUE in check1 and as long as FundAssets>0. However, I have run into issues trying to fill the rest of the TRUEs down the column. Basically, the final dt would look like:

> dt
    FundId Period FundAssets check1 check2
 1:      a 200601          0  FALSE  FALSE
 2:      a 200602          0  FALSE  FALSE
 3:      a 200603          0  FALSE  FALSE
 4:      a 200604   40000000  FALSE  FALSE
 5:      a 200605   45000000  FALSE  FALSE
 6:      a 200606   48000000  FALSE  FALSE
 7:      a 200607   52000000   TRUE   TRUE
 8:      a 200608   55000000   TRUE   TRUE
 9:      a 200609   57000000   TRUE   TRUE
10:      a 200610   49000000  FALSE   TRUE
11:      a 200611   16000000  FALSE   TRUE
12:      a 200612    1500000  FALSE   TRUE
13:      b 200601          0  FALSE  FALSE
14:      b 200602          0  FALSE  FALSE
15:      b 200603          0  FALSE  FALSE
16:      b 200604   58000000   TRUE   TRUE
17:      b 200605   24000000  FALSE   TRUE
18:      b 200606   16000000  FALSE   TRUE
19:      b 200607   57000000   TRUE   TRUE
20:      b 200608          0  FALSE  FALSE
21:      b 200609          0  FALSE  FALSE
22:      b 200610          0  FALSE  FALSE
23:      b 200611          0  FALSE  FALSE
24:      b 200612          0  FALSE  FALSE
25:      c 200601   57000000   TRUE   TRUE
26:      c 200602   65000000   TRUE   TRUE
27:      c 200603   70000000   TRUE   TRUE
28:      c 200604   70000000   TRUE   TRUE
29:      c 200605   78000000   TRUE   TRUE
30:      c 200606   43000000  FALSE   TRUE
31:      c 200607   56000000   TRUE   TRUE
32:      c 200608   33000000  FALSE   TRUE
33:      c 200609   23000000  FALSE   TRUE
34:      c 200610   21000000  FALSE   TRUE
35:      c 200611   24000000  FALSE   TRUE
36:      c 200612   23000000  FALSE   TRUE

So, I would be able to see if a fund is in existence and has hit $50,000,000 assets in its history by seeing if either check1 or check2 are TRUE in any given period.

It would also be fine to fill out the rest of check1 with the TRUEs and eliminate the need for check2. I had looked at forward fill functions, but they seem to be for NAs. An answer using data.table is preferred.

akrun akrun · Accepted Answer · 2020-07-23T21:04:15

An option will be cummax while having a logical index in i

library(data.table)     
setDT(dt)[, check1 := FundAssets > 50000000 # // create the check1
       ][, check2 := FALSE][ # // create the check2 as FALSE
      FundAssets != 0, # // specify the i with logical condition
        # // grouped by FundId, get the cummax of check1 convert to logical
         check2 := as.logical(cummax(check1)), FundId][]
#   FundId Period FundAssets check1 check2
# 1:      a 200601          0  FALSE  FALSE
# 2:      a 200602          0  FALSE  FALSE
# 3:      a 200603          0  FALSE  FALSE
# 4:      a 200604   40000000  FALSE  FALSE
# 5:      a 200605   45000000  FALSE  FALSE
# 6:      a 200606   48000000  FALSE  FALSE
# 7:      a 200607   52000000   TRUE   TRUE
# 8:      a 200608   55000000   TRUE   TRUE
# 9:      a 200609   57000000   TRUE   TRUE
#10:      a 200610   49000000  FALSE   TRUE
#11:      a 200611   16000000  FALSE   TRUE
#12:      a 200612    1500000  FALSE   TRUE
#13:      b 200601          0  FALSE  FALSE
#14:      b 200602          0  FALSE  FALSE
#15:      b 200603          0  FALSE  FALSE
#16:      b 200604   58000000   TRUE   TRUE
#17:      b 200605   24000000  FALSE   TRUE
#18:      b 200606   16000000  FALSE   TRUE
#19:      b 200607   57000000   TRUE   TRUE
#20:      b 200608          0  FALSE  FALSE
#21:      b 200609          0  FALSE  FALSE
#22:      b 200610          0  FALSE  FALSE
#23:      b 200611          0  FALSE  FALSE
#24:      b 200612          0  FALSE  FALSE
#25:      c 200601   57000000   TRUE   TRUE
#26:      c 200602   65000000   TRUE   TRUE
#27:      c 200603   70000000   TRUE   TRUE
#28:      c 200604   70000000   TRUE   TRUE
#29:      c 200605   78000000   TRUE   TRUE
#30:      c 200606   43000000  FALSE   TRUE
#31:      c 200607   56000000   TRUE   TRUE
#32:      c 200608   33000000  FALSE   TRUE
#33:      c 200609   23000000  FALSE   TRUE
#34:      c 200610   21000000  FALSE   TRUE
#35:      c 200611   24000000  FALSE   TRUE
#36:      c 200612   23000000  FALSE   TRUE

data

dt <- structure(list(FundId = c("a", "a", "a", "a", "a", "a", "a", 
"a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b", "b", "b", 
"b", "b", "b", "b", "c", "c", "c", "c", "c", "c", "c", "c", "c", 
"c", "c", "c"), Period = c(200601L, 200602L, 200603L, 200604L, 
200605L, 200606L, 200607L, 200608L, 200609L, 200610L, 200611L, 
200612L, 200601L, 200602L, 200603L, 200604L, 200605L, 200606L, 
200607L, 200608L, 200609L, 200610L, 200611L, 200612L, 200601L, 
200602L, 200603L, 200604L, 200605L, 200606L, 200607L, 200608L, 
200609L, 200610L, 200611L, 200612L), FundAssets = c(0L, 0L, 0L, 
40000000L, 45000000L, 48000000L, 52000000L, 55000000L, 57000000L, 
49000000L, 16000000L, 1500000L, 0L, 0L, 0L, 58000000L, 24000000L, 
16000000L, 57000000L, 0L, 0L, 0L, 0L, 0L, 57000000L, 65000000L, 
70000000L, 70000000L, 78000000L, 43000000L, 56000000L, 33000000L, 
23000000L, 21000000L, 24000000L, 23000000L)), class = "data.frame", 
row.names = c("1", 
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", 
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", 
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", 
"36"))

Forward fill a column's values by group after a specific value in another column

1 Answers

data