2
votes

I have a large dataset in which I'm looking to use data.table to identify the first non-missing value per grouping id.

I've currently had a go at doing this by defining a function and applying it to the whole data frame using lapply(). I've also tried using mclapply(), but that appears to be even slower.

### Libraries ###
library(microbenchmark)
library(ggplot2)
library(data.table)

### Dummy Data Table ###
# 16-row example: 4 ids x 4 rows each. Every value column has its
# non-missing entries at different row positions, so the "first
# non-missing per id" answer differs by column. var_intmi has groups
# (ids 2 and 3) with no non-missing value at all.
dt <- data.table(
  id        = rep(1:4, each = 4),
  var_int   = c(NA, NA, NA, 1L, NA, NA, 10L, NA, NA, 100L, NA, NA, 1000L, NA, NA, NA),
  var_dou   = c(NA, NA, 1, NA, NA, 1.01, NA, NA, 1.001, NA, NA, NA, NA, NA, NA, 1.0001),
  var_cha   = c(NA, "a", NA, NA, "b", NA, NA, NA, NA, NA, NA, "c", NA, NA, "d", NA),
  var_intmi = c(1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 4L)
)
dt
##     id var_int var_dou var_cha var_intmi
##  1:  1      NA      NA    <NA>         1
##  2:  1      NA      NA       a        NA
##  3:  1      NA  1.0000    <NA>        NA
##  4:  1       1      NA    <NA>        NA
##  5:  2      NA      NA       b        NA
##  6:  2      NA  1.0100    <NA>        NA
##  7:  2      10      NA    <NA>        NA
##  8:  2      NA      NA    <NA>        NA
##  9:  3      NA  1.0010    <NA>        NA
## 10:  3     100      NA    <NA>        NA
## 11:  3      NA      NA    <NA>        NA
## 12:  3      NA      NA       c        NA
## 13:  4    1000      NA    <NA>        NA
## 14:  4      NA      NA    <NA>        NA
## 15:  4      NA      NA       d        NA
## 16:  4      NA  1.0001    <NA>         4

### Functions ###
# Each helper returns the first non-missing element of `x` (NA when
# every element of `x` is missing). The three differ only in how that
# element is selected, for benchmarking purposes.
firstnonmiss_1 <- function(x) {
  # Convert the logical mask to positions with which(), then subset twice.
  keep <- which(complete.cases(x))
  x[keep][1]
}
firstnonmiss_2 <- function(x) {
  # Drop the missing values, then take data.table::first().
  non_miss <- x[complete.cases(x)]
  first(non_miss)
}
firstnonmiss_3 <- function(x) {
  # Drop the missing values, then take element [1].
  non_miss <- x[complete.cases(x)]
  non_miss[1]
}

### Desired Output ###
# Collapse to one row per id: lapply() applies the helper to every
# column of the group subset (.SD).
dt[, lapply(.SD, firstnonmiss_3), by = id]
##    id var_int var_dou var_cha var_intmi
## 1:  1       1  1.0000       a         1
## 2:  2      10  1.0100       b        NA
## 3:  3     100  1.0010       c        NA
## 4:  4    1000  1.0001       d         4

### Benchmarking ###
# Time each variant 10,000 times on the 16-row table.
# NOTE(review): `t` shadows base::t(); harmless in this short script,
# but worth renaming (e.g. `bm`) if the code grows.
t <- microbenchmark(
  "which()[1]" = dt[, lapply(.SD, firstnonmiss_1), by = id],
  "first()" = dt[, lapply(.SD, firstnonmiss_2), by = id],
  "[1]" = dt[, lapply(.SD, firstnonmiss_3), by = id],
  times = 1e4
)
t
## Unit: microseconds
##        expr     min       lq     mean   median       uq       max neval
##  which()[1] 414.438 426.8485 516.7795 437.9710 460.8930 161388.83 10000
##     first() 401.574 413.6190 483.2857 424.6860 446.6475  41523.61 10000
##         [1] 388.845 401.4700 468.9951 411.3505 432.2035  33320.75 10000

### Plot Outputs ###
# Read the time unit from summary() instead of print(): both carry the
# same "unit" attribute, but print() also echoes the whole benchmark
# table to the console as a side effect.
units <- attr(summary(t), "unit")
autoplot(t) +
  labs(x = "Function", y = paste0("Timings, (", units, ")")) +
  scale_y_log10() +  # timings are right-skewed; scale_x_discrete() was a no-op and is dropped
  geom_violin(fill = "skyblue", alpha = 0.5) +
  theme_light() +
  theme(axis.text.y = element_text(family = "Monaco", angle = 90, hjust = 0.5))

Benchmarking

The benchmark times on the dummy dataset aren't too bad, but when I run the function on my actual dataset (1,019 columns; 1,506,451 rows; 502,540 group ids) it takes around 11 minutes to complete. Is there a better/faster way to get a collapsed data frame containing the first non-missing observation per group id for each column/variable?

3
Is id always sorted? - CPak
I can run setkey(dt, id) first, but the missing data values aren't always in the same positions for each of the other variables. - Feakster
If it's sorted, vectorized solutions might be possible. - CPak

3 Answers

2
votes

This may be a case where melting the dataset and casting is faster, because there are only about 3 rows per group.

Using @chinsoon12's dataset, I get 2-3 seconds with OP's original solutions vs. 0.4 s with melt and cast. If you don't mind keeping the data molten (i.e., long), that is around 0.2 seconds which is about 10x faster than the original.

# Melt to long form, keep the first non-missing value for each
# (grp, variable) pair, then cast back to wide.
long <- melt(DT, id.vars = 'grp')
firsts <- long[!is.na(value), .SD[1], by = .(grp, variable)]
dcast(firsts, grp ~ variable)

# Variant: stop after the long-form step if molten output is acceptable.
long[!is.na(value), .SD[1], by = .(grp, variable)]

# Variant: same wide result via na.omit() plus row indices (.I).
molten_DT <- na.omit(melt(DT, id.vars = 'grp'), 'value')
first_rows <- molten_DT[, .I[1], by = .(grp, variable)]$V1
dcast(molten_DT[first_rows, ], grp ~ variable)
library(data.table)
library(microbenchmark)

# @chinsoon12's dataset: a scaled-down stand-in for the real data
# (ngrp groups x avgNr rows each, nc value columns with random NAs).
set.seed(0L)
ngrp  <- 1000L # real data: 502540
avgNr <- 3L
nc    <- 1000L # real data: 1019
cells <- sample(c(NA, 1), ngrp * avgNr * nc, TRUE)
DT <- data.table(
  as.data.table(matrix(cells, nrow = ngrp * avgNr, ncol = nc)),
  grp = rep(1:ngrp, each = avgNr)
)

# Time the OP's three lapply() approaches once each for comparison.
system.time(DT[, lapply(.SD, firstnonmiss_1), by = grp])
system.time(DT[, lapply(.SD, firstnonmiss_2), by = grp])
system.time(DT[, lapply(.SD, firstnonmiss_3), by = grp])

# Benchmark melt+cast against melt-only, 20 runs each.
microbenchmark(
  melt_and_cast = dcast(
    melt(DT, id.vars = 'grp')[!is.na(value), .SD[1], by = .(grp, variable)],
    grp ~ variable
  ),
  melt_1 = melt(DT, id.vars = 'grp')[!is.na(value), .SD[1], by = .(grp, variable)],
  times = 20
)
2
votes

You might want to consider using Rcpp to reduce the number of NA checks:

library(Rcpp)
# Compile a small C++ helper: for each column of a numeric matrix,
# scan down the rows and return the first non-NA value (NA_REAL when
# the whole column is missing). The `break` stops scanning a column
# as soon as a value is found, so only the leading NAs are checked.
cppFunction('
    NumericVector firstNonNA(NumericMatrix M) {
        NumericVector res(M.ncol());

        for (int j=0; j<M.ncol(); j++) {
            res[j] = NA_REAL;
            for (int i=0; i<M.nrow(); i++) {
                if (!Rcpp::traits::is_na<REALSXP>(M(i, j))) {
                    res[j] = M(i, j);
                    break;
                }
            }
        }

        return res;
    }
')

#create sample data
# Scaled-down stand-in for the real data (502,540 groups x 1,019 cols):
# ngrp groups of avgNr rows each, nc numeric columns with random NAs.
set.seed(0L)
ngrp <- 1000L #502540
avgNr <- 3L
nc <- 1000L #1019
DT <- data.table(
    as.data.table(matrix(sample(c(NA,1), ngrp*avgNr*nc, TRUE), nrow=ngrp*avgNr, ncol=nc)),
    grp=rep(1:ngrp, each=avgNr))
dim(DT)
#[1] 3000 1001

#use Rcpp function
# Each group's .SD is coerced to a matrix for the C++ scan; as.list()
# turns the returned vector back into one result column per variable.
system.time(DT[, as.list(firstNonNA(as.matrix(.SD))), by=grp])

timing output:

user  system elapsed 
5.59    0.08    5.63 

Unfortunately, I don't have enough RAM to test at the actual dimensions.

0
votes

To help those who might stray past this post in future, this is how I used the answer of @Cole to find the first non-missing value per variable for each grouping id:

## Character Vars ##
# Melt only the character columns (na.rm = TRUE drops the missing
# cells), keep the first remaining value per (id, variable) pair,
# then cast back to one row per id. TRUE is spelled out: T is an
# ordinary, reassignable variable in R.
vars_char <- names(dt)[vapply(dt, is.character, logical(1L))]
dt_char <- melt(dt,
                id.vars = "id",
                measure.vars = vars_char,
                na.rm = TRUE)
dt_char <- dt_char[, .SD[1], by = .(id, variable)]
dt_char <- dcast(dt_char, id ~ variable)

## Integer Vars ##
vars_int <- names(dt)[vapply(dt, is.integer, logical(1L))]
vars_int <- vars_int[vars_int != "id"] # id is the grouping key, not a value
dt_int <- melt(dt,
               id.vars = "id",
               measure.vars = vars_int,
               na.rm = TRUE)
dt_int <- dt_int[, .SD[1], by = .(id, variable)]
dt_int <- dcast(dt_int, id ~ variable)

## Double Vars ##
# NOTE: this step overwrites `dt` itself, so it must run after the
# character and integer extractions above.
vars_doub <- names(dt)[vapply(dt, is.double, logical(1L))]
dt <- melt(dt,
           id.vars = "id",
           measure.vars = vars_doub,
           na.rm = TRUE)
dt <- dt[, .SD[1], by = .(id, variable)]
dt <- dcast(dt, id ~ variable)

## Combine Variable Types ##
# Full outer join of the three collapsed tables on id.
dt <- Reduce(function(x, y){merge(x, y, by = "id", all = TRUE)}, list(dt_int, dt, dt_char))

The above is split in three to avoid memory issues associated with all values being coerced to character type. If this is not an issue for your data set then the following would work:

# Single-pass version: melt every value column at once. Only valid if
# coercing all values to a common type (character here) is acceptable.
# na.rm = TRUE (not the reassignable shorthand T) drops missing cells.
dt <- melt(dt,
           id.vars = "id",
           na.rm = TRUE)
dt <- dt[, .SD[1], by = .(id, variable)]
dt <- dcast(dt, id ~ variable)

For the initial example data set it takes significantly longer to run than any of the firstnonmiss() functions.

### Benchmarking ###
# Re-time the original variants against the reshape approach.
# NOTE(review): this assumes `dt` is still the ORIGINAL 16-row table;
# the reshaping code above overwrites `dt` with the collapsed result,
# so rebuild the dummy table before running this benchmark.
t <- microbenchmark(
  "which()[1]" = dt[, lapply(.SD, firstnonmiss_1), by = id],
  "first()" = dt[, lapply(.SD, firstnonmiss_2), by = id],
  "[1]" = dt[, lapply(.SD, firstnonmiss_3), by = id],
  # na.rm = TRUE spelled out; T is a reassignable variable, not a keyword.
  "reshape" = dcast(melt(dt, id.vars = "id", na.rm = TRUE)[, .SD[1], by = .(id, variable)], id ~ variable),
  times = 1e4
)
t
## Unit: microseconds
##        expr      min        lq      mean    median        uq      max neval
##  which()[1]  416.199  434.8970  497.6187  447.8205  471.3300 19577.46 10000
##     first()  400.774  421.4570  472.8580  434.2320  458.2420 31315.78 10000
##         [1]  389.710  408.6455  464.6562  421.2085  442.8305 17822.18 10000
##     reshape 2052.353 2120.1925 2400.9130 2178.8150 2285.6500 96451.59 10000

# Read the time unit from summary() instead of print(): both carry the
# same "unit" attribute, but print() also echoes the whole benchmark
# table to the console as a side effect.
units <- attr(summary(t), "unit")
autoplot(t) +
  labs(x = "Function", y = paste0("Timings, (", units, ")")) +
  scale_y_log10() +  # timings are right-skewed; scale_x_discrete() was a no-op and is dropped
  geom_violin(fill = "skyblue", alpha = 0.5) +
  theme_light() +
  theme(axis.text.y = element_text(family = "Monaco", angle = 90, hjust = 0.5))

Benchmarking

However, it runs much faster than the firstnonmiss() functions on very large data sets (60 seconds vs. 11 minutes).