2
votes

I have a large data.frame with ~100.000 rows and 12 columns (3 columns contain variables, 9 values/measurements) with a lot of NA in it, similar to these:

##Example data.frame
Var1 <- c(rep("N01", 9), rep("N02",9))  
Var2 <- c("a","a","a","b","b","b","c","c","c","a","a","a","b","b","b","c","c","c")  
Val1 <- c(NA,2,1,2,NA,1, NA,2,NA, 2,NA,NA,NA,2,NA,1,NA,2)  
Val2 <- c(2,NA,1,NA,2,NA,2,NA,2,NA,2,2,2,NA,2,NA,2,NA)  
data <- data.frame(Var1,Var2,Val1,Val2)  
data  

Var1 Var2 Val1 Val2
N01    a   NA    2
N01    a    2   NA
N01    a    1    1
N01    b    2   NA
N01    b   NA    2
N01    b    1   NA
N01    c   NA    2
N01    c    2   NA
N01    c   NA    2
N02    a    2   NA
N02    a   NA    2
N02    a   NA    2
N02    b   NA    2
N02    b    2   NA
N02    b   NA    2
N02    c    1   NA
N02    c   NA    2
N02    c    2   NA

Var1 and Var2 are variables (Var1 = PlotID, Var2 = SubplotID).
Val1 and Val2 are values(measurements) for each combination of Var1 and Var2.
Now I want to remove as much unnecessary NAs as possible, e.g. Val1 and Val2 both contain one NA for "N01 a", but in different rows. If the data.frame would look like below (just ordered manually), I could easily remove the rows with only NAs (for Val1 & Val2).

#Ordered manually
Val1 <- c(2,1,NA,2,1,NA,2,NA,NA,2,NA,NA,2,NA,NA,2,1,NA)
Val2 <- c(2,1,NA,2,NA,NA,2,2,NA,2,2,NA,2,2,NA,2,NA,NA)
data2 <- data.frame(Var1,Var2,Val1,Val2)

Var1 Var2 Val1 Val2
N01    a    2    2
N01    a    1    1
N01    a   NA   NA
N01    b    2    2
N01    b    1   NA
N01    b   NA   NA
N01    c    2    2
N01    c   NA    2
N01    c   NA   NA
N02    a    2    2
N02    a   NA    2
N02    a   NA   NA
N02    b    2    2
N02    b   NA    2
N02    b   NA   NA
N02    c    2    2
N02    c    1   NA
N02    c   NA   NA

Thus, in the end I want a data.frame that looks like this:

    Var1 Var2 Val1.s Val2.s
1   N01    a      1      1
2   N01    a      2      2
3   N01    b      1      2
4   N01    b      2     NA
5   N01    c      2      2
6   N01    c     NA      2
7   N02    a      2      2
8   N02    a     NA      2
9   N02    b      2      2
10  N02    b     NA      2
11  N02    c      1      2
12  N02    c      2     NA

I wrote a small loop, which just subsets the data.frame in all possible combinations of Var1 and Var2
than order Val1 and Val2 separately and remove the rows where Val1 and Val2 are only NAs.
It works, but it seems rather complicated and for a big data.frame it is not very fast. Has anyone an idea of a better way to get for the first data.frame to the last.
Thanks in advance

#Small loop
library(dplyr)
level.var1 <- unique(Var1)
level.var2 <- unique(Var2)
Res.list1 <- list()
Res.list2 <- list()

for(i in 1:length(level.var1)){
  df.1 <- dplyr::filter(data, Var1==level.var1[i])
  for(o in 1:length(level.var2)){
    df.2 <- dplyr::filter(df.1, Var2==level.var2[o])
    Val1.s <- sort(df.2$Val1, na.last=TRUE)
    Val2.s <- sort(df.2$Val2, na.last=TRUE)
    df.3 <- data.frame(df.2[,c(1:2)], Val1.s, Val2.s)
    row_to_del <- apply(df.3[,c(3,4)], 1, function(x) all(is.na(x)))
    df.4 <- df.3[!row_to_del,]
    Res.list1[[o]] <- df.4
  }
  df.5 <- do.call(rbind, Res.list1)
  Res.list2[[i]] <- df.5  
}

Res.final <- do.call(rbind, Res.list2) 
1

1 Answers

2
votes

I find data.table very straight forward for such operation. The following solution will work for any amount of desired columns once you specified the cols variable correctly

library(data.table)

## Define the columns you want to filter by
cols <- paste0("Val", 1:2)

## Sort the desired columns by group while sending the NAs to the end
setDT(data)[, (cols) := lapply(.SD, sort, na.last = TRUE), 
              .SDcols = cols, 
              by = .(Var1, Var2)]

## Define an index which will check which rows have NAs for all the columns
indx <- rowSums(is.na(data[, cols, with = FALSE])) < length(cols)

## A simple subset by condition
data[indx]

#     Var1 Var2 Val1 Val2
#  1:  N01    a    1    1
#  2:  N01    a    2    2
#  3:  N01    b    1    2
#  4:  N01    b    2   NA
#  5:  N01    c    2    2
#  6:  N01    c   NA    2
#  7:  N02    a    2    2
#  8:  N02    a   NA    2
#  9:  N02    b    2    2
# 10:  N02    b   NA    2
# 11:  N02    c    1    2
# 12:  N02    c    2   NA