1
votes

How do I apply a function after group_by using dplyr to remove those groups with 2 or more consecutive NAs? I have written a function that returns TRUE or FALSE depending on whether a column in a data frame contains 2 or more consecutive NAs:

# Determine whether a data frame column contains a run of consecutive NAs.
#
# Arguments:
#   df          A data frame (or list) holding the column to inspect.
#   consecutive Minimum number of NAs in a row that counts as a hit.
#   col         Name of the column to inspect. Defaults to "b", the column
#               this function was originally hard-wired to.
#
# Returns a single TRUE/FALSE: TRUE if `df[[col]]` contains at least
# `consecutive` NA values in a row. Stops with an error if `col` is not a
# column of `df`.
is.na.contiguous <- function(df, consecutive, col = "b") {
  # `[[` matches names exactly; `$` would partial-match on a data.frame
  # (e.g. silently use a column called "bb" when asked for "b"), and a
  # missing column would quietly yield FALSE instead of an error.
  if (!col %in% names(df)) {
    stop("column '", col, "' not found in `df`", call. = FALSE)
  }
  na.rle <- rle(is.na(df[[col]]))
  # A run qualifies only if it is a run of NAs AND it is long enough.
  any(na.rle$values & na.rle$lengths >= consecutive)
}

# example df
# Example data: `b` has two consecutive NAs (rows 4-5) and one isolated NA
# (row 7); `c` is the grouping column.
d <- structure(
  list(
    a = c(1, 2, 3, 4, 5, 6, 7, 8),
    b = c(1, 2, 2, NA, NA, 2, NA, 2),
    c = c(1, 1, 1, 2, 2, 2, 3, 3)
  ),
  class = "data.frame",
  row.names = c(NA, -8L)
)

d
  a  b c
1 1  1 1
2 2  2 1
3 3  2 1
4 4 NA 2
5 5 NA 2
6 6  2 2
7 7 NA 3
8 8  2 3

# test function
is.na.contiguous(d,2)
TRUE # column b has 2 consecutive NAs
is.na.contiguous(d,3)
FALSE # column b does not have 3 consecutive NAs

Now how do I apply this function to each group in the dataframe? Below is what I have tried:

# NOTE(review): `.` is the pipe's placeholder for the whole data frame piped in,
# so is.na.contiguous() sees every row rather than one group at a time; this
# matches the all-TRUE result shown below.
d %>% group_by(c) %>% mutate(consecNA = is.na.contiguous(.,2)) %>% as.data.frame()

  a  b c consecNA
1 1  1 1     TRUE
2 2  2 1     TRUE
3 3  2 1     TRUE
4 4 NA 2     TRUE
5 5 NA 2     TRUE
6 6  2 2     TRUE
7 7 NA 3     TRUE
8 8  2 3     TRUE

What am I doing wrong?

2
To add a column, d %>% group_by(c) %>% mutate(consecNA = any(is.na(b) & lag(is.na(b), default = FALSE))); to drop groups, d %>% group_by(c) %>% filter(!any(is.na(b) & lag(is.na(b), default = FALSE))) – alistaire
@TYL Can you show the expected output? Is it a logical column, or do you want to filter? – akrun
My ultimate aim is to filter, but I can't seem to find a way. So what I thought was to mutate a column of logical vector and then filter based on that column. – TYL

2 Answers

2
votes

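Instead of passing the entire data frame to is.na.contiguous, pass only the column values. Inside mutate(), `.` refers to the whole data frame that was piped in, not to the current group, which is why every row came out TRUE. With a vector argument the function is simple to apply per group, and it is also flexible enough to reuse on a different column.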

# Returns TRUE when the vector `x` contains at least `consecutive` NA values
# in a row, and FALSE otherwise.
is.na.contiguous <- function(x, consecutive) {
  runs <- rle(is.na(x))
  # Runs that are long enough, regardless of whether they are NA runs.
  long_enough <- runs[["lengths"]] >= consecutive
  # Only runs of NAs (values == TRUE) that are also long enough count.
  any(runs[["values"]] & long_enough)
}

library(dplyr)
# The check is evaluated once per group of `c`; filter() then keeps only the
# groups whose `b` has no run of 2 or more consecutive NAs.
d %>%
  group_by(c) %>%
  filter(!is.na.contiguous(b, 2))

#      a     b     c
#  <dbl> <dbl> <dbl>
#1     1     1     1
#2     2     2     1
#3     3     2     1
#4     7    NA     3
#5     8     2     3
1
votes

One option is to use rleid from data.table on the logical vector is.na(b) to label each run of consecutive NA or non-NA values, and then drop the runs that have two or more rows and consist entirely of NA.

library(data.table)
# Group rows by run of NA / non-NA values in `b` (rleid(is.na(b))). Within each
# run, .I gives the row numbers; they are kept unless the run has 2 or more
# rows (.N >= 2) and every value in it is NA.
i1 <- setDT(d)[, .I[!(.N >=2 & all(is.na(b)))], rleid(is.na(b))]$V1
# Subset the table to the surviving row numbers.
d[i1]
#.  a  b c
#1: 1  1 1
#2: 2  2 1
#3: 3  2 1
#4: 6  2 2
#5: 7 NA 3
#6: 8  2 3

Or if we need to also group by 'c'

# Same idea, but grouped by both the NA-run id and `c`: collect the row numbers
# of every (run, c) group containing fewer than 2 NAs, then subset by them.
setDT(d)[d[, .I[sum(is.na(b)) <2], .(grp = rleid(is.na(b)), c)]$V1]

or with tidyverse

library(dplyr)
# rleid() is the data.table function described in the text above, not a dplyr one.
# Group rows by run of NA / non-NA values in `b`, then drop every run that has
# 2 or more rows and is entirely NA.
d %>%
   group_by(grp = rleid(is.na(b))) %>%
   filter(!(n() >=2 & all(is.na(b))))
# A tibble: 6 x 4
# Groups:   grp [4]
#      a     b     c   grp
#  <dbl> <dbl> <dbl> <int>
#1     1     1     1     1
#2     2     2     1     1
#3     3     2     1     1
#4     6     2     2     3
#5     7    NA     3     4
#6     8     2     3     5

Another option is to count the NAs in each run (the sum of the logical vector) and keep the run only if that count is less than 2

# Group by `c` and by run of NA / non-NA values in `b`; keep a group only if it
# contains fewer than 2 NAs.
d %>%
    group_by(c, grp = rleid(is.na(b))) %>%
    filter(sum(is.na(b))<2)

If we are using the function from OP

# Check a vector for a run of NAs that is at least `consecutive` long.
# Returns a single logical value.
is.na.contiguous <- function(x, consecutive) {
  na_runs <- rle(is.na(x))
  is_na_run <- na_runs$values
  meets_length <- na_runs$lengths >= consecutive
  any(is_na_run & meets_length)
}

# For each group of `c`, flag whether `b` has 2 or more consecutive NAs; the
# single TRUE/FALSE per group is repeated on every row of that group.
d %>%
   group_by(c) %>%
   mutate(consecNA = is.na.contiguous(b, 2))
# A tibble: 8 x 4
# Groups:   c [3]
#      a     b     c consecNA
#  <dbl> <dbl> <dbl> <lgl>   
#1     1     1     1 FALSE   
#2     2     2     1 FALSE   
#3     3     2     1 FALSE   
#4     4    NA     2 TRUE    
#5     5    NA     2 TRUE    
#6     6     2     2 TRUE    
#7     7    NA     3 FALSE   
#8     8     2     3 FALSE