2 votes

I have two data frames with different numbers of rows and columns:

data.frame(
  stringsAsFactors = FALSE,
              Date = c("01/01/2000", "01/01/2010", "01/01/2020"),
           Germany = c(5, 8, 9),
            France = c(4, NA, 7),
        Luxembourg = c(10, 6, 3)
) -> df1
data.frame(
  stringsAsFactors = FALSE,
              Date = c("01/01/1990", "01/01/2000", "01/01/2010", "01/01/2020"),
           Germany = c(1, 9, 7, NA),
            France = c(10, 3, 9, 6),
        Luxembourg = c(10, NA, NA, 7),
           Belgium = c(NA, 8, 1, 9)
) -> df2


I have to create a third data frame (df3) where:

  1. NA values in df1 are replaced with the values from df2 by matching Dates and column names (countries), and vice versa (NAs in df2 are replaced by df1).
  2. Where both data frames have a value, the value from df1 takes priority.
  3. Columns that appear in only one data frame (like Belgium here) are also included in df3.

df3 should look like this:

        Date Germany France Luxembourg Belgium
1 01/01/1990       1     10         10      NA
2 01/01/2000       5      4         10       8
3 01/01/2010       8      9          6       1
4 01/01/2020       9      7          3       9

Any help would be greatly appreciated.


5 Answers

4 votes

We can do a join on 'Date' and use fcoalesce to replace the NAs with the corresponding non-NA values. fcoalesce() keeps the first non-NA value, so df1's columns (the i. columns from the join) are passed first, giving df1's values priority.

library(data.table)
nm2 <- intersect(names(df2)[-1], names(df1)[-1])
df3 <- copy(df2)
setDT(df3)[df1, (nm2) := Map(fcoalesce, mget(paste0('i.', nm2)),
       mget(nm2)), on = .(Date)]

Output:

df3
#         Date Germany France Luxembourg Belgium
#1: 01/01/1990       1     10         10      NA
#2: 01/01/2000       5      4         10       8
#3: 01/01/2010       8      9          6       1
#4: 01/01/2020       9      7          3       9

Or this can be done with the tidyverse (again giving df1's values priority):

library(dplyr)
library(stringr)
left_join(df2, df1, by = 'Date') %>% 
   mutate(across(ends_with(".x"), 
    ~ coalesce(get(str_replace(cur_column(), "\\.x$", ".y")), .))) %>% 
   select(-ends_with('.y')) %>% 
   rename_with(~ str_remove(., "\\.x$"), ends_with('.x'))

3 votes

Here is another data.table option:

library(data.table)

cols <- setdiff(intersect(names(df1), names(df2)), "Date")
setDT(df1)[setDT(df2),
  on = "Date"
][
  ,
  c(cols) :=
    Map(
      fcoalesce,
      .SD[, cols, with = FALSE],
      .SD[, paste0("i.", cols), with = FALSE]
    )
][,
  .SD,
  .SDcols = patterns("^[^i]")
]

giving:

         Date Germany France Luxembourg Belgium
1: 01/01/1990       1     10         10      NA
2: 01/01/2000       5      4         10       8
3: 01/01/2010       8      9          6       1
4: 01/01/2020       9      7          3       9
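
The last [ , .SD, .SDcols = patterns("^[^i]")] step keeps only the columns whose names do not start with "i", i.e. it drops the i.-prefixed duplicates brought in by the join. An equivalent variation (my sketch, not part of the original answer; it assumes data.table is loaded and cols is defined as above) deletes those columns by reference instead:

out <- setDT(df1)[setDT(df2), on = "Date"]
out[, c(cols) := Map(fcoalesce, .SD[, cols, with = FALSE],
                     .SD[, paste0("i.", cols), with = FALSE])]
# drop the duplicated i.* columns by reference, then print with the trailing []
icols <- paste0("i.", cols)
out[, (icols) := NULL][]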

1 vote

Another tidyverse option: stack the two data frames with a priority flag (1 for df1, 2 for df2), sort by priority within each Date, and keep the first non-NA value per column, so df1 wins whenever both have a value.
library(tidyverse)
library(lubridate)

df1 <- tibble::tribble(
  ~Date, ~Germany, ~France, ~Luxembourg,
  "01/01/2000",        5,       4,          10,
  "01/01/2010",        8,      NA,           6,
  "01/01/2020",        9,       7,           3
)
df2 <- tibble::tribble(
  ~Date, ~Germany, ~France, ~Luxembourg, ~Belgium,
  "01/01/1990",        1,      10,          10,       NA,
  "01/01/2000",        9,       3,          NA,        8,
  "01/01/2010",        7,       9,          NA,        1,
  "01/01/2020",       NA,       6,           7,        9
)

bind_rows(df1 %>%
            mutate(priority = 1),
          df2 %>%
            mutate(priority = 2)) %>%
  mutate(Date = lubridate::dmy(Date)) %>%
  group_by(Date) %>%
  arrange(priority) %>%
  summarise(across(-priority, ~ first(na.omit(.))))
#> # A tibble: 4 x 5
#>   Date       Germany France Luxembourg Belgium
#>   <date>       <dbl>  <dbl>      <dbl>   <dbl>
#> 1 1990-01-01       1     10         10      NA
#> 2 2000-01-01       5      4         10       8
#> 3 2010-01-01       8      9          6       1
#> 4 2020-01-01       9      7          3       9

1 vote

An approach using only dplyr, with mutate(across(...)).

I also propose using full_join() instead of left_join() or right_join(), since full_join() keeps all rows from both df1 and df2, whereas a left or right join would drop rows present in only one of them (see the quick check at the end of this answer).

data.frame(
  stringsAsFactors = FALSE,
  Date = c("01/01/2000", "01/01/2010", "01/01/2020"),
  Germany = c(5, 8, 9),
  France = c(4, NA, 7),
  Luxembourg = c(10, 6, 3)
) -> df1
data.frame(
  stringsAsFactors = FALSE,
  Date = c("01/01/1990", "01/01/2000", "01/01/2010", "01/01/2020"),
  Germany = c(1, 9, 7, NA),
  France = c(10, 3, 9, 6),
  Luxembourg = c(10, NA, NA, 7),
  Belgium = c(NA, 8, 1, 9)
) -> df2

library(dplyr)


df1 %>% full_join(df2, by = 'Date', suffix = c('_x', '_y')) %>%
  mutate(across(ends_with('_x'), ~coalesce(., get(sub('_x', '_y', cur_column()))),
                .names = '{sub("_x", "", {.col})}')) %>%
  select(!ends_with('_x') & !ends_with('_y'))

#>         Date Belgium Germany France Luxembourg
#> 1 01/01/2000       8       5      4         10
#> 2 01/01/2010       1       8      9          6
#> 3 01/01/2020       9       9      7          3
#> 4 01/01/1990      NA       1     10         10

Created on 2021-05-18 by the reprex package (v2.0.0)
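
As a quick check of the full_join() point above (a small sketch using the df1/df2 defined in this answer, with dplyr loaded as above): df2 contains a Date (01/01/1990) that df1 does not, so the two joins return different numbers of rows.

# left_join keeps only df1's Dates; full_join keeps the union of the Dates
nrow(left_join(df1, df2, by = 'Date'))  # 3
nrow(full_join(df1, df2, by = 'Date'))  # 4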

1 vote

Base R solution:

# Store the column names of each data.frame in a list:
# cnames => list of character vectors
cnames <- list(names(df1), names(df2))

# Use the longer of the two name vectors (assumed to contain every column) as the column order of the result:
# required_vecs => character vector
required_vecs <- cnames[[which.max(lengths(cnames))]]

# Merge the data: full_data => data.frame
full_data <- merge(
  df1,
  df2,
  by = "Date",
  all = TRUE
)

# Columns present in both data frames (other than Date) that require coalescing:
# clsce_required_vecs => character vector
clsce_required_vecs <- setdiff(intersect(names(df1), names(df2)), c("Date"))

# Columns that do not require coalescing (Date plus columns unique to one data frame):
# nt_rqrd_vecs => character vector
nt_rqrd_vecs <- setdiff(unlist(cnames), clsce_required_vecs)

# Split-Apply-Combine data requiring coalescing: coalesced_data => data.frame
coalesced_data <- setNames(
  data.frame(
    do.call(
      cbind, 
      lapply(
        clsce_required_vecs, 
        function(x) {
          # Subset the data to only contain relevant vectors: sbst => data.frame
          sbst <- full_data[,grepl(x, names(full_data))]
          # Split each column (of the same data) into a data.frame in a list:
          # same_vecs => list of data.frames
          same_vecs <- split.default(sbst, seq_len(ncol(sbst)))
          # Coalesce the pair of columns: keep df1's (.x) values and
          # fill its NAs from df2's (.y) values => numeric vector
          Reduce(
            function(y, z){
              replace(y, is.na(y), z[is.na(y)])
            }, 
            do.call(cbind, same_vecs)
          )
        }
      )
    ), row.names = NULL), 
clsce_required_vecs)
    
# Column bind and order the columns:
res <- cbind(full_data[, nt_rqrd_vecs], coalesced_data)[,required_vecs]
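
The Reduce() call above is just a pairwise coalesce: replace() keeps the first column's values and fills its NAs from the second column. A minimal illustration (a hypothetical helper, not part of the answer):

# keep x where it is non-NA, fill its NAs from the matching positions of y
coalesce2 <- function(x, y) replace(x, is.na(x), y[is.na(x)])
coalesce2(c(5, NA, 9), c(1, 7, 2))
# [1] 5 7 9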