4
votes

Hi I have two data frames as followed:

df1: 
ID   x    y     z     
1    a    b     c     
2    a    b     c
3    a    b     c 
4    a    b     c

and df2:

ID   x    y     
 2   d    NA
 3   NA   e

and I am after a result like this:

df1:

    ID   x    y     z     
    1    a    b     c     
    2    d    b     c
    3    a    e     c 
    4    a    b     c

I have been trying to use the match function as suggested by some other posts but I keep getting the issue where my df1 dataframe being replaced with NA values from df2. This is the code I have been using without luck

for (i in names(df2)[2:length(names(df2))]) {
  df1[i] <- df2[match(df1$ID, df2$ID)]
}

Thanks

3
This is an update join, which doesn't exist yet in dplyr, though you can get the effect by cleaning up after a normal join: df2 %>% full_join(df1, by = 'ID', suffix = c('', '.1')) %>% mutate(x = coalesce(x, x.1), y = coalesce(y, y.1)) %>% select(-x.1, -y.1) %>% arrange(ID). You could do the same in base R, if you like: df3 <- merge(df2, df1, by = 'ID', all = TRUE, suffixes = c('', '.1')); df3$x[is.na(df3$x)] <- df3$x.1[is.na(df3$x)]; df3$y[is.na(df3$y)] <- df3$y.1[is.na(df3$y)]; df3[c('x.1', 'y.1')] <- NULL; df3alistaire

3 Answers

0
votes

Your code didn't work for me so I change it a little but it works. If you are reading data from an external file use the stringAsFactor = FALSE when you read it so you don't run into problems.

df1 = data.frame("ID" = 1:4,"x" = rep("a",4), "y" =rep("b",4),"z" = rep("c",4),
                 stringsAsFactors=FALSE)
df2 = data.frame("ID" = 2:3,"x" = c("d",NA), "y" = c(NA,"e"),stringsAsFactors=FALSE)

for(i in 1:nrow(df2)){
  new_data = df2[i,-which(apply(df2[i,],2,is.na))]
  pos = as.numeric(new_data[1])
  col_replace = intersect(colnames(new_data),colnames(df1))
  df1[pos,col_replace] = new_data
}
0
votes

A solution using dplyr. The idea is to convert both data frames to long format, conduct join and replace the values, and convert the format back to wide format. df5 is the final output.

library(dplyr)
library(tidyr)

df3 <- df1 %>% gather(Col, Value, -ID) 
df4 <- df2 %>% gather(Col, Value, -ID, na.rm = TRUE)
df5 <- df3 %>%
  left_join(df4, by = c("ID", "Col")) %>%
  mutate(Value.x = ifelse(!is.na(Value.y), Value.y, Value.x)) %>%
  select(ID, Col, Value.x) %>%
  spread(Col, Value.x)
df5
#   ID x y z
# 1  1 a b c
# 2  2 d b c
# 3  3 a e c
# 4  4 a b c

DATA

df1 <- read.table(text = "ID   x    y     z     
1    a    b     c     
                  2    a    b     c
                  3    a    b     c 
                  4    a    b     c",
                  header = TRUE, stringsAsFactors = FALSE)

df2 <- read.table(text = "ID   x    y     
 2   d    NA
 3   NA   e",
                  header = TRUE, stringsAsFactors = FALSE)
0
votes

As mentioned by alistaire this is an update join. It is available with the data.table package:

library(data.table)
setDT(df1)
setDT(df2)
df1[df2, on = "ID", x := ifelse(is.na(i.x), x, i.x)]
df1[df2, on = "ID", y := ifelse(is.na(i.y), y, i.y)]
df1
   ID x y z
1:  1 a b c
2:  2 d b c
3:  3 a e c
4:  4 a b c

If there are many columns with replacement values, it might be worthwhile to follow www's suggestion to do the replacement after reshaping to long format where column names are treated as data:

library(data.table)
melt(setDT(df1), "ID")[
  melt(setDT(df2), "ID", na.rm = TRUE), on = .(ID, variable), value := i.value][
    , dcast(.SD, ID ~ variable)]
   ID x y z
1:  1 a b c
2:  2 d b c
3:  3 a e c
4:  4 a b c

Data

df1 <- fread(
"ID   x    y     z     
1    a    b     c     
2    a    b     c
3    a    b     c 
4    a    b     c")

df2 <- fread(
"ID   x    y     
 2   d    NA
 3   NA   e")