1
votes

I've got three data frames (Df1, Df2, Df3). These data frames have some variable in common, but they also each contain some unique variables. I'd like to make sure that all variables are represented in all data frames, eg material is present in Df2 but not Df1, so I'd like to create a variable named material in Df1 and set that variable to be NA. Thanks for any help.

Starting point (dfs):

Df1 <- data.frame("color"=c(1,1,1),"price"=c(1,1,1),"buyer"=c(1,1,1))
Df2 <- data.frame("color"=c(1,1,1),"material"=c(1,1,1),"size"=c(1,1,1))
Df3 <- data.frame("color"=c(1,1,1),"price"=c(1,1,1),"key"=c(1,1,1))

Desired outcome (dfs):

Df1 <- data.frame("color"=c(1,1,1),"price"=c(1,1,1),"material"=c(NA,NA,NA),"buyer"=c(1,1,1),"size"=c(NA,NA,NA),"key"=c(NA,NA,NA))
Df2 <- data.frame("color"=c(1,1,1),"price"=c(NA,NA,NA),"material"=c(1,1,1),"buyer"=c(NA,NA,NA),"size"=c(1,1,1),"key"=c(NA,NA,NA))
Df3 <- data.frame("color"=c(1,1,1),"price"=c(1,1,1),"material"=c(NA,NA,NA),"buyer"=c(NA,NA,NA),"size"=c(NA,NA,NA),"key"=c(1,1,1))

My code so far: (I'm trying to compare the variable names in an individual data frame with the variable names in all three data frames, and use the ones not present in the individual data frame to generate the new variables set to NA. But I end up with: Error in VarDf1[, NewVariables] <- NA :incorrect number of subscripts on matrix). Don't know how to fix it.

dfs <- list(Df1,Df2,Df3)  
numdfs <- length(dfs)
for (i in 1:numdfs) 
{
  VarDf1 <- as.vector(names(Df1)) 
  VarDf2 <- as.vector(names(Df2))
  VarDf3 <- as.vector(names(Df3))
  VarAll <- c(VarDf1, VarDf2,VarDf3)
  NewVariables <- as.vector(setdiff(VarAll, dfs[i]))  
  dfs[i][ , NewVariables] <- NA
}
4

4 Answers

3
votes

rbind.fill from the plyr package does what you expect while also combining everything into a big data.frame:

plyr::rbind.fill(Df1,Df2,Df3)
  color price buyer material size key
1     1     1     1       NA   NA  NA
2     1     1     1       NA   NA  NA
3     1     1     1       NA   NA  NA
4     1    NA    NA        1    1  NA
5     1    NA    NA        1    1  NA
6     1    NA    NA        1    1  NA
7     1     1    NA       NA   NA   1
8     1     1    NA       NA   NA   1
9     1     1    NA       NA   NA   1

You can subset the data back out in to new data.frames.

1
votes

This method is similar to rbind.fill, but it will let you separate it back into 3 data frames at the end.

We use tibble::lst rather than list so that the names of the list become 'Df1', 'Df2' and 'Df3'.

bind_rows does the same thing as rbind.fill however we can specify a .id column that links the row to its original data frame. Using this column, we can split this data frame into 3.

library('tidyverse')

lst(Df1, Df2, Df3) %>%
  bind_rows(.id = 'df_id') %>%
  split(.$df_id)
# $Df1
#   df_id color price buyer material size key
# 1   Df1     1     1     1       NA   NA  NA
# 2   Df1     1     1     1       NA   NA  NA
# 3   Df1     1     1     1       NA   NA  NA
# 
# $Df2
#   df_id color price buyer material size key
# 4   Df2     1    NA    NA        1    1  NA
# 5   Df2     1    NA    NA        1    1  NA
# 6   Df2     1    NA    NA        1    1  NA
# 
# $Df3
#   df_id color price buyer material size key
# 7   Df3     1     1    NA       NA   NA   1
# 8   Df3     1     1    NA       NA   NA   1
# 9   Df3     1     1    NA       NA   NA   1

The split can also be written like this if you prefer "tidy" functions.

lst(Df1, Df2, Df3) %>%
  bind_rows(.id = 'df_id') %>%
  group_by(df_id) %>%
  nest %>%
  deframe
0
votes

Here's an approach in base R

Get the column names in all data frames

cols = unique(unlist(lapply(list(Df1,Df2,Df3), FUN = colnames)))

add missing columns filled with NA

lapply(list(Df1,Df2,Df3), function(x){
  for (i in cols[!cols %in% colnames(x)]){
    x[[i]] = NA
  } 
  return(x)
}
)

#output
[[1]]
  color price buyer material size key
1     1     1     1       NA   NA  NA
2     1     1     1       NA   NA  NA
3     1     1     1       NA   NA  NA

[[2]]
  color material size price buyer key
1     1        1    1    NA    NA  NA
2     1        1    1    NA    NA  NA
3     1        1    1    NA    NA  NA

[[3]]
  color price key buyer material size
1     1     1   1    NA       NA   NA
2     1     1   1    NA       NA   NA
3     1     1   1    NA       NA   NA

data:

Df1 <- data.frame("color"=c(1,1,1),"price"=c(1,1,1),"buyer"=c(1,1,1))
Df2 <- data.frame("color"=c(1,1,1),"material"=c(1,1,1),"size"=c(1,1,1))
Df3 <- data.frame("color"=c(1,1,1),"price"=c(1,1,1),"key"=c(1,1,1))
0
votes

We can create a function, add_cols, and apply this function to all data frames.

# Create a list to store all data frames
Df_list <- list(Df1, Df2, Df3)

# Get the unique name of all data frame
Cols <- unique(unlist(lapply(Df_list, colnames)))

# Create a function to add columns
add_cols <- function(df, cols){
  new_col <- cols[!cols %in% colnames(df)]
  df[, new_col] <- NA
  return(df)
}

# Use lapply to apply the function
Df_list2 <- lapply(Df_list, add_cols, Cols)

# View the results
Df_list2
[[1]]
  color price buyer material size key
1     1     1     1       NA   NA  NA
2     1     1     1       NA   NA  NA
3     1     1     1       NA   NA  NA

[[2]]
  color material size price buyer key
1     1        1    1    NA    NA  NA
2     1        1    1    NA    NA  NA
3     1        1    1    NA    NA  NA

[[3]]
  color price key buyer material size
1     1     1   1    NA       NA   NA
2     1     1   1    NA       NA   NA
3     1     1   1    NA       NA   NA