Combining dataframes with missing values

Question

I have several dataframes with data from the same survey. I want to combine them for analysis. The dataframes contain both unique variables and two variables (ID and Contest_no) that are shared across all the dataframes; the two shared variables contain information about the respondent and the contest number (1,2,3, as respondents were asked the same questions three times).

The difficulty is that the dataframes have missing values:

DF1 <- data.frame(V1 = factor(c("A", "B", "C", "D")),
             V2 = factor(c("A", "B", "C", "D")),
             ID = factor(c("x1", "x1", "y2", "y2")),
             Contest_no = factor(c("1", "2", "1", "2")))

DF2 <- data.frame(V3 = factor(c("A", "C", "D")),
                  V4 = factor(c("A", "C", "D")),
                  ID = factor(c("x1", "y2", "y2")),
                  Contest_no = factor(c("1", "1", "2")))

DF3 <- data.frame(V5 = factor(c("A", "B", "C")),
                  V6 = factor(c("A", "B", "C")),
                  ID = factor(c("x1", "x1", "y2")),
                  Contest_no = factor(c("1", "2", "1")))

As a result, respondent IDs and contest numbers aren't aligned. I want to match the data to respondent IDS and contest numbers so that the merged dataframe looks like this:

DF_merged <- data.frame(V1 = factor(c("A", "B", "C", "D")),
                    V2 = factor(c("A", "B", "C", "D")),
                    V3 = factor(c("A", NA, "C", "D")),
                    V4 = factor(c("A", NA, "C", "D")),
                    V5 = factor(c("A", "B", "C", NA)),
                    V6 = factor(c("A", "B", "C", NA)),
                    ID = factor(c("x1", "x1", "y2", "y2")),
                    Contest_no = factor(c("1", "2", "1", "2")))

I thought that full_join would do the trick, but DF_merged <- full_join(DF1, DF2, DF3, by="ID") gives me nonsensical results.

How can disparate data like this be combined?

New, updated example (to address the problem of multiplied rows). In this example there are no missing values at all, and both dataframes have the same number of rows, but the code results in multiplied rows. First, the two dataframes to be merged:

df1:

structure(list(ID = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L), .Label = c("EE1", "EE101", "EE102"), class = "factor"), 
    Contest_no = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 2L, 2L, 3L, 
    3L), Option = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"), 
    Chosen_option = c(0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
    0L, 1L), Combination = structure(c(5L, 5L, 6L, 6L, 4L, 4L, 
    2L, 2L, 1L, 1L, 3L, 3L), .Label = c("V133", "V181", "V234", 
    "V252", "V32", "V67"), class = "factor"), Attribute1 = structure(c(1L, 
    1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L), .Label = c("has strong ties to the government", 
    "has weak ties to the government"), class = "factor"), Attribute2 = structure(c(1L, 
    2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community", 
    "has weak ties to the local pastoralist community"), class = "factor"), 
    Attribute3 = structure(c(2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 
    2L, 1L, 1L, 2L), .Label = c("is poor", "is wealthy"), class = "factor"), 
    Attribute4 = structure(c(2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 
    1L, 2L, 2L, 2L), .Label = c("has attained a high level of formal education (for example university degree)", 
    "has not attained a high level of formal education (for example never went to school or only attended primary school)"
    ), class = "factor")), .Names = c("ID", "Contest_no", "Option", 
"Chosen_option", "Combination", "Attribute1", "Attribute2", "Attribute3", 
"Attribute4"), class = "data.frame", row.names = c(NA, -12L))

df2:

structure(list(ID = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 
3L, 3L, 3L, 3L), .Label = c("EE1", "EE101", "EE102"), class = "factor"), 
    Contest_no = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 2L, 2L, 3L, 
    3L), Option = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"), 
    Chosen_option = c(1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 
    0L, 1L), Combination = structure(c(6L, 6L, 4L, 4L, 1L, 1L, 
    3L, 3L, 5L, 5L, 2L, 2L), .Label = c("V150", "V249", "V252", 
    "V29", "V56", "V77"), class = "factor"), Attribute1 = structure(c(2L, 
    2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("has strong ties to the government", 
    "has weak ties to the government"), class = "factor"), Attribute2 = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community", 
    "has weak ties to the local pastoralist community"), class = "factor"), 
    Attribute3 = structure(c(2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 
    2L, 1L, 1L, 2L), .Label = c("is poor", "is wealthy"), class = "factor"), 
    Attribute4 = structure(c(2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 
    1L, 1L, 2L, 2L), .Label = c("has attained a high level of formal education (for example university degree)", 
    "has not attained a high level of formal education (for example never went to school or only attended primary school)"
    ), class = "factor")), .Names = c("ID", "Contest_no", "Option", 
"Chosen_option", "Combination", "Attribute1", "Attribute2", "Attribute3", 
"Attribute4"), class = "data.frame", row.names = c(NA, -12L))

and now the unsuccessful attempt to combine the two dataframes:

df_merge_attempt <- dplyr::full_join(df1, df2, by=c("ID","Contest_no"))

results in:

structure(list(ID = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L
), .Label = c("EE1", "EE101", "EE102"), class = "factor"), Contest_no = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 
2L, 2L, 2L, 3L, 3L, 3L, 3L), Option.x = structure(c(1L, 1L, 2L, 
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 
2L, 1L, 1L, 2L, 2L), .Label = c("Option1", "Option2"), class = "factor"), 
    Chosen_option.x = c(0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 
    1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L), 
    Combination.x = structure(c(5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 
    4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 
    3L), .Label = c("V133", "V181", "V234", "V252", "V32", "V67"
    ), class = "factor"), Attribute1.x = structure(c(1L, 1L, 
    1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 
    2L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("has strong ties to the government", 
    "has weak ties to the government"), class = "factor"), Attribute2.x = structure(c(1L, 
    1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 
    2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L), .Label = c("has strong ties to the local pastoralist community", 
    "has weak ties to the local pastoralist community"), class = "factor"), 
    Attribute3.x = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 
    1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
    2L), .Label = c("is poor", "is wealthy"), class = "factor"), 
    Attribute4.x = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
    2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
    2L), .Label = c("has attained a high level of formal education (for example university degree)", 
    "has not attained a high level of formal education (for example never went to school or only attended primary school)"
    ), class = "factor"), Option.y = structure(c(1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"), 
    Chosen_option.y = c(1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 
    1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L), 
    Combination.y = structure(c(6L, 6L, 6L, 6L, 4L, 4L, 4L, 4L, 
    1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 2L, 2L, 2L, 
    2L), .Label = c("V150", "V249", "V252", "V29", "V56", "V77"
    ), class = "factor"), Attribute1.y = structure(c(2L, 2L, 
    2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 2L, 1L, 2L, 1L), .Label = c("has strong ties to the government", 
    "has weak ties to the government"), class = "factor"), Attribute2.y = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community", 
    "has weak ties to the local pastoralist community"), class = "factor"), 
    Attribute3.y = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 
    2L), .Label = c("is poor", "is wealthy"), class = "factor"), 
    Attribute4.y = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
    1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 
    2L), .Label = c("has attained a high level of formal education (for example university degree)", 
    "has not attained a high level of formal education (for example never went to school or only attended primary school)"
    ), class = "factor")), class = "data.frame", row.names = c(NA, 
-24L), .Names = c("ID", "Contest_no", "Option.x", "Chosen_option.x", 
"Combination.x", "Attribute1.x", "Attribute2.x", "Attribute3.x", 
"Attribute4.x", "Option.y", "Chosen_option.y", "Combination.y", 
"Attribute1.y", "Attribute2.y", "Attribute3.y", "Attribute4.y"
))

All joins are done on 2 data frames. Try full_join(DF1, DF2) %>% full_join(DF3) — Gregor Thomas

MKR MKR · Accepted Answer · 2018-05-15T21:32:32

You can try dplyr::full_join with by=c("ID","Contest_no") argument as:

library(dplyr)
df1 <- full_join(DF1, DF2, by=c("ID","Contest_no")) %>%
       full_join(DF3, by=c("ID","Contest_no"))
df1
#  V1 V2   V3   V4   V5   V6 ID Contest_no
#1  A  A    A    A    A    A x1          1
#2  B  B <NA> <NA>    B    B x1          2
#3  C  C    C    C    C    C y2          1
#4  D  D    D    D <NA> <NA> y2          2

Updated: Answer has been modified to consider another column Option in full_join as:

df1 <- full_join(DF1, DF2, by=c("ID","Contest_no", "Option"))

Note: I had to tweak my dplyr to match what is suggested by @Gregor in order to get expected result.

Combining dataframes with missing values

1 Answers