I've encountered a rather puzzling tidyr::spread() error. When I tried to run code (example below) in the full dataframe, I got the "Duplicate identifiers for rows" error.
I subset the (very large) dataframe to investigate and re-ran the code. This time it worked (see subset1df dput below). Then I tried it again with a different subset (subset2df), and I got the error message again. I honestly have no idea how to make sense of this and will greatly appreciate any help.
Reproducible code below:
subset1df:
structure(list(v1 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 1L, NA, NA, NA, NA), .Label = "2", class = "factor"),
v2 = structure(c(1L, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 1L, NA, NA), .Label = "2", class = "factor"),
v3 = structure(c(NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_), .Label = character(0), class = "factor"),
v4 = structure(c(NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v5 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA,
NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v6 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 1L), .Label = "2", class = "factor"),
v7 = structure(c(NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_), .Label = character(0), class = "factor"),
v8 = structure(c(NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_), .Label = character(0), class = "factor"),
v9 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1L, NA, NA, NA, NA, NA), .Label = "1", class = "factor"),
v10 = structure(c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_), .Label = character(0), class = "factor"),
v11 = structure(c(NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA,
NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v12 = structure(c(NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v13 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L,
NA, NA, NA, NA, NA, NA), .Label = "1", class = "factor"),
v14 = structure(c(NA, NA, NA, NA, NA, NA, 2L, NA, NA, NA,
NA, NA, 1L, NA, NA, NA), .Label = c("1", "2"), class = "factor"),
v15 = structure(c(NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v16 = structure(c(NA, NA, 2L, NA, NA, 2L, NA, NA, NA, NA,
NA, NA, NA, NA, 1L, NA), .Label = c("1", "2"), class = "factor"),
respondentID = structure(c(7L, 7L, 7L, 5L, 6L, 6L, 4L, 4L,
4L, 3L, 3L, 3L, 2L, 2L, 2L, 1L), .Label = c("EO15", "EO17",
"EO19", "EO21", "Eo23", "EO23", "EO24"), class = "factor")), .Names = c("v1",
"v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
"v12", "v13", "v14", "v15", "v16", "respondentID"), row.names = c(NA,
-16L), class = "data.frame")
subset2df:
structure(list(v2 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 1L, NA, NA, 1L, NA, 1L, NA, 1L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v4 = structure(c(NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v5 = structure(c(NA, NA, NA, 2L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA), .Label = c("1",
"2"), class = "factor"), v6 = structure(c(NA, 1L, NA, NA,
2L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), .Label = c("1", "2"), class = "factor"),
v9 = structure(c(NA, NA, NA, NA, NA, 1L, NA, NA, NA, 1L,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v11 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA,
NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "1", class = "factor"),
v12 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2L, NA), .Label = c("1",
"2"), class = "factor"), v13 = structure(c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA,
NA, NA, 2L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 2L, NA, NA, NA), .Label = c("1", "2"), class = "factor"),
v14 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v15 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1L, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
v16 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, 1L, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"),
respondentID = structure(c(21L, 20L, 20L, 19L, 18L, 18L,
1L, 1L, 16L, 16L, 16L, 10L, 10L, 17L, 15L, 15L, 15L, 14L,
14L, 14L, 13L, 12L, 12L, 11L, 11L, 11L, 8L, 9L, 9L, 6L, 7L,
7L, 3L, 2L, 3L, 4L, 4L, 4L, 5L), .Label = c("EO11", "Eo14",
"EO14", "EO16", "EO18", "EO26", "EO27", "Eo28", "EO28", "EO3",
"Eo30", "EO32", "EO331", "EO35", "EO37", "EO4", "EO41", "EO6",
"EO6 ", "EO7", "EO7 "), class = "factor")), .Names = c("v2",
"v4", "v5", "v6", "v9", "v11", "v12", "v13", "v14", "v15", "v16",
"respondentID"), row.names = c(NA, -39L), class = "data.frame")
combodf_id (needed to execute code):
structure(list(color1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("red", "ruby"
), class = "factor"), color2 = structure(c(1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("blue",
"violet"), class = "factor"), color3 = structure(c(1L, 1L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("green",
"turqoise"), class = "factor"), color4 = structure(c(2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("black",
"yellow"), class = "factor"), combo = c("v1", "v2", "v3", "v4",
"v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
"v15", "v16")), class = "data.frame", .Names = c("color1", "color2",
"color3", "color4", "combo"), row.names = c(NA, -16L))
code:
result_df <- subset1df %>%
gather(key = combo, value = val, -respondentID) %>%
filter(!is.na(val)) %>%
left_join(combodf_id, by = "combo") %>%
arrange(respondentID) %>%
rename(rose_color1 = color1, rose_color2 = color2,
tulip_color1 = color3, tulip_color2 = color4) %>%
gather(color, value, rose_color1:tulip_color2) %>%
separate(color, into = c('flower', 'color')) %>%
spread(color, value) %>%
mutate(val = if_else(val == 1, 'rose', 'tulip')) %>%
mutate(val = if_else(val == flower, 1, 0)) %>%
select(respondentID, flower, color1, color2, choice = val)
The solution by @Tung below is very similar to the one in Spread with duplicate identifiers (using tidyverse and %>%) , but neither of them quite solves the problem.