"Error: Duplicate identifiers for rows" in a dataframe and some of its subsets, but not other subsets

Question

I've encountered a rather puzzling tidyr::spread() error. When I tried to run code (example below) in the full dataframe, I got the "Duplicate identifiers for rows" error.

I subset the (very large) dataframe to investigate and re-ran the code. This time it worked (see subset1df dput below). Then I tried it again with a different subset (subset2df), and I got the error message again. I honestly have no idea how to make sense of this and will greatly appreciate any help.

Reproducible code below:

subset1df:

structure(list(v1 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, 1L, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v2 = structure(c(1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, 1L, NA, NA), .Label = "2", class = "factor"), 
    v3 = structure(c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_), .Label = character(0), class = "factor"), 
    v4 = structure(c(NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v5 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, 
    NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v6 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, 1L), .Label = "2", class = "factor"), 
    v7 = structure(c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_), .Label = character(0), class = "factor"), 
    v8 = structure(c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_), .Label = character(0), class = "factor"), 
    v9 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    1L, NA, NA, NA, NA, NA), .Label = "1", class = "factor"), 
    v10 = structure(c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_), .Label = character(0), class = "factor"), 
    v11 = structure(c(NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, 
    NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v12 = structure(c(NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v13 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, 
    NA, NA, NA, NA, NA, NA), .Label = "1", class = "factor"), 
    v14 = structure(c(NA, NA, NA, NA, NA, NA, 2L, NA, NA, NA, 
    NA, NA, 1L, NA, NA, NA), .Label = c("1", "2"), class = "factor"), 
    v15 = structure(c(NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v16 = structure(c(NA, NA, 2L, NA, NA, 2L, NA, NA, NA, NA, 
    NA, NA, NA, NA, 1L, NA), .Label = c("1", "2"), class = "factor"), 
    respondentID = structure(c(7L, 7L, 7L, 5L, 6L, 6L, 4L, 4L, 
    4L, 3L, 3L, 3L, 2L, 2L, 2L, 1L), .Label = c("EO15", "EO17", 
    "EO19", "EO21", "Eo23", "EO23", "EO24"), class = "factor")), .Names = c("v1", 
"v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", 
"v12", "v13", "v14", "v15", "v16", "respondentID"), row.names = c(NA, 
-16L), class = "data.frame")

subset2df:

structure(list(v2 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, 1L, NA, NA, 1L, NA, 1L, NA, 1L, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v4 = structure(c(NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v5 = structure(c(NA, NA, NA, 2L, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA), .Label = c("1", 
    "2"), class = "factor"), v6 = structure(c(NA, 1L, NA, NA, 
    2L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA), .Label = c("1", "2"), class = "factor"), 
    v9 = structure(c(NA, NA, NA, NA, NA, 1L, NA, NA, NA, 1L, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v11 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, 
    NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "1", class = "factor"), 
    v12 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2L, NA), .Label = c("1", 
    "2"), class = "factor"), v13 = structure(c(NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, 
    NA, NA, 2L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, 2L, NA, NA, NA), .Label = c("1", "2"), class = "factor"), 
    v14 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v15 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    1L, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    v16 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, 1L, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Label = "2", class = "factor"), 
    respondentID = structure(c(21L, 20L, 20L, 19L, 18L, 18L, 
    1L, 1L, 16L, 16L, 16L, 10L, 10L, 17L, 15L, 15L, 15L, 14L, 
    14L, 14L, 13L, 12L, 12L, 11L, 11L, 11L, 8L, 9L, 9L, 6L, 7L, 
    7L, 3L, 2L, 3L, 4L, 4L, 4L, 5L), .Label = c("EO11", "Eo14", 
    "EO14", "EO16", "EO18", "EO26", "EO27", "Eo28", "EO28", "EO3", 
    "Eo30", "EO32", "EO331", "EO35", "EO37", "EO4", "EO41", "EO6", 
    "EO6 ", "EO7", "EO7 "), class = "factor")), .Names = c("v2", 
"v4", "v5", "v6", "v9", "v11", "v12", "v13", "v14", "v15", "v16", 
"respondentID"), row.names = c(NA, -39L), class = "data.frame")

combodf_id (needed to execute code):

structure(list(color1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("red", "ruby"
), class = "factor"), color2 = structure(c(1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("blue", 
"violet"), class = "factor"), color3 = structure(c(1L, 1L, 2L, 
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("green", 
"turqoise"), class = "factor"), color4 = structure(c(2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("black", 
"yellow"), class = "factor"), combo = c("v1", "v2", "v3", "v4", 
"v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", 
"v15", "v16")), class = "data.frame", .Names = c("color1", "color2", 
"color3", "color4", "combo"), row.names = c(NA, -16L))

code:

result_df <- subset1df %>%
  gather(key = combo, value = val, -respondentID) %>%
  filter(!is.na(val)) %>%
  left_join(combodf_id, by = "combo") %>%
  arrange(respondentID) %>% 
  rename(rose_color1 = color1, rose_color2 = color2,
         tulip_color1 = color3, tulip_color2 = color4) %>% 
  gather(color, value, rose_color1:tulip_color2) %>% 
  separate(color, into = c('flower', 'color')) %>% 
  spread(color, value) %>% 
  mutate(val = if_else(val == 1, 'rose', 'tulip')) %>% 
  mutate(val = if_else(val == flower, 1, 0)) %>% 
  select(respondentID, flower, color1, color2, choice = val)

The solution by @Tung below is very similar to the one in Spread with duplicate identifiers (using tidyverse and %>%) , but neither of them quite solves the problem.

Possible duplicate of Spread with duplicate identifiers (using tidyverse and %>%) — camille

CJ Yetman CJ Yetman · Accepted Answer · 2018-05-11T09:38:20

In your subset2df...

you have removed some of the response columns (e.g. v1, v3, v7, etc.)... that's probably why you have a bunch of rows/respondants without any response (all NA)
your respondentID column has levels with whitespace in some values that will probably mess things up later on (eg. "EO7" and "EO7 ")
there are duplicated rows, e.g. subset2df[c(15, 17), ]
all of your columns are factors... particularly with the response columns with integer values, I find that strange. tidyr functions gather and spread will take into account the levels of a factor, which can seem especially strange when you have subset a factor variable and not all of the levels are represented in the data.

You should probably fix those problems first, because they are what likely lead to your problems later, however... the reason you get the error "Duplicate identifiers for rows" is because the data frame you're passing to spread(color, value) has duplicate rows. You can force this to work by adding distinct() %>% one line before it, but be aware that the only reason you have to do that is because of the other problems before.

subset2df %>%
  as_tibble() %>% 
  mutate_at(vars(v2:v16), as.integer) %>% 
  gather(key = combo, value = val, -respondentID, na.rm = T) %>%
  filter(!is.na(val)) %>%
  left_join(combodf_id, by = "combo") %>%
  arrange(respondentID) %>% 
  rename(rose_color1 = color1, rose_color2 = color2,
         tulip_color1 = color3, tulip_color2 = color4) %>% 
  gather(color, value, rose_color1:tulip_color2) %>% 
  separate(color, into = c('flower', 'color')) %>% 
  distinct() %>% 
  spread(color, value) %>% 
  mutate(val = if_else(val == 1, 'rose', 'tulip')) %>% 
  mutate(val = if_else(val == flower, 1, 0)) %>% 
  select(respondentID, flower, color1, color2, choice = val)

I would strongly suggest fixing all the above problems first though, like so (notice you won't need the distinct command further down in the chain because you will have already applied that to the original data)...

subset2df %>%
  as_tibble() %>%  # tibble has better printing methods
  mutate_at(vars(-respondentID), as.integer) %>%  # convert response to numeric
  mutate(respondentID = as.character(respondentID)) %>%  # convert to char
  mutate(respondentID = trimws(respondentID)) %>%  # remove whitespace
  distinct() %>%  # remove duplicate rows
  gather(key = combo, value = val, -respondentID, na.rm = T) %>%
  left_join(combodf_id, by = "combo") %>%
  mutate_at(vars(color1:color4), as.character) %>%  # convert colors to char
  rename(rose_color1 = color1, rose_color2 = color2, 
         tulip_color1 = color3, tulip_color2 = color4) %>% 
  gather(color, value, rose_color1:tulip_color2) %>% 
  separate(color, into = c('flower', 'color')) %>% 
  spread(color, value) %>% 
  mutate(val = if_else(val == 1, 'rose', 'tulip')) %>% 
  mutate(val = if_else(val == flower, 1L, 0L)) %>% 
  select(respondentID, flower, color1, color2, choice = val)

"Error: Duplicate identifiers for rows" in a dataframe and some of its subsets, but not other subsets

1 Answers