Losing duplicate column names when flattening list-of-lists into dataframes in R - here is a related post from last year that we are building from. We are making an API call and receiving the following raw data, that we need to convert into a "clean" dataframe:
List of Lists returned by API Call
dput(raw_list)
list(list(id = 248013L, title = "Knockout Stage - Final", start = "2020-10-31T10:25:00Z",
end = "2020-10-31T14:17:00Z", postponed_from = NULL, deleted_at = NULL,
lifecycle = "over", tier = 1L, best_of = 5L, chain = list(),
streamed = TRUE, bracket_position = list(part = "UB", col = 0L,
offset = 1L), participants = list(list(seed = 1L, score = 1L,
forfeit = FALSE, roster = list(id = 75421L), winner = FALSE,
stats = NULL), list(seed = 2L, score = 3L, forfeit = FALSE,
roster = list(id = 67585L), winner = TRUE, stats = NULL)),
tournament = list(id = 5088L), substage = list(id = 26285L),
game = list(id = 2L), matches = list(list(id = 443031L),
list(id = 443032L), list(id = 443033L), list(id = 443034L),
list(id = 443035L)), casters = list(list(primary = TRUE,
caster = list(id = 47L)), list(primary = FALSE, caster = list(
id = 48L)), list(primary = FALSE, caster = list(id = 425L)),
list(primary = FALSE, caster = list(id = 449L)), list(
primary = FALSE, caster = list(id = 524L)), list(
primary = FALSE, caster = list(id = 1009L)), list(
primary = FALSE, caster = list(id = 1567L)), list(
primary = FALSE, caster = list(id = 1589L)), list(
primary = FALSE, caster = list(id = 1591L)), list(
primary = FALSE, caster = list(id = 3589L)), list(
primary = FALSE, caster = list(id = 3635L)), list(
primary = FALSE, caster = list(id = 4238L)), list(
primary = FALSE, caster = list(id = 5163L)), list(
primary = FALSE, caster = list(id = 5164L))), has_incident_report = FALSE),
list(id = 248014L, title = "Knockout Stage - Semifinal",
start = "2020-10-25T10:07:00Z", end = "2020-10-25T13:57:16Z",
postponed_from = NULL, deleted_at = NULL, lifecycle = "over",
tier = 1L, best_of = 5L, chain = list(), streamed = TRUE,
bracket_position = list(part = "UB", col = 1L, offset = 2L),
participants = list(list(seed = 1L, score = 1L, forfeit = FALSE,
roster = list(id = 70687L), winner = FALSE, stats = NULL),
list(seed = 2L, score = 3L, forfeit = FALSE, roster = list(
id = 75421L), winner = TRUE, stats = NULL)),
tournament = list(id = 5088L), substage = list(id = 26285L),
game = list(id = 2L), matches = list(list(id = 443021L),
list(id = 443022L), list(id = 443023L), list(id = 443024L),
list(id = 443025L)), casters = list(list(primary = TRUE,
caster = list(id = 47L)), list(primary = FALSE, caster = list(
id = 48L)), list(primary = FALSE, caster = list(id = 425L)),
list(primary = FALSE, caster = list(id = 449L)),
list(primary = FALSE, caster = list(id = 524L)),
list(primary = FALSE, caster = list(id = 1009L)),
list(primary = FALSE, caster = list(id = 1567L)),
list(primary = FALSE, caster = list(id = 1589L)),
list(primary = FALSE, caster = list(id = 1591L)),
list(primary = FALSE, caster = list(id = 3589L)),
list(primary = FALSE, caster = list(id = 3635L)),
list(primary = FALSE, caster = list(id = 4238L)),
list(primary = FALSE, caster = list(id = 5163L)),
list(primary = FALSE, caster = list(id = 5164L))),
has_incident_report = FALSE), list(id = 251494L, title = "Group B",
start = "2020-10-05T11:04:00Z", end = "2020-10-05T11:44:27Z",
postponed_from = NULL, deleted_at = NULL, lifecycle = "over",
tier = 1L, best_of = 1L, chain = list(), streamed = TRUE,
bracket_position = NULL, participants = list(list(seed = 1L,
score = 1L, forfeit = FALSE, roster = list(id = 61033L),
winner = TRUE, stats = NULL), list(seed = 2L, score = 0L,
forfeit = FALSE, roster = list(id = 60051L), winner = FALSE,
stats = NULL)), tournament = list(id = 5088L), substage = list(
id = 26282L), game = list(id = 2L), matches = list(
list(id = 449511L)), casters = list(list(primary = TRUE,
caster = list(id = 47L)), list(primary = FALSE, caster = list(
id = 48L)), list(primary = FALSE, caster = list(id = 425L)),
list(primary = FALSE, caster = list(id = 449L)),
list(primary = FALSE, caster = list(id = 524L)),
list(primary = FALSE, caster = list(id = 1009L)),
list(primary = FALSE, caster = list(id = 1567L)),
list(primary = FALSE, caster = list(id = 1589L)),
list(primary = FALSE, caster = list(id = 1591L)),
list(primary = FALSE, caster = list(id = 3589L)),
list(primary = FALSE, caster = list(id = 3635L)),
list(primary = FALSE, caster = list(id = 4238L)),
list(primary = FALSE, caster = list(id = 5163L)),
list(primary = FALSE, caster = list(id = 5164L))),
has_incident_report = FALSE))
Here is what we have currently. Our issue here is with the names generated from .name_repair = "unique"
:
output_df <- zed %>%
rrapply(., f = function(x) replace(x, is.null(x), NA)) %>%
purrr::map(unlist) %>%
purrr::map(t) %>%
{suppressMessages(purrr::map(., as_tibble, .name_repair = "unique"))} %>%
dplyr::bind_rows() %>%
{suppressMessages(readr::type_convert(.))} %>%
as.data.frame()
> colnames(output_df)
[1] "id" "title" "start" "end" "postponed_from"
[6] "deleted_at" "lifecycle" "tier" "best_of" "streamed"
[11] "bracket_position.part" "bracket_position.col" "bracket_position.offset" "participants.seed...14" "participants.score...15"
[16] "participants.forfeit...16" "participants.roster.id...17" "participants.winner...18" "participants.stats...19" "participants.seed...20"
[21] "participants.score...21" "participants.forfeit...22" "participants.roster.id...23" "participants.winner...24" "participants.stats...25"
[26] "tournament.id" "substage.id" "game.id" "matches.id...29" "matches.id...30"
[31] "matches.id...31" "matches.id...32" "matches.id...33" "casters.primary...34" "casters.caster.id...35"
[36] "casters.primary...36" "casters.caster.id...37" "casters.primary...38" "casters.caster.id...39" "casters.primary...40"
[41] "casters.caster.id...41" "casters.primary...42" "casters.caster.id...43" "casters.primary...44" "casters.caster.id...45"
[46] "casters.primary...46" "casters.caster.id...47" "casters.primary...48" "casters.caster.id...49" "casters.primary...50"
[51] "casters.caster.id...51" "casters.primary...52" "casters.caster.id...53" "casters.primary...54" "casters.caster.id...55"
[56] "casters.primary...56" "casters.caster.id...57" "casters.primary...58" "casters.caster.id...59" "casters.primary...60"
[61] "casters.caster.id...61" "has_incident_report" "bracket_position" "participants.seed...12" "participants.score...13"
[66] "participants.forfeit...14" "participants.roster.id...15" "participants.winner...16" "participants.stats...17" "participants.seed...18"
[71] "participants.score...19" "participants.forfeit...20" "participants.roster.id...21" "participants.winner...22" "participants.stats...23"
[76] "matches.id" "casters.primary...28" "casters.caster.id...29" "casters.primary...30" "casters.caster.id...31"
[81] "casters.primary...32" "casters.caster.id...33"
.name_repair = "unique"
is very helpful because without it, we would have many column names V20
, V21
, V30
, V32
, etc. However, the auto-generated names are still problematic.
> colnames(output_df)[grepl('matches.id', colnames(output_df))]
[1] "matches.id...29" "matches.id...30" "matches.id...31" "matches.id...32" "matches.id...33" "matches.id"
If would be preferable if these columns were named matches.id, matches.id.1, matches.id.2, ...
. We can attempt to change the column names after the matter, however we are looking for a solution that creates the correct column names in our current chain of rrapply() %>% map() %>% map() %>% map() %>% bind_rows(). Is this possible?
EDIT: So far we're doing something like this:
# fix messy / dupe columns with weird ...23 suffixes
match_cols <- colnames(output_df)[grepl('matches.id', colnames(output_df))]
forfeit_cols <- colnames(output_df)[grepl('participants.forfeit', colnames(output_df))]
score_cols <- colnames(output_df)[grepl('participants.score', colnames(output_df))]
winner_cols <- colnames(output_df)[grepl('participants.winner', colnames(output_df))]
seed_cols <- colnames(output_df)[grepl('participants.seed', colnames(output_df))]
output_df <- output_df %>%
tidyr::unite('match_ids', all_of(match_cols), remove = TRUE, na.rm = TRUE) %>%
tidyr::unite('is_forfeits', all_of(forfeit_cols), remove = TRUE, na.rm = TRUE) %>%
tidyr::unite('scores', all_of(score_cols), remove = TRUE, na.rm = TRUE) %>%
tidyr::unite('is_winners', all_of(winner_cols), remove = TRUE, na.rm = TRUE) %>%
tidyr::unite('seeds', all_of(seed_cols), remove = TRUE, na.rm = TRUE)
which collapses these duplicate columns together, so now we simply need to split them back up. Not ideal but seems like this will work.