In R, generate proper column names when flattening nested list of lists

Question

Losing duplicate column names when flattening list-of-lists into dataframes in R - here is a related post from last year that we are building from. We are making an API call and receiving the following raw data, that we need to convert into a "clean" dataframe:

List of Lists returned by API Call

dput(raw_list)

list(list(id = 248013L, title = "Knockout Stage  - Final", start = "2020-10-31T10:25:00Z", 
    end = "2020-10-31T14:17:00Z", postponed_from = NULL, deleted_at = NULL, 
    lifecycle = "over", tier = 1L, best_of = 5L, chain = list(), 
    streamed = TRUE, bracket_position = list(part = "UB", col = 0L, 
        offset = 1L), participants = list(list(seed = 1L, score = 1L, 
        forfeit = FALSE, roster = list(id = 75421L), winner = FALSE, 
        stats = NULL), list(seed = 2L, score = 3L, forfeit = FALSE, 
        roster = list(id = 67585L), winner = TRUE, stats = NULL)), 
    tournament = list(id = 5088L), substage = list(id = 26285L), 
    game = list(id = 2L), matches = list(list(id = 443031L), 
        list(id = 443032L), list(id = 443033L), list(id = 443034L), 
        list(id = 443035L)), casters = list(list(primary = TRUE, 
        caster = list(id = 47L)), list(primary = FALSE, caster = list(
        id = 48L)), list(primary = FALSE, caster = list(id = 425L)), 
        list(primary = FALSE, caster = list(id = 449L)), list(
            primary = FALSE, caster = list(id = 524L)), list(
            primary = FALSE, caster = list(id = 1009L)), list(
            primary = FALSE, caster = list(id = 1567L)), list(
            primary = FALSE, caster = list(id = 1589L)), list(
            primary = FALSE, caster = list(id = 1591L)), list(
            primary = FALSE, caster = list(id = 3589L)), list(
            primary = FALSE, caster = list(id = 3635L)), list(
            primary = FALSE, caster = list(id = 4238L)), list(
            primary = FALSE, caster = list(id = 5163L)), list(
            primary = FALSE, caster = list(id = 5164L))), has_incident_report = FALSE), 
    list(id = 248014L, title = "Knockout Stage  - Semifinal", 
        start = "2020-10-25T10:07:00Z", end = "2020-10-25T13:57:16Z", 
        postponed_from = NULL, deleted_at = NULL, lifecycle = "over", 
        tier = 1L, best_of = 5L, chain = list(), streamed = TRUE, 
        bracket_position = list(part = "UB", col = 1L, offset = 2L), 
        participants = list(list(seed = 1L, score = 1L, forfeit = FALSE, 
            roster = list(id = 70687L), winner = FALSE, stats = NULL), 
            list(seed = 2L, score = 3L, forfeit = FALSE, roster = list(
                id = 75421L), winner = TRUE, stats = NULL)), 
        tournament = list(id = 5088L), substage = list(id = 26285L), 
        game = list(id = 2L), matches = list(list(id = 443021L), 
            list(id = 443022L), list(id = 443023L), list(id = 443024L), 
            list(id = 443025L)), casters = list(list(primary = TRUE, 
            caster = list(id = 47L)), list(primary = FALSE, caster = list(
            id = 48L)), list(primary = FALSE, caster = list(id = 425L)), 
            list(primary = FALSE, caster = list(id = 449L)), 
            list(primary = FALSE, caster = list(id = 524L)), 
            list(primary = FALSE, caster = list(id = 1009L)), 
            list(primary = FALSE, caster = list(id = 1567L)), 
            list(primary = FALSE, caster = list(id = 1589L)), 
            list(primary = FALSE, caster = list(id = 1591L)), 
            list(primary = FALSE, caster = list(id = 3589L)), 
            list(primary = FALSE, caster = list(id = 3635L)), 
            list(primary = FALSE, caster = list(id = 4238L)), 
            list(primary = FALSE, caster = list(id = 5163L)), 
            list(primary = FALSE, caster = list(id = 5164L))), 
        has_incident_report = FALSE), list(id = 251494L, title = "Group B", 
        start = "2020-10-05T11:04:00Z", end = "2020-10-05T11:44:27Z", 
        postponed_from = NULL, deleted_at = NULL, lifecycle = "over", 
        tier = 1L, best_of = 1L, chain = list(), streamed = TRUE, 
        bracket_position = NULL, participants = list(list(seed = 1L, 
            score = 1L, forfeit = FALSE, roster = list(id = 61033L), 
            winner = TRUE, stats = NULL), list(seed = 2L, score = 0L, 
            forfeit = FALSE, roster = list(id = 60051L), winner = FALSE, 
            stats = NULL)), tournament = list(id = 5088L), substage = list(
            id = 26282L), game = list(id = 2L), matches = list(
            list(id = 449511L)), casters = list(list(primary = TRUE, 
            caster = list(id = 47L)), list(primary = FALSE, caster = list(
            id = 48L)), list(primary = FALSE, caster = list(id = 425L)), 
            list(primary = FALSE, caster = list(id = 449L)), 
            list(primary = FALSE, caster = list(id = 524L)), 
            list(primary = FALSE, caster = list(id = 1009L)), 
            list(primary = FALSE, caster = list(id = 1567L)), 
            list(primary = FALSE, caster = list(id = 1589L)), 
            list(primary = FALSE, caster = list(id = 1591L)), 
            list(primary = FALSE, caster = list(id = 3589L)), 
            list(primary = FALSE, caster = list(id = 3635L)), 
            list(primary = FALSE, caster = list(id = 4238L)), 
            list(primary = FALSE, caster = list(id = 5163L)), 
            list(primary = FALSE, caster = list(id = 5164L))), 
        has_incident_report = FALSE))

Here is what we have currently. Our issue here is with the names generated from .name_repair = "unique":

output_df <- zed %>%
      rrapply(., f = function(x) replace(x, is.null(x), NA)) %>%
      purrr::map(unlist) %>% 
      purrr::map(t) %>% 
      {suppressMessages(purrr::map(., as_tibble, .name_repair = "unique"))} %>%
      dplyr::bind_rows() %>%
      {suppressMessages(readr::type_convert(.))} %>%
      as.data.frame()

> colnames(output_df)
 [1] "id"                          "title"                       "start"                       "end"                         "postponed_from"             
 [6] "deleted_at"                  "lifecycle"                   "tier"                        "best_of"                     "streamed"                   
[11] "bracket_position.part"       "bracket_position.col"        "bracket_position.offset"     "participants.seed...14"      "participants.score...15"    
[16] "participants.forfeit...16"   "participants.roster.id...17" "participants.winner...18"    "participants.stats...19"     "participants.seed...20"     
[21] "participants.score...21"     "participants.forfeit...22"   "participants.roster.id...23" "participants.winner...24"    "participants.stats...25"    
[26] "tournament.id"               "substage.id"                 "game.id"                     "matches.id...29"             "matches.id...30"            
[31] "matches.id...31"             "matches.id...32"             "matches.id...33"             "casters.primary...34"        "casters.caster.id...35"     
[36] "casters.primary...36"        "casters.caster.id...37"      "casters.primary...38"        "casters.caster.id...39"      "casters.primary...40"       
[41] "casters.caster.id...41"      "casters.primary...42"        "casters.caster.id...43"      "casters.primary...44"        "casters.caster.id...45"     
[46] "casters.primary...46"        "casters.caster.id...47"      "casters.primary...48"        "casters.caster.id...49"      "casters.primary...50"       
[51] "casters.caster.id...51"      "casters.primary...52"        "casters.caster.id...53"      "casters.primary...54"        "casters.caster.id...55"     
[56] "casters.primary...56"        "casters.caster.id...57"      "casters.primary...58"        "casters.caster.id...59"      "casters.primary...60"       
[61] "casters.caster.id...61"      "has_incident_report"         "bracket_position"            "participants.seed...12"      "participants.score...13"    
[66] "participants.forfeit...14"   "participants.roster.id...15" "participants.winner...16"    "participants.stats...17"     "participants.seed...18"     
[71] "participants.score...19"     "participants.forfeit...20"   "participants.roster.id...21" "participants.winner...22"    "participants.stats...23"    
[76] "matches.id"                  "casters.primary...28"        "casters.caster.id...29"      "casters.primary...30"        "casters.caster.id...31"     
[81] "casters.primary...32"        "casters.caster.id...33"

.name_repair = "unique" is very helpful because without it, we would have many column names V20, V21, V30, V32, etc. However, the auto-generated names are still problematic.

> colnames(output_df)[grepl('matches.id', colnames(output_df))]
[1] "matches.id...29" "matches.id...30" "matches.id...31" "matches.id...32" "matches.id...33" "matches.id"

If would be preferable if these columns were named matches.id, matches.id.1, matches.id.2, .... We can attempt to change the column names after the matter, however we are looking for a solution that creates the correct column names in our current chain of rrapply() %>% map() %>% map() %>% map() %>% bind_rows(). Is this possible?

EDIT: So far we're doing something like this:

# fix messy / dupe columns with weird ...23 suffixes
    match_cols <- colnames(output_df)[grepl('matches.id', colnames(output_df))]
    forfeit_cols <- colnames(output_df)[grepl('participants.forfeit', colnames(output_df))]
    score_cols <- colnames(output_df)[grepl('participants.score', colnames(output_df))]
    winner_cols <- colnames(output_df)[grepl('participants.winner', colnames(output_df))]
    seed_cols <- colnames(output_df)[grepl('participants.seed', colnames(output_df))]
    
    output_df <- output_df %>% 
      tidyr::unite('match_ids', all_of(match_cols), remove = TRUE, na.rm = TRUE) %>%
      tidyr::unite('is_forfeits', all_of(forfeit_cols), remove = TRUE, na.rm = TRUE) %>%
      tidyr::unite('scores', all_of(score_cols), remove = TRUE, na.rm = TRUE) %>%
      tidyr::unite('is_winners', all_of(winner_cols), remove = TRUE, na.rm = TRUE) %>%
      tidyr::unite('seeds', all_of(seed_cols), remove = TRUE, na.rm = TRUE)

which collapses these duplicate columns together, so now we simply need to split them back up. Not ideal but seems like this will work.

What are you trying to do? How does your expected output look like? — Martin Gal

Martin Gal Martin Gal · Accepted Answer · 2021-08-19T18:16:58

I'm still unsure what you are trying to do, but you could clean up your pipe a little bit:

zed %>%
  rrapply::rrapply(f = function(x) replace(x, is.null(x), NA)) %>% 
  purrr::map_df(
    ~ unlist(.x) %>% 
      t() %>% 
      as_tibble(.name_repair = "unique")
  ) %>%
  readr::type_convert() %>% 
  tidyr::unite('match_ids', starts_with('matches.id'), na.rm = TRUE) %>%
  tidyr::unite('is_forfeits', starts_with('participants.forfeit'), na.rm = TRUE) %>%
  tidyr::unite('scores', starts_with('participants.score'), na.rm = TRUE) %>%
  tidyr::unite('is_winners', starts_with('participants.winner'), na.rm = TRUE) %>%
  tidyr::unite('seeds', starts_with('participants.seed'), na.rm = TRUE) %>% 
  suppressMessages()

minus the as.data.frame()-part which I refuse to use. This should create the same output without leaving the pipe and without needing auxiliary vectors like match_cols.

In R, generate proper column names when flattening nested list of lists

1 Answers