2
votes

I have two data tables:

  • before represents a data table in its "raw" state (prior to any cleaning operations).
  • after represents a data table after various cleaning and manipulation.

They have mostly matching column names.

Is it possible to construct a third data frame where columns with matching names are placed adjacent to one another and the names possibly modified (name.before, name.after) and all excess columns are placed at the end?

For example:

before data frame:

data.table::data.table(a = c(1,2,3), b = c(1,2,3), c = c(1,2,3))

   a b c
1: 1 1 1
2: 2 2 2
3: 3 3 3

after data frame:

data.table::data.table(a = c("a","b","c"), c = c("a","b","c"), d = c(1,2,3))

   a c d
1: a a 1
2: b b 2
3: c c 3

The desired output would be:

   a.before a.after c.before c.after d
1:        1       a        1       a 1
2:        2       b        2       b 2
3:        3       c        3       c 3

The purpose of this would be for easy comparison of identical columns to verify that column outputs are appropriate after various functions have been applied to the data.table.

3

3 Answers

2
votes

An option is to cbind and use setcolorder on the ordered column names concatenate and then use make.unique if the intention is to identify the before/after on the duplicate column names

library(data.table)
out <- setcolorder(cbind(dt1, dt2), order(c(names(dt1), names(dt2))))[]
setnames(out, make.unique(names(out)))[]
out[, setdiff(names(dt1), names(dt2)) := NULL][]
#   a.before a.after c.before c.after d
#1:        1       a        1       a 1
#2:        2       b        2       b 2
#3:        3       c        3       c 3

If we need to specifically use before/after

out <- setcolorder(cbind(dt1, dt2), order(c(names(dt1), names(dt2))))[]    
out[, setdiff(names(dt1), names(dt2)) := NULL][]
i1 <- duplicated(names(out), fromLast = TRUE)
i2 <- duplicated(names(out))
names(out)[i1] <- paste0(names(out)[i1], ".before")
names(out)[i2] <- paste0(names(out)[i2], ".after")   

out
#   a.before a.after c.before c.after d
#1:        1       a        1       a 1
#2:        2       b        2       b 2
#3:        3       c        3       c 3
2
votes

base R playground:


cols_after <- colnames(after)

cols_before <- colnames(before)

inter <- intersect(cols_after, cols_before)

in_after <- cols_after %in% inter

n_after <- paste0(cols_after[in_after], ".after")

colnames(after)[in_after] <- n_after

in_before <- cols_before %in% inter

n_before <- paste0(cols_before[in_before], ".before")

colnames(before)[in_before] <- n_before

# some merge procedure merge_df or simple cbind
merge_df <- cbind(after, before)

merge_df_names <- merge_df[, c(as.vector(t(data.frame(n_before ,n_after))), 
colnames(merge_df)[!(colnames(merge_df) %in% c(n_before, n_after))])]

# if merge_df is data.table we need with = FALSE
# merge_df[, c(as.vector(t(data.frame(n_before ,n_after))), colnames(merge_df)[!(colnames(merge_df) %in% c(n_before, n_after))]), with = FALSE]

merge_df_names

# to remove b column if needed
# merge_df_names <- merge_df_names[, setdiff(colnames(merge_df_names), "b")]
# if merge_df is data.table we need with = FALSE
1
votes

Updated Unaffected columns (b, d) are displayed last.

This tidyverse approach accepts data.table objects and also returns a data.table object:

library(tidyverse)

cols_to_rename <- intersect(colnames(before), colnames(after))

rename_cols <- function(data, suffix) 
  data %>% rename_with(~paste0(., suffix), all_of(cols_to_rename))

bind_cols(rename_cols(before, ".before"), rename_cols(after, ".after")) %>%
  select(starts_with(paste0(cols_to_rename, ".")), everything())

   a.before a.after c.before c.after b d
1:        1       a        1       a 1 1
2:        2       b        2       b 2 2
3:        3       c        3       c 3 3