0
votes

I have a data frame that consists of character vectors.

  col1 <- c('ab', 'bc', 'cd', 'de', 'ef', 'fg', 'gh', 'hj', 'jk', 'kl', 'lm', 'mn', 'no', 'op', 'pr', 'xxx')
  col2 <- c('ac', 'bd', 'ce', 'df', 'ef', 'fh', 'gj', 'hk', 'jl', 'km', 'ln', 'mo', 'np', 'or', 'ps', 'xyz')
  col3 <- c('abc', 'bd', 'cde', 'def', 'efg', 'fgh', 'ghj', 'hjk', 'jkl', 'klm', 'lmm', 'mno', 'nop', 'opr', 'prs', 'aaa')
  col4 <- c('abcd', 'bc', 'cdef', 'defg', 'ef', 'fghj', 'ghjk', 'hjkl', 'jklm', 'klmn', 'lmmo', 'mnop', 'nopr', 'oprs', 'prst', 'xxx')
  col5 <- c('abcdd', 'bd', 'cdeff', 'defgg', 'ef', 'fghjj', 'ghjkk', 'hjkll', 'jklmm', 'klmnn', 'lmmoo', 'mnopp', 'noprr', 'oprss', 'prstt', 'aaa')
  df <- cbind(col1,col2,col3,col4,col5)

What I am trying to do is finding out how many duplicates does an element have on the same row and where.

For example, the second row of the data frame has two bc & three bd elements.

Fifth row has 4 duplicate values on col1, col2, col4 & col5.

Again last row has 2*2 duplicate values; xxx on col1 & col4, aaa on col3 & col5.

The output that I would like to see is:

  col6<- c(0,3,0,0,4,0,0,0,0,0,0,0,0,0,0,2)
  col7<- c(0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2)
  col8<- c('NA', 'col2,col3,col5', 'NA', 'NA', 'col1,col2,col4,col5', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'col1,col4')  
  col9<- c('NA', 'col1,col4', 'NA', 'NA', 'col1,col2,col4,col5', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'col3,col5')  
  df2 <- cbind(df,col6,col7,col8,col9)

Is there any convenient way to achieve this?

2

2 Answers

0
votes

Using the Base R functions:

df <- data.frame(df)

a <- cbind(stack(df),grp = c(row(df)))
b <- do.call(data.frame,aggregate(ind~.,a, function(x)c(length(x),toString(x))))
d <- transform(subset(type.convert(b),ind.1 > 1), time = ave(grp,grp ,FUN=seq_along))
e <- reshape(d, idvar = "grp",dir="wide",drop = "values")
merge(cbind(grp = seq(nrow(df)),df), e, all.x = TRUE)

  grp col1 col2 col3 col4  col5 ind.1.1                ind.2.1 ind.1.2          ind.2.2
1    1   ab   ac  abc abcd abcdd      NA                   <NA>      NA             <NA>
2    2   bc   bd   bd   bc    bd       2             col1, col4       3 col2, col3, col5
3    3   cd   ce  cde cdef cdeff      NA                   <NA>      NA             <NA>
4    4   de   df  def defg defgg      NA                   <NA>      NA             <NA>
5    5   ef   ef  efg   ef    ef       4 col1, col2, col4, col5      NA             <NA>
6    6   fg   fh  fgh fghj fghjj      NA                   <NA>      NA             <NA>
7    7   gh   gj  ghj ghjk ghjkk      NA                   <NA>      NA             <NA>
8    8   hj   hk  hjk hjkl hjkll      NA                   <NA>      NA             <NA>
9    9   jk   jl  jkl jklm jklmm      NA                   <NA>      NA             <NA>
10  10   kl   km  klm klmn klmnn      NA                   <NA>      NA             <NA>
11  11   lm   ln  lmm lmmo lmmoo      NA                   <NA>      NA             <NA>
12  12   mn   mo  mno mnop mnopp      NA                   <NA>      NA             <NA>
13  13   no   np  nop nopr noprr      NA                   <NA>      NA             <NA>
14  14   op   or  opr oprs oprss      NA                   <NA>      NA             <NA>
15  15   pr   ps  prs prst prstt      NA                   <NA>      NA             <NA>
16  16  xxx  xyz  aaa  xxx   aaa       2             col3, col5       2       col1, col4

EDIT:

in Tidyverse:

library(tidyverse)
df1 <- cbind(id = seq(nrow(df)), df)
df1 %>%
  pivot_longer(-id)%>%
  group_by(id, value)%>%
  summarise(value = length(name), name = toString(name),.groups = "drop_last")%>%
  filter(value>1)%>%
  mutate(s = seq(n()))%>%
  pivot_wider(id,s, values_from = c('value',"name"))%>%
  right_join(df1)%>%
  arrange(id)%>%
  select(starts_with("col"),everything())

Joining, by = "id"
# A tibble: 16 x 10
# Groups:   id [16]
   col1  col2  col3  col4  col5     id value_1 value_2 name_1                 name_2          
   <chr> <chr> <chr> <chr> <chr> <int>   <int>   <int> <chr>                  <chr>           
 1 ab    ac    abc   abcd  abcdd     1      NA      NA NA                     NA              
 2 bc    bd    bd    bc    bd        2       2       3 col1, col4             col2, col3, col5
 3 cd    ce    cde   cdef  cdeff     3      NA      NA NA                     NA              
 4 de    df    def   defg  defgg     4      NA      NA NA                     NA              
 5 ef    ef    efg   ef    ef        5       4      NA col1, col2, col4, col5 NA              
 6 fg    fh    fgh   fghj  fghjj     6      NA      NA NA                     NA              
 7 gh    gj    ghj   ghjk  ghjkk     7      NA      NA NA                     NA              
 8 hj    hk    hjk   hjkl  hjkll     8      NA      NA NA                     NA              
 9 jk    jl    jkl   jklm  jklmm     9      NA      NA NA                     NA              
10 kl    km    klm   klmn  klmnn    10      NA      NA NA                     NA              
11 lm    ln    lmm   lmmo  lmmoo    11      NA      NA NA                     NA              
12 mn    mo    mno   mnop  mnopp    12      NA      NA NA                     NA              
13 no    np    nop   nopr  noprr    13      NA      NA NA                     NA              
14 op    or    opr   oprs  oprss    14      NA      NA NA                     NA              
15 pr    ps    prs   prst  prstt    15      NA      NA NA                     NA              
16 xxx   xyz   aaa   xxx   aaa      16       2       2 col3, col5             col1, col4  
0
votes

It is easier to deal with data frames than matrices:

col1 <- c('ab', 'bc', 'cd', 'de', 'ef', 'fg', 'gh', 'hj', 'jk', 'kl', 'lm', 'mn', 'no', 'op', 'pr', 'xxx')
col2 <- c('ac', 'bd', 'ce', 'df', 'ef', 'fh', 'gj', 'hk', 'jl', 'km', 'ln', 'mo', 'np', 'or', 'ps', 'xyz')
col3 <- c('abc', 'bd', 'cde', 'def', 'efg', 'fgh', 'ghj', 'hjk', 'jkl', 'klm', 'lmm', 'mno', 'nop', 'opr', 'prs', 'aaa')
col4 <- c('abcd', 'bc', 'cdef', 'defg', 'ef', 'fghj', 'ghjk', 'hjkl', 'jklm', 'klmn', 'lmmo', 'mnop', 'nopr', 'oprs', 'prst', 'xxx')
col5 <- c('abcdd', 'bd', 'cdeff', 'defgg', 'ef', 'fghjj', 'ghjkk', 'hjkll', 'jklmm', 'klmnn', 'lmmoo', 'mnopp', 'noprr', 'oprss', 'prstt', 'aaa')
df <- as.data.frame(cbind(col1,col2,col3,col4,col5))
df2 <- df

df2$col6 <- apply(df, 1, function(x) {
  sum(duplicated(x))}
  )

df2$col7 <- ifelse(df2$col6 == 0, 0, ncol(df) - df2$col6)

df2$col8 <- apply(df, 1, function(x) {
  n_dup <- sum(duplicated(x))
  if (n_dup == 0.){
    return("NA") 
  } else {
    return(paste0(names(df)[duplicated(x)]))
  }
}
)

df2$col9 <- apply(df, 1, function(x) {
  n_dup <- sum(duplicated(x))
  if (n_dup == 0.){
    return("NA") 
  } else {
    return(paste0(names(df)[!duplicated(x)]))
  }
}
)

Output

> df2
   col1 col2 col3 col4  col5 col6 col7             col8             col9
1    ab   ac  abc abcd abcdd    0    0               NA               NA
2    bc   bd   bd   bc    bd    3 2 col3, col4, col5       col1, col2
3    cd   ce  cde cdef cdeff    0    0               NA               NA
4    de   df  def defg defgg    0    0               NA               NA
5    ef   ef  efg   ef    ef    3 2 col2, col4, col5       col1, col3
6    fg   fh  fgh fghj fghjj    0    0               NA               NA
7    gh   gj  ghj ghjk ghjkk    0    0               NA               NA
8    hj   hk  hjk hjkl hjkll    0    0               NA               NA
9    jk   jl  jkl jklm jklmm    0    0               NA               NA
10   kl   km  klm klmn klmnn    0    0               NA               NA
11   lm   ln  lmm lmmo lmmoo    0    0               NA               NA
12   mn   mo  mno mnop mnopp    0    0               NA               NA
13   no   np  nop nopr noprr    0    0               NA               NA
14   op   or  opr oprs oprss    0    0               NA               NA
15   pr   ps  prs prst prstt    0    0               NA               NA
16  xxx  xyz  aaa  xxx   aaa    2  3       col4, col5 col1, col2, col3