14
votes

I have been busy with this question since last night and I could not figure out how to do it.

What I want to do is to match df1 strings to df2 strings and get the similar ones out

what I do is like this

# a function to arrange the data to have IDs for each string 
    normalize <- function(x, delim) {
      x <- gsub(")", "", x, fixed=TRUE)
      x <- gsub("(", "", x, fixed=TRUE)
      idx <- rep(seq_len(length(x)), times=nchar(gsub(sprintf("[^%s]",delim), "", as.character(x)))+1)
      names <- unlist(strsplit(as.character(x), delim))
      return(setNames(idx, names))
    }

# a function to arrange the second df  
lookup <- normalize(df2[,1], ",")

# a function to match them and give the IDs
process <- function(s) {
  lookup_try <- lookup[names(s)]
  found <- which(!is.na(lookup_try))
  pos <- lookup_try[names(s)[found]]
  return(paste(s[found], pos, sep="-"))
  #change the last line to "return(as.character(pos))" to get only the result as in the comment
}

then I get the results like this

res <- lapply(colnames(df1), function(x) process(normalize(df1[,x], ";")))

This gives me the row number of each string from df1 and row number of string from df2 that matched. so the output of this data looks like this

> res
$s1
[1] "3-4" "4-1" "5-4"

$s2
[1] "2-4"  "3-15" "7-16"

The first column IDs is the row number of df2 which matched with strings in df1 The second column No is the number of times it matched The third column ID-col-n is the row number of string in df1 which matched with that string + their column name the forth is string from first column of the df1 which matched with that string the fifth column is the string of second column which matched with that string and so on

2
I assume your recent questions are somehow related to this. Wouldn't that solve the problem?akrun
Looks like it needs some time to spend. I am right now busy with a project to deliver.akrun
I really would like to help you. But, I am very busy with a project. So, I am only answering questions that take less time.akrun
In normalize you can reduce the two regex functions to simply gsub("[()]", "", x)Pierre L
In your example, P41182 is in the first three rows of df2, and in the 4th row, s1 column of df1, so shouldn't there be a line with 2 1 4s1 P41182 - and 2 1 4s1 P41182 - in your results?NicE

2 Answers

6
votes

In this case I find it easier to switch the data to the wide format and before merging it to the lookup table.

You could try:

library(tidyr)
library(dplyr)
df1_tmp <- df1
df2_tmp <- df2
#add numerical id to df1_tmp to keep row information
df1_tmp$id <- seq_along(df1_tmp[,1])

#switch to wide and unnest rows with several strings
df1_tmp <- gather(df1_tmp,key="s_val",value="query_string",-id)
df1_tmp <- df1_tmp %>% 
        mutate(query_string = strsplit(as.character(query_string), ";")) %>% 
        unnest(query_string)


df2_tmp$IDs. <- gsub("[()]", "", df2_tmp$IDs.)

#add numerical id to df1_tmp to keep row information
df2_tmp$id <- seq_along(df2_tmp$IDs.)

#unnest rows with several strings
df2_tmp <- df2_tmp %>% 
        mutate(IDs. = strsplit(as.character(IDs.), ",")) %>% 
        unnest(IDs.)

res <- merge(df1_tmp,df2_tmp,by.x="query_string",by.y="IDs.")

res$ID_col_n <- paste(paste0(res$id.x,res$s_val))
res$total_id <- 1:nrow(res)
res <- spread(res,s_val,value=query_string,fill=NA)
res
#summarize to get required output 

res <- res %>% group_by(id.y) %>%
        mutate(No=n())  %>% group_by(id.y,No) %>%
        summarise_each(funs(paste(.[!is.na(.)],collapse=","))) %>% 
        select(-id.x,-total_id)

colnames(res)[colnames(res)=="id.y"]<-"IDs"

res$df1_colMatch_counts <- rowSums(res[,-(1:3)]!="")
df2_counts <- df2_tmp %>% group_by(id) %>% summarize(df2_string_counts=n())
res <- merge(res,df2_counts,by.x="IDs",by.y="id")
res


res

  IDs No    ID_col_n            s1     s2 df1_colMatch_counts df2_string_counts
1   1  1         4s1        P41182                          1                 2
2   2  1         4s1        P41182                          1                 2
3   3  1         4s1        P41182                          1                 2
4   4  3 2s2,3s1,5s1 Q9Y6Q9,Q09472 Q92831                   2                 4
5  15  1         3s2               P54612                   1                 5
6  16  1         7s2               O15143                   1                 7
2
votes

For this solution I use a general-purpose helper function meant to work around the limitation of rbind() that it can't handle inconsistent column names. You could probably use any function from Combine two data frames by rows (rbind) when they have different sets of columns, but I also wrote my own:

rbind.cn <- function(...,filler=NA) {
    ## note: must explicitly set proper S3 class; otherwise, filler would corrupt the column type in cases where a column is missing from the first df
    ## for example, logical NA (the default for filler) would nix factors, resulting in character after type promotion
    ## to do this, will use single-row temp df as first argument to rbind()
    ## note: tried zero-row to prevent need to excise afterward, but zero-row rbind() arguments are ignored for typing purposes
    l <- list(...);
    schema <- do.call(cbind,unname(lapply(l,function(df) df[1L,,drop=F]))); ## unname() is necessary, otherwise cbind() tries to be a good citizen and concats first df cell value found in lapply() names onto schema column names
    schema <- schema[unique(names(schema))];
    res <- do.call(rbind,c(list(schema),lapply(l,function(df) {
        cns.add <- names(schema)[!names(schema)%in%names(df)];
        do.call(cbind,c(
            list(df),
            setNames(rep(filler,length(cns.add)),cns.add),
            stringsAsFactors=F
        ));
    })))[-1L,,drop=F];
    ## fix up row names
    rns <- do.call(c,lapply(l,rownames));
    rownames(res) <- ifelse(grepl('^[0-9]+$',rns),seq_along(rns),rns);
    res;
};

I also wrote my own normalization function. I think you were on the right track by precomputing a normalized representation of both input data.frames, but because you used named vector indexing to match the ids, you didn't catch cases of duplicate names in df2, which is why your result is missing the additional occurrences of id P41182. Here's my normalization function:

## normalization function
## for each column, splits on sep and captures the id, row index, column index, and column name in a data.frame
normalize <- function(df,sep) {
    do.call(rbind,lapply(seq_along(df),function(ci) {
        l <- strsplit(gsub('[()]','',df[[ci]]),sep);
        cbind(
            do.call(rbind,lapply(seq_along(l),function(ri)
                if (length(l[[ri]]) > 0L)
                    data.frame(id=l[[ri]],ri=ri,stringsAsFactors=F)
            )),
            ci,
            cn=names(df)[ci]
        );
    }));
};

Here's the complete solution:

## normalize both data.frames
df1.norm <- normalize(df1,';');
df2.norm <- normalize(df2,',');

## join them on matching ids
df.match <- merge(df1.norm,df2.norm,'id',suffixes=c('.1','.2'));
df.match <- df.match[with(df.match,order(ri.2,cn.1,ri.1)),]; ## order by df2 row index, df1 column name, and finally df1 row index, as per required output

## aggregate and format as required
res <- do.call(rbind.cn,c(by(df.match,df.match$ri.2,function(x) {
    strCols <- aggregate(id~cn.1,x[c('id','cn.1')],paste,collapse=','); ## conveniently, automatically orders by the grouping column cn.1
    do.call(cbind,c(
        list(data.frame(IDs=x$ri.2[1L],No=nrow(x),`ID-col-n`=paste0(x$ri.1,x$cn.1,collapse=','),stringsAsFactors=F,check.names=F)),
        setNames(strCols$id,paste0('string-df1-',strCols$cn.1)),
        stringsAsFactors=F
    ));
}),filler='-'));

## order string-df1 columns
res <- res[c(1:3,order(as.integer(sub('.*?([0-9]+)$','\\1',names(res)[-1:-3])))+3L)];

And here are all intermediate and final data.frames:

df1.norm;
##        id ri ci cn
## 1  Q9Y6W5  1  1 s1
## 2  Q9Y6U3  2  1 s1
## 3  Q9Y6Q9  3  1 s1
## 4  P41182  4  1 s1
## 5  Q9HCP0  4  1 s1
## 6  Q09472  5  1 s1
## 7  Q9Y6I3  6  1 s1
## 8  Q9Y6H1  7  1 s1
## 9  Q5T1J5  7  1 s1
## 10 Q16835  1  2 s2
## 11 P61809  2  2 s2
## 12 Q92831  2  2 s2
## 13 P41356  3  2 s2
## 14 P54612  3  2 s2
## 15 A41PH2  3  2 s2
## 16 P3R117  4  2 s2
## 17 P31908  5  2 s2
## 18 P54112  6  2 s2
## 19 O15143  7  2 s2

df2.norm;
##        id ri ci   cn
## 1  P41182  1  1 IDs.
## 2  P56524  1  1 IDs.
## 3  P41182  2  1 IDs.
## 4  Q9UQL6  2  1 IDs.
## 5  P41182  3  1 IDs.
## 6  Q8WUI4  3  1 IDs.
## 7  Q92793  4  1 IDs.
## 8  Q09472  4  1 IDs.
## 9  Q9Y6Q9  4  1 IDs.
## 10 Q92831  4  1 IDs.
## 11 P30561  5  1 IDs.
## 12 P53762  5  1 IDs.
## 13 Q15021  6  1 IDs.
## 14 Q9BPX3  6  1 IDs.
## 15 Q15003  6  1 IDs.
## 16 O95347  6  1 IDs.
## 17 Q9NTJ3  6  1 IDs.
## 18 Q92902  7  1 IDs.
## 19 Q9NQG7  7  1 IDs.
## 20 Q969F9  8  1 IDs.
## 21 Q9UPZ3  8  1 IDs.
## 22 Q86YV9  8  1 IDs.
## 23 Q92903  9  1 IDs.
## 24 Q96NY9  9  1 IDs.
## 25 Q91VB4 10  1 IDs.
## 26 P59438 10  1 IDs.
## 27 Q8BLY7 10  1 IDs.
## 28 Q92828 11  1 IDs.
## 29 Q13227 11  1 IDs.
## 30 O15379 11  1 IDs.
## 31 O75376 11  1 IDs.
## 32 O60907 11  1 IDs.
## 33 Q9BZK7 11  1 IDs.
## 34 P78537 12  1 IDs.
## 35 Q6QNY1 12  1 IDs.
## 36 Q6QNY0 12  1 IDs.
## 37 Q9NUP1 12  1 IDs.
## 38 Q96EV8 12  1 IDs.
## 39 Q8TDH9 12  1 IDs.
## 40 Q9UL45 12  1 IDs.
## 41 O95295 12  1 IDs.
## 42 O55102 13  1 IDs.
## 43 Q9CWG9 13  1 IDs.
## 44 Q5U5M8 13  1 IDs.
## 45 Q8VED2 13  1 IDs.
## 46 Q91WZ8 13  1 IDs.
## 47 Q8R015 13  1 IDs.
## 48 Q9R0C0 13  1 IDs.
## 49 Q9Z266 13  1 IDs.
## 50 P30561 14  1 IDs.
## 51 O08915 14  1 IDs.
## 52 P07901 14  1 IDs.
## 53 P11499 14  1 IDs.
## 54 Q8WMR7 15  1 IDs.
## 55 P67776 15  1 IDs.
## 56 P11493 15  1 IDs.
## 57 P54612 15  1 IDs.
## 58 P54613 15  1 IDs.
## 59 P61160 16  1 IDs.
## 60 P61158 16  1 IDs.
## 61 O15143 16  1 IDs.
## 62 O15144 16  1 IDs.
## 63 O15145 16  1 IDs.
## 64 P59998 16  1 IDs.
## 65 O15511 16  1 IDs.

df.match;
##       id ri.1 ci.1 cn.1 ri.2 ci.2 cn.2
## 2 P41182    4    1   s1    1    1 IDs.
## 4 P41182    4    1   s1    2    1 IDs.
## 3 P41182    4    1   s1    3    1 IDs.
## 8 Q9Y6Q9    3    1   s1    4    1 IDs.
## 6 Q09472    5    1   s1    4    1 IDs.
## 7 Q92831    2    2   s2    4    1 IDs.
## 5 P54612    3    2   s2   15    1 IDs.
## 1 O15143    7    2   s2   16    1 IDs.

res;
##   IDs No    ID-col-n string-df1-s1 string-df1-s2
## 1   1  1         4s1        P41182             -
## 2   2  1         4s1        P41182             -
## 3   3  1         4s1        P41182             -
## 4   4  3 3s1,5s1,2s2 Q9Y6Q9,Q09472        Q92831
## 5  15  1         3s2             -        P54612
## 6  16  1         7s2             -        O15143