Binding 3 data frames with only partially same names

Question

I have 3 data frames which I would like to bind together. Two of them have 25 columns and one has 24. In first column you can find the name of the genes and in the rest (24 columns) are values. Mostly the names in the first column are the same but the order can be different. That means that one gene can come in second row in first table and in another in row number 2000.

Is there any way to bind those tables and keep all of the names ?

Data frame nr 1.

> dput(head(tbl_gel1))
structure(list(X = c("at1g01050", "at1g01080", "at1g01090", "at1g01220", 
"at1g01320", "at1g01420"), x1.1 = c(NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_), x1.10 = c(NA, NA, 0.97940004406824, 
NA, NA, NA), x1.11 = c(NA, 0.715595925164684, 1.12076888461521, 
NA, 1, NA), x1.12 = c(NA, NA, 1, NA, 1, NA), x1.13 = c(NA, NA, 
1.27620944815459, NA, 1.10617482362388, NA), x1.14 = c(NA, NA, 
0.970143924518673, NA, 0.897284652612375, NA), x1.15 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.16 = c(NA, 
NA, 0.855292180180481, NA, 0.678275003166569, NA), x1.17 = c(NA, 
NA, NA, NA, 1.31361646343431, NA), x1.18 = c(NA, NA, 1.01824439729952, 
NA, 0.731395183389585, NA), x1.19 = c(NA, NA, 2.13871102449867, 
NA, 1.26860481661042, NA), x1.2 = c(NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_), x1.20 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), x1.21 = c(NA, NA, 1.5546960313129, 
NA, 2.12826383499469, NA), x1.22 = c(NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_), x1.23 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), x1.24 = c(NA, 0.553801084127354, 
1.68155174378018, NA, 1, NA), x1.3 = c(NA, 1.91253217984776, 
NA, NA, NA, NA), x1.4 = c(NA, 1.2635979388975, NA, NA, NA, NA
), x1.5 = c(NA, 0.997262468935362, NA, NA, NA, 1), x1.6 = c(NA, 
0.836333481838468, 0.186450525168714, NA, NA, 1), x1.7 = c(0.713761294385108, 
0.998433283631924, NA, NA, NA, NA), x1.8 = c(NA, 1.00273753106464, 
0.105799532964898, NA, NA, NA), x1.9 = c(1.14311935280745, 0.720766625421293, 
0.763452683687036, 1, NA, NA)), .Names = c("X", "x1.1", "x1.10", 
"x1.11", "x1.12", "x1.13", "x1.14", "x1.15", "x1.16", "x1.17", 
"x1.18", "x1.19", "x1.2", "x1.20", "x1.21", "x1.22", "x1.23", 
"x1.24", "x1.3", "x1.4", "x1.5", "x1.6", "x1.7", "x1.8", "x1.9"
), row.names = c(NA, 6L), class = "data.frame")

Data frame nr 2.

> dput(head(tbl_gel2))
structure(list(X = c("at1g01050", "at1g01080", "at1g01090", "at1g01220", 
"at1g01320", "at1g01710"), x1.25 = c(NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_), x1.26 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), x1.27 = c(NA, 1.27850867927992, 
NA, NA, NA, NA), x1.28 = c(NA, 2.21463917401264, NA, NA, NA, 
NA), x1.29 = c(NA, 0.953489798239202, NA, NA, NA, NA), x1.30 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.31 = c(0.601097045390844, 
1.19887070873732, NA, NA, NA, NA), x1.32 = c(1.09252174468455, 
0.517510161235958, 0.615226728434775, 0.542879255811352, NA, 
NA), x1.33 = c(1, 0.419630322520778, 1, 1, NA, NA), x1.34 = c(NA, 
NA, 0.799097170720151, NA, NA, NA), x1.35 = c(1, 0.585395967425093, 
1.0988581226183, 1, 0.850838480887727, 1), x1.36 = c(0.267978732865144, 
NA, 0.815842027204421, NA, NA, NA), x1.37 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), x1.38 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.39 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.40 = c(NA, 
0.253595454669352, 1, NA, 1, NA), x1.41 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), x1.42 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.43 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.44 = c(NA, 
0.801129291262679, 1.39524715613774, NA, 1, 1.34188880143232), 
    x1.45 = c(NA, NA, NA, NA, 3.10996537363054, NA), x1.46 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.48 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), .Names = c("X", 
"x1.25", "x1.26", "x1.27", "x1.28", "x1.29", "x1.30", "x1.31", 
"x1.32", "x1.33", "x1.34", "x1.35", "x1.36", "x1.37", "x1.38", 
"x1.39", "x1.40", "x1.41", "x1.42", "x1.43", "x1.44", "x1.45", 
"x1.46", "x1.48"), row.names = c(NA, 6L), class = "data.frame")

Data frame nr 3.

> dput(head(tbl_gel3))
structure(list(X = c("at1g01050", "at1g01080", "at1g01090", "at1g01220", 
"at1g01320", "at1g01420"), x1.49 = c(NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_), x1.50 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), x1.51 = c(NA, 1, NA, 
NA, NA, NA), x1.52 = c(NA, 1.7994810956534, NA, NA, NA, NA), 
    x1.53 = c(NA, 1, NA, NA, NA, 1), x1.54 = c(NA, 7.89402612997038, 
    NA, NA, NA, NA), x1.55 = c(0.920776942793063, 0.996320522101043, 
    0.254584439603907, NA, NA, NA), x1.56 = c(1, 0.729758385900956, 
    0.300151773873743, NA, NA, NA), x1.57 = c(1, 0.947723222879912, 
    0.948619033067299, 1, NA, NA), x1.58 = c(1, 0.928854762925871, 
    1.3235617264432, 0.785944656498542, 0.675641973487141, NA
    ), x1.59 = c(1.06908415906789, 0.634382162824105, 1.04395304578544, 
    1, 0.650651881343625, NA), x1.60 = c(1.80853320689787, NA, 
    0.880820179658551, NA, NA, NA), x1.61 = c(1, NA, 1.6718152409295, 
    1.09278053029295, 1.01060798973004, NA), x1.62 = c(0.704459686809266, 
    NA, 1, NA, 1.08123985492291, NA), x1.63 = c(0.629128718440608, 
    0.445252633504756, 0.675960340502994, NA, 1, NA), x1.64 = c(0.171185393355124, 
    0.884594994748168, 1, NA, 1.08954220349952, NA), x1.65 = c(NA, 
    NA, 1.11460636151774, NA, NA, NA), x1.66 = c(NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), x1.67 = c(NA, NA, 
    NA, NA, 10.2238567979379, NA), x1.68 = c(NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), x1.69 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.70 = c(NA, 
    NA, 2.0577136925345, NA, 3.60392205648014, NA), x1.71 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.72 = c(NA, 
    NA, 10.9845898205719, NA, NA, NA)), .Names = c("X", "x1.49", 
"x1.50", "x1.51", "x1.52", "x1.53", "x1.54", "x1.55", "x1.56", 
"x1.57", "x1.58", "x1.59", "x1.60", "x1.61", "x1.62", "x1.63", 
"x1.64", "x1.65", "x1.66", "x1.67", "x1.68", "x1.69", "x1.70", 
"x1.71", "x1.72"), row.names = c(NA, 6L), class = "data.frame")

As an output I would like to have 1 data frame with 73 columns (one with the gene names and rest 72 with the values). If for example 1 gene is represented in only one data frame I want to put NAs in the specific columns. Don't want to lose any of the gene.

I tried to bind them by the loop but I am failing all the time. So I decided to ask here:

## Getting the list of all accessions
list_of_data = lapply(tbl, read.csv)
all_data = do.call(rbind.fill, list_of_data)


## Getting the proper names
vec_names <- all_data[,1]
vec_names <- unique(vec_names)

#### Loop to bind all of the data

gdata = lapply(list_of_data,function(x) x[3:nrow(x),2:25])

for( i in 1:length(list_of_data)){
  rownames(gdata[[i]]) = list_of_data[[i]][3:nrow(list_of_data[[i]]),1]
}
tmp = lapply(gdata,function(x) matrix(x),ncol=24)




final.table1=c()
for(i in 1:length(vec_names)){
  print(i)
  tmp=vec_names[i]
  f1 = function(x) {x[tmp,]}
  tmp2 = lapply(gdata,f1)
  tmp3 = c()
  for(j in 1:length(tmp2)){
    tmp3=rbind(tmp3,tmp2[[j]])
  }
  tmp4 = as.vector(t(tmp3))
  final.table1 = rbind(final.table1,tmp4)
}

rownames(final.table1) = vec_names

I am not a big fan of the loops so any other way to combine them together is welcome. I still will try to play with this loop...

Is all you want a join? Test it out with merge(tbl_gel1, tbl_gel2) — Hugh

Backlin Backlin · Accepted Answer · 2014-05-19T10:21:37

A double merge will do the trick:

merge(merge(tbl_gel1, tbl_gel2, by="X", all=TRUE), tbl_gel3, by="X", all=TRUE)

If you got a lot of data frames you can do them all with Reduce:

Reduce(
    function(a, b) merge(a, b, by="X", all=TRUE),
    list(tbl_gel1, tbl_gel2, tbl_gel3)
)

If merge doesn't get it right you can have a look at the join function of the plyr package.

Binding 3 data frames with only partially same names

1 Answers