1
votes

I have 14000 gene (column:Gene) and 200 samples (column: sample1 sample2 ...)

I am trying calculate correlations for ~14000 genes all against all and append all gene correlations and required columns from the dataset(test_df) in a new dataframe(df1) and write results to a text file.

When I run the the code, I am getting correlations between (Gene1 and Gene2) and (Gene1 and Gene3). When the loop comes to Gene2 It breaks and the error says

Error in cor.test.default(as.matrix(test_df[i, ][, 3:length(test_df)]), : not enough finite observations

I have 3 to 4 values per rows this shouldn't be the case.

Please suggest any efficient way of doing this since I have to do correlations for 14000 genes.How can I run this code on multiple cores to get results faster?

Please find the code and the resulted file below.

Thanks in advance

> test_df <- data.frame(ID=c("ID_3721", "ID_537", "ID_555"), 
                      Gene=c("Gene1","Gene2","Gene3"),
                      sample1=c(11397,78191,44838),
                      sample2=c(33768,33763,7680),
                      sample3=c(74521,33268,72367),
                      sample4=c(51486,11435,28772),
                      sample5=c(73539,21486,0))

> test_df
##       ID  Gene sample1 sample2 sample3 sample4 sample5
##1 ID_3721 Gene1   11397   33768   74521   51486   73539
##2  ID_537 Gene2   78191   33763   33268   11435   21486
##3  ID_555 Gene3   44838    7680   72367   28772       0
for(i in 1:2){
       for(j in i+1:3){

          p.cor <- cor.test(as.matrix(test_df[i,][,3:length(test_df)]), as.matrix(test_df[j,][,3:length(test_df)]), method="pearson")$estimate
          s.cor <- cor.test(as.matrix(test_df[i,][,3:length(test_df)]), as.matrix(test_df[j,][,3:length(test_df)]), method="spearman")$estimate

          df1 <- data.frame(ID1   = test_df[i,1],
                            ID2   = test_df[j,1],
                            Name1 = test_df[i,2],
                            Name2 = test_df[j,2],
                            correlation.p = p.cor
                            correlation.s = s.cor)

         write.table(df1, file="genecorr.txt", row.names=FALSE, sep="\t", append=TRUE, quote=FALSE, col.names = !file.exists("genecorr.txt"))

   }
}

**Error in cor.test.default(as.matrix(test_df[i, ][, 3:length(test_df)]),  : 
  not enough finite observations**

genecorr.txt

ID1     ID2     NAME1   NAME2    correlation.p      correlation.s
ID_3721 ID_537  Gene1   Gene2    -0.136733508500744  -0.1
ID_3721 ID_555  Gene1   Gene3    0.145998550191942    0.3

3
you don't need cor.test if you are just interested with the estimates. cor might be enough and faster probablyDJJ
you are saving genecorr.txt at each iteration. This might lengthen the computation a lot. I would suggest to save once the correlations are computed.DJJ

3 Answers

0
votes

I would suggest first to transform your data in the following way

 dt <- dcast(melt(id.vars=c("ID","Gene"),test_df),variable~Gene)

setDT(dt)

## > dt
##    variable Gene1 Gene2 Gene3
## 1:  sample1 11397 78191 44838
## 2:  sample2 33768 33763  7680
## 3:  sample3 74521 33268 72367
## 4:  sample4 51486 11435 28772
## 5:  sample5 73539 21486     0




nameidx <- combn(names(dt)[-1],2)
 ## > nameidx
 ##      [,1]    [,2]    [,3]   
 ## [1,] "Gene1" "Gene1" "Gene2"
 ## [2,] "Gene2" "Gene3" "Gene3"

notice how easy it is to produce the name index with the function combn. This way can help you avoid the double loop. you can choose to go with the ID instead of the name if name is not unique

Now it is just a matter of going through the name idx

res  <- dt[,lapply(1:ncol(nameidx),
         function(x){ c(pearson=cor.test(get(nameidx[1,x]),
                                    get(nameidx[2,x]),method="pearson")$estimate,
         spearman=cor.test(get(nameidx[1,x]),
                           get(nameidx[2,x]),method="spearman")$estimate)})]

## >  > res
##            V1        V2        V3
## 1: -0.7411691 0.0394641 0.3444608
## 2: -0.6000000 0.1000000 0.3000000

Then we can finish it with

 ## > res1 <- setnames(data.table(cbind(t(nameidx),t(res))),c("Name1","Name2","pearson","spearman"))[]
 ## > res1
 ##    Name1 Name2            pearson spearman
 ## 1: Gene1 Gene2 -0.741169112323627     -0.6
 ## 2: Gene1 Gene3 0.0394640960151169      0.1
 ## 3: Gene2 Gene3  0.344460833012615      0.3
0
votes

First at all i have an inefficient way with loops for your problem:

test_df <- data.frame(ID=c("ID_3721", "ID_537", "ID_555"),
                      Gene=c("Gene1","Gene2","Gene3"),
                      sample1=c(11397,78191,44838),
                      sample2=c(33768,33763,7680),
                      sample3=c(74521,33268,72367),
                      sample4=c(51486,11435,28772),
                      sample5=c(73539,21486,0))

df1<-data.frame(ID1=0,ID2=0,Name1=0,Name2=0,correlation=0)

k<-1

for(i in 1:2){
       for(j in i:3){
       if(i!=j){
          p.cor <- cor.test(as.matrix(test_df[i,][,3:length(test_df)]), as.matrix(test_df[j,][,3:length(test_df)]), method="pearson")$estimate
          s.cor <- cor.test(as.matrix(test_df[i,][,3:length(test_df)]), as.matrix(test_df[j,][,3:length(test_df)]), method="spearman")$estimate


          df1[k,] <- c(as.character(test_df[i,1]),as.character(test_df[j,1]),as.character(test_df[i,2]),as.character(test_df[j,2]),as.character(p.cor))

                            k<-k+1
                            }
   }
}

maybe this is a little bit faster

n<-nrow(test_df)

fun<-function(y)cor(x,y)


result<-c()
for(i in 1:(n-1))
{
x<-as.numeric(test_df[i,3:ncol(test_df)])
result<-c(result,apply(test_df[(i+1):nrow(test_df),3:ncol(test_df)],1,fun))
}


m<-rep((n-1):1,(n-1):1)

a<-rep(test_df[,1][-n],(n-1):1)
b<-rep(test_df[,2][-n],(n-1):1)

c<-d<-numeric()
for(i in 2:n)
{
c<-c(c,as.character(test_df[,1][i:n]))
d<-c(d,as.character(test_df[,2][i:n]))
}

df1<-data.frame(ID1=a,ID2=c,Name1=b,Name2=d,correlation=result)
0
votes

You don't need a for loop, the cor function works with a matrix. By default it calculates the pairwise correlation between columns, so for your situation, transpose the matrix:

rownames(test_df) = test_df[,2]
cor(t(test_df[,-c(1:2)]),method="pearson")
           Gene1      Gene2     Gene3
Gene1  1.0000000 -0.7411691 0.0394641
Gene2 -0.7411691  1.0000000 0.3444608
Gene3  0.0394641  0.3444608 1.0000000

Some of these are redundant, so we just get out only the upper triangle. And we get the indices of the comparison out before hand:

ind = which(upper.tri(cor(t(test_df[,-c(1:2)]))),arr.ind=TRUE)
     row col
[1,]   1   2
[2,]   1   3
[3,]   2   3

As you can see, this corresponds to the upper triangle of that matrix above. Below I will pull out the upper triangle of the matrix and join it with this vector.

So we joining the spearman and pearson with other info:

cor_vector = function(M,Method){
res = cor(M,method=Method)
res[upper.tri(res)]
}

data.frame(
test_df[ind[,1],1:2],
test_df[ind[,2],1:2],
pearson = cor_vector(t(test_df[,-c(1:2)]),"pearson"),
spearman = cor_vector(t(test_df[,-c(1:2)]),"spearman")
)

             ID  Gene   ID.1 Gene.1    pearson spearman
Gene1   ID_3721 Gene1 ID_537  Gene2 -0.7411691     -0.6
Gene1.1 ID_3721 Gene1 ID_555  Gene3  0.0394641      0.1
Gene2    ID_537 Gene2 ID_555  Gene3  0.3444608      0.3

However, I need to warn you, this calculation is extremely cumbersome for a matrix of your size, 14000*200. And if i do a quick calculation your output dataframe is going to be:

choose(14000,2)
[1] 97993000

90 million rows ! Are you sure about storing such a huge data.frame?