
I benchmarked the following code (on a 300K-row data frame) to determine which approach is faster for parallelization in R: a for loop or lapply.

Q1. Is it always true (based on elapsed-time comparisons) that a parallel lapply is faster than a parallel for loop? In different posts online, people say either "Duh! lapply is always faster" or "Depending on your implementation, a for loop can be faster."

Q2. What is more surprising is that running similar code through a separate function call (to make the code look cleaner) is much slower. Have I benchmarked them correctly?

I see similar trends with 30K rows. Depending on the answers, I will check whether the parallelization scales well with more cores.

Thanks.

#Results:
[1] 300000      3
[1] "For loop all conditions"
    user   system  elapsed 
1040.232    8.767 1048.897 
[1] "Parallel For loop all conditions"
   user  system elapsed 
266.861   8.462 276.064 
[1] "Lapply all conditions"
   user  system elapsed 
 66.364   0.014  66.369 
[1] "ParLapply all conditions"
   user  system elapsed 
  0.413   0.113  25.890 
[1] "Lapply all conditions call function"
    user   system  elapsed 
5293.981  223.524 5517.128 
[1] "ParLapply all conditions call function"
    user   system  elapsed 
   0.492    0.082 1949.433 
[1] "For loop all conditions call function"
     user    system   elapsed 
10506.028    82.372 10587.585 
[1] "Parallel For loop all conditions call function"
    user   system  elapsed 
 585.387   29.322 2246.441 

#Code:  
d1 = c(1,2,-3)
d2 = c(1,-2,-2)
d3 = c(1,-2,-4)
d = data.frame(d1,d2,d3)
# making a big data frame for testing
s_df = d[rep(seq_len(nrow(d)), each=100000),]


correlThreshold = 0              # values strictly above/below this count as positive/negative
total_numb_input_files = 3       # number of columns that must agree in sign
rows_passing_consistency = c()   # collects row names whose values are all positive or all negative


print("For loop all conditions")
system.time(
        for(idx in 1:nrow(s_df)){
            dfx = as.vector(unlist(s_df[idx, ,drop=T]))
            rname = rownames(s_df)[idx]
            res = NULL
            #print(dfx)
            pos = sum(dfx > correlThreshold)
            neg = sum(dfx < correlThreshold)

            if((!is.na(pos)) && pos == (total_numb_input_files)){
                res = rname
            }

            if((!is.na(neg)) && neg == (total_numb_input_files)){
                res = rname
            }
            rows_passing_consistency = append(rows_passing_consistency, res)
        }
)



print("Parallel For loop all conditions")
library(doParallel)
cl<-makeCluster(4, type="FORK")
registerDoParallel(cl)
system.time(
        foreach(idx = 1:nrow(s_df), .combine = c) %dopar% {
            dfx = as.vector(unlist(s_df[idx, ,drop=T]))
            rname = rownames(s_df)[idx]
            res = NULL
            #print(dfx)
            pos = sum(dfx > correlThreshold)
            neg = sum(dfx < correlThreshold)

            if((!is.na(pos)) && pos == (total_numb_input_files)){
                res = rname
            }
            if((!is.na(neg)) && neg == (total_numb_input_files)){
                 res = rname
            }
            res
        }
)
stopCluster(cl)



print("Lapply all conditions")
system.time(
  lapply(1:nrow(s_df) , 
        function(idx, s_df){
            dfx = as.vector(unlist(s_df[idx, ,drop=T]))
            rname = rownames(s_df)[idx]
            res = NULL
            #print(dfx)
            pos = sum(dfx > correlThreshold)
            neg = sum(dfx < correlThreshold)

            if((!is.na(pos)) && pos == (total_numb_input_files)){
                res = rname
            }

            if((!is.na(neg)) && neg == (total_numb_input_files)){
                res = rname
            }
            res         
        }
    , s_df
  )
)



print("ParLapply all conditions")
library(doParallel)
cl<-makeCluster(4, type="FORK")
#registerDoParallel(cl)
system.time(
  parLapply(cl, 1:nrow(s_df) , 
        function(idx, s_df){
            dfx = as.vector(unlist(s_df[idx, ,drop=T]))
            rname = rownames(s_df)[idx]
            res = NULL
            #print(dfx)
            pos = sum(dfx > correlThreshold)
            neg = sum(dfx < correlThreshold)

            if((!is.na(pos)) && pos == (total_numb_input_files)){
                res = rname
            }

            if((!is.na(neg)) && neg == (total_numb_input_files)){
                res = rname
            }
            res         
        }
    , s_df
  )
)
stopCluster(cl)





calc_consistency = function(rname, s_df){
            dfx = as.vector(unlist(s_df[rname, ,drop=T]))
            res = NULL
            #print(dfx)
            pos = sum(dfx > correlThreshold)
            neg = sum(dfx < correlThreshold)

            if((!is.na(pos)) && pos == (total_numb_input_files)){
                res = rname
            }
            if((!is.na(neg)) && neg == (total_numb_input_files)){
                 res = rname
            }
            return(res)
}

print("Lapply all conditions call function")
system.time(lapply(rownames(s_df), calc_consistency, s_df))

print("ParLapply all conditions call function")
library(doParallel)
cl<-makeCluster(4, type="FORK")
#registerDoParallel(cl)
system.time(parLapply(cl, rownames(s_df), calc_consistency, s_df))
stopCluster(cl)

print("For loop all conditions call function")
system.time(
for(rname in rownames(s_df)){
        rows_passing_consistency = append(rows_passing_consistency, calc_consistency(rname, s_df))
}
)

print("Parallel For loop all conditions call function")
library(doParallel)
cl<-makeCluster(4, type="FORK")
registerDoParallel(cl)
system.time(
foreach(rname=rownames(s_df), .combine = c) %dopar% {
        calc_consistency(rname, s_df)
}
)
stopCluster(cl)
For starters, vec <- append(vec, stuff) is one of the slowest things you could possibly do in R. – joran
Do you think the big increase in time due to calling a method is expected? – richard rodrigues
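
A minimal sketch of the point in joran's comment (the names slow_collect and fast_collect are illustrative, not from the benchmark above): growing a vector with append() copies the accumulated result on every iteration, while preallocating the output and filling it in place avoids that repeated copying.

slow_collect = function(n) {
    out = c()
    for (i in 1:n) {
        out = append(out, i)   # reallocates and copies the whole vector each time
    }
    out
}

fast_collect = function(n) {
    out = vector("integer", n) # preallocate once
    for (i in 1:n) {
        out[i] = i             # fill in place
    }
    out
}

system.time(slow_collect(5e4))
system.time(fast_collect(5e4))

The same idea applies to rows_passing_consistency in the for-loop benchmarks above, which grows by append() on every matching row.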

1 Answer


So it turns out that the major difference in speed comes from passing row indexes versus row names to the apply function. I tried (l)apply with an inline function and with a separate function call, with and without parallelization. There is not much difference between the inline and separate-function versions of apply, and parallelization works equally well in both. The major time delay comes from passing row names instead of indexes, although I am not sure why this happens.
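
A minimal sketch that isolates this difference, reusing s_df and correlThreshold from the question (the subset size of 20000 is arbitrary, chosen only to keep the comparison quick). As far as I can tell, the slowdown comes from how `[.data.frame` resolves character row subscripts, matching each supplied name against the full row-name vector, but treat that as an assumption rather than a confirmed explanation.

idx_subset  = 1:20000
name_subset = rownames(s_df)[idx_subset]

# Integer row index: positional lookup into the data frame.
system.time(
    lapply(idx_subset, function(idx) sum(unlist(s_df[idx, , drop = TRUE]) > correlThreshold))
)

# Character row name: each name is matched against all 300K row names of s_df,
# so every single lookup is far more expensive than a positional one.
system.time(
    lapply(name_subset, function(rname) sum(unlist(s_df[rname, , drop = TRUE]) > correlThreshold))
)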