My data is grouped by the IDs in V6 and ordered by position (V1:V3):
dt
V1 V2 V3 V4 V5 V6
1: chr1 3054233 3054733 . + ENSMUSG00000090025
2: chr1 3102016 3102125 . + ENSMUSG00000064842
3: chr1 3205901 3207317 . - ENSMUSG00000051951
4: chr1 3206523 3207317 . - ENSMUSG00000051951
5: chr1 3213439 3215632 . - ENSMUSG00000051951
6: chr1 3213609 3216344 . - ENSMUSG00000051951
7: chr1 3214482 3216968 . - ENSMUSG00000051951
8: chr1 3421702 3421901 . - ENSMUSG00000051951
9: chr1 3466587 3466687 . + ENSMUSG00000089699
10: chr1 3513405 3513553 . + ENSMUSG00000089699
What I would like to do is add an extra column with an index by position; that is, within each group in V6 the first element would be "1", the second "2", and so on. I can achieve that using ddply and a custom function:
# Add an `index` column numbering the rows of `x`.
#
# `x` is a data frame holding the rows of a single V6 group; its V5 column
# is expected to contain one distinct value. When that value is "+", rows
# are numbered 1..n top to bottom; otherwise they are numbered n..1.
# Returns `x` with the extra `index` column appended.
rankExons <- function(x) {
  # Scalar test: the group's single V5 value decides the numbering direction.
  on_plus <- unique(x$V5) == "+"
  positions <- seq_len(nrow(x))
  x$index <- if (on_plus) positions else rev(positions)
  x
}
# Split dt by V6, run rankExons on each piece, and combine the results.
indexed <- ddply(dt, .(V6), rankExons)
# Print the combined result.
indexed
V1 V2 V3 V4 V5 V6 index
1 chr1 3205901 3207317 . - ENSMUSG00000051951 6
2 chr1 3206523 3207317 . - ENSMUSG00000051951 5
3 chr1 3213439 3215632 . - ENSMUSG00000051951 4
4 chr1 3213609 3216344 . - ENSMUSG00000051951 3
5 chr1 3214482 3216968 . - ENSMUSG00000051951 2
6 chr1 3421702 3421901 . - ENSMUSG00000051951 1
7 chr1 3102016 3102125 . + ENSMUSG00000064842 1
8 chr1 3466587 3466687 . + ENSMUSG00000089699 1
9 chr1 3513405 3513553 . + ENSMUSG00000089699 2
10 chr1 3054233 3054733 . + ENSMUSG00000090025 1
Unfortunately, it is extremely slow on the full dataset (~620k rows) and when using parallel it crashes and burns:
# Register a doMC backend with 6 cores for parallel execution.
library(doMC)
registerDoMC(cores=6)
# Same split/apply/combine as before, with ddply's parallel mode switched on.
indexed <- ddply(dt, .(V6), rankExons, .parallel=TRUE)
Error: serialization is too large to store in a raw vector
Error: serialization is too large to store in a raw vector
Error: serialization is too large to store in a raw vector
Error: serialization is too large to store in a raw vector
Error: serialization is too large to store in a raw vector
Error: serialization is too large to store in a raw vector
Warning message:
In mclapply(argsList, FUN, mc.preschedule = preschedule, mc.set.seed = set.seed, :
all scheduled cores encountered errors in user code
So, I went for data.table but couldn't get it working. Here is what I tried:
# Key the table on the V6 column.
setkey(dt, "V6")
# Attempt 1: rankExons() is handed the whole table `dt` here, not the
# per-group subset, and it returns a table rather than a single column.
dt[,index:=rankExons(dt), by=V6]
# Attempt 2: `.sd` is lowercase (data.table's per-group subset symbol is
# `.SD`), and c("V5, V6") is one string "V5, V6", not two column names.
dt[,rankExons(.sd), by=V6, .SDcols=c("V5, V6")]
And both failed. How can I recreate my ddply with data.table?
dput(dt)
structure(list(V1 = c("chr1", "chr1", "chr1", "chr1", "chr1",
"chr1", "chr1", "chr1", "chr1", "chr1"), V2 = c(3054233L, 3102016L,
3205901L, 3206523L, 3213439L, 3213609L, 3214482L, 3421702L, 3466587L,
3513405L), V3 = c(3054733L, 3102125L, 3207317L, 3207317L, 3215632L,
3216344L, 3216968L, 3421901L, 3466687L, 3513553L), V4 = c(".",
".", ".", ".", ".", ".", ".", ".", ".", "."), V5 = c("+", "+",
"-", "-", "-", "-", "-", "-", "+", "+"), V6 = c("ENSMUSG00000090025",
"ENSMUSG00000064842", "ENSMUSG00000051951", "ENSMUSG00000051951",
"ENSMUSG00000051951", "ENSMUSG00000051951", "ENSMUSG00000051951",
"ENSMUSG00000051951", "ENSMUSG00000089699", "ENSMUSG00000089699"
)), .Names = c("V1", "V2", "V3", "V4", "V5", "V6"), class = c("data.table",
"data.frame"), row.names = c(NA, -10L), .internal.selfref = <pointer: 0x1de6a88>)