Here are a couple of ways...
In base R:
X <- setNames(strsplit(as.character(dat$Multi), " "), dat$ID)
X1 <- stack(X)
X1$order <- ave(X1$ind, X1$ind, FUN = seq_along)
X1
# values ind order
# 1 VAL1 1 1
# 2 VAL2 1 2
# 3 VAL3 1 3
# 4 VAL2 2 1
# 5 VAL3 2 2
# 6 VAL2 3 1
# 7 VAL3 3 2
# 8 VAL1 3 3
OR (better):
X <- strsplit(as.character(dat[, "Multi"]), " ", fixed = TRUE)
len <- vapply(X, length, 1L)
data.frame(ID = rep(dat[, "ID"], len), order = sequence(len),
Multi = unlist(X, use.names=FALSE))
Using concat.split.multiple
from my "splitstackshape" package (probably not too efficient on 400,000 rows though).
library(splitstackshape)
out <- concat.split.multiple(dat, "Multi", " ", "long")
out[order(out$ID, out$time), ]
# ID time Multi
# 1 1 1 VAL1
# 4 1 2 VAL2
# 7 1 3 VAL3
# 2 2 1 VAL2
# 5 2 2 VAL3
# 8 2 3 <NA>
# 3 3 1 VAL2
# 6 3 2 VAL3
# 9 3 3 VAL1
And, since you requested "data.table":
library(data.table)
DT <- data.table(dat)
DTL <- DT[, list(unlist(strsplit(as.character(Multi), " "))), by = ID]
DTL[, order := sequence(.N), by = ID]
DTL
# ID V1 order
# 1: 1 VAL1 1
# 2: 1 VAL2 2
# 3: 1 VAL3 3
# 4: 2 VAL2 1
# 5: 2 VAL3 2
# 6: 3 VAL2 1
# 7: 3 VAL3 2
# 8: 3 VAL1 3
Update: Some timings
I didn't bother testing my "splitstackshape" approach, because that uses read.table
under the hood, so I know it won't stand up to the demands of performance.
However, base R still seems to win out!
Sample "big" data:
dat_big <- do.call(rbind, replicate(floor(4000/3), dat, simplify = FALSE))
dat_big <- do.call(rbind, replicate(100, dat_big, simplify = FALSE))
dat_big$ID <- make.unique(as.character(dat_big$ID))
DT <- data.table(dat)
DT_big <- data.table(dat_big)
Functions to test:
fun1 <- function(inDF) {
X <- strsplit(as.character(inDF[, "Multi"]), " ", fixed = TRUE)
len <- vapply(X, length, 1L)
data.frame(ID = rep(inDF[, "ID"], len), order = sequence(len),
Multi = unlist(X, use.names=FALSE))
}
fun2 <- function(inDT) {
DTL <- inDT[, list(unlist(strsplit(as.character(Multi), " ", fixed = TRUE))), by = ID]
DTL[, order := sequence(.N), by = ID]
DTL
}
The results for base R:
system.time(outDF <- fun1(dat_big))
# user system elapsed
# 6.418 0.000 6.454
dim(outDF)
# [1] 1066400 3
head(outDF)
# ID order Multi
# 1 1 1 VAL1
# 2 1 2 VAL2
# 3 1 3 VAL3
# 4 2 1 VAL2
# 5 2 2 VAL3
# 6 3 1 VAL2
tail(outDF)
# ID order Multi
# 1066395 1.133299 3 VAL3
# 1066396 2.133299 1 VAL2
# 1066397 2.133299 2 VAL3
# 1066398 3.133299 1 VAL2
# 1066399 3.133299 2 VAL3
# 1066400 3.133299 3 VAL1
The results for "data.table":
system.time(outDT <- fun2(DT_big))
# user system elapsed
# 14.035 0.000 14.057
dim(outDT)
# [1] 1066400 3
outDT
# ID V1 order
# 1: 1 VAL1 1
# 2: 1 VAL2 2
# 3: 1 VAL3 3
# 4: 2 VAL2 1
# 5: 2 VAL3 2
# ---
# 1066396: 2.133299 VAL2 1
# 1066397: 2.133299 VAL3 2
# 1066398: 3.133299 VAL2 1
# 1066399: 3.133299 VAL3 2
# 1066400: 3.133299 VAL1 3