Normalizing a multivalued column in a large table

Question

I'm having an issue converting the VBA code seen in this this post to an R-script.

The problem is as follows, I have a column (from a source database, not by choice) that contains multiple values for the attribute. I'd like to normalize this table and retain the order in which each value occurs in each cell.

An example dataset:

dat <- data.frame(
  ID = c(1:3),
  Multi = c("VAL1 VAL2 VAL3","VAL2 VAL3","VAL3 VAL1")
  ,stringsAsFactors=FALSE)

  ID          Multi
1  1 VAL1 VAL2 VAL3
2  2      VAL2 VAL3
3  3 VAL2 VAL3 VAL1

The pseudocode would be something like:

Loop over each row
Split string Multi with a space as a separator
For each splitted string, append new row in a separate data.frame with the ID, Order within the total string and the value.

The result would look like:

      ID   Order    Multi
1      1       1     VAL1
2      1       2     VAL2
3      1       3     VAL3
4      2       1     VAL2
5      2       2     VAL3
6      3       1     VAL2
7      3       2     VAL3
8      3       3     VAL1

I'm currently looking at doing so with a data.frame, I'm thinking data.table would be more appropriate as my table will have approximately 400.000 of these rows.

I apologize for not having any code ready, I'm still contemplating whether I need to use the apply family, data.table or a simple for loop. I'll keep this post updated with any progress I make.

A5C1D2H2I1M1N2O1R2T1 A5C1D2H2I1M1N2O1R2T1 · Accepted Answer · 2014-03-23T10:38:28

Here are a couple of ways...

In base R:

X <- setNames(strsplit(as.character(dat$Multi), " "), dat$ID)
X1 <- stack(X)
X1$order <- ave(X1$ind, X1$ind, FUN = seq_along)
X1
#   values ind order
# 1   VAL1   1     1
# 2   VAL2   1     2
# 3   VAL3   1     3
# 4   VAL2   2     1
# 5   VAL3   2     2
# 6   VAL2   3     1
# 7   VAL3   3     2
# 8   VAL1   3     3

OR (better):

X <- strsplit(as.character(dat[, "Multi"]), " ", fixed = TRUE)
len <- vapply(X, length, 1L)
data.frame(ID = rep(dat[, "ID"], len), order = sequence(len), 
           Multi = unlist(X, use.names=FALSE))

Using concat.split.multiple from my "splitstackshape" package (probably not too efficient on 400,000 rows though).

library(splitstackshape)
out <- concat.split.multiple(dat, "Multi", " ", "long")
out[order(out$ID, out$time), ]
#   ID time Multi
# 1  1    1  VAL1
# 4  1    2  VAL2
# 7  1    3  VAL3
# 2  2    1  VAL2
# 5  2    2  VAL3
# 8  2    3  <NA>
# 3  3    1  VAL2
# 6  3    2  VAL3
# 9  3    3  VAL1

And, since you requested "data.table":

library(data.table)
DT <- data.table(dat)
DTL <- DT[, list(unlist(strsplit(as.character(Multi), " "))), by = ID]
DTL[, order := sequence(.N), by = ID]
DTL
#    ID   V1 order
# 1:  1 VAL1     1
# 2:  1 VAL2     2
# 3:  1 VAL3     3
# 4:  2 VAL2     1
# 5:  2 VAL3     2
# 6:  3 VAL2     1
# 7:  3 VAL3     2
# 8:  3 VAL1     3

Update: Some timings

I didn't bother testing my "splitstackshape" approach, because that uses read.table under the hood, so I know it won't stand up to the demands of performance.

However, base R still seems to win out!

Sample "big" data:

dat_big <- do.call(rbind, replicate(floor(4000/3), dat, simplify = FALSE))
dat_big <- do.call(rbind, replicate(100, dat_big, simplify = FALSE))
dat_big$ID <- make.unique(as.character(dat_big$ID))

DT <- data.table(dat)
DT_big <- data.table(dat_big)

Functions to test:

fun1 <- function(inDF) {
  X <- strsplit(as.character(inDF[, "Multi"]), " ", fixed = TRUE)
  len <- vapply(X, length, 1L)
  data.frame(ID = rep(inDF[, "ID"], len), order = sequence(len), 
             Multi = unlist(X, use.names=FALSE))
}

fun2 <- function(inDT) {
  DTL <- inDT[, list(unlist(strsplit(as.character(Multi), " ", fixed = TRUE))), by = ID]
  DTL[, order := sequence(.N), by = ID]
  DTL
}

The results for base R:

system.time(outDF <- fun1(dat_big))
#    user  system elapsed 
#   6.418   0.000   6.454 
dim(outDF)
# [1] 1066400       3
head(outDF)
#   ID order Multi
# 1  1     1  VAL1
# 2  1     2  VAL2
# 3  1     3  VAL3
# 4  2     1  VAL2
# 5  2     2  VAL3
# 6  3     1  VAL2
tail(outDF)
#               ID order Multi
# 1066395 1.133299     3  VAL3
# 1066396 2.133299     1  VAL2
# 1066397 2.133299     2  VAL3
# 1066398 3.133299     1  VAL2
# 1066399 3.133299     2  VAL3
# 1066400 3.133299     3  VAL1

The results for "data.table":

system.time(outDT <- fun2(DT_big))
#    user  system elapsed 
#  14.035   0.000  14.057 
dim(outDT)
# [1] 1066400       3
outDT
#                ID   V1 order
#       1:        1 VAL1     1
#       2:        1 VAL2     2
#       3:        1 VAL3     3
#       4:        2 VAL2     1
#       5:        2 VAL3     2
#      ---                    
# 1066396: 2.133299 VAL2     1
# 1066397: 2.133299 VAL3     2
# 1066398: 3.133299 VAL2     1
# 1066399: 3.133299 VAL3     2
# 1066400: 3.133299 VAL1     3

Normalizing a multivalued column in a large table

1 Answers

Update: Some timings