4
votes

reshape2 is a package which allows an powerful array of data transformations, through its two-part melt/cast approach. However, like all tools it embeds assumptions which limit the cases it can handle.

What data reshaping problem can reshape2 not handle in its current form?

The ideal answer will include:

  • A description of the type of use cases where this data shape is typically found
  • Sample data
  • Code to accomplish the transformation (ideally using as much of the transformation with reshape2 as possible)

Example

"Wide" data is common in panel applications.

melt.wide <- function(data, id.vars, new.names, sep=".", variable.name="variable", ... ) {
  # Guess number of variables currently wide
  colnames(data) <- sub( paste0(sep,"$"), "",  colnames(data) )
  wide.vars <- colnames(data)[grep( sep, colnames(data) )]
  n.wide <- str_count( wide.vars, sep )
  stopifnot(length(new.names)==unique(n.wide))
  # Melt
  data.melt <- melt(data,id.vars=id.vars,measure.vars=wide.vars,...)
  new <- stack.list(str_split(data.melt$variable,sep))
  colnames(new) <- c(variable.name,new.names)
  data.melt <- subset(data.melt,select=c(-variable))
  cbind(data.melt,new)
}

choice.vars <- colnames(res)[grep("_",colnames(res))]
melt.wide( subset(res,select=c("WorkerId",choice.vars)), id.vars="WorkerId", new.names=c("set","option"), sep="_")

The new function returns a melted object that can then be *cast.

Where the data is:

so <- structure(list(WorkerId = c(12L, 13L, 27L, 25L, 30L, 8L), pio_1_1 = structure(c(2L, 
1L, 2L, 1L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"), 
    pio_1_2 = structure(c(1L, 2L, 2L, 2L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), pio_1_3 = structure(c(1L, 1L, 
    1L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    pio_1_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"), 
    pio_2_1 = structure(c(1L, 2L, 2L, 1L, 1L, 2L), .Label = c("No", 
    "Yes"), class = "factor"), pio_2_2 = structure(c(1L, 1L, 
    1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    pio_2_3 = structure(c(2L, 2L, 2L, 2L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), pio_2_4 = structure(c(1L, 1L, 
    1L, 1L, 1L, 1L), .Label = "No", class = "factor"), pio_3_1 = structure(c(2L, 
    2L, 2L, 2L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    pio_3_2 = structure(c(2L, 1L, 1L, 1L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), pio_3_3 = structure(c(2L, 1L, 
    2L, 1L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"), 
    pio_3_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"), 
    pio_4_1 = structure(c(2L, 1L, 2L, 2L, 1L, 2L), .Label = c("No", 
    "Yes"), class = "factor"), pio_4_2 = structure(c(2L, 2L, 
    2L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    pio_4_3 = structure(c(1L, 2L, 1L, 1L, 2L, 2L), .Label = c("No", 
    "Yes"), class = "factor"), pio_4_4 = structure(c(1L, 1L, 
    1L, 1L, 1L, 1L), .Label = "No", class = "factor"), caremgmt_1_1 = structure(c(2L, 
    2L, 1L, 2L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"), 
    caremgmt_1_2 = structure(c(1L, 2L, 2L, 2L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), caremgmt_1_3 = structure(c(1L, 
    1L, 1L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    caremgmt_1_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"), 
    caremgmt_2_1 = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("No", 
    "Yes"), class = "factor"), caremgmt_2_2 = structure(c(1L, 
    2L, 1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    caremgmt_2_3 = structure(c(2L, 1L, 2L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), caremgmt_2_4 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"), caremgmt_3_1 = structure(c(2L, 
    1L, 2L, 1L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"), 
    caremgmt_3_2 = structure(c(2L, 1L, 2L, 2L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), caremgmt_3_3 = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"), 
    caremgmt_3_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"), 
    caremgmt_4_1 = structure(c(1L, 1L, 2L, 1L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), caremgmt_4_2 = structure(c(2L, 
    2L, 2L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    caremgmt_4_3 = structure(c(1L, 1L, 1L, 1L, 1L, 2L), .Label = c("No", 
    "Yes"), class = "factor"), caremgmt_4_4 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"), prev_1_1 = structure(c(1L, 
    1L, 2L, 1L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"), 
    prev_1_2 = structure(c(1L, 2L, 1L, 2L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), prev_1_3 = structure(c(2L, 1L, 
    1L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"), 
    prev_1_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"), 
    prev_2_1 = structure(c(1L, 1L, 2L, 1L, 2L, 2L), .Label = c("No", 
    "Yes"), class = "factor"), prev_2_2 = structure(c(2L, 2L, 
    1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    prev_2_3 = structure(c(1L, 2L, 1L, 1L, 2L, 2L), .Label = c("No", 
    "Yes"), class = "factor"), prev_2_4 = structure(c(1L, 1L, 
    1L, 1L, 1L, 1L), .Label = "No", class = "factor"), prev_3_1 = structure(c(1L, 
    2L, 1L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    prev_3_2 = structure(c(1L, 1L, 2L, 1L, 2L, 2L), .Label = c("No", 
    "Yes"), class = "factor"), prev_3_3 = structure(c(2L, 2L, 
    1L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"), 
    prev_3_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"), 
    prev_4_1 = structure(c(1L, 2L, 2L, 1L, 2L, 2L), .Label = c("No", 
    "Yes"), class = "factor"), prev_4_2 = structure(c(1L, 1L, 
    2L, 1L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"), 
    prev_4_3 = structure(c(1L, 1L, 1L, 2L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), prev_4_4 = structure(c(1L, 1L, 
    1L, 1L, 1L, 1L), .Label = "No", class = "factor"), price_1_1 = structure(c(30L, 
    12L, 1L, 16L, 28L, 17L), .Label = c("$2,500", "$2,504", "$2,507", 
    "$2,509", "$2,512", "$2,513", "$2,515", "$2,526", "$2,547", 
    "$2,548", "$2,578", "$2,588", "$2,594", "$2,605", "$2,607", 
    "$2,617", "$2,618", "$2,622", "$2,635", "$2,649", "$2,670", 
    "$2,672", "$2,679", "$2,681", "$2,698", "$2,704", "$2,721", 
    "$2,782", "$2,851", "$2,884", "$2,919", "$2,925", "$2,935", 
    "$3,022"), class = "factor"), price_1_2 = structure(c(1L, 
    19L, 5L, 17L, 7L, 1L), .Label = c("$2,500", "$2,501", "$2,502", 
    "$2,504", "$2,513", "$2,515", "$2,517", "$2,532", "$2,535", 
    "$2,558", "$2,564", "$2,571", "$2,575", "$2,578", "$2,608", 
    "$2,633", "$2,634", "$2,675", "$2,678", "$2,687", "$2,730", 
    "$2,806", "$2,827", "$2,848", "$2,891", "$2,901", "$2,923", 
    "$2,933", "$2,937", "$2,958", "$2,987"), class = "factor"), 
    price_1_3 = structure(c(11L, 1L, 1L, 8L, 19L, 14L), .Label = c("$2,500", 
    "$2,504", "$2,507", "$2,513", "$2,516", "$2,518", "$2,564", 
    "$2,579", "$2,580", "$2,583", "$2,584", "$2,592", "$2,604", 
    "$2,608", "$2,639", "$2,643", "$2,646", "$2,665", "$2,667", 
    "$2,695", "$2,698", "$2,709", "$2,710", "$2,713", "$2,714", 
    "$2,750", "$2,757", "$2,876", "$2,978", "$2,984", "$3,024", 
    "$3,059"), class = "factor"), price_1_4 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "$2,500", class = "factor"), 
    price_2_1 = structure(c(27L, 32L, 19L, 22L, 4L, 26L), .Label = c("$2,500", 
    "$2,504", "$2,505", "$2,510", "$2,511", "$2,512", "$2,515", 
    "$2,517", "$2,518", "$2,529", "$2,533", "$2,537", "$2,551", 
    "$2,553", "$2,574", "$2,593", "$2,600", "$2,605", "$2,608", 
    "$2,612", "$2,613", "$2,618", "$2,639", "$2,657", "$2,714", 
    "$2,730", "$2,747", "$2,764", "$2,771", "$2,773", "$2,813", 
    "$2,859", "$2,901", "$3,019", "$3,037"), class = "factor"), 
    price_2_2 = structure(c(12L, 2L, 1L, 27L, 1L, 7L), .Label = c("$2,500", 
    "$2,502", "$2,510", "$2,514", "$2,515", "$2,516", "$2,517", 
    "$2,518", "$2,520", "$2,521", "$2,523", "$2,536", "$2,544", 
    "$2,575", "$2,583", "$2,592", "$2,602", "$2,624", "$2,644", 
    "$2,652", "$2,662", "$2,677", "$2,720", "$2,761", "$2,765", 
    "$2,770", "$2,772", "$2,835", "$2,873", "$2,911", "$2,950", 
    "$2,962"), class = "factor"), price_2_3 = structure(c(32L, 
    1L, 8L, 33L, 29L, 11L), .Label = c("$2,500", "$2,506", "$2,507", 
    "$2,510", "$2,511", "$2,512", "$2,515", "$2,517", "$2,527", 
    "$2,528", "$2,540", "$2,554", "$2,562", "$2,565", "$2,568", 
    "$2,581", "$2,597", "$2,611", "$2,616", "$2,631", "$2,652", 
    "$2,663", "$2,671", "$2,672", "$2,685", "$2,727", "$2,731", 
    "$2,742", "$2,771", "$2,778", "$2,781", "$2,970", "$2,984", 
    "$2,986", "$3,030"), class = "factor"), price_2_4 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "$2,500", class = "factor"), 
    price_3_1 = structure(c(24L, 1L, 28L, 7L, 18L, 21L), .Label = c("$2,500", 
    "$2,501", "$2,503", "$2,505", "$2,509", "$2,512", "$2,535", 
    "$2,537", "$2,542", "$2,553", "$2,556", "$2,560", "$2,561", 
    "$2,574", "$2,584", "$2,618", "$2,624", "$2,629", "$2,637", 
    "$2,664", "$2,761", "$2,840", "$2,875", "$2,883", "$2,891", 
    "$2,933", "$2,953", "$2,978", "$3,039", "$3,043", "$3,067"
    ), class = "factor"), price_3_2 = structure(c(3L, 1L, 5L, 
    19L, 25L, 9L), .Label = c("$2,500", "$2,501", "$2,503", "$2,504", 
    "$2,512", "$2,517", "$2,540", "$2,543", "$2,546", "$2,560", 
    "$2,567", "$2,573", "$2,586", "$2,592", "$2,594", "$2,603", 
    "$2,604", "$2,606", "$2,628", "$2,633", "$2,635", "$2,693", 
    "$2,696", "$2,714", "$2,734", "$2,739", "$2,770", "$2,791", 
    "$2,797", "$2,936", "$2,967", "$3,021", "$3,024"), class = "factor"), 
    price_3_3 = structure(c(26L, 7L, 5L, 32L, 10L, 24L), .Label = c("$2,500", 
    "$2,501", "$2,502", "$2,505", "$2,506", "$2,507", "$2,508", 
    "$2,509", "$2,512", "$2,515", "$2,519", "$2,547", "$2,556", 
    "$2,574", "$2,587", "$2,592", "$2,608", "$2,616", "$2,621", 
    "$2,635", "$2,638", "$2,667", "$2,671", "$2,688", "$2,694", 
    "$2,700", "$2,717", "$2,759", "$2,809", "$2,864", "$2,891", 
    "$2,912", "$3,011", "$3,012"), class = "factor"), price_3_4 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "$2,500", class = "factor"), 
    price_4_1 = structure(c(29L, 13L, 16L, 24L, 33L, 19L), .Label = c("$2,500", 
    "$2,505", "$2,506", "$2,508", "$2,511", "$2,525", "$2,549", 
    "$2,562", "$2,577", "$2,582", "$2,586", "$2,591", "$2,621", 
    "$2,636", "$2,654", "$2,670", "$2,722", "$2,726", "$2,733", 
    "$2,744", "$2,745", "$2,755", "$2,768", "$2,805", "$2,817", 
    "$2,827", "$2,835", "$2,888", "$2,925", "$2,959", "$3,001", 
    "$3,027", "$3,061", "$3,071"), class = "factor"), price_4_2 = structure(c(33L, 
    31L, 21L, 16L, 25L, 13L), .Label = c("$2,500", "$2,502", 
    "$2,503", "$2,505", "$2,506", "$2,511", "$2,513", "$2,516", 
    "$2,529", "$2,539", "$2,547", "$2,554", "$2,557", "$2,562", 
    "$2,567", "$2,579", "$2,581", "$2,583", "$2,585", "$2,591", 
    "$2,612", "$2,629", "$2,640", "$2,670", "$2,695", "$2,726", 
    "$2,737", "$2,788", "$2,790", "$2,798", "$2,852", "$3,031", 
    "$3,063"), class = "factor"), price_4_3 = structure(c(4L, 
    30L, 4L, 19L, 1L, 27L), .Label = c("$2,500", "$2,504", "$2,507", 
    "$2,509", "$2,511", "$2,512", "$2,514", "$2,516", "$2,543", 
    "$2,552", "$2,562", "$2,575", "$2,578", "$2,581", "$2,594", 
    "$2,614", "$2,615", "$2,617", "$2,636", "$2,640", "$2,641", 
    "$2,652", "$2,749", "$2,755", "$2,805", "$2,812", "$2,867", 
    "$2,906", "$2,910", "$2,917", "$2,924", "$2,927", "$2,961", 
    "$3,028", "$3,053", "$3,054"), class = "factor"), price_4_4 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "$2,500", class = "factor"), 
    plan_1_1 = structure(c(2L, 2L, 2L, 1L, 1L, 2L), .Label = c("", 
    "X"), class = "factor"), plan_1_2 = structure(c(1L, 1L, 1L, 
    2L, 1L, 1L), .Label = c("", "X"), class = "factor"), plan_1_3 = structure(c(1L, 
    1L, 1L, 1L, 2L, 1L), .Label = c("", "X"), class = "factor"), 
    plan_1_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", 
    "X"), class = "factor"), plan_2_1 = structure(c(1L, 2L, 1L, 
    2L, 2L, 2L), .Label = c("", "X"), class = "factor"), plan_2_2 = structure(c(1L, 
    1L, 2L, 1L, 1L, 1L), .Label = c("", "X"), class = "factor"), 
    plan_2_3 = structure(c(2L, 1L, 1L, 1L, 2L, 1L), .Label = c("", 
    "X"), class = "factor"), plan_2_4 = structure(c(1L, 1L, 1L, 
    1L, 1L, 1L), .Label = c("", "X"), class = "factor"), plan_3_1 = structure(c(1L, 
    2L, 1L, 1L, 2L, 1L), .Label = c("", "X"), class = "factor"), 
    plan_3_2 = structure(c(1L, 1L, 1L, 2L, 1L, 1L), .Label = c("", 
    "X"), class = "factor"), plan_3_3 = structure(c(2L, 1L, 1L, 
    1L, 1L, 2L), .Label = c("", "X"), class = "factor"), plan_3_4 = structure(c(1L, 
    1L, 2L, 1L, 1L, 1L), .Label = c("", "X"), class = "factor"), 
    plan_4_1 = structure(c(2L, 2L, 1L, 1L, 1L, 1L), .Label = c("", 
    "X"), class = "factor"), plan_4_2 = structure(c(2L, 1L, 1L, 
    2L, 1L, 1L), .Label = c("", "X"), class = "factor"), plan_4_3 = structure(c(1L, 
    1L, 1L, 1L, 2L, 2L), .Label = c("", "X"), class = "factor"), 
    plan_4_4 = structure(c(1L, 1L, 2L, 1L, 1L, 1L), .Label = c("", 
    "X"), class = "factor")), .Names = c("WorkerId", "pio_1_1", 
"pio_1_2", "pio_1_3", "pio_1_4", "pio_2_1", "pio_2_2", "pio_2_3", 
"pio_2_4", "pio_3_1", "pio_3_2", "pio_3_3", "pio_3_4", "pio_4_1", 
"pio_4_2", "pio_4_3", "pio_4_4", "caremgmt_1_1", "caremgmt_1_2", 
"caremgmt_1_3", "caremgmt_1_4", "caremgmt_2_1", "caremgmt_2_2", 
"caremgmt_2_3", "caremgmt_2_4", "caremgmt_3_1", "caremgmt_3_2", 
"caremgmt_3_3", "caremgmt_3_4", "caremgmt_4_1", "caremgmt_4_2", 
"caremgmt_4_3", "caremgmt_4_4", "prev_1_1", "prev_1_2", "prev_1_3", 
"prev_1_4", "prev_2_1", "prev_2_2", "prev_2_3", "prev_2_4", "prev_3_1", 
"prev_3_2", "prev_3_3", "prev_3_4", "prev_4_1", "prev_4_2", "prev_4_3", 
"prev_4_4", "price_1_1", "price_1_2", "price_1_3", "price_1_4", 
"price_2_1", "price_2_2", "price_2_3", "price_2_4", "price_3_1", 
"price_3_2", "price_3_3", "price_3_4", "price_4_1", "price_4_2", 
"price_4_3", "price_4_4", "plan_1_1", "plan_1_2", "plan_1_3", 
"plan_1_4", "plan_2_1", "plan_2_2", "plan_2_3", "plan_2_4", "plan_3_1", 
"plan_3_2", "plan_3_3", "plan_3_4", "plan_4_1", "plan_4_2", "plan_4_3", 
"plan_4_4"), row.names = c(NA, 6L), class = "data.frame")
1

1 Answers

5
votes

... almost a year later...

This came to mind the other day, and I have a sneaking suspicion that it is what you tried to show in your example, but unfortunately, your example code doesn't run!

melt sometimes takes things a bit too far for me when making my data "long". Sometimes, even though it is not what would necessarily be called "tidy data", I prefer to have a "semi-long" data.frame. This is easily achieved using base R's reshape, but requires a few extra steps with the "reshape2" package, as demonstrated below:

Prerequisite: sample data.

set.seed(1)
myDf <- data.frame(
  ID.1 = sample(letters[1:5], 5, replace = TRUE),
  ID.2 = 1:5,
  V.1 = sample(10:14, 5, replace = TRUE),
  V.2 = sample(5:9, 5, replace = TRUE),
  V.3 = sample(3:14, 5, replace = TRUE),
  W.1 = sample(LETTERS, 5, replace = TRUE),
  W.2 = sample(LETTERS, 5, replace = TRUE),
  W.3 = sample(LETTERS, 5, replace = TRUE)
)
myDf
#   ID.1 ID.2 V.1 V.2 V.3 W.1 W.2 W.3
# 1    b    1  14   6   8   Y   K   M
# 2    b    2  14   5  11   F   A   P
# 3    c    3  13   8  14   Q   J   M
# 4    e    4  13   6   7   D   W   E
# 5    b    5  10   8  12   G   I   V

The "semi-long" output that I'm looking for. Easily achieved with base R's reshape.

reshape(myDf, direction = "long", idvar=1:2, varying = 3:ncol(myDf))
#       ID.1 ID.2 time  V W
# b.1.1    b    1    1 14 Y
# b.2.1    b    2    1 14 F
# c.3.1    c    3    1 13 Q
# e.4.1    e    4    1 13 D
# b.5.1    b    5    1 10 G
# b.1.2    b    1    2  6 K
# b.2.2    b    2    2  5 A
# c.3.2    c    3    2  8 J
# e.4.2    e    4    2  6 W
# b.5.2    b    5    2  8 I
# b.1.3    b    1    3  8 M
# b.2.3    b    2    3 11 P
# c.3.3    c    3    3 14 M
# e.4.3    e    4    3  7 E
# b.5.3    b    5    3 12 V

melt is great if you wanted the equivalent of stack, especially since stack discards all factor variables, which is frustrating when read.table and family defaults to stringsAsFactors = TRUE. (You can make it work, but you need to convert the relevant columns to character before you can use stack). But, it is not what I'm looking for, in particular because of how it has handled the "variable" column.

library(reshape2)
myDfL <- melt(myDf, id.vars=1:2)
head(myDfL)
#   ID.1 ID.2 variable value
# 1    b    1      V.1    14
# 2    b    2      V.1    14
# 3    c    3      V.1    13
# 4    e    4      V.1    13
# 5    b    5      V.1    10
# 6    b    1      V.2     6

To fix this, one needs to first split the "variable" column, and then use dcast to get the same format of output as you would get from reshape.

myDfL <- cbind(myDfL, colsplit(myDfL$variable, "\\.", names=c("var", "time")))
dcast(myDfL, ID.1 + ID.2 + time ~ var, value.var="value")
#    ID.1 ID.2 time  V W
# 1     b    1    1 14 Y
# 2     b    1    2  6 K
# 3     b    1    3  8 M
# 4     b    2    1 14 F
# 5     b    2    2  5 A
# 6     b    2    3 11 P
# 7     b    5    1 10 G
# 8     b    5    2  8 I
# 9     b    5    3 12 V
# 10    c    3    1 13 Q
# 11    c    3    2  8 J
# 12    c    3    3 14 M
# 13    e    4    1 13 D
# 14    e    4    2  6 W
# 15    e    4    3  7 E