1
votes

I have a data frame that looks like

            m1      m2     m3
 P001.st   60.00   2.0     1
 P003.nd   14.30   2.077   1
 P003.rt   29.60   2.077   1
 P006.st   10.30   2.077   1
 P006.nd   79.30   2.077   1
 P008.nd    9.16   2.077   1

I want to reformat table so that only first part (before period, i.e., P001, P003 etc) of the row name appear as row names and append the each subsequent rows with similar names to columns. The output should look like

         m1st   m2st  m3st  m1nd   m2nd  m3nd   m1rt   m2rt   m3rt
 P001   60.00   2.0     1   0       0      0     0       0      0
 P003   0       0       0   14.30   2.077  1     29.60   2.077  1
 P006   10.30   2.077   1   79.30   2.077  1     0       0      0
 P008   0       0       0    9.16   2.077  1     0       0      0

The aggregate function like

aggregate(value~name, df, I)

or a method from data.table like

setDT(df)[, list(value=list(value)), by=name] 

would not work because row.names are not exactly the same. Any suggestions for matching hundreds of rows with many variable subtypes (i.e, after period: .nd, .st etc).

5

5 Answers

3
votes
dt = as.data.table(your_df, keep.rownames = T)

# split the row names into two id's
dt[, `:=`(id1 = sub('\\..*', '', rn), id2 = sub('.*\\.', '', rn), rn = NULL)]

# melt and dcast (need latest 1.9.5 or have to load reshape2 and use dcast.data.table)
dcast(melt(dt, id.vars = c('id1', 'id2')), id1 ~ variable + id2, fill = 0)
#    id1 m1_nd m1_rt m1_st m2_nd m2_rt m2_st m3_nd m3_rt m3_st
#1: P001  0.00   0.0  60.0 0.000 0.000 2.000     0     0     1
#2: P003 14.30  29.6   0.0 2.077 2.077 0.000     1     1     0
#3: P006 79.30   0.0  10.3 2.077 0.000 2.077     1     0     1
#4: P008  9.16   0.0   0.0 2.077 0.000 0.000     1     0     0
2
votes

Here's another way to do it:

library(dplyr)
library(tidyr)
(wide <- reshape(df %>% add_rownames() %>% separate(rowname, c("rowname", "id")), 
                 idvar = "rowname", 
                 timevar = "id", 
                 direction = "wide", 
                 sep = ""))
#   rowname m1st  m2st m3st  m1nd  m2nd m3nd m1rt  m2rt m3rt
# 1    P001 60.0 2.000    1    NA    NA   NA   NA    NA   NA
# 2    P003   NA    NA   NA 14.30 2.077    1 29.6 2.077    1
# 4    P006 10.3 2.077    1 79.30 2.077    1   NA    NA   NA
# 6    P008   NA    NA   NA  9.16 2.077    1   NA    NA   NA

wide[is.na(wide)] <- 0
rownames(wide) <- wide[, 1]
wide$rowname <- NULL
wide
#      m1st  m2st m3st  m1nd  m2nd m3nd m1rt  m2rt m3rt
# P001 60.0 2.000    1  0.00 0.000    0  0.0 0.000    0
# P003  0.0 0.000    0 14.30 2.077    1 29.6 2.077    1
# P006 10.3 2.077    1 79.30 2.077    1  0.0 0.000    0
# P008  0.0 0.000    0  9.16 2.077    1  0.0 0.000    0
1
votes

If you have your data frame is called "data":

library(reshape2)
data$prefix <- gsub("(.*)\\..*","\\1",row.names(data))
data$suffix <- gsub(".*\\.(.*)","\\1",row.names(data))
data.melt <- melt(data)
data.melt
data.cast <- dcast(data.melt,prefix~variable+suffix,mean)
# set the row names to prefix 
row.names(data.cast) <- data.cast$prefix
# get rid of the prefix column
data.cast <- data.cast[,-1]
data.cast 

Gives

Using prefix, suffix as id variables
     m1_nd m1_rt m1_st m2_nd m2_rt m2_st m3_nd m3_rt m3_st
P001   NaN   NaN  60.0   NaN   NaN 2.000   NaN   NaN     1
P003 14.30  29.6   NaN 2.077 2.077   NaN     1     1   NaN
P006 79.30   NaN  10.3 2.077   NaN 2.077     1   NaN     1
P008  9.16   NaN   NaN 2.077   NaN   NaN     1   NaN   NaN

To correct the column names and zeros instead of NaN, do

names(data.cast) <- gsub("_","",names(data.cast))
apply(data.cast,c(1,2),function(x){as.numeric(ifelse(is.na(x),0,x)) })

To get

       m1nd m1rt m1st  m2nd  m2rt  m2st m3nd m3rt m3st
P001  0.00  0.0 60.0 0.000 0.000 2.000    0    0    1
P003 14.30 29.6  0.0 2.077 2.077 0.000    1    1    0
P006 79.30  0.0 10.3 2.077 0.000 2.077    1    0    1
P008  9.16  0.0  0.0 2.077 0.000 0.000    1    0    0
1
votes

Try this:

library(tidyr)
library(dplyr)
library(reshape2)
library(stringr)

data <-
  structure(list(m1 = c(60, 14.3, 29.6, 10.3, 79.3, 9.16), 
                 m2 = c(2, 2.077, 2.077, 2.077, 2.077, 2.077),
                 m3 = c(1L, 1L, 1L, 1L, 1L, 1L)),
            .Names = c("m1", "m2", "m3"),
            class = "data.frame", 
            row.names = c("P001.st", "P003.nd", "P003.rt", 
                          "P006.st", "P006.nd", "P008.nd"))

my_data <- 
  as_data_frame(cbind(col_01 = rownames(data), data)) %>% 
  melt(.) %>% 
  separate(., col_01, into = c("var_01", "var_02"), sep = "\\.") %>% 
  mutate(my_var = str_c(variable, var_02)) %>% 
  select(var_01, my_var, value) %>% 
  arrange(var_01, my_var) %>% 
  spread(., my_var, value)

my_data

  var_01  m1nd m1rt m1st  m2nd  m2rt  m2st m3nd m3rt m3st
1   P001    NA   NA 60.0    NA    NA 2.000   NA   NA    1
2   P003 14.30 29.6   NA 2.077 2.077    NA    1    1   NA
3   P006 79.30   NA 10.3 2.077    NA 2.077    1   NA    1
4   P008  9.16   NA   NA 2.077    NA    NA    1   NA   NA

If you want to replace NAs with 0, you can do it like this:

my_data[is.na(my_data)] <- 0

  var_01  m1nd m1rt m1st  m2nd  m2rt  m2st m3nd m3rt m3st
1   P001  0.00  0.0 60.0 0.000 0.000 2.000    0    0    1
2   P003 14.30 29.6  0.0 2.077 2.077 0.000    1    1    0
3   P006 79.30  0.0 10.3 2.077 0.000 2.077    1    0    1
4   P008  9.16  0.0  0.0 2.077 0.000 0.000    1    0    0
1
votes

Using extract() instead of separate() with the more flexible regular expressions, using tidyr and dplyr:

df %>% 
  extract(id, c("id2", "var"), c("(P00.)\\.(..)")) %>% 
  gather(variable,value,c(m1,m2,m3)) %>% 
  mutate(var=paste0(variable,".",var)) %>% 
  select(-variable) %>% 
  spread(var,value,fill=0)

   id2 m1.nd m1.rt m1.st m2.nd m2.rt m2.st m3.nd m3.rt m3.st
1 P001  0.00   0.0  60.0 0.000 0.000 2.000     0     0     1
2 P003 14.30  29.6   0.0 2.077 2.077 0.000     1     1     0
3 P006 79.30   0.0  10.3 2.077 0.000 2.077     1     0     1
4 P008  9.16   0.0   0.0 2.077 0.000 0.000     1     0     0