2
votes

I need to reshape my dataset from wide to long format using one of the columns to create others.

My data looks like this:

Participant  V1     V2    V3   V4   V5   V6   V7   V8   V9   V10 ... V1000

Prob1_1     323.25   325.85
Prob1_2     236.12   455.23
Prob2_3     423.52   526.14     ....
Prob2_4     512.47   426.12
....
Prob2_100   235.14   632.14
Improb1_1   632.12   236.12

And I want to use the variable Participant to build the following long dataset.

Participant    Probability    RT          Trial     Session
    1              Prob           323.25     1          1
    1              Prob           325.85     2          1
    2              Prob           236.12     1          1
    2              Prob           455.23     2          1 
    3              Prob           423.52     1          2
    3              Prob           526.14     2          2
    4        
    5
    6...

I have tried to use the mutate function, but it seems to depend on the column names, whereas I want the new columns to be derived from the values in the Participant column. For example, in "Prob1_1" the number immediately after "Prob" is the session and the last number is the participant number. The variables V1, V2, ... represent the trial number.

Solutions mentioned in How do I convert a wide dataframe to a long dataframe for a multilevel structure with 'quadruple nesting'? did not work for me.

2

2 Answers

2
votes

Using tidyr::extract we can separate Participant into three groups/columns based on the following regex:

  • 1+ Non digits
  • 1+ digits
  • 0 or more _ (matched but not captured, so it is not assigned to a group/column)
  • 1+ digits

then reshape the trial columns to long format and strip the "V" prefix from the trial names

library(dplyr)
library(tidyr)
# Split Participant (e.g. "Prob1_1") into the condition label, the session
# number and the participant number, then stack the trial columns
# (V1, V2, ...) so that there is one row per participant x trial.
df %>%
  extract(
    Participant,
    into = c("Probability", "Session", "Participant"),
    regex = "^(\\D+)(\\d+)_*(\\d+)",
    # Store Session and Participant as integers instead of strings.
    convert = TRUE
  ) %>%
  pivot_longer(
    cols = -c(Probability, Session, Participant),
    names_to = "Trial",
    names_prefix = "V",
    # Integer trial numbers sort as 1, 2, ..., 10; as strings they would
    # sort as "1", "10", "100", "2", ...
    names_transform = list(Trial = as.integer),
    values_to = "RT"
  ) %>%
  select(Participant, Probability, RT, Trial, Session) %>%
  arrange(Participant, Session, Trial)

    Participant Probability     RT Trial Session
  1           1        Prob 323.25     1       1
  2           1        Prob 325.85     2       1
  3           2        Prob 236.12     1       1
  4           2        Prob 455.23     2       1
  5           3        Prob 423.52     1       2
  6           3        Prob 526.14     2       2
  7           4        Prob 512.47     1       2
  8           4        Prob 426.12     2       2

data

# Small reproducible sample: four participant ids and two trial columns.
df <- local({
  ids <- c("Prob1_1", "Prob1_2", "Prob2_3", "Prob2_4")
  data.frame(
    # Participant is a factor whose levels follow the order of the ids.
    Participant = factor(ids, levels = ids),
    V1 = c(323.25, 236.12, 423.52, 512.47),
    V2 = c(325.85, 455.23, 526.14, 426.12)
  )
})
1
votes

Here's a base R approach using reshape() and regular expressions to transform() the data.

# Reshape `dat` (one row per participant id, one V.* column per trial) into
# long format with one row per participant x trial.
#
# Ids have the form <condition><session>_<participant>, e.g. "Prob1_1": the
# number right after the condition label is the session and the number after
# the underscore is the participant. The index of the V.* column is the trial.
dat.long <- local({
  id <- as.character(dat$participant)
  pattern <- "^(\\D+)(\\d+)_(\\d+)$"
  if (!all(grepl(pattern, id))) {
    stop("participant ids must look like <label><session>_<participant>",
         call. = FALSE)
  }

  # Select the measurement columns by name so any number of trials works.
  trial_cols <- grep("^V", names(dat), value = TRUE)

  wide <- dat
  wide$participant <- as.integer(sub(pattern, "\\3", id))
  wide$probability <- sub(pattern, "\\1", id)
  wide$session <- as.integer(sub(pattern, "\\2", id))

  reshape(
    wide,
    varying = trial_cols,
    v.names = "RT",
    timevar = "trial",
    idvar = c("participant", "probability", "session"),
    direction = "long"
  )
})
head(dat.long, 15)
# (The RT values printed below are illustrative; they do not correspond to
# the sample `dat`.)
#              participant probability session trial          RT
# 1.Prob.1.1             1        Prob       1     1  1.28978001
# 2.Prob.1.1             2        Prob       1     1 -1.40316524
# 3.Prob.1.1             3        Prob       1     1  0.51445097
# 1.Prob.2.1             1        Prob       2     1  0.14846476
# 2.Prob.2.1             2        Prob       2     1  0.06879947
# 3.Prob.2.1             3        Prob       2     1  0.02801546
# 1.Improb.1.1           1      Improb       1     1  1.26768662
# 2.Improb.1.1           2      Improb       1     1 -0.87197423
# 3.Improb.1.1           3      Improb       1     1 -1.04835070
# 1.Improb.2.1           1      Improb       2     1  0.65630521
# 2.Improb.2.1           2      Improb       2     1 -0.58099358
# 3.Improb.2.1           3      Improb       2     1 -0.15082366
# 1.Prob.1.2             1        Prob       1     2  0.58457874
# 2.Prob.1.2             2        Prob       1     2 -2.25150269
# 3.Prob.1.2             3        Prob       1     2  1.50887273

Data

# Sample data: twelve participant ids (2 condition labels x 2 sessions x
# 3 participants) with ten measurement columns V.1 ... V.10.
dat <- local({
  ids <- paste0(
    rep(c("Prob", "Improb"), each = 6),
    rep(rep(1:2, each = 3), times = 2),
    "_",
    rep(1:3, times = 4)
  )
  data.frame(
    # Rows run Prob* then Improb*; factor levels list Improb* before Prob*.
    participant = factor(ids, levels = c(ids[7:12], ids[1:6])),
    V.1 = c(
      -0.78317903978425, -1.42256802537658, 0.342940938479779,
      -1.31227758139305, -0.134871424882155, 0.372262460142081,
      0.313809235928102, 0.296138275146936, 0.931606411400065,
      0.375142970846081, -1.43961382169779, 0.815799047808872
    ),
    V.2 = c(
      -0.171646399523515, -0.644161162124944, 0.785979607398719,
      0.47705655100109, -0.963859684799095, 0.154552158842357,
      1.72307227079195, 0.986655369736914, -0.32413410098149,
      0.44638843532548, 1.28716761230553, 0.628048242307817
    ),
    V.3 = c(
      0.0521416442312076, -1.31894376808205, -0.40743302087948,
      1.14813283531483, 0.575490018443863, -0.768152373604551,
      0.668394957075075, 0.0652434186965083, 0.796433243461602,
      -0.607367768674947, 1.16046833952821, 0.416012124430193
    ),
    V.4 = c(
      0.109068898771834, 0.310749865844485, -0.578879180813773,
      -0.160584364698438, -0.567827511946429, -0.0762903833505978,
      -0.940099003977588, -0.706132353777999, 0.551756154707779,
      1.21946510393981, 0.0540021849120832, 0.371706858474099
    ),
    V.5 = c(
      0.997271454464248, 0.351789857136835, 0.335620936190577,
      0.115428590188729, -1.02709809154436, 0.640719901786663,
      -0.828218512265051, 1.57701044840292, 0.0587912355165915,
      -0.290236728884489, 0.875871492695704, -0.130491615088836
    ),
    V.6 = c(
      1.29218428325551, -0.60588680898263, 0.403803440305249,
      1.0357840121496, -1.34874665542469, 0.883403082744137,
      2.1083976501382, -0.133455001164623, -0.392764320879111,
      2.45559047947122, -0.836168557148904, 0.542357603414291
    ),
    V.7 = c(
      0.283836115710646, -1.11604617217924, -0.702911947372907,
      1.92979553472645, 1.07991010308695, 1.75404937440206,
      0.477955966827059, -0.64206114456452, -0.401702215242213,
      -1.36264088455225, -0.948291093216559, 0.417484687283255
    ),
    V.8 = c(
      -1.22023326762452, -0.876955844153173, -1.14703776357049,
      -0.0850518753026808, 1.56853089303981, -0.278003253072658,
      -1.07862125797898, 2.03389661648939, 1.25326789139365,
      0.700470424495529, 0.0722915950880813, -1.16225205037457
    ),
    V.9 = c(
      -1.52445924377294, -0.260093122031534, 0.982661963156681,
      -2.6411330557081, 1.0685535833561, -1.27019946336172,
      0.387277102978568, 0.615191170581553, -0.592960414663139,
      -0.0183305342891908, -0.392615477570169, -1.06251098372276
    ),
    V.10 = c(
      -0.295815527371644, 0.243080328002458, 0.517476015205563,
      -1.4243221433541, 0.411574978845139, -0.164274442339443,
      -0.0564129898114199, -1.05954278095433, -0.784089424501994,
      0.422217107186452, -1.71720615045398, 0.482129993001465
    )
  )
})