3
votes

I have been doing this data cleaning pretty often, where I am trying to convert data gathered in wide format from a Qualtrics survey into long format. I usually have e.g. two main questions which I ask multiple times for each country, some covariates, and an ID variable. The code below creates example data in this format. I would like to transform the data from the wide format below to a long format where one column is the country, two columns are the main questions, one is the covariate, and one is the ID variable. I have been doing this in what I am absolutely sure is a bad, inefficient way, but I can't find an example of how to do this exact task more efficiently. I'd be very grateful if someone can show me a more efficient way to do this, ideally using either base R or the tidyverse.

require(tidyr)
#make example data
dfLength = 500

wide = data.frame(happy_Belgium = runif(dfLength), happy_US= runif(dfLength), happy_UK= runif(dfLength), angry_Belgium= runif(dfLength), angry_US= runif(dfLength), angry_UK= runif(dfLength), id = 1:dfLength, other_variable = runif(dfLength))


#Make an individual long dataframe for each measure
longHappy = wide %>%
  gather(key="country", value="happy", happy_Belgium:happy_UK)
longAngry = wide %>%
  gather(key="country", value="angry", angry_Belgium:angry_UK)
#Make a variable for the country based on the format of the question titles
longHappy$country = substring(longHappy$country, 7, nchar(longHappy$country))
longAngry$country = substring(longAngry$country, 7, nchar(longAngry$country))
#Merge the two long variables
long = merge(longHappy, longAngry)
#Get rid of columns I don't need
keeps = c("id", "other_variable", "happy", "angry", "country")
long = long[,names(long) %in% keeps]
2

2 Answers

2
votes

Try:

library(tidyr)

long <- 
  wide %>% 
  pivot_longer(cols = contains("y_"), names_to = c(".value", "country"), names_sep = "_")

head(long)
#> # A tibble: 6 x 5
#>      id other_variable country happy  angry
#>   <int>          <dbl> <chr>   <dbl>  <dbl>
#> 1     1          0.822 Belgium 0.610 0.0304
#> 2     1          0.822 US      0.681 0.352 
#> 3     1          0.822 UK      0.727 0.708 
#> 4     2          0.693 Belgium 0.170 0.526 
#> 5     2          0.693 US      0.462 0.836 
#> 6     2          0.693 UK      0.466 0.577

Data

dfLength = 500

wide = data.frame(happy_Belgium = runif(dfLength), 
                  happy_US= runif(dfLength),
                  happy_UK= runif(dfLength), 
                  angry_Belgium= runif(dfLength), 
                  angry_US= runif(dfLength), 
                  angry_UK= runif(dfLength), 
                  id = 1:dfLength, 
                  other_variable = runif(dfLength))

Created on 2020-05-12 by the reprex package (v0.3.0)

1
votes

An option with matches to select the column names that starts with 'happy' or 'angry'

library(dplyr)
library(tidyr)
wide %>% 
 pivot_longer(cols = matches('^(happy|angry)'), 
       names_to = c(".value", "country"), names_sep = "_")
# A tibble: 1,500 x 5
#      id other_variable country  happy angry
#   <int>          <dbl> <chr>    <dbl> <dbl>
# 1     1          0.113 Belgium 0.430  0.295
# 2     1          0.113 US      0.359  0.410
# 3     1          0.113 UK      0.824  0.176
# 4     2          0.379 Belgium 0.979  0.446
# 5     2          0.379 US      0.624  0.911
# 6     2          0.379 UK      0.0781 0.741
# 7     3          0.785 Belgium 0.0606 0.590
# 8     3          0.785 US      0.461  0.149
# 9     3          0.785 UK      0.913  0.404
#10     4          0.486 Belgium 0.204  0.516
# … with 1,490 more rows

Or remove the columns that are not needed in cols with -

wide %>% 
  pivot_longer(cols = -c(id, other_variable), 
          names_to = c(".value", "country"), names_sep = "_")