0
votes

I have the following data:

mydata <- data.frame(var1_A=c(1,2,0,5,NA),var1_B=c(1,1,NA,0,3),var1_C=c(1,NA,1,1,0),
           var1_D=c(1,1,NA,0,3),var1_E=c(1,2,0,5,NA),var1_F=c(1,2,NA,1,0),
           var1_G=c(1,2,NA,5,NA),var1_H=c(1,NA,1,0,3),var1_I=c(1,NA,2,1,0), 
           ideology2_A=c(2,2,2,3,4), ideology2_B=c(NA,3,3,1,4), 
           ideology2_C=c(1,4,7,6,9), ideology2_D=c(8,9,7,6,4), 
           ideology2_E=c(NA,9,5,6,4),
           ideology2_F=c(1,1,1,1,4), ideology2_G=c(8,9,7,6,4), 
           ideology2_H=c(8,9,7,6,5), ideology2_I=c(8,9,8,9,8),
           ex_ideology_A=c(3,3,3,3,4), ex_ideology_B=c(3,NA,3,1,4), 
           ex_ideology_C=c(1,4,7,6,NA),ex_ideology_D=c(3,3,3,NA,0),
           ex_ideology_E=c(3,NA,3,1,1), ex_ideology_F = c(1,1,1,2,3),
           ex_ideology_G=c(1,2,3,4,5), ex_ideology_H=c(1,1,1,1,1)
           ex_ideology_I=c(2,2,2,NA,1), no_change_A=c(1,1,1,1,1),
           no_change_B=c(2,2,2,2,2), no_change_C=c(1,1,1,1,1),
           no_change_D=c(1,3,3,1,NA), no_change_E=c(1,1,1,1,1),
           no_change_F=c(0,0,1,1,1), no_change_G=c(1,1,2,2,2),
           no_change_H=c(5,6,7,8,9), no_change_I=c(1,1,1,1,1)
           country=c("BRA","USA","URU,"BRA","UK"),
           ID=c(1,2,3,4,5), v5=c(6:10))   

I need to recode all values in the groups of variables which have the word "ideology" corresponding to each variable with same letter in var1. More especifically, whenever var1_A == 0 or var1_A == NA, recode ideology2_A and ex_ideology3_A to NA, and so on with var1_B, etc. In fact it would be more appropriate to generate NEW variables, preserving the original ones (like dplyr's "mutate", not "transmute").

My real data has many more columns and 60k rows, but the columns involved in this problem would be only these three groups of 9, indexed by letters (there are other groups of variables also indexed with those letters, just like "no_change" in the example above, but that should not interfere with the solution).

EDIT: my example dataset now is closer to the real data I need to work with. In the first version of the post it was a dataset with only 3 columns for each group of variables (letters A to C) and all groups with similar names (var1, var2, var3). It also did not have other variables than the benchmark and the ones to recode.

3

3 Answers

3
votes

What about this base R option

do.call(
  cbind,
  lapply(
    unname(u <- split.default(mydata, gsub(".*_", "", names(mydata)))),
    function(x) cbind(x[1], x[-1] * ifelse(x[[1]] %in% c(0, NA), NA, 1))
  )
)[match(c(sapply(u, names)), names(mydata))]

which gives

  var1_A var1_B var1_C var2_A var2_B var2_C var3_A var3_B var3_C
1      1      1      1      2     NA      1      3      3      1
2      2      1     NA      2      3     NA      3     NA     NA
3      0     NA      1     NA     NA      7     NA     NA      7
4      5      0      1      3     NA      6      3     NA      6
5     NA      3      0     NA      4     NA     NA      4     NA

If you want to create recoded columns and preserve the original ones, try the code below

do.call(
  cbind,
  lapply(
    unname(u <- split.default(mydata, gsub(".*_", "", names(mydata)))),
    function(x) cbind(x, x[-1] * ifelse(x[[1]] %in% c(0, NA), NA, 1))
  )
)

which gives

  var1_A var2_A var3_A var2_A var3_A var1_B var2_B var3_B var2_B var3_B var1_C
1      1      2      3      2      3      1     NA      3     NA      3      1
2      2      2      3      2      3      1      3     NA      3     NA     NA
3      0      2      3     NA     NA     NA      3      3     NA     NA      1
4      5      3      3      3      3      0      1      1     NA     NA      1
5     NA      4      4     NA     NA      3      4      4      4      4      0
  var2_C var3_C var2_C var3_C
1      1      1      1      1
2      4      4     NA     NA
3      7      7      7      7
4      6      6      6      6
5      9     NA     NA     NA

Data

> dput(mydata)
structure(list(var1_A = c(1, 2, 0, 5, NA), var1_B = c(1, 1, NA, 
0, 3), var1_C = c(1, NA, 1, 1, 0), var2_A = c(2, 2, 2, 3, 4),
    var2_B = c(NA, 3, 3, 1, 4), var2_C = c(1, 4, 7, 6, 9), var3_A = c(3,
    3, 3, 3, 4), var3_B = c(3, NA, 3, 1, 4), var3_C = c(1, 4,
    7, 6, NA)), class = "data.frame", row.names = c(NA, -5L))
1
votes

Using varios pivots:

library(dplyr)
library(tidyr)
mydata %>% 
  mutate(row = row_number()) %>% 
  pivot_longer(-row, names_to = "key", values_to = "values") %>% 
  separate(key, c("var", "letter")) %>% 
  pivot_wider(names_from = var, values_from = values) %>% 
  mutate(
    var2 = if_else(var1 == 0 | is.na(var1), NA_real_, var2),
    var3 = if_else(var1 == 0 | is.na(var1), NA_real_, var3),
  ) %>% 
  pivot_longer(starts_with("var"), names_to = "var", values_to = "values") %>% 
  mutate(name = paste(var, letter, sep = "_")) %>% 
  pivot_wider(id_cols = row, names_from = name, values_from = values)
1
votes

Similar to the solution by @eastclintw00d, but with slightly different approach.

tmp <- mydata %>% 
  mutate(obs = 1:n()) %>% 
  pivot_longer(-obs, 
     names_pattern="(.*)_(.*)", 
     names_to=c("vars", "letter"), 
     values_to="vals") %>% 
  group_by(letter, obs) %>% 
  mutate(val1 = vals[which(vars == "var1")], 
         new = case_when(val1 == 0 | is.na(val1) ~ NA_real_, TRUE ~ vals)) %>% 
  select(-val1) %>% 
  pivot_wider(names_from = c("vars", "letter"), 
              values_from=c("vals", "new"))
names(tmp) <- gsub("vals_", "", names(tmp))


EDIT: to work with updated data

This should work with the updated data. The difference is how the variables to be pivoted are selected - it finds variables that end with and underscore and any capital letter. The ends_with() function didn't seem to work with the regular expression, so I used str_detect() from stringr wrapped in which().

tmp <- mydata %>% 
  mutate(obs = 1:n()) %>% 
  pivot_longer(which(str_detect(names(.), "_[A-Z]")), 
               names_pattern="(.*)_(.*)", 
               names_to=c("vars", "letter"), 
               values_to="vals") %>% 
  group_by(letter, obs) %>% 
  mutate(val1 = vals[which(vars == "var1")], 
         new = case_when(val1 == 0 | is.na(val1) ~ NA_real_, TRUE ~ vals)) %>% 
  select(-val1) %>% 
  pivot_wider(names_from = c("vars", "letter"), 
              values_from=c("vals", "new"))
names(tmp) <- gsub("vals_", "", names(tmp))