using case_when inside dplyr's mutate to create a new column in dataframe based on conditions

Question

I'm trying to create a new column in my data frame that is based off another column having a certain unique value. I have successfully done this by using the case_when argument inside a mutate. However, I would like to use a str_detect because I have a great deal of data and I don't want to have to type every single unique value because most are the same just different by the last few numbers in the string.. I keep getting errors when I try to use str_detect inside of the case_when.. does any one know how I can achieve this?

dput of data:

 dput(head(HUC_wq_summary,10))
structure(list(locid = c("21NJDEP1-01464380", "21NJDEP1-01464380", 
"21NJDEP1-01464380", "21NJDEP1-01464380", "21NJDEP1-01464380", 
"21NJDEP1-01464380", "21NJDEP1-01464380", "21NJDEP1-01464380", 
"21NJDEP1-01464380", "21NJDEP1-01464380"), parameter = c("Alkalinity, Carbonate as CaCO3", 
"Alkalinity, Carbonate as CaCO3", "Alkalinity, Carbonate as CaCO3", 
"Alkalinity, Carbonate as CaCO3", "Alkalinity, Carbonate as CaCO3", 
"Ammonia-nitrogen as N", "Ammonia-nitrogen as N", "Ammonia-nitrogen as N", 
"Ammonia-nitrogen as N", "Barometric pressure"), ActivityStartDate = structure(c(13902, 
13986, 14075, 14179, 14271, 13902, 13986, 14075, 14271, 13902
), class = "Date"), ActivityEndDate = c(NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_), Samples = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), MinValue = c(26, 44, 40, 
28, 25, 0.12, 0.01, 0.05, 0.13, 760), MaxValue = c(26, 44, 40, 
28, 25, 0.12, 0.01, 0.05, 0.13, 760), Average = c(26, 44, 40, 
28, 25, 0.12, 0.01, 0.05, 0.13, 760), HUC14 = c("HUC02040201040070", 
"HUC02040201040070", "HUC02040201040070", "HUC02040201040070", 
"HUC02040201040070", "HUC02040201040070", "HUC02040201040070", 
"HUC02040201040070", "HUC02040201040070", "HUC02040201040070"
), Trib = c("HUC02040201040070", "HUC02040201040070", "HUC02040201040070", 
"HUC02040201040070", "HUC02040201040070", "HUC02040201040070", 
"HUC02040201040070", "HUC02040201040070", "HUC02040201040070", 
"HUC02040201040070")), .Names = c("locid", "parameter", "ActivityStartDate", 
"ActivityEndDate", "Samples", "MinValue", "MaxValue", "Average", 
"HUC14", "Trib"), row.names = c(NA, -10L), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), vars = c("locid", "parameter", 
"ActivityStartDate"), drop = TRUE, indices = list(0L, 1L, 2L, 
    3L, 4L, 5L, 6L, 7L, 8L, 9L), group_sizes = c(1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L, labels = structure(list(
    locid = c("21NJDEP1-01464380", "21NJDEP1-01464380", "21NJDEP1-01464380", 
    "21NJDEP1-01464380", "21NJDEP1-01464380", "21NJDEP1-01464380", 
    "21NJDEP1-01464380", "21NJDEP1-01464380", "21NJDEP1-01464380", 
    "21NJDEP1-01464380"), parameter = c("Alkalinity, Carbonate as CaCO3", 
    "Alkalinity, Carbonate as CaCO3", "Alkalinity, Carbonate as CaCO3", 
    "Alkalinity, Carbonate as CaCO3", "Alkalinity, Carbonate as CaCO3", 
    "Ammonia-nitrogen as N", "Ammonia-nitrogen as N", "Ammonia-nitrogen as N", 
    "Ammonia-nitrogen as N", "Barometric pressure"), ActivityStartDate = structure(c(13902, 
    13986, 14075, 14179, 14271, 13902, 13986, 14075, 14271, 13902
    ), class = "Date")), row.names = c(NA, -10L), class = "data.frame", vars = c("locid", 
"parameter", "ActivityStartDate"), drop = TRUE, .Names = c("locid", 
"parameter", "ActivityStartDate")))

Code:

HUC_wq_summary<-HUC_wq_summary%>%
   mutate(Trib=case_when(HUC14 == "HUC02040202100060" ~ "Pennsauken",
                         HUC14 == "HUC02040202100050" ~ "Pennsauken",
                         HUC14 == "HUC02040202100020" ~ "Pennsauken",
                         HUC14 == "HUC02040202100030" ~ "Pennsauken",
                         HUC14 == "HUC02040202100040" ~ "Pennsauken",
                         HUC14 == "HUC02040202100010" ~ "Pennsauken",
                         HUC14 == "HUC02040202150010" ~ "Raccoon",
                         HUC14 == "HUC02040202150060" ~ "Raccoon",
                         HUC14 == "HUC02040202150020" ~ "Raccoon",
                         HUC14 == "HUC02040202150040" ~ "Raccoon",
                         HUC14 == "HUC02040202150050" ~ "Raccoon",
                         HUC14 == "HUC02040202150030" ~ "Raccoon",
                         HUC14 == "HUC02040202080040"~ "Rancocas",
                         HUC14 == "HUC02040202080030"~ "Rancocas",
                         HUC14 == str_detect(HUC14,"HUC020402020600"~ "Rancocas"),TRUE ~ HUC14))

As you can see most of the HUC14 strings are the same for each just different by a few numbers.

So what are you trying to use for your str_detect? It seems like this might actually be a regex question more than a case_when question, since you already have a case_when call setup — camille
you just have the syntax wrong: str_detect(HUC14,"HUC020402020600") ~ "Rancocas" — Chris
HUC14 == str_detect(HUC14,"HUC020402020600"~ "Rancocas") doesn't make sense. str_detect returns a true/false, so unless HUC14 might contain a logical value, you wouldn't be comparing those two things. You also have the ~ part of that line inside of you str_detect — camille
It would be helpful if you removed the columns of data that we don't need to address this problem, to keep the example minimal. — camille

A. Suliman A. Suliman · Accepted Answer · 2018-08-15T14:08:10

HUC_wq_summary <- HUC_wq_summary %>%          
         mutate(Trib=case_when(str_detect(HUC14,".*000[1-6]0") ~ "Pennsauken", #Search for 000 followed by a digit from 1-6 followed by 0
                               str_detect(HUC14,".*500[1-6]0" ) ~ "Raccoon", 
                               str_detect(HUC14,".*800[34]0" ) ~ "Rancocas",  #Search for 800 followed by 3 or 4 followed by 0
                               TRUE ~ HUC14))

using case_when inside dplyr's mutate to create a new column in dataframe based on conditions

2 Answers