0
votes

I have two datasets for two different years (2008 and 2009). The idea is to identify new molecules by looking at their Sales_Units and Dollar_Value. If a molecule had no sales units and no dollar value in 2008 but has positive sales units and a positive dollar value in 2009, I want to identify it as a new molecule. I thought that generating an indicator variable called New_Molecule, which takes the value 1 for a new molecule and 0 otherwise, would be a good way to do this.

######YEAR 2008 data##########
    # 2008 sample data: molecules A-D, listed for the US first and then Canada.
    Year <- rep("2008", times = 8)
    Country <- rep(c("US", "Canada"), each = 4)
    Molecule <- rep(c("A", "B", "C", "D"), times = 2)
    # One value per (Country, Molecule) row, in the same order as above.
    Dollar_Value <- c(
      0, 0, 100, 200,
      75, 0, 0, 0
    )
    Sales_Units <- c(
      0, 0, 20, 40,
      5, 0, 0, 0
    )
    df_2008 <- data.frame(Year, Country, Molecule, Dollar_Value, Sales_Units)

######YEAR 2009 data##########

    # 2009 sample data: molecules A-E, listed for the US first and then Canada.
    Year <- rep("2009", times = 10)
    Country <- rep(c("US", "Canada"), each = 5)
    Molecule <- rep(c("A", "B", "C", "D", "E"), times = 2)
    # One value per (Country, Molecule) row, in the same order as above.
    Dollar_Value <- c(
      500, 0, 100, 200, 0,
      75, 0, 0, 99, 0
    )
    Sales_Units <- c(
      60, 0, 20, 40, 0,
      5, 0, 0, 27, 0
    )
    df_2009 <- data.frame(Year, Country, Molecule, Dollar_Value, Sales_Units)

######Want to generate This##########

    # Desired result: the 2009 rows plus a New_Molecule indicator that is 1 when
    # the same Country/Molecule had zero sales in 2008 but positive sales in 2009.
    Year <- c("2009", "2009", "2009", "2009","2009","2009", "2009", "2009", "2009","2009")
    Country <- c("US", "US","US", "US","US", "Canada", "Canada","Canada", "Canada","Canada")
    Molecule <- c("A", "B", "C", "D", "E","A", "B", "C", "D", "E")
    Dollar_Value <- c(500, 0, 100, 200,0, 75, 0, 0 ,99,0)
    Sales_Units <- c(60, 0, 20, 40,0,5, 0, 0, 27,0)
    # Flagged rows: US/A and Canada/D.
    New_Molecule <- c(1, 0, 0, 0,0,0,0,0,1,0)
    # Country is included so the two rows for each molecule can be told apart.
    df_2009_NewColumn <- data.frame(Year, Country, Molecule, Dollar_Value, Sales_Units, New_Molecule)

What I have tried: first I tried to group both datasets by Year, Country, and Molecule and then use mutate.

# Group each year's data by Year, Country and Molecule.
df_2008 <- group_by(df_2008,Year,Country,Molecule)
df_2009 <- group_by(df_2009,Year,Country,Molecule)

# Attempt to flag new molecules by comparing the two data frames directly.
# NOTE: this fails as written. `df_2008$...` and `df_2009$...` extract whole
# columns (8 and 10 values respectively) regardless of the grouping, so the
# condition is recycled to length 10 while mutate() expects one value per
# single-row group. The two frames are also compared by position rather than
# being matched on Country and Molecule.
withnew <- mutate(df_2009, New_Molecule = case_when(df_2008$Dollar_Value ==0 & df_2008$Sales_Units ==0 & df_2009$Dollar_Value >0 & df_2009$Sales_Units >0 ~1,
                                  TRUE~0))

but this gives an error message:

Error: Column `New_Molecule` must be length 1 (the group size), not 10
In addition: Warning message:
In df_2008$Dollar_Value == 0 & df_2008$Sales_Units == 0 & df_2009$Dollar_Value >  :
  longer object length is not a multiple of shorter object length

Then I tried mutate on its own, without grouping, but it does not generate the indicator variable the way I need it.

1

1 Answer

0
votes

This is easier if you put your data together into wide format using a right_join. That way, you can reference all the variables that are now in the same row to make your comparison with an ifelse:

# right_join keeps every 2009 row and attaches the 2008 figures for the same
# Country and Molecule; columns present in both inputs get a year suffix.
right_join(df_2008, df_2009,
           by = c("Country", "Molecule"),
           suffix = c("_2008", "_2009")) %>%
  # The comparison below is purely row-wise, so no grouping is needed: drop any
  # grouping carried over from the inputs and work on whole columns at once.
  ungroup() %>%
  as_tibble() %>%
  # A molecule with no 2008 record at all (such as "E") has NA in the *_2008
  # columns. coalesce() treats that as zero sales, so the flag is always 0 or 1
  # and a molecule first seen in 2009 with positive sales is flagged as new.
  mutate(New_Molecule = ifelse(coalesce(Dollar_Value_2008, 0) == 0 &
                               coalesce(Sales_Units_2008, 0)  == 0 &
                               Dollar_Value_2009 > 0 &
                               Sales_Units_2009  > 0, 1, 0)) %>%
  # Keep only the 2009 columns plus the new indicator.
  transmute(Year = Year_2009, Country = Country, Molecule = Molecule,
            Dollar_Value = Dollar_Value_2009, Sales_Units = Sales_Units_2009,
            New_Molecule = New_Molecule)
#> # A tibble: 10 x 6
#>    Year  Country Molecule Dollar_Value Sales_Units New_Molecule
#>    <fct> <fct>   <chr>           <dbl>       <dbl>        <dbl>
#>  1 2009  US      A                 500          60            1
#>  2 2009  US      B                   0           0            0
#>  3 2009  US      C                 100          20            0
#>  4 2009  US      D                 200          40            0
#>  5 2009  US      E                   0           0            0
#>  6 2009  Canada  A                  75           5            0
#>  7 2009  Canada  B                   0           0            0
#>  8 2009  Canada  C                   0           0            0
#>  9 2009  Canada  D                  99          27            1
#> 10 2009  Canada  E                   0           0            0