1
votes

I need to create a goal variable that is 1 if the number of cases where dummy$ciiu_comparado == 1 is greater than 50% of the total cases in the group, and 0 otherwise.

17/26=0.65 -> 1

The target will be the goal variable.

Note: Consider grouping by year and id.

Data

# Example data for the question: 29 rows, already grouped by `year` and `id`
# (class "grouped_df"). It is built with structure() so the object can be
# created without attaching any package; printing it as a tibble needs dplyr.
#
# Columns:
#   year, id       - grouping keys, carrying Stata labels/formats as attributes
#   n_shareholder  - integer column (3 for the 2020 group, 26 for the 2019 one)
#   dummy          - a one-column tibble stored as a column; the 0/1 indicator
#                    is reached as dummy$ciiu_comparado
#   n_dummy        - integer column supplied with the question
#   goal           - the expected result (the target the answers must match)
db <- structure(
  list(
    year = structure(
      rep(c("2020", "2019"), times = c(3L, 26L)),
      # The original dput showed this label as mojibake; "\u00d1" is the
      # letter N with tilde, so the label reads "ANO" with a tilde on the N.
      label = "A\u00d1O",
      format.stata = "%9s"
    ),
    id = structure(
      rep(c(732437, 178036), times = c(3L, 26L)),
      label = "EXPEDIENTE",
      format.stata = "%12.0g"
    ),
    n_shareholder = rep(c(3L, 26L), times = c(3L, 26L)),
    dummy = structure(
      list(
        ciiu_comparado = c(
          0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
          1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1
        )
      ),
      class = c("tbl_df", "tbl", "data.frame"),
      row.names = c(NA, -29L)
    ),
    n_dummy = c(
      3L, 3L, 3L, 17L, 17L, 9L, 17L, 9L, 9L, 9L, 17L, 17L, 17L, 9L,
      17L, 17L, 9L, 17L, 17L, 9L, 17L, 9L, 17L, 17L, 17L, 17L,
      17L, 9L, 17L
    ),
    goal = rep(c(0, 1), times = c(3L, 26L))
  ),
  row.names = c(NA, -29L),
  # Grouping metadata: one row per (year, id) group, with `.rows` listing the
  # row indices that belong to each group.
  groups = structure(
    list(
      year = structure(
        c("2019", "2020"),
        label = "A\u00d1O",
        format.stata = "%9s"
      ),
      id = structure(
        c(178036, 732437),
        label = "EXPEDIENTE",
        format.stata = "%12.0g"
      ),
      .rows = structure(
        list(4:29, 1:3),
        ptype = integer(0),
        class = c("vctrs_list_of", "vctrs_vctr", "list")
      )
    ),
    row.names = c(NA, -2L),
    class = c("tbl_df", "tbl", "data.frame"),
    .drop = TRUE
  ),
  class = c("grouped_df", "tbl_df", "tbl", "data.frame")
)

# A tibble: 29 x 6
# Groups:   year, id [2]
   year      id n_shareholder dummy$ciiu_comparado n_dummy  goal
   <chr>  <dbl>         <int>                <dbl>   <int> <dbl>
 1 2020  732437             3                    0       3     0
 2 2020  732437             3                    0       3     0
 3 2020  732437             3                    0       3     0
 4 2019  178036            26                    1      17     1
 5 2019  178036            26                    1      17     1
 6 2019  178036            26                    0       9     1
 7 2019  178036            26                    1      17     1
 8 2019  178036            26                    0       9     1
 9 2019  178036            26                    0       9     1
10 2019  178036            26                    0       9     1
# ... with 19 more rows
2

2 Answers

1
votes

The following creates the dummy as the question defines it.

  1. The comparison dummy$ciiu_comparado == 1 returns FALSE/TRUE, internally coded as 0/1;
  2. sum(<logical>) gets the total 1's;
  3. and n() is the group's number of rows.
  4. Then, check if the result is greater than the threshold value 0.5.

Output omitted.

library(dplyr)

# Two-step version: first the share of rows in each (year, id) group whose
# indicator equals 1, then the 0/1 flag for "share above one half".
db %>%
  group_by(year, id) %>%
  mutate(
    # sum() over a logical vector counts the TRUEs; n() is the group size.
    goal = sum(dummy$ciiu_comparado == 1) / n(),
    # Overwrite the share with the integer flag.
    goal = as.integer(goal > 0.5)
  )

The goal can be computed in one instruction.

# One-step version: the indicator is already 0/1, so its sum is the number of
# 1's in the group; dividing by n() gives the share, and the comparison is
# converted straight to an integer flag.
db %>%
  group_by(year, id) %>%
  mutate(
    goal = as.integer(sum(dummy$ciiu_comparado) / n() > 0.5)
  )
1
votes

You can do this:

library(dplyr)

# Flag each (year, id) group with 1 when more than half of its rows have
# dummy$ciiu_comparado equal to 1, and 0 otherwise; the result is returned
# ungrouped.
db %>%
  group_by(year, id) %>%
  # n() is the size of the current group. nrow(.) would be the row count of
  # the whole table piped into mutate(), so every group would be compared
  # against half of the full data set instead of half of its own rows.
  mutate(new_goal = as.numeric(sum(dummy$ciiu_comparado) > 0.5 * n())) %>%
  ungroup()