R Calculate sample weights and weighted aggregation based on reference table

Question

After conducting a survey, Icollected the results in the form of a dataframe. Here's a reproducible version of what the actual data frame looks like.

library(dplyr)
library(tidyr)
df=data.frame(ID=c("1101","1102","1103","1104",
               "1105","1106","1107","1108",
               "1109","1110","1111","1112",
               "1113","1114","1115","1116",
               "1117","1118","1119","1120",
               "1121","1122","1123","1124",
               "1125","1126","1127","1128",
               "1129","1130","1131","1132",
               "1133","1134","1135","1136",
               "1137","1138","1139","1140",
               "1141","1142","1143","1144",
               "1145","1146","1147","1148",
               "1149","1150","1151","1152",
               "1153","1154","1155","1156"),
          Country=c("US","UK","Canada","Mexico",
                    "India","US","Peru","China",
                    "US","UK","Canada","Mexico",
                    "Portugal","India","Portugal","Mexico",
                    "Peru","India","Canada","Mexico",
                    "India","UK","India","Canada",
                    "US","UK","China","India",
                    "US","Mexico","Canada","Mexico",
                    "Canada","China","Canada","Canada",
                    "China","China","India","Mexico",
                    "Portugal","Portugal","Portugal","Portugal",
                    "UK","UK","UK","Peru",
                    "Peru","Mexico","US","US",
                    "Peru","Mexico","Peru","Mexico"),
          Gender=c("Male","Male","Male","Female",
                    "Female","Female","Male","Female",
                    "Female","Female","Male","Female",
                    "Male","Male","Female","Female",
                    "Female","Male","Female","Female",
                    "Female","Female","Male","Female",
                    "Male","Female","Male","Female",
                    "Female","Male","Female","Female",
                    "Male","Male","Male","Female",
                    "Male","Male","Female","Female",
                    "Male","Female","Male","Female",
                    "Male","Female","Male","Female",
                    "Male","Female","Male","Female",
                    "Male","Male","Male","Male"),
          Age=c("<25","25-35","25-35","36-45",
                ">55",">55","25-35",">55",
                "<25","25-35","25-35","36-45",
                "25-35","25-35","25-35","36-45",
                ">55","36-45","46-55","36-45",
                ">55","46-55","25-35","46-55",
                "<25","46-55","25-35","46-55",
                "25-35","25-35","46-55","36-45",
                "<25","<25",">55","36-45",
                "36-45","46-55","<25","<25",
                "<25",">55","36-45","46-55",
                "<25",">55","36-45","46-55",
                "36-45",">55","36-45","46-55",
                "<25","46-55","<25","46-55"),
          Score_Q1=c(4,4,3,2,
                  1,1,4,2,
                  1,1,1,2,
                  2,1,4,3,
                  4,3,1,1,
                  1,2,1,1,
                  1,4,1,4,
                  3,4,3,3,
                  1,3,3,1,
                  1,1,2,1,
                  1,2,1,2,
                  1,1,1,1,
                  2,2,2,2,
                  1,2,3,4),
          Score_Q2=c(1,4,1,1,
                     1,2,1,1,
                     1,4,4,4,
                     2,1,1,3,
                     4,3,1,1,
                     1,3,3,3,
                     2,4,1,2,
                     4,4,4,4,
                     1,1,1,1,
                     1,2,3,4,
                     4,4,2,1,
                     1,2,3,2,
                     1,2,1,2,
                     4,3,2,1))

The dataframe can be split the following parts-

1) ID: A respondent ID

2) Country: Respondent's country of origin

3) Gender: The gender of the respondent

4) Age: Respondent age

5) Score_Q1: The satisfaction score for Q1, on a scale from 1 (Very satisfied) to 4(Very dissatisfied).

6) Score_Q2: The satisfaction score for Q2, on a scale from 1(Very satisfied) to 4(Very dissatisfied).

First some data cleaning -

#convert to factor
df$Country=as.factor(df$Country)
df$Gender=as.factor(df$Gender)
df$Age=as.factor(df$Age)

Now I check the ratios for Age and Gender in my dataset -

Gender by Country

#1) Gender by Country
split_gender=df %>% select(Country,Gender) %>%
  group_by(Gender,Country) %>%
  summarise(n=n()) %>%
  ungroup() %>%
  select(Country,Gender,n) %>%
  group_by(Country,add=TRUE) %>%
  spread(Country,n)

split_gender=data.frame(apply(split_gender, 2, as.numeric))
split_gender_sample=as.data.frame(sweep(split_gender,2,colSums(split_gender),`/`))
split_gender_sample[1,1]="Female"
split_gender_sample[2,1]="Male"

Age by Country

#2) Age by Country
split_age=df %>% select(Country,Age) %>%
  group_by(Age,Country) %>%
  summarise(n=n()) %>%
  ungroup() %>%
  select(Country,Age,n) %>%
  group_by(Country,add=TRUE) %>%
  spread(Country,n)

split_age=data.frame(apply(split_age, 2, as.numeric))
split_age[is.na(split_age)] <- 0
split_age_sample=as.data.frame(sweep(split_age,2,colSums(split_age),`/`))
split_age_sample[1,1]="<25"
split_age_sample[2,1]=">55"
split_age_sample[3,1]="25-35"
split_age_sample[4,1]="36-45"
split_age_sample[5,1]="46-55"

#Clean up unwanted dataframes
rm(list=c('split_age','split_gender'))

The above two steps give me two data frames - split_age_sample & split_gender_sample. These dataframes contain the sample ratios for age and gender by country for my 56 respondents.

My Objective: Calculating Sampling Weights Based on Population Statistics

In order to make my data frame more representative of reality, I would like to attribute weights to my respondents based on the official population ratios for age and gender by country.

These are the official population ratios I found for the countries I surveyed.

#Gender by Country
split_gender_official=data.frame(Gender=c("Female","Male"),
                                 Canada=c(0.4,0.6),
                                 China=c(0.3,0.7),
                                 India=c(0.3,0.7),
                                 Mexico=c(0.5,0.5),
                                 Peru=c(0.6,0.4),
                                 Portugal=c(0.5,0.5),
                                 UK=c(0.4,0.6),
                                 US=c(0.4,0.6))
#Age by Country
split_age_official=data.frame(Age=c("<25",">55","25-35","36-45","46-55"),
                                 Canada=c(0.1,0.3,0.3,0.2,0.1),
                                 China=c(0.3,0.05,0.35,0.1,0.2),
                                 India=c(0.5,0.05,0.35,0.05,0.05),
                                 Mexico=c(0.2,0.3,0.2,0.1,0.2),
                                 Peru=c(0.1,0.3,0.2,0.2,0.2),
                                 Portugal=c(0.2,0.1,0.05,0.05,0.6),
                                 UK=c(0.2,0.3,0.1,0.3,0.1),
                                 US=c(0.2,0.3,0.1,0.3,0.1))

Desired Output

Based on my sample ratios and the offical population ratios for both age & gender, I'd like to attribute weights to my respondents, in a separate column called weights.

Currently I am unable to figure out how to do this calculation.

Then, once the weights are calculated, I'd like to summarize the scores using the weights column. The aggregation would look something like this (except with the weights factored into the calculation) -

Example: Weighted aggregated scores for the UK

#Calculate weighted overall scores by Country & Gender: example UK
weighted_aggregated_scores_gender=df %>%
  select(-Age) %>%
  group_by(Country,Gender) %>%
  filter(Country=='UK') %>%
  summarise(Q1_KPI=round(sum(Score_Q1 %in% c(1,2)/n()),2),
            Q2_KPI=round(sum(Score_Q2 %in% c(1,2)/n()),2))

I'd really appreciate any help I can get on the weight calculation and its usage in the weighted aggregation step that follows.

Santiago Capobianco Santiago Capobianco · Accepted Answer · 2019-02-21T22:03:21

Not sure if its exactly what you are looking for, but here is what I figure out. You need to merge the national weights with your dataframe, and then you can calculate the KPI.

> # Reshape national weights

> Nombres <- cbind.data.frame("Country" = colnames(split_gender_official)[colnames(split_gender_official) != "Gender"],
+                              "time" = 1:length(colnames(split_gender_official)[colnames(split_gender_official) != "Gender"]))
> Nombres$Country <- as.character(Nombres$Country)
> 
> split_gender_official_resh <- reshape(split_gender_official, direction = "long", varying = Nombres$Country, v.names = "Weights_gend")
> split_age_official_resh <- reshape(split_age_official, direction = "long", varying = Nombres$Country, v.names = "Weights_age")
> 
> split_gender_official_resh$id <- NULL
> split_age_official_resh$id <- NULL
> 
> split_gender_official_resh <- merge(split_gender_official_resh, Nombres, by = "time", all.x = TRUE)
> split_age_official_resh <- merge(split_age_official_resh, Nombres, by = "time", all.x = TRUE)
> 
> split_gender_official_resh$time <- NULL
> split_age_official_resh$time <- NULL

> # Merge weights with df

> df <- merge(df, split_gender_official_resh, by = c("Gender", "Country"), all.x = TRUE)
> df <- merge(df, split_age_official_resh, by = c("Age", "Country"), all.x = TRUE)
> 

> # Print tables
>
> # Without weights
>
> prop.table(table(df$Gender, df$Country), 2)

            Canada     China     India    Mexico      Peru  Portugal        UK        US
  Female 0.5000000 0.2000000 0.5714286 0.7000000 0.3333333 0.5000000 0.5714286 0.5714286
  Male   0.5000000 0.8000000 0.4285714 0.3000000 0.6666667 0.5000000 0.4285714 0.4285714
> prop.table(table(df$Age, df$Country), 2)

           Canada     China     India    Mexico      Peru  Portugal        UK        US
  <25   0.1250000 0.2000000 0.1428571 0.1000000 0.3333333 0.1666667 0.1428571 0.4285714
  >55   0.1250000 0.2000000 0.2857143 0.1000000 0.1666667 0.1666667 0.1428571 0.1428571
  25-35 0.2500000 0.2000000 0.2857143 0.1000000 0.1666667 0.3333333 0.2857143 0.1428571
  36-45 0.1250000 0.2000000 0.1428571 0.5000000 0.1666667 0.1666667 0.1428571 0.1428571
  46-55 0.3750000 0.2000000 0.1428571 0.2000000 0.1666667 0.1666667 0.2857143 0.1428571
> 
> # With weights
> prop.table(xtabs(Weights_gend ~ Gender + Country, df), 2)
        Country
Gender       Canada      China      India     Mexico       Peru   Portugal         UK         US
  Female 0.40000000 0.09677419 0.36363636 0.70000000 0.42857143 0.50000000 0.47058824 0.47058824
  Male   0.60000000 0.90322581 0.63636364 0.30000000 0.57142857 0.50000000 0.52941176 0.52941176
> prop.table(xtabs(Weights_age ~ Gender + Country, df), 2)
        Country
Gender      Canada     China     India    Mexico      Peru  Portugal        UK        US
  Female 0.3333333 0.0500000 0.4642857 0.6250000 0.4545455 0.7142857 0.5000000 0.5000000
  Male   0.6666667 0.9500000 0.5357143 0.3750000 0.5454545 0.2857143 0.5000000 0.5000000
> 
> #  Means with weights and scores
> tapply(df$Score_Q1 * df$Weights_gend, list(df$Gender, df$Country), mean)
       Canada China    India   Mexico Peru  Portugal  UK  US
Female    0.6  0.60 0.600000 1.000000  1.5 1.3333333 0.8 0.7
Male      1.2  1.05 1.166667 1.666667  1.0 0.6666667 1.2 1.4
> tapply(df$Score_Q1 * df$Weights_age, list(df$Age, df$Country), mean)
         Canada China India Mexico Peru Portugal   UK  US
<25   0.1000000  0.90  1.00   0.20  0.2     0.20 0.20 0.4
>55   0.9000000  0.10  0.05   0.60  1.2     0.20 0.30 0.3
25-35 0.6000000  0.35  0.35   0.80  0.8     0.15 0.25 0.3
36-45 0.2000000  0.10  0.15   0.22  0.4     0.05 0.30 0.6
46-55 0.1666667  0.20  0.20   0.60  0.2     1.20 0.30 0.2
>

Hope it helps.

R Calculate sample weights and weighted aggregation based on reference table

1 Answers