After conducting a survey, Icollected the results in the form of a dataframe. Here's a reproducible version of what the actual data frame looks like.
library(dplyr)
library(tidyr)
df=data.frame(ID=c("1101","1102","1103","1104",
"1105","1106","1107","1108",
"1109","1110","1111","1112",
"1113","1114","1115","1116",
"1117","1118","1119","1120",
"1121","1122","1123","1124",
"1125","1126","1127","1128",
"1129","1130","1131","1132",
"1133","1134","1135","1136",
"1137","1138","1139","1140",
"1141","1142","1143","1144",
"1145","1146","1147","1148",
"1149","1150","1151","1152",
"1153","1154","1155","1156"),
Country=c("US","UK","Canada","Mexico",
"India","US","Peru","China",
"US","UK","Canada","Mexico",
"Portugal","India","Portugal","Mexico",
"Peru","India","Canada","Mexico",
"India","UK","India","Canada",
"US","UK","China","India",
"US","Mexico","Canada","Mexico",
"Canada","China","Canada","Canada",
"China","China","India","Mexico",
"Portugal","Portugal","Portugal","Portugal",
"UK","UK","UK","Peru",
"Peru","Mexico","US","US",
"Peru","Mexico","Peru","Mexico"),
Gender=c("Male","Male","Male","Female",
"Female","Female","Male","Female",
"Female","Female","Male","Female",
"Male","Male","Female","Female",
"Female","Male","Female","Female",
"Female","Female","Male","Female",
"Male","Female","Male","Female",
"Female","Male","Female","Female",
"Male","Male","Male","Female",
"Male","Male","Female","Female",
"Male","Female","Male","Female",
"Male","Female","Male","Female",
"Male","Female","Male","Female",
"Male","Male","Male","Male"),
Age=c("<25","25-35","25-35","36-45",
">55",">55","25-35",">55",
"<25","25-35","25-35","36-45",
"25-35","25-35","25-35","36-45",
">55","36-45","46-55","36-45",
">55","46-55","25-35","46-55",
"<25","46-55","25-35","46-55",
"25-35","25-35","46-55","36-45",
"<25","<25",">55","36-45",
"36-45","46-55","<25","<25",
"<25",">55","36-45","46-55",
"<25",">55","36-45","46-55",
"36-45",">55","36-45","46-55",
"<25","46-55","<25","46-55"),
Score_Q1=c(4,4,3,2,
1,1,4,2,
1,1,1,2,
2,1,4,3,
4,3,1,1,
1,2,1,1,
1,4,1,4,
3,4,3,3,
1,3,3,1,
1,1,2,1,
1,2,1,2,
1,1,1,1,
2,2,2,2,
1,2,3,4),
Score_Q2=c(1,4,1,1,
1,2,1,1,
1,4,4,4,
2,1,1,3,
4,3,1,1,
1,3,3,3,
2,4,1,2,
4,4,4,4,
1,1,1,1,
1,2,3,4,
4,4,2,1,
1,2,3,2,
1,2,1,2,
4,3,2,1))
The dataframe can be split the following parts-
1) ID: A respondent ID
2) Country: Respondent's country of origin
3) Gender: The gender of the respondent
4) Age: Respondent age
5) Score_Q1: The satisfaction score for Q1, on a scale from 1
(Very satisfied) to 4
(Very dissatisfied).
6) Score_Q2: The satisfaction score for Q2, on a scale from 1
(Very satisfied) to 4
(Very dissatisfied).
First some data cleaning -
#convert to factor
df$Country=as.factor(df$Country)
df$Gender=as.factor(df$Gender)
df$Age=as.factor(df$Age)
Now I check the ratios for Age and Gender in my dataset -
Gender by Country
#1) Gender by Country
split_gender=df %>% select(Country,Gender) %>%
group_by(Gender,Country) %>%
summarise(n=n()) %>%
ungroup() %>%
select(Country,Gender,n) %>%
group_by(Country,add=TRUE) %>%
spread(Country,n)
split_gender=data.frame(apply(split_gender, 2, as.numeric))
split_gender_sample=as.data.frame(sweep(split_gender,2,colSums(split_gender),`/`))
split_gender_sample[1,1]="Female"
split_gender_sample[2,1]="Male"
Age
by Country
#2) Age by Country
split_age=df %>% select(Country,Age) %>%
group_by(Age,Country) %>%
summarise(n=n()) %>%
ungroup() %>%
select(Country,Age,n) %>%
group_by(Country,add=TRUE) %>%
spread(Country,n)
split_age=data.frame(apply(split_age, 2, as.numeric))
split_age[is.na(split_age)] <- 0
split_age_sample=as.data.frame(sweep(split_age,2,colSums(split_age),`/`))
split_age_sample[1,1]="<25"
split_age_sample[2,1]=">55"
split_age_sample[3,1]="25-35"
split_age_sample[4,1]="36-45"
split_age_sample[5,1]="46-55"
#Clean up unwanted dataframes
rm(list=c('split_age','split_gender'))
The above two steps give me two data frames - split_age_sample
& split_gender_sample
. These dataframes contain the sample ratios for age and gender by country for my 56 respondents.
My Objective: Calculating Sampling Weights Based on Population Statistics
In order to make my data frame more representative of reality, I would like to attribute weights to my respondents based on the official population ratios for age and gender by country.
These are the official population ratios I found for the countries I surveyed.
#Gender by Country
split_gender_official=data.frame(Gender=c("Female","Male"),
Canada=c(0.4,0.6),
China=c(0.3,0.7),
India=c(0.3,0.7),
Mexico=c(0.5,0.5),
Peru=c(0.6,0.4),
Portugal=c(0.5,0.5),
UK=c(0.4,0.6),
US=c(0.4,0.6))
#Age by Country
split_age_official=data.frame(Age=c("<25",">55","25-35","36-45","46-55"),
Canada=c(0.1,0.3,0.3,0.2,0.1),
China=c(0.3,0.05,0.35,0.1,0.2),
India=c(0.5,0.05,0.35,0.05,0.05),
Mexico=c(0.2,0.3,0.2,0.1,0.2),
Peru=c(0.1,0.3,0.2,0.2,0.2),
Portugal=c(0.2,0.1,0.05,0.05,0.6),
UK=c(0.2,0.3,0.1,0.3,0.1),
US=c(0.2,0.3,0.1,0.3,0.1))
Desired Output
Based on my sample ratios and the offical population ratios for both age & gender, I'd like to attribute weights to my respondents, in a separate column called weights
.
Currently I am unable to figure out how to do this calculation.
Then, once the weights are calculated, I'd like to summarize the scores using the weights
column. The aggregation would look something like this (except with the weights factored into the calculation) -
Example: Weighted aggregated scores for the UK
#Calculate weighted overall scores by Country & Gender: example UK
weighted_aggregated_scores_gender=df %>%
select(-Age) %>%
group_by(Country,Gender) %>%
filter(Country=='UK') %>%
summarise(Q1_KPI=round(sum(Score_Q1 %in% c(1,2)/n()),2),
Q2_KPI=round(sum(Score_Q2 %in% c(1,2)/n()),2))
I'd really appreciate any help I can get on the weight calculation and its usage in the weighted aggregation step that follows.