Our analyst has performed a propensity score analysis on our data. Basically, he used country, age and biological start year to "balance" the female and male population in our dataset. He has done an overlap assessment between the two groups (female & male) and looked at the linearized propensity score to see if there is "good" overlap.
Dataset:
structure(list(gender = c(0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0,
1, 0, 1, 1, 1, 0, 0, 1), country = structure(c(1L, 2L, 2L, 3L,
1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("CH", "CZ", "DK", "IS", "NL", "NO", "PT", "RO",
"SE", "SF", "SI", "TR", "UK"), class = "factor"), age = c(39,
37, 54, 33, 30, 62, 30, 48, 34, 40, 39, 41, 29, 31, 37, 27, 22,
23, 21, 31), bio_drug_name = structure(c(1L, 1L, 4L, 3L, 1L,
3L, 4L, 3L, 1L, 4L, 3L, 5L, 4L, 4L, 1L, 5L, 1L, 3L, 4L, 2L), .Label = c("adalimumab",
"certolizumab", "etanercept", "golimumab", "infliximab"), class = "factor"),
bio_drug_start_year = c(2007, 2011, 2012, 2012, 2012, 2004,
2012, 2012, 2012, 2012, 2012, 2012, 2016, 2015, 2013, 2015,
2013, 2013, 2014, 2013), asdas_crp_cii_6month = c(1, 1, 0,
1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0), bio_drug_start_year_centered = c(-8,
-4, -3, -3, -3, -11, -3, -3, -3, -3, -3, -3, 1, 0, -2, 0,
-2, -2, -1, -2), age_std = structure(c(-0.211016383746095,
-0.375088510873223, 1.01952456970737, -0.70323276512748,
-0.949340955818173, 1.67581307821588, -0.949340955818173,
0.527308188325984, -0.621196701563916, -0.12898032018253,
-0.211016383746095, -0.046944256618966, -1.03137701938174,
-0.867304892254609, -0.375088510873223, -1.19544914650887,
-1.60562946432669, -1.52359340076312, -1.68766552789025,
-0.867304892254609), .Dim = c(20L, 1L)), ID = 1:20), na.action = structure(c(`111395` = 169L,
`769107` = 2619L, `844107` = 2624L, `164325` = 2681L, `1011013` = 2728L,
`114174` = 2763L, `116484` = 2778L, `231118` = 3058L), class = "omit"), row.names = c("463",
"7729", "7756", "8306", "8324", "128", "8440", "8450", "8663",
"8809", "8840", "8857", "9020", "9033", "9101", "9324", "9377",
"9523", "9702", "9718"), class = "data.frame")
Code used to create PS-model and calculate linearized PS-score for male and females
psmod = glm( gender ~ country + age_std + bio_drug_start_year_centered, family = 'binomial', data = dat)
psmod = step(psmod, scope = list(lower = ~country + age_std + bio_drug_start_year_centered,
upper = ~(country + age_std + bio_drug_start_year_centered)^2+
poly(dat$age_std,degree=3)[,2] + poly(dat$age_std,degree=3)[,3] +
poly(dat$bio_drug_start_year_centered,degree=3)[,2] +
poly(dat$bio_drug_start_year_centered,degree=3)[,3]
),
direction='forward' )
summary(psmod)
# Predict ps-score
ps = predict(psmod, type= 'response')
lps = log(ps/(1-ps))
# Overlap assessment
par(mfrow=c(2,1))
min.lps = min(lps)
max.lps = max(lps)
hist(lps[dat$gender==0], breaks=50,main='male', xlab='Linearized ps-score', xlim=c(min.lps,max.lps))
hist(lps[dat$gender==1], breaks=50,main='female', xlab='Linearized ps-score', xlim=c(min.lps,max.lps))
Here is the output of the image
Although this is fine for him, it is not sufficient for a scientific journal. I would like to use ggplot to create a nice histogram and show overlap between the males and females. There are some nice examples over here However, since the lenghts of the linearized PS-scores differ, I am not sure how to turn this into a dataset and then use it on ggplot.