In medical articles, 'Table 1' summarizes the demographics of the study population, usually broken down between subgroups
Generate data set
n <- 100
df <- data.frame(
age = sample(c(20:90), n, rep = T),
sex = sample(c("m", "f"), 20, rep = T, prob = c(0.55, 0.45)),
smoker = sample(c("never", "former", "active"), n, rep = T, prob = c(0.4, 0.45, 0.15)),
size = abs(rnorm(n, 20, 8)),
logitest = sample(c(TRUE, FALSE), n, rep = T, prob = c(0.1, 0.9)),
labtest = as.factor(sample(c("neg", "lo", quot;med",quot;hi"), n, rep = T, prob = c(0.4, 0.3, 0.2, 0.1))),
outcome = sample(c(0, 1), n, rep = T, prob = c(0.8, 0.2))
)
# let's introduce some missing data
for (i in (1:floor(n/6))) {
df[sample(c(1:n), 1), sample(c(1:ncol(df)), 1)] <- NA
}
head(df)
## age sex smoker size logitest labtest outcome
## 1 70 m former 39.17 NA med NA
## 2 51 f former 33.64 FALSE hi 1
## 3 58 f former 10.10 FALSE neg 1
## 4 30 m former 43.24 FALSE med 0
## 5 54 m former 22.78 FALSE lo 0
## 6 86 f former 8.20 FALSE neg 0
# df <- read.csv()
#you may need to eliminate some columns
#colnames(df)
#df0<-df #backup
#df <- df[,-c(1,...,27:38)]
Change this as needed: the column with the diagnosis has to be removed from the variables list!
dx <- 7 #index of outcome/diagnosis
####################################
summary(df[, -dx])
## age sex smoker size logitest
## Min. :20.0 f :44 active:19 Min. : 0.91 Mode :logical
## 1st Qu.:42.5 m :54 former:49 1st Qu.:15.00 FALSE:85
## Median :58.0 NA's: 2 never :30 Median :20.12 TRUE :12
## Mean :57.3 NA's : 2 Mean :20.44 NA's :3
## 3rd Qu.:74.0 3rd Qu.:27.10
## Max. :88.0 Max. :43.24
## NA's :1 NA's :2
## labtest
## hi : 4
## lo :29
## med :20
## neg :45
## NA's: 2
##
##
attach(df)
Build list of vars
vars <- colnames(df)
vars
## [1] "age" "sex" "smoker" "size" "logitest" "labtest"
## [7] "outcome"
catvars <- NULL #categorical variables
contvars <- NULL #continuous variables
logivars <- NULL #logic variables
vars <- vars[-dx]
vars
## [1] "age" "sex" "smoker" "size" "logitest" "labtest"
for (i in 1:length(vars)) {
ifelse(is.factor(df[, i]), catvars <- c(catvars, vars[i]), ifelse(is.logical(df[,
i]), logivars <- c(logivars, vars[i]), contvars <- c(contvars, vars[i])))
}
contvars
## [1] "age" "size"
catvars
## [1] "sex" "smoker" "labtest"
logivars
## [1] "logitest"
Create the subgroups
bg <- df[df[, dx] == 0 & !is.na(df[, dx]), ]
nrow(bg) #; bg
## [1] 73
mg <- df[df[, dx] == 1 & !is.na(df[, dx]), ]
nrow(mg) #; mg
## [1] 23
indet <- df[is.na(df[, dx]), ]
nrow(indet)
## [1] 4
indet
## age sex smoker size logitest labtest outcome
## 1 70 m former 39.173 NA med NA
## 9 87 m former 23.621 FALSE lo NA
## 18 65 m former 2.466 FALSE <NA> NA
## 67 88 f former 17.575 FALSE med NA
For continuous variables
normality <- NULL
for (i in 1:length(contvars)) {
j <- which(vars == contvars[i]) #find position of variable in the original data frame and its subsets
st <- shapiro.test(df[, j]) #normality testing on all patients, bg and mg alike
normality <- c(normality, st$p.value) #normality testing on all patients, bg and mg alike
}
normality
## [1] 0.00125 0.73602
ttpvalue <- NULL
for (i in 1:length(contvars)) {
j <- which(vars == contvars[i]) #find position of variable in the original data frame and its subsets
## if normal, use t-test, otherwise wilcoxon if shapiro p<.05 then pop
## likely NOT normally dist
ifelse(normality[i] < 0.05, tt <- wilcox.test(bg[, j], mg[, j]), tt <- t.test(bg[,
j], mg[, j]))
ttpvalue <- c(ttpvalue, tt$p.value) ##if t-test p<.05 then pop likely have different means
}
ttpvalue
## [1] 0.6358 0.3673
contvarlist <- list(variables = contvars, normality = normality, ttest.by.subgroup = ttpvalue)
For categorical variables
chisqpvalue <- NULL
for (i in 1:length(catvars)) {
j <- which(vars == catvars[i]) #find position of variable in the original data frame and its subsets
tbl <- table(df[, j], df[, dx])
chisqtest <- summary(tbl)
chisqpvalue <- c(chisqpvalue, chisqtest$p.value)
}
chisqpvalue
## [1] 0.01579 0.77116 0.39484
catvarlist <- list(variables = catvars, chisq.by.subgroup = chisqpvalue)
For logic variables
proppvalue <- NULL
for (i in 1:length(logivars)) {
j <- which(vars == logivars[i]) #find position of variable in the original data frame and its subsets
tbl <- table(df[, j], df[, dx])
chisqtest <- summary(tbl)
proppvalue <- c(proppvalue, chisqtest$p.value)
}
proppvalue
## [1] 0.5551
logivarlist = list(variables = logivars, chisq.by.subgroup = proppvalue)
And now, the results!
str(contvarlist) #if shapiro p<.05 then pop likely NOT normally dist; if t-test p<.05 then pop likely have different means
## List of 3
## $ variables : chr [1:2] "age" "size"
## $ normality : num [1:2] 0.00125 0.73602
## $ ttest.by.subgroup: num [1:2] 0.636 0.367
str(catvarlist) #if chisq p<.05 then variables are likely NOT independent
## List of 2
## $ variables : chr [1:3] "sex" "smoker" "labtest"
## $ chisq.by.subgroup: num [1:3] 0.0158 0.7712 0.3948
str(logivarlist) #if chisq p<.05 then variables are likely NOT independent
## List of 2
## $ variables : chr "logitest"
## $ chisq.by.subgroup: num 0.555
table
'ignores'NA
. Comparewith(df, table(fac, outcome))
andwith(df, table(fac, outcome, useNA = "always"))
orwith(df, table(fac, outcome, exclude = NULL))
. – Henrikcan't find object 'fac'
, you need to refer to your datasetdf
. – Henriktable(na.omit(mean(age)), outcome)
: when you have considered the previous comment and checked?mean
,na.rm
, you may wonder again why it does not work, and what you really wish to achieve with this table. – Henrikage
is continuous, do you want a single row with the meanage
as the rowname and the two columns for the counts the two outcomes? – Gregor Thomas