multirow contingency table in R

Question

Let's consider this data set:

df <- data.frame(age=   sample(c(20:90), 20, rep=T), 
             sex =  sample(c('m', 'f'), 20, rep=T),
             smoker=sample(c("never", "former", "active"), 20, rep=T),
             size=  sample (c(8:40), 20, rep=T),
             fac =  as.factor(sample(c("neg","lo","med","hi"), 20, rep=T)),
             outcome = sample(c(0,1), 20, rep=T)
             )
# let's introduce some missing data         
for (i in (1:3)) {df[sample(c(1:20),1),  sample(c(1:6),1)]  <- NA}

In a medical manuscript the first table summarizes the population (or its subgroups as appropriate); here the rows would be age, sex, smoking status, etc and the two outcomes would be listed in separate columns. The continuous variables are reported as means; the categorical variables as counts.

I was wondering if there is a function that I am missing that creates such contingency tables. I can do that manually but would love to be able to automatically update if the data set changes. Ultimately I need to output in latex.
the function would need to ignore missing data, but not delete those rows.

Asking too much?!

Default table 'ignores' NA. Compare with(df, table(fac, outcome)) and with(df, table(fac, outcome, useNA = "always")) or with(df, table(fac, outcome, exclude = NULL)). — Henrik
Regarding can't find object 'fac', you need to refer to your dataset df. — Henrik
Regarding table(na.omit(mean(age)), outcome): when you have considered the previous comment and checked ?mean, na.rm, you may wonder again why it does not work, and what you really wish to achieve with this table. — Henrik
I'm having trouble understanding. Since age is continuous, do you want a single row with the mean age as the rowname and the two columns for the counts the two outcomes? — Gregor Thomas

K Owen - Reinstate Monica K Owen - Reinstate Monica · Accepted Answer · 2013-08-14T02:58:15

In medical articles, 'Table 1' summarizes the demographics of the study population, usually broken down between subgroups

Generate data set

n <- 100
df <- data.frame(
age = sample(c(20:90), n, rep = T), 
sex = sample(c("m", "f"), 20, rep = T, prob = c(0.55, 0.45)), 
smoker = sample(c("never", "former", "active"), n, rep = T, prob = c(0.4, 0.45, 0.15)), 
size = abs(rnorm(n, 20, 8)), 
logitest = sample(c(TRUE, FALSE), n, rep = T, prob = c(0.1, 0.9)), 
labtest = as.factor(sample(c("neg", "lo", quot;med",quot;hi"), n, rep = T, prob = c(0.4, 0.3, 0.2, 0.1))), 
outcome = sample(c(0, 1), n, rep = T, prob = c(0.8, 0.2))
)

# let's introduce some missing data
for (i in (1:floor(n/6))) {
    df[sample(c(1:n), 1), sample(c(1:ncol(df)), 1)] <- NA
}
head(df)

##   age sex smoker  size logitest labtest outcome
## 1  70   m former 39.17       NA     med      NA
## 2  51   f former 33.64    FALSE      hi       1
## 3  58   f former 10.10    FALSE     neg       1
## 4  30   m former 43.24    FALSE     med       0
## 5  54   m former 22.78    FALSE      lo       0
## 6  86   f former  8.20    FALSE     neg       0

# df <- read.csv() 
#you may need to eliminate some columns    
#colnames(df) 
#df0<-df #backup 
#df <- df[,-c(1,...,27:38)]

Change this as needed: the column with the diagnosis has to be removed from the variables list!

dx <- 7  #index of outcome/diagnosis
####################################
summary(df[, -dx])

##       age         sex        smoker        size        logitest      
##  Min.   :20.0   f   :44   active:19   Min.   : 0.91   Mode :logical  
##  1st Qu.:42.5   m   :54   former:49   1st Qu.:15.00   FALSE:85       
##  Median :58.0   NA's: 2   never :30   Median :20.12   TRUE :12       
##  Mean   :57.3             NA's  : 2   Mean   :20.44   NA's :3        
##  3rd Qu.:74.0                         3rd Qu.:27.10                  
##  Max.   :88.0                         Max.   :43.24                  
##  NA's   :1                            NA's   :2                      
##  labtest  
##  hi  : 4  
##  lo  :29  
##  med :20  
##  neg :45  
##  NA's: 2  
##           
##

attach(df)

Build list of vars

vars <- colnames(df)
vars

## [1] "age"      "sex"      "smoker"   "size"     "logitest" "labtest" 
## [7] "outcome"

catvars <- NULL  #categorical variables
contvars <- NULL  #continuous variables
logivars <- NULL  #logic variables

vars <- vars[-dx]
vars

## [1] "age"      "sex"      "smoker"   "size"     "logitest" "labtest"

for (i in 1:length(vars)) {
    ifelse(is.factor(df[, i]), catvars <- c(catvars, vars[i]), ifelse(is.logical(df[, 
        i]), logivars <- c(logivars, vars[i]), contvars <- c(contvars, vars[i])))
}
contvars

## [1] "age"  "size"

catvars

## [1] "sex"     "smoker"  "labtest"

logivars

## [1] "logitest"

Create the subgroups

bg <- df[df[, dx] == 0 & !is.na(df[, dx]), ]
nrow(bg)  #; bg

## [1] 73

mg <- df[df[, dx] == 1 & !is.na(df[, dx]), ]
nrow(mg)  #; mg

## [1] 23

indet <- df[is.na(df[, dx]), ]
nrow(indet)

## [1] 4

indet

##    age sex smoker   size logitest labtest outcome
## 1   70   m former 39.173       NA     med      NA
## 9   87   m former 23.621    FALSE      lo      NA
## 18  65   m former  2.466    FALSE    <NA>      NA
## 67  88   f former 17.575    FALSE     med      NA

For continuous variables

normality <- NULL
for (i in 1:length(contvars)) {
    j <- which(vars == contvars[i])  #find position of variable in the original data frame and its subsets
    st <- shapiro.test(df[, j])  #normality testing on all patients, bg and mg alike
    normality <- c(normality, st$p.value)  #normality testing on all patients, bg and mg alike
}
normality

## [1] 0.00125 0.73602

ttpvalue <- NULL
for (i in 1:length(contvars)) {
    j <- which(vars == contvars[i])  #find position of variable in the original data frame and its subsets
    ## if normal, use t-test, otherwise wilcoxon if shapiro p<.05 then pop
    ## likely NOT normally dist
    ifelse(normality[i] < 0.05, tt <- wilcox.test(bg[, j], mg[, j]), tt <- t.test(bg[, 
        j], mg[, j]))
    ttpvalue <- c(ttpvalue, tt$p.value)  ##if t-test p<.05 then pop likely have different means
}
ttpvalue

## [1] 0.6358 0.3673

contvarlist <- list(variables = contvars, normality = normality, ttest.by.subgroup = ttpvalue)

For categorical variables

chisqpvalue <- NULL
for (i in 1:length(catvars)) {
    j <- which(vars == catvars[i])  #find position of variable in the original data frame and its subsets
    tbl <- table(df[, j], df[, dx])
    chisqtest <- summary(tbl)
    chisqpvalue <- c(chisqpvalue, chisqtest$p.value)
}
chisqpvalue

## [1] 0.01579 0.77116 0.39484

catvarlist <- list(variables = catvars, chisq.by.subgroup = chisqpvalue)

For logic variables

proppvalue <- NULL
for (i in 1:length(logivars)) {
    j <- which(vars == logivars[i])  #find position of variable in the original data frame and its subsets
    tbl <- table(df[, j], df[, dx])
    chisqtest <- summary(tbl)
    proppvalue <- c(proppvalue, chisqtest$p.value)
}
proppvalue

## [1] 0.5551

logivarlist = list(variables = logivars, chisq.by.subgroup = proppvalue)

And now, the results!

str(contvarlist)  #if shapiro p<.05 then pop likely NOT normally dist; if t-test p<.05 then pop likely have different means

## List of 3
##  $ variables        : chr [1:2] "age" "size"
##  $ normality        : num [1:2] 0.00125 0.73602
##  $ ttest.by.subgroup: num [1:2] 0.636 0.367

str(catvarlist)  #if chisq p<.05 then variables are likely NOT independent

## List of 2
##  $ variables        : chr [1:3] "sex" "smoker" "labtest"
##  $ chisq.by.subgroup: num [1:3] 0.0158 0.7712 0.3948

str(logivarlist)  #if chisq p<.05 then variables are likely NOT independent

## List of 2
##  $ variables        : chr "logitest"
##  $ chisq.by.subgroup: num 0.555