1
votes

I'm trying to write a function that calculates descriptive statistics for both numeric and categorical variables (factors). For numeric variables, it should calculate the mean (MEAN), median (MEDIAN), and standard deviation (SD), and count the number of missing values (NMiss). For categorical variables, it should tabulate the count within each level of the variable and count the number of missing values.

The starting input data is:

   ID GLUC TGL HDL LDL  HRT MAMM SMOKE
1   A   88  NA  32  99    Y <NA>  ever
2   B   NA 150  60  NA <NA>   no never
3   C  110  NA  NA 120    N <NA>  <NA>
4   D   NA 200  65 165 <NA>  yes never

And I'd like it to come out looking like this:

> table1 (dat=patient, numvar=c("TGL", "HDL", "LDL"), charvar=c("HRT", "MAMM"))
$numericStats
 varName MEAN   MEDIAN SD      NMiss
1 TGL 180.66667 180.0 23.03620 4
2 HDL 55.66667  62.5  19.00175 4
3 LDL 160.28571 165.0 40.06126 3
$FactorStats
  varName group  count
1   HRT   N       2
2         Y       3
3         NMiss   5
4   MAMM  no      2
5         yes     4
6        NMiss    4

This is the code I have so far:

#numericstats
    # Summarise numeric columns of a data frame.
    #
    # dat:    data frame holding the columns to summarise.
    # numvar: character vector naming the numeric columns of `dat`.
    #
    # Returns a data frame with one row per entry of `numvar` and the columns
    # varname, mean, median, sd (each rounded to 2 decimals) and nmissing
    # (the number of NA values in the column). An empty `numvar` gives an
    # empty data frame. Stops if a requested column is not present in `dat`.
    findnum <- function(dat, numvar) {
      unknown <- setdiff(numvar, names(dat))
      if (length(unknown) > 0) {
        stop("columns not found in `dat`: ",
             paste(unknown, collapse = ", "), call. = FALSE)
      }

      # One single-row data frame per variable; iterating over the names
      # themselves (not over length(numvar)) visits every variable.
      rows <- lapply(numvar, function(v) {
        var_select <- dat[[v]]
        data.frame(
          varname = v,
          mean = round(mean(var_select, na.rm = TRUE), 2),
          median = round(median(var_select, na.rm = TRUE), 2),
          sd = round(sd(var_select, na.rm = TRUE), 2),
          nmissing = sum(is.na(var_select)),
          stringsAsFactors = FALSE
        )
      })

      # do.call(rbind, list()) is NULL, so handle the empty case explicitly.
      if (length(rows) == 0) {
        return(data.frame())
      }
      do.call(rbind, rows)
    }
    # Example call: summarise the TGL, HDL and LDL columns of `patient`.
    findnum(dat=patient, numvar=c("TGL","HDL","LDL"))
    
    #factorstats
    # Tabulate categorical columns of a data frame.
    #
    # dat:     data frame holding the columns to tabulate.
    # charvar: character vector naming the categorical columns of `dat`.
    #
    # Returns a data frame with the columns varname, group and count. Each
    # variable contributes one row per observed level followed by one
    # "NMiss" row holding its number of NA values. An empty `charvar` gives
    # an empty data frame. Stops if a requested column is not present in
    # `dat`.
    findfactor <- function(dat, charvar) {
      unknown <- setdiff(charvar, names(dat))
      if (length(unknown) > 0) {
        stop("columns not found in `dat`: ",
             paste(unknown, collapse = ", "), call. = FALSE)
      }

      rows <- lapply(charvar, function(v) {
        var_select <- dat[[v]]
        # table() leaves NA out by default, so missing values are counted
        # separately and appended as their own row.
        level_counts <- table(var_select)
        data.frame(
          varname = v,
          group = c(names(level_counts), "NMiss"),
          count = c(as.vector(level_counts), sum(is.na(var_select))),
          stringsAsFactors = FALSE
        )
      })

      # do.call(rbind, list()) is NULL, so handle the empty case explicitly.
      if (length(rows) == 0) {
        return(data.frame())
      }
      do.call(rbind, rows)
    }
    # Example call: tabulate the MAMM and SMOKE columns of `patient`.
    findfactor(dat=patient, charvar=c("MAMM","SMOKE"))
    
    #full function
    # Build the combined descriptive-statistics report.
    #
    # dat:     data frame holding the columns to describe.
    # numvar:  character vector naming the numeric columns to summarise.
    # charvar: character vector naming the categorical columns to tabulate.
    #
    # Returns a list with two elements: numericStats, the result of
    # findnum(dat, numvar), and FactorStats, the result of
    # findfactor(dat, charvar). A list is used because the two tables have
    # different columns and row counts and cannot share one data frame.
    table1 <- function(dat, numvar, charvar) {
      # The caller states which columns are numeric and which are
      # categorical, so no per-column type test or loop is needed here.
      list(
        numericStats = findnum(dat, numvar),
        FactorStats = findfactor(dat, charvar)
      )
    }
1

1 Answers

2
votes

Here is one way using `lapply`:

# Descriptive statistics for numeric and categorical columns.
#
# df:      data frame holding the columns to describe.
# numvar:  character vector naming the numeric columns to summarise.
# charvar: character vector naming the categorical columns to tabulate.
#
# Returns a list with two data frames:
#   numericStats - one row per entry of `numvar` with VarName, MEAN, MEDIAN,
#                  SD (all computed with NA removed) and NMiss, the number of
#                  NA values in the column.
#   FactorStats  - for each entry of `charvar`, one row per observed level
#                  plus an "Nmiss" row with the NA count, in the columns
#                  Varname, group and count.
table1 <- function(df, numvar, charvar) {
  numeric_rows <- lapply(df[numvar], function(x) {
    data.frame(
      MEAN = mean(x, na.rm = TRUE),
      MEDIAN = median(x, na.rm = TRUE),
      SD = sd(x, na.rm = TRUE),
      # Count the missing values: is.na(), not !is.na().
      NMiss = sum(is.na(x))
    )
  })

  factor_rows <- lapply(charvar, function(x) {
    # table() leaves NA out, so the NA count is appended as its own entry;
    # stack() turns the named counts into (values, ind) and [2:1] puts the
    # label column first.
    tab <- stack(c(table(df[[x]]), Nmiss = sum(is.na(df[[x]]))))[2:1]
    names(tab) <- c('group', 'count')
    cbind(Varname = x, tab)
  })

  list(
    numericStats = cbind(VarName = numvar, do.call(rbind, numeric_rows)),
    FactorStats = do.call(rbind, factor_rows)
  )
}

# Example call: numeric summary of TGL, HDL and LDL, and level counts for
# HRT and MAMM, on the `patient` data.
table1(patient, numvar=c("TGL", "HDL", "LDL"), charvar=c("HRT", "MAMM"))

#$numericStats
#    VarName  MEAN MEDIAN   SD NMiss
#TGL     TGL 175.0    175 35.4     2
#HDL     HDL  52.3     60 17.8     3
#LDL     LDL 128.0    120 33.7     3

#$FactorStats
#  Varname group count
#1     HRT     N     1
#2     HRT     Y     1
#3     HRT Nmiss     2
#4    MAMM    no     1
#5    MAMM   yes     1
#6    MAMM Nmiss     2

data

# Example data: four patients (IDs A-D) with integer lab columns
# (GLUC, TGL, HDL, LDL) and character columns (HRT, MAMM, SMOKE), each
# containing some missing values.
patient <- data.frame(
  ID = c("A", "B", "C", "D"),
  GLUC = c(88L, NA, 110L, NA),
  TGL = c(NA, 150L, NA, 200L),
  HDL = c(32L, 60L, NA, 65L),
  LDL = c(99L, NA, 120L, 165L),
  HRT = c("Y", NA, "N", NA),
  MAMM = c(NA, "no", NA, "yes"),
  SMOKE = c("ever", "never", NA, "never"),
  stringsAsFactors = FALSE
)