I'm trying to write a function that calculates the descriptive statistics for both numeric and categorical variables (factors). For numeric variables, it should calculate mean (MEAN), median (MEDIAN), standard deviation(SD), and count the number of missing values (NMiss). For character variables, it should tabulate the count within each level of the variable and count the number of missing values.
The starting input data is:
ID GLUC TGL HDL LDL HRT MAMM SMOKE
1 A 88 NA 32 99 Y <NA> ever
2 B NA 150 60 NA <NA> no never
3 C 110 NA NA 120 N <NA> <NA>
4 D NA 200 65 165 <NA> yes never
And I'd like it to come out looking like this:
> table1 (dat=patient, numvar=c("TGL", "HDL", "LDL"), charvar=c("HRT", "MAMM"))
$numericStats
varName MEAN MEDIAN SD NMiss
1 TGL 180.66667 180.0 23.03620 4
2 HDL 55.66667 62.5 19.00175 4
3 LDL 160.28571 165.0 40.06126 3
$FactorStats
varName group count
1 HRT N 2
2 Y 3
3 NMiss 5
4 MAMM no 2
5 yes 4
6 NMiss 4
This is the code I have so far:
#numericstats
findnum = function(dat, numvar){
numstats=data.frame()
for (i in length(numvar[])){
var_select = dat[[numvar[i]]]
mean_value = round(mean(var_select, na.rm=T),2)
median_value = round(median(var_select, na.rm=T),2)
SD = round(sd(var_select, na.rm=T),2)
N = length(var_select[!is.na(var_select)])
N_miss = length(var_select[is.na(var_select)])
numstats =
cbind(varname = numvar, mean = mean_value, median = median_value, sd = SD, nmissing = N_miss)
}
return(numstats)
}
findnum(dat=patient, numvar=c("TGL","HDL","LDL"))
#factorstats
findfactor = function(dat, charvar){
factstats=data.frame()
for (i in length(charvar[])){
var_select = dat[[charvar[i]]]
count = length(charvar)
group = charvar
factstats =
cbind(varname = charvar, group = charvar, count = count)
}
return(factstats)
}
findfactor(dat=patient, charvar=c("MAMM","SMOKE"))
#full function
table1 = function(dat, numvar, charvar){
for (i in 1:length(dat)){
if (!is.numeric(i))
numericstats = findnum(dat, i)
else factorstats = findfactor(dat, i)
return(data.frame(numericstats, factorstats))
}
}