consider this data frame:
set.seed(123)
dat1 <- data.frame(Loc = rep(c("a","b","c","d","e","f","g","h"),each = 5),
ID = rep(c(1:10), each = 2),
var1 = rnorm(200),
var2 = rnorm(200),
var3 = rnorm(200),
var4 = rnorm(200),
var5 = rnorm(200),
var6 = rnorm(200))
dat1$ID <- factor(dat1$ID)
Location Loc
is a grouping variable for the measurements var1:6
on each ID
. There are several pairs of Loc
s that are so close to each other (geographically) that they should probably be considered a single group instead of two independent groups. Therefore I have written a function that will bootstrap each of the variables to see if these groups seem to have come from the same distribution:
library(tidyverse)
BootT <- function(dat, var, gv1, gv2){
set.seed(123)
a<- dplyr::filter(dat, Loc == gv1)
a2 <- dplyr::select(a, var)
b <- dplyr::filter(dat, Loc == gv2)
b2 <- dplyr::select(b, var)
pooled <- rbind(a2, b2)
boot.t <- c(1:999)
for(i in 1:999){
sample.index <- sample(c(1:length(pooled[,1])), replace = TRUE)
sample.x <- pooled[sample.index,][1:length(a2[,1])]
sample.y <- pooled[sample.index,][-c(1:length(b2[,1]))]
boot.t[i] <- t.test(sample.x, sample.y)$statistic
}
p.pooled <- data.frame(p.pooled = 1 + sum(abs(boot.t) > abs(t.test(a[,var],b[,var])$statistic))) / (999+1)
return(p.pooled)
ids <- data.frame(Group1 = paste0(gv1), Group2 = paste0(gv2), Variable = paste0(var))
p.pooled <- p.pooled%>%
dplyr::mutate(Group1 = ids[,1], Group2 = ids[,2], Variable = ids[,3])
p.pooled <- p.pooled[,c(2,3,4,1)]
return(p.pooled)
}
#compare 2 locs of interest with a single variable
BootT(dat = dat1, var = "var2", gv1 = "a", gv2 = "g")
#compare all 6 variables
vars <- names(dat1[,3:8])
results <- list()
for(i in vars){
res <- BootT(dat = dat1, var = i, gv1 = "a", gv2 = "b")
results <- rbind(results, res)
}
I would like to modify this function so that it will output a classic histogram showing the bootstrapped distribution for each variable vs the observed value, and contain summary statistics on the plot. How can I modify this function to accomplish this?
Edit:
Originally, I was going to use the boot package to do this, which would have been easier, but I was not confident that I understood how the different arguments would change the sampling procedure. In situations where the two Loc
s have equal variance (assessed with an F-test), I want to sample the pooled sample as I have demonstrated above. However, when samples are heterogenous, I want to subtract each groups mean before creating the pooled samples to compare (which forces the null hypothesis to be true, and makes no assumption about homogenous variance). For more info, see this post: https://stats.stackexchange.com/questions/136661/using-bootstrap-under-h0-to-perform-a-test-for-the-difference-of-two-means-repl
I have actually made a very similar function (with another very original name) to the one above to deal with the cases where there is an issue of heterogenous variance:
BootT2 <- function(dat, var, gv1, gv2){ set.seed(123) a<- dplyr::filter(dat, Loc == gv1) a2 <- dplyr::select(a, var) b <- dplyr::filter(dat, Loc == gv2) b2 <- dplyr::select(b, var) pooled <- rbind(a2,b2) xt <- a2[,1] - mean(a2[,1]) + mean(pooled[,1]) yt <- b2[,1] - mean(b2[,1]) + mean(pooled[,1]) boot.t <- c(1:999) for(i in 1:999){ sample.x <- sample(xt, replace = T) sample.y <- sample(yt, replace = T) boot.t[i] <- t.test(sample.x, sample.y)$statistic } p.h0 <- data.frame(p.ho = (1+sum(abs(boot.t) > abs(t.test(a[,var],b[,var])$statistic)) / 999+1)-2) #p.h0 <- data.frame(p.ho = sum(abs(boot.t) > abs(t.test(a[,var],b[,var])$statistic)) / 999) ids <- data.frame(Group1 = paste0(gv1), Group2 = paste0(gv2), Variable = paste0(var)) p.h0 <- p.h0%>% mutate(Group1 = ids[,1], Group2 = ids[,2], Variable = ids[,3]) p.h0 <- p.h0[,c(2,3,4,1)] return(p.h0) } #compare 2 locs of interest with a single variable BootT2(dat = dat1, var = "var2", gv1 = "a", gv2 = "g") #compare all 6 variables vars <- names(dat1[,3:8]) results.bootT2 <- list() for(i in vars){ res <- BootT2(dat = dat1, var = i, gv1 = "a", gv2 = "b") results.bootT2 <- rbind(results.bootT2, res) }
If someone would like to explain how I can do these procedures and produce plots using the boot() package instead, that would be great.