1
votes

consider this data frame:

set.seed(123)
dat1 <- data.frame(Loc = rep(c("a","b","c","d","e","f","g","h"),each = 5),
                   ID = rep(c(1:10), each = 2),
                   var1 = rnorm(200),
                   var2 = rnorm(200),
                   var3 = rnorm(200),
                   var4 = rnorm(200),
                   var5 = rnorm(200),
                   var6 = rnorm(200))
dat1$ID <- factor(dat1$ID)

Location Loc is a grouping variable for the measurements var1:6 on each ID. There are several pairs of Locs that are so close to each other (geographically) that they should probably be considered a single group instead of two independent groups. Therefore I have written a function that will bootstrap each of the variables to see if these groups seem to have come from the same distribution:

library(tidyverse)
BootT <- function(dat, var, gv1, gv2){
  set.seed(123)
  a<- dplyr::filter(dat, Loc == gv1)
  a2 <- dplyr::select(a, var)
  b <- dplyr::filter(dat, Loc == gv2)
  b2 <- dplyr::select(b, var)
  pooled <- rbind(a2, b2)
  boot.t <- c(1:999)
  for(i in 1:999){
    sample.index <- sample(c(1:length(pooled[,1])), replace = TRUE)
    sample.x <- pooled[sample.index,][1:length(a2[,1])]
    sample.y <- pooled[sample.index,][-c(1:length(b2[,1]))]
    boot.t[i] <- t.test(sample.x, sample.y)$statistic
  }
  p.pooled <-  data.frame(p.pooled = 1 + sum(abs(boot.t) > abs(t.test(a[,var],b[,var])$statistic))) / (999+1) 
 return(p.pooled)
  ids <- data.frame(Group1 = paste0(gv1), Group2 = paste0(gv2), Variable = paste0(var))
  p.pooled <- p.pooled%>%
    dplyr::mutate(Group1 = ids[,1], Group2 = ids[,2], Variable = ids[,3])
  p.pooled <- p.pooled[,c(2,3,4,1)]
 return(p.pooled)
}
#compare 2 locs of interest with a single variable
BootT(dat = dat1, var = "var2", gv1 = "a", gv2 = "g") 
#compare all 6 variables 
vars <- names(dat1[,3:8])
results <- list()
for(i in vars){
  res <- BootT(dat = dat1, var = i, gv1 = "a", gv2 = "b")
  results <- rbind(results, res)
} 

I would like to modify this function so that it will output a classic histogram showing the bootstrapped distribution for each variable vs the observed value, and contain summary statistics on the plot. How can I modify this function to accomplish this? Edit: Originally, I was going to use the boot package to do this, which would have been easier, but I was not confident that I understood how the different arguments would change the sampling procedure. In situations where the two Locs have equal variance (assessed with an F-test), I want to sample the pooled sample as I have demonstrated above. However, when samples are heterogenous, I want to subtract each groups mean before creating the pooled samples to compare (which forces the null hypothesis to be true, and makes no assumption about homogenous variance). For more info, see this post: https://stats.stackexchange.com/questions/136661/using-bootstrap-under-h0-to-perform-a-test-for-the-difference-of-two-means-repl

I have actually made a very similar function (with another very original name) to the one above to deal with the cases where there is an issue of heterogenous variance:

BootT2 <- function(dat, var, gv1, gv2){
  set.seed(123)
  a<- dplyr::filter(dat, Loc == gv1)
  a2 <- dplyr::select(a, var)
  b <- dplyr::filter(dat, Loc == gv2)
  b2 <- dplyr::select(b, var)
  pooled <- rbind(a2,b2)
  xt <- a2[,1] - mean(a2[,1]) + mean(pooled[,1])
  yt <- b2[,1] - mean(b2[,1]) + mean(pooled[,1])
  boot.t <- c(1:999)
  for(i in 1:999){
    sample.x <- sample(xt, replace = T)
    sample.y <- sample(yt, replace = T)
    boot.t[i] <- t.test(sample.x, sample.y)$statistic
  }
  p.h0 <- data.frame(p.ho = (1+sum(abs(boot.t) > abs(t.test(a[,var],b[,var])$statistic)) / 999+1)-2)
  #p.h0 <- data.frame(p.ho = sum(abs(boot.t) > abs(t.test(a[,var],b[,var])$statistic)) / 999)
  ids <- data.frame(Group1 = paste0(gv1), Group2 = paste0(gv2), Variable = paste0(var))
  p.h0 <- p.h0%>%
    mutate(Group1 = ids[,1], Group2 = ids[,2], Variable = ids[,3])
  p.h0 <- p.h0[,c(2,3,4,1)]
 return(p.h0)
}
#compare 2 locs of interest with a single variable
BootT2(dat = dat1, var = "var2", gv1 = "a", gv2 = "g") 
#compare all 6 variables 
vars <- names(dat1[,3:8])
results.bootT2 <- list()
for(i in vars){
  res <- BootT2(dat = dat1, var = i, gv1 = "a", gv2 = "b")
  results.bootT2 <- rbind(results.bootT2, res)
} 

If someone would like to explain how I can do these procedures and produce plots using the boot() package instead, that would be great.

1

1 Answers

0
votes

If I understand correctly, the following will run bootstrapped t-tests of 2 Loc of a variable var in data set dat1. It uses the accepted answer to this CrossValidated post bootstrap in function bootTstat, but this is called from function funBoot. Function funBoot is responsible for subsetting the groups gv1 and gv2 rows and the column var. The data set thus formed is passed on to bootTstat.

bootTstat <- function(x, y, R){
  pool <- c(x, y)
  xt <- x - mean(x) + mean(pool)
  yt <- y - mean(y) + mean(pool)
  boot.t <- numeric(R)
  for (i in seq_len(R)){
    sample.x <- sample(xt, replace = TRUE)
    sample.y <- sample(yt, replace = TRUE)
    boot.t[i] <- t.test(sample.x, sample.y)$statistic
  }
  p.h0 <- (1 + sum(abs(boot.t) > abs(t.test(x, y)$statistic))) / (R + 1)  
  list(
    statistic = boot.t,
    p.value = p.h0
  )
}

funBoot <- function(data, R, var, gv1, gv2){
  i <- data[["Loc"]] == gv1
  j <- data[["Loc"]] == gv2
  x <- data[i, var]
  y <- data[j, var]
  bootTstat(x, y, R)
}

For "var2" and groups "a" and "g" run a t-test with the entire groups data and R = 1000 tests.

First the t-test.

a <- subset(dat1, Loc == 'a', select = 'var2')
g <- subset(dat1, Loc == 'g', select = 'var2')
t.test(a, g)
#
#        Welch Two Sample t-test
#
#data:  a and g
#t = 1.1002, df = 47, p-value = 0.2769
#alternative hypothesis: true difference in means is not equal to 0
#95 percent confidence interval:
# -0.2585899  0.8828038
#sample estimates:
# mean of x  mean of y 
# 0.1755209 -0.1365860 

And the bootsrtapped t-tests. R <- 1000 set.seed(123)

b_ag <- funBoot(dat1, R, var = "var2", gv1 = "a", gv2 = "g")
b_ag$p.value
#[1] 0.2737263

This p-value is similar to p.value = 0.2769 obtained previously.
And the histogram can easily be plotted.

hist(b_ag$statistic, main = "Bootstrapped t-test")

enter image description here

Now run tests for all variables and groups "a" and "b". Plot with package ggplot2.

ttest_list <- lapply(names(dat1)[3:8], function(v) {
  b <- funBoot(data = dat1, R = R, var = v, gv1 = "a", gv2 = "b")
  list(
    p.value = b$p.value,
    test = data.frame(var = v, stat = b$statistic)
  )
})

ttest_df <- lapply(ttest_list, '[[', 'test')
ttest_df <- do.call(rbind, ttest_df)

library(ggplot2)

ggplot(ttest_df, aes(stat)) +
  geom_histogram(bins = 25) +
  facet_wrap(~ var)

enter image description here