0
votes

I have 50 of randomly generated numbers with my set parameters. I want to randomly sample the 50 random numbers into 10 groups of 5 (without replacement)

I want to store the 10 groups as matrix/dataframe and run an ANOVA test on the groups, then repeat the whole process 1000 times storing the F, F Critical, P values from each iteration.

I have the following

samp <- rnorm(50,3.47,0.0189) # 50 samples, mean of 3.47 and SD of 0.0189

for (i in 1:10){
  x <- sample(samp, 5, replace = F)
}

x <- #all my random samples

my anova code that I usually use when data is in a list with a second column identifying the group

Samp_lm <- lm(Samp_lm ~ factor(group), data = x) 
AnovaResults <- anova(Samp_lm)

criticalValues <- cbind(AnovaResults, 'F Critical Value' = qf(1 - 0.05, test.Aov[1, 1], test.Aov[2, 1]))
AnovaStats <- cbind(criticalValues[1,4],criticalValues[1,5],criticalValues[1,6]

Not sure where to go from here.

2
can you provide an example of Samp_lm and group? - Elia
Hi, are you trying to get an estimate of the null distribution by doing resampling? or is the resampling for something else? - Justin Landis

2 Answers

1
votes

Since you are repeating the random sampling, you should start by making a function that does what you want:

SimAnova <- function() {
     Groups <-rep(LETTERS[1:10], each=5)
     Values <- rnorm(50, 3.47, 0.0189)
     AnovaResults <- anova(lm(Values~Groups))
     F <- AnovaResults[1, 4]
     df <- AnovaResults[, 1]
     Crit <- qf(1 - .05, df[1], df[2])
     P <- AnovaResults[1, 5]
     c("F-Value"=F, "Critical F-Value" =Crit, "P-Value"=P)
}
SimAnova()
#          F-Value Critical F-Value          P-Value 
#        1.7350592        2.1240293        0.1126789 
SimAnova()
#          F-Value Critical F-Value          P-Value 
#       2.04024282       2.12402926       0.05965209 
SimAnova()
#          F-Value Critical F-Value          P-Value 
#        1.635386         2.124029         0.138158 

Now just repeat it 1000 times:

result <- t(replicate(1000, SimAnova()))
head(result)
#        F-Value Critical F-Value   P-Value
# [1,] 0.5659946         2.124029 0.8164247
# [2,] 0.7717596         2.124029 0.6427732
# [3,] 0.8377358         2.124029 0.5862101
# [4,] 1.6284143         2.124029 0.1401280
# [5,] 0.2191311         2.124029 0.9899751
# [6,] 0.2744286         2.124029 0.9780476

Notice that you don't really need to save the Critical F-Value because it is the same for every sample.

1
votes

Here is how I'd refactor your code using dplyr and purrr packages.

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(purrr)
set.seed(345)
samp <- rnorm(50,3.47,0.0189)

do_sampling <- function(vec, n_groups, iterations = 1000){
  #hacky 
  group_size <- length(vec)/n_groups
  if(group_size%%1!=0) stop("group sizes are uneven")
  
  #from purrr
  map_dfr(1:iterations, function(i){
    data <- tibble(
      samp = vec,
      groups = factor(sample(rep(1:group_size, each = n_groups)))
    )
    
    samp_lm <- lm(samp ~ groups, data = data)
    AnovaResults <- anova(samp_lm)
    bind_cols(
      as_tibble(AnovaResults[1,c("F value","Pr(>F)")]),
      tibble(
        `F Critical Value` = qf( 1 - 0.05, AnovaResults[1,1], AnovaResults[2,1]),
        iteration = i
      )
    )
  })
}

do_sampling(samp, 10)
#> # A tibble: 1,000 x 4
#>    `F value` `Pr(>F)` `F Critical Value` iteration
#>        <dbl>    <dbl>              <dbl>     <int>
#>  1     0.117  0.976                 2.58         1
#>  2     0.445  0.775                 2.58         2
#>  3     1.12   0.359                 2.58         3
#>  4     0.914  0.464                 2.58         4
#>  5     5.04   0.00192               2.58         5
#>  6     0.964  0.437                 2.58         6
#>  7     1.19   0.327                 2.58         7
#>  8     1.77   0.151                 2.58         8
#>  9     0.399  0.808                 2.58         9
#> 10     0.955  0.441                 2.58        10
#> # … with 990 more rows

Created on 2021-05-11 by the reprex package (v1.0.0)

Finally, have a look at the infer package's vignette on anova. It may be helpful to you