MTH 412/512 - Ch 3: permutation resampling

example: data on beer and hotwings consumption

head(Beerwings,2) # look at the first few rows of the data frame

##   ID Hotwings Beer Gender
## 1  1        4   24      F
## 2  2        5    0      F

Extract data for each group A,B; then create a single array for each group

Asubset <- subset(Beerwings, subset=(Gender=="F"))  # separate out F data
Bsubset <- subset(Beerwings, subset=(Gender=="M"))  # separate out M data
#head(Asubset)
Adata <- Asubset$Hotwings  # wings eaten
Bdata <- Bsubset$Hotwings  # wings eaten

mean for each group and difference in means

xbar_A <- mean(Adata)  # mean of A data
xbar_B <- mean(Bdata)  # mean of B data
delta_xbar <- xbar_B-xbar_A  # difference in means (B - A)
nA <- length(Adata) # number in A category
nB <- length(Bdata) # number in B category
n <- nA + nB    # total number of A + B

nA = 15 in A category with mean xbar_A = 9.3333333
nB = 15 in B category with mean xbar_B = 14.5333333
difference in means (B-A) delta_xbar = 5.2

# construct Null Distribution from all the data and note its mean
NullDist <- Beerwings$Hotwings  # null distribution is all wing consumption
Null_mean <- mean(NullDist)     # mean of null distribution

NullDist mean = 11.9333333

Implement permutation resampling algorithm here:

nsamples = 10**3 - 1    # number of samples (-1 here so +1 later gives power of 10)
delta = numeric(nsamples)  # create a vector to hold the delta for each sample
# loop to create sample and then calculate the difference in means
for (i in seq(1,nsamples)) {   
  # sample for A is random sample from NullData of size nA 
  # "replace = FALSE" means without replacement
  index <- sample(n, size=nA, replace = FALSE)  # random set of indexes from (1,...,n) of size nA
  sample_A <- NullDist[index]   # permutation resample for A based on index set
  sample_B <- NullDist[-index]  # B is all elements in NullDist NOT including the index set 
  B_mean <- mean(sample_B)
  A_mean <- mean(sample_A)
  delta[i] <- B_mean - A_mean  # stores the delta in an array
}
head(delta) # delta array is our sampling distribution of the test statistic

## [1] -2.400000 -0.400000 -1.066667  0.000000 -1.466667  0.800000

Count number of exceedances (when sample delta >= delta_xbar)

exceed_count = sum(delta >= delta_xbar)
pvalue = (exceed_count+1)/(nsamples+1)

samples+1	exceedances+1	p-value
1000	2	0.002

try different values of nsamples above and cut-and-paste into table of results:

samples+1	exceed+1	p-value
100	1	0.01
1000	2	0.002
10000	12	0.0012
1e+05	119	0.00119
1e+06	1078	0.001078

From the trend for large number of samples, the p-value from resampling is approximately p=0.001.
Thus the likelihood that random chance would produce observed difference in the means (B-A) is small.

Plot a histogram of the sample values with the original data value as a visual check.

options(repr.plot.width=5, repr.plot.height=3.5) # resize plots
hist(delta,col='lightblue')
abline(v=delta_xbar,col='red') # puts vertical line at observed delta value