First some utility functions.
The first function returns a binary representation of 1:2\(^n\).
binary.v <- function(n){
x <- 1:(2^n)
mx <- max(x)
digits <- floor(log2(mx))
ans <- 0:(digits-1); lx <- length(x)
x <- matrix(rep(x,rep(digits, lx)),ncol=lx)
x <- (x %/% 2^ans) %% 2
}
The second function uses the first to perform a paired permutation test.
paired.perm.test <- function(d,n.perm=1000){
n <- length(d)
if(is.null(n.perm)) {
ind <- binary.v(n)
allt <- apply(ind,2,function(x,y)
t.test((2*x-1)*y)$statistic,d)
}
else {
allt <- 1:n.perm
for(i in 1:n.perm)
allt[i] <- t.test(d*sample(c(-1,1),n,repl=TRUE))$statistic
}
allt
}
The paired data.
x <- c(117.3, 100.1, 94.5, 135.5, 92.9, 118.9, 144.8, 103.9, 103.8, 153.6, 163.1)
y <- c(145.9, 94.8, 108.0, 122.6, 130.2, 143.9, 149.9, 138.5, 91.7, 162.6, 202.5)
d <- y-x
d
## [1] 28.6 -5.3 13.5 -12.9 37.3 25.0 5.1 34.6 -12.1 9.0 39.4
Permutation test, all possible permutations.
set.seed(1)
tall <- paired.perm.test(d,n.perm=NULL)
Estimate the p-value.
tobs <- t.test(d)$statistic
mean(abs(tall) >= abs(tobs))
## [1] 0.03710938
Permutation test, 1000 permutations.
set.seed(2)
tsamp <- paired.perm.test(d,n.perm=1000)
Estimate the p-value.
mean(abs(tsamp) >= abs(tobs))
## [1] 0.043
A utility function to perform the permutation test.
perm.test <- function(x, y, n.perm=1000, var.equal=TRUE){
kx <- length(x)
ky <- length(y)
n <- kx + ky
X <- c(x,y)
z <- rep(1:0,c(kx,ky))
if(is.null(n.perm)) {
o <- binary.v(n)
o <- o[,apply(o,2,sum)==kx]
nc <- choose(n,kx)
allt <- 1:nc
for(i in 1:nc) {
xn <- X[o[,i]==1]
yn <- X[o[,i]==0]
allt[i] <- t.test(xn,yn,var.equal=var.equal)$statistic
}
}
else {
allt <- 1:n.perm
for(i in 1:n.perm) {
z <- sample(z)
xn <- X[z==1]
yn <- X[z==0]
allt[i] <- t.test(xn,yn,var.equal=var.equal)$statistic
}
}
allt
}
The two-sample data.
x <- c(43.3, 57.1, 35.0, 50.0, 38.2, 61.2)
y <- c(51.9, 95.1, 90.0, 49.7, 101.5, 74.1, 84.5, 46.8, 75.1)
Permutation test, all possible permutations.
set.seed(3)
tall <- perm.test(x,y,n.perm=NULL)
Estimate the p-value.
tobs <- t.test(x,y,var.equal=TRUE)$statistic
mean(abs(tall) >= abs(tobs))
## [1] 0.01498501
Permutation test, 1000 permutations.
set.seed(4)
tsamp <- perm.test(x,y,n.perm=1000)
Estimate the p-value.
mean(abs(tsamp) >= abs(tobs))
## [1] 0.024
xA <- c(27.0,54.6,33.5,27.6,46.0,22.2,44.2,17.3,15.9,32.8)
xB <- c(17.4,20.5,13.9,14.8,27.9,10.6,33.7,15.4,25.0,24.1)
xAnew <- xA
xAnew[2] <- 99.9
Plot the two data sets.
par(mfrow=c(1,2),las=1)
stripchart(list(xA,xB),vertical=T,pch=21,xlim=c(0.5,2.5),ylim=c(0,100))
segments(0.9,mean(xA),1.1,mean(xA),lwd=2,col="red")
segments(1.9,mean(xB),2.1,mean(xB),lwd=2,col="blue")
abline(h=mean(c(xA,xB)),lty=2)
stripchart(list(xAnew,xB),vertical=T,pch=21,xlim=c(0.5,2.5),ylim=c(0,100))
segments(0.9,mean(xAnew),1.1,mean(xAnew),lwd=2,col="red")
segments(1.9,mean(xB),2.1,mean(xB),lwd=2,col="blue")
abline(h=mean(c(xAnew,xB)),lty=2)
For the permutation test, we use the test statistic from the two sample t-test, but we do not use the parametric null distribution.
t.test(xA,xB)$statistic
## t
## 2.533564
Let’s write a function first, to make things easier. Note that the input are two vectors, generically called xa and xb. The output is the t statistic from the shuffled data.
my.perm <- function(xa,xb){
x <- c(xa,xb)
n <- length(x)
na <- length(xa)
z <- x[sample(1:n)]
return(t.test(z[1:na],z[-(1:na)])$stat)
}
Let’s try one with our xA and xB.
set.seed(1)
my.perm(xA,xB)
## t
## 1.12726
The permutation test with 10,000 shuffles.
nit <- 10000
myres <- rep(NA,nit)
for(j in 1:nit) myres[j] <- my.perm(xA,xB)
Let’s plot the distribution of the shuffled t statistics, add the parametric t, and show the observed test statistic.
hist(myres,breaks=51,col="lightgrey",probability=TRUE,main="permutation t statistics / data set 1")
tstat <- t.test(xA,xB)$stat
tstat
## t
## 2.533564
df <- t.test(xA,xB)$parameter
df
## df
## 14.22405
curve(dt(x,df),lwd=2,col="blue",add=TRUE)
abline(v=tstat,col="red")
Calculate the permutation p-value.
mean(abs(myres)>abs(tstat))
## [1] 0.021
Let’s do the same for the data with the outlier.
tstatnew <- t.test(xAnew,xB)$stat
dfnew <- t.test(xAnew,xB)$parameter
myres <- rep(NA,nit)
for(j in 1:nit) myres[j] <- my.perm(xAnew,xB)
hist(myres,breaks=51,col="lightgrey",probability=TRUE,main="permutation t statistics / data set 2")
curve(dt(x,dfnew),lwd=2,col="blue",add=TRUE)
abline(v=tstatnew,col="red")
mean(abs(myres)>abs(tstatnew))
## [1] 0.0198