```{r setup, include=FALSE}
knitr::opts_chunk$set(echo=TRUE, fig.align="center")
```

#### Statistics for Laboratory Scientists ( 140.615 )

## Permutation and Non-Parametric Tests - Advanced

#### Example 1 from class, permutation test

First some utility functions.

```{r}
# returns binary representation of 1:(2^n)
binary.v <- function(n){
  x <- 1:(2^n)
  mx <- max(x)
  digits <- floor(log2(mx))
  ans <- 0:(digits-1); lx <- length(x)
  x <- matrix(rep(x,rep(digits, lx)),ncol=lx)
  x <- (x %/% 2^ans) %% 2
}

# function to perform a paired permutation test.
paired.perm.test <- function(d,n.perm=1000){
  n <- length(d)
  if(is.null(n.perm)) { # do exact test
    ind <- binary.v(n)
    allt <- apply(ind,2,function(x,y)
                  t.test((2*x-1)*y)$statistic,d)
  }
  else { # do n.perm samples
    allt <- 1:n.perm
    for(i in 1:n.perm) 
      allt[i] <- t.test(d*sample(c(-1,1),n,repl=TRUE))$statistic
  }
  allt
}
```

The paired data.

```{r}
x <- c(117.3, 100.1,  94.5, 135.5,  92.9, 118.9, 144.8, 103.9, 103.8, 153.6, 163.1)
y <- c(145.9,  94.8, 108.0, 122.6, 130.2, 143.9, 149.9, 138.5,  91.7, 162.6, 202.5)
d <- y-x
d
```

Permutation test, all possible permutations.

```{r}
set.seed(1)
tall <- paired.perm.test(d,n.perm=NULL)
# p-value
tobs <- t.test(d)$statistic
mean(abs(tall) >= abs(tobs))
```

Permutation test, 1000 permutations.

```{r}
set.seed(2)
tsamp <- paired.perm.test(d,n.perm=1000)
# p-value
mean(abs(tsamp) >= abs(tobs))
```

#### Example 2 from class, permutation test

A utility function.

```{r}
# function to perform permutation test

perm.test <- function(x, y, n.perm=1000, var.equal=TRUE){
  # number of data points
  kx <- length(x)
  ky <- length(y)
  n <- kx + ky

  # Data re-compiled
  X <- c(x,y)
  z <- rep(1:0,c(kx,ky))

  if(is.null(n.perm)) { # do exact permutation test
    o <- binary.v(n)  # indicator of all possible samples
    o <- o[,apply(o,2,sum)==kx]  
    nc <- choose(n,kx)
    allt <- 1:nc
    for(i in 1:nc) {
      xn <- X[o[,i]==1]
      yn <- X[o[,i]==0]
      allt[i] <- t.test(xn,yn,var.equal=var.equal)$statistic
    }
  }
  else { # do 1000 permutations of the data
    allt <- 1:n.perm
    for(i in 1:n.perm) {
      z <- sample(z)
      xn <- X[z==1]
      yn <- X[z==0]
      allt[i] <- t.test(xn,yn,var.equal=var.equal)$statistic
    }
  }
  allt
}
```

The two-sample data.

```{r}
x <- c(43.3, 57.1, 35.0, 50.0, 38.2, 61.2)
y <- c(51.9, 95.1, 90.0, 49.7, 101.5, 74.1, 84.5, 46.8, 75.1)
```

Permutation test, all possible permutations.

```{r}
set.seed(3)
tall <- perm.test(x,y,n.perm=NULL)
# p-value
tobs <- t.test(x,y,var.equal=TRUE)$statistic
mean(abs(tall) >= abs(tobs))
```

Permutation test, 1000 permutations.

```{r}
set.seed(4)
tsamp <- perm.test(x,y,n.perm=1000)
# p-value
mean(abs(tsamp) >= abs(tobs))
```

#### Example 3, permutation test

```{r,fig.width=4}
xA <- c(27.0,54.6,33.5,27.6,46.0,22.2,44.2,17.3,15.9,32.8)
xB <- c(17.4,20.5,13.9,14.8,27.9,10.6,33.7,15.4,25.0,24.1)

stripchart(list(xA,xB),vertical=T,pch=21,xlim=c(0.5,2.5))
segments(0.9,mean(xA),1.1,mean(xA),lwd=2,col="red")
segments(1.9,mean(xB),2.1,mean(xB),lwd=2,col="blue")
abline(h=mean(c(xA,xB)),lty=2)

xAnew <- xA
xAnew[2] <- 99.9

stripchart(list(xAnew,xB),vertical=T,pch=21,xlim=c(0.5,2.5))
segments(0.9,mean(xAnew),1.1,mean(xAnew),lwd=2,col="red")
segments(1.9,mean(xB),2.1,mean(xB),lwd=2,col="blue")
abline(h=mean(c(xAnew,xB)),lty=2)
```

For the permutation test, we use the test statistic from the two sample t-test, but we do not use the parametric null distribution. 

```{r}
t.test(xA,xB)$statistic
```

Let's write a function first, to make things easier. Note that the input are two vectors, generically called xa and xb.

```{r}
my.perm <- function(xa,xb){
  # combine the two vectors
  x <- c(xa,xb)
  # calculate the total length of observations
  n <- length(x)
  # calculate the length of the first vector
  na <- length(xa)
  # shuffle the order of the observations across groups
  z <- x[sample(1:n)]
  # return the t test statistic from the shuffled data
  return(t.test(z[1:na],z[-(1:na)])$stat)
}
```

Example:

```{r}
set.seed(1)
# let's try one with our xA and xB
my.perm(xA,xB)
# and onther one
my.perm(xA,xB)
# and onther one
my.perm(xA,xB)
```

The permutation test.

```{r}
# let's do 10000 shuffles
nit <- 10000
# create a vector of length nit (10000)
myres <- rep(NA,nit)
# fill in the slots of that vector with the permutation test t statistics
for(j in 1:nit) myres[j] <- my.perm(xA,xB)
```

Let's plot the distribution of the t statistics, add the parametric t, and show the observed test statistic.

```{r}
hist(myres,breaks=51,col="lightgrey",probability=TRUE)
tstat <- t.test(xA,xB)$stat 
tstat
df <- t.test(xA,xB)$parameter
df
curve(dt(x,df),lwd=2,col="blue",add=TRUE)
abline(v=tstat,col="red")
```

Calculate the permutation p-value.

```{r}
mean(abs(myres)>abs(tstat))
```

Let's do the same for the data with the outlier.

```{r}
myres <- rep(NA,nit)
for(j in 1:nit) myres[j] <- my.perm(xAnew,xB)
hist(myres,breaks=51,col="lightgrey",probability=TRUE)
curve(dt(x,df),lwd=2,col="blue",add=TRUE)
abline(v=tstat,col="red")
mean(abs(myres)>abs(tstat))
```