Statistics for Laboratory Scientists ( 140.615 )

Goodness of Fit - Advanced

Example from class: simulations

o <- c(35,43,22)
p <- c(0.25,0.50,0.25)
e <- sum(o)*p
e
## [1] 25 50 25

Functions to calculate the test statistics.

f.xsq <- function(o,p){
  e <- (p*sum(o))
  sum((o-e)^2/e)
}

f.lrt <- function(o,p){
  e <- (p*sum(o))
  2*sum(o*log(o/e),na.rm=TRUE)
}

xsq <- f.xsq(o,p)
xsq
## [1] 5.34
lrt <- f.lrt(o,p)
lrt
## [1] 4.95762

The Multinomial distribution.

set.seed(1)
rmultinom(1,100,c(0.25,0.50,0.25))
##      [,1]
## [1,]   22
## [2,]   53
## [3,]   25
rmultinom(5,100,c(0.25,0.50,0.25))
##      [,1] [,2] [,3] [,4] [,5]
## [1,]   26   21   32   26   21
## [2,]   44   47   44   55   57
## [3,]   30   32   24   19   22

Simulate 10,000 test statistics under the null.

set.seed(45)
simdat <- rmultinom(10000,100,c(0.25,0.50,0.25))
dim(simdat)
## [1]     3 10000
res.xsq <- apply(simdat,2,f.xsq,p=c(0.25,0.50,0.25))
str(res.xsq)
##  num [1:10000] 0.22 0.54 0.64 0.54 7.26 1.92 0.24 0.16 4.54 0.72 ...
res.lrt <- apply(simdat,2,f.lrt,p=c(0.25,0.50,0.25))
str(res.lrt)
##  num [1:10000] 0.224 0.552 0.641 0.551 7.516 ...

Plot the histograms of the test statistics, and estimate the p-values.

hist(res.xsq,col="lightgrey",prob=TRUE,breaks=seq(0,20,1))
abline(v=xsq,col="red",lwd=2)

mean(res.xsq>xsq)
## [1] 0.0699
hist(res.lrt,col="lightgrey",prob=TRUE,breaks=seq(0,20,1))
abline(v=lrt,col="red",lwd=2)

mean(res.lrt>lrt)
## [1] 0.0909

Example from class: fit of Poisson counts

x <- c(2, 2, 0, 0, 0,   0, 0, 1, 0, 0,
       0, 0, 4, 0, 0,   3, 0, 0, 0, 3,
       0, 5, 0, 1, 0,   1, 2, 0, 2, 1,
       0, 0, 0, 0, 0,   0, 0, 0)

ob <- table(x)
ob
## x
##  0  1  2  3  4  5 
## 26  4  4  2  1  1
mu <- mean(x)
mu
## [1] 0.7105263

Binning 5+ together.

dpois(0:10,mu)
##  [1] 4.913855e-01 3.491423e-01 1.240374e-01 2.937728e-02 5.218333e-03 7.415526e-04 8.781543e-05 8.913597e-06
##  [9] 7.916681e-07 6.250012e-08 4.440798e-09
dpois(0:4,mu)
## [1] 0.491385505 0.349142333 0.124037408 0.029377281 0.005218333
ppois(4,mu)
## [1] 0.9991609
ppois(4,mu,lower=FALSE)
## [1] 0.0008391405
ppois(4,mu)+ppois(4,mu,lower=FALSE)
## [1] 1
p <- c(dpois(0:4,mu),ppois(4,mu,lower=FALSE))
p
## [1] 0.4913855054 0.3491423328 0.1240374077 0.0293772808 0.0052183328 0.0008391405
e <- p*length(x)
e
## [1] 18.67264921 13.26740865  4.71342149  1.11633667  0.19829665  0.03188734

The parametric hypothesis tests.

lrt <- f.lrt(ob,p)
lrt
## [1] 18.76827
pchisq(lrt,4,lower=FALSE)
## [1] 0.0008727531
xsq <- f.xsq(ob,p)
xsq
## [1] 42.78971
pchisq(xsq,4,lower=FALSE)
## [1] 1.144142e-08

Inference via simulations.

n <- length(x)
n
## [1] 38
set.seed(1)
tmp <- rpois(n,mu)
tmp
##  [1] 0 0 1 2 0 2 2 1 1 0 0 0 1 0 1 1 1 3 0 1 2 0 1 0 0 0 0 0 2 0 0 1 1 0 1 1 1 0
table(tmp)
## tmp
##  0  1  2  3 
## 18 14  5  1
table(factor(tmp))
## 
##  0  1  2  3 
## 18 14  5  1
table(factor(tmp,levels=0:10))
## 
##  0  1  2  3  4  5  6  7  8  9 10 
## 18 14  5  1  0  0  0  0  0  0  0
tmp[tmp>5] <-5
table(factor(tmp,levels=0:5))
## 
##  0  1  2  3  4  5 
## 18 14  5  1  0  0
nit <- 10000
res.lrt <- rep(NA,nit)
res.xsq <- rep(NA,nit)

set.seed(1)
for(j in 1:nit){
  tmp <- rpois(n,mu)
  tmp[tmp>5] <-5
  x <- table(factor(tmp,levels=0:5))
  res.lrt[j] <- f.lrt(x,p)
  res.xsq[j] <- f.xsq(x,p)
}

Plot the histograms of the test statistics, and estimate the p-values.

hist(res.lrt,col="lightgrey",prob=TRUE,breaks=101)
abline(v=lrt,col="red",lwd=2)

mean(res.lrt>lrt)
## [1] 5e-04
hist(res.xsq,col="lightgrey",prob=TRUE,breaks=101)
abline(v=xsq,col="red",lwd=2)

mean(res.xsq>xsq)
## [1] 0.0011