Methods in Biostatistics 2 (140.652)

Goodness of fit

Example data

obs <- c(35, 43, 22)

# Expected proportions (i.e., the null hypothesis)
p0 <- c(0.25, 0.5, 0.25)

# Expected counts (note: sum(obs) = total count)
exp <- sum(obs) * p0

# LRT statistic 
lrt <- 2 * sum(obs * log(obs/exp)) 
lrt
## [1] 4.95762
# Chi-square statistic 
chis <- sum( (obs-exp)^2 / exp)
chis
## [1] 5.34
# P-value for LRT 
1 - pchisq(lrt, 3-1)
## [1] 0.08384295
# P-value for chi-square test 
1 - pchisq(chis, 3-1)
## [1] 0.06925223

The built-in function

# You can use chisq.test() to do the chi-square test
chisq.test(obs, p=c(1/4,1/2,1/4))
## 
##  Chi-squared test for given probabilities
## 
## data:  obs
## X-squared = 5.34, df = 2, p-value = 0.06925

Likelihood ratio test for goodness of fit

I start here with two functions:

  • lrt / for calculating the LRT goodness-of fit statistic,
  • chisq / for calculating the chi-square goodness-of-fit statistic.

Copy and paste these into R before doing the next bits.

lrt <- function(x=c(35,43,22), p=c(0.25,0.5,0.25))
  2 * (dmultinom(x, prob=x/sum(x),log=TRUE) - dmultinom(x, prob=p,log=TRUE))

# alternatively, this could have been the following,

lrt2 <- function(x=c(35,43,22), p=c(0.25,0.5,0.25)){
  ex <- (p*sum(x))[x > 0]
  x <- x[x>0]
  2 * sum(x * log(x / ex) )
}

chisq <- function(x=c(35,43,22), p=c(0.25,0.5,0.25)){
  # check that x,p are appropriate
  p <- p/sum(p) # ensure that the probabilities sum to 1
  if(any(p < 0)) stop("probabilities p must be non-negative.")
  if(any(x < 0)) stop("x's must all be non-negative.")
  if(length(x) != length(p)) stop("length(x) should be the same as length(p).")
  n <- sum(x)
  expected <- n*p
  sum( (x - expected)^2 / expected )
}

The Main Example

# The 35:43:22 table
x <- c(35, 43, 22)

# Note that you might also have started with a vector of 1's, 2's and 3's and used the function "table" instead, as follows:
y <- rep(1:3, c(35, 43, 22))
x <- table(y)
x
## y
##  1  2  3 
## 35 43 22
# Calculate LRT test statistic to test the 1:2:1 proportions
LRT <- lrt(x, p=c(0.25, 0.5, 0.25)) 
LRT
## [1] 4.95762
# Calculate chi-square statistic to test the 1:2:1 proportions
xsq <- chisq(x, p=c(0.25, 0.5, 0.25))
xsq
## [1] 5.34
# You could also use the built-in function chisq.test()
chisq.test(x, p=c(0.25, 0.5, 0.25))
## 
##  Chi-squared test for given probabilities
## 
## data:  x
## X-squared = 5.34, df = 2, p-value = 0.06925
# P-value for LRT statistic, using the asymptotic approximation
1 - pchisq(LRT, 2)
## [1] 0.08384295
# P-value for chi-square statistic, using the asymptotic approx'n
1 - pchisq(xsq, 2)
## [1] 0.06925223
# Simulations to est the null dist'n of the LRT and chi-square stats
set.seed(45)
simdat <- rmultinom(1000, 100, c(0.25, 0.5, 0.25))
results <- cbind(apply(simdat, 2, lrt, p=c(0.25, 0.5, 0.25)),
                 apply(simdat, 2, chisq, p=c(0.25, 0.5, 0.25)))

# Estimated p-value for LRT stat
mean(results[,1] >= LRT)
## [1] 0.111
# Estimated p-value for chi-square stat
mean(results[,2] >= xsq)
## [1] 0.095

Example 1 in the “composite hypotheses” section of the lecture

I start with some additional functions:

  • sim.ex1, for simulating data under the null hypothesis (H0).
  • mle.ex1, for estimating the allele frequency under H0.
  • lrt.ex1, calculates LRT statistic for testing this H0.
  • chisq.ex1, calculates chi-square stat for testing this H0.
# Simulate data
sim.ex1 <- function(n=100, f=0.2){
  table(factor(sample(c("AA","AB","BB"), n, repl=TRUE, 
                      prob=c(f^2,2*f*(1-f),(1-f)^2)), 
               levels=c("AA","AB","BB")))
  }

# MLE under H0
mle.ex1 <- function(x=c(5,20,75)) (x[1]+x[2]/2)/sum(x)
  
# LRT statistic
lrt.ex1 <- function(x=c(5,20,75)){
  mle <- mle.ex1(x)
  p0 <- c(mle^2,2*mle*(1-mle),(1-mle)^2)
  2*(dmultinom(x,prob=x/sum(x),log=TRUE) - dmultinom(x,prob=p0,log=TRUE))
}

# Chi-square statistic
chisq.ex1 <- function(x=c(5,20,75)){
  mle <- mle.ex1(x)
  expected <- sum(x)*c(mle^2,2*mle*(1-mle),(1-mle)^2)
  sum((x-expected)^2/expected)
}

# The example data
x <- c(5, 20, 75)

# The estimated allele frequency
mle <- mle.ex1(x)
mle
## [1] 0.15
# LRT statistic
LRT <- lrt.ex1(x)
LRT
## [1] 3.870598
# Chi-square stat
xsq <- chisq.ex1(x)
xsq
## [1] 4.652057
# Asymptotic approximation for the p-value for the LRT statistic
1 - pchisq(LRT, 1)
## [1] 0.04913902
# Asymptotic approximation for the p-value for chi-square statistic
1 - pchisq(xsq, 1)
## [1] 0.03101636
# Simulations to estimate the null distribution of the LRT and chi-square statistics
results <- matrix(ncol=2,nrow=1000)
set.seed(33)
for(i in 1:1000){
  mytable <- sim.ex1(100, mle)
  results[i,1] <- lrt.ex1(mytable)
  results[i,2] <- chisq.ex1(mytable)
}

# Estimated p-value for the LRT statistic
mean(results[,1] >= LRT)
## [1] 0.084
# Estimated p-value for chi-square statistic
mean(results[,2] >= xsq)
## [1] 0.026

Example 2

We can use the EM algorithm to find the MLEs of the allele frequencies. Some functions first.

estep <- function(n,p){
  pAA <- p[2]^2/(p[2]^2+2*p[2]*p[1])
  pBB <- p[3]^2/(p[3]^2+2*p[3]*p[1])
  ex <- c(n[1], pAA*n[2], (1-pAA)*n[2],
          pBB*n[3], (1-pBB)*n[3],
          n[4])
}

mstep <- function(ex){
  n <- sum(ex)
  p <- c((ex[1]+(ex[3]+ex[5])/2)/n,
         (ex[2]+(ex[3]+ex[6])/2)/n,
         (ex[4]+(ex[5]+ex[6])/2)/n)
  p
}

em <- function(n, tol=1e-5, maxit=1000, verbose=TRUE){
  curp <- c(n[1], n[2]+n[4]/2,
            n[3]+n[4]/2)/sum(n)
  curll <- llik(n,curp)
  if(verbose) cat(c(0,curp,curll),"\n")
  
  for(i in 1:maxit) {
    ex <- estep(n,curp)
    newp <- mstep(ex)
    if(max(abs(newp-curp))<tol) break
    newll <- llik(n,newp)
    if(verbose) cat(c(i,newp,newll,newll-curll),"\n")
    curp <- newp
    curll <- newll
  }
  newp
}

llik <- function(n, p,tol=1e-13){
  pp <- c(p[1]^2,
          p[2]^2+2*p[2]*p[1],
          p[3]^2+2*p[3]*p[1],
          2*p[2]*p[3])

  if(any(pp<tol & n>0)) return(-Inf)
  sum(n*log(pp))
}

Get te MLEs.

n <- c("O"=104, "A"=91, "B"=36, "AB"=19)
mle <- em(n,verbose=FALSE)
mle
##         O         A         B 
## 0.6339922 0.2499713 0.1160365

Plot the data.

p <- seq(0,1,length=251)
thellik <- matrix(ncol=length(p),nrow=length(p))
for(i in 1:length(p)) {
  for(j in which(p[i]+p<=1)) {
    thellik[i,j] <- llik(n,c(p[i],p[j],1-p[i]-p[j]))
    if(p[i]+p[j]<0 || p[i]+p[j]>1)
      cat(i,j,p[i],p[j],1-p[i]-p[j],"\n")
  }
}

par(pty="s",las=1,mar=c(4.1,4.1,0.6,0.6))
image(p,p,thellik,col=rev(rainbow(start=0,end=2/3,n=251)),
      xlab=expression(p[O]),ylab=expression(p[A]),main="")
contour(p,p,thellik,add=TRUE,levels=seq(-400,-2000,by= -100))
contour(p,p,thellik,add=TRUE,levels=seq(-400,-250,by=25)[-1],col="blue")
points(mle[1],mle[2],lwd=2,pch=4)

An alternative optimization algorithm.

dat <- c(104,91,36,19)

simple <- function(dat){
  fr <- dat/sum(dat)
  po <- sqrt(fr[1])
  pa <- sqrt(po^2+fr[2])-po
  pb <- fr[4]/pa
  c(O=po,A=pa,B=pb)
}

loglik <- function(p,dat){
  p <- c(p,1-p[1]-p[2]) 
  ll <- -(dat[1]*log(p[1]^2)+dat[2]*log(p[2]^2+2*p[2]*p[1]) + 
            dat[3]*log(p[3]^2+2*p[3]*p[1]) + 
            dat[4]*log(2*p[3]*p[2]))
}

mles <- function(dat){
  fs <- optim(p=simple(dat)[-3],loglik,dat=dat)$par
  fs <- c(fs,1-sum(fs))
  names(fs) <- c("O","A","B")
  return(fs)
}

mles(dat)
##         O         A         B 
## 0.6339820 0.2499967 0.1160212

Example 3 regarding fit of Poisson counts

# The raw data
x <- c(2, 2, 0, 0, 0,   0, 0, 1, 0, 0,
       0, 0, 4, 0, 0,   3, 0, 0, 0, 3,
       0, 5, 0, 1, 0,   1, 2, 0, 2, 1,
       0, 0, 0, 0, 0,   0, 0, 0)

# Counts
ob <- table(x)
ob
## x
##  0  1  2  3  4  5 
## 26  4  4  2  1  1
# Sample mean
mu <- mean(x)
mu
## [1] 0.7105263
# Expected counts; binning 5+ together
ex <- c(dpois(0:4, mu), 1-ppois(4,mu)) * length(x)
ex
## [1] 18.67264921 13.26740865  4.71342149  1.11633667  0.19829665  0.03188734
# Chi-square and LRT statistics
chis <- sum((ob-ex)^2/ex)
lrt <- sum(2*ob*log(ob/ex))

# P-values via asymptotic approximation
1-pchisq(chis, length(ob)-2)
## [1] 1.144142e-08
1-pchisq(lrt, length(ob)-2)
## [1] 0.0008727531
# P-value based on simulations
n.sim <- 10000
set.seed(22)
simd <- apply(matrix(rpois(n.sim*length(x), mu), ncol=n.sim),2,
              function(x) { x <-table(factor(x,levels=0:100)); x[6] <- sum(x[-(1:5)]); x[1:6] })

# Function to calculate the likelihood ratio test statistic
lrtt <- function(a,b){
    b <- b[a>0]
    a <- a[a>0]
    sum(2*a*log(a/b))
  }

# Calculate the statistics
simlrt <- apply(simd, 2, lrtt, ex)
simchisq <- apply(simd, 2, function(a,b) sum((a-b)^2/b), ex)

# The p-values
mean(simchisq >= sum((ob-ex)^2/ex))
## [1] 0.0017
mean(simlrt >= sum(2*ob*log(ob/ex)))
## [1] 5e-04
# Plot the permutation null distributions
hist(simchisq,col="lightgrey",breaks=51)
abline(v=sum((ob-ex)^2/ex),col="red",lwd=2)

hist(simlrt,col="lightgrey",breaks=51)
abline(v=sum(2*ob*log(ob/ex)),col="red",lwd=2)

Contingency tables

Example 1

x <- rbind(c(18, 2), c(11, 9))
x
##      [,1] [,2]
## [1,]   18    2
## [2,]   11    9
# Marginal totals
rs <- apply(x, 1, sum) # sum of each row
rs
## [1] 20 20
cs <- apply(x, 2, sum) # sum of each column
cs
## [1] 29 11
n  <- sum(x)
n
## [1] 40
# Expected counts
expected <- outer(rs,cs,"*")/n
expected
##      [,1] [,2]
## [1,] 14.5  5.5
## [2,] 14.5  5.5
# Chi-square statistic
chi <- sum((x-expected)^2/expected) 
chi
## [1] 6.144201
# P-value
1-pchisq(chi,1) 
## [1] 0.01318437
# Lrt statistic
lrt <- 2*sum(x * log(x/expected)) 
# P-value
1-pchisq(lrt,1)
## [1] 0.01063906
# Mind the correction factor!
chisq.test(x)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  x
## X-squared = 4.5141, df = 1, p-value = 0.03362
chisq.test(x,correct=FALSE) # same as above
## 
##  Pearson's Chi-squared test
## 
## data:  x
## X-squared = 6.1442, df = 1, p-value = 0.01318

Example 2

x <- rbind(c(9,9), c(20, 62))
x
##      [,1] [,2]
## [1,]    9    9
## [2,]   20   62
# Marginal totals
rs <- apply(x, 1, sum) # sum of each row
rs
## [1] 18 82
cs <- apply(x, 2, sum) # sum of each column
cs
## [1] 29 71
n  <- sum(x)
n
## [1] 100
# Expected counts
expected <- outer(rs,cs,"*")/n
expected
##       [,1]  [,2]
## [1,]  5.22 12.78
## [2,] 23.78 58.22
# Chi-square statistic
chi <- sum((x-expected)^2/expected) 
chi
## [1] 4.701548
# P-value
1-pchisq(chi,1) 
## [1] 0.03013546
# Lrt statistic
lrt <- 2*sum(x * log(x/expected)) 
lrt
## [1] 4.369036
# P-value
1-pchisq(lrt,1)
## [1] 0.03659769

Blood groups by state

# The data
x <- rbind(c(122, 117, 19, 244),
           c(1781, 1351, 288,3301),
           c(353, 269, 60, 713))
x
##      [,1] [,2] [,3] [,4]
## [1,]  122  117   19  244
## [2,] 1781 1351  288 3301
## [3,]  353  269   60  713
# Chi-square test for independence
chi <- chisq.test(x)
chi
## 
##  Pearson's Chi-squared test
## 
## data:  x
## X-squared = 5.6382, df = 6, p-value = 0.4649
# Expected counts
ex <- chi$expected
ex
##           [,1]      [,2]      [,3]      [,4]
## [1,]  131.4124  101.1806  21.37781  248.0292
## [2,] 1759.4078 1354.6504 286.21571 3320.7262
## [3,]  365.1799  281.1691  59.40647  689.2446
# A direct method to get the expected counts
rs <- apply(x,1,sum) # row sums
rs
## [1]  502 6721 1395
cs <- apply(x,2,sum) # col sums
cs
## [1] 2256 1737  367 4258
n <- sum(rs) # total sample size
n
## [1] 8618
ex <- outer(rs,cs,"*")/n # expected counts under independence
ex
##           [,1]      [,2]      [,3]      [,4]
## [1,]  131.4124  101.1806  21.37781  248.0292
## [2,] 1759.4078 1354.6504 286.21571 3320.7262
## [3,]  365.1799  281.1691  59.40647  689.2446
# Lrt statistic
lrt <- 2*sum(x * log(x/ex)) 
lrt
## [1] 5.548169
# Degrees of freedom
df <- prod(dim(x)-1)  
df
## [1] 6
# Another way to get the degrees of freedom
df <- (ncol(x) - 1) * (nrow(x) - 1)
df
## [1] 6
# P-value
1-pchisq(lrt, df)
## [1] 0.475654

End of code