obs <- c(35, 43, 22)
# Expected proportions (i.e., the null hypothesis)
p0 <- c(0.25, 0.5, 0.25)
# Expected counts (note: sum(obs) = total count)
exp <- sum(obs) * p0  # (note: this masks base::exp() for the session)
# LRT statistic
lrt <- 2 * sum(obs * log(obs/exp))
lrt
## [1] 4.95762
# Chi-square statistic
chis <- sum( (obs-exp)^2 / exp)
chis
## [1] 5.34
# P-value for LRT
1 - pchisq(lrt, 3-1)
## [1] 0.08384295
# P-value for chi-square test
1 - pchisq(chis, 3-1)
## [1] 0.06925223
# You can use chisq.test() to do the chi-square test
chisq.test(obs, p=c(1/4,1/2,1/4))
##
## Chi-squared test for given probabilities
##
## data: obs
## X-squared = 5.34, df = 2, p-value = 0.06925
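As an aside, pchisq() can give the upper-tail probability directly, and the object that chisq.test() returns is a list from which the pieces can be extracted. A quick sketch, using obs, p0 and chis from above:
# Upper-tail probability, equivalent to 1 - pchisq(chis, 2)
pchisq(chis, 2, lower.tail=FALSE)
# Extract pieces of the chisq.test() result
out <- chisq.test(obs, p=p0)
out$statistic  # the chi-square statistic
out$p.value    # the p-value
out$expected   # the expected counts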
I start here with two functions. Copy and paste these into R before running the next bits.
lrt <- function(x=c(35,43,22), p=c(0.25,0.5,0.25))
  2 * (dmultinom(x, prob=x/sum(x), log=TRUE) - dmultinom(x, prob=p, log=TRUE))
# Alternatively, this could have been written as follows,
lrt2 <- function(x=c(35,43,22), p=c(0.25,0.5,0.25)){
  ex <- (p*sum(x))[x > 0]  # expected counts, keeping cells with observed count > 0
  x <- x[x > 0]            # zero cells contribute 0 to the LRT
  2 * sum(x * log(x / ex))
}
chisq <- function(x=c(35,43,22), p=c(0.25,0.5,0.25)){
  # check that x and p are appropriate
  p <- p/sum(p)  # ensure that the probabilities sum to 1
  if(any(p < 0)) stop("probabilities p must be non-negative.")
  if(any(x < 0)) stop("x's must all be non-negative.")
  if(length(x) != length(p)) stop("length(x) should be the same as length(p).")
  n <- sum(x)
  expected <- n*p
  sum( (x - expected)^2 / expected )
}
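Since lrt() and lrt2() are meant to be equivalent (the multinomial coefficients cancel in the likelihood ratio), a quick sanity check is worthwhile; a sketch, run after pasting the functions above:
# The two LRT implementations should agree up to floating-point error
all.equal(lrt(c(35,43,22), p=c(0.25,0.5,0.25)),
          lrt2(c(35,43,22), p=c(0.25,0.5,0.25)))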
# The 35:43:22 table
x <- c(35, 43, 22)
# Note that you might also have started with a vector of 1's, 2's and 3's
# and used the function table() instead, as follows:
y <- rep(1:3, c(35, 43, 22))
x <- table(y)
x
## y
## 1 2 3
## 35 43 22
# Calculate LRT test statistic to test the 1:2:1 proportions
LRT <- lrt(x, p=c(0.25, 0.5, 0.25))
LRT
## [1] 4.95762
# Calculate chi-square statistic to test the 1:2:1 proportions
xsq <- chisq(x, p=c(0.25, 0.5, 0.25))
xsq
## [1] 5.34
# You could also use the built-in function chisq.test()
chisq.test(x, p=c(0.25, 0.5, 0.25))
##
## Chi-squared test for given probabilities
##
## data: x
## X-squared = 5.34, df = 2, p-value = 0.06925
# P-value for LRT statistic, using the asymptotic approximation
1 - pchisq(LRT, 2)
## [1] 0.08384295
# P-value for chi-square statistic, using the asymptotic approximation
1 - pchisq(xsq, 2)
## [1] 0.06925223
# Simulations to estimate the null distributions of the LRT and chi-square statistics
set.seed(45)
simdat <- rmultinom(1000, 100, c(0.25, 0.5, 0.25))
results <- cbind(apply(simdat, 2, lrt, p=c(0.25, 0.5, 0.25)),
                 apply(simdat, 2, chisq, p=c(0.25, 0.5, 0.25)))
# Estimated p-value for LRT stat
mean(results[,1] >= LRT)
## [1] 0.111
# Estimated p-value for chi-square stat
mean(results[,2] >= xsq)
## [1] 0.095
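To see how well the asymptotic approximation does here, one can overlay the chi-square(2) density on the simulated null distribution. A sketch, using the results matrix from above (column 2 holds the chi-square statistics):
# Simulated null distribution vs the asymptotic chi-square(2) density
hist(results[,2], breaks=51, freq=FALSE, col="lightgrey",
     main="", xlab="chi-square statistic")
curve(dchisq(x, 2), add=TRUE, col="red", lwd=2)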
I start with some additional functions:
# Simulate data
sim.ex1 <- function(n=100, f=0.2){
  table(factor(sample(c("AA","AB","BB"), n, replace=TRUE,
                      prob=c(f^2, 2*f*(1-f), (1-f)^2)),
               levels=c("AA","AB","BB")))
}
# MLE under H0
mle.ex1 <- function(x=c(5,20,75)) (x[1]+x[2]/2)/sum(x)
# LRT statistic
lrt.ex1 <- function(x=c(5,20,75)){
  mle <- mle.ex1(x)
  p0 <- c(mle^2, 2*mle*(1-mle), (1-mle)^2)
  2*(dmultinom(x, prob=x/sum(x), log=TRUE) - dmultinom(x, prob=p0, log=TRUE))
}
# Chi-square statistic
chisq.ex1 <- function(x=c(5,20,75)){
  mle <- mle.ex1(x)
  expected <- sum(x)*c(mle^2, 2*mle*(1-mle), (1-mle)^2)
  sum((x-expected)^2/expected)
}
# The example data
x <- c(5, 20, 75)
# The estimated allele frequency
mle <- mle.ex1(x)
mle
## [1] 0.15
# LRT statistic
LRT <- lrt.ex1(x)
LRT
## [1] 3.870598
# Chi-square statistic
xsq <- chisq.ex1(x)
xsq
## [1] 4.652057
# Asymptotic approximation for the p-value for the LRT statistic
1 - pchisq(LRT, 1)
## [1] 0.04913902
# Asymptotic approximation for the p-value for chi-square statistic
1 - pchisq(xsq, 1)
## [1] 0.03101636
# Simulations to estimate the null distribution of the LRT and chi-square statistics
results <- matrix(ncol=2, nrow=1000)
set.seed(33)
for(i in 1:1000){
  mytable <- sim.ex1(100, mle)
  results[i,1] <- lrt.ex1(mytable)
  results[i,2] <- chisq.ex1(mytable)
}
# Estimated p-value for the LRT statistic
mean(results[,1] >= LRT)
## [1] 0.084
# Estimated p-value for chi-square statistic
mean(results[,2] >= xsq)
## [1] 0.026
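The asymptotic and simulation-based p-values disagree more noticeably here. A quantile-quantile plot of the simulated statistics against the chi-square(1) distribution makes the discrepancy visible; a sketch:
# QQ-plot of simulated chi-square statistics against chi-square(1) quantiles
qqplot(qchisq(ppoints(1000), 1), results[,2],
       xlab="chi-square(1) quantiles", ylab="simulated chi-square statistics")
abline(0, 1, col="red", lwd=2)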
We can use the EM algorithm to find the MLEs of the allele frequencies. Some functions first.
estep <- function(n, p){
  # P(genotype AA | phenotype A) and P(genotype BB | phenotype B)
  pAA <- p[2]^2/(p[2]^2 + 2*p[2]*p[1])
  pBB <- p[3]^2/(p[3]^2 + 2*p[3]*p[1])
  # expected genotype counts: OO, AA, AO, BB, BO, AB
  ex <- c(n[1], pAA*n[2], (1-pAA)*n[2],
          pBB*n[3], (1-pBB)*n[3],
          n[4])
  ex
}
mstep <- function(ex){
  n <- sum(ex)
  # allele frequencies: genotype counts with heterozygotes counted half
  p <- c((ex[1] + (ex[3]+ex[5])/2)/n,
         (ex[2] + (ex[3]+ex[6])/2)/n,
         (ex[4] + (ex[5]+ex[6])/2)/n)
  p
}
em <- function(n, tol=1e-5, maxit=1000, verbose=TRUE){
  # starting values: treat each phenotype count as if it were a genotype count
  curp <- c(n[1], n[2]+n[4]/2, n[3]+n[4]/2)/sum(n)
  curll <- llik(n, curp)
  if(verbose) cat(c(0, curp, curll), "\n")
  for(i in 1:maxit) {
    ex <- estep(n, curp)   # E step: expected genotype counts
    newp <- mstep(ex)      # M step: updated allele frequencies
    if(max(abs(newp-curp)) < tol) break
    newll <- llik(n, newp)
    if(verbose) cat(c(i, newp, newll, newll-curll), "\n")
    curp <- newp
    curll <- newll
  }
  newp
}
llik <- function(n, p, tol=1e-13){
  # phenotype probabilities for O, A, B, AB under allele frequencies p
  pp <- c(p[1]^2,
          p[2]^2 + 2*p[2]*p[1],
          p[3]^2 + 2*p[3]*p[1],
          2*p[2]*p[3])
  if(any(pp < tol & n > 0)) return(-Inf)
  sum(n*log(pp))
}
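Before running the full EM, it can help to trace one E-step/M-step cycle by hand. A sketch, starting from equal allele frequencies:
# One EM cycle: expected genotype counts, then updated allele frequencies
n <- c("O"=104, "A"=91, "B"=36, "AB"=19)
ex1 <- estep(n, c(1/3, 1/3, 1/3))  # expected counts of OO, AA, AO, BB, BO, AB
p1 <- mstep(ex1)                   # updated estimates of (pO, pA, pB)
llik(n, p1)                        # the log-likelihood increases at each cycle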
Get the MLEs.
n <- c("O"=104, "A"=91, "B"=36, "AB"=19)
mle <- em(n,verbose=FALSE)
mle
## O A B
## 0.6339922 0.2499713 0.1160365
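As a quick check of fit, compare the fitted phenotype probabilities under the MLEs to the observed phenotype frequencies; a sketch:
# Fitted phenotype probabilities (O, A, B, AB) vs observed frequencies
ph <- unname(mle)
fitted <- c(ph[1]^2, ph[2]^2 + 2*ph[2]*ph[1],
            ph[3]^2 + 2*ph[3]*ph[1], 2*ph[2]*ph[3])
round(rbind(fitted=fitted, observed=n/sum(n)), 4)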
Plot the log-likelihood surface.
p <- seq(0, 1, length=251)
thellik <- matrix(ncol=length(p), nrow=length(p))
for(i in 1:length(p)) {
  for(j in which(p[i] + p <= 1)) {  # only grid points with p_O + p_A <= 1
    thellik[i,j] <- llik(n, c(p[i], p[j], 1 - p[i] - p[j]))
  }
}
par(pty="s", las=1, mar=c(4.1,4.1,0.6,0.6))
image(p, p, thellik, col=rev(rainbow(start=0, end=2/3, n=251)),
      xlab=expression(p[O]), ylab=expression(p[A]), main="")
contour(p, p, thellik, add=TRUE, levels=seq(-400, -2000, by= -100))
contour(p, p, thellik, add=TRUE, levels=seq(-400, -250, by=25)[-1], col="blue")
points(mle[1], mle[2], lwd=2, pch=4)
An alternative optimization approach: numerical minimization of the negative log-likelihood with optim().
dat <- c(104,91,36,19)
simple <- function(dat){
  # crude moment-based starting values for the allele frequencies
  fr <- dat/sum(dat)
  po <- sqrt(fr[1])              # freq(O phenotype) = po^2
  pa <- sqrt(po^2 + fr[2]) - po  # freq(O) + freq(A) = (po + pa)^2
  pb <- fr[4]/(2*pa)             # freq(AB) = 2*pa*pb
  c(O=po, A=pa, B=pb)
}
loglik <- function(p, dat){
  # negative log-likelihood (optim() minimizes by default)
  p <- c(p, 1 - p[1] - p[2])
  -(dat[1]*log(p[1]^2) + dat[2]*log(p[2]^2 + 2*p[2]*p[1]) +
      dat[3]*log(p[3]^2 + 2*p[3]*p[1]) +
      dat[4]*log(2*p[3]*p[2]))
}
mles <- function(dat){
  fs <- optim(par=simple(dat)[-3], fn=loglik, dat=dat)$par
  fs <- c(fs, 1 - sum(fs))
  names(fs) <- c("O","A","B")
  return(fs)
}
mles(dat)
## O A B
## 0.6339820 0.2499967 0.1160212
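The EM and optim() estimates should agree to several decimal places; a sketch of a direct comparison, using mle from the EM fit above:
# Side-by-side comparison of the two sets of MLEs
round(rbind(EM=mle, optim=mles(dat)), 5)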
# The raw data
x <- c(2, 2, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 4, 0, 0, 3, 0, 0, 0, 3,
       0, 5, 0, 1, 0, 1, 2, 0, 2, 1,
       0, 0, 0, 0, 0, 0, 0, 0)
# Counts
ob <- table(x)
ob
## x
## 0 1 2 3 4 5
## 26 4 4 2 1 1
# Sample mean
mu <- mean(x)
mu
## [1] 0.7105263
# Expected counts; binning 5+ together
ex <- c(dpois(0:4, mu), 1-ppois(4,mu)) * length(x)
ex
## [1] 18.67264921 13.26740865 4.71342149 1.11633667 0.19829665 0.03188734
# Chi-square and LRT statistics
chis <- sum((ob-ex)^2/ex)
lrt <- sum(2*ob*log(ob/ex))
# P-values via asymptotic approximation
# df = (no. of categories) - 1 - (no. of estimated parameters) = 6 - 1 - 1 = 4
1-pchisq(chis, length(ob)-2)
## [1] 1.144142e-08
1-pchisq(lrt, length(ob)-2)
## [1] 0.0008727531
# P-value based on simulations
n.sim <- 10000
set.seed(22)
simd <- apply(matrix(rpois(n.sim*length(x), mu), ncol=n.sim), 2,
              function(x) { x <- table(factor(x, levels=0:100)); x[6] <- sum(x[-(1:5)]); x[1:6] })
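A quick sanity check on the simulated tables (a sketch): each column of simd should be a 6-category table for length(x) = 38 observations.
stopifnot(all(colSums(simd) == length(x)))
dim(simd)  # 6 categories by n.sim simulated tables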
# Function to calculate the likelihood ratio test statistic
lrtt <- function(a, b){
  b <- b[a > 0]  # drop cells with zero observed count
  a <- a[a > 0]  # (they contribute 0 to the LRT)
  sum(2*a*log(a/b))
}
# Calculate the statistics
simlrt <- apply(simd, 2, lrtt, ex)
simchisq <- apply(simd, 2, function(a,b) sum((a-b)^2/b), ex)
# The p-values
mean(simchisq >= sum((ob-ex)^2/ex))
## [1] 0.0017
mean(simlrt >= sum(2*ob*log(ob/ex)))
## [1] 5e-04
# Plot the simulated null distributions
hist(simchisq,col="lightgrey",breaks=51)
abline(v=sum((ob-ex)^2/ex),col="red",lwd=2)
hist(simlrt,col="lightgrey",breaks=51)
abline(v=sum(2*ob*log(ob/ex)),col="red",lwd=2)
x <- rbind(c(18, 2), c(11, 9))
x
## [,1] [,2]
## [1,] 18 2
## [2,] 11 9
# Marginal totals
rs <- apply(x, 1, sum) # sum of each row
rs
## [1] 20 20
cs <- apply(x, 2, sum) # sum of each column
cs
## [1] 29 11
n <- sum(x)
n
## [1] 40
# Expected counts
expected <- outer(rs,cs,"*")/n
expected
## [,1] [,2]
## [1,] 14.5 5.5
## [2,] 14.5 5.5
# Chi-square statistic
chi <- sum((x-expected)^2/expected)
chi
## [1] 6.144201
# P-value
1-pchisq(chi,1)
## [1] 0.01318437
# LRT statistic
lrt <- 2*sum(x * log(x/expected))
lrt
## [1] 6.524631
# P-value
1-pchisq(lrt,1)
## [1] 0.01063906
# Mind the correction factor!
chisq.test(x)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: x
## X-squared = 4.5141, df = 1, p-value = 0.03362
chisq.test(x,correct=FALSE) # matches the statistic computed by hand above
##
## Pearson's Chi-squared test
##
## data: x
## X-squared = 6.1442, df = 1, p-value = 0.01318
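As a check on the hand calculation, chisq.test() also returns the expected counts; a sketch:
# These should match outer(rs,cs,"*")/n from above
chisq.test(x, correct=FALSE)$expected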
x <- rbind(c(9,9), c(20, 62))
x
## [,1] [,2]
## [1,] 9 9
## [2,] 20 62
# Marginal totals
rs <- apply(x, 1, sum) # sum of each row
rs
## [1] 18 82
cs <- apply(x, 2, sum) # sum of each column
cs
## [1] 29 71
n <- sum(x)
n
## [1] 100
# Expected counts
expected <- outer(rs,cs,"*")/n
expected
## [,1] [,2]
## [1,] 5.22 12.78
## [2,] 23.78 58.22
# Chi-square statistic
chi <- sum((x-expected)^2/expected)
chi
## [1] 4.701548
# P-value
1-pchisq(chi,1)
## [1] 0.03013546
# LRT statistic
lrt <- 2*sum(x * log(x/expected))
lrt
## [1] 4.369036
# P-value
1-pchisq(lrt,1)
## [1] 0.03659769
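With one expected count barely above 5, the asymptotic approximation is shaky here; chisq.test() can instead compute a Monte Carlo p-value, conditional on the margins. A sketch, where B is the number of simulated tables:
set.seed(1)
chisq.test(x, simulate.p.value=TRUE, B=10000)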
# The data
x <- rbind(c( 122,  117,  19,  244),
           c(1781, 1351, 288, 3301),
           c( 353,  269,  60,  713))
x
## [,1] [,2] [,3] [,4]
## [1,] 122 117 19 244
## [2,] 1781 1351 288 3301
## [3,] 353 269 60 713
# Chi-square test for independence
chi <- chisq.test(x)
chi
##
## Pearson's Chi-squared test
##
## data: x
## X-squared = 5.6382, df = 6, p-value = 0.4649
# Expected counts
ex <- chi$expected
ex
## [,1] [,2] [,3] [,4]
## [1,] 131.4124 101.1806 21.37781 248.0292
## [2,] 1759.4078 1354.6504 286.21571 3320.7262
## [3,] 365.1799 281.1691 59.40647 689.2446
# A direct method to get the expected counts
rs <- apply(x,1,sum) # row sums
rs
## [1] 502 6721 1395
cs <- apply(x,2,sum) # col sums
cs
## [1] 2256 1737 367 4258
n <- sum(rs) # total sample size
n
## [1] 8618
ex <- outer(rs,cs,"*")/n # expected counts under independence
ex
## [,1] [,2] [,3] [,4]
## [1,] 131.4124 101.1806 21.37781 248.0292
## [2,] 1759.4078 1354.6504 286.21571 3320.7262
## [3,] 365.1799 281.1691 59.40647 689.2446
# LRT statistic
lrt <- 2*sum(x * log(x/ex))
lrt
## [1] 5.548169
# Degrees of freedom
df <- prod(dim(x)-1)
df
## [1] 6
# Another way to get the degrees of freedom
df <- (ncol(x) - 1) * (nrow(x) - 1)
df
## [1] 6
# P-value
1-pchisq(lrt, df)
## [1] 0.475654
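Since the LRT statistic for a two-way table has now been computed by hand twice, it may be worth wrapping it in a small helper. A sketch (lrt.2way is a made-up name; zero cells, which contribute 0 to the LRT, are dropped):
# LRT statistic for independence in an r x c table
lrt.2way <- function(x){
  e <- outer(rowSums(x), colSums(x))/sum(x)  # expected counts under independence
  2*sum(x[x>0] * log(x[x>0]/e[x>0]))         # zero cells contribute 0
}
lrt.2way(x)  # should reproduce the lrt value above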