Statistics for Laboratory Scientists (140.615)

Contingency Tables

Examples from class

Example 1

The 2x2 table.

x <- rbind(c(18,2),c(11,9))
x
##      [,1] [,2]
## [1,]   18    2
## [2,]   11    9
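
The same table can also be built with matrix(), which fills column-wise by default:

matrix(c(18,11,2,9),nrow=2)
##      [,1] [,2]
## [1,]   18    2
## [2,]   11    9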

The margins: row sums, column sums, and the overall total.

rs <- apply(x,1,sum) 
rs
## [1] 20 20
cs <- apply(x,2,sum) 
cs
## [1] 29 11
n  <- sum(x)
n
## [1] 40
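
The dedicated functions rowSums() and colSums() compute the same margins:

rowSums(x)
## [1] 20 20
colSums(x)
## [1] 29 11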

The expected counts under the null hypothesis of independence.

e <- outer(rs,cs,"*")/n
e
##      [,1] [,2]
## [1,] 14.5  5.5
## [2,] 14.5  5.5
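
Each expected count is the corresponding row total times the column total, divided by the overall total. For the top-left cell:

rs[1]*cs[1]/n
## [1] 14.5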

The chi-square test: compare sum((observed-expected)^2/expected) to a chi-square distribution, here with (2-1)*(2-1) = 1 degree of freedom.

xsq <- sum((x-e)^2/e) 
xsq
## [1] 6.144201
pchisq(xsq,1,lower.tail=FALSE) 
## [1] 0.01318437

The likelihood ratio test: the statistic is 2*sum(observed*log(observed/expected)), again compared to a chi-square distribution with 1 degree of freedom.

lrt <- 2*sum(x*log(x/e)) 
lrt
## [1] 6.524631
pchisq(lrt,1,lower.tail=FALSE)
## [1] 0.01063906

The built-in function for the chi-square test. Note that for 2x2 tables, Yates' continuity correction is used by default.

chisq.test(x)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  x
## X-squared = 4.5141, df = 1, p-value = 0.03362
chisq.test(x,correct=FALSE) 
## 
##  Pearson's Chi-squared test
## 
## data:  x
## X-squared = 6.1442, df = 1, p-value = 0.01318
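
Yates' correction subtracts 0.5 from each |observed - expected| before squaring (see ?chisq.test); computing it by hand reproduces the corrected statistic above:

xsq.yates <- sum((abs(x-e)-0.5)^2/e)
xsq.yates
## [1] 4.514107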

You can also get the expected counts from this function.

chisq.test(x)$expected
##      [,1] [,2]
## [1,] 14.5  5.5
## [2,] 14.5  5.5

Fisher’s exact test.

fisher.test(x) 
## 
##  Fisher's Exact Test for Count Data
## 
## data:  x
## p-value = 0.03095
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   1.147793 78.183838
## sample estimates:
## odds ratio 
##   6.994073
fisher.test(x)$p.value
## [1] 0.03095031
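
Note that the odds ratio reported by fisher.test() is the conditional maximum likelihood estimate, which is why it differs from the simple cross-product ratio:

x[1,1]*x[2,2]/(x[1,2]*x[2,1])
## [1] 7.363636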

Example 2

x <- rbind(c(9,9),c(20,62))
x
##      [,1] [,2]
## [1,]    9    9
## [2,]   20   62
rs <- apply(x,1,sum) 
cs <- apply(x,2,sum) 
n  <- sum(x)
e <- outer(rs,cs,"*")/n
e
##       [,1]  [,2]
## [1,]  5.22 12.78
## [2,] 23.78 58.22
xsq <- sum((x-e)^2/e) 
xsq
## [1] 4.701548
pchisq(xsq,1,lower.tail=FALSE) 
## [1] 0.03013546
lrt <- 2*sum(x*log(x/e)) 
lrt
## [1] 4.369036
pchisq(lrt,1,lower.tail=FALSE)
## [1] 0.03659769
chisq.test(x)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  x
## X-squared = 3.54, df = 1, p-value = 0.0599
chisq.test(x,correct=FALSE) 
## 
##  Pearson's Chi-squared test
## 
## data:  x
## X-squared = 4.7015, df = 1, p-value = 0.03014
fisher.test(x) 
## 
##  Fisher's Exact Test for Count Data
## 
## data:  x
## p-value = 0.04386
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   0.9356744 10.0979315
## sample estimates:
## odds ratio 
##   3.059762

Example 3: paired data

z <- rbind(c(9,9),c(20,62))
z
##      [,1] [,2]
## [1,]    9    9
## [2,]   20   62

McNemar’s test, based only on the discordant pairs (the off-diagonal counts).

z[1,2]
## [1] 9
z[2,1]
## [1] 20
xsq <- (z[1,2]-z[2,1])^2/(z[1,2]+z[2,1])
xsq
## [1] 4.172414
pchisq(xsq,1,lower.tail=FALSE) 
## [1] 0.04108722

Or use the built-in function. Again, mind the continuity correction, which is used by default.

mcnemar.test(z)
## 
##  McNemar's Chi-squared test with continuity correction
## 
## data:  z
## McNemar's chi-squared = 3.4483, df = 1, p-value = 0.06332
mcnemar.test(z,correct=FALSE)
## 
##  McNemar's Chi-squared test
## 
## data:  z
## McNemar's chi-squared = 4.1724, df = 1, p-value = 0.04109
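
The continuity-corrected version subtracts 1 from the absolute difference of the discordant counts before squaring; computing it by hand reproduces the default output above:

(abs(z[1,2]-z[2,1])-1)^2/(z[1,2]+z[2,1])
## [1] 3.448276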

Another alternative is the exact binomial test: under the null hypothesis, the discordant pairs should split 50:50.

binom.test(z[1,2],z[1,2]+z[2,1]) 
## 
##  Exact binomial test
## 
## data:  z[1, 2] and z[1, 2] + z[2, 1]
## number of successes = 9, number of trials = 29, p-value = 0.06143
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
##  0.1528459 0.5083234
## sample estimates:
## probability of success 
##              0.3103448

Example 4: blood groups by state

x <- rbind(c(122,117,19,244),
           c(1781,1351,288,3301),
           c(353,269,60,713))
x
##      [,1] [,2] [,3] [,4]
## [1,]  122  117   19  244
## [2,] 1781 1351  288 3301
## [3,]  353  269   60  713
rs <- apply(x,1,sum) 
rs
## [1]  502 6721 1395
cs <- apply(x,2,sum) 
cs
## [1] 2256 1737  367 4258
n  <- sum(x)
n
## [1] 8618
e <- outer(rs,cs,"*")/n
e
##           [,1]      [,2]      [,3]      [,4]
## [1,]  131.4124  101.1806  21.37781  248.0292
## [2,] 1759.4078 1354.6504 286.21571 3320.7262
## [3,]  365.1799  281.1691  59.40647  689.2446
chisq.test(x)$expected
##           [,1]      [,2]      [,3]      [,4]
## [1,]  131.4124  101.1806  21.37781  248.0292
## [2,] 1759.4078 1354.6504 286.21571 3320.7262
## [3,]  365.1799  281.1691  59.40647  689.2446
df <- (ncol(x)-1)*(nrow(x)-1)
df
## [1] 6
dim(x)
## [1] 3 4
dim(x)-1
## [1] 2 3
prod(dim(x)-1)  
## [1] 6
xsq <- sum((x-e)^2/e) 
xsq
## [1] 5.63817
pchisq(xsq,df,lower.tail=FALSE) 
## [1] 0.4649167
lrt <- 2*sum(x*log(x/e)) 
lrt
## [1] 5.548169
pchisq(lrt,df,lower.tail=FALSE)
## [1] 0.475654

The asymptotic null distribution should be just fine, but here is Fisher’s test anyway, using simulation to estimate the p-value (the argument simulate is partially matched to simulate.p.value). Note that plain fisher.test(x) won’t work here, as the exact computation is infeasible for a table with counts this large - try it!

set.seed(1)
fisher.test(x,simulate=TRUE)
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on 2000 replicates)
## 
## data:  x
## p-value = 0.4943
## alternative hypothesis: two.sided
fisher.test(x,simulate=TRUE,B=10000)
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on 10000 replicates)
## 
## data:  x
## p-value = 0.4907
## alternative hypothesis: two.sided
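
The built-in chi-square test can also estimate its p-value by simulation. A sketch (the result is random and varies with the seed, but should land near the asymptotic p-value of about 0.465):

chisq.test(x,simulate.p.value=TRUE,B=10000)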

Example 5: survival rates in five strains

x <- rbind(c(15,5),c(17,3),c(10,10),c(17,3),c(16,4))
x
##      [,1] [,2]
## [1,]   15    5
## [2,]   17    3
## [3,]   10   10
## [4,]   17    3
## [5,]   16    4
e <- chisq.test(x)$expected
e
##      [,1] [,2]
## [1,]   15    5
## [2,]   15    5
## [3,]   15    5
## [4,]   15    5
## [5,]   15    5
df <- chisq.test(x)$parameter 
df
## df 
##  4
xsq <- sum((x-e)^2/e) 
xsq
## [1] 9.066667
pchisq(xsq,df,lower.tail=FALSE) 
## [1] 0.05945456
lrt <- 2*sum(x*log(x/e)) 
lrt
## [1] 8.414912
pchisq(lrt,df,lower.tail=FALSE)
## [1] 0.07750874
fisher.test(x)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  x
## p-value = 0.0874
## alternative hypothesis: two.sided

Note: the continuity correction in the built-in chi-square test function applies only to 2x2 tables, so correct=FALSE makes no difference here.

chisq.test(x)
## 
##  Pearson's Chi-squared test
## 
## data:  x
## X-squared = 9.0667, df = 4, p-value = 0.05945
chisq.test(x,correct=FALSE) 
## 
##  Pearson's Chi-squared test
## 
## data:  x
## X-squared = 9.0667, df = 4, p-value = 0.05945
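
Since each row gives survivors and deaths out of 20, prop.test() offers an equivalent test of equal survival probabilities across the five strains; its X-squared, df, and p-value should match the uncorrected chi-square test above.

prop.test(x)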

The Hypergeometric distribution

Consider an urn with 30 balls, of which 12 are white and 18 are black.

Suppose we draw 10 balls without replacement and count the number of white balls obtained.

Simulate this.

set.seed(1)
rhyper(1,12,18,10) 
## [1] 3

Simulate this 100 times.

rhyper(100,12,18,10)
##   [1] 4 4 6 3 6 6 5 4 2 3 3 5 4 5 4 5 7 4 5 6 3 4 3 3 4 1 4 5 3 4 4 4 3 5 5 5 2 5 4 5 4 5 4 4 5 2 4 5 5 4 5 4
##  [53] 3 2 2 3 4 5 4 6 3 4 3 4 3 4 5 2 5 3 5 3 3 4 6 5 4 5 6 4 5 4 3 5 3 5 3 3 3 3 2 4 5 5 5 4 4 5 4 4
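
The expected number of white balls per draw is 10*12/30 = 4, so a larger simulation should average close to 4:

mean(rhyper(10000,12,18,10))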

What is the probability of getting no white balls in the sample?

dhyper(0,12,18,10)
## [1] 0.001456415

What is the probability of getting exactly 2 white balls in the sample?

dhyper(2,12,18,10)
## [1] 0.09612337
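
The same probability directly from binomial coefficients: choose 2 of the 12 white balls and 8 of the 18 black balls, out of all ways to draw 10 from 30.

choose(12,2)*choose(18,8)/choose(30,10)
## [1] 0.09612337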

What is the probability of getting 2 or fewer white balls in the sample?

phyper(2,12,18,10)
## [1] 0.1169986

The 2.5th and 97.5th percentiles of the number of white balls drawn.

qhyper(c(0.025, 0.975),12,18,10)
## [1] 2 6

Example 1 revisited

The Hypergeometric probabilities from class.

dhyper(20,29,11,20)
## [1] 7.26533e-05
dhyper(19,29,11,20)
## [1] 0.001598373
dhyper(18,29,11,20)
## [1] 0.01380413

The table probabilities from class. Given the fixed margins (row totals 20 and 20, column totals 29 and 11), the top-left cell count can range from 9 to 20, hence the indices 20:9.

dhyper(20:9,29,11,20)
##  [1] 0.0000726533 0.0015983726 0.0138041267 0.0621185702 0.1624639528 0.2599423245 0.2599423245 0.1624639528
##  [9] 0.0621185702 0.0138041267 0.0015983726 0.0000726533
sum(dhyper(20:9,29,11,20))
## [1] 1
dhyper(20:18,29,11,20)
## [1] 0.0000726533 0.0015983726 0.0138041267
dhyper(11:9,29,11,20)
## [1] 0.0138041267 0.0015983726 0.0000726533

Fisher’s exact test, the pedestrian way: add up the probabilities of the tables in both tails that are at least as extreme as the observed one.

sum(dhyper(20:18,29,11,20))+sum(dhyper(11:9,29,11,20))
## [1] 0.03095031

Alternatively, for a two-sided test, sum the probabilities of all tables that are no more likely than the observed table:

pobs <- dhyper(18,29,11,20)
pobs
## [1] 0.01380413
p <- dhyper(20:9,29,11,20)
p
##  [1] 0.0000726533 0.0015983726 0.0138041267 0.0621185702 0.1624639528 0.2599423245 0.2599423245 0.1624639528
##  [9] 0.0621185702 0.0138041267 0.0015983726 0.0000726533
p<=pobs
##  [1]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE
p[p<=pobs]
## [1] 0.0000726533 0.0015983726 0.0138041267 0.0138041267 0.0015983726 0.0000726533
sum(p[p<=pobs])
## [1] 0.03095031

Imagine you knew that the survival probability in strain A could not be better than the survival probability in strain B. You would then carry out a one-sided test.

x <- rbind(c(18,2),c(11,9))
x
##      [,1] [,2]
## [1,]   18    2
## [2,]   11    9
fisher.test(x,alternative="greater")
## 
##  Fisher's Exact Test for Count Data
## 
## data:  x
## p-value = 0.01548
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  1.431622      Inf
## sample estimates:
## odds ratio 
##   6.994073

Using the Hypergeometric distribution, we would add up only those tables that are at least as extreme, i.e., those with outcome probabilities no bigger than that of the observed table and for which the fraction of survivors in strain A is less than that in strain B.

sum(dhyper(20:18,29,11,20))
## [1] 0.01547515