The 2x2 table.
x <- rbind(c(18,2),c(11,9))
x
## [,1] [,2]
## [1,] 18 2
## [2,] 11 9
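As an aside, a labeled copy of the table can be easier to read; the group and outcome labels below are made up purely for illustration.
xl <- x
dimnames(xl) <- list(group=c("g1","g2"), outcome=c("yes","no"))
xl
## outcome
## group yes no
## g1 18 2
## g2 11 9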
The margins.
rs <- apply(x,1,sum)
rs
## [1] 20 20
cs <- apply(x,2,sum)
cs
## [1] 29 11
n <- sum(x)
n
## [1] 40
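As an aside, base R's rowSums and colSums compute the same margins more directly.
rowSums(x)
## [1] 20 20
colSums(x)
## [1] 29 11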
The expected counts.
e <- outer(rs,cs,"*")/n
e
## [,1] [,2]
## [1,] 14.5 5.5
## [2,] 14.5 5.5
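A quick sanity check: the expected table has the same margins as the observed one.
rowSums(e)
## [1] 20 20
colSums(e)
## [1] 29 11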
The chi-square test.
xsq <- sum((x-e)^2/e)
xsq
## [1] 6.144201
pchisq(xsq,1,lower.tail=FALSE)
## [1] 0.01318437
The likelihood ratio test.
lrt <- 2*sum(x*log(x/e))
lrt
## [1] 6.524631
pchisq(lrt,1,lower.tail=FALSE)
## [1] 0.01063906
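One caveat: x*log(x/e) evaluates to NaN for a zero cell, while by convention such a cell should contribute 0 to the statistic. A small sketch that guards against this (lrt0 is a made-up helper name); it reproduces the value above, since this table has no zero cells.
lrt0 <- function(obs, expd) 2*sum(ifelse(obs > 0, obs*log(obs/expd), 0))
lrt0(x, e)
## [1] 6.524631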
The built-in function for the chi-square test. Note that for 2x2 tables, Yates' continuity correction is applied by default.
chisq.test(x)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: x
## X-squared = 4.5141, df = 1, p-value = 0.03362
chisq.test(x,correct=FALSE)
##
## Pearson's Chi-squared test
##
## data: x
## X-squared = 6.1442, df = 1, p-value = 0.01318
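For reference, the Yates-corrected statistic can be reproduced by hand by shrinking each absolute deviation by 0.5 before squaring.
sum((abs(x-e)-0.5)^2/e)
## [1] 4.514107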
You can also get the expected counts from this function.
chisq.test(x)$expected
## [,1] [,2]
## [1,] 14.5 5.5
## [2,] 14.5 5.5
Fisher’s exact test.
fisher.test(x)
##
## Fisher's Exact Test for Count Data
##
## data: x
## p-value = 0.03095
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.147793 78.183838
## sample estimates:
## odds ratio
## 6.994073
fisher.test(x)$p.value
## [1] 0.03095031
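Note that the odds ratio reported by fisher.test is the conditional maximum likelihood estimate, not the sample odds ratio, which can be computed directly.
x[1,1]*x[2,2]/(x[1,2]*x[2,1])
## [1] 7.363636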
Another 2x2 table.
x <- rbind(c(9,9),c(20,62))
x
## [,1] [,2]
## [1,] 9 9
## [2,] 20 62
rs <- apply(x,1,sum)
cs <- apply(x,2,sum)
n <- sum(x)
e <- outer(rs,cs,"*")/n
e
## [,1] [,2]
## [1,] 5.22 12.78
## [2,] 23.78 58.22
xsq <- sum((x-e)^2/e)
xsq
## [1] 4.701548
pchisq(xsq,1,lower.tail=FALSE)
## [1] 0.03013546
lrt <- 2*sum(x*log(x/e))
lrt
## [1] 4.369036
pchisq(lrt,1,lower.tail=FALSE)
## [1] 0.03659769
chisq.test(x)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: x
## X-squared = 3.54, df = 1, p-value = 0.0599
chisq.test(x,correct=FALSE)
##
## Pearson's Chi-squared test
##
## data: x
## X-squared = 4.7015, df = 1, p-value = 0.03014
fisher.test(x)
##
## Fisher's Exact Test for Count Data
##
## data: x
## p-value = 0.04386
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9356744 10.0979315
## sample estimates:
## odds ratio
## 3.059762
z <- rbind(c(9,9),c(20,62))
z
## [,1] [,2]
## [1,] 9 9
## [2,] 20 62
McNemar’s test for paired data, which uses only the discordant counts.
z[1,2]
## [1] 9
z[2,1]
## [1] 20
xsq <- (z[1,2]-z[2,1])^2/(z[1,2]+z[2,1])
xsq
## [1] 4.172414
pchisq(xsq,1,lower.tail=FALSE)
## [1] 0.04108722
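The continuity-corrected version, computed by hand, shrinks the absolute difference of the discordant counts by 1.
(abs(z[1,2]-z[2,1])-1)^2/(z[1,2]+z[2,1])
## [1] 3.448276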
Or use the built-in function. Again, mind the continuity correction, which is applied by default.
mcnemar.test(z)
##
## McNemar's Chi-squared test with continuity correction
##
## data: z
## McNemar's chi-squared = 3.4483, df = 1, p-value = 0.06332
mcnemar.test(z,correct=FALSE)
##
## McNemar's Chi-squared test
##
## data: z
## McNemar's chi-squared = 4.1724, df = 1, p-value = 0.04109
Another alternative is the exact Binomial test, treating the discordant pairs as Bernoulli trials with null success probability 1/2.
binom.test(z[1,2],z[1,2]+z[2,1])
##
## Exact binomial test
##
## data: z[1, 2] and z[1, 2] + z[2, 1]
## number of successes = 9, number of trials = 29, p-value = 0.06143
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
## 0.1528459 0.5083234
## sample estimates:
## probability of success
## 0.3103448
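Since the null success probability is 1/2, the Binomial distribution is symmetric and the exact two-sided p-value is twice the lower tail; a hand computation for comparison:
2*pbinom(min(z[1,2],z[2,1]),z[1,2]+z[2,1],0.5)
## [1] 0.06142835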
A larger example: a 3x4 table.
x <- rbind(c(122,117,19,244),
c(1781,1351,288,3301),
c(353,269,60,713))
x
## [,1] [,2] [,3] [,4]
## [1,] 122 117 19 244
## [2,] 1781 1351 288 3301
## [3,] 353 269 60 713
rs <- apply(x,1,sum)
rs
## [1] 502 6721 1395
cs <- apply(x,2,sum)
cs
## [1] 2256 1737 367 4258
n <- sum(x)
n
## [1] 8618
e <- outer(rs,cs,"*")/n
e
## [,1] [,2] [,3] [,4]
## [1,] 131.4124 101.1806 21.37781 248.0292
## [2,] 1759.4078 1354.6504 286.21571 3320.7262
## [3,] 365.1799 281.1691 59.40647 689.2446
chisq.test(x)$expected
## [,1] [,2] [,3] [,4]
## [1,] 131.4124 101.1806 21.37781 248.0292
## [2,] 1759.4078 1354.6504 286.21571 3320.7262
## [3,] 365.1799 281.1691 59.40647 689.2446
df <- (ncol(x)-1)*(nrow(x)-1)
df
## [1] 6
dim(x)
## [1] 3 4
dim(x)-1
## [1] 2 3
prod(dim(x)-1)
## [1] 6
xsq <- sum((x-e)^2/e)
xsq
## [1] 5.63817
pchisq(xsq,df,lower.tail=FALSE)
## [1] 0.4649167
lrt <- 2*sum(x*log(x/e))
lrt
## [1] 5.548169
pchisq(lrt,df,lower.tail=FALSE)
## [1] 0.475654
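The built-in function agrees; no continuity correction is applied for tables larger than 2x2.
chisq.test(x)$statistic
## X-squared
## 5.63817
chisq.test(x)$p.value
## [1] 0.4649167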
The asymptotic null distribution should be just fine, but here is Fisher's test anyway, using simulation to estimate the p-value. Plain fisher.test(x) won't work here (the exact computation runs out of workspace for counts this large); try it!
set.seed(1)
fisher.test(x,simulate.p.value=TRUE)
##
## Fisher's Exact Test for Count Data with simulated p-value (based on 2000 replicates)
##
## data: x
## p-value = 0.4943
## alternative hypothesis: two.sided
fisher.test(x,simulate.p.value=TRUE,B=10000)
##
## Fisher's Exact Test for Count Data with simulated p-value (based on 10000 replicates)
##
## data: x
## p-value = 0.4907
## alternative hypothesis: two.sided
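chisq.test offers the same Monte Carlo option via simulate.p.value; output is omitted here since the result varies from run to run, but the simulated p-value should land near the asymptotic 0.46.
chisq.test(x,simulate.p.value=TRUE,B=10000)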
Next, a 5x2 table.
x <- rbind(c(15,5),c(17,3),c(10,10),c(17,3),c(16,4))
x
## [,1] [,2]
## [1,] 15 5
## [2,] 17 3
## [3,] 10 10
## [4,] 17 3
## [5,] 16 4
e <- chisq.test(x)$expected
e
## [,1] [,2]
## [1,] 15 5
## [2,] 15 5
## [3,] 15 5
## [4,] 15 5
## [5,] 15 5
df <- chisq.test(x)$parameter
df
## df
## 4
xsq <- sum((x-e)^2/e)
xsq
## [1] 9.066667
pchisq(xsq,df,lower.tail=FALSE)
## [1] 0.05945456
lrt <- 2*sum(x*log(x/e))
lrt
## [1] 8.414912
pchisq(lrt,df,lower.tail=FALSE)
## [1] 0.07750874
fisher.test(x)
##
## Fisher's Exact Test for Count Data
##
## data: x
## p-value = 0.0874
## alternative hypothesis: two.sided
Note: the continuity correction in the built-in chi-square function applies only to 2x2 tables, so correct=FALSE makes no difference here.
chisq.test(x)
##
## Pearson's Chi-squared test
##
## data: x
## X-squared = 9.0667, df = 4, p-value = 0.05945
chisq.test(x,correct=FALSE)
##
## Pearson's Chi-squared test
##
## data: x
## X-squared = 9.0667, df = 4, p-value = 0.05945
Consider an urn with 30 balls, of which 12 are white and 18 are black.
Suppose we draw 10 balls without replacement and count the number of white balls obtained.
Simulate this.
set.seed(1)
rhyper(1,12,18,10)
## [1] 3
Simulate this 100 times.
rhyper(100,12,18,10)
## [1] 4 4 6 3 6 6 5 4 2 3 3 5 4 5 4 5 7 4 5 6 3 4 3 3 4 1 4 5 3 4 4 4 3 5 5 5 2 5 4 5 4 5 4 4 5 2 4 5 5 4 5 4
## [53] 3 2 2 3 4 5 4 6 3 4 3 4 3 4 5 2 5 3 5 3 3 4 6 5 4 5 6 4 5 4 3 5 3 5 3 3 3 3 2 4 5 5 5 4 4 5 4 4
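As a sanity check, the mean of this distribution is k*m/(m+n) = 10*12/30 = 4, and the average of a large number of simulated draws should be close to it (output of the second line omitted, as it varies with the seed).
sum(0:10*dhyper(0:10,12,18,10))
## [1] 4
mean(rhyper(10000,12,18,10))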
What is the probability of getting no white balls in the sample?
dhyper(0,12,18,10)
## [1] 0.001456415
What is the probability of getting exactly 2 white balls in the sample?
dhyper(2,12,18,10)
## [1] 0.09612337
What is the probability of getting 2 or fewer white balls in the sample?
phyper(2,12,18,10)
## [1] 0.1169986
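Equivalently, sum the point probabilities.
sum(dhyper(0:2,12,18,10))
## [1] 0.1169986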
The 2.5th and 97.5th percentiles of the number of white balls drawn.
qhyper(c(0.025, 0.975),12,18,10)
## [1] 2 6
The Hypergeometric probabilities from class.
dhyper(20,29,11,20)
## [1] 7.26533e-05
dhyper(19,29,11,20)
## [1] 0.001598373
dhyper(18,29,11,20)
## [1] 0.01380413
The table probabilities from class.
dhyper(20:9,29,11,20)
## [1] 0.0000726533 0.0015983726 0.0138041267 0.0621185702 0.1624639528 0.2599423245 0.2599423245 0.1624639528
## [9] 0.0621185702 0.0138041267 0.0015983726 0.0000726533
sum(dhyper(20:9,29,11,20))
## [1] 1
dhyper(20:18,29,11,20)
## [1] 0.0000726533 0.0015983726 0.0138041267
dhyper(11:9,29,11,20)
## [1] 0.0138041267 0.0015983726 0.0000726533
Fisher’s exact test, the pedestrian way.
sum(dhyper(20:18,29,11,20))+sum(dhyper(11:9,29,11,20))
## [1] 0.03095031
Alternatively, for a two-sided test, sum the probabilities of all tables that are no more likely than the observed one:
pobs <- dhyper(18,29,11,20)
pobs
## [1] 0.01380413
p <- dhyper(20:9,29,11,20)
p
## [1] 0.0000726533 0.0015983726 0.0138041267 0.0621185702 0.1624639528 0.2599423245 0.2599423245 0.1624639528
## [9] 0.0621185702 0.0138041267 0.0015983726 0.0000726533
p<=pobs
## [1] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
p[p<=pobs]
## [1] 0.0000726533 0.0015983726 0.0138041267 0.0138041267 0.0015983726 0.0000726533
sum(p[p<=pobs])
## [1] 0.03095031
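For comparison, the built-in function gives the same two-sided p-value on this table.
fisher.test(rbind(c(18,2),c(11,9)))$p.value
## [1] 0.03095031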
Imagine you knew that the survival probability in strain A (the second row of the table below) could not be better than the survival probability in strain B. You would then carry out a one-sided test.
x <- rbind(c(18,2),c(11,9))
x
## [,1] [,2]
## [1,] 18 2
## [2,] 11 9
fisher.test(x,alternative="greater")
##
## Fisher's Exact Test for Count Data
##
## data: x
## p-value = 0.01548
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 1.431622 Inf
## sample estimates:
## odds ratio
## 6.994073
Using the Hypergeometric distribution, we would add only those tables that are at least as extreme, i.e., that have probability no bigger than the observed table and in which the fraction of survivors in strain A is less than in strain B.
sum(dhyper(20:18,29,11,20))
## [1] 0.01547515
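Equivalently, this is the upper tail of the Hypergeometric distribution.
phyper(17,29,11,20,lower.tail=FALSE)
## [1] 0.01547515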