Statistics for Laboratory Scientists ( 140.615 )

Analysis of Variance

We are starting to use functions and data from the SPH.140.615 GitHub package. Uncomment and execute the line calling the function install.packages if you have not yet installed devtools.

# install.packages("devtools")
library(devtools,quiet=TRUE)
devtools::install_github("bllfrg/SPH.140.615",quiet=TRUE)
library(SPH.140.615)

Example from class: test for equality of variances

The data.

xa <- c(672, 747, 749, 792, 875, 888, 930, 962, 994,1295)
xb <- c(290, 359, 384, 466, 510, 516, 522, 532, 595, 706)

Plot the data, using the custom function dot.plot from the SPH.140.615 package.

dot.plot(xa,xb, includeCI=FALSE)

Plot the data, using the stripchart function. Look at the help file to explore what all the arguments do.

par(las=1)
stripchart(list(xa,xb), method="jitter", pch=1, vertical=TRUE, xlim=c(0.5,2.5), group.names=c("A","B"))
abline(v=1:2,lty=2)

The 95% confidence interval for the ratio of the population variances, the pedestrian way.

ratio <- var(xa)/var(xb)
ratio
## [1] 2.14067
na <- length(xa)
na
## [1] 10
nb <- length(xb)
nb
## [1] 10
L <- qf(0.025,na-1,nb-1)
L
## [1] 0.2483859
U <- qf(0.975,na-1,nb-1)
U
## [1] 4.025994
c(ratio/U,ratio/L)
## [1] 0.5317123 8.6183268

The 95% confidence interval for the ratio of the population standard deviations.

sqrt(c(ratio/U,ratio/L))
## [1] 0.729186 2.935699

The built-in function.

var.test(xa,xb)
## 
##  F test to compare two variances
## 
## data:  xa and xb
## F = 2.1407, num df = 9, denom df = 9, p-value = 0.2723
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.5317123 8.6183268
## sample estimates:
## ratio of variances 
##            2.14067
attributes(var.test(xa,xb))
## $names
## [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"    "null.value"  "alternative"
## [8] "method"      "data.name"  
## 
## $class
## [1] "htest"
var.test(xa,xb)$estimate
## ratio of variances 
##            2.14067
var.test(xa,xb)$p.value
## [1] 0.2722659
var.test(xa,xb)$conf.int
## [1] 0.5317123 8.6183268
## attr(,"conf.level")
## [1] 0.95
sqrt(var.test(xa,xb)$conf.int)
## [1] 0.729186 2.935699
## attr(,"conf.level")
## [1] 0.95

Example from class: diets and blood coagulation times

coag <- c(62, 60, 63, 59,
          63, 67, 71, 64, 65, 66,
          68, 66, 71, 67, 68, 68,
          56, 62, 60, 61, 63, 64, 63, 59)
ttt <-  c("A","A","A","A",
          "B","B","B","B","B","B",
          "C","C","C","C","C","C",
          "D","D","D","D","D","D","D","D")
ttt
##  [1] "A" "A" "A" "A" "B" "B" "B" "B" "B" "B" "C" "C" "C" "C" "C" "C" "D" "D" "D" "D" "D" "D" "D" "D"
LETTERS[1:4]
## [1] "A" "B" "C" "D"
ttt <- rep(LETTERS[1:4],c(4,6,6,8))
ttt
##  [1] "A" "A" "A" "A" "B" "B" "B" "B" "B" "B" "C" "C" "C" "C" "C" "C" "D" "D" "D" "D" "D" "D" "D" "D"

Make the treatment “ttt” a factor.

str(ttt)
##  chr [1:24] "A" "A" "A" "A" "B" "B" "B" "B" "B" "B" "C" "C" "C" "C" "C" "C" "D" "D" "D" "D" "D" "D" "D" ...
ttt <- factor(ttt,levels=c("A","B","C","D"))
str(ttt)
##  Factor w/ 4 levels "A","B","C","D": 1 1 1 1 2 2 2 2 2 2 ...

Plot the data. In the stripchart function you want to jitter the data points horizontally, vertically, or stack them, since there are identical coagulation times recorded in some of the groups.

set.seed(1)
par(las=1) 
stripchart(coag~ttt, method="jitter", pch=1, vertical=TRUE, xlab="diet", ylab="coagulation time [ sec ]") 

stripchart(jitter(coag,factor=2)~ttt, pch=1, vertical=TRUE, xlab="diet", ylab="coagulation time [ sec ]")  

stripchart(coag~ttt, method="stack", pch=1, vertical=TRUE, xlab="diet", ylab="coagulation time [ sec ]") 

Note: the function tapply splits the first vector into groups according to the values in the second vector, and then “applies” the function, here ‘mean’, to each group.

The within-group means.

tapply(coag,ttt,mean)
##  A  B  C  D 
## 61 66 68 61

The number of data points per group.

tapply(coag,ttt,length)
## A B C D 
## 4 6 6 8

One-way ANOVA, the pedestrain way.

k <- 4
m <- tapply(coag,ttt,mean)
m
##  A  B  C  D 
## 61 66 68 61
gm <- mean(coag)
gm
## [1] 64
n <- tapply(coag,ttt,length)
n
## A B C D 
## 4 6 6 8
N <- sum(n)
N
## [1] 24
m-gm
##  A  B  C  D 
## -3  2  4 -3
SB <- sum(n*(m-gm)^2)
SB
## [1] 228
coag
##  [1] 62 60 63 59 63 67 71 64 65 66 68 66 71 67 68 68 56 62 60 61 63 64 63 59
rep(m,n)
##  A  A  A  A  B  B  B  B  B  B  C  C  C  C  C  C  D  D  D  D  D  D  D  D 
## 61 61 61 61 66 66 66 66 66 66 68 68 68 68 68 68 61 61 61 61 61 61 61 61
coag-rep(m,n)
##  A  A  A  A  B  B  B  B  B  B  C  C  C  C  C  C  D  D  D  D  D  D  D  D 
##  1 -1  2 -2 -3  1  5 -2 -1  0  0 -2  3 -1  0  0 -5  1 -1  0  2  3  2 -2
SW <- sum((coag-rep(m,n))^2)
SW
## [1] 112
MB <- SB/(k-1)
MB
## [1] 76
MW <- SW/(N-k)
MW
## [1] 5.6
Fstat <- MB/MW
Fstat
## [1] 13.57143
p <- pf(Fstat,k-1,N-k,lower=FALSE)
p
## [1] 4.658471e-05

The built-in function.

aov(coag~ttt)
## Call:
##    aov(formula = coag ~ ttt)
## 
## Terms:
##                 ttt Residuals
## Sum of Squares  228       112
## Deg. of Freedom   3        20
## 
## Residual standard error: 2.366432
## Estimated effects may be unbalanced
summary(aov(coag~ttt)) 
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## ttt          3    228    76.0   13.57 4.66e-05 ***
## Residuals   20    112     5.6                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(aov(coag~ttt))
## Analysis of Variance Table
## 
## Response: coag
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## ttt        3    228    76.0  13.571 4.658e-05 ***
## Residuals 20    112     5.6                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Example from class: meioses in females and males

The meiosis data used in this lecture are available in the SPH.140.615 R package you installed and loaded above.

Reference: Broman et al. (1998) Am J Hum Genet 63:861-869.

The data have three columns: family, female, male. These last two give the total number of crossovers on the 22 autosomes in each of the male and female meioses.

head(meioses)
##   family female male
## 1    102     28   24
## 2    102     50   22
## 3    102     32   26
## 4    102     35   14
## 5    102     39   20
## 6    102     44   21
str(meioses)
## 'data.frame':    92 obs. of  3 variables:
##  $ family: int  102 102 102 102 102 102 102 102 102 102 ...
##  $ female: int  28 50 32 35 39 44 33 40 28 37 ...
##  $ male  : int  24 22 26 14 20 21 21 22 30 20 ...

We need to make the family column a “factor” to get the ANOVA to work properly.

meioses$family <- as.factor(meioses$family)
str(meioses)
## 'data.frame':    92 obs. of  3 variables:
##  $ family: Factor w/ 8 levels "102","884","1331",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ female: int  28 50 32 35 39 44 33 40 28 37 ...
##  $ male  : int  24 22 26 14 20 21 21 22 30 20 ...

Plot the data for the female meioses, get the within-group means, and add the means to the plot.

par(las=1) 
stripchart(meioses$female~meioses$family,method="jitter",pch=1)
female.me <- tapply(meioses$female,meioses$family,mean)
segments(female.me,(1:8)-0.25,female.me,(1:8)+0.25,lwd=2,col="blue")

Same for the male counts.

par(las=1)
stripchart(meioses$male~meioses$family,method="jitter",pch=1)
male.me <- tapply(meioses$male,meioses$family,mean)
segments(male.me,(1:8)-0.25,male.me,(1:8)+0.25,lwd=2,col="blue")

The ANOVA tables.

out.fem <- aov(female~family,data=meioses)
summary(out.fem)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## family       7   1485  212.15   4.601 0.000216 ***
## Residuals   84   3873   46.11                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
out.mal <- aov(male~family,data=meioses)
summary(out.mal)
##             Df Sum Sq Mean Sq F value Pr(>F)
## family       7    114   16.28    1.23  0.296
## Residuals   84   1112   13.24

Another example

xA <- c(47,48,51,54)
xB <- c(39,44,38,39)
xC <- c(47,49,41,43)
x <- c(xA,xB,xC)
treat <- rep(1:3,rep(4,3))
x
##  [1] 47 48 51 54 39 44 38 39 47 49 41 43
treat
##  [1] 1 1 1 1 2 2 2 2 3 3 3 3
ex <- data.frame(rsp=x,trt=treat)
ex
##    rsp trt
## 1   47   1
## 2   48   1
## 3   51   1
## 4   54   1
## 5   39   2
## 6   44   2
## 7   38   2
## 8   39   2
## 9   47   3
## 10  49   3
## 11  41   3
## 12  43   3
par(las=1)
stripchart(ex$rsp~ex$trt, method="jitter", pch=1, vertical=TRUE)

stripchart(jitter(ex$rsp,factor=2)~ex$trt, pch=1, vertical=TRUE)

The following is wrong.

aov.out <- aov(rsp~trt,data=ex)
summary(aov.out)
##             Df Sum Sq Mean Sq F value Pr(>F)
## trt          1     50    50.0   2.066  0.181
## Residuals   10    242    24.2

The correct way.

treat <- factor(rep(1:3,rep(4,3)))
ex.c <- data.frame(rsp=x,trt=treat)
aov.out.c <- aov(rsp~trt,data=ex.c)
summary(aov.out.c)
##             Df Sum Sq Mean Sq F value  Pr(>F)   
## trt          2    200  100.00   9.783 0.00553 **
## Residuals    9     92   10.22                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Compare.

summary(aov.out)
##             Df Sum Sq Mean Sq F value Pr(>F)
## trt          1     50    50.0   2.066  0.181
## Residuals   10    242    24.2
summary(aov.out.c)
##             Df Sum Sq Mean Sq F value  Pr(>F)   
## trt          2    200  100.00   9.783 0.00553 **
## Residuals    9     92   10.22                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
str(ex)
## 'data.frame':    12 obs. of  2 variables:
##  $ rsp: num  47 48 51 54 39 44 38 39 47 49 ...
##  $ trt: int  1 1 1 1 2 2 2 2 3 3 ...
str(ex.c)
## 'data.frame':    12 obs. of  2 variables:
##  $ rsp: num  47 48 51 54 39 44 38 39 47 49 ...
##  $ trt: Factor w/ 3 levels "1","2","3": 1 1 1 1 2 2 2 2 3 3 ...
summary(ex)
##       rsp             trt   
##  Min.   :38.00   Min.   :1  
##  1st Qu.:40.50   1st Qu.:1  
##  Median :45.50   Median :2  
##  Mean   :45.00   Mean   :2  
##  3rd Qu.:48.25   3rd Qu.:3  
##  Max.   :54.00   Max.   :3
summary(ex.c)
##       rsp        trt  
##  Min.   :38.00   1:4  
##  1st Qu.:40.50   2:4  
##  Median :45.50   3:4  
##  Mean   :45.00        
##  3rd Qu.:48.25        
##  Max.   :54.00