Statistics for Laboratory Scientists ( 140.615 )

Analysis of Variance - Model Assumptions and Diagnostics

Quantile-quantile plots

For the example, we take 100 random draws from a standard Normal distribution.

set.seed(1)
n <- 100
y <- rnorm(n)
plot(sort(y))

The expected normal quantiles. Note that other possibilities have been proposed as well, for example qnorm((1:n)/(n+1)).

p <- ((1:n)-0.5)/n
p
##   [1] 0.005 0.015 0.025 0.035 0.045 0.055 0.065 0.075 0.085 0.095 0.105 0.115 0.125 0.135 0.145 0.155 0.165
##  [18] 0.175 0.185 0.195 0.205 0.215 0.225 0.235 0.245 0.255 0.265 0.275 0.285 0.295 0.305 0.315 0.325 0.335
##  [35] 0.345 0.355 0.365 0.375 0.385 0.395 0.405 0.415 0.425 0.435 0.445 0.455 0.465 0.475 0.485 0.495 0.505
##  [52] 0.515 0.525 0.535 0.545 0.555 0.565 0.575 0.585 0.595 0.605 0.615 0.625 0.635 0.645 0.655 0.665 0.675
##  [69] 0.685 0.695 0.705 0.715 0.725 0.735 0.745 0.755 0.765 0.775 0.785 0.795 0.805 0.815 0.825 0.835 0.845
##  [86] 0.855 0.865 0.875 0.885 0.895 0.905 0.915 0.925 0.935 0.945 0.955 0.965 0.975 0.985 0.995
x <- qnorm(p) 
x
##   [1] -2.57582930 -2.17009038 -1.95996398 -1.81191067 -1.69539771 -1.59819314 -1.51410189 -1.43953147
##   [9] -1.37220381 -1.31057911 -1.25356544 -1.20035886 -1.15034938 -1.10306256 -1.05812162 -1.01522203
##  [17] -0.97411388 -0.93458929 -0.89647336 -0.85961736 -0.82389363 -0.78919165 -0.75541503 -0.72247905
##  [25] -0.69030882 -0.65883769 -0.62800601 -0.59776013 -0.56805150 -0.53883603 -0.51007346 -0.48172685
##  [33] -0.45376219 -0.42614801 -0.39885507 -0.37185609 -0.34512553 -0.31863936 -0.29237490 -0.26631061
##  [41] -0.24042603 -0.21470157 -0.18911843 -0.16365849 -0.13830421 -0.11303854 -0.08784484 -0.06270678
##  [49] -0.03760829 -0.01253347  0.01253347  0.03760829  0.06270678  0.08784484  0.11303854  0.13830421
##  [57]  0.16365849  0.18911843  0.21470157  0.24042603  0.26631061  0.29237490  0.31863936  0.34512553
##  [65]  0.37185609  0.39885507  0.42614801  0.45376219  0.48172685  0.51007346  0.53883603  0.56805150
##  [73]  0.59776013  0.62800601  0.65883769  0.69030882  0.72247905  0.75541503  0.78919165  0.82389363
##  [81]  0.85961736  0.89647336  0.93458929  0.97411388  1.01522203  1.05812162  1.10306256  1.15034938
##  [89]  1.20035886  1.25356544  1.31057911  1.37220381  1.43953147  1.51410189  1.59819314  1.69539771
##  [97]  1.81191067  1.95996398  2.17009038  2.57582930
plot(x, p)
abline(h=c(0,1))
axis(4, p, rep("",n))

The qq-plot ‘from scratch’, and the built-in function.

par(mfrow=c(1,2))
plot(x, sort(y), main="from scratch")
qqnorm(y)

Example from class: IL10 cytokines

Read the data, and include log10 of IL10 as a column.

library(SPH.140.615)
head(il10)
##   Strain     IL10
## 1      7  985.425
## 2      7 1057.254
## 3      7  989.480
## 4      7  816.093
## 5      7 1410.767
## 6     25 1159.141
il10 <- cbind(il10, logIL10=log10(il10$IL10))
head(il10)
##   Strain     IL10  logIL10
## 1      7  985.425 2.993624
## 2      7 1057.254 3.024179
## 3      7  989.480 2.995407
## 4      7  816.093 2.911740
## 5      7 1410.767 3.149455
## 6     25 1159.141 3.064136

Plot the data.

par(las=1, mfrow=c(1,2))
stripchart(il10$IL10 ~ il10$Strain, 
           method="jitter", pch=1, ylab="Strain", xlab="IL10 response", jitter=0.2)
abline(h=1:21, lty=2, col="gray")
stripchart(il10$logIL10 ~ il10$Strain, 
           method="jitter", pch=1, ylab="Strain", xlab="log10 IL10 response", jitter=0.2)
abline(h=1:21, lty=2, col="gray")

The analysis of variance tables.

aov.il10 <- aov(IL10 ~ Strain, data=il10)
anova(aov.il10)
## Analysis of Variance Table
## 
## Response: IL10
##            Df    Sum Sq Mean Sq F value  Pr(>F)  
## Strain     20  33757424 1687871  1.7001 0.04154 *
## Residuals 125 124099169  992793                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
aov.logil10 <- aov(logIL10 ~ Strain, data=il10)
anova(aov.logil10)
## Analysis of Variance Table
## 
## Response: logIL10
##            Df Sum Sq  Mean Sq F value   Pr(>F)   
## Strain     20 3.3476 0.167380  2.2528 0.003575 **
## Residuals 125 9.2871 0.074297                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Plot the residuals. Note that you can get the residuals and fitted values from the output of the ‘aov’ function.

attributes(aov.il10)
## $names
##  [1] "coefficients"  "residuals"     "effects"       "rank"          "fitted.values" "assign"       
##  [7] "qr"            "df.residual"   "contrasts"     "xlevels"       "call"          "terms"        
## [13] "model"        
## 
## $class
## [1] "aov" "lm"
par(las=1, mfrow=c(1,2)) 
stripchart(aov.il10$residuals ~ il10$Strain, 
           method="jitter", pch=1, ylab="Strain", xlab="residuals (IL10)", jitter=0.2)
abline(h=1:21, lty=2, col="gray")
abline(v=0, lty=2, col="red")
stripchart(aov.logil10$residuals ~ il10$Strain, 
           method="jitter", pch=1, ylab="Strain", xlab="log10 residuals (IL10)", jitter=0.2)
abline(h=1:21, lty=2, col="gray")
abline(v=0, lty=2, col="red")

The qq-plots of all residuals, with histograms Note: ‘mfcol’ versus ‘mfrow’ leads you to fill up the different plots working down columns rather than across rows, and ‘qqline’ adds a line to the ‘qqnorm’ plot.

par(las=1, mfcol=c(2,2))  
hist(aov.il10$residuals, breaks=30, yaxt="n", ylab="", xlab="Residuals", main="IL10")
qqnorm(aov.il10$residuals, main="IL10")
qqline(aov.il10$residuals, col="blue", lty=2, lwd=1)
hist(aov.logil10$residuals, breaks=30, yaxt="n", ylab="", xlab="Residuals", main="log10 IL10")
qqnorm(aov.logil10$residuals, main="log10 IL10")
qqline(aov.logil10$residuals, col="blue", lty=2, lwd=1)

Plot the residuals versus the fitted values.

par(las=1, mfrow=c(1,2))
plot(aov.il10$fitted, aov.il10$residuals, 
     pch=1, xlab="fitted values (IL10)",ylab="residuals (IL10)")
abline(h=0, lty=2, col="red")
plot(aov.logil10$fitted, aov.logil10$residuals, 
     pch=1, xlab="fitted values (log10 IL10)",ylab="residuals (log10 IL10)")
abline(h=0, lty=2, col="red")

Plot the standard deviations versus the means.

m <- tapply(il10$IL10, il10$Strain, mean)
s <- tapply(il10$IL10, il10$Strain, sd)  
logm <- tapply(il10$logIL10, il10$Strain, mean)
logs <- tapply(il10$logIL10, il10$Strain, sd)

par(las=1, mfrow=c(1,2))
plot(m, s, pch=1, xlab="Mean (IL10)", ylab="SD (IL10)")
plot(logm, logs, pch=1, xlab="Mean (log10 IL10)", ylab="SD (log10 IL10)")

Bartlett’s test for equality of variances

The simple way, using the built-in function.

bartlett.test(il10$IL10 ~ il10$Strain) 
## 
##  Bartlett test of homogeneity of variances
## 
## data:  il10$IL10 by il10$Strain
## Bartlett's K-squared = 74.842, df = 20, p-value = 2.895e-08
bartlett.test(il10$logIL10 ~ il10$Strain) 
## 
##  Bartlett test of homogeneity of variances
## 
## data:  il10$logIL10 by il10$Strain
## Bartlett's K-squared = 31.842, df = 20, p-value = 0.04501