Here are the data.
h2o2 <- c(0,0,0, 10,10,10, 25,25,25, 50,50,50)
pf3d7 <- c(0.3399,0.3563,0.3538, 0.3168,0.3054,0.3174, 0.2460,0.2618,0.2848, 0.1535,0.1613,0.1525)
pyoel <- c(0.3332,0.3414,0.3299, 0.2940,0.2948,0.2903, 0.2089,0.2189,0.2102, 0.1006,0.1031,0.1452)
mydat <- data.frame(y = c(pf3d7,pyoel), x1 = rep(h2o2,2), x2 = rep(0:1,rep(length(h2o2),2)))
mydat
## y x1 x2
## 1 0.3399 0 0
## 2 0.3563 0 0
## 3 0.3538 0 0
## 4 0.3168 10 0
## 5 0.3054 10 0
## 6 0.3174 10 0
## 7 0.2460 25 0
## 8 0.2618 25 0
## 9 0.2848 25 0
## 10 0.1535 50 0
## 11 0.1613 50 0
## 12 0.1525 50 0
## 13 0.3332 0 1
## 14 0.3414 0 1
## 15 0.3299 0 1
## 16 0.2940 10 1
## 17 0.2948 10 1
## 18 0.2903 10 1
## 19 0.2089 25 1
## 20 0.2189 25 1
## 21 0.2102 25 1
## 22 0.1006 50 1
## 23 0.1031 50 1
## 24 0.1452 50 1
Fit two separate linear regressions.
lm.outA <- lm(y ~ x1, data=mydat, subset=(x2==0))
lm.outB <- lm(y ~ x1, data=mydat, subset=(x2==1))
summary(lm.outA)
##
## Call:
## lm(formula = y ~ x1, data = mydat, subset = (x2 == 0))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.013150 -0.007486 0.001275 0.003107 0.028525
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3530501 0.0049955 70.67 7.84e-15 ***
## x1 -0.0038710 0.0001759 -22.00 8.43e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01148 on 10 degrees of freedom
## Multiple R-squared: 0.9798, Adjusted R-squared: 0.9777
## F-statistic: 484.1 on 1 and 10 DF, p-value: 8.426e-10
summary(lm.outB)
##
## Call:
## lm(formula = y ~ x1, data = mydat, subset = (x2 == 1))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.013734 -0.009662 -0.001581 0.005268 0.033063
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.333131 0.005849 56.96 6.75e-14 ***
## x1 -0.004420 0.000206 -21.46 1.08e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01344 on 10 degrees of freedom
## Multiple R-squared: 0.9787, Adjusted R-squared: 0.9766
## F-statistic: 460.4 on 1 and 10 DF, p-value: 1.078e-09
Plot the data, and add the two regression lines.
par(las=1)
plot(y ~ x1, data=mydat, type="n", xlab="H2O2 concentration", ylab="OD")
points(y ~ x1, data=mydat, subset=(x2==0), col="blue", lwd=2)
points(y ~ x1, data=mydat, subset=(x2==1), col="green", lwd=2)
abline(lm.outA$coef, col="blue", lty=2, lwd=2)
abline(lm.outB$coef, col="green", lty=2, lwd=2)
Fit the full model with the heme species / concentration interaction.
lm.out <- lm(y ~ x1 * x2, data=mydat)
summary(lm.out)
##
## Call:
## lm(formula = y ~ x1 * x2, data = mydat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.013734 -0.008964 0.000410 0.003704 0.033063
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3530501 0.0054389 64.912 < 2e-16 ***
## x1 -0.0038710 0.0001915 -20.209 8.86e-15 ***
## x2 -0.0199192 0.0076918 -2.590 0.0175 *
## x1:x2 -0.0005489 0.0002709 -2.026 0.0563 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0125 on 20 degrees of freedom
## Multiple R-squared: 0.98, Adjusted R-squared: 0.977
## F-statistic: 326.4 on 3 and 20 DF, p-value: < 2.2e-16
Fit a reduced model, assuming that the two hemes have the same line.
lm.red <- lm(y ~ x1, data=mydat)
summary(lm.red)
##
## Call:
## lm(formula = y ~ x1, data = mydat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.035219 -0.011800 0.001037 0.015314 0.045345
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3430905 0.0064787 52.96 < 2e-16 ***
## x1 -0.0041454 0.0002282 -18.17 9.82e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02105 on 22 degrees of freedom
## Multiple R-squared: 0.9375, Adjusted R-squared: 0.9347
## F-statistic: 330.1 on 1 and 22 DF, p-value: 9.818e-15
Compare the full and reduced models.
anova(lm.red, lm.out)
## Analysis of Variance Table
##
## Model 1: y ~ x1
## Model 2: y ~ x1 * x2
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 22 0.0097495
## 2 20 0.0031233 2 0.0066262 21.215 1.138e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Fit a reduced model, assuming that the two hemes have the same slope.
lm.red <- lm(y ~ x1 + x2, data=mydat)
summary(lm.red)
##
## Call:
## lm(formula = y ~ x1 + x2, data = mydat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0194271 -0.0099417 0.0004309 0.0069542 0.0295537
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3588821 0.0049445 72.583 < 2e-16 ***
## x1 -0.0041454 0.0001451 -28.566 < 2e-16 ***
## x2 -0.0315833 0.0054660 -5.778 9.79e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01339 on 21 degrees of freedom
## Multiple R-squared: 0.9759, Adjusted R-squared: 0.9736
## F-statistic: 424.7 on 2 and 21 DF, p-value: < 2.2e-16
Compare the full and reduced models.
anova(lm.red, lm.out)
## Analysis of Variance Table
##
## Model 1: y ~ x1 + x2
## Model 2: y ~ x1 * x2
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 21 0.0037645
## 2 20 0.0031233 1 0.00064118 4.1058 0.05628 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Note: the p-value is the same for the interaction term in the full model.
The model parameters.
params <- lm.red$coef
params
## (Intercept) x1 x2
## 0.358882122 -0.004145433 -0.031583333
The intercept and slope of the first heme.
params[1:2]
## (Intercept) x1
## 0.358882122 -0.004145433
The intercept of the second heme.
params[1]+params[3]
## (Intercept)
## 0.3272988
The intercept and slope of the second heme.
c(params[1]+params[3], params[2])
## (Intercept) x1
## 0.327298789 -0.004145433
Plot the data, and add the two regression lines.
par(las=1)
plot(y ~ x1, data=mydat, type="n", xlab="H2O2 concentration", ylab="OD")
points(y ~ x1, data=mydat, subset=(x2==0), col="blue", lwd=2)
points(y ~ x1, data=mydat, subset=(x2==1), col="green", lwd=2)
abline(params[1:2], col="blue", lty=2, lwd=2)
abline(c(params[1]+params[3], params[2]), col="green", lty=2, lwd=2)
There are two data sets in the package SPH.140.615, with blood pressure in mice as outcome, and two factors (two levels each) as predictors: water (plan and salted) and diet (normal and high fat). Read the data, and convert the predictors to factors.
library(SPH.140.615)
summary(dat.bp1)
## bp water diet
## Min. : 78.90 plain:40 normal :40
## 1st Qu.: 95.47 salt :40 highfat:40
## Median :102.95
## Mean :103.03
## 3rd Qu.:110.00
## Max. :127.00
summary(dat.bp2)
## bp water diet
## Min. : 78.40 plain:40 normal :40
## 1st Qu.: 94.88 salt :40 highfat:40
## Median :102.65
## Mean :105.15
## 3rd Qu.:115.12
## Max. :135.20
Make boxplots of the data.
par(mfrow=c(1,2), las=2, mar=c(8,5,4,2))
boxplot(split(dat.bp1$bp, list(dat.bp1$diet,dat.bp1$water)), ylab="blood pressure")
boxplot(split(dat.bp2$bp, list(dat.bp2$diet,dat.bp2$water)), ylab="blood pressure")
In both data sets, it looks like blood pressure is higher on average in the high fat diet arm, and it looks like blood pressure is also higher on average in the salted water arm. It also looks like there is a positive interaction between the two factors in the second data set. Let’s see what the linear models say.
lm.fit <- lm(bp ~ water * diet, data=dat.bp1)
summary(lm.fit)
##
## Call:
## lm(formula = bp ~ water * diet, data = dat.bp1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.0500 -2.5500 -0.0825 2.7638 11.4850
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90.950 1.023 88.879 < 2e-16 ***
## watersalt 9.745 1.447 6.734 2.77e-09 ***
## diethighfat 14.010 1.447 9.681 6.69e-15 ***
## watersalt:diethighfat 0.810 2.047 0.396 0.693
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.576 on 76 degrees of freedom
## Multiple R-squared: 0.7962, Adjusted R-squared: 0.7882
## F-statistic: 98.99 on 3 and 76 DF, p-value: < 2.2e-16
For the first data set, the interaction is not significant, so let’s re-fit the linear model without it.
lm.fit <- lm(bp ~ water + diet, data=dat.bp1)
summary(lm.fit)
##
## Call:
## lm(formula = bp ~ water + diet, data = dat.bp1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.848 -2.596 0.120 2.816 11.688
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90.7475 0.8813 102.966 < 2e-16 ***
## watersalt 10.1500 1.0177 9.974 1.62e-15 ***
## diethighfat 14.4150 1.0177 14.165 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.551 on 77 degrees of freedom
## Multiple R-squared: 0.7958, Adjusted R-squared: 0.7905
## F-statistic: 150.1 on 2 and 77 DF, p-value: < 2.2e-16
Both factors are highly significant! Here is the interpretation of the parameter estimates.
params <- summary(lm.fit)$coef[,1]
params
## (Intercept) watersalt diethighfat
## 90.7475 10.1500 14.4150
Comparing two mice who receive the same diet, one drinking salted water, one drinking plain water, we expect the blood pressure to be
params[2]
## watersalt
## 10.15
units higher for the mouse drinking salted water.
Comparing two mice who drink the same type of water, one eating the high fat and the other the normal diet, we expect the blood pressure to be
params[3]
## diethighfat
## 14.415
units higher for the mouse eating the high fat diet.
lm.fit <- lm(bp ~ water * diet, data=dat.bp2)
summary(lm.fit)
##
## Call:
## lm(formula = bp ~ water * diet, data = dat.bp2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.570 -3.645 -0.520 3.947 11.055
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90.970 1.280 71.094 < 2e-16 ***
## watersalt 9.515 1.810 5.258 1.30e-06 ***
## diethighfat 14.030 1.810 7.753 3.26e-11 ***
## watersalt:diethighfat 9.630 2.559 3.763 0.000328 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.722 on 76 degrees of freedom
## Multiple R-squared: 0.8243, Adjusted R-squared: 0.8173
## F-statistic: 118.8 on 3 and 76 DF, p-value: < 2.2e-16
For the second data set, the interaction is significant, which makes the interpretation of the parameter estimates a bit trickier.
params <- summary(lm.fit)$coef[,1]
params
## (Intercept) watersalt diethighfat watersalt:diethighfat
## 90.970 9.515 14.030 9.630
Comparing two mice who receive the normal diet, one drinking salted water, one drinking plain water, we expect the blood pressure to be
params[2]
## watersalt
## 9.515
units higher for the mouse drinking salted water.
Comparing two mice who receive the high fat diet, one drinking salted water, one drinking plain water, we expect the blood pressure to be
params[2]+params[4]
## watersalt
## 19.145
units higher for the mouse drinking salted water.
Comparing two mice who drink plain water, one eating the high fat diet, one eating the normal diet, we expect the blood pressure to be
params[3]
## diethighfat
## 14.03
units higher for the mouse eating the high fat diet.
Comparing two mice who drink salted water, one eating the high fat diet, one eating the normal diet, we expect the blood pressure to be
params[3]+params[4]
## diethighfat
## 23.66
units higher for the mouse eating the high fat diet.
Data set 1, as a 2-way ANOVA with interaction.
aov.fit <- aov(bp ~ water * diet, data=dat.bp1)
summary(aov.fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## water 1 2060 2060 98.385 2.36e-15 ***
## diet 1 4156 4156 198.438 < 2e-16 ***
## water:diet 1 3 3 0.157 0.693
## Residuals 76 1592 21
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Data set 1, as a 2-way ANOVA without interaction.
aov.fit <- aov(bp ~ water + diet, data=dat.bp1)
summary(aov.fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## water 1 2060 2060 99.47 1.62e-15 ***
## diet 1 4156 4156 200.63 < 2e-16 ***
## Residuals 77 1595 21
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Data set 2, as a 2-way ANOVA with interaction.
aov.fit <- aov(bp ~ water * diet, data=dat.bp2)
summary(aov.fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## water 1 4107 4107 125.42 < 2e-16 ***
## diet 1 7103 7103 216.90 < 2e-16 ***
## water:diet 1 464 464 14.16 0.000328 ***
## Residuals 76 2489 33
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The data.
flies <- data.frame(rsp = c( 9.6, 9.3, 9.3,
10.6, 9.1, 9.2,
9.8, 9.3, 9.5,
10.7, 9.1,10.0,
11.1,11.1,10.4,
10.9,11.8,10.8,
12.8,10.6,10.7),
strain = factor(rep(c("OL","BELL","bwb"),7)),
density = factor(rep(c(60,80,160,320,640,1280,2560),rep(3,7))))
summary(flies)
## rsp strain density
## Min. : 9.10 BELL:7 60 :3
## 1st Qu.: 9.30 bwb :7 80 :3
## Median :10.40 OL :7 160 :3
## Mean :10.27 320 :3
## 3rd Qu.:10.80 640 :3
## Max. :12.80 1280:3
## 2560:3
Plot the data.
par(las=1)
interaction.plot(flies$density, flies$strain, flies$rsp, lwd=2, col=c("blue","red","green3"), lty=1, type="b",
xlab="density", ylab="response")
We don’t have replicates in the cells, so for the two-way ANOVA we had to assume there is no interaction (otherwise we didn’t have any degrees of freedom to fit an error).
flies.aov <- aov(rsp ~ strain + density, data=flies)
anova(flies.aov)
## Analysis of Variance Table
##
## Response: rsp
## Df Sum Sq Mean Sq F value Pr(>F)
## strain 2 2.7886 1.39429 4.0695 0.044757 *
## density 6 12.5429 2.09048 6.1015 0.003945 **
## Residuals 12 4.1114 0.34262
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
In such an analysis, the levels of a factor do not have any particular ordering. We simply ask whether there are differences in means between the levels. However, here it looks like there is an ordering between the levels of the factor ‘density’, higher densities seem to lead to a longer development period. So maybe we should look at the densities as numeric quantities rather than levels of an (unordered) factor. The strain is still a factor, and our primary question might still be about differences between strains. Such an analysis, with a factor and a numeric predictor, is called an Analysis of Covariance (ANCOVA). So let’s record the densities as numeric quantities instead.
flies <- data.frame(rsp = c( 9.6, 9.3, 9.3,
10.6, 9.1, 9.2,
9.8, 9.3, 9.5,
10.7, 9.1,10.0,
11.1,11.1,10.4,
10.9,11.8,10.8,
12.8,10.6,10.7),
strain = factor(rep(c("OL","BELL","bwb"),7)),
density = rep(c(60,80,160,320,640,1280,2560),rep(3,7)))
summary(flies)
## rsp strain density
## Min. : 9.10 BELL:7 Min. : 60.0
## 1st Qu.: 9.30 bwb :7 1st Qu.: 80.0
## Median :10.40 OL :7 Median : 320.0
## Mean :10.27 Mean : 728.6
## 3rd Qu.:10.80 3rd Qu.:1280.0
## Max. :12.80 Max. :2560.0
Plot the data.
plot(flies$density, flies$rsp, type="n", xlab="density", ylab="response")
tmp <- subset(flies, strain=="OL")
lines(tmp$density, tmp$rsp, col="green", type="b")
tmp <- subset(flies, strain=="BELL")
lines(tmp$density, tmp$rsp, col="blue", type="b")
tmp <- subset(flies, strain=="bwb")
lines(tmp$density, tmp$rsp, col="red", type="b")
The relationship between density and the response does not look linear. Let’s try a logarithmic transformation of the densities. Note that the density levels typically double in the number of flies, so a logarithmic transformation makes sense.
plot(flies$density, flies$rsp,type="n", log="x")
tmp <- subset(flies, strain=="OL")
lines(tmp$density, tmp$rsp, col="green", type="b")
tmp <- subset(flies, strain=="BELL")
lines(tmp$density, tmp$rsp, col="blue", type="b")
tmp <- subset(flies, strain=="bwb")
lines(tmp$density, tmp$rsp, col="red", type="b")
That looks better. Let’s fit the linear model with factor ‘strain’ and log2 ‘density’ as a numeric predictor.
ancova.fit <- lm(rsp ~ strain + log2(density), data=flies)
summary(ancova.fit)
##
## Call:
## lm(formula = rsp ~ strain + log2(density), data = flies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.90985 -0.31482 -0.04773 0.24417 1.00009
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.72243 0.56995 11.795 1.31e-09 ***
## strainbwb -0.05714 0.29139 -0.196 0.8469
## strainOL 0.74286 0.29139 2.549 0.0207 *
## log2(density) 0.39503 0.06322 6.248 8.83e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5451 on 17 degrees of freedom
## Multiple R-squared: 0.7402, Adjusted R-squared: 0.6943
## F-statistic: 16.14 on 3 and 17 DF, p-value: 3.184e-05
We are testing if the factor ‘strain’ is significant.
ancova.fit.red <- lm(rsp ~ log2(density), data=flies)
summary(ancova.fit.red)
##
## Call:
## lm(formula = rsp ~ log2(density), data = flies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1384 -0.3434 -0.1285 0.4616 1.3765
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.9510 0.6417 10.832 1.43e-09 ***
## log2(density) 0.3950 0.0745 5.302 4.06e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6424 on 19 degrees of freedom
## Multiple R-squared: 0.5967, Adjusted R-squared: 0.5755
## F-statistic: 28.12 on 1 and 19 DF, p-value: 4.064e-05
anova(ancova.fit.red, ancova.fit)
## Analysis of Variance Table
##
## Model 1: rsp ~ log2(density)
## Model 2: rsp ~ strain + log2(density)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 19 7.8405
## 2 17 5.0519 2 2.7886 4.6918 0.02385 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The full linear model indicates a significant difference between BELL (baseline) and OL, but not between BELL and bwb. Let’s fit a model with strain being OL versus BELL/bwb.
lm.fit <- lm(rsp ~ ifelse(strain=="OL",1,0) + log2(density), data=flies)
summary(lm.fit)
##
## Call:
## lm(formula = rsp ~ ifelse(strain == "OL", 1, 0) + log2(density),
## data = flies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.88127 -0.28624 -0.04773 0.27274 1.02866
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.69386 0.53610 12.486 2.66e-10 ***
## ifelse(strain == "OL", 1, 0) 0.77143 0.24552 3.142 0.00564 **
## log2(density) 0.39503 0.06151 6.422 4.80e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5304 on 18 degrees of freedom
## Multiple R-squared: 0.7396, Adjusted R-squared: 0.7106
## F-statistic: 25.56 on 2 and 18 DF, p-value: 5.51e-06
Here is the interpretation of our model.
params <- summary(lm.fit)$coef[,1]
params
## (Intercept) ifelse(strain == "OL", 1, 0) log2(density)
## 6.6938593 0.7714286 0.3950305
Comparing strains of flies at any density, we expect the OL flies to take
params[2]
## ifelse(strain == "OL", 1, 0)
## 0.7714286
days longer to develop than either the BELL or bwb flies.
Comparing flies of the same strain at two densities, one twice as large as the other, we expect the flies to take
params[3]
## log2(density)
## 0.3950305
days longer to develop in the more crowded environment.