Statistics for Laboratory Scientists ( 140.615 )

Prediction and Calibration

Prediction

A simple example

Generate some random data, 20 design points equally spaced on the x-axis, and assume E[y|x] = 10 + 5x. Assume Gaussian noise with mean zero and standard deviation 10.

set.seed(1)
x <- 1:20
x
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
eps <- rnorm(20, sd=10)
eps
##  [1]  -6.2645381   1.8364332  -8.3562861  15.9528080   3.2950777  -8.2046838   4.8742905   7.3832471
##  [9]   5.7578135  -3.0538839  15.1178117   3.8984324  -6.2124058 -22.1469989  11.2493092  -0.4493361
## [17]  -0.1619026   9.4383621   8.2122120   5.9390132
y <- 10 + 5*x + eps
y
##  [1]   8.735462  21.836433  16.643714  45.952808  38.295078  31.795316  49.874291  57.383247  60.757814
## [10]  56.946116  80.117812  73.898432  68.787594  57.853001  96.249309  89.550664  94.838097 109.438362
## [19] 113.212212 115.939013
plot(x,y)

Fit a linear regression model.

xydat <- data.frame(x,y)
head(xydat)
##   x         y
## 1 1  8.735462
## 2 2 21.836433
## 3 3 16.643714
## 4 4 45.952808
## 5 5 38.295078
## 6 6 31.795316
lm.fit <- lm(y ~ x, data=xydat)
summary(lm.fit)
## 
## Call:
## lm(formula = y ~ x, data = xydat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.808  -5.168   1.875   4.833  15.450 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.6391     4.3158   2.233   0.0385 *  
## x             5.2158     0.3603  14.477 2.33e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.291 on 18 degrees of freedom
## Multiple R-squared:  0.9209, Adjusted R-squared:  0.9165 
## F-statistic: 209.6 on 1 and 18 DF,  p-value: 2.33e-11
confint(lm.fit)
##                2.5 %    97.5 %
## (Intercept) 0.571931 18.706212
## x           4.458915  5.972736

The fitted values.

lm.fit$fitted
##         1         2         3         4         5         6         7         8         9        10        11 
##  14.85490  20.07072  25.28655  30.50237  35.71820  40.93402  46.14985  51.36568  56.58150  61.79733  67.01315 
##        12        13        14        15        16        17        18        19        20 
##  72.22898  77.44480  82.66063  87.87645  93.09228  98.30810 103.52393 108.73976 113.95558
fitted(lm.fit)
##         1         2         3         4         5         6         7         8         9        10        11 
##  14.85490  20.07072  25.28655  30.50237  35.71820  40.93402  46.14985  51.36568  56.58150  61.79733  67.01315 
##        12        13        14        15        16        17        18        19        20 
##  72.22898  77.44480  82.66063  87.87645  93.09228  98.30810 103.52393 108.73976 113.95558
predict(lm.fit)
##         1         2         3         4         5         6         7         8         9        10        11 
##  14.85490  20.07072  25.28655  30.50237  35.71820  40.93402  46.14985  51.36568  56.58150  61.79733  67.01315 
##        12        13        14        15        16        17        18        19        20 
##  72.22898  77.44480  82.66063  87.87645  93.09228  98.30810 103.52393 108.73976 113.95558

Make a picture with the regression line and the fitted values.

plot(xydat$x, xydat$y, xlab="x", ylab="y")
abline(lsfit(xydat$x, xydat$y))
points(xydat$x, predict(lm.fit), pch=21, bg="grey")

You can use the ‘predict’ function to get predicted values and confidence limits for the mean response for any value of x.

predict(lm.fit, data.frame(x=c(5,10,20)), interval="confidence")
##         fit       lwr       upr
## 1  35.71820  29.68662  41.74978
## 2  61.79733  57.41639  66.17826
## 3 113.95558 105.54399 122.36717
predict(lm.fit, data.frame(x=c(5,10,20)), interval="confidence", level=0.99)
##         fit       lwr       upr
## 1  35.71820  27.45442  43.98198
## 2  61.79733  55.79508  67.79958
## 3 113.95558 102.43100 125.48016

Plot the confidence band for the mean response.

xx <- seq(1, 20, by=0.1)
predict.mean <- predict(lm.fit, data.frame(x=xx), interval="confidence")
plot(xydat$x, xydat$y, xlab="x", ylab="y", ylim=range(predict.mean), col="lightgrey")
lines(xx, predict.mean[,1], lwd=2)
lines(xx, predict.mean[,2], lty=2)
lines(xx, predict.mean[,3], lty=2)

You can also use the ‘predict’ function to get prediction intervals for new observations.

predict(lm.fit, data.frame(x=c(5,10,20)), interval="prediction")
##         fit      lwr       upr
## 1  35.71820 15.28863  56.14777
## 2  61.79733 41.79283  81.80182
## 3 113.95558 92.70136 135.20980

Plot the confidence band for the mean response, and the prediction interval for new observations.

predict.new <- predict(lm.fit, data.frame(x=xx), interval="prediction")
plot(xydat$x, xydat$y, xlab="x", ylab="y", ylim=range(predict.new), col="lightgrey")
lines(xx, predict.mean[,1], lwd=2)
lines(xx, predict.mean[,2], lty=2)
lines(xx, predict.mean[,3], lty=2)
lines(xx, predict.new[,2], lty=2, col="blue")
lines(xx, predict.new[,3], lty=2, col="blue")

Example: David Sullivan’s heme data

h2o2 <- rep(c(0,10,25,50), each=3)
pf3d7 <- c(0.3399,0.3563,0.3538,
           0.3168,0.3054,0.3174,
           0.2460,0.2618,0.2848,
           0.1535,0.1613,0.1525)
dat <- data.frame(conc=h2o2, od=pf3d7)

The regression of optical density on hydrogen peroxide concentration.

lm.fit <- lm(od ~ conc, data=dat)
summary(lm.fit)$coef
##                 Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept)  0.353050073 0.0049955194  70.67335 7.844454e-15
## conc        -0.003870984 0.0001759324 -22.00268 8.426407e-10

Predict the optical density at H202 concentration 30.

predict(lm.fit, data.frame(conc=30))
##         1 
## 0.2369206

Predict the optical density at H202 concentrations 20, 30, and 40.

predict(lm.fit, data.frame(conc=c(20,30,40)))
##         1         2         3 
## 0.2756304 0.2369206 0.1982107

Predict the optical density at H202 concentrations 20, 30, and 40, and get the 95% confidence intervals.

predict(lm.fit, data.frame(conc=c(20,30,40)), interval="confidence")
##         fit       lwr       upr
## 1 0.2756304 0.2682315 0.2830293
## 2 0.2369206 0.2287800 0.2450611
## 3 0.1982107 0.1877931 0.2086283

Predict the optical density at H202 concentrations 20, 30, and 40, and get the 95% prediction intervals.

predict(lm.fit, data.frame(conc=c(20,30,40)),interval="prediction")
##         fit       lwr       upr
## 1 0.2756304 0.2490074 0.3022533
## 2 0.2369206 0.2100820 0.2637591
## 3 0.1982107 0.1705961 0.2258253

Plot the confidence band for the mean response, and the prediction interval for new observations.

xx <- seq(0, 50, length=101)
predict.mean <- predict(lm.fit, data.frame(conc=xx), interval="confidence")
predict.new <- predict(lm.fit, data.frame(conc=xx), interval="prediction")
plot(dat$conc, dat$od, xlab="H2O2 concentration", ylab="OD", ylim=range(predict.new), col="lightgrey")
lines(xx, predict.mean[,1], lwd=2)
lines(xx, predict.mean[,2], lty=2)
lines(xx, predict.mean[,3], lty=2)
lines(xx, predict.new[,2], lty=2, col="blue")
lines(xx, predict.new[,3], lty=2, col="blue")

Calibration

Example from class: concentration of quinine in a set of standards

dat <- data.frame(quinine=c( 0, 0, 0, 
                            12,12,12, 
                            24,24,24, 
                            36,36,36),
                  fluor=c(100.45, 98.92,101.33, 
                          133.19,127.33,126.78,
                          152.72,157.43,160.81, 
                          188.73,191.96,183.70))
dat
##    quinine  fluor
## 1        0 100.45
## 2        0  98.92
## 3        0 101.33
## 4       12 133.19
## 5       12 127.33
## 6       12 126.78
## 7       24 152.72
## 8       24 157.43
## 9       24 160.81
## 10      36 188.73
## 11      36 191.96
## 12      36 183.70

Plot the data.

plot(dat)
abline(lsfit(dat$quinine, dat$fluor), col="blue")

Using the ‘calibrate’ function from the SPH.140.615 package, estimate the concentration corresponding to the fluorescence of 143.70.

library(SPH.140.615)
calibrate(dat$quinine, dat$fluor, 143.70)
##      est       lo       hi 
## 18.03601 14.97447 21.09755

Estimate the concentration on the basis of 3 independent measurements.

newy <- c(148.56, 149.36, 150.29)
calibrate(dat$quinine, dat$fluor, newy)
##      est       lo       hi 
## 20.38325 18.46910 22.29739