Statistics for Laboratory Scientists ( 140.615 )

Linear Regression

Example from class: David Sullivan’s heme data

h2o2 <- rep(c(0,10,25,50), each=3)
h2o2
##  [1]  0  0  0 10 10 10 25 25 25 50 50 50
pf3d7 <- c(0.3399,0.3563,0.3538,
           0.3168,0.3054,0.3174,
           0.2460,0.2618,0.2848,
           0.1535,0.1613,0.1525)
pf3d7
##  [1] 0.3399 0.3563 0.3538 0.3168 0.3054 0.3174 0.2460 0.2618 0.2848 0.1535 0.1613 0.1525

Plot the data, and add the least squares fit line.

plot(h2o2, pf3d7, xlab="H2O2 concentration", ylab="OD")
abline(lsfit(h2o2, pf3d7), col="red", lty=2)

The regression of optical density on hydrogen peroxide concentration, using the built-in ‘lm’ function
lm.out <- lm(pf3d7 ~ h2o2)
lm.out
## 
## Call:
## lm(formula = pf3d7 ~ h2o2)
## 
## Coefficients:
## (Intercept)         h2o2  
##    0.353050    -0.003871
lm.sum <- summary(lm.out)
lm.sum
## 
## Call:
## lm(formula = pf3d7 ~ h2o2)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.013150 -0.007486  0.001275  0.003107  0.028525 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.3530501  0.0049955   70.67 7.84e-15 ***
## h2o2        -0.0038710  0.0001759  -22.00 8.43e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01148 on 10 degrees of freedom
## Multiple R-squared:  0.9798, Adjusted R-squared:  0.9777 
## F-statistic: 484.1 on 1 and 10 DF,  p-value: 8.426e-10
attributes(lm.sum)
## $names
##  [1] "call"          "terms"         "residuals"     "coefficients"  "aliased"       "sigma"        
##  [7] "df"            "r.squared"     "adj.r.squared" "fstatistic"    "cov.unscaled" 
## 
## $class
## [1] "summary.lm"

The table of coefficients.

lm.sum$coef
##                 Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept)  0.353050073 0.0049955194  70.67335 7.844454e-15
## h2o2        -0.003870984 0.0001759324 -22.00268 8.426407e-10

The parameter estimates.

lm.sum$coef[,1]
##  (Intercept)         h2o2 
##  0.353050073 -0.003870984

The residual standard deviation.

lm.sum$sigma
## [1] 0.01147782

To avoid data glitches, it is a good habit to make a data frame.

dat <- data.frame(conc=h2o2, od=pf3d7)
dat
##    conc     od
## 1     0 0.3399
## 2     0 0.3563
## 3     0 0.3538
## 4    10 0.3168
## 5    10 0.3054
## 6    10 0.3174
## 7    25 0.2460
## 8    25 0.2618
## 9    25 0.2848
## 10   50 0.1535
## 11   50 0.1613
## 12   50 0.1525
str(dat)
## 'data.frame':    12 obs. of  2 variables:
##  $ conc: num  0 0 0 10 10 10 25 25 25 50 ...
##  $ od  : num  0.34 0.356 0.354 0.317 0.305 ...
plot(dat$conc, dat$od, xlab="H2O2 concentration", ylab="OD")
abline(lsfit(dat$conc, dat$od), col="red", lty=2)

lm.out <- lm(od ~ conc, data=dat)
summary(lm.out)
## 
## Call:
## lm(formula = od ~ conc, data = dat)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.013150 -0.007486  0.001275  0.003107  0.028525 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.3530501  0.0049955   70.67 7.84e-15 ***
## conc        -0.0038710  0.0001759  -22.00 8.43e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01148 on 10 degrees of freedom
## Multiple R-squared:  0.9798, Adjusted R-squared:  0.9777 
## F-statistic: 484.1 on 1 and 10 DF,  p-value: 8.426e-10
Confidence intervals for the regression parameters
confint(lm.out)
##                    2.5 %       97.5 %
## (Intercept)  0.341919363  0.364180784
## conc        -0.004262986 -0.003478982
confint(lm.out, level=0.99)
##                    0.5 %       99.5 %
## (Intercept)  0.337217910  0.368882237
## conc        -0.004428562 -0.003313406
Checking model assumptions
lm.out$fitted
##         1         2         3         4         5         6         7         8         9        10        11 
## 0.3530501 0.3530501 0.3530501 0.3143402 0.3143402 0.3143402 0.2562755 0.2562755 0.2562755 0.1595009 0.1595009 
##        12 
## 0.1595009
lm.out$residuals
##             1             2             3             4             5             6             7 
## -0.0131500734  0.0032499266  0.0007499266  0.0024597651 -0.0089402349  0.0030597651 -0.0102754772 
##             8             9            10            11            12 
##  0.0055245228  0.0285245228 -0.0060008811  0.0017991189 -0.0070008811

Alternatively, there are also generic functions.

fitted(lm.out)
##         1         2         3         4         5         6         7         8         9        10        11 
## 0.3530501 0.3530501 0.3530501 0.3143402 0.3143402 0.3143402 0.2562755 0.2562755 0.2562755 0.1595009 0.1595009 
##        12 
## 0.1595009
residuals(lm.out)
##             1             2             3             4             5             6             7 
## -0.0131500734  0.0032499266  0.0007499266  0.0024597651 -0.0089402349  0.0030597651 -0.0102754772 
##             8             9            10            11            12 
##  0.0055245228  0.0285245228 -0.0060008811  0.0017991189 -0.0070008811

The residual qq plot.

qqnorm(lm.out$residuals, main="")
qqline(lm.out$residuals, col="blue", lty=2)

Fitted values versus residuals.

plot(lm.out$fitted, lm.out$residuals, pch=1, xlab="fitted values", ylab="residuals")
abline(h=0, col="blue", lty=2)