Statistics for Laboratory Scientists ( 140.615 )

Logistic Regression

Example from class: spider mites

The data are in the SPH.140.615 package.

library(SPH.140.615)
example(spiders)
## 
## spidrs> spiders
##   dose n.dead  n
## 1  0.0      7 25
## 2  0.5      6 25
## 3  1.0     13 25
## 4  1.5     20 25
## 5  2.0     19 25
## 6  2.5     23 25
## 7  3.0     24 25
## 
## spidrs> spiders$n.dead/spiders$n
## [1] 0.28 0.24 0.52 0.80 0.76 0.92 0.96

Fit the logistic regression model.

glm.out <- glm(n.dead/n ~ dose, data=spiders, weights=n, family=binomial(link=logit))
summary(glm.out)$coef
##              Estimate Std. Error   z value     Pr(>|z|)
## (Intercept) -1.329592  0.3274858 -4.059999 4.907289e-05
## dose         1.448457  0.2303103  6.289153 3.192028e-10

Plot the data with the fitted curve.

par(las=1)
plot(n.dead/n ~ dose, data=spiders, lwd=2, ylim=c(0,1), ylab="proportion dead")
u <- par("usr")
x <- seq(0,u[2],len=250)
y <- predict(glm.out, data.frame(dose=x), type="response")
lines(x, y, col="blue", lwd=2)

Interpretation of the model

The model parameters from the logistic model.

params <- glm.out$coef
params
## (Intercept)        dose 
##   -1.329592    1.448457

When the dose is equal to zero, the log odds of death among the spider mites is

params[1]
## (Intercept) 
##   -1.329592

When the dose is equal to zero, the probability of death among the spider mites is

exp(params[1])/(1+exp(params[1]))
## (Intercept) 
##   0.2092268

Comparing a spider mite receiving a dose one unit larger than another spider mite, the log odds ratio (or equivalently, difference in log odds) of death is

params[2]
##     dose 
## 1.448457

In other words, the odds of death are

exp(params[2])
##     dose 
## 4.256541

times larger for the spider mite receiving the higher dose.

What is the LD50?
ld50 <- -params[1]/params[2]
ld50
## (Intercept) 
##    0.917937

Calculate the 95% confidence interval using a function from the SPH.140.615 package to calculate the estimated standard error.

se.ld50 <- ld50se(glm.out)
se.ld50
## (Intercept) 
##   0.1359084
ci.ld50 <- ld50 + c(-1,1) * qnorm(0.975) * se.ld50
ci.ld50
## [1] 0.6515615 1.1843125
Alternative fitting of the logistic regression

Sometimes the data are recorded for each experimental unit, with individual outcome (e.g., 0/1, alive/dead, etc).

d <- spiders$dose
n0 <- 25-spiders$n.dead
n1 <- spiders$n.dead
d0 <- rep(d,n0)
d1 <- rep(d,n1)
y0 <- rep(0,sum(n0))
y1 <- rep(1,sum(n1))
dat <- data.frame(y=c(y0,y1),dose=c(d0,d1))
dim(dat)
## [1] 175   2
head(dat)
##   y dose
## 1 0    0
## 2 0    0
## 3 0    0
## 4 0    0
## 5 0    0
## 6 0    0
tail(dat)
##     y dose
## 170 1    3
## 171 1    3
## 172 1    3
## 173 1    3
## 174 1    3
## 175 1    3
table(dat$y,dat$dose)
##    
##      0 0.5  1 1.5  2 2.5  3
##   0 18  19 12   5  6   2  1
##   1  7   6 13  20 19  23 24

The logistic regression for this data format.

glm.out <- glm(y ~ dose, data=dat, family=binomial(link=logit))
summary(glm.out)$coef
##              Estimate Std. Error   z value     Pr(>|z|)
## (Intercept) -1.329592  0.3274702 -4.060193 4.903219e-05
## dose         1.448457  0.2302846  6.289855 3.177630e-10

Example from class: tobacco budworms

The data are in the SPH.140.615 package, and use log2 dose.

worms
##    dose n.dead  n    sex
## 1     1      1 20   male
## 2     2      4 20   male
## 3     4      9 20   male
## 4     8     13 20   male
## 5    16     18 20   male
## 6    32     20 20   male
## 7     1      0 20 female
## 8     2      2 20 female
## 9     4      6 20 female
## 10    8     10 20 female
## 11   16     12 20 female
## 12   32     16 20 female
worms$log2dose <- log2(worms$dose)
worms
##    dose n.dead  n    sex log2dose
## 1     1      1 20   male        0
## 2     2      4 20   male        1
## 3     4      9 20   male        2
## 4     8     13 20   male        3
## 5    16     18 20   male        4
## 6    32     20 20   male        5
## 7     1      0 20 female        0
## 8     2      2 20 female        1
## 9     4      6 20 female        2
## 10    8     10 20 female        3
## 11   16     12 20 female        4
## 12   32     16 20 female        5

Fit the model, sexes completely different.

glm.out <- glm(n.dead/n ~ sex*log2dose, weights=n, data=worms, family=binomial(link=logit))
summary(glm.out)$coef
##                    Estimate Std. Error    z value     Pr(>|z|)
## (Intercept)      -2.9935418  0.5526997 -5.4162175 6.087304e-08
## sexmale           0.1749868  0.7783100  0.2248292 8.221122e-01
## log2dose          0.9060364  0.1671016  5.4220678 5.891353e-08
## sexmale:log2dose  0.3529130  0.2699902  1.3071324 1.911678e-01

Fit the model with different slopes but common intercept.

glm.out <- glm(n.dead/n ~ log2dose + sex:log2dose, weights=n, data=worms, family=binomial(link=logit))
summary(glm.out)$coef
##                    Estimate Std. Error   z value     Pr(>|z|)
## (Intercept)      -2.9072994  0.3892857 -7.468293 8.124171e-14
## log2dose          0.8823333  0.1275071  6.919875 4.520414e-12
## log2dose:sexmale  0.4070062  0.1247047  3.263759 1.099447e-03

Plot the data with the fitted curves.

par(las=1)
plot(n.dead/n ~ log2dose, data=worms, col=ifelse(sex=="male","blue","red"), lwd=2, ylim=c(0,1), 
     xlab="log2 dose", ylab="proprtion dead")
u <- par("usr")
x <- seq(0,u[2],len=250)
ym <- predict(glm.out, data.frame(log2dose=x,sex=factor(rep("male",length(x)))), type="response")
lines(x, ym, col="blue")
yf <- predict(glm.out, data.frame(log2dose=x,sex=factor(rep("female",length(x)))), type="response")
lines(x, yf, col="red")
legend(u[2], u[3], col=c("blue","red"), lty=1, xjust=1, yjust=0, c("Male","Female"))

Interpretation of the model

The model parameters from the logistic model.

params <- glm.out$coef
params
##      (Intercept)         log2dose log2dose:sexmale 
##       -2.9072994        0.8823333        0.4070062

The intercept and slope for the female tobacco budworms.

params[1:2]
## (Intercept)    log2dose 
##  -2.9072994   0.8823333

The slope for the male tobacco budworms.

params[2]+params[3]
## log2dose 
##  1.28934

The intercept and slope for the male tobacco budworms.

c(params[1], params[2]+params[3])
## (Intercept)    log2dose 
##   -2.907299    1.289340

When the dose is equal to one (i.e., the log2 dose is 0), the log odds of death among the tobacco budworms (both male and female) is

params[1]
## (Intercept) 
##   -2.907299

When the dose is equal to one (i.e., the log2 dose is 0), the probability of death among the tobacco budworms (both male and female) is

exp(params[1])/(1+exp(params[1]))
## (Intercept) 
##  0.05179391

Comparing a female tobacco budworm receiving a dose twice as large as another female tobacco budworm (i.e., the log2 dose difference is 1), the log odds ratio of death is

params[2]
##  log2dose 
## 0.8823333

In other words, the odds of death are

exp(params[2])
## log2dose 
## 2.416532

times larger for the female tobacco budworm receiving the higher dose.

Comparing a male tobacco budworm receiving a dose twice as large as another male tobacco budworm (i.e., the log2 dose difference is 1), the log odds ratio of death is

params[2]+params[3]
## log2dose 
##  1.28934

In other words, the odds of death are

exp(params[2]+params[3])
## log2dose 
## 3.630388

times larger for the male tobacco budworm receiving the higher dose.

Example from class: the ticks on the clay islands, revisited

The data.

ticks
##   Tsex Leg Dsex  T U
## 1   ma  fo   fe 24 5
## 2   fe  fo   fe 18 5
## 3   ma  fo   ma 23 4
## 4   fe  fo   ma 20 4
## 5   ma  hi   fe 17 8
## 6   fe  hi   fe 25 3
## 7   ma  hi   ma 21 6
## 8   fe  hi   ma 25 2

Calculating the total number of ticks under the respective conditions.

ticks$n <- ticks$T + ticks$U
ticks
##   Tsex Leg Dsex  T U  n
## 1   ma  fo   fe 24 5 29
## 2   fe  fo   fe 18 5 23
## 3   ma  fo   ma 23 4 27
## 4   fe  fo   ma 20 4 24
## 5   ma  hi   fe 17 8 25
## 6   fe  hi   fe 25 3 28
## 7   ma  hi   ma 21 6 27
## 8   fe  hi   ma 25 2 27

Fit the full model, will all higher order interactions.

glm.out1 <- glm(T/n ~ Tsex * Leg * Dsex, data=ticks, weights=n, family=binomial(link=logit))
summary(glm.out1)$coef
##                        Estimate Std. Error    z value   Pr(>|z|)
## (Intercept)          1.28093385  0.5055250  2.5338683 0.01128111
## Tsexma               0.28768207  0.7051399  0.4079787 0.68328928
## Leghi                0.83932969  0.7930252  1.0583898 0.28987779
## Dsexma               0.32850407  0.7453560  0.4407345 0.65940525
## Tsexma:Leghi        -1.65417381  1.0268296 -1.6109525 0.10719007
## Tsexma:Dsexma       -0.14792013  1.0443661 -0.1416363 0.88736730
## Leghi:Dsexma         0.07696104  1.2119773  0.0635004 0.94936804
## Tsexma:Leghi:Dsexma  0.24144619  1.5498849  0.1557833 0.87620383

The three-way interaction is not significant. Fit the model without the three-way interaction.

glm.out2 <- glm(T/n ~ Tsex * Leg * Dsex - Tsex:Leg:Dsex, data=ticks, weights=n, family=binomial(link=logit))
summary(glm.out2)$coef
##                  Estimate Std. Error     z value    Pr(>|z|)
## (Intercept)    1.30673406  0.4811558  2.71582291 0.006611127
## Tsexma         0.23785126  0.6284471  0.37847461 0.705078048
## Leghi          0.77665144  0.6796707  1.14268790 0.253168194
## Dsexma         0.27293367  0.6532456  0.41781172 0.676084782
## Tsexma:Leghi  -1.54898015  0.7701768 -2.01120062 0.044304270
## Tsexma:Dsexma -0.03859667  0.7736434 -0.04988948 0.960210457
## Leghi:Dsexma   0.22481996  0.7570753  0.29695850 0.766498193

Alternative syntax.

glm.out2 <- glm(T/n ~ (Tsex + Leg + Dsex)^2, data=ticks, weights=n, family=binomial(link=logit))
summary(glm.out2)$coef
##                  Estimate Std. Error     z value    Pr(>|z|)
## (Intercept)    1.30673406  0.4811558  2.71582291 0.006611127
## Tsexma         0.23785126  0.6284471  0.37847461 0.705078048
## Leghi          0.77665144  0.6796707  1.14268790 0.253168194
## Dsexma         0.27293367  0.6532456  0.41781172 0.676084782
## Tsexma:Leghi  -1.54898015  0.7701768 -2.01120062 0.044304270
## Tsexma:Dsexma -0.03859667  0.7736434 -0.04988948 0.960210457
## Leghi:Dsexma   0.22481996  0.7570753  0.29695850 0.766498193

The two-way interactions are not very significant. Fit an additive model, and test for a significant decrease in the likelihood.

glm.out3 <- glm(T/n ~ Tsex + Leg + Dsex, data=ticks, weights=n, family=binomial(link=logit))
summary(glm.out3)$coef
##                Estimate Std. Error    z value     Pr(>|z|)
## (Intercept)  1.71152444  0.3854051  4.4408448 8.960639e-06
## Tsexma      -0.53662365  0.3729989 -1.4386736 1.502430e-01
## Leghi       -0.05517661  0.3656716 -0.1508912 8.800616e-01
## Dsexma       0.33388122  0.3669443  0.9098961 3.628773e-01
anova(glm.out3, glm.out2, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: T/n ~ Tsex + Leg + Dsex
## Model 2: T/n ~ (Tsex + Leg + Dsex)^2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1         4     4.2926                     
## 2         1     0.0242  3   4.2684   0.2339

Calculate the confidence intervals.

confint(glm.out3)
## Waiting for profiling to be done...
##                  2.5 %    97.5 %
## (Intercept)  0.9912551 2.5101459
## Tsexma      -1.2878749 0.1841138
## Leghi       -0.7783154 0.6636186
## Dsexma      -0.3816322 1.0656641

Summary: the ticks seems to have a preference for the treated tubes (since 0 is not in the 95% CI for the intercept), but neither tick sex, deer sex, or type of leg seem to matter as far as rates are concerned. So we fit a model with an intercept only.

glm.out <- glm(T/n ~ 1, data=ticks, weights=n, family=binomial(link=logit))
summary(glm.out)$coef
##             Estimate Std. Error  z value     Pr(>|z|)
## (Intercept) 1.542374   0.181128 8.515378 1.660476e-17

Estimate the proportion of ticks running to the treated tube on average.

b0 <- summary(glm.out)$coef[1,1]
b0
## [1] 1.542374
exp(b0)/(1+exp(b0))
## [1] 0.8238095

Get the confidence interval for the proportion of ticks running to the treated tube on average.

ci <- confint(glm.out)
## Waiting for profiling to be done...
ci
##    2.5 %   97.5 % 
## 1.199959 1.912093
exp(ci)/(1+exp(ci))
##     2.5 %    97.5 % 
## 0.7685174 0.8712541