Statistics for Laboratory Scientists ( 140.615 )

Logistic Regression

Example from class: spider mites

The data are in the SPH.140.615 package.

library(SPH.140.615)
example(spiders)
## 
## spidrs> spiders
##   dose n.dead  n
## 1  0.0      7 25
## 2  0.5      6 25
## 3  1.0     13 25
## 4  1.5     20 25
## 5  2.0     19 25
## 6  2.5     23 25
## 7  3.0     24 25
## 
## spidrs> spiders$n.dead/spiders$n
## [1] 0.28 0.24 0.52 0.80 0.76 0.92 0.96

Fit the logistic regression model.

glm.out <- glm(n.dead/n ~ dose, data=spiders, weights=n, family=binomial(link=logit))
summary(glm.out)$coef
##              Estimate Std. Error   z value     Pr(>|z|)
## (Intercept) -1.329592  0.3274858 -4.059999 4.907289e-05
## dose         1.448457  0.2303103  6.289153 3.192028e-10

Plot the data with the fitted curve.

par(las=1)
plot(n.dead/n ~ dose, data=spiders, lwd=2, ylim=c(0,1), ylab="proportion dead")
u <- par("usr")
x <- seq(0,u[2],len=250)
y <- predict(glm.out, data.frame(dose=x), type="response")
lines(x, y, col="blue", lwd=2)

Interpretation of the model

The model parameters from the logistic model.

params <- glm.out$coef
params
## (Intercept)        dose 
##   -1.329592    1.448457

When the dose is equal to zero, the log odds of death among the spider mites is

params[1]
## (Intercept) 
##   -1.329592

When the dose is equal to zero, the probability of death among the spider mites is

exp(params[1])/(1+exp(params[1]))
## (Intercept) 
##   0.2092268

Comparing a spider mite receiving a dose one unit larger than another spider mite, the log odds ratio (or equivalently, difference in log odds) of death is

params[2]
##     dose 
## 1.448457

In other words, the odds of death are

exp(params[2])
##     dose 
## 4.256541

times larger for the spider mite receiving the higher dose.

What is the LD50?
ld50 <- -params[1]/params[2]
ld50
## (Intercept) 
##    0.917937

Calculate the 95% confidence interval using a function from the SPH.140.615 package to calculate the estimated standard error.

se.ld50 <- ld50se(glm.out)
se.ld50
## (Intercept) 
##   0.1359084
ci.ld50 <- ld50 + c(-1,1) * qnorm(0.975) * se.ld50
ci.ld50
## [1] 0.6515615 1.1843125
Alternative fitting of the logistic regression

Sometimes the data are recorded for each experimental unit, with individual outcome (e.g., 0/1, alive/dead, etc).

d <- spiders$dose
n0 <- 25-spiders$n.dead
n1 <- spiders$n.dead
d0 <- rep(d,n0)
d1 <- rep(d,n1)
y0 <- rep(0,sum(n0))
y1 <- rep(1,sum(n1))
dat <- data.frame(y=c(y0,y1),dose=c(d0,d1))
dim(dat)
## [1] 175   2
head(dat)
##   y dose
## 1 0    0
## 2 0    0
## 3 0    0
## 4 0    0
## 5 0    0
## 6 0    0
tail(dat)
##     y dose
## 170 1    3
## 171 1    3
## 172 1    3
## 173 1    3
## 174 1    3
## 175 1    3
table(dat$y,dat$dose)
##    
##      0 0.5  1 1.5  2 2.5  3
##   0 18  19 12   5  6   2  1
##   1  7   6 13  20 19  23 24

The logistic regression for this data format.

glm.out <- glm(y ~ dose, data=dat, family=binomial(link=logit))
summary(glm.out)$coef
##              Estimate Std. Error   z value     Pr(>|z|)
## (Intercept) -1.329592  0.3274702 -4.060193 4.903219e-05
## dose         1.448457  0.2302846  6.289855 3.177630e-10

Example from class: tobacco budworms

The data are in the SPH.140.615 package, and use log2 dose.

worms
##    dose n.dead  n    sex
## 1     1      1 20   male
## 2     2      4 20   male
## 3     4      9 20   male
## 4     8     13 20   male
## 5    16     18 20   male
## 6    32     20 20   male
## 7     1      0 20 female
## 8     2      2 20 female
## 9     4      6 20 female
## 10    8     10 20 female
## 11   16     12 20 female
## 12   32     16 20 female
worms$log2dose <- log2(worms$dose)
worms
##    dose n.dead  n    sex log2dose
## 1     1      1 20   male        0
## 2     2      4 20   male        1
## 3     4      9 20   male        2
## 4     8     13 20   male        3
## 5    16     18 20   male        4
## 6    32     20 20   male        5
## 7     1      0 20 female        0
## 8     2      2 20 female        1
## 9     4      6 20 female        2
## 10    8     10 20 female        3
## 11   16     12 20 female        4
## 12   32     16 20 female        5

Fit the model, sexes completely different.

glm.out <- glm(n.dead/n ~ sex*log2dose, weights=n, data=worms, family=binomial(link=logit))
summary(glm.out)$coef
##                    Estimate Std. Error    z value     Pr(>|z|)
## (Intercept)      -2.9935418  0.5526997 -5.4162175 6.087304e-08
## sexmale           0.1749868  0.7783100  0.2248292 8.221122e-01
## log2dose          0.9060364  0.1671016  5.4220678 5.891353e-08
## sexmale:log2dose  0.3529130  0.2699902  1.3071324 1.911678e-01

Fit the model with different slopes but common intercept.

glm.out <- glm(n.dead/n ~ log2dose + sex:log2dose, weights=n, data=worms, family=binomial(link=logit))
summary(glm.out)$coef
##                    Estimate Std. Error   z value     Pr(>|z|)
## (Intercept)      -2.9072994  0.3892857 -7.468293 8.124171e-14
## log2dose          0.8823333  0.1275071  6.919875 4.520414e-12
## log2dose:sexmale  0.4070062  0.1247047  3.263759 1.099447e-03

Plot the data with the fitted curves.

par(las=1)
plot(n.dead/n ~ log2dose, data=worms, col=ifelse(sex=="male","blue","red"), lwd=2, ylim=c(0,1), 
     xlab="log2 dose", ylab="proprtion dead")
u <- par("usr")
x <- seq(0,u[2],len=250)
ym <- predict(glm.out, data.frame(log2dose=x,sex=factor(rep("male",length(x)))), type="response")
lines(x, ym, col="blue")
yf <- predict(glm.out, data.frame(log2dose=x,sex=factor(rep("female",length(x)))), type="response")
lines(x, yf, col="red")
legend(u[2], u[3], col=c("blue","red"), lty=1, xjust=1, yjust=0, c("Male","Female"))

Interpretation of the model

The model parameters from the logistic model.

params <- glm.out$coef
params
##      (Intercept)         log2dose log2dose:sexmale 
##       -2.9072994        0.8823333        0.4070062

The intercept and slope for the female tobacco budworms.

params[1:2]
## (Intercept)    log2dose 
##  -2.9072994   0.8823333

The slope for the male tobacco budworms.

params[2]+params[3]
## log2dose 
##  1.28934

The intercept and slope for the male tobacco budworms.

c(params[1], params[2]+params[3])
## (Intercept)    log2dose 
##   -2.907299    1.289340

When the dose is equal to one (i.e., the log2 dose is 0), the log odds of death among the tobacco budworms (both male and female) is

params[1]
## (Intercept) 
##   -2.907299

When the dose is equal to one (i.e., the log2 dose is 0), the probability of death among the tobacco budworms (both male and female) is

exp(params[1])/(1+exp(params[1]))
## (Intercept) 
##  0.05179391

Comparing a female tobacco budworm receiving a dose twice as large as another female tobacco budworm (i.e., the log2 dose difference is 1), the log odds ratio of death is

params[2]
##  log2dose 
## 0.8823333

In other words, the odds of death are

exp(params[2])
## log2dose 
## 2.416532

times larger for the female tobacco budworm receiving the higher dose.

Comparing a male tobacco budworm receiving a dose twice as large as another male tobacco budworm (i.e., the log2 dose difference is 1), the log odds ratio of death is

params[2]+params[3]
## log2dose 
##  1.28934

In other words, the odds of death are

exp(params[2]+params[3])
## log2dose 
## 3.630388

times larger for the male tobacco budworm receiving the higher dose.

Example from class: the ticks on the clay islands, revisited

The data.

ticks
##   Tsex Leg Dsex  T U
## 1   ma  fo   fe 24 5
## 2   fe  fo   fe 18 5
## 3   ma  fo   ma 23 4
## 4   fe  fo   ma 20 4
## 5   ma  hi   fe 17 8
## 6   fe  hi   fe 25 3
## 7   ma  hi   ma 21 6
## 8   fe  hi   ma 25 2

Calculating the total number of ticks under the respective conditions.

ticks$n <- ticks$T + ticks$U
ticks
##   Tsex Leg Dsex  T U  n
## 1   ma  fo   fe 24 5 29
## 2   fe  fo   fe 18 5 23
## 3   ma  fo   ma 23 4 27
## 4   fe  fo   ma 20 4 24
## 5   ma  hi   fe 17 8 25
## 6   fe  hi   fe 25 3 28
## 7   ma  hi   ma 21 6 27
## 8   fe  hi   ma 25 2 27

This is a \(\text{2}^\text{3}\) factorial design (three factors with two levels each). These designs in particular allow you to investigate the interactions between the factors.

Two-way interactions:

Does the tick sex effect differ across the levels of the factor leg?

Does the tick sex effect differ across the levels of the factor deer sex?

Does the leg effect differ across the levels of the factor deer sex?

Three-way interaction:

Does the tick sex / leg interaction differ across the levels of the factor deer sex?

Equivalently:

Does the tick sex / deer sex interaction differ across the levels of the factor leg?

Does the leg / deer sex interaction differ across the levels of the factor tick sex?

Fit the full model, will all higher order interactions. If you fit higher order interactions (e.g., the three-way interaction) always include the lower order interactions (e.g., the two-way interactions).

glm.out1 <- glm(T/n ~ Tsex * Leg * Dsex, data=ticks, weights=n, family=binomial(link=logit))
summary(glm.out1)$coef
##                        Estimate Std. Error    z value   Pr(>|z|)
## (Intercept)          1.28093385  0.5055250  2.5338683 0.01128111
## Tsexma               0.28768207  0.7051399  0.4079787 0.68328928
## Leghi                0.83932969  0.7930252  1.0583898 0.28987779
## Dsexma               0.32850407  0.7453560  0.4407345 0.65940525
## Tsexma:Leghi        -1.65417381  1.0268296 -1.6109525 0.10719007
## Tsexma:Dsexma       -0.14792013  1.0443661 -0.1416363 0.88736730
## Leghi:Dsexma         0.07696104  1.2119773  0.0635004 0.94936804
## Tsexma:Leghi:Dsexma  0.24144619  1.5498849  0.1557833 0.87620383

The three-way interaction is not significant. Fit the model without the three-way interaction.

glm.out2 <- glm(T/n ~ Tsex * Leg * Dsex - Tsex:Leg:Dsex, data=ticks, weights=n, family=binomial(link=logit))
summary(glm.out2)$coef
##                  Estimate Std. Error     z value    Pr(>|z|)
## (Intercept)    1.30673406  0.4811558  2.71582291 0.006611127
## Tsexma         0.23785126  0.6284471  0.37847461 0.705078048
## Leghi          0.77665144  0.6796707  1.14268790 0.253168194
## Dsexma         0.27293367  0.6532456  0.41781172 0.676084782
## Tsexma:Leghi  -1.54898015  0.7701768 -2.01120062 0.044304270
## Tsexma:Dsexma -0.03859667  0.7736434 -0.04988948 0.960210457
## Leghi:Dsexma   0.22481996  0.7570753  0.29695850 0.766498193

Alternative syntax.

glm.out2 <- glm(T/n ~ (Tsex + Leg + Dsex)^2, data=ticks, weights=n, family=binomial(link=logit))
summary(glm.out2)$coef
##                  Estimate Std. Error     z value    Pr(>|z|)
## (Intercept)    1.30673406  0.4811558  2.71582291 0.006611127
## Tsexma         0.23785126  0.6284471  0.37847461 0.705078048
## Leghi          0.77665144  0.6796707  1.14268790 0.253168194
## Dsexma         0.27293367  0.6532456  0.41781172 0.676084782
## Tsexma:Leghi  -1.54898015  0.7701768 -2.01120062 0.044304270
## Tsexma:Dsexma -0.03859667  0.7736434 -0.04988948 0.960210457
## Leghi:Dsexma   0.22481996  0.7570753  0.29695850 0.766498193

The two-way interactions are not very significant. Fit an additive model, and test for a significant decrease in the likelihood.

glm.out3 <- glm(T/n ~ Tsex + Leg + Dsex, data=ticks, weights=n, family=binomial(link=logit))
summary(glm.out3)$coef
##                Estimate Std. Error    z value     Pr(>|z|)
## (Intercept)  1.71152444  0.3854051  4.4408448 8.960639e-06
## Tsexma      -0.53662365  0.3729989 -1.4386736 1.502430e-01
## Leghi       -0.05517661  0.3656716 -0.1508912 8.800616e-01
## Dsexma       0.33388122  0.3669443  0.9098961 3.628773e-01
anova(glm.out3, glm.out2, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: T/n ~ Tsex + Leg + Dsex
## Model 2: T/n ~ (Tsex + Leg + Dsex)^2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1         4     4.2926                     
## 2         1     0.0242  3   4.2684   0.2339

Calculate the confidence intervals.

confint(glm.out3)
## Waiting for profiling to be done...
##                  2.5 %    97.5 %
## (Intercept)  0.9912551 2.5101459
## Tsexma      -1.2878749 0.1841138
## Leghi       -0.7783154 0.6636186
## Dsexma      -0.3816322 1.0656641

Summary: the ticks seems to have a preference for the treated tubes (since 0 is not in the 95% CI for the intercept), but neither tick sex, deer sex, or type of leg seem to matter as far as rates are concerned. So we fit a model with an intercept only.

glm.out <- glm(T/n ~ 1, data=ticks, weights=n, family=binomial(link=logit))
summary(glm.out)$coef
##             Estimate Std. Error  z value     Pr(>|z|)
## (Intercept) 1.542374   0.181128 8.515378 1.660476e-17

Estimate the proportion of ticks running to the treated tube on average.

b0 <- summary(glm.out)$coef[1,1]
b0
## [1] 1.542374
exp(b0)/(1+exp(b0))
## [1] 0.8238095

Get the confidence interval for the proportion of ticks running to the treated tube on average.

ci <- confint(glm.out)
## Waiting for profiling to be done...
ci
##    2.5 %   97.5 % 
## 1.199959 1.912093
exp(ci)/(1+exp(ci))
##     2.5 %    97.5 % 
## 0.7685174 0.8712541