Statistics for Laboratory Scientists ( 140.615 )

Non-linear Regression

Example from class: Michaelis-Menten equations

The data.

library(SPH.140.615)
puro
##    conc vel     state
## 1  0.02  76   treated
## 2  0.02  47   treated
## 3  0.06  97   treated
## 4  0.06 107   treated
## 5  0.11 123   treated
## 6  0.11 139   treated
## 7  0.22 159   treated
## 8  0.22 152   treated
## 9  0.56 191   treated
## 10 0.56 201   treated
## 11 1.10 207   treated
## 12 1.10 200   treated
## 13 0.02  67 untreated
## 14 0.02  51 untreated
## 15 0.06  84 untreated
## 16 0.06  86 untreated
## 17 0.11  98 untreated
## 18 0.11 115 untreated
## 19 0.22 131 untreated
## 20 0.22 124 untreated
## 21 0.56 144 untreated
## 22 0.56 158 untreated
## 23 1.10 160 untreated

Plot the data, just for the untreated cells.

plot(vel ~ conc, data=puro, subset=(state=="untreated"), xlab="concentration", ylab="initial velocity", lwd=2)

The regression of 1/V on 1/C, just with the data on untreated cells.
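
The Michaelis-Menten model \(V = V_{max} \, C / (K + C)\) implies a linear relationship on the reciprocal scale (the Lineweaver-Burk transformation):

\[\frac{1}{V} = \frac{1}{V_{max}} + \frac{K}{V_{max}} \cdot \frac{1}{C}\]

The intercept of the regression below therefore estimates \(1/V_{max}\) and the slope estimates \(K/V_{max}\), which is how vmax and km are recovered from the linear fit.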

temp <- lm(1/vel ~ I(1/conc), data=puro, subset=(state=="untreated"))
summary(temp)$coef
##                 Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept) 0.0069721337 5.678297e-04 12.278565 6.331192e-07
## I(1/conc)   0.0002150012 2.479421e-05  8.671428 1.156213e-05
vmax <- 1/temp$coef[1]
vmax
## (Intercept) 
##    143.4281
km <- temp$coef[2]*vmax
km
##  I(1/conc) 
## 0.03083721

Plot the data with the regression line.

plot(I(1/vel) ~ I(1/conc), data=puro, subset=(state=="untreated"), 
     xlab="1 / concentration", ylab="1 / initial velocity", lwd=2)
abline(temp$coef, col="red", lty=2, lwd=2)

The non-linear fit, using the estimates from the linear regression above as starting values.

nls.out <- nls(vel ~ (Vm*conc)/(K+conc), data=puro, subset=(state=="untreated"), start = c(Vm=143,K=0.031))
summary(nls.out)$param
##        Estimate  Std. Error   t value     Pr(>|t|)
## Vm 160.28001125 6.480243120 24.733642 1.384612e-09
## K    0.04770813 0.007781868  6.130678 1.727056e-04

Plot the data, with both fitted curves.

plot(vel ~ conc, data=puro, subset=(state=="untreated"), xlab="concentration", ylab="initial velocity", lwd=2)
u <- par("usr")
x <- seq(u[1],u[2],length=250)
y1 <- 1/predict(temp, data.frame(conc=x))
y2 <- predict(nls.out, data.frame(conc=x))
lines(x,y2, lwd=2, col="blue")
lines(x,y1, lwd=2, col="red", lty=2)

Non-linear regression estimation for just the treated cells.

nls.outB <- nls(vel ~ (Vm*conc)/(K+conc), data=puro, subset=(state=="treated"), start=c(Vm=160,K=0.048))
summary(nls.outB)$param
##        Estimate  Std. Error   t value     Pr(>|t|)
## Vm 212.68377342 6.947156655 30.614507 3.241166e-11
## K    0.06412133 0.008280954  7.743229 1.565132e-05

Fit both regressions in one model, using an indicator variable x for the treated cells.

puro$x <- ifelse(puro$state=="treated",1,0)
nls.outC <- nls(vel ~ ((Vm+dV*x)*conc)/(K+dK*x+conc), data=puro, start=c(Vm=160,K=0.048,dV=0,dK=0))
summary(nls.outC)$param
##        Estimate  Std. Error   t value     Pr(>|t|)
## Vm 160.28012878 6.896020420 23.242409 2.040955e-15
## K    0.04770831 0.008281174  5.761057 1.496420e-05
## dV  52.40364441 9.551022356  5.486705 2.712600e-05
## dK   0.01641302 0.011428988  1.436087 1.672387e-01

Fit with the K’s constrained to be equal.

nls.outD <- nls(vel ~ ((Vm+dV*x)*conc)/(K+conc), data=puro, start=c(Vm=160,K=0.048,dV=0))
summary(nls.outD)$param
##        Estimate  Std. Error   t value     Pr(>|t|)
## Vm 166.60407675 5.807428478 28.688098 1.006288e-17
## K    0.05797179 0.005910173  9.808816 4.372679e-09
## dV  42.02596380 6.272141186  6.700417 1.605939e-06
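
Whether the constraint on K is supported by the data can also be checked with a formal F test of the two nested fits. A minimal sketch (anova() accepts nested nls objects; output not shown):

anova(nls.outD, nls.outC)
# compares the constrained fit (common K) with the full model that allows separate K's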

Plot the data with the separate fits for the untreated and treated cells (dashed) and the joint fit with a common K (solid).

plot(vel ~ conc, data=puro, xlab="concentration", ylab="initial velocity", lwd=2,
     pch=as.numeric(puro$state)-1, col=ifelse(puro$state=="treated","blue","red"))
u <- par("usr")
x <- seq(u[1],u[2],len=250)
y1 <- predict(nls.out, data.frame(conc=x))
y2 <- predict(nls.outB, data.frame(conc=x))
y3 <- predict(nls.outD, data.frame(conc=x, x=rep(0,length(x))))
y4 <- predict(nls.outD, data.frame(conc=x, x=rep(1,length(x))))
lines(x, y1, lwd=2, col="red", lty=2)
lines(x, y2, lwd=2, col="blue", lty=2)
lines(x, y3, lwd=2, col="red")
lines(x, y4, lwd=2, col="blue")

Example from class: the mud snails

The data.

sediment
##    time pellets
## 1     6     119
## 2     6     111
## 3     6      87
## 4    12     348
## 5    12     210
## 6    12     272
## 7    18     438
## 8    18     778
## 9    18     548
## 10   24     774
## 11   24     896
## 12   24     670
## 13   42     896
## 14   42     729
## 15   42     886
## 16   78     763
## 17   78     954
## 18   78     954

Fit a cubic model. We need to wrap the squared and cubed terms in I() so that ^ is interpreted as arithmetic exponentiation inside the formula.

lm.out <- lm(pellets ~ time + I(time^2) + I(time^3), data=sediment)
summary(lm.out)
## 
## Call:
## lm(formula = pellets ~ time + I(time^2) + I(time^3), data = sediment)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -154.63  -83.57   28.61   54.98  196.21 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.389e+02  1.274e+02  -2.660 0.018668 *  
## time         7.573e+01  1.543e+01   4.907 0.000231 ***
## I(time^2)   -1.545e+00  4.804e-01  -3.215 0.006228 ** 
## I(time^3)    9.942e-03  3.945e-03   2.520 0.024479 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 112.2 on 14 degrees of freedom
## Multiple R-squared:  0.8961, Adjusted R-squared:  0.8738 
## F-statistic: 40.23 on 3 and 14 DF,  p-value: 3.925e-07
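
As an aside, the same cubic fit can be obtained without the I() terms by using poly() with raw=TRUE. A minimal sketch, not part of the original code:

lm(pellets ~ poly(time, 3, raw=TRUE), data=sediment)
# raw=TRUE reproduces the coefficients of time + I(time^2) + I(time^3);
# the default orthogonal polynomials give the same fitted values but different coefficients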

Fit a “linear spline” model.

f <- function(x,b0,b1,x0) ifelse(x<x0, b0+b1*x, b0+b1*x0)
nls.out <- nls(pellets ~ f(time,b0,b1,x0), data=sediment, start=c(b0=0,b1=50,x0=30))
summary(nls.out)$param
##      Estimate Std. Error   t value     Pr(>|t|)
## b0 -146.00000  71.292691 -2.047896 5.849712e-02
## b1   38.90556   4.338735  8.967027 2.055707e-07
## x0   25.95173   1.780619 14.574561 2.911998e-10
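
The cubic polynomial and the linear spline can be compared informally with AIC(), which works for both lm and nls objects. A minimal sketch, not part of the original handout:

AIC(lm.out, nls.out)
# both models are fit to the same 18 observations; the lower AIC indicates the preferred fit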

Plot the data and the fitted curves.

plot(pellets ~ time, data=sediment)
u <- par("usr") 
x <- seq(u[1],u[2],length=250)
y <- predict(lm.out, data.frame(time=x))
lines(x, y, col="blue", lwd=2)
y2 <- predict(nls.out, data.frame(time=x))
lines(x,y2,col="red",lwd=2)

Example from class: Michaelis-Menten equations (revisited)

A closer look at the residual sum of squares and the inference for the parameters Vmax and K.

We are using the data from the untreated cells first.

nls.out <- nls(vel ~ (Vm*conc)/(K+conc), data=puro, subset=(state=="untreated"), start = c(Vm=143,K=0.031))
summary(nls.out)$param
##        Estimate  Std. Error   t value     Pr(>|t|)
## Vm 160.28001125 6.480243120 24.733642 1.384612e-09
## K    0.04770813 0.007781868  6.130678 1.727056e-04

Visualize the residual sum of squares in a neighborhood of the parameter estimates, show the location of the least squares estimates, and show the confidence intervals from the confint function. Large RSS values are truncated in the image (shown in red).

dat <- subset(puro,state=="untreated")
f <- function(z,dat) sum((dat$vel-(z[2]*dat$conc)/(z[1]+dat$conc))^2)
hm <- 101
K <- seq(0.01,0.1,length=hm)
V <- seq(130,200,length=hm)
z <- expand.grid(K,V)
rss <- apply(z,1,f,dat=dat)
cut <- 5000
rss[rss>cut] <- cut
fit <- summary(nls.out)$param[,1]
cis <- suppressMessages(confint(nls.out))
par(las=1)
image(V,K,matrix(rss,ncol=hm,byrow=T),xlab="Vmax",col=rev(rainbow(256,start=0,end=2/3)))
text(fit[1],fit[2],"x",col="white")
lines(cis[1,],rep(0.012,2),col="white",lwd=3)
lines(rep(fit[1],2),c(0.011,0.013),col="white",lwd=3)
lines(rep(132,2),cis[2,],col="white",lwd=3)
lines(c(131,133),rep(fit[2],2),col="white",lwd=3)
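
If you prefer explicit contour lines on top of the color image, contour() accepts the same grid. A minimal sketch, using the truncated RSS values from above:

contour(V, K, matrix(rss, ncol=hm, byrow=TRUE), add=TRUE, col="white", drawlabels=FALSE)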

Notice that the confidence intervals are not symmetric! The confint function uses a more complicated method (profiling) to generate these intervals. The methods are described in more detail in the book Modern Applied Statistics with S by Venables and Ripley.

https://www.stats.ox.ac.uk/pub/MASS4/
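
If you want to see the profiles that confint uses, you can plot the profile object directly. A minimal sketch, one panel per parameter:

par(mfrow=c(1,2), las=1)
plot(profile(nls.out))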

confint(nls.out)
## Waiting for profiling to be done...
##           2.5%        97.5%
## Vm 145.6369544 176.54440158
## K    0.0313746   0.07006131

Below are the symmetric confidence intervals based on a normality assumption for the parameter estimates. Note that the p-values in the summary are derived under that same normality assumption!

fit <- summary(nls.out)$param
fit[1,1]+c(-1,1)*qt(0.975,9)*fit[1,2]
## [1] 145.6207 174.9393
fit[2,1]+c(-1,1)*qt(0.975,9)*fit[2,2]
## [1] 0.03010432 0.06531194

Revisiting our model using both the treated and untreated cells.

nls.outC <- nls(vel ~ ((Vm+dV*x)*conc)/(K+dK*x+conc), data=puro, start=c(Vm=160,K=0.048,dV=0,dK=0))
summary(nls.outC)$param
##        Estimate  Std. Error   t value     Pr(>|t|)
## Vm 160.28012878 6.896020420 23.242409 2.040955e-15
## K    0.04770831 0.008281174  5.761057 1.496420e-05
## dV  52.40364441 9.551022356  5.486705 2.712600e-05
## dK   0.01641302 0.011428988  1.436087 1.672387e-01
confint(nls.outC)
## Waiting for profiling to be done...
##            2.5%        97.5%
## Vm 145.85284480 176.28020955
## K    0.03158687   0.06965928
## dV  31.33677121  73.09295345
## dK  -0.01042896   0.04186662

The confidence interval for dK contains zero, so it is plausible that the Michaelis constant K is the same for the treated and untreated cells.

Non-linear standard curves

For some experiments the standard curves are not linear across the entire range of concentrations. This can happen if we reach the limit of detection at the lower concentrations or the measurement limits of our instrument at the higher concentrations. Below is one such example, where the log10 fluorescence intensity is not entirely linear in the log10 spike-in concentrations.

x <- c(-0.155, 0.204, 0.982, 1.453, 2.107, 2.716, 3.289)
y <- c( 1.557, 1.753, 2.343, 2.828, 3.421, 3.856, 4.084)
dat <- data.frame(x,y)
par(las=1, xaxs="i", yaxs="i")
plot(dat, xlab="log10 ( concentration )", ylab="log10 ( fluorescence intensity )", xlim=c(-1,4), ylim=c(1,5))

In this setting a generalized logistic function is often helpful. One particular example is the 4-parameter logistic (“4PL” or “Hill”) model, also known as an “S-curve”. The expected value of the log10 response is given by the following equation, where \(x\) is the log10 concentration:

\[f(x|\theta) = \theta_1 + (\theta_4-\theta_1) / (1+10^{\theta_3 (x-\theta_2)})\]

f.4 <- function(x,p1,p2,p3,p4) p1 + (p4-p1)/(1+(10^(p3*(x-p2))))

In these types of experiments the spiked-in concentrations often span multiple orders of magnitude, and fold changes are used (1, 10, 100, etc.). Thus, logarithmic transformations of the concentrations and the readings should be used. Vendor software packages sometimes do these calculations on the un-transformed concentrations and readings, however. Make sure you understand what you are looking at!

The parameters \(\theta_1\) and \(\theta_4\) are called the upper and lower asymptotes, respectively, since the curve approaches \(\theta_1\) as the log10 concentration approaches \(+ \infty\) and \(\theta_4\) as it approaches \(- \infty\).

The S-curve is symmetric around \(\theta_2\), which is the log10 concentration where you have reached half the overall log10 fluorescence intensity increase.

The parameter \(\theta_3\) controls the steepness (slope) of the S-curve at the log10 concentration \(\theta_2\).
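
For reference, differentiating the 4PL above shows how \(\theta_3\) enters the slope at the midpoint:

\[f'(\theta_2) = \frac{\ln(10) \, \theta_3 \, (\theta_1 - \theta_4)}{4}\]

so larger values of \(\theta_3\) give a steeper curve at \(\theta_2\).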

To illustrate what these S-curves look like, we plot some examples with \(\theta_1\) = 5 and \(\theta_4\) = 0 below.

In the first example we chose \(\theta_3\) = 1 and three different values for \(\theta_2\).

par(las=1)
curve(f.4(x,p1=5,p2=0,p3=1,p4=0), 
      from=-5,to=5, lwd=2,
      xlab="log10 ( concentration )", 
      ylab="log10 ( fluorescence intensity )")
curve(f.4(x,p1=5,p2=-1,p3=1,p4=0), col="red", add=TRUE, lwd=2)
curve(f.4(x,p1=5,p2=1,p3=1,p4=0), col="blue", add=TRUE, lwd=2)
axis(4,0,expression(theta[4]))
axis(4,5,expression(theta[1]))
legend("topleft",
       c(expression(paste(theta[2],"  =  -1")),
         expression(paste(theta[2],"  =   0")),
         expression(paste(theta[2],"  =   1"))),
       col=c("red","black","blue"), lty=1, lwd=2)

In the second example we chose \(\theta_2\) = 0 and three different values for \(\theta_3\). (The function below is re-defined in an algebraically equivalent form.)

f.4 <- function(x,p1,p2,p3,p4) p1 + (p4-p1)/(1+(10^(x-p2))^p3)
par(las=1,xaxs="i")
curve(f.4(x,p1=5,p2=0,p3=1,p4=0), 
      from=-5,to=5, lwd=2,
      xlab="log10 ( concentration ) ", 
      ylab="log10 ( fluorescence intensity )")
curve(f.4(x,p1=5,p2=0,p3=0.5,p4=0), col="red", add=TRUE, lwd=2)
curve(f.4(x,p1=5,p2=0,p3=2,p4=0), col="blue", add=TRUE, lwd=2)
axis(4,0,expression(theta[4]))
axis(4,5,expression(theta[1]))
legend("topleft",
       c(expression(paste(theta[3],"  =  1/2")),
         expression(paste(theta[3],"  =   1")),
         expression(paste(theta[3],"  =   2"))),
       col=c("red","black","blue"),
       lty=1, lwd=2)

For our observed data, let’s fit the 4 parameter S-curve using non-linear least squares.

nls.out <- nls(y ~ f.4(x,p1,p2,p3,p4), data=dat, start = c(p1=5,p2=1,p3=1,p4=1))
summary(nls.out)
## 
## Formula: y ~ f.4(x, p1, p2, p3, p4)
## 
## Parameters:
##    Estimate Std. Error t value Pr(>|t|)    
## p1  4.35541    0.03578  121.73 1.22e-06 ***
## p2  1.39761    0.02020   69.19 6.65e-06 ***
## p3  0.54817    0.02103   26.07 0.000124 ***
## p4  1.16722    0.04559   25.60 0.000131 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0123 on 3 degrees of freedom
## 
## Number of iterations to convergence: 5 
## Achieved convergence tolerance: 2.723e-07

The parameter estimates we obtained using non-linear least squares:

p <- summary(nls.out)$parameters[,1]
p
##        p1        p2        p3        p4 
## 4.3554127 1.3976129 0.5481687 1.1672242

Let’s plot the data and add the fitted curve.

par(las=1)
plot(dat, xlab="log10 ( concentration )", ylab="log10 ( fluorescence intensity )")
curve(f.4(x,p1=p[1],p2=p[2],p3=p[3],p4=p[4]), col="green3", add=TRUE, lwd=2)

You can see the S-shape better when plotting the curve over a wider range.

par(las=1)
plot(dat, xlab="log10 ( concentration )", ylab="log10 ( fluorescence intensity )", xlim=c(-4,7), ylim=c(1,5))
curve(f.4(x,p1=p[1],p2=p[2],p3=p[3],p4=p[4]), col="green3", add=TRUE, lwd=2)
abline(h=p[c(1,4)],lty="dotted")
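
As a quick model check, not part of the original handout, the residuals of the non-linear fit can be inspected directly:

par(las=1)
plot(dat$x, residuals(nls.out), xlab="log10 ( concentration )", ylab="residual", lwd=2)
abline(h=0, lty="dotted")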

Assume you get log10 fluorescence intensities for five more samples.

new.y <- c(3.796, 2.463, 3.506, 1.985, 3.238)

If you solve the above equation for \(x\) you can get the log10 concentration estimates for these five new measurements.

f.4.inv <- function(y,p1,p2,p3,p4) log10((p4-p1)/(y-p1)-1)/p3+p2
new.x <- round(f.4.inv(new.y,p1=p[1],p2=p[2],p3=p[3],p4=p[4]),3)
new.x
## [1] 2.624 1.098 2.200 0.554 1.886

Here is a plot.

par(las=1)
plot(dat, xlab="log10 ( concentration )", ylab="log10 ( fluorescence intensity )")
curve(f.4(x,p1=p[1],p2=p[2],p3=p[3],p4=p[4]), col="green3", add=TRUE, lwd=2)
for(k in 1:5) lines(c(-4,new.x[k]),rep(new.y[k],2),lty="dotted")
for(k in 1:5) lines(rep(new.x[k],2),c(new.y[k],1),lty="dotted")
points(new.x,new.y,pch=20)

Note that you can estimate the concentration for any log10 fluorescence intensity between \(\theta_4\) and \(\theta_1\). If a new intensity falls outside that range, the inverse function is undefined and you will end up with a missing value.
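
As a quick illustration, using the estimates p from above and a hypothetical intensity of 4.5, which lies above the estimated upper asymptote:

f.4.inv(4.5, p1=p[1], p2=p[2], p3=p[3], p4=p[4])
# returns NaN (with a warning), since 4.5 is outside the range (p4, p1)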

The S-curve defined above offers a lot of flexibility, but assumes that a symmetric S-shaped curve describes your observed data well, which is very often the case. There is an extension of the above model that uses a fifth parameter to allow for a departure from symmetry. This makes the model fitting more complicated and may or may not improve the fit. See generalized logistic functions.
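
For reference, a minimal sketch of one common 5PL parameterization: the asymmetry exponent (called p5 here) is not part of the class code, and p5 = 1 recovers the 4PL defined above.

f.5 <- function(x,p1,p2,p3,p4,p5) p1 + (p4-p1)/(1+10^(p3*(x-p2)))^p5
# could in principle be fit with nls(y ~ f.5(x,p1,p2,p3,p4,p5), data=dat,
# start=c(p1=5,p2=1,p3=1,p4=1,p5=1)), though with only 7 data points the fit may be unstable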