Statistics for Laboratory Scientists ( 140.615 )

Confidence Intervals

Working with the Normal distribution

Plot the standard Gaussian N(0,1), and look at some normal quantiles.

curve(dnorm,-3.5,3.5)
abline(h=0)
qnorm(0.025)
## [1] -1.959964
abline(v=qnorm(0.025),col="red")
qnorm(0.975)
## [1] 1.959964
abline(v=qnorm(0.975),col="red")
qnorm(0.005)
## [1] -2.575829
abline(v=qnorm(0.005),col="blue")
qnorm(0.995)
## [1] 2.575829
abline(v=qnorm(0.995),col="blue")

Confidence interval for the population mean, population SD known

Take ten random samples from a Normal distribution with population mean 5 and population standard deviation 2. Calculate a 95% confidence interval for the population mean from the sample, under the assumption that the standard deviation is known. Repeat once more, and compare the two confidence intervals.

First confidence interval:

set.seed(1)
x <- rnorm(10,mean=5,sd=2)
x
##  [1] 3.747092 5.367287 3.328743 8.190562 5.659016 3.359063 5.974858 6.476649 6.151563 4.389223
mean(x)
## [1] 5.264406
qnorm(0.975)
## [1] 1.959964
mean(x) - qnorm(0.975) * 2/sqrt(10)
## [1] 4.024815
mean(x) + qnorm(0.975) * 2/sqrt(10)
## [1] 6.503996
ci1 <- c(mean(x) - qnorm(0.975) * 2/sqrt(10), mean(x) + qnorm(0.975) * 2/sqrt(10))
ci1
## [1] 4.024815 6.503996

Second confidence interval:

y <- rnorm(10,mean=5,sd=2)
y
##  [1] 8.0235623 5.7796865 3.7575188 0.5706002 7.2498618 4.9101328 4.9676195 6.8876724 6.6424424 6.1878026
mean(y)
## [1] 5.49769
ci2 <- c(mean(y) - qnorm(0.975) * 2/sqrt(10), mean(y) + qnorm(0.975) * 2/sqrt(10))
ci2
## [1] 4.25810 6.73728

Compare:

ci1
## [1] 4.024815 6.503996
ci2
## [1] 4.25810 6.73728
mean(ci1)
## [1] 5.264406
mean(ci2)
## [1] 5.49769
diff(ci1)
## [1] 2.47918
diff(ci2)
## [1] 2.47918
2*qnorm(0.975)*2/sqrt(10)
## [1] 2.47918

The intervals had a 95% a priori probability of covering the population mean 5 (in this simulation, both did). Both have the same length, as the only random quantity in the confidence interval is the sample mean.

Working with the t distribution

Some t quantiles.

qt(0.975,9)
## [1] 2.262157
qt(0.975,19)
## [1] 2.093024
qt(0.975,29)
## [1] 2.04523
qt(0.975,2:50)
##  [1] 4.302653 3.182446 2.776445 2.570582 2.446912 2.364624 2.306004 2.262157 2.228139 2.200985 2.178813
## [12] 2.160369 2.144787 2.131450 2.119905 2.109816 2.100922 2.093024 2.085963 2.079614 2.073873 2.068658
## [23] 2.063899 2.059539 2.055529 2.051831 2.048407 2.045230 2.042272 2.039513 2.036933 2.034515 2.032245
## [34] 2.030108 2.028094 2.026192 2.024394 2.022691 2.021075 2.019541 2.018082 2.016692 2.015368 2.014103
## [45] 2.012896 2.011741 2.010635 2.009575 2.008559

A t distribution with nine degrees of freedom.

curve(dt(x,9),-3.5,3.5)
abline(h=0)
curve(dnorm,-3.5,3.5,add=TRUE,lty="dotted")
qt(0.025,9)
## [1] -2.262157
abline(v=qt(0.025,9),col="red")
qt(0.975,9)
## [1] 2.262157
abline(v=qt(0.975,9),col="red")

Confidence interval for the population mean, population SD unknown

First example from class
x <- c(0.2, 1.3, 1.4, 2.3, 4.2, 4.7, 4.7, 5.1, 5.9, 7.0)
x
##  [1] 0.2 1.3 1.4 2.3 4.2 4.7 4.7 5.1 5.9 7.0
stripchart(x,method="stack",pch=1,ylim=c(0,2))
abline(h=1,lty=2,col="gray40")
segments(mean(x),0.9,mean(x),1.1, col="blue",lwd=2)

Calculating the 95% confidence interval.

length(x)
## [1] 10
mean(x)
## [1] 3.68
sd(x)
## [1] 2.240932
sd(x)/sqrt(length(x))
## [1] 0.708645
qt(0.975,length(x)-1)
## [1] 2.262157
mean(x) - qt(0.975,length(x)-1) * sd(x)/sqrt(length(x))
## [1] 2.076934
mean(x) + qt(0.975,length(x)-1) * sd(x)/sqrt(length(x))
## [1] 5.283066
mean(x) + c(-1,1) * qt(0.975,length(x)-1) * sd(x)/sqrt(length(x))
## [1] 2.076934 5.283066
round(mean(x) + c(-1,1) * qt(0.975,length(x)-1) * sd(x)/sqrt(length(x)), 2)
## [1] 2.08 5.28

Using the function t.test().

t.test(x)
## 
##  One Sample t-test
## 
## data:  x
## t = 5.193, df = 9, p-value = 0.0005694
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  2.076934 5.283066
## sample estimates:
## mean of x 
##      3.68
attributes(t.test(x))
## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"    "null.value"  "stderr"     
##  [8] "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"
t.test(x)$conf.int
## [1] 2.076934 5.283066
## attr(,"conf.level")
## [1] 0.95
round(t.test(x)$conf.int,2)
## [1] 2.08 5.28
## attr(,"conf.level")
## [1] 0.95

Plotting the confidence interval.

ci <- t.test(x)$conf.int
stripchart(x,method="stack",pch=1,ylim=c(0,2))
segments(mean(x),0.9,mean(x),1.1, col="blue",lwd=3)
segments(ci[1],1,ci[2],1, col="blue",lwd=3)

Calculating a 99% confidence interval.

qt(0.975,length(x)-1)
## [1] 2.262157
qt(0.995,length(x)-1)
## [1] 3.249836
mean(x) + c(-1,1) * qt(0.995,length(x)-1) * sd(x)/sqrt(length(x))
## [1] 1.37702 5.98298
t.test(x,conf.level=0.99)
## 
##  One Sample t-test
## 
## data:  x
## t = 5.193, df = 9, p-value = 0.0005694
## alternative hypothesis: true mean is not equal to 0
## 99 percent confidence interval:
##  1.37702 5.98298
## sample estimates:
## mean of x 
##      3.68
t.test(x,conf.level=0.99)$conf.int
## [1] 1.37702 5.98298
## attr(,"conf.level")
## [1] 0.99
Second example from class
x <- c(1.17, 6.35, 7.76)
x
## [1] 1.17 6.35 7.76
stripchart(x,pch=1,xlim=c(-5,15),ylim=c(0,2))
abline(h=1,lty=2,col="gray40")
segments(mean(x),0.9,mean(x),1.1, col="blue",lwd=2)

Calculating the 95% confidence interval.

length(x)
## [1] 3
mean(x)
## [1] 5.093333
sd(x)
## [1] 3.470077
sd(x)/sqrt(length(x))
## [1] 2.00345
qt(0.975,length(x)-1)
## [1] 4.302653
mean(x) + c(-1,1) * qt(0.975,length(x)-1) * sd(x)/sqrt(length(x))
## [1] -3.526815 13.713482
t.test(x)$conf.int
## [1] -3.526815 13.713482
## attr(,"conf.level")
## [1] 0.95

Plotting the confidence interval.

ci <- t.test(x)$conf.int
stripchart(x,pch=1,xlim=c(-5,15),ylim=c(0,2))
segments(mean(x),0.9,mean(x),1.1, col="blue",lwd=3)
segments(ci[1],1,ci[2],1, col="blue",lwd=3)

Third example from class
x <- c(34.9, 28.5, 34.3, 38.4, 29.6,
       29.6, 38.7, 22.4, 30.1, 23.1,
       29.6, 33.4, 20.6, 33.6, 42.4,
       28.2, 25.3, 22.1, 37.3, 32.1)
x
##  [1] 34.9 28.5 34.3 38.4 29.6 29.6 38.7 22.4 30.1 23.1 29.6 33.4 20.6 33.6 42.4 28.2 25.3 22.1 37.3 32.1
stripchart(x,method="stack",pch=1,ylim=c(0,2))
abline(h=1,lty=2,col="gray40")
segments(mean(x),0.9,mean(x),1.1, col="blue",lwd=2)

Calculating the 95% confidence interval.

length(x)
## [1] 20
mean(x)
## [1] 30.71
sd(x)
## [1] 6.064817
sd(x)/sqrt(length(x))
## [1] 1.356134
qt(0.975,length(x)-1)
## [1] 2.093024
mean(x) + c(-1,1) * qt(0.975,length(x)-1) * sd(x)/sqrt(length(x))
## [1] 27.87158 33.54842
t.test(x)$conf.int
## [1] 27.87158 33.54842
## attr(,"conf.level")
## [1] 0.95

Plotting the confidence interval.

ci <- t.test(x)$conf.int
stripchart(x,method="stack",pch=1,ylim=c(0,2))
segments(mean(x),0.9,mean(x),1.1, col="blue",lwd=3)
segments(ci[1],1,ci[2],1, col="blue",lwd=3)

Confidence intervals for the difference in means

Example from class
x <- c(59.2, 54.6, 58.1, 41.8, 64.7, 50.8, 51.6, 47.0, 66.3, 58.1)
y <- c(73.3, 91.4, 69.1, 104.2, 64.6, 78.4, 56.4, 55.6, 61.1, 39.8, 43.9, 63.6, 78.6, 76.5, 45.1, 90.2)
list(x,y)
## [[1]]
##  [1] 59.2 54.6 58.1 41.8 64.7 50.8 51.6 47.0 66.3 58.1
## 
## [[2]]
##  [1]  73.3  91.4  69.1 104.2  64.6  78.4  56.4  55.6  61.1  39.8  43.9  63.6  78.6  76.5  45.1  90.2
par(las=1)
stripchart(list(x,y),method="stack",pch=1,group.names=c("A","B"),ylim=c(0,3))

stripchart(list(x,y),method="jitter",jitter=0.05,pch=1,group.names=c("A","B"),ylim=c(0,3))

Calculating the confidence interval.

mean(x)
## [1] 55.22
mean(y)
## [1] 68.2375
mean(x) - mean(y)
## [1] -13.0175
sd(x)
## [1] 7.643123
sd(y)
## [1] 18.14629
var(x)
## [1] 58.41733
var(y)
## [1] 329.2878
nx <- length(x)
nx
## [1] 10
ny <- length(y)
ny
## [1] 16
spooled <- sqrt( (var(x)*(nx-1)+var(y)*(ny-1)) / (nx+ny-2))
spooled
## [1] 15.09011
spooled * sqrt(1/nx+1/ny)
## [1] 6.083017
qt(0.975,nx+ny-2) 
## [1] 2.063899
mean(x)-mean(y) - qt(0.975,nx+ny-2) * spooled * sqrt(1/nx+1/ny) 
## [1] -25.57223
mean(x)-mean(y) + qt(0.975,nx+ny-2) * spooled * sqrt(1/nx+1/ny) 
## [1] -0.4627689
mean(x)-mean(y) + c(-1,1) * qt(0.975,nx+ny-2) * spooled * sqrt(1/nx+1/ny) 
## [1] -25.5722311  -0.4627689

95% CI for the population mean of the first population using the function t.test().

t.test(x)$conf
## [1] 49.75244 60.68756
## attr(,"conf.level")
## [1] 0.95

95% CI for the population mean of the second population.

t.test(y)$conf
## [1] 58.56802 77.90698
## attr(,"conf.level")
## [1] 0.95

95% CI for difference in population means assuming the population SDs are equal.

t.test(x,y,var.equal=TRUE)$conf
## [1] -25.5722311  -0.4627689
## attr(,"conf.level")
## [1] 0.95

95% CI for difference in population means not assuming the population SDs are equal.

t.test(x,y)$conf
## [1] -23.683495  -2.351505
## attr(,"conf.level")
## [1] 0.95

Confidence intervals for the population standard deviations

Example from class

First population.

x
##  [1] 59.2 54.6 58.1 41.8 64.7 50.8 51.6 47.0 66.3 58.1
length(x)
## [1] 10
sd(x)
## [1] 7.643123

The chi-square distribution.

curve(dchisq(x,df=nx-1),from=0,to=30)
abline(h=0)
qchisq(0.025,nx-1)
## [1] 2.700389
abline(v=qchisq(0.025,nx-1),lty="dotted",col="blue")
qchisq(0.975,nx-1)
## [1] 19.02277
abline(v=qchisq(0.975,nx-1),lty="dotted",col="blue")

95% CI for the population standard deviation of the first population.

L <- qchisq(0.025,nx-1)
U <- qchisq(0.975,nx-1)
sd(x) * sqrt((nx-1)/U)
## [1] 5.257209
sd(x) * sqrt((nx-1)/L)
## [1] 13.95336
sd(x) * sqrt((nx-1)/c(U,L))
## [1]  5.257209 13.953364

95% CI for the population variance of the first population.

var(x)
## [1] 58.41733
( sd(x)*sqrt((nx-1)/c(U,L)) )^2
## [1]  27.63825 194.69636
var(x) * (nx-1) / c(U,L)
## [1]  27.63825 194.69636

We can write a function to make this more convenient!

sdci <- function(x,conf.level=0.95){
  nx <- length(x)
  alpha <- 1-conf.level
  ci <- sd(x)*sqrt((nx-1)/qchisq(c(1-alpha/2,alpha/2),nx-1))
  return(ci)
}

95% CI for the population standard deviation of the first population.

sdci(x)
## [1]  5.257209 13.953364

99% CI for the population standard deviation of the first population.

sdci(x,conf.level=0.99)
## [1]  4.721001 17.408076

95% CI for the population variance of the first population.

sdci(x)^2
## [1]  27.63825 194.69636

95% CI for the population standard deviation of the second population.

sdci(y)
## [1] 13.40475 28.08485