Statistics for Laboratory Scientists ( 140.615 )

Regression and Correlation

Concordance versus correlation

Simulate some highly correlated bivariate data, and plot the data.

set.seed(1)
m <- rpois(1000,50)
eps1 <- rnorm(1000)
eps2 <- rnorm(1000)
x1 <- m + eps1
x2 <- m + eps2
par(pty="s")
plot(x1, x2, xlim=c(10,90), ylim=c(10,90))
abline(c(0,1), col="blue")

The Pearson correlation and the concordance.

cor.con <- function(x,y) 2*cov(x,y)/(var(x)+var(y)+(mean(x)-mean(y))^2)
cor(x1,x2)
## [1] 0.9789568
cor.con(x1,x2)
## [1] 0.9788988

A linear transformation of x2.

x3 <- -10 + 1.25*x2
par(pty="s")
plot(x1, x3, xlim=c(10,90), ylim=c(10,90))
abline(c(0,1), col="blue")

cor(x1,x3)
## [1] 0.9789568
cor.con(x1,x3)
## [1] 0.9110233

Another linear transformation of x2.

x4 <- -40 + 1.75*x2
par(pty="s")
plot(x1, x4, xlim=c(10,90), ylim=c(10,90))
abline(c(0,1), col="blue")

cor(x1,x4)
## [1] 0.9789568
cor.con(x1,x4)
## [1] 0.8188843

Induced correlation

Generate some hypothetical, independent log2 measurements, and plot the data. Note the difference of logs is the same as the log ratio.

set.seed(1)
x <- rnorm(1000)
y <- rnorm(1000)
r1 <- max(abs(c(x,y)))
r1 <- c(-r1,r1)
par(las=1, pty="s")
plot(x, y , xlab="log2(X)", ylab="log2(Y)", xlim=r1, ylim=r1, pch=1, cex=0.7)

Simulate and independent “reference standard”, and plot the log2 measurements relative to the standard.

z <- rnorm(1000,sd=2)
r2 <- max(abs(c(x-z,y-z)))
r2 <- c(-r2,r2)
par(las=1, pty="s")
plot(x-z, y-z , xlab="log2(X/Z)", ylab="log2(Y/Z)", xlim=r2, ylim=r2, pch=1, cex=0.7)

The respective correlations.

cor(x,y)
## [1] 0.006401211
cor(x-z,y-z)
## [1] 0.7934214

Another example.

z <- rnorm(1000,sd=1)
r3 <- max(abs(c(x-z,y-z)))
r3 <- c(-r3,r3)
par(las=1, pty="s")
plot(x-z, y-z , xlab="log2(X/Z)", ylab="log2(Y/Z)", xlim=r3, ylim=r3, pch=1, cex=0.7)

cor(x-z,y-z)
## [1] 0.4939203

Assume that some measurements in the reference standard failed, so the intensities are near zero (background), and therefore the log2 intensities are super small.

z[1:20] <- log2(runif(20,0,1e-3))
z[1:20]
##  [1] -12.92858 -12.93505 -10.32844 -11.21681 -10.44795 -12.67132 -13.46826 -12.25190 -15.52666 -10.71385
## [11] -10.43458 -10.85769 -13.63112 -13.34472 -10.04536 -11.33832 -10.01398 -10.43432 -10.09490 -10.23836
r4 <- max(abs(c(x-z,y-z)))
r4 <- c(-r4,r4)
par(las=1, pty="s")
plot(x-z, y-z, xlab="log2(X/Z)", ylab="log2(Y/Z)", xlim=r4, ylim=r4, pch=1, cex=0.7)
points((x-z)[1:20], (y-z)[1:20], pch=21, bg="red", cex=0.7)

cor(x-z,y-z)
## [1] 0.7750309

MA plots

To show how useful MA plots are, we simulate some (log2) gene expression data from two technical replicates, with and without artifacts.

n <- 1000
set.seed(1)
x <- runif(n,6,14)

No artifacts.

x1 <- x + rnorm(n,sd=1/x^2)
x2 <- x + rnorm(n,sd=1/x^2)
r <- max(abs(x1-x2))
r <- c(-r,r)
par(mfrow=c(1,2), las=1)
par(pty="s")
plot(x1, x2, xlab="log2(R1)", ylab="log2(R2)")
abline(c(0,1), col="red", lwd=2)
par(pty="m")
plot((x1+x2)/2, x1-x2, xlab="( log2(R1) + log2(R2) ) / 2", ylab="log2( R1 / R2 )", ylim=r)
abline(h=0, col="red", lwd=2)

Replicate 1 has on average a 5% higher abundance.

x1b <- x1 + log2(1.05)
x2b <- x2
r <- max(abs(x1b-x2b))
r <- c(-r,r)
par(mfrow=c(1,2), las=1)
par(pty="s")
plot(x1b, x2b, xlab="log2(R1)", ylab="log2(R2)")
abline(c(0,1), col="red", lwd=2)
par(pty="m")
plot((x1b+x2b)/2, x1b-x2b, xlab="( log2(R1) + log2(R2) ) / 2", ylab="log2( R1 / R2 )", ylim=r)
abline(h=0, col="red", lwd=2)

Replicate 1 has on average a 1% higher abundance.

x1b <- x1 + log2(1.01)
x2b <- x2
r <- max(abs(x1b-x2b))
r <- c(-r,r)
par(mfrow=c(1,2), las=1)
par(pty="s")
plot(x1b, x2b, xlab="log2(R1)", ylab="log2(R2)")
abline(c(0,1), col="red", lwd=2)
par(pty="m")
plot((x1b+x2b)/2, x1b-x2b, xlab="( log2(R1) + log2(R2) ) / 2", ylab="log2( R1 / R2 )", ylim=r)
abline(h=0,col="red",lwd=2)

Assume there is a bias depending on the gene abundance.

m <- mean(c(x1,x2))
x1b <- x1 - 0.01*(x1-m)
x2b <- x2
r <- max(abs(x1b-x2b))
r <- c(-r,r)
par(mfrow=c(1,2), las=1)
par(pty="s")
plot(x1b, x2b, xlab="log2(R1)", ylab="log2(R2)")
abline(c(0,1), col="red", lwd=2)
par(pty="m")
plot((x1b+x2b)/2, x1b-x2b, xlab="( log2(R1) + log2(R2) ) / 2",ylab="log2( R1 / R2 )", ylim=r)
abline(h=0, col="red", lwd=2)

Something like this we often observe in real data.

m <- mean(c(x1,x2))
x1b <- x1 - (x1-m)/(x1^2)
x2b <- x2
r <- max(abs(x1b-x2b))
r <- c(-r,r)
par(mfrow=c(1,2), las=1)
par(pty="s")
plot(x1b, x2b, xlab="log2(R1)", ylab="log2(R2)")
abline(c(0,1), col="red", lwd=2)
par(pty="m")
plot((x1b+x2b)/2, x1b-x2b, xlab="( log2(R1) + log2(R2) ) / 2", ylab="log2( R1 / R2 )", ylim=r)
abline(h=0, col="red", lwd=2)