```{r setup, include=FALSE} knitr::opts_chunk$set(echo=TRUE, fig.align="center") ``` #### Statistics for Laboratory Scientists ( 140.615 ) ## Regression and Correlation ### Concordance versus correlation Simulate some highly correlated bivariate data, and plot the data. ```{r} set.seed(1) m <- rpois(1000,50) eps1 <- rnorm(1000) eps2 <- rnorm(1000) x1 <- m + eps1 x2 <- m + eps2 par(pty="s") plot(x1, x2, xlim=c(10,90), ylim=c(10,90)) abline(c(0,1), col="blue") ``` The Pearson correlation and the concordance. ```{r} cor.con <- function(x,y) 2*cov(x,y)/(var(x)+var(y)+(mean(x)-mean(y))^2) cor(x1,x2) cor.con(x1,x2) ``` A linear transformation of x2. ```{r} x3 <- -10 + 1.25*x2 par(pty="s") plot(x1, x3, xlim=c(10,90), ylim=c(10,90)) abline(c(0,1), col="blue") cor(x1,x3) cor.con(x1,x3) ``` Another linear transformation of x2. ```{r} x4 <- -40 + 1.75*x2 par(pty="s") plot(x1, x4, xlim=c(10,90), ylim=c(10,90)) abline(c(0,1), col="blue") cor(x1,x4) cor.con(x1,x4) ``` ### Induced correlation Generate some hypothetical, independent log2 measurements, and plot the data. Note the difference of logs is the same as the log ratio. ```{r} set.seed(1) x <- rnorm(1000) y <- rnorm(1000) r1 <- max(abs(c(x,y))) r1 <- c(-r1,r1) par(las=1, pty="s") plot(x, y , xlab="log2(X)", ylab="log2(Y)", xlim=r1, ylim=r1, pch=1, cex=0.7) ``` Simulate and independent "reference standard", and plot the log2 measurements relative to the standard. ```{r} z <- rnorm(1000,sd=2) r2 <- max(abs(c(x-z,y-z))) r2 <- c(-r2,r2) par(las=1, pty="s") plot(x-z, y-z , xlab="log2(X/Z)", ylab="log2(Y/Z)", xlim=r2, ylim=r2, pch=1, cex=0.7) ``` The respective correlations. ```{r} cor(x,y) cor(x-z,y-z) ``` Another example. ```{r} z <- rnorm(1000,sd=1) r3 <- max(abs(c(x-z,y-z))) r3 <- c(-r3,r3) par(las=1, pty="s") plot(x-z, y-z , xlab="log2(X/Z)", ylab="log2(Y/Z)", xlim=r3, ylim=r3, pch=1, cex=0.7) cor(x-z,y-z) ``` Assume that some measurements in the reference standard failed, so the intensities are near zero (background), and therefore the log2 intensities are super small. ```{r} z[1:20] <- log2(runif(20,0,1e-3)) z[1:20] r4 <- max(abs(c(x-z,y-z))) r4 <- c(-r4,r4) par(las=1, pty="s") plot(x-z, y-z, xlab="log2(X/Z)", ylab="log2(Y/Z)", xlim=r4, ylim=r4, pch=1, cex=0.7) points((x-z)[1:20], (y-z)[1:20], pch=21, bg="red", cex=0.7) cor(x-z,y-z) ``` ### MA plots To show how useful MA plots are, we simulate some (log2) gene expression data from two technical replicates, with and without artifacts. ```{r} n <- 1000 set.seed(1) x <- runif(n,6,14) ``` No artifacts. ```{r} x1 <- x + rnorm(n,sd=1/x^2) x2 <- x + rnorm(n,sd=1/x^2) r <- max(abs(x1-x2)) r <- c(-r,r) par(mfrow=c(1,2), las=1) par(pty="s") plot(x1, x2, xlab="log2(R1)", ylab="log2(R2)") abline(c(0,1), col="red", lwd=2) par(pty="m") plot((x1+x2)/2, x1-x2, xlab="( log2(R1) + log2(R2) ) / 2", ylab="log2( R1 / R2 )", ylim=r) abline(h=0, col="red", lwd=2) ``` Replicate 1 has on average a 5% higher abundance. ```{r} x1b <- x1 + log2(1.05) x2b <- x2 r <- max(abs(x1b-x2b)) r <- c(-r,r) par(mfrow=c(1,2), las=1) par(pty="s") plot(x1b, x2b, xlab="log2(R1)", ylab="log2(R2)") abline(c(0,1), col="red", lwd=2) par(pty="m") plot((x1b+x2b)/2, x1b-x2b, xlab="( log2(R1) + log2(R2) ) / 2", ylab="log2( R1 / R2 )", ylim=r) abline(h=0, col="red", lwd=2) ``` Replicate 1 has on average a 1% higher abundance. ```{r} x1b <- x1 + log2(1.01) x2b <- x2 r <- max(abs(x1b-x2b)) r <- c(-r,r) par(mfrow=c(1,2), las=1) par(pty="s") plot(x1b, x2b, xlab="log2(R1)", ylab="log2(R2)") abline(c(0,1), col="red", lwd=2) par(pty="m") plot((x1b+x2b)/2, x1b-x2b, xlab="( log2(R1) + log2(R2) ) / 2", ylab="log2( R1 / R2 )", ylim=r) abline(h=0,col="red",lwd=2) ``` Assume there is a bias depending on the gene abundance. ```{r} m <- mean(c(x1,x2)) x1b <- x1 - 0.01*(x1-m) x2b <- x2 r <- max(abs(x1b-x2b)) r <- c(-r,r) par(mfrow=c(1,2), las=1) par(pty="s") plot(x1b, x2b, xlab="log2(R1)", ylab="log2(R2)") abline(c(0,1), col="red", lwd=2) par(pty="m") plot((x1b+x2b)/2, x1b-x2b, xlab="( log2(R1) + log2(R2) ) / 2",ylab="log2( R1 / R2 )", ylim=r) abline(h=0, col="red", lwd=2) ``` Something like this we often observe in real data. ```{r} m <- mean(c(x1,x2)) x1b <- x1 - (x1-m)/(x1^2) x2b <- x2 r <- max(abs(x1b-x2b)) r <- c(-r,r) par(mfrow=c(1,2), las=1) par(pty="s") plot(x1b, x2b, xlab="log2(R1)", ylab="log2(R2)") abline(c(0,1), col="red", lwd=2) par(pty="m") plot((x1b+x2b)/2, x1b-x2b, xlab="( log2(R1) + log2(R2) ) / 2", ylab="log2( R1 / R2 )", ylim=r) abline(h=0, col="red", lwd=2) ```