```{r setup, include=FALSE} knitr::opts_chunk$set(echo=TRUE, fig.align="center") options(width=110) ``` #### Statistics for Laboratory Scientists ( 140.615 ) ## Principal Components Analysis #### Example 1: father and daughter heights Plot the data using identical axis. ```{r,fig.height=6} library(SPH.140.615) r <- range(pear) par(pty="s",las=1) plot(pear,xlim=r,ylim=r) abline(c(0,1),lty="dotted") ``` We will center and scale the data using the sample means and the sample standard deviations. ```{r} apply(pear,2,mean) apply(pear,2,sd) ``` We will first center our random variables, i.e. subtract the sample means. The sample standard deviations are unaffected by this. ```{r} m <- apply(pear,2,mean) pear.c <- sweep(pear,2,m,FUN="-") apply(pear.c,2,mean) apply(pear.c,2,sd) ``` Plot the centered data. ```{r,fig.height=6} r <- range(pear.c) par(pty="s",las=1) plot(pear.c,xlim=r,ylim=r) abline(c(0,1),lty="dotted") points(0,0,pch=21,bg="red") ``` We will now scale our random variables, i.e. divide by the sample standard deviations. ```{r} s <- apply(pear.c,2,sd) pear.s <- sweep(pear.c,2,s,FUN="/") apply(pear.s,2,mean) apply(pear.s,2,sd) ``` Plot the centered and scaled data. ```{r,fig.height=6} r <- range(pear.s) par(pty="s",las=1) plot(pear.s,xlim=r,ylim=r) abline(c(0,1),lty="dotted") points(0,0,pch=21,bg="red") ``` Calculate the principal components. ```{r} pear.pca <- prcomp(pear,center=TRUE,scale=TRUE) int <- pear.pca$center int sca <- pear.pca$scale sca rot <- pear.pca$rotation rot ``` Plot the rotated data. ```{r} head(pear.pca$x) par(pty="m",las=1) plot(pear.pca$x,asp=1) points(0,0,pch=21,bg="red") axis(1,-4:4) head(predict(pear.pca)) ``` Plot the original data and add the axis of variation to the scatter plot. ```{r,fig.height=6} int rot b1 <- rot[2,1]/rot[1,1] a1 <- int[2]-b1*int[1] b2 <- rot[2,2]/rot[1,2] a2 <- int[2]-b2*int[1] r <- range(pear) par(pty="s",las=1) plot(pear,xlim=r,ylim=r) abline(a1,b1,lwd=2,col="red") abline(a2,b2,lwd=2,col="blue") b1 b2 ``` The variance contributions. ```{r} summary(pear.pca) ``` #### Example 2: a multivariate Normal distribution in 10 dimensions A function to simulate multivariate normal data. ```{r} myrmvn <- function(mu,sigma,hm=1,...){ n=length(mu) if(sum((dim(sigma)-rep(n,2))^2)!=0) stop("Check the dimensions of mu and sigma!") if(det(sigma)==0) stop("The covariance matrix is singular!") a=t(chol(sigma)) z=matrix(rnorm(n*hm),nrow=n) y=t(a%*%z+mu) return(y) } ``` Generate a mean vector of zeros, and a variance-covariance matrix with some strong correlations. ```{r} mu <- rep(0,10) mu sigma <- diag(0.2,10)+0.8 sigma[1:8,9:10] <- 0.2 sigma[9:10,1:8] <- 0.2 sigma ``` Simulate data from a multivariate normal distribution with the mean vector and the variance-covariance matrix, and plot the data in the ten dimensions. ```{r,fig.width=8,fig.height=8} set.seed(1) n <- 100 dat <- myrmvn(mu,sigma,hm=n) pairs(dat) ``` The principal components analysis. ```{r} dat.pca <- prcomp(dat,center=TRUE,scale=TRUE) ``` The variance contributions. ```{r} summary(dat.pca) par(las=1) plot(dat.pca) ``` #### Example 3: two populations with different genetic background A function to simulate some single nucleotide polymorphisms (SNPs). ```{r} my.snp.pc.data=function(n,maf){ hm=length(maf) z=matrix(ncol=hm,nrow=n) for(j in 1:hm){ p=1-maf[j] mp=c(p^2,2*p*(1-p),(1-p)^2) z[,j]=apply(rmultinom(n,1,mp),2,order)[3,]-1 } return(z) } ``` We have two populations, and we measure the genotype at 50 loci with differing minor allele frequencies (MAFs). These type of loci are often called ancestry informative markers. Here are the minor allele frequencies for the 50 SNPs. ```{r} ns <- 50 set.seed(1) maf1 <- runif(ns,0.1,0.5) maf2 <- runif(ns,0.1,0.5) maf1 maf2 ``` Randomly sample 100 subjects from each population. ```{r} n1 <- n2 <- 100 z1 <- my.snp.pc.data(n1,maf1) z2 <- my.snp.pc.data(n2,maf2) z <- data.frame(rbind(z1,z2)) head(z,10) ``` The principal components analysis. ```{r} snp.pca <- prcomp(z,center=TRUE,scale=TRUE) ``` The principal components. ```{r} snp.p <- predict(snp.pca) dim(snp.p) head(snp.p[,1:5],10) ``` Plot the first two principal components. ```{r,fig.height=6} par(las=1) plot(snp.p[,1:2],pch=21,bg=rep(c("blue","red"),each=100)) ``` Simulate a new set of 50 samples from the first population. ```{r} n3 <- 50 z3 <- my.snp.pc.data(n3,maf1) z3 <- data.frame(z3) head(z3) ``` Add these 50 samples to the first two principal components plot. ```{r,fig.height=6} p3 <- predict(snp.pca,z3) par(las=1) plot(snp.p[,1:2],pch=21,bg=rep(c("blue","red"),each=100)) points(p3[,1:2],pch=21,bg="green3",cex=1.5) ```