Statistics for Laboratory Scientists ( 140.615 )

Principal Components Analysis

Example 1: father and daughter heights

Plot the data using identical axis.

library(SPH.140.615)
r <- range(pear)
par(pty="s",las=1)
plot(pear,xlim=r,ylim=r)
abline(c(0,1),lty="dotted")

We will center and scale the data using the sample means and the sample standard deviations.

apply(pear,2,mean)
##   father daughter 
## 67.67871 63.83823
apply(pear,2,sd)
##   father daughter 
## 2.770190 2.656137

We will first center our random variables, i.e. subtract the sample means. The sample standard deviations are unaffected by this.

m <- apply(pear,2,mean)
pear.c <- sweep(pear,2,m,FUN="-")
apply(pear.c,2,mean)
##        father      daughter 
## -1.045030e-15  1.322069e-14
apply(pear.c,2,sd)
##   father daughter 
## 2.770190 2.656137

Plot the centered data.

r <- range(pear.c)
par(pty="s",las=1)
plot(pear.c,xlim=r,ylim=r)
abline(c(0,1),lty="dotted")
points(0,0,pch=21,bg="red")

We will now scale our random variables, i.e. divide by the sample standard deviations.

s <- apply(pear.c,2,sd)
pear.s <- sweep(pear.c,2,s,FUN="/")
apply(pear.s,2,mean)
##        father      daughter 
## -1.680019e-15  1.493637e-15
apply(pear.s,2,sd)
##   father daughter 
##        1        1

Plot the centered and scaled data.

r <- range(pear.s)
par(pty="s",las=1)
plot(pear.s,xlim=r,ylim=r)
abline(c(0,1),lty="dotted")
points(0,0,pch=21,bg="red")

Calculate the principal components.

pear.pca <- prcomp(pear,center=TRUE,scale=TRUE)
int <- pear.pca$center 
int
##   father daughter 
## 67.67871 63.83823
sca <- pear.pca$scale
sca
##   father daughter 
## 2.770190 2.656137
rot <- pear.pca$rotation
rot
##                PC1        PC2
## father   0.7071068 -0.7071068
## daughter 0.7071068  0.7071068

Plot the rotated data.

head(pear.pca$x)
##            PC1         PC2
## [1,] -4.032912 -1.95068591
## [2,] -3.584728 -1.70670702
## [3,] -2.696034 -1.58377961
## [4,] -4.299761  0.23291997
## [5,] -4.173229  0.05314493
## [6,] -3.381936 -0.73814768
par(pty="m",las=1)
plot(pear.pca$x,asp=1)
points(0,0,pch=21,bg="red")
axis(1,-4:4)

head(predict(pear.pca))
##            PC1         PC2
## [1,] -4.032912 -1.95068591
## [2,] -3.584728 -1.70670702
## [3,] -2.696034 -1.58377961
## [4,] -4.299761  0.23291997
## [5,] -4.173229  0.05314493
## [6,] -3.381936 -0.73814768

Plot the original data and add the axis of variation to the scatter plot.

int
##   father daughter 
## 67.67871 63.83823
rot
##                PC1        PC2
## father   0.7071068 -0.7071068
## daughter 0.7071068  0.7071068
b1 <- rot[2,1]/rot[1,1]
a1 <- int[2]-b1*int[1]
b2 <- rot[2,2]/rot[1,2]
a2 <- int[2]-b2*int[1]
r <- range(pear)
par(pty="s",las=1)
plot(pear,xlim=r,ylim=r)
abline(a1,b1,lwd=2,col="red") 
abline(a2,b2,lwd=2,col="blue") 

b1
## [1] 1
b2
## [1] -1

The variance contributions.

summary(pear.pca)
## Importance of components:
##                           PC1    PC2
## Standard deviation     1.2318 0.6947
## Proportion of Variance 0.7587 0.2413
## Cumulative Proportion  0.7587 1.0000

Example 2: a multivariate Normal distribution in 10 dimensions

A function to simulate multivariate normal data.

myrmvn <- function(mu,sigma,hm=1,...){
  n=length(mu)
  if(sum((dim(sigma)-rep(n,2))^2)!=0) 
    stop("Check the dimensions of mu and sigma!")
  if(det(sigma)==0) stop("The covariance matrix is singular!")
  a=t(chol(sigma))
  z=matrix(rnorm(n*hm),nrow=n)
  y=t(a%*%z+mu)
  return(y)
}

Generate a mean vector of zeros, and a variance-covariance matrix with some strong correlations.

mu <- rep(0,10)
mu
##  [1] 0 0 0 0 0 0 0 0 0 0
sigma <- diag(0.2,10)+0.8
sigma[1:8,9:10] <- 0.2
sigma[9:10,1:8] <- 0.2
sigma
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]  1.0  0.8  0.8  0.8  0.8  0.8  0.8  0.8  0.2   0.2
##  [2,]  0.8  1.0  0.8  0.8  0.8  0.8  0.8  0.8  0.2   0.2
##  [3,]  0.8  0.8  1.0  0.8  0.8  0.8  0.8  0.8  0.2   0.2
##  [4,]  0.8  0.8  0.8  1.0  0.8  0.8  0.8  0.8  0.2   0.2
##  [5,]  0.8  0.8  0.8  0.8  1.0  0.8  0.8  0.8  0.2   0.2
##  [6,]  0.8  0.8  0.8  0.8  0.8  1.0  0.8  0.8  0.2   0.2
##  [7,]  0.8  0.8  0.8  0.8  0.8  0.8  1.0  0.8  0.2   0.2
##  [8,]  0.8  0.8  0.8  0.8  0.8  0.8  0.8  1.0  0.2   0.2
##  [9,]  0.2  0.2  0.2  0.2  0.2  0.2  0.2  0.2  1.0   0.8
## [10,]  0.2  0.2  0.2  0.2  0.2  0.2  0.2  0.2  0.8   1.0

Simulate data from a multivariate normal distribution with the mean vector and the variance-covariance matrix, and plot the data in the ten dimensions.

set.seed(1)
n <- 100
dat <- myrmvn(mu,sigma,hm=n)
pairs(dat)

The principal components analysis.

dat.pca <- prcomp(dat,center=TRUE,scale=TRUE)

The variance contributions.

summary(dat.pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5    PC6     PC7     PC8     PC9    PC10
## Standard deviation     2.5745 1.3361 0.5604 0.50644 0.46858 0.4336 0.42375 0.40214 0.38626 0.34434
## Proportion of Variance 0.6628 0.1785 0.0314 0.02565 0.02196 0.0188 0.01796 0.01617 0.01492 0.01186
## Cumulative Proportion  0.6628 0.8413 0.8727 0.89834 0.92030 0.9391 0.95705 0.97322 0.98814 1.00000
par(las=1)
plot(dat.pca)

Example 3: two populations with different genetic background

A function to simulate some single nucleotide polymorphisms (SNPs).

my.snp.pc.data=function(n,maf){
  hm=length(maf)
  z=matrix(ncol=hm,nrow=n)
  for(j in 1:hm){
    p=1-maf[j]
    mp=c(p^2,2*p*(1-p),(1-p)^2)
    z[,j]=apply(rmultinom(n,1,mp),2,order)[3,]-1
  }
  return(z)
}

We have two populations, and we measure the genotype at 50 loci with differing minor allele frequencies (MAFs). These type of loci are often called ancestry informative markers.

Here are the minor allele frequencies for the 50 SNPs.

ns <- 50
set.seed(1)
maf1 <- runif(ns,0.1,0.5)
maf2 <- runif(ns,0.1,0.5)
maf1
##  [1] 0.2062035 0.2488496 0.3291413 0.4632831 0.1806728 0.4593559 0.4778701 0.3643191 0.3516456 0.1247145
## [11] 0.1823898 0.1706227 0.3748091 0.2536415 0.4079366 0.2990797 0.3870474 0.4967624 0.2520141 0.4109781
## [21] 0.4738821 0.1848570 0.3606695 0.1502220 0.2068883 0.2544456 0.1053561 0.2529552 0.4478763 0.2361396
## [31] 0.2928320 0.3398263 0.2974165 0.1744870 0.4309493 0.3673867 0.4176959 0.1431775 0.3894844 0.2645098
## [41] 0.4283785 0.3588241 0.4131731 0.3212145 0.3118878 0.4157425 0.1093325 0.2908920 0.3929255 0.3770926
maf2
##  [1] 0.2910478 0.4444838 0.2752388 0.1979189 0.1282716 0.1397865 0.2265087 0.3074537 0.3648020 0.2627321
## [11] 0.4651504 0.2174413 0.2836263 0.2329579 0.3603482 0.2032067 0.2914181 0.4065243 0.1336988 0.4501285
## [21] 0.2356292 0.4357761 0.2386734 0.2335100 0.2905405 0.4568793 0.4457358 0.2559958 0.4109283 0.4842472
## [31] 0.2738638 0.3850059 0.2599977 0.2301409 0.4028349 0.1810769 0.3844485 0.1486768 0.1981954 0.1573218
## [41] 0.1958518 0.1235738 0.3569153 0.4505077 0.4115659 0.4189235 0.2821098 0.2640336 0.4243481 0.3419733

Randomly sample 100 subjects from each population.

n1 <- n2 <- 100
z1 <- my.snp.pc.data(n1,maf1)
z2 <- my.snp.pc.data(n2,maf2)
z <- data.frame(rbind(z1,z2))
head(z,10)
##    X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29
## 1   1  0  0  2  0  0  0  1  2   0   0   0   0   0   0   1   2   1   0   1   1   0   0   0   1   0   0   1   2
## 2   0  0  0  0  1  0  2  0  1   0   0   0   2   0   1   0   2   1   0   0   2   1   1   0   0   0   0   0   1
## 3   1  0  1  0  1  2  1  1  1   1   0   0   0   1   1   0   1   1   1   1   1   0   1   2   0   1   1   1   2
## 4   0  2  1  0  2  1  0  0  1   0   0   1   1   1   1   0   1   1   0   2   1   0   2   0   0   0   0   1   0
## 5   0  0  0  0  1  1  2  0  0   0   1   1   0   1   0   0   1   0   0   0   2   1   0   0   1   0   1   0   0
## 6   0  0  1  1  0  1  0  1  1   0   0   0   2   0   1   0   2   0   1   1   0   0   0   0   0   1   0   0   1
## 7   1  0  1  1  0  2  1  2  1   0   1   0   1   1   1   1   2   2   0   0   0   0   2   1   0   1   0   0   1
## 8   1  0  0  1  0  2  1  0  1   0   0   1   0   2   1   0   1   2   0   0   1   0   0   0   1   0   0   0   1
## 9   0  0  0  1  0  0  0  0  0   0   0   0   0   0   0   0   0   1   1   1   2   1   2   0   2   0   0   0   1
## 10  0  0  1  1  1  2  2  1  1   0   0   0   1   1   1   0   1   1   1   0   0   0   0   0   0   0   0   0   1
##    X30 X31 X32 X33 X34 X35 X36 X37 X38 X39 X40 X41 X42 X43 X44 X45 X46 X47 X48 X49 X50
## 1    1   0   1   0   0   2   0   0   0   0   1   0   0   2   0   1   2   1   0   0   1
## 2    0   0   0   0   0   1   0   1   0   0   0   1   1   0   0   1   1   0   1   0   1
## 3    1   0   2   0   1   1   1   0   0   2   0   0   1   0   0   0   0   1   0   0   1
## 4    0   0   2   0   0   1   1   2   0   2   0   1   0   0   0   0   0   0   0   1   1
## 5    1   1   0   0   0   1   0   1   0   1   0   2   1   1   0   1   0   0   1   1   0
## 6    0   1   2   1   1   1   1   1   0   0   1   0   0   1   1   1   1   1   0   1   0
## 7    1   2   2   0   1   0   1   2   1   0   1   0   0   1   0   1   1   0   0   2   0
## 8    0   1   1   1   1   2   0   1   0   1   0   0   1   0   0   2   1   0   1   0   1
## 9    1   1   0   1   0   1   0   2   0   2   1   0   0   1   0   0   1   1   0   2   1
## 10   0   1   1   0   0   0   0   2   1   0   2   0   0   1   2   0   2   0   1   2   1

The principal components analysis.

snp.pca <- prcomp(z,center=TRUE,scale=TRUE)

The principal components.

snp.p <- predict(snp.pca)
dim(snp.p)
## [1] 200  50
head(snp.p[,1:5],10)
##             PC1        PC2         PC3        PC4        PC5
##  [1,] 0.3673817 -0.2734193  0.09206113  1.1476370 -0.8903395
##  [2,] 2.5758992 -1.3162147  0.18734551  0.2379644 -1.0162325
##  [3,] 1.5395234  1.2247341 -1.74680376 -2.6340989 -1.3447263
##  [4,] 1.4044452  2.2137645  0.57825878 -2.0429535 -1.6062355
##  [5,] 1.3579743 -1.6474900  2.15651541 -0.3185241  1.2566002
##  [6,] 0.1300893  1.2469950  0.48849650  1.8581512  0.2805900
##  [7,] 1.4539448  2.2667258 -1.03415060  0.2116116 -0.1846399
##  [8,] 1.3134604 -1.0234999 -0.16670297  0.9321990  0.1387367
##  [9,] 0.6562900  0.7513414  0.57334602  0.9303713  1.4989877
## [10,] 2.9232054 -0.2476571 -0.82555553  2.5477727 -0.4787476

Plot the first two principal components.

par(las=1)
plot(snp.p[,1:2],pch=21,bg=rep(c("blue","red"),each=100))

Simulate a new set of 50 samples from the first population.

n3 <- 50
z3 <- my.snp.pc.data(n3,maf1)
z3 <- data.frame(z3)
head(z3)
##   X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29
## 1  0  1  0  0  1  1  2  0  0   0   1   0   0   1   1   1   0   0   0   1   2   0   1   0   0   0   0   0   0
## 2  1  1  2  2  1  1  2  0  0   0   0   0   1   1   1   0   1   2   2   0   2   0   0   0   1   0   0   0   2
## 3  0  0  2  0  0  0  1  0  0   0   1   1   1   0   1   1   1   1   1   1   1   1   1   0   0   1   0   1   1
## 4  1  0  1  1  1  0  1  0  0   1   0   0   0   0   0   1   1   1   0   0   1   0   1   0   0   0   0   0   1
## 5  1  0  1  1  0  2  0  0  1   0   0   1   2   1   1   0   1   0   0   0   1   1   0   1   1   0   0   0   2
## 6  0  1  0  1  0  2  0  1  1   0   1   0   1   1   1   1   0   1   1   2   1   0   2   0   0   1   0   1   0
##   X30 X31 X32 X33 X34 X35 X36 X37 X38 X39 X40 X41 X42 X43 X44 X45 X46 X47 X48 X49 X50
## 1   0   1   0   0   1   2   1   2   1   0   2   1   0   0   0   0   0   0   0   1   1
## 2   0   0   1   1   0   0   1   0   1   0   1   0   1   1   0   1   1   1   1   0   1
## 3   0   0   0   0   1   1   2   0   0   1   0   1   0   2   0   0   0   0   0   1   0
## 4   0   0   1   0   0   1   1   2   0   0   2   1   0   1   0   0   2   0   2   1   0
## 5   0   2   2   1   0   1   1   1   0   1   0   0   2   2   0   1   1   0   0   1   2
## 6   0   1   0   0   1   0   1   1   0   1   0   2   0   1   1   0   2   0   0   1   1

Add these 50 samples to the first two principal components plot.

p3 <- predict(snp.pca,z3)
par(las=1)
plot(snp.p[,1:2],pch=21,bg=rep(c("blue","red"),each=100))
points(p3[,1:2],pch=21,bg="green3",cex=1.5)