######## # load in the data and a few functions load("CHARM_data_for_genomics_class.rda") # lets see what go loaded in ls() # a few data objects and a few functions table(chr) # just chr 1 summary(p) p[1:5,1:5] pd[1:5,] # rows of pd match with columms of p sum(colnames(p) != rownames(pd)) match(colnames(p), rownames(pd)) diff(match(colnames(p), rownames(pd))) # histogram of 1 sample hist(p[,1]) # and of control probes hist(p[controlIndex,1]) # get most variable probes rowsd = apply(p,1, sd) hist(rowsd) top = order(rowsd, decreasing = T)[1:5000] rowsd[head(top)] pp = p[top,] # take the most variable probes d = dist(t(pp)) # get the distance between SAMPLES - hence the t() hc = hclust(d) # cluster ttypes = paste(pd$TissueType,pd$DiseaseState,sep=":") # make unique tissue names unique(ttypes) # cluster dendogram myplclust(hc, lab = ttypes,lab.col =as.numeric(factor(ttypes)), hang = 0.01) # heatmaps heatmap(pp[1:100,], labCol = pd$TissueType) heatmap(pp[1:500,], labCol = pd$TissueType) heatmap(pp[1:500,], labCol = pd$DiseaseState) # check out pns groups head(table(pns)) quantile(table(pns)) quantile(table(pns), seq(0,1,0.1)) hist(log10(table(pns))) # lets plot the methylation in the fourth pns group library(RColorBrewer) Index = which(pns == 4) palette(brewer.pal(n=8,"Set1")) matplot(y=p[Index,], x= pos[Index], type = "l", col = as.numeric(factor(ttypes)), lty =1, ylim = c(0,1), ylab = "methylation",xlab = "position") colon = which(pd$TissueType == "colon") matplot(y=p[Index,colon], x= pos[Index], type = "l", col = as.numeric(factor(ttypes[colon])), lty =1, ylim = c(0,1), ylab = "methylation",xlab = "position") ##### logit = function(x) log(x)-log(1-x) # install.packages("corpcor") library(corpcor) y = logit(p) mm = apply(y,1,median) s = fast.svd(y - mm, tol = 0) # pc1 vs pc2 plot(s$v[,1], s$v[,2], xlab = "pc1", ylab= "pc2", pch = 19, cex=1.5) plot(s$v[,1], s$v[,2], xlab = "pc1", ylab= "pc2", pch = 19, cex=1.5, col = as.numeric(factor(ttypes))) legend("bottomright", levels(factor(ttypes)), col = 1:5, pch = 19, pt.cex = 2) #################### # analysis # source("http://bioconductor.org/biocLite.R") # biocLite("limma") library(limma) # cancer vs normal p1 = p[,colon] pd1 = pd[colon,] mod = model.matrix(~pd1$DiseaseState) fit = lmFit(p1, mod) beta=fit$coef[,2] fit=limma::ebayes(fit) wald=fit$t[,2] # unsmoothed Index = which(pns == pns[which.max(wald)]) matplot(y=p1[Index,], x= pos[Index], type = "l",lwd = 2, col = as.numeric(factor(pd1$DiseaseState)), lty =1, ylim = c(0,1), ylab = "methylation",xlab = "position") legend("bottomright", levels(factor(pd1$DiseaseState)), col = 1:2, lwd = 5) plot(wald[Index]~pos[Index], type = "l") # smooth using running medians swald = runMedPns(wald, pns) sbeta = runMedPns(beta, pns) plot(wald[Index]~pos[Index], pch = 19, col = "black") lines(swald[Index] ~ pos[Index], col = 1, lwd = 3) ### smoothed # unsmoothed Index = which(pns == pns[which.max(swald)]) matplot(y=p1[Index,], x= pos[Index], type = "l",lwd = 2, col = as.numeric(factor(pd1$DiseaseState)), lty =1, ylim = c(0,1), ylab = "methylation",xlab = "position") legend("bottomright", levels(factor(pd1$DiseaseState)), col = 1:2, lwd = 5) Index = (which.max(swald)-10):(which.max(swald)+10) plot(wald[Index]~pos[Index], pch = 19, col = "black") lines(swald[Index] ~ pos[Index], col = 1, lwd = 3) matplot(y=p1[Index,], x= pos[Index], type = "l",lwd = 2, col = as.numeric(factor(pd1$DiseaseState)), lty =1, ylim = c(0,1), ylab = "methylation",xlab = "position") legend("bottomright", levels(factor(pd1$DiseaseState)), col = 1:2, lwd = 5) par(mfrow = c(2,1)) hist(wald,breaks = 30, xlim = c(-10,10)) hist(swald,breaks = 30, xlim = c(-10,10)) # check control Index par(mfrow = c(1,1)) hist(swald[controlIndex], breaks = 30) # using regionFinder dmrs = regionFinder(swald, pns, chr, pos, y=sbeta, cutoff = 2.54) # interpretation par(mfrow = c(2,2)) PAD=10 for(i in 1:4) { Index=(dmrs[i,7]-PAD):(dmrs[i,8]+PAD) Index=Index[pns[Index]==dmrs$pns[i]] matplot(y=p1[Index,], x= pos[Index], type = "l",lwd = 2, col = as.numeric(factor(pd1$DiseaseState)), lty =1, ylim = c(0,1), ylab = "methylation",xlab = "position") if(i == 3) {legend("bottomright", levels(factor(pd1$DiseaseState)), col = 1:2, lwd = 5)} } for(i in 1:4) { Index=(dmrs[i,7]-PAD):(dmrs[i,8]+PAD) Index=Index[pns[Index]==dmrs$pns[i]] plot(wald[Index]~ pos[Index], pch = 19, ylim = c(-10,10)) lines(swald[Index] ~ pos[Index], col = 1, lwd = 3) } # using regionFinder dmrs2 = regionFinder(swald, pns, chr, pos, y=sbeta, cutoff = 4) # interpretation par(mfrow = c(2,2)) PAD=10 for(i in 1:4) { Index=(dmrs2[i,7]-PAD):(dmrs2[i,8]+PAD) Index=Index[pns[Index]==dmrs2$pns[i]] matplot(y=p1[Index,], x= pos[Index], type = "l",lwd = 2, col = as.numeric(factor(pd1$DiseaseState)), lty =1, ylim = c(0,1), ylab = "methylation",xlab = "position") if(i == 3) {legend("bottomright", levels(factor(pd1$DiseaseState)), col = 1:2, lwd = 5)} } ############################# # illumina data load("forJeffDataGenomics.rda") # gives a summary dat class(dat) # extract some info pheno <- pData(dat) # phenotypes map <- featureData(dat)@data # probe info # get methylation data meth <- exprs(dat) range(meth,na.rm=T) # looks good # reduce pheno dimensions pheno1 <- pheno[,10:20] names(pheno1) <- c("case","ageDraw","ageRecruit","ageDiagnosis", "histology", "stage", "grade","preTreat","postTreat","ca125","batch") # format phenotype outcome <- as.character(pheno1[,1]) case <- grep("Case",outcome) outcome_num <- rep(0,length(outcome)) outcome_num[case] <- 1 pheno1[,1] <- outcome_num for(j in 2:11) { pheno1[,j] <- as.character(pheno1[,j]) for(i in 1:nrow(pheno1)) { pheno1[i,j] <- strsplit(pheno1[i,j],":")[[1]][2] } } for(i in 1:nrow(pheno1)) { pheno1[i,7] <- strsplit(pheno1[i,j]," ")[[1]][2] } # make integers from factors for(i in c(3,4,7,10,11)) pheno1[,i] <- as.integer(pheno1[,i]) # make range look better pheno1[,2] <- sub(" to ", "-",pheno1[,2]) pheno1[,2] <- sub("Over ", "+",pheno1[,2]) for(i in 8:9) { pheno1[,i] <- as.character(pheno1[,i]) pheno1[,i] <- ifelse(pheno1[,i] == " Yes",1,0) } ########## ## Map reduce map1 <- map[,c(9,10,23,30,31)] ####### par(mfrow = c(1,1)) boxplot(meth) # looks normalized # start some analyses checkCol <- colMeans(is.na(meth)) which.max(check) checkRow <- rowMeans(is.na(meth)) which.max(check) # drop low qc meth = meth[checkRow < 0.2, checkCol < 0.05] map1 = map1[checkRow < 0.2,] pheno1 = pheno1[checkCol < 0.05,] # impute data # install.packages("impute") library(impute) meth <- impute.knn(meth)$data # pca y = logit(meth) y[y==-Inf] = rep(-999) mm = apply(y,1,median) s = fast.svd(y - mm, tol = 0) # pc1 vs pc2 plot(s$v[,1], s$v[,2], xlab = "pc1", ylab= "pc2", pch = 19, cex=1.5) # pc1 vs pc2 plot(s$v[,2], s$v[,3], xlab = "pc2", ylab= "pc3", pch = 19, cex=1.5) plot(s$v[,1], s$v[,2], xlab = "pc1", ylab= "pc2", pch = 19, cex=1.5, col = pheno1$case +1) legend("bottomright", levels(factor(ttypes)), col = 1:5, pch = 19, pt.cex = 2) boxplot(s$v[,1] ~ pheno1$batch)