######################################################################################## # This R Script reproduces the analyses described in Leek (2011) "A method for statistically # validating high-throughput -omic results" Please see that paper and # http://www.biostat.jhsph.edu/~jleek/validate/ for more details. # Copyright (C) 2011 Jeffrey T. Leek (http://www.biostat.jhsph.edu/~jleek/contact.html) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details, see . # # Depends (all files can be found at http://www.biostat.jhsph.edu/~jleek/validate/code.html): # validate.R - The R script containing the validation functions # validate-data.rda - An rda containing the data sets used in the analysis. # RColorBrewer,GEOquery,qvalue - freely available R packages # ######################################################################################## ### ### Read in the validation functions ### and load the q-value library ### source('validate.R') library(qvalue) ### ### Calculate the validation probabilities for the ### Science and Nature examples in Leek (2011) ### ### Zhang et al. Science Example original.fdr = 0.01 number.tested = 31 number.validated = 28 validate(number.validated,number.tested,original.fdr,plot=F) ### Carro et al. Nature Example original.fdr = 0.05 number.tested = 2 number.validated = 2 validate(number.validated,number.tested,original.fdr,plot=F) ### ### Analyze the RNA-sequencing and qPCR data from Leek (2011) ### ### Load the data load("validate-data.rda") ### Get the upper quartile for each lane of the sequencing data for normalization uq <- apply(rnaseq.dat,2,function(x){quantile(x,prob=0.75)}) ### Get the p-values, fitting the intercept term from Langmead et al. (2010) Genome Biology 11:R83 pval <- apply(rnaseq.dat,1,function(x){summary(glm(x ~ rep(c(0,1),each=7) + uq))$coeff[2,4]}) ### Get the absolute fold-changes for the qPCR data fc <- abs(qpcr.foldchange) tmp <- !is.na(fc) ### Only consider the genes where there weren't problems ### with the qPCR and that don't have problems with the ### p-values fc <- fc[tmp] pval <- pval[tmp] ### Get the adjusted p-values and those less than 10% qval <- qvalue(pval,lambda=0) Index <- which(qval$qvalue < 0.10) ### Find the percentages of true positives for the ### 20 most significant genes and the genes significant at an FDR of 10% n.fdr10 <- mean(fc[Index] > 0.25)*20 n.top20 <- sum(fc[order(pval)][1:20] > 0.25) ### Get the FDR level for the top 20 genes qval <- qvalue(pval,lambda=0) top20fdr <- qval$qvalue[order(qval$pvalue)[20]] ### Calculate the validation probabilities for the two cases out.fdr10 <- validate(n.fdr10,20,0.10,plot=T) out.top20 <- validate(n.top20,20,top20fdr,plot=T) ### Create Figure 2 using the validate function pp = matrix(NA,nrow=7,ncol=100) aa = c(0.01,0.05,0.10,0.2,0.3,0.4,0.5) for(i in 1:100){ for(j in 1:length(aa)){ out = validate(i-max(i*aa[j]*0.7,0),i,aa[j],plot=F) pp[j,i] = out$validationProb } if(i %% 10 == 0){cat(i)} } library(RColorBrewer) mypar <- function(a=1,b=1,brewer.n=8,brewer.name="Dark2",...){ par(mar=c(2.5,2.5,1.6,1.1),mgp=c(1.5,.5,0)) par(mfrow=c(a,b),...) palette(brewer.pal(brewer.n,brewer.name)) } mypar() par(mar=c(4,4,4,4)) matplot(t(pp),type="l",lwd=4,col=1:7,lty=1,cex.axis=2,xlab="",ylab="") ## Make the cost plot for figure 3 par(mfrow=c(2,1)) ngenes = 1:100 nsamples = 20 timeTaqMan = 1/(4*22*12) * (floor((ngenes*nsamples)/96) + 1) costTaqMan = 250*ngenes + (4 + 150)*(floor((ngenes*nsamples)/96) + 1) + 40000*timeTaqMan plot(costTaqMan,xlab="Number of Genes",ylab="Cost in Dollars",col=1,pch=19) timeSYBrGreen = 1/(4*22*12) * (floor((ngenes*nsamples*2*3)/96) + 1) costSYBrGreen = (4+150) * (floor((ngenes*nsamples*2*3)/96) + 1) + 40000*timeSYBrGreen points(costSYBrGreen,col=2,pch=19) legend(10,26000,c("TaqMan","SYBrGreen"),col=c(1,2),pch=19) plot(timeTaqMan,xlab="Number of Genes",ylab="Time in Research Associate Years",col=1,pch=19,ylim=c(0,0.12)) points(timeSYBrGreen,xlab="Number of Genes",ylab="Time in Research Associate Years",col=2,pch=19) legend(10,0.11,c("TaqMan","SYBrGreen"),col=c(1,2),pch=19) ## Load a p-value helper function f.pvalue <- function(dat,mod,mod0){ # This is a function for performing # parametric f-tests on the data matrix # dat comparing the null model mod0 # to the alternative model mod. n <- dim(dat)[2] m <- dim(dat)[1] df1 <- dim(mod)[2] df0 <- dim(mod0)[2] p <- rep(0,m) Id <- diag(n) resid <- dat %*% (Id - mod %*% solve(t(mod) %*% mod) %*% t(mod)) rss1 <- rowSums(resid*resid) rm(resid) resid0 <- dat %*% (Id - mod0 %*% solve(t(mod0) %*% mod0) %*% t(mod0)) rss0 <- rowSums(resid0*resid0) rm(resid0) fstats <- ((rss0 - rss1)/(df1-df0))/(rss1/(n-df1)) p <- 1-pf(fstats,df1=(df1-df0),df2=(n-df1)) return(p) } costFunction <- function(ngenes,nsamples){ timeTaqMan = 1/(4*22*12) * (floor((ngenes*nsamples)/96) + 1) costTaqMan = 250*ngenes + (4 + 150)*(floor((ngenes*nsamples)/96) + 1) + 40000*timeTaqMan timeSYBrGreen = 1/(4*22*12) * (floor((ngenes*nsamples*2*3)/96) + 1) costSYBrGreen = (4+150) * (floor((ngenes*nsamples*2*3)/96) + 1) + 40000*timeSYBrGreen return(list(timeTaqMan,costTaqMan,timeSYBrGreen,costSYBrGreen)) } ######## Calculate the validation samples for several GEO studies source("validate.R") #### #### Study 1: Squamous cell versus adenocarcinoma in humans #### library(GEOquery) tmp1 = getGEO("GSE10245") tmp1 = tmp1[[1]] # Get the model matrices mod = model.matrix(~as.factor(characteristics_ch1),data=pData(tmp1)) mod0 = model.matrix(~1,data=pData(tmp1)) # Calculate significance pValues = f.pvalue(exprs(tmp1),mod,mod0) qValues = qvalue(pValues) sum(qValues$qvalues < 0.05) # Find minimum sample size needed for validation probability >= 0.5 minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)) # Calculate costs for full and statistical validation costFunction(sum(qValues$qvalues < 0.05),dim(exprs(tmp1))[2]) costFunction(minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)),dim(exprs(tmp1))[2]) #### #### Study 2: Obesity resistence in rats #### tmp1 = getGEO("GSE11492") tmp1 = tmp1[[1]] # Get the model matrices mod = model.matrix(~grepl("OR",pData(tmp1)$title),data=pData(tmp1)) mod0 = model.matrix(~1,data=pData(tmp1)) keep = rowSums(is.na(exprs(tmp1)))==0 # Calculate significance pValues = f.pvalue(exprs(tmp1)[keep,],mod,mod0) qValues = qvalue(pValues) sum(qValues$qvalues < 0.05) # Find minimum sample size needed for validation probability >= 0.5 minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)) # Calculate costs for full and statistical validation costFunction(sum(qValues$qvalues < 0.05),dim(exprs(tmp1))[2]) costFunction(minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)),dim(exprs(tmp1))[2]) #### #### Study 3: Cigaratte smoking on oral mucosa in humans #### tmp1 = getGEO("GSE17913") tmp1 = tmp1[[1]] # Get the model matrices mod = model.matrix(~grepl("Never",pData(tmp1)$title),data=pData(tmp1)) mod0 = model.matrix(~1,data=pData(tmp1)) keep = rowSums(is.na(exprs(tmp1)))==0 # Calculate significance pValues = f.pvalue(log2(exprs(tmp1))[keep,],mod,mod0) qValues = qvalue(pValues) sum(qValues$qvalues < 0.05) # Find minimum sample size needed for validation probability >= 0.5 minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)) # Calculate costs for full and statistical validation costFunction(sum(qValues$qvalues < 0.05),dim(exprs(tmp1))[2]) costFunction(minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)),dim(exprs(tmp1))[2]) #### #### Study 4: Asthma exacerbation in homo sapiens #### tmp1 = getGEO("GSE16032") tmp1 = tmp1[[1]] # Get the model matrices mod = model.matrix(~grepl("Conv",pData(tmp1)$title),data=pData(tmp1)) mod0 = model.matrix(~1,data=pData(tmp1)) keep = rowSums(is.na(exprs(tmp1)))==0 # Calculate significance pValues = f.pvalue(log2(exprs(tmp1))[keep,],mod,mod0) qValues = qvalue(pValues) sum(qValues$qvalues < 0.05) # Find minimum sample size needed for validation probability >= 0.5 minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)) # Calculate costs for full and statistical validation costFunction(sum(qValues$qvalues < 0.05),dim(exprs(tmp1))[2]) costFunction(minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)),dim(exprs(tmp1))[2]) #### #### Study 5: Study of pulmonary sarcoidosis in homo sapiens #### tmp1 = getGEO("GSE16538") tmp1 = tmp1[[1]] # Get the model matrices mod = model.matrix(~grepl("normal",pData(tmp1)$title),data=pData(tmp1)) mod0 = model.matrix(~1,data=pData(tmp1)) keep = rowSums(is.na(exprs(tmp1)))==0 # Calculate significance pValues = f.pvalue(exprs(tmp1)[keep,],mod,mod0) qValues = qvalue(pValues) sum(qValues$qvalues < 0.05) # Find minimum sample size needed for validation probability >= 0.5 minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)) # Calculate costs for full and statistical validation costFunction(sum(qValues$qvalues < 0.05),dim(exprs(tmp1))[2]) costFunction(minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)),dim(exprs(tmp1))[2]) #### #### Study 6: Sickle cell versus control in homo sapiens #### tmp1 = getGEO("GSE11524") tmp1 = tmp1[[1]] # Get the model matrices mod = model.matrix(~grepl("sickle",pData(tmp1)$title),data=pData(tmp1)) mod0 = model.matrix(~1,data=pData(tmp1)) keep = rowSums(is.na(exprs(tmp1)))==0 # Calculate significance pValues = f.pvalue(exprs(tmp1)[keep,],mod,mod0) qValues = qvalue(pValues) sum(qValues$qvalues < 0.05) # Find minimum sample size needed for validation probability >= 0.5 minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)) # Calculate costs for full and statistical validation costFunction(sum(qValues$qvalues < 0.05),dim(exprs(tmp1))[2]) costFunction(minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)),dim(exprs(tmp1))[2]) #### #### Study 7: Down syndrome in humans #### tmp1 = getGEO("GSE5390") tmp1 = tmp1[[1]] # Get the model matrices mod = model.matrix(~grepl("Down",pData(tmp1)$title),data=pData(tmp1)) mod0 = model.matrix(~1,data=pData(tmp1)) keep = rowSums(is.na(exprs(tmp1)))==0 # Calculate significance pValues = f.pvalue(exprs(tmp1)[keep,],mod,mod0) qValues = qvalue(pValues) sum(qValues$qvalues < 0.05) # Find minimum sample size needed for validation probability >= 0.5 minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)) # Calculate costs for full and statistical validation costFunction(sum(qValues$qvalues < 0.05),dim(exprs(tmp1))[2]) costFunction(minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)),dim(exprs(tmp1))[2])