######################################################################################## # This R Script reproduces the simulation described in Leek (2011) "A method for statistically # validating high-throughput -omic results" Please see that paper and # http://www.biostat.jhsph.edu/~jleek/validate/ for more details. # Copyright (C) 2011 Jeffrey T. Leek (http://www.biostat.jhsph.edu/~jleek/contact.html) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details, see . # # Depends (all files can be found at http://www.biostat.jhsph.edu/~jleek/validate/code.html): # validate.R - The R script containing the validation functions # qvalue package - can be obtained from Bioconductor ######################################################################################## ### Perform the Simulation Study ### ### Variables: ### numStudies - the number of simulated studies ### Data - matrix of simulated data from "original experiment" ### Group - the group variable for the original experiment ### significantGenes - the index of the set of significant genes ### vData - the validation data generated for the significant Genes ### vGroup - the group variable for the validation data ### vProbability - the validaiton probabilities for each of the simulated studies ### vCredible - the 95% credible intervals for each of the studies ### Coverage - coverage probabilities at each FDR threshold ### vQuantiles - quantiles of the validation probability for each FDR threshold library(genefilter) library(qvalue) ### Case 1 ### 100 simulated studies where the validation ### data comes from a "gold-standard" that always gives the truth set.seed(7743215) ii = 1 Coverage = rep(NA,10) vQuantiles = matrix(NA,nrow=10,ncol=3) for(Fdr in seq(0.05,0.5,by=0.05)){ numStudies=100 vProbability = rep(NA,numStudies) vCredible = matrix(NA,nrow=numStudies,ncol=2) for(s in 1:numStudies){ ### Define the group Group = rep(c(0,1),each=10) ### Generate random data Data = matrix(rnorm(1000*20),ncol=20) ### Add the Group signal to the first 300 genes Beta = c(rnorm(300,sd=2),rep(0,700)) Data = Data + Beta %*% t(Group) ### Calculate P-values from the t-tests and Q-values using qvalue Pvalues = rowttests(Data,as.factor(Group))$p.value Qvalues = qvalue(Pvalues,lambda=0)$qvalue ### Find the significant genes significantGenes = which(Qvalues < Fdr) ### Store the validation probabilities and 95% credible ### intervals over all 100 simulated studies. out = validate(sum(significantGenes <= 300),length(significantGenes),Fdr,plot=F) vProbability[s] = out$validationProb vCredible[s,] = out$credibleInterval } Coverage[ii] = mean(vCredible[,1] < Fdr & Fdr < vCredible[,2]) vQuantiles[ii,] = quantile(vProbability,c(0.25,0.5,0.75)) ii = ii + 1 cat(paste("FDR",Fdr,"complete\n")) } ### Make the plot for Figure 3 minimumValidationSize(Pvalues,targetProbability=0.5,plot=T) ### Case 2 ### 100 simulated studies where the validation ### data is generated from a larger sample from the ### same distribution as the original data set.seed(555155) ii = 1 Coverage = rep(NA,10) vQuantiles = matrix(NA,nrow=10,ncol=3) for(Fdr in seq(0.05,0.5,by=0.05)){ numStudies=100 vProbability = rep(NA,numStudies) vCredible = matrix(NA,nrow=numStudies,ncol=2) for(s in 1:numStudies){ ### Define the group Group = rep(c(0,1),each=10) ### Generate random data Data = matrix(rnorm(1000*20),ncol=20) ### Add the Group signal to the first 300 genes Beta = c(rnorm(300,sd=2),rep(0,700)) Data = Data + Beta %*% t(Group) ### Calculate P-values from the t-tests and Q-values using qvalue Pvalues = rowttests(Data,as.factor(Group))$p.value Qvalues = qvalue(Pvalues,lambda=0)$qvalue ### Find the significant genes significantGenes = which(Qvalues < Fdr) ### Generate validation data for the significant genes ### from the same distribution as before only for 100 "validation" samples vGroup = rep(c(0,1),each=50) vData = matrix(rnorm(length(significantGenes)*100),ncol=100) vData = vData + Beta[significantGenes] %*% t(vGroup) ### Calculate independent validation P-values, call all P-values ### significant less than 0.01. vPvalues = rowttests(vData,as.factor(vGroup))$p.value vQvalues = qvalue(vPvalues,lambda=0)$qvalue ### Store the validation probabilities and 95% credible ### intervals over all 100 simulated studies. out = validate(sum(vQvalues < Fdr),length(significantGenes),Fdr,plot=F) vProbability[s] = out$validationProb vCredible[s,] = out$credibleInterval } Coverage[ii] = mean(vCredible[,1] < Fdr & Fdr < vCredible[,2]) vQuantiles[ii,] = quantile(vProbability,c(0.25,0.5,0.75)) ii = ii + 1 cat(paste("FDR",Fdr,"complete\n")) } ### Case 3 ### 100 simulated studies where the validation ### data does not match the real data set.seed(123433) ii = 1 Coverage = rep(NA,10) vQuantiles = matrix(NA,nrow=10,ncol=3) for(Fdr in seq(0.05,0.5,by=0.05)){ numStudies=100 vProbability = rep(NA,numStudies) vCredible = matrix(NA,nrow=numStudies,ncol=2) for(s in 1:numStudies){ ### Define the group Group = rep(c(0,1),each=10) ### Generate random data Data = matrix(rnorm(1000*20),ncol=20) ### Add the Group signal to the first 300 genes Beta = c(rnorm(300,sd=2),rep(0,700)) Data = Data + Beta %*% t(Group) ### Calculate P-values from the t-tests and Q-values using qvalue Pvalues = rowttests(Data,as.factor(Group))$p.value Qvalues = qvalue(Pvalues,lambda=0)$qvalue ### Find the significant genes significantGenes = which(Qvalues < Fdr) ### The real true positive indicator vector ### which differs from the true positive vector ### for the original experimental data truePositives = c(rep(1,200),rep(0,300),rep(1,100),rep(0,400)) ### Store the validation probabilities and 95% credible ### intervals over all 100 simulated studies. out = validate(sum(truePositives[significantGenes]),length(significantGenes),Fdr,plot=F) vProbability[s] = out$validationProb vCredible[s,] = out$credibleInterval } Coverage[ii] = mean(vCredible[,1] < Fdr & Fdr < vCredible[,2]) vQuantiles[ii,] = quantile(vProbability,c(0.25,0.5,0.75)) ii = ii + 1 cat(paste("FDR",Fdr,"complete\n")) }