######################################################################################## # This R Script includes functions for the analyses described in Leek (2011) "A method for statistically # validating high-throughput -omic results" Please see that paper and # http://www.biostat.jhsph.edu/~jleek/validate/ for more details. # Copyright (C) 2011 Jeffrey T. Leek (http://www.biostat.jhsph.edu/~jleek/contact.html) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details, see . # # Depends (all files can be found at http://www.biostat.jhsph.edu/~jleek/validate/code.html): # qvalue package - can be obtained from Bioconductor ######################################################################################## validate <- function(numValidated,numTested,fdr,plot=F){ ## Input parameters ## numValidated - the number of results that validated ## numTested - the number of results tested for validation ## fdr - the original ## plot - if true, a plot of the posterior distribution is made ## with the credible interval in grey and the original fdr ## shown with a dotted red line. ## Output parameters ## credibleInterval - the 95% credible interval for the false discovery rate in the validation set ## validationProb - the validation probability, e.g. the probability that the real FDR is less ## than the claimed FDR ## Parameters for the Beta prior distribution ## increasing the value of b and decreasing ## a leads to less conservative estimates of the ## validation probability. a = 1 b = 1 ## Calculate the parameters credint = qbeta(c(0.025,0.975),(a + numTested-numValidated), (b + numValidated)) probless = pbeta(fdr, (a+numTested-numValidated), (b + numValidated)) ## Make the plot if(plot){ aa <- list(x=seq(0,1,length=1e3),y=dbeta(seq(0,1,length=1e3), (a+numTested-numValidated), (b + numValidated))) x0 <- which.max(aa$x > credint[1]) x1 <- which.max(aa$x > credint[2]) plot(aa$x,aa$y,lwd=2,col="black",xlab="Proportion of False Positives",ylab="Posterior Density",type="n",xlim=c(0,1)) polygon(x=c(0,aa$x,1),y=c(0,aa$y,0),col=rgb(0,0,0,0),border=rgb(0,0,0,1),lwd=4) polygon(x=c(aa$x[x0],aa$x[x0:x1],aa$x[x1]),y=c(0,aa$y[x0:x1],0),col=rgb(0,0,0,0.3),border=rgb(0,0,0,0),lwd=4) lines(rep(fdr,100),seq(0,max(aa$y),length=100),lwd=2,col="red",lty=2) legend(0,y=max(aa$y),legend=c("Original FDR","95% Credible Interval"),col=c("red",rgb(0,0,0,0.3)),lwd=c(2,10),lty=c(2,1)) } out = list(credibleInterval=credint,validationProb = probless) return(out) } minimumValidationSize <- function(Pvalues,targetProbability=0.5,fdr = seq(0,1,by=0.05),plot=F){ ## Input parameters ## Pvalues - the p-values from the original experiment ## targetProb - the target validation probability desired ## fdr - the set of FDR cutoffs to evaluate the minimum sample size for (first value must be 0) ## plot - if true, a plot of the minimum sample size for each fdr cutoff is displayed ## Output parameters ## minimumSize - for each FDR threshold the minimum sample size needed to achieve a validation probability of 0.5 ## a value of NA means that it is not possible to achieve a validation proability above the target probability ## for that FDR threshold. ## Calculate the Q-values Qvalues = qvalue(Pvalues)$qvalue ## Calculate the number significant at each fdr threshold numberSignificant = cumsum(table(cut(Qvalues,fdr))) minimumSize = rep(NA,length(numberSignificant)) nCut = length(numberSignificant) for(i in 1:nCut){ vProbs = validate(1:numberSignificant[i]*(1-0.95*fdr[(i+1)]),1:numberSignificant[i],fdr[(i+1)])$validationProb if(any(vProbs > targetProbability)){ minimumSize[i] = which.max(vProbs > targetProbability) } } ## Calculate the parameters if(plot){ plot(fdr[2:length(fdr)],1:nCut,ylim=c(0,max(minimumSize,na.rm=T)),xlim=c(0,1),type="n",xlab="FDR Cutoff",ylab="Minimum Validation Sample Size") points(fdr[2:length(fdr)],minimumSize,pch=19,cex=2) points(fdr[2:length(fdr)][is.na(minimumSize)],rep(0,sum(is.na(minimumSize))),pch=4,cex=2) legend(0.7,y=max(minimumSize,na.rm=T),legend=c("Validation Sample Size","Not Possible at this FDR"),col=c(1,1),pch=c(19,4),cex=0.8) } names(minimumSize) = fdr[2:length(fdr)] return(minimumSize) }