########################################################################################
#  This R Script includes functions for the analyses described in Leek (2011) "A method for statistically
#  validating high-throughput -omic results" Please see that paper and 
#  http://www.biostat.jhsph.edu/~jleek/validate/ for more details.  
#  Copyright (C) 2011 Jeffrey T. Leek (http://www.biostat.jhsph.edu/~jleek/contact.html)
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details, see <http://www.gnu.org/licenses/>.
#    
#    Depends (all files can be found at http://www.biostat.jhsph.edu/~jleek/validate/code.html): 
#    qvalue package - can be obtained from Bioconductor
########################################################################################


validate <- function(numValidated,numTested,fdr,plot=F){

  ## Input parameters
  ## numValidated - the number of results that validated 
  ## numTested - the number of results tested for validation
  ## fdr - the original
  ## plot - if true, a plot of the posterior distribution is made
  ##        with the credible interval in grey and the original fdr
  ##        shown with a dotted red line. 

  ## Output parameters
  ## credibleInterval - the 95% credible interval for the false discovery rate in the validation set
  ## validationProb - the validation probability, e.g. the probability that the real FDR is less
  ##                  than the claimed FDR

  ## Parameters for the Beta prior distribution
  ## increasing the value of b and decreasing
  ## a leads to less conservative estimates of the
  ## validation probability.
  
  a = 1
  b = 1

  ## Calculate the parameters
  credint = qbeta(c(0.025,0.975),(a + numTested-numValidated), (b + numValidated))
  probless = pbeta(fdr, (a+numTested-numValidated), (b + numValidated))

  ## Make the plot
  if(plot){
    aa <- list(x=seq(0,1,length=1e3),y=dbeta(seq(0,1,length=1e3), (a+numTested-numValidated), (b + numValidated)))
    x0 <- which.max(aa$x > credint[1])
    x1 <- which.max(aa$x > credint[2])
    plot(aa$x,aa$y,lwd=2,col="black",xlab="Proportion of False Positives",ylab="Posterior Density",type="n",xlim=c(0,1))
    polygon(x=c(0,aa$x,1),y=c(0,aa$y,0),col=rgb(0,0,0,0),border=rgb(0,0,0,1),lwd=4)
    polygon(x=c(aa$x[x0],aa$x[x0:x1],aa$x[x1]),y=c(0,aa$y[x0:x1],0),col=rgb(0,0,0,0.3),border=rgb(0,0,0,0),lwd=4)
    lines(rep(fdr,100),seq(0,max(aa$y),length=100),lwd=2,col="red",lty=2)
    legend(0,y=max(aa$y),legend=c("Original FDR","95% Credible Interval"),col=c("red",rgb(0,0,0,0.3)),lwd=c(2,10),lty=c(2,1))
  }
  out = list(credibleInterval=credint,validationProb = probless)
  return(out)
}


minimumValidationSize <- function(Pvalues,targetProbability=0.5,fdr = seq(0,1,by=0.05),plot=F){

  ## Input parameters
  ## Pvalues - the p-values from the original experiment
  ## targetProb - the target validation probability desired
  ## fdr - the set of FDR cutoffs to evaluate the minimum sample size for (first value must be 0)
  ## plot - if true, a plot of the minimum sample size for each fdr cutoff is displayed
  ## Output parameters
  ## minimumSize - for each FDR threshold the minimum sample size needed to achieve a validation probability of 0.5
  ##               a value of NA means that it is not possible to achieve a validation proability above the target probability
  ##               for that FDR threshold. 

  
  ## Calculate the Q-values
  Qvalues = qvalue(Pvalues)$qvalue

  ## Calculate the number significant at each fdr threshold
  numberSignificant = cumsum(table(cut(Qvalues,fdr)))

  
  minimumSize = rep(NA,length(numberSignificant))

  nCut = length(numberSignificant)
  for(i in 1:nCut){
    vProbs = validate(1:numberSignificant[i]*(1-0.95*fdr[(i+1)]),1:numberSignificant[i],fdr[(i+1)])$validationProb
    if(any(vProbs > targetProbability)){
      minimumSize[i] = which.max(vProbs > targetProbability)
    }
  }
  ## Calculate the parameters
  if(plot){
    plot(fdr[2:length(fdr)],1:nCut,ylim=c(0,max(minimumSize,na.rm=T)),xlim=c(0,1),type="n",xlab="FDR Cutoff",ylab="Minimum Validation Sample Size")
    points(fdr[2:length(fdr)],minimumSize,pch=19,cex=2)
    points(fdr[2:length(fdr)][is.na(minimumSize)],rep(0,sum(is.na(minimumSize))),pch=4,cex=2)
    legend(0.7,y=max(minimumSize,na.rm=T),legend=c("Validation Sample Size","Not Possible at this FDR"),col=c(1,1),pch=c(19,4),cex=0.8)
  }
  names(minimumSize) = fdr[2:length(fdr)]
  return(minimumSize)
}