########################################################################################
#  This R Script reproduces the analyses described in Leek (2011) "A method for statistically
#  validating high-throughput -omic results" Please see that paper and 
#  http://www.biostat.jhsph.edu/~jleek/validate/ for more details.  
#  Copyright (C) 2011 Jeffrey T. Leek (http://www.biostat.jhsph.edu/~jleek/contact.html)
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details, see <http://www.gnu.org/licenses/>.
#    
#    Depends (all files can be found at http://www.biostat.jhsph.edu/~jleek/validate/code.html): 
#    validate.R - The R script containing the validation functions
#    validate-data.rda - An rda containing the data sets used in the analysis.
#    RColorBrewer,GEOquery,qvalue - freely available R packages
#    
########################################################################################


###
### Read in the validation functions
### and load the q-value library
###

source('validate.R')
library(qvalue)

###
### Calculate the validation probabilities for the
### Science and Nature examples in Leek (2011) 
###

### Zhang et al. Science Example

original.fdr = 0.01
number.tested = 31
number.validated = 28

validate(number.validated,number.tested,original.fdr,plot=F)

### Carro et al. Nature Example

original.fdr = 0.05
number.tested = 2
number.validated = 2

validate(number.validated,number.tested,original.fdr,plot=F)
         

###
### Analyze the RNA-sequencing and qPCR data from  Leek (2011) 
###

### Load the data
load("validate-data.rda")

### Get the upper quartile for each lane of the sequencing data for normalization

uq <- apply(rnaseq.dat,2,function(x){quantile(x,prob=0.75)})

### Get the p-values, fitting the intercept term from Langmead et al. (2010) Genome Biology 11:R83

pval <- apply(rnaseq.dat,1,function(x){summary(glm(x ~ rep(c(0,1),each=7) + uq))$coeff[2,4]})

### Get the absolute fold-changes for the qPCR data
fc <- abs(qpcr.foldchange)
tmp <- !is.na(fc)


### Only consider the genes where there weren't problems
### with the qPCR and that don't have problems with the
### p-values

fc <- fc[tmp]
pval <- pval[tmp]

### Get the adjusted p-values and those less than 10%
qval <- qvalue(pval,lambda=0)
Index <- which(qval$qvalue < 0.10)

### Find the percentages of true positives for the
### 20 most significant genes and the genes significant at an FDR of 10%
n.fdr10 <- mean(fc[Index] > 0.25)*20
n.top20 <- sum(fc[order(pval)][1:20] > 0.25)


### Get the FDR level for the top 20 genes

qval <- qvalue(pval,lambda=0)
top20fdr <- qval$qvalue[order(qval$pvalue)[20]]


### Calculate the validation probabilities for the two cases

out.fdr10 <- validate(n.fdr10,20,0.10,plot=T)
out.top20 <- validate(n.top20,20,top20fdr,plot=T)


### Create Figure 2 using the validate function

pp = matrix(NA,nrow=7,ncol=100)
aa = c(0.01,0.05,0.10,0.2,0.3,0.4,0.5)
for(i in 1:100){
  for(j in 1:length(aa)){
    out = validate(i-max(i*aa[j]*0.7,0),i,aa[j],plot=F)
    pp[j,i] = out$validationProb
  }
  if(i %% 10 == 0){cat(i)}
}

library(RColorBrewer)
mypar <- function(a=1,b=1,brewer.n=8,brewer.name="Dark2",...){
 par(mar=c(2.5,2.5,1.6,1.1),mgp=c(1.5,.5,0))
 par(mfrow=c(a,b),...)
 palette(brewer.pal(brewer.n,brewer.name))
}


mypar()
par(mar=c(4,4,4,4))
matplot(t(pp),type="l",lwd=4,col=1:7,lty=1,cex.axis=2,xlab="",ylab="")


## Make the cost plot for figure 3

par(mfrow=c(2,1))
ngenes = 1:100
nsamples = 20

timeTaqMan = 1/(4*22*12) * (floor((ngenes*nsamples)/96) + 1)
costTaqMan = 250*ngenes + (4 + 150)*(floor((ngenes*nsamples)/96) + 1) + 40000*timeTaqMan

plot(costTaqMan,xlab="Number of Genes",ylab="Cost in Dollars",col=1,pch=19)

timeSYBrGreen = 1/(4*22*12) * (floor((ngenes*nsamples*2*3)/96) + 1)
costSYBrGreen = (4+150) * (floor((ngenes*nsamples*2*3)/96) + 1) + 40000*timeSYBrGreen

points(costSYBrGreen,col=2,pch=19)

legend(10,26000,c("TaqMan","SYBrGreen"),col=c(1,2),pch=19)


plot(timeTaqMan,xlab="Number of Genes",ylab="Time in Research Associate Years",col=1,pch=19,ylim=c(0,0.12))
points(timeSYBrGreen,xlab="Number of Genes",ylab="Time in Research Associate Years",col=2,pch=19)

legend(10,0.11,c("TaqMan","SYBrGreen"),col=c(1,2),pch=19)


## Load a p-value helper function

f.pvalue <- function(dat,mod,mod0){
  # This is a function for performing
  # parametric f-tests on the data matrix
  # dat comparing the null model mod0
  # to the alternative model mod. 
  n <- dim(dat)[2]
  m <- dim(dat)[1]
  df1 <- dim(mod)[2]
  df0 <- dim(mod0)[2]
  p <- rep(0,m)
  Id <- diag(n)
  
  resid <- dat %*% (Id - mod %*% solve(t(mod) %*% mod) %*% t(mod))
  rss1 <- rowSums(resid*resid)
  rm(resid)
  
  resid0 <- dat %*% (Id - mod0 %*% solve(t(mod0) %*% mod0) %*% t(mod0))
  rss0 <- rowSums(resid0*resid0)
  rm(resid0)
  
  fstats <- ((rss0 - rss1)/(df1-df0))/(rss1/(n-df1))
  p <-  1-pf(fstats,df1=(df1-df0),df2=(n-df1))
  return(p)
}

costFunction <- function(ngenes,nsamples){

timeTaqMan = 1/(4*22*12) * (floor((ngenes*nsamples)/96) + 1)
costTaqMan = 250*ngenes + (4 + 150)*(floor((ngenes*nsamples)/96) + 1) + 40000*timeTaqMan


timeSYBrGreen = 1/(4*22*12) * (floor((ngenes*nsamples*2*3)/96) + 1)
costSYBrGreen = (4+150) * (floor((ngenes*nsamples*2*3)/96) + 1) + 40000*timeSYBrGreen

return(list(timeTaqMan,costTaqMan,timeSYBrGreen,costSYBrGreen))

}


######## Calculate the validation samples for several GEO studies

source("validate.R")


####
#### Study 1: Squamous cell versus adenocarcinoma in humans
####

library(GEOquery)
tmp1 = getGEO("GSE10245")
tmp1 = tmp1[[1]]

# Get the model matrices
mod = model.matrix(~as.factor(characteristics_ch1),data=pData(tmp1))
mod0 = model.matrix(~1,data=pData(tmp1))

# Calculate significance
pValues = f.pvalue(exprs(tmp1),mod,mod0)
qValues = qvalue(pValues)
sum(qValues$qvalues < 0.05)

# Find minimum sample size needed for validation probability >= 0.5
minimumValidationSize(pValues,plot=T,fdr=c(0,0.05))

# Calculate costs for full and statistical validation
costFunction(sum(qValues$qvalues < 0.05),dim(exprs(tmp1))[2])
costFunction(minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)),dim(exprs(tmp1))[2])


####
#### Study 2: Obesity resistence in rats
####

tmp1 = getGEO("GSE11492")
tmp1 = tmp1[[1]]

# Get the model matrices
mod = model.matrix(~grepl("OR",pData(tmp1)$title),data=pData(tmp1))
mod0 = model.matrix(~1,data=pData(tmp1))
keep = rowSums(is.na(exprs(tmp1)))==0

# Calculate significance
pValues = f.pvalue(exprs(tmp1)[keep,],mod,mod0)
qValues = qvalue(pValues)
sum(qValues$qvalues < 0.05)

# Find minimum sample size needed for validation probability >= 0.5
minimumValidationSize(pValues,plot=T,fdr=c(0,0.05))

# Calculate costs for full and statistical validation
costFunction(sum(qValues$qvalues < 0.05),dim(exprs(tmp1))[2])
costFunction(minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)),dim(exprs(tmp1))[2])


####
#### Study 3: Cigaratte smoking on oral mucosa in humans
####

tmp1 = getGEO("GSE17913")
tmp1 = tmp1[[1]]

# Get the model matrices
mod = model.matrix(~grepl("Never",pData(tmp1)$title),data=pData(tmp1))
mod0 = model.matrix(~1,data=pData(tmp1))
keep = rowSums(is.na(exprs(tmp1)))==0

# Calculate significance
pValues = f.pvalue(log2(exprs(tmp1))[keep,],mod,mod0)
qValues = qvalue(pValues)
sum(qValues$qvalues < 0.05)

# Find minimum sample size needed for validation probability >= 0.5
minimumValidationSize(pValues,plot=T,fdr=c(0,0.05))

# Calculate costs for full and statistical validation
costFunction(sum(qValues$qvalues < 0.05),dim(exprs(tmp1))[2])
costFunction(minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)),dim(exprs(tmp1))[2])


####
#### Study 4: Asthma exacerbation in homo sapiens
####

tmp1 = getGEO("GSE16032")
tmp1 = tmp1[[1]]

# Get the model matrices
mod = model.matrix(~grepl("Conv",pData(tmp1)$title),data=pData(tmp1))
mod0 = model.matrix(~1,data=pData(tmp1))
keep = rowSums(is.na(exprs(tmp1)))==0

# Calculate significance
pValues = f.pvalue(log2(exprs(tmp1))[keep,],mod,mod0)
qValues = qvalue(pValues)
sum(qValues$qvalues < 0.05)

# Find minimum sample size needed for validation probability >= 0.5
minimumValidationSize(pValues,plot=T,fdr=c(0,0.05))

# Calculate costs for full and statistical validation
costFunction(sum(qValues$qvalues < 0.05),dim(exprs(tmp1))[2])
costFunction(minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)),dim(exprs(tmp1))[2])


####
#### Study 5: Study of pulmonary sarcoidosis in homo sapiens
####

tmp1 = getGEO("GSE16538")
tmp1 = tmp1[[1]]

# Get the model matrices
mod = model.matrix(~grepl("normal",pData(tmp1)$title),data=pData(tmp1))
mod0 = model.matrix(~1,data=pData(tmp1))
keep = rowSums(is.na(exprs(tmp1)))==0

# Calculate significance
pValues = f.pvalue(exprs(tmp1)[keep,],mod,mod0)
qValues = qvalue(pValues)
sum(qValues$qvalues < 0.05)

# Find minimum sample size needed for validation probability >= 0.5
minimumValidationSize(pValues,plot=T,fdr=c(0,0.05))

# Calculate costs for full and statistical validation
costFunction(sum(qValues$qvalues < 0.05),dim(exprs(tmp1))[2])
costFunction(minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)),dim(exprs(tmp1))[2])

####
#### Study 6: Sickle cell versus control in homo sapiens
####


tmp1 = getGEO("GSE11524")
tmp1 = tmp1[[1]]

# Get the model matrices
mod = model.matrix(~grepl("sickle",pData(tmp1)$title),data=pData(tmp1))
mod0 = model.matrix(~1,data=pData(tmp1))
keep = rowSums(is.na(exprs(tmp1)))==0

# Calculate significance
pValues = f.pvalue(exprs(tmp1)[keep,],mod,mod0)
qValues = qvalue(pValues)
sum(qValues$qvalues < 0.05)

# Find minimum sample size needed for validation probability >= 0.5
minimumValidationSize(pValues,plot=T,fdr=c(0,0.05))

# Calculate costs for full and statistical validation
costFunction(sum(qValues$qvalues < 0.05),dim(exprs(tmp1))[2])
costFunction(minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)),dim(exprs(tmp1))[2])


####
#### Study 7: Down syndrome in humans
####


tmp1 = getGEO("GSE5390")
tmp1 = tmp1[[1]]

# Get the model matrices
mod = model.matrix(~grepl("Down",pData(tmp1)$title),data=pData(tmp1))
mod0 = model.matrix(~1,data=pData(tmp1))
keep = rowSums(is.na(exprs(tmp1)))==0

# Calculate significance
pValues = f.pvalue(exprs(tmp1)[keep,],mod,mod0)
qValues = qvalue(pValues)
sum(qValues$qvalues < 0.05)

# Find minimum sample size needed for validation probability >= 0.5
minimumValidationSize(pValues,plot=T,fdr=c(0,0.05))

# Calculate costs for full and statistical validation
costFunction(sum(qValues$qvalues < 0.05),dim(exprs(tmp1))[2])
costFunction(minimumValidationSize(pValues,plot=T,fdr=c(0,0.05)),dim(exprs(tmp1))[2])