########

# load in the data and a few functions
load("CHARM_data_for_genomics_class.rda")

# lets see what go loaded in
ls()
# a few data objects and a few functions

table(chr) # just chr 1

summary(p)

p[1:5,1:5]
pd[1:5,]

# rows of pd match with columms of p
sum(colnames(p) != rownames(pd))
match(colnames(p), rownames(pd))
diff(match(colnames(p), rownames(pd)))

# histogram of 1 sample
hist(p[,1])

# and of control probes
hist(p[controlIndex,1])

# get most variable probes
rowsd = apply(p,1, sd)
hist(rowsd)
top = order(rowsd, decreasing = T)[1:5000]
rowsd[head(top)]

pp = p[top,] # take the most variable probes
d = dist(t(pp)) # get the distance between SAMPLES - hence the t()
hc = hclust(d) # cluster
ttypes = paste(pd$TissueType,pd$DiseaseState,sep=":") # make unique tissue names
unique(ttypes)

# cluster dendogram
myplclust(hc, lab = ttypes,lab.col =as.numeric(factor(ttypes)),
	hang = 0.01)	

# heatmaps	
heatmap(pp[1:100,], labCol = pd$TissueType)
heatmap(pp[1:500,], labCol = pd$TissueType)
heatmap(pp[1:500,], labCol = pd$DiseaseState)

# check out pns groups
head(table(pns))
quantile(table(pns))
quantile(table(pns), seq(0,1,0.1))
hist(log10(table(pns)))

# lets plot the methylation in the fourth pns group
library(RColorBrewer)
Index = which(pns == 4)
palette(brewer.pal(n=8,"Set1"))
matplot(y=p[Index,], x= pos[Index], type = "l",
	col = as.numeric(factor(ttypes)), lty =1,
	ylim = c(0,1), ylab = "methylation",xlab = "position")

colon = which(pd$TissueType == "colon")	
matplot(y=p[Index,colon], x= pos[Index], type = "l",
	col = as.numeric(factor(ttypes[colon])), lty =1,
	ylim = c(0,1), ylab = "methylation",xlab = "position")

#####
logit = function(x) log(x)-log(1-x)
# install.packages("corpcor")
library(corpcor)

y = logit(p)
mm = apply(y,1,median)
s = fast.svd(y - mm, tol = 0)

# pc1 vs pc2
plot(s$v[,1], s$v[,2], xlab = "pc1", ylab= "pc2",
	pch = 19, cex=1.5)
	
plot(s$v[,1], s$v[,2], xlab = "pc1", ylab= "pc2",
	pch = 19, cex=1.5, col = as.numeric(factor(ttypes)))
legend("bottomright", levels(factor(ttypes)), col = 1:5, pch = 19,
	pt.cex = 2)

####################
# analysis	
# source("http://bioconductor.org/biocLite.R")
# biocLite("limma")
library(limma)

# cancer vs normal

p1 = p[,colon]
pd1 = pd[colon,]
mod = model.matrix(~pd1$DiseaseState)

fit = lmFit(p1, mod)
beta=fit$coef[,2]
fit=limma::ebayes(fit)
wald=fit$t[,2]

# unsmoothed
Index = which(pns == pns[which.max(wald)])
matplot(y=p1[Index,], x= pos[Index], type = "l",lwd = 2,
	col = as.numeric(factor(pd1$DiseaseState)), lty =1,
	ylim = c(0,1), ylab = "methylation",xlab = "position")
legend("bottomright", levels(factor(pd1$DiseaseState)), 
	col = 1:2,	lwd = 5)
	
plot(wald[Index]~pos[Index], type = "l")

# smooth using running medians
swald = runMedPns(wald, pns)
sbeta = runMedPns(beta, pns)

plot(wald[Index]~pos[Index], pch = 19, col = "black")
lines(swald[Index] ~ pos[Index], col = 1, lwd = 3)


### smoothed

# unsmoothed
Index = which(pns == pns[which.max(swald)])
matplot(y=p1[Index,], x= pos[Index], type = "l",lwd = 2,
	col = as.numeric(factor(pd1$DiseaseState)), lty =1,
	ylim = c(0,1), ylab = "methylation",xlab = "position")
legend("bottomright", levels(factor(pd1$DiseaseState)), 
	col = 1:2,	lwd = 5)
	
Index = (which.max(swald)-10):(which.max(swald)+10)
plot(wald[Index]~pos[Index], pch = 19, col = "black")
lines(swald[Index] ~ pos[Index], col = 1, lwd = 3)


matplot(y=p1[Index,], x= pos[Index], type = "l",lwd = 2,
	col = as.numeric(factor(pd1$DiseaseState)), lty =1,
	ylim = c(0,1), ylab = "methylation",xlab = "position")
legend("bottomright", levels(factor(pd1$DiseaseState)), 
	col = 1:2,	lwd = 5)

par(mfrow = c(2,1))
hist(wald,breaks = 30, xlim = c(-10,10))
hist(swald,breaks = 30, xlim = c(-10,10))

# check control Index
par(mfrow = c(1,1))
hist(swald[controlIndex], breaks = 30)

# using regionFinder
dmrs = regionFinder(swald, pns, chr, pos, y=sbeta, cutoff = 2.54)
# interpretation

par(mfrow = c(2,2))
PAD=10
for(i in 1:4)  {
	Index=(dmrs[i,7]-PAD):(dmrs[i,8]+PAD)
	Index=Index[pns[Index]==dmrs$pns[i]]
		
	matplot(y=p1[Index,], x= pos[Index], type = "l",lwd = 2,
		col = as.numeric(factor(pd1$DiseaseState)), lty =1,
		ylim = c(0,1), ylab = "methylation",xlab = "position")
	if(i == 3) {legend("bottomright", levels(factor(pd1$DiseaseState)), 
		col = 1:2,	lwd = 5)}
}

for(i in 1:4)  {

	Index=(dmrs[i,7]-PAD):(dmrs[i,8]+PAD)
	Index=Index[pns[Index]==dmrs$pns[i]]
	
	plot(wald[Index]~ pos[Index], pch = 19, ylim = c(-10,10))
	lines(swald[Index] ~ pos[Index], col = 1, lwd = 3)
}


# using regionFinder
dmrs2 = regionFinder(swald, pns, chr, pos, y=sbeta, cutoff = 4)
# interpretation

par(mfrow = c(2,2))
PAD=10
for(i in 1:4)  {
	Index=(dmrs2[i,7]-PAD):(dmrs2[i,8]+PAD)
	Index=Index[pns[Index]==dmrs2$pns[i]]
		
	matplot(y=p1[Index,], x= pos[Index], type = "l",lwd = 2,
		col = as.numeric(factor(pd1$DiseaseState)), lty =1,
		ylim = c(0,1), ylab = "methylation",xlab = "position")
	if(i == 3) {legend("bottomright", levels(factor(pd1$DiseaseState)), 
		col = 1:2,	lwd = 5)}
}


#############################
# illumina data

load("forJeffDataGenomics.rda")

# gives a summary
dat
class(dat)

# extract some info
pheno <- pData(dat) # phenotypes
map <- featureData(dat)@data # probe info


# get methylation data
meth <- exprs(dat)
range(meth,na.rm=T) # looks good

# reduce pheno dimensions
pheno1 <- pheno[,10:20]
names(pheno1) <- c("case","ageDraw","ageRecruit","ageDiagnosis",
	"histology", "stage", "grade","preTreat","postTreat","ca125","batch")

# format phenotype
outcome <- as.character(pheno1[,1])
case <- grep("Case",outcome)
outcome_num <- rep(0,length(outcome))
outcome_num[case] <- 1
pheno1[,1] <- outcome_num

for(j in 2:11) {
	pheno1[,j] <- as.character(pheno1[,j])
	for(i in 1:nrow(pheno1)) {
		pheno1[i,j] <- strsplit(pheno1[i,j],":")[[1]][2]
	}
}

for(i in 1:nrow(pheno1)) {
	pheno1[i,7] <- strsplit(pheno1[i,j]," ")[[1]][2]
}

# make integers from factors
for(i in c(3,4,7,10,11)) pheno1[,i] <- as.integer(pheno1[,i])

# make range look better
pheno1[,2] <- sub(" to ", "-",pheno1[,2])
pheno1[,2] <- sub("Over ", "+",pheno1[,2])

for(i in 8:9) {
	pheno1[,i] <- as.character(pheno1[,i])
	pheno1[,i] <- ifelse(pheno1[,i] == " Yes",1,0)
}

##########
## Map reduce
map1 <- map[,c(9,10,23,30,31)]

#######
par(mfrow = c(1,1))
boxplot(meth) # looks normalized


# start some analyses
checkCol <- colMeans(is.na(meth))
which.max(check)

checkRow <- rowMeans(is.na(meth))
which.max(check)

# drop low qc
meth = meth[checkRow < 0.2, checkCol < 0.05]
map1 = map1[checkRow < 0.2,]
pheno1 = pheno1[checkCol < 0.05,]

# impute data
# install.packages("impute")
library(impute)
meth <- impute.knn(meth)$data

# pca
y = logit(meth)
y[y==-Inf] = rep(-999)
mm = apply(y,1,median)
s = fast.svd(y - mm, tol = 0)

# pc1 vs pc2
plot(s$v[,1], s$v[,2], xlab = "pc1", ylab= "pc2",
	pch = 19, cex=1.5)
	# pc1 vs pc2
plot(s$v[,2], s$v[,3], xlab = "pc2", ylab= "pc3",
	pch = 19, cex=1.5)
	
plot(s$v[,1], s$v[,2], xlab = "pc1", ylab= "pc2",
	pch = 19, cex=1.5, col = pheno1$case +1)
legend("bottomright", levels(factor(ttypes)), col = 1:5, pch = 19,
	pt.cex = 2)

boxplot(s$v[,1] ~ pheno1$batch)