### R code from vignette source 'Elbow_tutorial_vignette.Rnw'
### Encoding: UTF-8

###################################################
### code chunk number 1: analyze_excel
###################################################
    # load the ELBOW library
    library("ELBOW")
	
    # Set the filename (change everything after the <- to ``csv_file'',
    # where csv_file is the filename of your CSV data file)
    filename <- system.file("extdata", "EcoliMutMA.csv", package = "ELBOW")
	
    # Read in your CSV file
    csv_data <- read.csv(filename)

    # set the number of initial and final condition replicates both to three
    init_count  <- 3
    final_count <- 3

    # Parse the probes, intial conditions and final conditions
    # out of the CSV file.  Please see: extract_working_sets
    # for more information.
    #
    # init_count should be the number of columns associated with
    #       the initial conditions of the experiment.
    # final_count should be the number of columns associated with
    #       the final conditions of the experiment.
    working_sets <- extract_working_sets(csv_data, init_count, final_count)

    probes <- working_sets[[1]]
    initial_conditions <- working_sets[[2]]
    final_conditions <- working_sets[[3]]

    # Uncomment to output the plot to a PNG file (optional)
    # png(file="output_plot.png")

    # Analyze the elbow curve.
    sig <- analyze_elbow(probes, initial_conditions, final_conditions)
    
    # uncomment to write the significant probes to 'signprobes.csv'
    #write.table(sig,file="signprobes.csv",sep=",",row.names=FALSE)


###################################################
### code chunk number 2: load_limmma
###################################################
	####
	# Uncomment the lines below to install the necessary libraries:
	####
	#if (!requireNamespace("BiocManager", quietly=TRUE))
    	#install.packages("BiocManager")
	#BiocManager::install("GEOquery")
	#BiocManager::install("simpleaffy")
	#BiocManager::install("limma")
	#BiocManager::install("affyPLM")
	#BiocManager::install("RColorBrewer")
	####
	
	# retrieve the data from GEO
	library("GEOquery")
	getGEOSuppFiles("GSE20986")
	untar("GSE20986/GSE20986_RAW.tar", exdir="data")
	cels <- list.files("data/", pattern = "[gz]")
	sapply(paste("data", cels, sep="/"), gunzip)
	
	# reading in the CEL data into R (w/simpleaffy)
	library("simpleaffy")
	pheno_data <- data.frame(
		c(    # Name
			"GSM524662.CEL", "GSM524663.CEL", "GSM524664.CEL",
			"GSM524665.CEL", "GSM524666.CEL", "GSM524667.CEL",
			"GSM524668.CEL", "GSM524669.CEL", "GSM524670.CEL",
			"GSM524671.CEL", "GSM524672.CEL", "GSM524673.CEL"),
		c(    # FileName
			"GSM524662.CEL", "GSM524663.CEL", "GSM524664.CEL",
			"GSM524665.CEL", "GSM524666.CEL", "GSM524667.CEL",
			"GSM524668.CEL", "GSM524669.CEL", "GSM524670.CEL",
			"GSM524671.CEL", "GSM524672.CEL", "GSM524673.CEL"),
		c(    # Target
			"iris", "retina", "retina",
			"iris", "retina", "iris",
			"choroid", "choroid", "choroid",
			"huvec", "huvec", "huvec")
		)
	colnames(pheno_data) <- c("Name", "FileName", "Target")
	write.table(pheno_data, "data/phenodata.txt", row.names=FALSE, quote=FALSE)
	celfiles <- read.affy(covdesc="phenodata.txt", path="data")
	
	# normalizing the data
	celfiles.gcrma <- gcrma(celfiles)


###################################################
### code chunk number 3: limma_chip_qc
###################################################
	########
	# quality control checks (chip-level)
	########
	# load colour libraries
	library("RColorBrewer")
	
	# set colour palette
	cols <- brewer.pal(8, "Set1")
	
	# plot a boxplot of unnormalised intensity values
	boxplot(celfiles, col=cols)
	
	# plot a boxplot of normalised intensity values, affyPLM is required to interrogate celfiles.gcrma
	library("affyPLM")
	boxplot(celfiles.gcrma, col=cols)
	
	# the boxplots are somewhat skewed by the normalisation algorithm
	# and it is often more informative to look at density plots
	# Plot a density vs log intensity histogram for the unnormalised data
	hist(celfiles, col=cols)
	
	# Plot a density vs log intensity histogram for the normalised data
	hist(celfiles.gcrma, col=cols)
	########


###################################################
### code chunk number 4: limma_probe_qc
###################################################
		########
		# quality control checks (probe-level)
		########
		# Perform probe-level metric calculations on the CEL files:
		celfiles.qc <- fitPLM(celfiles)
		
		# Create an image of GSM24662.CEL:
		image(celfiles.qc, which=1, add.legend=TRUE)
		
		# Create an image of GSM524665.CEL
		# There is a spatial artifact present
		image(celfiles.qc, which=4, add.legend=TRUE)
		
		# affyPLM also provides more informative boxplots
		# RLE (Relative Log Expression) plots should have
		# values close to zero. GSM524665.CEL is an outlier
		RLE(celfiles.qc, main="RLE")
		
		# We can also use NUSE (Normalised Unscaled Standard Errors).
		# The median standard error should be 1 for most genes.
		# GSM524665.CEL appears to be an outlier on this plot too
		NUSE(celfiles.qc, main="NUSE")#'
		########


###################################################
### code chunk number 5: limma_sample_qc
###################################################
	########
	# quality control checks (sample vs. sample)
	########
	eset <- exprs(celfiles.gcrma)
	distance <- dist(t(eset),method="maximum")
	clusters <- hclust(distance)
	plot(clusters)
	########


###################################################
### code chunk number 6: limma_data_filter
###################################################
	########
	# data filtering
	########
	celfiles.filtered <- nsFilter(celfiles.gcrma, require.entrez=FALSE, remove.dupEntrez=FALSE)
	# What got removed and why?
	celfiles.filtered$filter.log


###################################################
### code chunk number 7: limma_extract_diffexp
###################################################
	########
	# Finding differentially expressed probesets
	########
	samples <- celfiles.gcrma$Target

	# check the results of this
	#samples

	# convert into factors
	samples <- as.factor(samples)

	# check factors have been assigned
	#samples

	# set up the experimental design
	design <- model.matrix(~0 + samples)
	colnames(design) <- c("choroid", "huvec", "iris", "retina")

	# inspect the experiment design
	#design


###################################################
### code chunk number 8: feed_limma
###################################################
	########
	# feed the data into limma for analysis
	########
	library("limma")

	# fit the linear model to the filtered expression set
	fit <- lmFit(exprs(celfiles.filtered$eset), design)

	# set up a contrast matrix to compare tissues v cell line
	contrast.matrix <- makeContrasts(huvec_choroid = huvec - choroid, huvec_retina = huvec - retina, huvec_iris <- huvec - iris, levels=design)

	# Now the contrast matrix is combined with the per-probeset linear model fit.
	huvec_fits <- contrasts.fit(fit, contrast.matrix)
	huvec_ebFit <- eBayes(huvec_fits)


###################################################
### code chunk number 9: elbow_limma
###################################################
	# load the ELBOW library
	library("ELBOW")

	# find the ELBOW limits	
	get_elbow_limma(huvec_ebFit)


###################################################
### code chunk number 10: load_deseq
###################################################
	# load the ELBOW library
	library("ELBOW")
	
	# install the DESeq libraries
	#if (!requireNamespace("BiocManager", quietly=TRUE))
    	#install.packages("BiocManager")
	#BiocManager::install("DESeq")

	## download the table
	library("DESeq")
	
	# the following bam file dataset was obtained from:
	# http://cgrlucb.wikispaces.com/file/view/yeast_sample_data.txt
	# it has been downloaded into this package for speed convenience.
	filename <- system.file("extdata", "yeast_sample_data.txt", package = "ELBOW")
	
	count_table <- read.table(filename, header=TRUE, sep="\t", row.names=1)
	expt_design <- data.frame(row.names = colnames(count_table), condition = c("WE","WE","M","M","M"))
	conditions = expt_design$condition
	rnadata <- newCountDataSet(count_table, conditions)
	rnadata <- estimateSizeFactors(rnadata)
	rnadata <- as(rnadata, "CountDataSet")
	## rnadata <- estimateVarianceFunctions(rnadata)
	rnadata <- estimateDispersions(rnadata)
	
	# this next step is essential, but it takes a long time...
	results <- nbinomTest(rnadata, "M", "WE")


###################################################
### code chunk number 11: get_elbow_deseq
###################################################
	limits <- do_elbow_rnaseq(results)


###################################################
### code chunk number 12: plot_elbow_deseq
###################################################
	# plot the results w/elbow
	plot_dataset(results, "log2FoldChange", limits$up_limit, limits$low_limit)