## ----setup, include=FALSE--------------------------------------------------
#set knitr chunk options
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

#load packages to avoid startup messages later in the code
suppressPackageStartupMessages({library(SingscoreAMLMutations)})
library(ggplot2)
library(TCGAbiolinks)
library(SummarizedExperiment)
library(edgeR)
library(rtracklayer)
library(plyr)
library(org.Hs.eg.db)
library(GSEABase)
library(singscore)
library(reshape2)
library(gridExtra)
library(dcanr)
library(mclust)

#ggplot theme
rl = 1.2
current_theme = theme_minimal() +
  theme(
    panel.border = element_rect(colour = 'black', fill = NA),
    panel.grid.minor = element_blank(),
    axis.title = element_text(size = rel(rl) * 1.1),
    axis.text = element_text(size = rel(rl)),
    plot.title = element_text(size = rel(rl)),
    strip.background = element_rect(fill = NA, colour = 'black'),
    strip.text = element_text(size = rel(rl)),
    legend.text = element_text(size = rel(rl)),
    legend.title = element_text(size = rel(rl), face = 'italic'),
    legend.position = 'bottom',
    legend.direction = 'horizontal'
  )

#automatically create a bib database for R packages
knitr::write_bib(c(
  .packages(), 'knitr', 'rmarkdown', 'BiocStyle'
), 'packages.bib')

## ----gdc_query-------------------------------------------------------------
library(SingscoreAMLMutations)
library(TCGAbiolinks)

#get GDC version information
gdc_info = getGDCInfo()
gdc_info

## ----gdc_results-----------------------------------------------------------
#form a query for the RNAseq data
query_rna = GDCquery(
  #getGDCprojects()
  project = 'TCGA-LAML',
  #TCGAbiolinks:::getProjectSummary('TCGA-LAML')
  data.category = 'Transcriptome Profiling',
  data.type = 'Gene Expression Quantification',
  workflow.type = 'HTSeq - Counts'
)

#extract results of the query
rnaseq_res = getResults(query_rna)
dim(rnaseq_res)
colnames(rnaseq_res)

## ----gdc_download, results='hide'------------------------------------------
datapath = file.path(tempdir(), 'GDCdata')
GDCdownload(query_rna, directory = datapath) #(size: 39MB)

## ----gdc_prepare, results='hide'-------------------------------------------
aml_se = GDCprepare(query_rna, directory = datapath)

## ----show_se---------------------------------------------------------------
aml_se

## ----remove_dups-----------------------------------------------------------
library(SummarizedExperiment)
library(edgeR)

aml_dge = DGEList(counts = assay(aml_se), genes = rowData(aml_se))

## ----plot-hist-filtering, fig.wide=TRUE, fig.cap="Histogram of logCPM values for the AML data before and after filtering. Filtering results in fewer zeros in the data. Most genes with CPM less than 1, logCPM < 0; (red line) across the majority of samples get discarded, resulting in an approximately log-normal distribution."----
prop_expressed = rowMeans(cpm(aml_dge) > 1)
keep = prop_expressed > 0.5

op = par(no.readonly = TRUE)
par(mfrow = c(1, 2))
hist(cpm(aml_dge, log = TRUE), main = 'Unfiltered', xlab = 'logCPM')
abline(v = log(1), lty = 2, col = 2)
hist(cpm(aml_dge[keep, ], log = TRUE), main = 'Filtered', xlab = 'logCPM')
abline(v = log(1), lty = 2, col = 2)
par(op)

## ----remove_low_counts-----------------------------------------------------
#subset the data
aml_dge = aml_dge[keep, , keep.lib.sizes = FALSE]
aml_se = aml_se[keep, ]

## ----download_gencode, results='hide'--------------------------------------
#download v22 of the GENCODE annotation
gencode_file = 'gencode.v22.annotation.gtf.gz'
gencode_link = paste(
  'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_22',
  gencode_file,
  sep = '/'
  )
download.file(gencode_link, gencode_file, method = 'libcurl') #(size: 39MB)

## ----compute_gene_lengths--------------------------------------------------
library(rtracklayer)
library(plyr)

gtf = import.gff(gencode_file, format = 'gtf', genome = 'GRCm38.71', feature.type = 'exon')
#split records by gene to group exons of the same gene
grl = reduce(split(gtf, elementMetadata(gtf)$gene_id))
gene_lengths = ldply(grl, function(x) {
  #sum up the length of individual exons
	return(c('gene_length' = sum(width(x))))
}, .id = 'ensembl_gene_id')

## ----add_biotype-----------------------------------------------------------
#extract information on gene biotype
genetype = unique(elementMetadata(gtf)[, c('gene_id', 'gene_type')])
colnames(genetype)[1] = 'ensembl_gene_id'
gene_lengths = merge(genetype, gene_lengths)

#remove ENSEMBL ID version numbers
gene_lengths$ensembl_gene_id = gsub('\\.[0-9]*', '', gene_lengths$ensembl_gene_id)
saveRDS(gene_lengths, file = 'gene_lengths_HTSeq_gencodev22.rds')
gene_lengths

## ----add_length_annotation-------------------------------------------------
#allocate rownames for ease of indexing
rownames(gene_lengths) = gene_lengths$ensembl_gene_id
rowData(aml_se)$gene_length = gene_lengths[rownames(aml_se), 'gene_length']
rowData(aml_se)$gene_biotype = gene_lengths[rownames(aml_se), 'gene_type']

#annotate gene lengths for the DGE object
aml_dge$genes$length = gene_lengths[rownames(aml_dge), 'gene_length']

## ----compute_fpkm----------------------------------------------------------
aml_dge_tmm = calcNormFactors(aml_dge, method = 'TMM')

#compute FPKM values and append to assays
assay(aml_se, 'logFPKM_TMM') = rpkm(aml_dge_tmm, log = TRUE)
aml_se

## ----preproc_mutations, eval=FALSE, include=FALSE--------------------------
#  #preprocessing - read tsv (Joe's version) and convert save as RDS
#  mut_info = read.csv('PatientMutations.tsv', sep = '\t', colClasses = c('character', rep('logical', 4)))
#  rownames(mut_info) = mut_info$Barcode
#  mut_info = mut_info[, -1]
#  patients = mut_info[substring(colnames(aml_se), 1, 12), ]
#  rownames(patients) = colnames(aml_se)
#  saveRDS(patients, file = 'AMLPatientMutationsTCGA.rds')

## ----annotate_mutations----------------------------------------------------
data(AMLPatientMutationsTCGA)
patient_mutations = AMLPatientMutationsTCGA
patient_mutations = patient_mutations[colnames(aml_se), ] # order samples
aml_mutations = colnames(patient_mutations) # get mutation labels
colData(aml_se) = cbind(colData(aml_se), patient_mutations)
colData(aml_se)[, aml_mutations]

## ----map_ensembl_entrez----------------------------------------------------
library(org.Hs.eg.db)

rowData(aml_se)$entrezgene = mapIds(
  org.Hs.eg.db,
  keys = rownames(aml_se),
  keytype = 'ENSEMBL',
  column = 'ENTREZID',
  multiVals = 'asNA'
  )
gene_annot = rowData(aml_se)

## ----discard_multimapped_genes---------------------------------------------
#select genes with mapped Entrez IDs
keep = !is.na(gene_annot$entrezgene)

#select genes with unique Entrez IDs
dup_entrez = gene_annot$entrezgene[duplicated(gene_annot$entrezgene)]
keep = keep & !gene_annot$entrezgene %in% dup_entrez

#Biotype of discarded genes (due to non-unique mapping)
head(sort(table(gene_annot[!keep, 'gene_biotype']), decreasing = TRUE), n = 10)

#subset the data
aml_se = aml_se[keep, ]

## ----signature_names-------------------------------------------------------
#create signature names
verhaak_names = paste('VERHAAK_AML_WITH_NPM1_MUTATED', c('UP', 'DN'), sep = '_')
verhaak_names

## ----download_signatures, results='hide'-----------------------------------
#generate URLs
verhaak_links = paste0(
  'http://software.broadinstitute.org/gsea/msigdb/download_geneset.jsp?geneSetName=',
  verhaak_names,
  '&fileType=xml'
  )

#download files
verhaak_files = paste0(verhaak_names, '.xml')
mapply(download.file, verhaak_links, verhaak_files, method = 'libcurl')

## ----read_signatures-------------------------------------------------------
library(GSEABase)

verhaak_sigs = getBroadSets(verhaak_files, membersId = 'MEMBERS_EZID')
verhaak_sigs

## ----rows_to_entrez--------------------------------------------------------
rownames(aml_se) = rowData(aml_se)$entrezgene

## ----compute_ranks_aml-----------------------------------------------------
library(singscore)

#apply the rankGenes method to each version of the dataset, excluding counts
aml_ranked = rankGenes(assay(aml_se, 'logFPKM_TMM'))

## ----compute_scores_verhaak, warning=TRUE----------------------------------
#apply the scoring function
verhaak_scores = simpleScore(aml_ranked,
                             upSet = verhaak_sigs[[1]],
                             downSet = verhaak_sigs[[2]])

## --------------------------------------------------------------------------
head(verhaak_scores)

## ----plot-dispersion, fig.wide=TRUE, fig.height=4, fig.cap="NPM1c signature scores from the up- and down-regulated gene sets, both combined (Total) and independently, can separate samples with NPM1c mutations or MLL fusions/PTDs from samples not bearing these genomic lesions. Scores are plotted against the median absolute deviation (MAD) of the ranks of genes in each gene set forming the NPM1c signature. The plots also reveal that scores close to 0 result in higher MADs."----

#relative size of text in the figure
relSize = 1.2

#create annotation
mutated_gene = rep('Other', ncol(aml_se))
mutated_gene[aml_se$NPM1c.Mut] = 'NPM1c Mut'
mutated_gene[aml_se$KMT2A.Fusion | aml_se$KMT2A.PTD] = 'MLL Fusion/PTD'
p1 = plotDispersion(verhaak_scores, annot = mutated_gene, textSize = relSize)
p1

## ----plot-rank-density, fig.wide=TRUE, fig.height=4, fig.cap="Rank distribution of genes reveals that expected up-regulated genes are up-regulated in high scoring samples and down-regulated in low-scoring samples with a similar, but inverse, observation for the expected down-regulated genes. Samples with the highest score, lowest score and highest dispersion (MAD) are show from left to right. The barcode plot for ranks of genes in each signature along with the density of the ranks is shown in each plot. Up-regulated genes are represented in green and down-regulated genes in purple. Gaussian distributions are observed at the extremes for the lowest and highest scores. High dispersion results in scores close to 0 because all genes are either evenly distributed (down-regulated gene set) or bi-modally distributed at the extrema (up-regulated gene set)"----
library(gridExtra)

#select samples with the properties required
select_samples = c(
  'Max Total Score' = which.max(verhaak_scores$TotalScore),
  'Min Total Score' = which.min(verhaak_scores$TotalScore),
  'Max Dispersion' = which.max(verhaak_scores$TotalDispersion)
  )

#plotRankDensity applied to each sample
rank_plots = lapply(names(select_samples), function(x) {
  #get the sample index
  aml_sample = select_samples[x]
  #drop = FALSE is required to ensure the data.frame is intact
  p1 = plotRankDensity(rankData = aml_ranked[, aml_sample, drop = FALSE],
                       upSet = verhaak_sigs[[1]],
                       downSet = verhaak_sigs[[2]],
                       textSize = relSize)
  
  #overwrite title of the plot with the description of the sample
  #this is possible because singscore uses ggplot2
  p1 = p1 + ggtitle(paste(x,  mutated_gene[aml_sample], sep = '\n')) +
    guides(colour = guide_legend(ncol = 1))
  
  return(p1)
})

#create a multipanel plot
grid.arrange(grobs = rank_plots, nrow = 1)

## ----create_plotdf_boxplot-------------------------------------------------
#create a dataframe with the data required: scores and mutations
scoredf = as.data.frame(colData(aml_se)[, aml_mutations])
scoredf$Score = verhaak_scores$TotalScore
scoredf$Dispersion = verhaak_scores$TotalDispersion

## ----plot-boxplot-mutscores, fig.height=5, fig.cap="NPM1c signature scores are able to separate mutants from wild-types. Boxplot of scores from the NPM1c signature split by different types of mutations."----
#restructure the data for ploting
plotdf = melt(
  scoredf,
  id.var = c('Score', 'Dispersion'),
  variable.name = 'Mutation',
  value.name = 'Status'
  )
#convert TRUE-FALSE values to Mut-WT
plotdf$Status = factor(plotdf$Status, labels = c('WT', 'Mut'))
p1 = ggplot(plotdf, aes(Mutation, Score, fill = Status)) +
  geom_boxplot(position = 'dodge', alpha = 0.6) +
  scale_fill_brewer(palette = 'Set2') +
  current_theme +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
p1

## ----predict_mutation_glm--------------------------------------------------
#fit GLMs to each mutation
glms = lapply(aml_mutations, function(x) {
  #generate a formula for the fit
  form = as.formula(paste0(x, ' ~ Score'))
  glm1 = glm(form, data = scoredf, family = binomial(link = 'logit'))
  return(glm1)
})
names(glms) = aml_mutations

#extract coefficients
coefs = lapply(glms, function(x) coef(summary(x)))
ldply(coefs, function(x) x[2, ], .id = 'Mutation')

## ----eval_prediction_performance-------------------------------------------
library(dcanr)

#assess sensitivity and specificity
prec_rec = ldply(glms, function(glm1) {
  #predict mutations for the data used in training and convert to binary format
  prediction = as.numeric(predict(glm1) > 0)
  observed = glm1$y
  prec = performanceMeasure(prediction, observed, 'precision')
  recall = performanceMeasure(prediction, observed, 'recall')
  f1 = performanceMeasure(prediction, observed)
  return(c('Precision' = prec, 'Recall' = recall, 'F1' = f1))
}, .id = 'Mutation')
prec_rec

## ----predict_with_mads-----------------------------------------------------
#include dispersion in the model
glm_npm1c = glm('NPM1c.Mut ~ Score + Dispersion',
               data = scoredf,
               family = binomial(link = 'logit'))

#evaluate performance of the new model
prediction = as.numeric(predict(glm_npm1c) > 0)
observed = glm_npm1c$y
c(
  'Precision' = performanceMeasure(prediction, observed, 'precision'),
  'Recall' = performanceMeasure(prediction, observed, 'recall'),
  'F1' = performanceMeasure(prediction, observed)
  )

## ----classify_mclust, message=FALSE, warning=FALSE-------------------------
library(mclust)

#Gaussian mixture model
m1 = Mclust(scoredf$Score, G = 2, verbose = FALSE)
#k-means clustering
m2 = kmeans(scoredf[, 5:6], centers = 2, nstart = 100)
#hierarchical clustering
m3 = hclust(dist(scoredf[, 5:6]))

mutation_inference = cbind(
  'GLM' = prediction,
  'mclust' = m1$classification,
  'k-means' = m2$cluster,
  'hclust' = cutree(m3, k = 2)
)
apply(mutation_inference, 2, adjustedRandIndex, scoredf$NPM1c.Mut)

## ----process_rossmll_sigs--------------------------------------------------
#create signature names
rossmll_name = 'ROSS_AML_WITH_MLL_FUSIONS'
#generate URLs
rossmll_link = paste0(
  'http://software.broadinstitute.org/gsea/msigdb/download_geneset.jsp?geneSetName=',
  rossmll_name,
  '&fileType=xml'
  )

#download files
rossmll_file = paste0(rossmll_name, '.xml')
download.file(rossmll_link, rossmll_file, method = 'libcurl')
rossmll_sig = getBroadSets(rossmll_file, membersId = 'MEMBERS_EZID')
rossmll_sig

## ----score_rossmll---------------------------------------------------------
rossmll_scores = simpleScore(aml_ranked, rossmll_sig[[1]], knownDirection = FALSE)

## ----plot-signature-landscape, fig.height=7, fig.width=7, fig.cap="Signature landscape between the MLL fusion and NPM1c signatures. A positive association is revealed between the two signatures in AML."----
p_mll_npm1c = plotScoreLandscape(
  verhaak_scores,
  rossmll_scores,
  scorenames = c('VERHAAK_AML_WITH_NPM1_MUTATED', 'ROSS_AML_WITH_MLL_FUSIONS'),
  textSize = relSize
  )
p_mll_npm1c

## ----new_annotations-------------------------------------------------------
#new annotation - modify previously used annotations
mutated_gene[aml_se$KMT2A.Fusion] = 'MLL Fusion'
mutated_gene[aml_se$KMT2A.PTD] = 'MLL PTD'
mutated_gene[aml_se$PML.RARA] = 'PML-RARA'

## ----plot-project-mll, fig.height=7.5, fig.width=7, fig.cap="The landscape reveals a distinction between the MLL fusions and PTDs as previously reported. Different mutations occupy different regions of the landscape, possessing different types of molecular traits."----
select_aml = !mutated_gene %in% 'Other'

#label samples with an mclust NPM1c classification uncertainty of > 0.3
label_samples = substr(rownames(verhaak_scores), 6, 12) #sample ID from barcodes
label_samples[m1$uncertainty < 0.3] = NA

#project mutations onto the landscape
p1 = projectScoreLandscape(
  p_mll_npm1c,
  verhaak_scores,
  rossmll_scores,
  subSamples = select_aml,
  annot = mutated_gene[select_aml],
  sampleLabels = label_samples[select_aml]
)
p1 + theme(legend.box = 'vertical')

## ----plot-project-pmlrara, fig.height=7.5, fig.width=7, echo=FALSE, fig.cap="The PML-RARA signature is independent from the NPM1c signature in AML. The L-shaped landscape suggests the molecular mechanisms underlying these mutations are mutually exclusive."----
newsig_name = 'ROSS_AML_WITH_PML_RARA_FUSION'
#generate URL
newsig_link = paste0(
  'http://software.broadinstitute.org/gsea/msigdb/download_geneset.jsp?geneSetName=',
  newsig_name,
  '&fileType=xml'
)

#download files
newsig_file = paste0(newsig_name, '.xml')
download.file(newsig_link, newsig_file, method = 'libcurl')
newsig_sig = getBroadSets(newsig_file, membersId = 'MEMBERS_EZID')

#score
newsig_scores = simpleScore(aml_ranked, newsig_sig[[1]], knownDirection = FALSE)

#plot and project
p1 = plotScoreLandscape(
  verhaak_scores,
  newsig_scores,
  scorenames = c('VERHAAK_AML_WITH_NPM1_MUTATED', newsig_name),
  textSize = relSize
)

#project NPM1 mutations onto the landscape
mutated_gene = rep('Other', ncol(aml_se))
mutated_gene[aml_se$NPM1c.Mut] = 'NPM1c Mut'
mutated_gene[aml_se$KMT2A.Fusion] = 'MLL Fusion'
mutated_gene[aml_se$KMT2A.PTD] = 'MLL PTD'
mutated_gene[aml_se$PML.RARA] = 'PML-RARA'
select_aml = !mutated_gene %in% 'Other'

p2 = projectScoreLandscape(p1,
                           verhaak_scores[, 1:2],
                           newsig_scores,
                           subSamples = select_aml,
                           annot = as.factor(mutated_gene[select_aml]))
p2 + theme(legend.box = 'vertical')

## ----session_info----------------------------------------------------------
sessionInfo()