## ----setup, include=FALSE-------------------------------------------------- #set knitr chunk options knitr::opts_chunk$set(warning = FALSE, message = FALSE) #load packages to avoid startup messages later in the code suppressPackageStartupMessages({library(SingscoreAMLMutations)}) library(ggplot2) library(TCGAbiolinks) library(SummarizedExperiment) library(edgeR) library(rtracklayer) library(plyr) library(org.Hs.eg.db) library(GSEABase) library(singscore) library(reshape2) library(gridExtra) library(dcanr) library(mclust) #ggplot theme rl = 1.2 current_theme = theme_minimal() + theme( panel.border = element_rect(colour = 'black', fill = NA), panel.grid.minor = element_blank(), axis.title = element_text(size = rel(rl) * 1.1), axis.text = element_text(size = rel(rl)), plot.title = element_text(size = rel(rl)), strip.background = element_rect(fill = NA, colour = 'black'), strip.text = element_text(size = rel(rl)), legend.text = element_text(size = rel(rl)), legend.title = element_text(size = rel(rl), face = 'italic'), legend.position = 'bottom', legend.direction = 'horizontal' ) #automatically create a bib database for R packages knitr::write_bib(c( .packages(), 'knitr', 'rmarkdown', 'BiocStyle' ), 'packages.bib') ## ----gdc_query------------------------------------------------------------- library(SingscoreAMLMutations) library(TCGAbiolinks) #get GDC version information gdc_info = getGDCInfo() gdc_info ## ----gdc_results----------------------------------------------------------- #form a query for the RNAseq data query_rna = GDCquery( #getGDCprojects() project = 'TCGA-LAML', #TCGAbiolinks:::getProjectSummary('TCGA-LAML') data.category = 'Transcriptome Profiling', data.type = 'Gene Expression Quantification', workflow.type = 'HTSeq - Counts' ) #extract results of the query rnaseq_res = getResults(query_rna) dim(rnaseq_res) colnames(rnaseq_res) ## ----gdc_download, results='hide'------------------------------------------ datapath = file.path(tempdir(), 'GDCdata') GDCdownload(query_rna, directory = datapath) #(size: 39MB) ## ----gdc_prepare, results='hide'------------------------------------------- aml_se = GDCprepare(query_rna, directory = datapath) ## ----show_se--------------------------------------------------------------- aml_se ## ----remove_dups----------------------------------------------------------- library(SummarizedExperiment) library(edgeR) aml_dge = DGEList(counts = assay(aml_se), genes = rowData(aml_se)) ## ----plot-hist-filtering, fig.wide=TRUE, fig.cap="Histogram of logCPM values for the AML data before and after filtering. Filtering results in fewer zeros in the data. Most genes with CPM less than 1, logCPM < 0; (red line) across the majority of samples get discarded, resulting in an approximately log-normal distribution."---- prop_expressed = rowMeans(cpm(aml_dge) > 1) keep = prop_expressed > 0.5 op = par(no.readonly = TRUE) par(mfrow = c(1, 2)) hist(cpm(aml_dge, log = TRUE), main = 'Unfiltered', xlab = 'logCPM') abline(v = log(1), lty = 2, col = 2) hist(cpm(aml_dge[keep, ], log = TRUE), main = 'Filtered', xlab = 'logCPM') abline(v = log(1), lty = 2, col = 2) par(op) ## ----remove_low_counts----------------------------------------------------- #subset the data aml_dge = aml_dge[keep, , keep.lib.sizes = FALSE] aml_se = aml_se[keep, ] ## ----download_gencode, results='hide'-------------------------------------- #download v22 of the GENCODE annotation gencode_file = 'gencode.v22.annotation.gtf.gz' gencode_link = paste( 'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_22', gencode_file, sep = '/' ) download.file(gencode_link, gencode_file, method = 'libcurl') #(size: 39MB) ## ----compute_gene_lengths-------------------------------------------------- library(rtracklayer) library(plyr) gtf = import.gff(gencode_file, format = 'gtf', genome = 'GRCm38.71', feature.type = 'exon') #split records by gene to group exons of the same gene grl = reduce(split(gtf, elementMetadata(gtf)$gene_id)) gene_lengths = ldply(grl, function(x) { #sum up the length of individual exons return(c('gene_length' = sum(width(x)))) }, .id = 'ensembl_gene_id') ## ----add_biotype----------------------------------------------------------- #extract information on gene biotype genetype = unique(elementMetadata(gtf)[, c('gene_id', 'gene_type')]) colnames(genetype)[1] = 'ensembl_gene_id' gene_lengths = merge(genetype, gene_lengths) #remove ENSEMBL ID version numbers gene_lengths$ensembl_gene_id = gsub('\\.[0-9]*', '', gene_lengths$ensembl_gene_id) saveRDS(gene_lengths, file = 'gene_lengths_HTSeq_gencodev22.rds') gene_lengths ## ----add_length_annotation------------------------------------------------- #allocate rownames for ease of indexing rownames(gene_lengths) = gene_lengths$ensembl_gene_id rowData(aml_se)$gene_length = gene_lengths[rownames(aml_se), 'gene_length'] rowData(aml_se)$gene_biotype = gene_lengths[rownames(aml_se), 'gene_type'] #annotate gene lengths for the DGE object aml_dge$genes$length = gene_lengths[rownames(aml_dge), 'gene_length'] ## ----compute_fpkm---------------------------------------------------------- aml_dge_tmm = calcNormFactors(aml_dge, method = 'TMM') #compute FPKM values and append to assays assay(aml_se, 'logFPKM_TMM') = rpkm(aml_dge_tmm, log = TRUE) aml_se ## ----preproc_mutations, eval=FALSE, include=FALSE-------------------------- # #preprocessing - read tsv (Joe's version) and convert save as RDS # mut_info = read.csv('PatientMutations.tsv', sep = '\t', colClasses = c('character', rep('logical', 4))) # rownames(mut_info) = mut_info$Barcode # mut_info = mut_info[, -1] # patients = mut_info[substring(colnames(aml_se), 1, 12), ] # rownames(patients) = colnames(aml_se) # saveRDS(patients, file = 'AMLPatientMutationsTCGA.rds') ## ----annotate_mutations---------------------------------------------------- data(AMLPatientMutationsTCGA) patient_mutations = AMLPatientMutationsTCGA patient_mutations = patient_mutations[colnames(aml_se), ] # order samples aml_mutations = colnames(patient_mutations) # get mutation labels colData(aml_se) = cbind(colData(aml_se), patient_mutations) colData(aml_se)[, aml_mutations] ## ----map_ensembl_entrez---------------------------------------------------- library(org.Hs.eg.db) rowData(aml_se)$entrezgene = mapIds( org.Hs.eg.db, keys = rownames(aml_se), keytype = 'ENSEMBL', column = 'ENTREZID', multiVals = 'asNA' ) gene_annot = rowData(aml_se) ## ----discard_multimapped_genes--------------------------------------------- #select genes with mapped Entrez IDs keep = !is.na(gene_annot$entrezgene) #select genes with unique Entrez IDs dup_entrez = gene_annot$entrezgene[duplicated(gene_annot$entrezgene)] keep = keep & !gene_annot$entrezgene %in% dup_entrez #Biotype of discarded genes (due to non-unique mapping) head(sort(table(gene_annot[!keep, 'gene_biotype']), decreasing = TRUE), n = 10) #subset the data aml_se = aml_se[keep, ] ## ----signature_names------------------------------------------------------- #create signature names verhaak_names = paste('VERHAAK_AML_WITH_NPM1_MUTATED', c('UP', 'DN'), sep = '_') verhaak_names ## ----download_signatures, results='hide'----------------------------------- #generate URLs verhaak_links = paste0( 'http://software.broadinstitute.org/gsea/msigdb/download_geneset.jsp?geneSetName=', verhaak_names, '&fileType=xml' ) #download files verhaak_files = paste0(verhaak_names, '.xml') mapply(download.file, verhaak_links, verhaak_files, method = 'libcurl') ## ----read_signatures------------------------------------------------------- library(GSEABase) verhaak_sigs = getBroadSets(verhaak_files, membersId = 'MEMBERS_EZID') verhaak_sigs ## ----rows_to_entrez-------------------------------------------------------- rownames(aml_se) = rowData(aml_se)$entrezgene ## ----compute_ranks_aml----------------------------------------------------- library(singscore) #apply the rankGenes method to each version of the dataset, excluding counts aml_ranked = rankGenes(assay(aml_se, 'logFPKM_TMM')) ## ----compute_scores_verhaak, warning=TRUE---------------------------------- #apply the scoring function verhaak_scores = simpleScore(aml_ranked, upSet = verhaak_sigs[[1]], downSet = verhaak_sigs[[2]]) ## -------------------------------------------------------------------------- head(verhaak_scores) ## ----plot-dispersion, fig.wide=TRUE, fig.height=4, fig.cap="NPM1c signature scores from the up- and down-regulated gene sets, both combined (Total) and independently, can separate samples with NPM1c mutations or MLL fusions/PTDs from samples not bearing these genomic lesions. Scores are plotted against the median absolute deviation (MAD) of the ranks of genes in each gene set forming the NPM1c signature. The plots also reveal that scores close to 0 result in higher MADs."---- #relative size of text in the figure relSize = 1.2 #create annotation mutated_gene = rep('Other', ncol(aml_se)) mutated_gene[aml_se$NPM1c.Mut] = 'NPM1c Mut' mutated_gene[aml_se$KMT2A.Fusion | aml_se$KMT2A.PTD] = 'MLL Fusion/PTD' p1 = plotDispersion(verhaak_scores, annot = mutated_gene, textSize = relSize) p1 ## ----plot-rank-density, fig.wide=TRUE, fig.height=4, fig.cap="Rank distribution of genes reveals that expected up-regulated genes are up-regulated in high scoring samples and down-regulated in low-scoring samples with a similar, but inverse, observation for the expected down-regulated genes. Samples with the highest score, lowest score and highest dispersion (MAD) are show from left to right. The barcode plot for ranks of genes in each signature along with the density of the ranks is shown in each plot. Up-regulated genes are represented in green and down-regulated genes in purple. Gaussian distributions are observed at the extremes for the lowest and highest scores. High dispersion results in scores close to 0 because all genes are either evenly distributed (down-regulated gene set) or bi-modally distributed at the extrema (up-regulated gene set)"---- library(gridExtra) #select samples with the properties required select_samples = c( 'Max Total Score' = which.max(verhaak_scores$TotalScore), 'Min Total Score' = which.min(verhaak_scores$TotalScore), 'Max Dispersion' = which.max(verhaak_scores$TotalDispersion) ) #plotRankDensity applied to each sample rank_plots = lapply(names(select_samples), function(x) { #get the sample index aml_sample = select_samples[x] #drop = FALSE is required to ensure the data.frame is intact p1 = plotRankDensity(rankData = aml_ranked[, aml_sample, drop = FALSE], upSet = verhaak_sigs[[1]], downSet = verhaak_sigs[[2]], textSize = relSize) #overwrite title of the plot with the description of the sample #this is possible because singscore uses ggplot2 p1 = p1 + ggtitle(paste(x, mutated_gene[aml_sample], sep = '\n')) + guides(colour = guide_legend(ncol = 1)) return(p1) }) #create a multipanel plot grid.arrange(grobs = rank_plots, nrow = 1) ## ----create_plotdf_boxplot------------------------------------------------- #create a dataframe with the data required: scores and mutations scoredf = as.data.frame(colData(aml_se)[, aml_mutations]) scoredf$Score = verhaak_scores$TotalScore scoredf$Dispersion = verhaak_scores$TotalDispersion ## ----plot-boxplot-mutscores, fig.height=5, fig.cap="NPM1c signature scores are able to separate mutants from wild-types. Boxplot of scores from the NPM1c signature split by different types of mutations."---- #restructure the data for ploting plotdf = melt( scoredf, id.var = c('Score', 'Dispersion'), variable.name = 'Mutation', value.name = 'Status' ) #convert TRUE-FALSE values to Mut-WT plotdf$Status = factor(plotdf$Status, labels = c('WT', 'Mut')) p1 = ggplot(plotdf, aes(Mutation, Score, fill = Status)) + geom_boxplot(position = 'dodge', alpha = 0.6) + scale_fill_brewer(palette = 'Set2') + current_theme + theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) p1 ## ----predict_mutation_glm-------------------------------------------------- #fit GLMs to each mutation glms = lapply(aml_mutations, function(x) { #generate a formula for the fit form = as.formula(paste0(x, ' ~ Score')) glm1 = glm(form, data = scoredf, family = binomial(link = 'logit')) return(glm1) }) names(glms) = aml_mutations #extract coefficients coefs = lapply(glms, function(x) coef(summary(x))) ldply(coefs, function(x) x[2, ], .id = 'Mutation') ## ----eval_prediction_performance------------------------------------------- library(dcanr) #assess sensitivity and specificity prec_rec = ldply(glms, function(glm1) { #predict mutations for the data used in training and convert to binary format prediction = as.numeric(predict(glm1) > 0) observed = glm1$y prec = performanceMeasure(prediction, observed, 'precision') recall = performanceMeasure(prediction, observed, 'recall') f1 = performanceMeasure(prediction, observed) return(c('Precision' = prec, 'Recall' = recall, 'F1' = f1)) }, .id = 'Mutation') prec_rec ## ----predict_with_mads----------------------------------------------------- #include dispersion in the model glm_npm1c = glm('NPM1c.Mut ~ Score + Dispersion', data = scoredf, family = binomial(link = 'logit')) #evaluate performance of the new model prediction = as.numeric(predict(glm_npm1c) > 0) observed = glm_npm1c$y c( 'Precision' = performanceMeasure(prediction, observed, 'precision'), 'Recall' = performanceMeasure(prediction, observed, 'recall'), 'F1' = performanceMeasure(prediction, observed) ) ## ----classify_mclust, message=FALSE, warning=FALSE------------------------- library(mclust) #Gaussian mixture model m1 = Mclust(scoredf$Score, G = 2, verbose = FALSE) #k-means clustering m2 = kmeans(scoredf[, 5:6], centers = 2, nstart = 100) #hierarchical clustering m3 = hclust(dist(scoredf[, 5:6])) mutation_inference = cbind( 'GLM' = prediction, 'mclust' = m1$classification, 'k-means' = m2$cluster, 'hclust' = cutree(m3, k = 2) ) apply(mutation_inference, 2, adjustedRandIndex, scoredf$NPM1c.Mut) ## ----process_rossmll_sigs-------------------------------------------------- #create signature names rossmll_name = 'ROSS_AML_WITH_MLL_FUSIONS' #generate URLs rossmll_link = paste0( 'http://software.broadinstitute.org/gsea/msigdb/download_geneset.jsp?geneSetName=', rossmll_name, '&fileType=xml' ) #download files rossmll_file = paste0(rossmll_name, '.xml') download.file(rossmll_link, rossmll_file, method = 'libcurl') rossmll_sig = getBroadSets(rossmll_file, membersId = 'MEMBERS_EZID') rossmll_sig ## ----score_rossmll--------------------------------------------------------- rossmll_scores = simpleScore(aml_ranked, rossmll_sig[[1]], knownDirection = FALSE) ## ----plot-signature-landscape, fig.height=7, fig.width=7, fig.cap="Signature landscape between the MLL fusion and NPM1c signatures. A positive association is revealed between the two signatures in AML."---- p_mll_npm1c = plotScoreLandscape( verhaak_scores, rossmll_scores, scorenames = c('VERHAAK_AML_WITH_NPM1_MUTATED', 'ROSS_AML_WITH_MLL_FUSIONS'), textSize = relSize ) p_mll_npm1c ## ----new_annotations------------------------------------------------------- #new annotation - modify previously used annotations mutated_gene[aml_se$KMT2A.Fusion] = 'MLL Fusion' mutated_gene[aml_se$KMT2A.PTD] = 'MLL PTD' mutated_gene[aml_se$PML.RARA] = 'PML-RARA' ## ----plot-project-mll, fig.height=7.5, fig.width=7, fig.cap="The landscape reveals a distinction between the MLL fusions and PTDs as previously reported. Different mutations occupy different regions of the landscape, possessing different types of molecular traits."---- select_aml = !mutated_gene %in% 'Other' #label samples with an mclust NPM1c classification uncertainty of > 0.3 label_samples = substr(rownames(verhaak_scores), 6, 12) #sample ID from barcodes label_samples[m1$uncertainty < 0.3] = NA #project mutations onto the landscape p1 = projectScoreLandscape( p_mll_npm1c, verhaak_scores, rossmll_scores, subSamples = select_aml, annot = mutated_gene[select_aml], sampleLabels = label_samples[select_aml] ) p1 + theme(legend.box = 'vertical') ## ----plot-project-pmlrara, fig.height=7.5, fig.width=7, echo=FALSE, fig.cap="The PML-RARA signature is independent from the NPM1c signature in AML. The L-shaped landscape suggests the molecular mechanisms underlying these mutations are mutually exclusive."---- newsig_name = 'ROSS_AML_WITH_PML_RARA_FUSION' #generate URL newsig_link = paste0( 'http://software.broadinstitute.org/gsea/msigdb/download_geneset.jsp?geneSetName=', newsig_name, '&fileType=xml' ) #download files newsig_file = paste0(newsig_name, '.xml') download.file(newsig_link, newsig_file, method = 'libcurl') newsig_sig = getBroadSets(newsig_file, membersId = 'MEMBERS_EZID') #score newsig_scores = simpleScore(aml_ranked, newsig_sig[[1]], knownDirection = FALSE) #plot and project p1 = plotScoreLandscape( verhaak_scores, newsig_scores, scorenames = c('VERHAAK_AML_WITH_NPM1_MUTATED', newsig_name), textSize = relSize ) #project NPM1 mutations onto the landscape mutated_gene = rep('Other', ncol(aml_se)) mutated_gene[aml_se$NPM1c.Mut] = 'NPM1c Mut' mutated_gene[aml_se$KMT2A.Fusion] = 'MLL Fusion' mutated_gene[aml_se$KMT2A.PTD] = 'MLL PTD' mutated_gene[aml_se$PML.RARA] = 'PML-RARA' select_aml = !mutated_gene %in% 'Other' p2 = projectScoreLandscape(p1, verhaak_scores[, 1:2], newsig_scores, subSamples = select_aml, annot = as.factor(mutated_gene[select_aml])) p2 + theme(legend.box = 'vertical') ## ----session_info---------------------------------------------------------- sessionInfo()