## ----setup, include=FALSE-------------------------------------------------- #set knitr chunk options knitr::opts_chunk$set(warning = FALSE, message = FALSE) #load packages to avoid startup messages later in the code suppressPackageStartupMessages({library(SingscoreAMLMutations)}) library(ggplot2) library(TCGAbiolinks) library(SummarizedExperiment) library(edgeR) library(rtracklayer) library(plyr) library(org.Hs.eg.db) library(GSEABase) library(singscore) library(reshape2) library(gridExtra) library(dcanr) #ggplot theme rl = 1.2 current_theme = theme_minimal() + theme( panel.border = element_rect(colour = 'black', fill = NA), panel.grid.minor = element_blank(), axis.title = element_text(size = rel(rl) * 1.1), axis.text = element_text(size = rel(rl)), plot.title = element_text(size = rel(rl)), strip.background = element_rect(fill = NA, colour = 'black'), strip.text = element_text(size = rel(rl)), legend.text = element_text(size = rel(rl)), legend.title = element_text(size = rel(rl), face = 'italic'), legend.position = 'bottom', legend.direction = 'horizontal' ) #automatically create a bib database for R packages knitr::write_bib(c( .packages(), 'knitr', 'rmarkdown', 'BiocStyle' ), 'packages.bib') ## ----gdc_query------------------------------------------------------------- library(SingscoreAMLMutations) library(TCGAbiolinks) #get GDC version information gdc_info = getGDCInfo() gdc_info ## ----gdc_results----------------------------------------------------------- #form a query for the RNAseq data query_rna = GDCquery( #getGDCprojects() project = 'TCGA-LAML', #TCGAbiolinks:::getProjectSummary('TCGA-LAML') data.category = 'Transcriptome Profiling', data.type = 'Gene Expression Quantification', workflow.type = 'HTSeq - Counts' ) #extract results of the query rnaseq_res = getResults(query_rna) dim(rnaseq_res) colnames(rnaseq_res) ## ----gdc_download, results='hide'------------------------------------------ datapath = file.path(tempdir(), 'GDCdata') GDCdownload(query_rna, directory = datapath) #(size: 39MB) ## ----gdc_prepare, results='hide'------------------------------------------- aml_se = GDCprepare(query_rna, directory = datapath) ## ----show_se--------------------------------------------------------------- aml_se ## ----remove_dups----------------------------------------------------------- library(SummarizedExperiment) library(edgeR) aml_dge = DGEList(counts = assay(aml_se), genes = rowData(aml_se)) ## ----plot-hist-filtering, fig.wide=TRUE, fig.cap="过滤前后AML数据logCPM值的直方图. 过滤之后数据中的零值减少了。在大部分样本里CPM值小于1(logCPM < 0)的基因被过滤掉,使得最后得到了近似log-normal分布的logCPM值。"---- prop_expressed = rowMeans(edgeR::cpm(aml_dge) > 1) keep = prop_expressed > 0.5 op = par(no.readonly = TRUE) par(mfrow = c(1, 2)) hist(edgeR::cpm(aml_dge, log = TRUE), main = 'Unfiltered', xlab = 'logCPM') abline(v = log(1), lty = 2, col = 2) hist(edgeR::cpm(aml_dge[keep, ], log = TRUE), main = 'Filtered', xlab = 'logCPM') abline(v = log(1), lty = 2, col = 2) par(op) ## ----remove_low_counts----------------------------------------------------- #subset the data aml_dge = aml_dge[keep, , keep.lib.sizes = FALSE] aml_se = aml_se[keep, ] ## ----download_gencode, results='hide'-------------------------------------- #download v22 of the GENCODE annotation gencode_file = 'gencode.v22.annotation.gtf.gz' gencode_link = paste( 'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_22', gencode_file, sep = '/' ) download.file(gencode_link, gencode_file, method = 'libcurl') #(size: 39MB) ## ----compute_gene_lengths-------------------------------------------------- library(rtracklayer) library(plyr) gtf = import.gff(gencode_file, format = 'gtf', genome = 'GRCm38.71', feature.type = 'exon') #split records by gene to group exons of the same gene grl = reduce(split(gtf, elementMetadata(gtf)$gene_id)) gene_lengths = ldply(grl, function(x) { #sum up the length of individual exons return(c('gene_length' = sum(width(x)))) }, .id = 'ensembl_gene_id') ## ----add_biotype----------------------------------------------------------- #extract information on gene biotype genetype = unique(elementMetadata(gtf)[, c('gene_id', 'gene_type')]) colnames(genetype)[1] = 'ensembl_gene_id' gene_lengths = merge(genetype, gene_lengths) #remove ENSEMBL ID version numbers gene_lengths$ensembl_gene_id = gsub('\\.[0-9]*', '', gene_lengths$ensembl_gene_id) saveRDS(gene_lengths, file = 'gene_lengths_HTSeq_gencodev22.rds') gene_lengths ## ----add_length_annotation------------------------------------------------- #allocate rownames for ease of indexing rownames(gene_lengths) = gene_lengths$ensembl_gene_id rowData(aml_se)$gene_length = gene_lengths[rownames(aml_se), 'gene_length'] rowData(aml_se)$gene_biotype = gene_lengths[rownames(aml_se), 'gene_type'] #annotate gene lengths for the DGE object aml_dge$genes$length = gene_lengths[rownames(aml_dge), 'gene_length'] ## ----compute_fpkm---------------------------------------------------------- aml_dge_tmm = calcNormFactors(aml_dge, method = 'TMM') #compute FPKM values and append to assays assay(aml_se, 'logFPKM_TMM') = rpkm(aml_dge_tmm, log = TRUE) aml_se ## ----preproc_mutations, eval=FALSE, include=FALSE-------------------------- # #preprocessing - read tsv (Joe's version) and convert save as RDS # mut_info = read.csv('PatientMutations.tsv', sep = '\t', colClasses = c('character', rep('logical', 4))) # rownames(mut_info) = mut_info$Barcode # mut_info = mut_info[, -1] # patients = mut_info[substring(colnames(aml_se), 1, 12), ] # rownames(patients) = colnames(aml_se) # saveRDS(patients, file = 'AMLPatientMutationsTCGA.rds') ## ----annotate_mutations---------------------------------------------------- data(AMLPatientMutationsTCGA) patient_mutations = AMLPatientMutationsTCGA patient_mutations = patient_mutations[colnames(aml_se), ] # order samples aml_mutations = colnames(patient_mutations) # get mutation labels colData(aml_se) = cbind(colData(aml_se), patient_mutations) colData(aml_se)[, aml_mutations] ## ----map_ensembl_entrez---------------------------------------------------- library(org.Hs.eg.db) rowData(aml_se)$entrezgene = mapIds( org.Hs.eg.db, keys = rownames(aml_se), keytype = 'ENSEMBL', column = 'ENTREZID', multiVals = 'asNA' ) gene_annot = rowData(aml_se) ## ----discard_multimapped_genes--------------------------------------------- #select genes with mapped Entrez IDs keep = !is.na(gene_annot$entrezgene) #select genes with unique Entrez IDs dup_entrez = gene_annot$entrezgene[duplicated(gene_annot$entrezgene)] keep = keep & !gene_annot$entrezgene %in% dup_entrez #Biotype of discarded genes (due to non-unique mapping) head(sort(table(gene_annot[!keep, 'gene_biotype']), decreasing = TRUE), n = 10) #subset the data aml_se = aml_se[keep, ] ## ----signature_names------------------------------------------------------- #create signature names verhaak_names = paste('VERHAAK_AML_WITH_NPM1_MUTATED', c('UP', 'DN'), sep = '_') verhaak_names ## ----download_signatures, results='hide'----------------------------------- #generate URLs verhaak_links = paste0( 'http://software.broadinstitute.org/gsea/msigdb/download_geneset.jsp?geneSetName=', verhaak_names, '&fileType=xml' ) #download files verhaak_files = paste0(verhaak_names, '.xml') mapply(download.file, verhaak_links, verhaak_files, method = 'libcurl') ## ----read_signatures------------------------------------------------------- library(GSEABase) verhaak_sigs = getBroadSets(verhaak_files, membersId = 'MEMBERS_EZID') verhaak_sigs ## ----rows_to_entrez-------------------------------------------------------- rownames(aml_se) = rowData(aml_se)$entrezgene ## ----compute_ranks_aml----------------------------------------------------- library(singscore) #apply the rankGenes method to each version of the dataset, excluding counts aml_ranked = rankGenes(assay(aml_se, 'logFPKM_TMM')) ## ----compute_scores_verhaak, warning=TRUE---------------------------------- #apply the scoring function verhaak_scores = simpleScore(aml_ranked, upSet = verhaak_sigs[[1]], downSet = verhaak_sigs[[2]]) ## -------------------------------------------------------------------------- head(verhaak_scores) ## ----plot-dispersion, fig.wide=TRUE, fig.height=4, fig.cap="使用NPM1c signature上调和/或下调基因列表得到的分数可以将携带有NPM1c mutations或*MLL* fusions/PTDs的样本区分出来. 以Scores和基因集内基因排名的median absolute deviation (MAD)为轴作图,图上可以看出scores值接近0的样本点拥有更高的MADs值。"---- #relative size of text in the figure relSize = 1.2 #create annotation mutated_gene = rep('Other', ncol(aml_se)) mutated_gene[aml_se$NPM1c.Mut] = 'NPM1c Mut' mutated_gene[aml_se$KMT2A.Fusion | aml_se$KMT2A.PTD] = 'MLL Fusion/PTD' p1 = plotDispersion(verhaak_scores, annot = mutated_gene, textSize = relSize) p1 ## ----plot-rank-density, fig.wide=TRUE, fig.height=4, fig.cap="基因排名的分布显示了应该上调的基因在高分样本中上调,在低分样本中下调;应该下调的基因在高分样本中下调,在低分样本中上调. 从左到右分别是总体得分最高的样本、总体得分最低的样本和总体离差(dispersion)值最高的样本。barcode图显示了标签内每个基因的排名及其密度函数。上调基因的颜色为绿色而下调基因为紫色。在最高分和最低分样本中,排名两端呈现正态分布(Gaussian distributions)。总体离差(dispersion)最高的样本得分接近0,因为下调基因的排名呈现均匀分布而上调基因的排名呈现双峰分布。"---- library(gridExtra) #select samples with the properties required select_samples = c( 'Max Total Score' = which.max(verhaak_scores$TotalScore), 'Min Total Score' = which.min(verhaak_scores$TotalScore), 'Max Dispersion' = which.max(verhaak_scores$TotalDispersion) ) #plotRankDensity applied to each sample rank_plots = lapply(names(select_samples), function(x) { #get the sample index aml_sample = select_samples[x] #drop = FALSE is required to ensure the data.frame is intact p1 = plotRankDensity(rankData = aml_ranked[, aml_sample, drop = FALSE], upSet = verhaak_sigs[[1]], downSet = verhaak_sigs[[2]], textSize = relSize) #overwrite title of the plot with the description of the sample #this is possible because singscore uses ggplot2 p1 = p1 + ggtitle(paste(x, mutated_gene[aml_sample], sep = '\n')) + guides(colour = guide_legend(ncol = 1)) return(p1) }) #create a multipanel plot grid.arrange(grobs = rank_plots, nrow = 1) ## ----create_plotdf_boxplot------------------------------------------------- #create a dataframe with the data required: scores and mutations scoredf = as.data.frame(colData(aml_se)[, aml_mutations]) scoredf$Score = verhaak_scores$TotalScore scoredf$Dispersion = verhaak_scores$TotalDispersion ## ----plot-boxplot-mutscores, fig.height=5, fig.cap="NPM1c标签的得分可以区分突变样本与野生型样本. NPM1c标签得分的Boxplot(按照不同突变类型分类)。"---- #restructure the data for ploting plotdf = melt( scoredf, id.var = c('Score', 'Dispersion'), variable.name = 'Mutation', value.name = 'Status' ) #convert TRUE-FALSE values to Mut-WT plotdf$Status = factor(plotdf$Status, labels = c('WT', 'Mut')) p1 = ggplot(plotdf, aes(Mutation, Score, fill = Status)) + geom_boxplot(position = 'dodge', alpha = 0.6) + scale_fill_brewer(palette = 'Set2') + current_theme + theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) p1 ## ----predict_mutation_glm-------------------------------------------------- #fit GLMs to each mutation glms = lapply(aml_mutations, function(x) { #generate a formula for the fit form = as.formula(paste0(x, ' ~ Score')) glm1 = glm(form, data = scoredf, family = binomial(link = 'logit')) return(glm1) }) names(glms) = aml_mutations #extract coefficients coefs = lapply(glms, function(x) coef(summary(x))) ldply(coefs, function(x) x[2, ], .id = 'Mutation') ## ----eval_prediction_performance------------------------------------------- library(dcanr) #assess sensitivity and specificity prec_rec = ldply(glms, function(glm1) { #predict mutations for the data used in training and convert to binary format prediction = as.numeric(predict(glm1) > 0) observed = glm1$y prec = performanceMeasure(prediction, observed, 'precision') recall = performanceMeasure(prediction, observed, 'recall') f1 = performanceMeasure(prediction, observed) return(c('Precision' = prec, 'Recall' = recall, 'F1' = f1)) }, .id = 'Mutation') prec_rec ## ----predict_with_mads----------------------------------------------------- #include dispersion in the model glm_npm1c = glm('NPM1c.Mut ~ Score + Dispersion', data = scoredf, family = binomial(link = 'logit')) #evaluate performance of the new model prediction = as.numeric(predict(glm_npm1c) > 0) observed = glm_npm1c$y c( 'Precision' = performanceMeasure(prediction, observed, 'precision'), 'Recall' = performanceMeasure(prediction, observed, 'recall'), 'F1' = performanceMeasure(prediction, observed) ) ## ----process_rossmll_sigs-------------------------------------------------- #create signature names rossmll_name = 'ROSS_AML_WITH_MLL_FUSIONS' #generate URLs rossmll_link = paste0( 'http://software.broadinstitute.org/gsea/msigdb/download_geneset.jsp?geneSetName=', rossmll_name, '&fileType=xml' ) #download files rossmll_file = paste0(rossmll_name, '.xml') download.file(rossmll_link, rossmll_file, method = 'libcurl') rossmll_sig = getBroadSets(rossmll_file, membersId = 'MEMBERS_EZID') rossmll_sig ## ----score_rossmll--------------------------------------------------------- rossmll_scores = simpleScore(aml_ranked, rossmll_sig[[1]], knownDirection = FALSE) ## ----plot-signature-landscape, fig.height=7, fig.width=7, fig.cap="MLL fusion和NPM1c标签的landscape. AML中两个标签存在正相关。"---- p_mll_npm1c = plotScoreLandscape( verhaak_scores, rossmll_scores, scorenames = c('VERHAAK_AML_WITH_NPM1_MUTATED', 'ROSS_AML_WITH_MLL_FUSIONS'), textSize = relSize ) p_mll_npm1c ## ----new_annotations------------------------------------------------------- #new annotation - modify previously used annotations mutated_gene[aml_se$KMT2A.Fusion] = 'MLL Fusion' mutated_gene[aml_se$KMT2A.PTD] = 'MLL PTD' mutated_gene[aml_se$PML.RARA] = 'PML-RARA' ## ----plot-project-mll, fig.height=7.5, fig.width=7, fig.cap="Landscape展示了MLL fusions和PTDs之间有很大不同. 不同的突变在landscape上占据了不同的位置,提示了它们具有不同的分子特征。"---- select_aml = !mutated_gene %in% 'Other' #project above mutations onto the landscape p1 = projectScoreLandscape(p_mll_npm1c, verhaak_scores, rossmll_scores, subSamples = select_aml, annot = mutated_gene[select_aml]) p1 + theme(legend.box = 'vertical') ## ----plot-project-pmlrara, fig.height=7.5, fig.width=7, echo=FALSE, fig.cap="PML-RARA标签和NPM1c标签之间没有关联. L型的landscape提示了这两个突变背后的分子机制是互斥的。"---- newsig_name = 'ROSS_AML_WITH_PML_RARA_FUSION' #generate URL newsig_link = paste0( 'http://software.broadinstitute.org/gsea/msigdb/download_geneset.jsp?geneSetName=', newsig_name, '&fileType=xml' ) #download files newsig_file = paste0(newsig_name, '.xml') download.file(newsig_link, newsig_file, method = 'libcurl') newsig_sig = getBroadSets(newsig_file, membersId = 'MEMBERS_EZID') #score newsig_scores = simpleScore(aml_ranked, newsig_sig[[1]], knownDirection = FALSE) #plot and project p1 = plotScoreLandscape( verhaak_scores, newsig_scores, scorenames = c('VERHAAK_AML_WITH_NPM1_MUTATED', newsig_name), textSize = relSize ) #project NPM1 mutations onto the landscape mutated_gene = rep('Other', ncol(aml_se)) mutated_gene[aml_se$NPM1c.Mut] = 'NPM1c Mut' mutated_gene[aml_se$KMT2A.Fusion] = 'MLL Fusion' mutated_gene[aml_se$KMT2A.PTD] = 'MLL PTD' mutated_gene[aml_se$PML.RARA] = 'PML-RARA' select_aml = !mutated_gene %in% 'Other' p2 = projectScoreLandscape(p1, verhaak_scores[, 1:2], newsig_scores, subSamples = select_aml, annot = as.factor(mutated_gene[select_aml])) p2 + theme(legend.box = 'vertical') ## ----session_info---------------------------------------------------------- sessionInfo()