## ----nanotubes---------------------------------------------------------------- library(nanotubes) data(nanotubes) ## ----packages, results='hide', message=FALSE, warning=FALSE------------------- # CRAN packages for data manipulation and plotting library(knitr) library(kableExtra) library(pheatmap) library(ggseqlogo) library(viridis) library(magrittr) library(ggforce) library(ggthemes) library(tidyverse) # CAGEfightR and related packages library(CAGEfightR) library(GenomicRanges) library(SummarizedExperiment) library(GenomicFeatures) library(BiocParallel) library(InteractionSet) library(Gviz) # Bioconductor packages for differential expression library(DESeq2) library(limma) library(edgeR) library(statmod) library(BiasedUrn) library(sva) # Bioconductor packages for enrichment analyses library(TFBSTools) library(motifmatchr) library(pathview) # Bioconductor data packages library(BSgenome.Mmusculus.UCSC.mm9) library(TxDb.Mmusculus.UCSC.mm9.knownGene) library(org.Mm.eg.db) library(JASPAR2016) ## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set(fig.pos = 'H', out.extra = '', fig.retina = 1, collapse = TRUE, comment = "#>" ) ## ----scriptwise, results='hide', message=FALSE, warning=FALSE----------------- # Rename these for easier access bsg <- BSgenome.Mmusculus.UCSC.mm9 txdb <- TxDb.Mmusculus.UCSC.mm9.knownGene odb <- org.Mm.eg.db # Script-wide settings register(MulticoreParam(3)) # Parallel execution when possible theme_set(theme_light()) # White theme for ggplot2 figures ## ----studyDesign, eval=TRUE--------------------------------------------------- kable(nanotubes, caption = "The initial design matrix for the nanotubes experiment") %>% kable_styling(latex_options = "hold_position") ## ----fnames------------------------------------------------------------------- # Setup paths to file on hard drive bw_plus <- system.file("extdata", nanotubes$BigWigPlus, package = "nanotubes", mustWork = TRUE) bw_minus <- system.file("extdata", nanotubes$BigWigMinus, package = "nanotubes", mustWork = TRUE) # Save as named BigWigFileList bw_plus <- BigWigFileList(bw_plus) bw_minus <- BigWigFileList(bw_minus) names(bw_plus) <- names(bw_minus) <- nanotubes$Name ## ----quantifyCTSSs------------------------------------------------------------ CTSSs <- quantifyCTSSs(plusStrand = bw_plus, minusStrand = bw_minus, genome = seqinfo(bsg), design = nanotubes) ## ----inspectCTSSs------------------------------------------------------------- # Get a summary CTSSs # Extract CTSS positions rowRanges(CTSSs) # Extract CTSS counts assay(CTSSs, "counts") %>% head ## ----pooled------------------------------------------------------------------- CTSSs <- CTSSs %>% calcTPM() %>% calcPooled() ## ----tagClustering------------------------------------------------------------ TCs <- quickTSSs(CTSSs) ## ----TCfiltering-------------------------------------------------------------- TSSs <- TCs %>% calcTPM() %>% subsetBySupport(inputAssay = "TPM", unexpressed = 1, minSamples = 4) ## ----bidirClustering---------------------------------------------------------- BCs <- quickEnhancers(CTSSs) ## ----BCfiltering-------------------------------------------------------------- BCs <- subsetBySupport(BCs, inputAssay = "counts", unexpressed = 0, minSamples = 4) ## ----TSSannotation------------------------------------------------------------ # Annotate with transcript IDs TSSs <- assignTxID(TSSs, txModels = txdb, swap = "thick") # Annotate with transcript context TSSs <- assignTxType(TSSs, txModels = txdb, swap = "thick") ## ----BCannotation------------------------------------------------------------- # Annotate with transcript context BCs <- assignTxType(BCs, txModels = txdb, swap = "thick") # Keep only non-exonic BCs as enhancer candidates Enhancers <- subset(BCs, txType %in% c("intergenic", "intron")) ## ----cleanClusters------------------------------------------------------------ # Clean colData TSSs$totalTags <- NULL Enhancers$totalTags <- NULL # Clean rowData rowData(TSSs)$balance <- NA rowData(TSSs)$bidirectionality <- NA rowData(Enhancers)$txID <- NA # Add labels for making later retrieval easy rowData(TSSs)$clusterType <- "TSS" rowData(Enhancers)$clusterType <- "Enhancer" ## ----combineClusters---------------------------------------------------------- RSE <- combineClusters(object1 = TSSs, object2 = Enhancers, removeIfOverlapping = "object1") ## ----finalTPM----------------------------------------------------------------- RSE <- calcTPM(RSE) ## ----genomeTracks------------------------------------------------------------- # Genome track axis_track <- GenomeAxisTrack() # Annotation track tx_track <- GeneRegionTrack(txdb, name = "Gene Models", col = NA, fill = "bisque4", shape = "arrow", showId = TRUE) ## ----simpleTSS, fig.width=5, fig.height=5, fig.cap='Genome browser example of TSS candidate'---- # Extract 100 bp around the first TSS plot_region <- RSE %>% rowRanges() %>% subset(clusterType == "TSS") %>% .[1] %>% add(100) %>% unstrand() # CTSS track ctss_track <- CTSSs %>% rowRanges() %>% subsetByOverlaps(plot_region) %>% trackCTSS(name = "CTSSs") # Cluster track cluster_track <- RSE %>% subsetByOverlaps(plot_region) %>% trackClusters(name = "Clusters", col = NA, showId = TRUE) # Plot tracks together plotTracks(list(axis_track, ctss_track, cluster_track, tx_track), from = start(plot_region), to = end(plot_region), chromosome = as.character(seqnames(plot_region))) ## ----simpleEnhancer, fig.width=5, fig.height=5, fig.cap='Genome browser example of enhancer candidate'---- # Extract 100 bp around the first enhancer plot_region <- RSE %>% rowRanges() %>% subset(clusterType == "Enhancer") %>% .[1] %>% add(100) %>% unstrand() # CTSS track ctss_track <- CTSSs %>% rowRanges() %>% subsetByOverlaps(plot_region) %>% trackCTSS(name = "CTSSs") # Cluster track cluster_track <- RSE %>% rowRanges %>% subsetByOverlaps(plot_region) %>% trackClusters(name = "Clusters", col = NA, showId = TRUE) # Plot tracks together plotTracks(list(axis_track, ctss_track, cluster_track, tx_track), from = start(plot_region), to = end(plot_region), chromosome = as.character(seqnames(plot_region))) ## ----simplifyTxTypes---------------------------------------------------------- cluster_info <- RSE %>% rowData() %>% as.data.frame() ## ----plotTxTypes1, fig.width=5, fig.height=3, fig.cap="Number of clusters within each annotation category"---- # Number of clusters ggplot(cluster_info, aes(x = txType, fill = clusterType)) + geom_bar(alpha = 0.75, position = "dodge", color = "black") + scale_fill_colorblind("Cluster type") + labs(x = "Cluster annotation", y = "Frequency") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ## ----plotTxTypes2, fig.width=5, fig.height=3, fig.cap="Expression of clusters within each annotation category"---- # Expression of clusters ggplot(cluster_info, aes(x = txType, y = log2(score / ncol(RSE)), fill = clusterType)) + geom_violin(alpha = 0.75, draw_quantiles = c(0.25, 0.50, 0.75)) + scale_fill_colorblind("Cluster type") + labs(x = "Cluster annotation", y = "log2(TPM)") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ## ----highTSSs----------------------------------------------------------------- # Select highly expressed TSSs highTSSs <- subset(RSE, clusterType == 'TSS' & score / ncol(RSE) >= 10) # Calculate IQR as 10%-90% interval highTSSs <- calcShape(highTSSs, pooled = CTSSs, shapeFunction = shapeIQR, lower = 0.10, upper = 0.90) ## ----TSSShapes, fig.width=5, fig.height=3, fig.cap="Bimodal distribution of Interquartile Ranges (IQRs) of highly expressed TSSs"---- highTSSs %>% rowData() %>% as.data.frame() %>% ggplot(aes(x = IQR)) + geom_histogram(binwidth = 1, fill = "hotpink", alpha = 0.75) + geom_vline(xintercept = 10, linetype = "dashed", alpha = 0.75, color = "black") + facet_zoom(xlim = c(0,100)) + labs(x = "10-90% IQR", y = "Frequency") ## ----classifyShapes----------------------------------------------------------- # Divide into groups rowData(highTSSs)$shape <- ifelse(rowData(highTSSs)$IQR < 10, "Sharp", "Broad") # Count group sizes table(rowData(highTSSs)$shape) ## ----BSGenome----------------------------------------------------------------- promoter_seqs <- highTSSs %>% rowRanges() %>% swapRanges() %>% promoters(upstream = 40, downstream = 10) %>% getSeq(bsg, .) ## ----ggseqlogo, fig.width=5, fig.height=2.5, fig.cap='Sequence logos of core promoter regions of Sharp and Broad classes of TSSs'---- promoter_seqs %>% as.character %>% split(rowData(highTSSs)$shape) %>% ggseqlogo(data = ., ncol = 2, nrow = 1) + theme_logo() + theme(axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank()) ## ----orientation-------------------------------------------------------------- rowData(RSE)$clusterType <- RSE %>% rowData() %>% use_series("clusterType") %>% as_factor() %>% fct_relevel("TSS") ## ----findLinks---------------------------------------------------------------- # Find links and calculate correlations all_links <- RSE %>% swapRanges() %>% findLinks(maxDist = 5e4L, directional = "clusterType", inputAssay = "TPM", method = "kendall") # Show links all_links ## ----linkCorrelations--------------------------------------------------------- # Subset to only positive correlation cor_links <- subset(all_links, estimate > 0) # Sort based on correlation cor_links <- cor_links[order(cor_links$estimate, decreasing = TRUE)] ## ----browseLinks, fig.width=5, fig.height=5, fig.cap="Genome browser example of TSS-enhancer link"---- # Extract region around the link of interest plot_region <- cor_links[1] %>% boundingBox() %>% linearize(1:2) %>% add(1000L) # Cluster track cluster_track <- RSE %>% subsetByOverlaps(plot_region) %>% trackClusters(name = "Clusters", col = NA, showId = TRUE) # Link track link_track <- cor_links %>% subsetByOverlaps(plot_region) %>% trackLinks(name = "Links", interaction.measure = "p.value", interaction.dimension.transform = "log", col.outside = "grey", plot.anchors = FALSE, col.interactions = "black") # Plot tracks together plotTracks(list(axis_track, link_track, cluster_track, tx_track), from = start(plot_region), to = end(plot_region), chromosome = as.character(seqnames(plot_region))) ## ----findStretches------------------------------------------------------------ # Subset to only enhancers Enhancers <- subset(RSE, clusterType == "Enhancer") # Find stretches within 12.5 kbp stretches <- findStretches(Enhancers, inputAssay = "TPM", mergeDist = 12500L, minSize = 5L, method = "kendall") ## ----stretchTypes------------------------------------------------------------- # Annotate transcript models stretches <- assignTxType(stretches, txModels = txdb) # Sort by correlation stretches <- stretches[order(stretches$aveCor, decreasing = TRUE)] # Show the results stretches ## ----browseStretches, fig.width=5, fig.height=5, fig.cap="Genome browser example of enhancer stretch"---- # Extract region around a stretch of enhancers plot_region <- stretches["chr17:26666593-26675486"] + 1000 # Cluster track cluster_track <- RSE %>% subsetByOverlaps(plot_region) %>% trackClusters(name = "Clusters", col = NA, showId = TRUE) # CTSS track ctss_track <- CTSSs %>% subsetByOverlaps(plot_region) %>% trackCTSS(name = "CTSSs") # Stretch enhancer track stretch_track <- stretches %>% subsetByOverlaps(plot_region) %>% AnnotationTrack(name = "Stretches", fill = "hotpink", col = NULL) # Plot tracks together plotTracks(list(axis_track, stretch_track, cluster_track, ctss_track), from = start(plot_region), to = end(plot_region), chromosome = as.character(seqnames(plot_region))) ## ----vstBlind----------------------------------------------------------------- # Create DESeq2 object with blank design dds_blind <- DESeqDataSet(RSE, design = ~ 1) # Normalize and log transform vst_blind <- vst(dds_blind, blind = TRUE) ## ----PCA, fig.width=4, fig.height=4, fig.cap="PCA-plot of variance stabilized expression."---- plotPCA(vst_blind, "Class") ## ----batchVar----------------------------------------------------------------- # Extract PCA results pca_res <- plotPCA(vst_blind, "Class", returnData = TRUE) # Define a new variable using PC1 batch_var <- ifelse(pca_res$PC1 > 0, "A", "B") # Attach the batch variable as a factor to the experiment RSE$Batch <- factor(batch_var) # Show the new design RSE %>% colData() %>% subset(select = c(Class, Batch)) %>% kable(caption = "Design matrix after adding new batch covariate.") %>% kable_styling(latex_options = "hold_position") ## ----finalDesign-------------------------------------------------------------- # Specify design dds <- DESeqDataSet(RSE, design = ~ Batch + Class) # Fit DESeq2 model dds <- DESeq(dds) ## ----results------------------------------------------------------------------ # Extract results res <- results(dds, contrast = c("Class", "Nano", "Ctrl"), alpha = 0.05, independentFiltering = TRUE, tidy = TRUE) %>% bind_cols(as.data.frame(rowData(RSE))) %>% as_tibble() # Show the top hits res %>% top_n(n = -10, wt = padj) %>% dplyr::select(cluster = row, clusterType, txType, baseMean, log2FoldChange, padj) %>% kable(caption = "Top differentially expressed TSS and enhancer candidates") %>% kable_styling(latex_options = "hold_position") ## ----diagnosticPlot, fig.width=5, fig.height=5, fig.cap="Diagnostic MA plot of the differential expression analysis"---- ggplot(res, aes(x = log2(baseMean), y = log2FoldChange, color = padj < 0.05)) + geom_point(alpha = 0.25) + geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.75) + facet_grid(clusterType ~ .) ## ----DEtable------------------------------------------------------------------ table(clusterType = rowRanges(RSE)$clusterType, DE = res$padj < 0.05) ## ----vstGuided---------------------------------------------------------------- # Guided / non-blind variance stabilizing transformation vst_guided <- varianceStabilizingTransformation(dds, blind = FALSE) ## ----ComBat------------------------------------------------------------------- # Design matrix of wanted effects bio_effects <- model.matrix(~ Class, data = colData(RSE)) # Run ComBat assay(RSE, "ComBat") <- ComBat(dat = assay(vst_guided), batch = RSE$Batch, mod = bio_effects) ## ----correctedPCA, fig.width=4, fig.height=4, fig.cap='PCA-plot of batch corrected expression.'---- # Overwrite assay assay(vst_guided) <- assay(RSE, "ComBat") # Plot as before plotPCA(vst_guided, "Class") ## ----findTop10---------------------------------------------------------------- # Find top 10 DE enhancers top10 <- res %>% filter(clusterType == "Enhancer", padj < 0.05) %>% group_by(log2FoldChange >= 0) %>% top_n(n = 5, wt = abs(log2FoldChange)) %>% pull(row) # Extract expression values in tidy format tidyEnhancers <- assay(RSE, "ComBat")[top10, ] %>% t() %>% as_tibble(rownames = "Sample") %>% mutate(Class = RSE$Class) %>% gather(key = "Enhancer", value = "Expression", -Sample, -Class, factor_key = TRUE) ## ----ploTop10, fig.width=6, fig.height=5, fig.cap="Expression profile of top 10 differentially expressed enhancer candidates."---- ggplot(tidyEnhancers, aes(x = Class, y = Expression, fill = Class)) + geom_dotplot(stackdir = "center", binaxis = "y", dotsize = 3) + facet_wrap(~ Enhancer, ncol = 2, scales = "free_y") ## ----promoter_seqs------------------------------------------------------------ cluster_seqs <- RSE %>% rowRanges() %>% swapRanges() %>% unstrand() %>% add(500) %>% getSeq(bsg, names = .) ## ----TFBStools---------------------------------------------------------------- # Extract motifs as PFMs motif_pfms <- getMatrixSet(JASPAR2016, opts = list(species = "10090")) # Look at the IDs and names of the first few motifs: head(name(motif_pfms)) ## ----motifmatch--------------------------------------------------------------- # Find matches motif_hits <- matchMotifs(motif_pfms, subject = cluster_seqs) # Matches are returned as a sparse matrix: motifMatches(motif_hits)[1:5, 1:5] ## ----fishers------------------------------------------------------------------ # 2x2 table for fishers table(FOSJUN = motifMatches(motif_hits)[,"MA0099.2"], DE = res$padj < 0.05) %>% print() %>% fisher.test() ## ----assignGeneIDs------------------------------------------------------------ RSE <- assignGeneID(RSE, geneModels = txdb) ## ----quantifyGenes------------------------------------------------------------ GSE <- RSE %>% subset(clusterType == "TSS") %>% quantifyGenes(genes = "geneID", inputAssay = "counts") ## ----geneLevelExamples-------------------------------------------------------- rowRanges(GSE["100038347",]) ## ----symbols------------------------------------------------------------------ # Translate symbols rowData(GSE)$symbol <- mapIds(odb, keys = rownames(GSE), column = "SYMBOL", keytype = "ENTREZID") ## ----limmaNorm---------------------------------------------------------------- # Create DGElist object dge <- DGEList(counts = assay(GSE, "counts"), genes = as.data.frame(rowData(GSE))) # Calculate normalization factors dge <- calcNormFactors(dge) ## ----limmaVoom---------------------------------------------------------------- # Design matrix mod <- model.matrix(~ Batch + Class, data = colData(GSE)) # Model mean-variance using voom v <- voom(dge, design = mod) # Fit and shrink DE model fit <- lmFit(v, design = mod) eb <- eBayes(fit, robust = TRUE) # Summarize the results dt <- decideTests(eb) ## ----limmaSummary------------------------------------------------------------- # Global summary dt %>% summary() %>% kable(caption = "Global summary of differentially expressed genes.") %>% kable_styling(latex_options = "hold_position") ## ----limmaTop----------------------------------------------------------------- # Inspect top htis topTable(eb, coef = "ClassNano") %>% dplyr::select(symbol, nClusters, AveExpr, logFC, adj.P.Val) %>% kable(caption = "Top differentially expressed genes.") %>% kable_styling(latex_options = "hold_position") ## ----goanna------------------------------------------------------------------- # Find enriched GO-terms GO <- goana(eb, coef = "ClassNano", species = "Mm", trend = TRUE) # Show top hits topGO(GO, ontology = "BP", number = 10) %>% kable(caption = "Top enriched or depleted GO-terms.") %>% kable_styling(latex_options = "hold_position") ## ----kegga-------------------------------------------------------------------- # Find enriched KEGG-terms KEGG <- kegga(eb, coef = "ClassNano", species = "Mm", trend = TRUE) # Show top hits topKEGG(KEGG, number = 10) %>% knitr::kable(caption = "Top enriched or depleted KEGG-terms.") %>% kable_styling(latex_options = "hold_position") ## ----pathview, message = FALSE, fig.width=6, fig.height=6, fig.cap="Detailed view of differentially expressed genes in the KEGG TNF signalling pathway."---- # Format DE genes for pathview DE_genes <- dt[, "ClassNano"] %>% as.integer() %>% set_names(rownames(dt)) %>% Filter(f=function(x) x != 0, x=.) # Run pathview; this will save a png file to a temporary directory pathview(DE_genes, species = "mmu", pathway.id = "mmu04668", kegg.dir = tempdir()) # Show the png file grid.newpage() grid.raster(png::readPNG("mmu04668.pathview.png")) ## ----subsetByComposition------------------------------------------------------ # Filter away lowly expressed RSE_filtered <- RSE %>% subset(clusterType == "TSS" & !is.na(geneID)) %>% subsetByComposition(inputAssay = "counts", genes = "geneID", unexpressed = 0.1, minSamples = 5L) ## ----TSSstructure, fig.width=5, fig.height=2.5, fig.cap="Overview of alternative TSS usage within genes."---- RSE_filtered %>% rowData() %>% as.data.frame() %>% as_tibble() %>% dplyr::count(geneID) %>% ggplot(aes(x = n, fill = n >= 2)) + geom_bar(alpha = 0.75) + scale_fill_colorblind("Multi-TSS") + labs(x = "Number of TSSs per gene", y = "Frequency") ## ----edgeRNorm---------------------------------------------------------------- # Annotate with symbols like before: rowData(RSE_filtered)$symbol <- mapIds(odb, keys = rowData(RSE_filtered)$geneID, column = "SYMBOL", keytype = "ENTREZID") # Extract gene info TSS_info <- RSE_filtered %>% rowData() %>% subset(select = c(score, txType, geneID, symbol)) %>% as.data.frame() # Build DGEList dge <- DGEList(counts = assay(RSE_filtered, "counts"), genes = TSS_info) ## ----diffSpliceDGE------------------------------------------------------------ # Estimate normalization factors dge <- calcNormFactors(dge) # Estimate dispersion and fit GLMs disp <- estimateDisp(dge, design = mod, tagwise = FALSE) QLfit <- glmQLFit(disp, design = mod, robust = TRUE) # Apply diffSpliceDGE ds <- diffSpliceDGE(QLfit, coef = "ClassNano", geneid = "geneID") ## ----dtuTSS------------------------------------------------------------------- dtu_TSSs <- topSpliceDGE(ds, test = "exon") dplyr::select(dtu_TSSs, txType, geneID, symbol, logFC, FDR) %>% kable(caption = "Top differentially used TSSs") %>% kable_styling(latex_options = "hold_position") ## ----dtuGene------------------------------------------------------------------ dtu_genes <- topSpliceDGE(ds, test = "Simes") dplyr::select(dtu_genes, geneID, symbol, NExons, FDR) %>% kable(row.names = FALSE, caption = "Top genes showing any differential TSS usage.") %>% kable_styling(latex_options = "hold_position") ## ----heatmap, fig.width=3, fig.height=4, fig.cap="Heatmap showing expression of TSSs within Fblim1"---- RSE_filtered %>% subset(geneID == "74202") %>% assay("ComBat") %>% t() %>% pheatmap(color = magma(100), cluster_cols = FALSE, main="Fblim1") ## ----dtubrowser, fig.width=5, fig.height=5, fig.cap="Genome-browser example of differential TSS usage within Fblim1"---- # Extract region at Fblim1 plot_region <- RSE_filtered %>% rowRanges() %>% subset(geneID == "74202") %>% GenomicRanges::reduce(min.gapwidth = 1e6L) %>% unstrand() %>% add(5e3L) # Cluster track cluster_track <- RSE_filtered %>% subsetByOverlaps(plot_region) %>% trackClusters(name = "Clusters", col = NA, showId = TRUE) # CTSS tracks for each group ctrl_track <- CTSSs %>% subset(select = Class == "Ctrl") %>% calcPooled() %>% subsetByOverlaps(plot_region) %>% trackCTSS(name = "Ctrl") nano_track <- CTSSs %>% subset(select = Class == "Nano") %>% calcPooled() %>% subsetByOverlaps(plot_region) %>% trackCTSS(name = "Nano") # Plot tracks together plotTracks(list(axis_track, tx_track, cluster_track, Ctrl=ctrl_track, nano_track), from = start(plot_region), to = end(plot_region), chromosome = as.character(seqnames(plot_region)))