## ----pre,echo=FALSE,results='hide'-------------------------------------------- library(knitr) opts_chunk$set(warning=FALSE,message=FALSE,cache=TRUE) ## ----style,echo=FALSE,results='asis'------------------------------------------ BiocStyle::markdown() ## ----loadLibrary-------------------------------------------------------------- library(dupRadar) ## ----eval=FALSE--------------------------------------------------------------- # # call the duplicate marker and analyze the reads # bamDuprm <- markDuplicates(dupremover="bamutil", # bam="test.bam", # path="/opt/bamUtil-master/bin", # rminput=FALSE) ## ----------------------------------------------------------------------------- if(suppressWarnings(require(AnnotationHub))) { ah = AnnotationHub() query(ah, c("ensembl", "80", "Takifugu", "gtf")) # discovery cache(ah["AH47101"]) # retrieve / file path } ## ----------------------------------------------------------------------------- attach(dupRadar_examples) ## ----eval=FALSE--------------------------------------------------------------- # # The call parameters: # bamDuprm <- "test_duprm.bam" # the duplicate marked bam file # gtf <- "genes.gtf" # the gene model # stranded <- 2 # '0' (unstranded), '1' (stranded) and '2' (reversely stranded) # paired <- FALSE # is the library paired end? # threads <- 4 # number of threads to be used # # # Duplication rate analysis # dm <- analyzeDuprates(bamDuprm,gtf,stranded,paired,threads) ## ----fig.width=14,fig.height=7------------------------------------------------ ## make a duprate plot (blue cloud) par(mfrow=c(1,2)) duprateExpDensPlot(DupMat=dm) # a good looking plot title("good experiment") duprateExpDensPlot(DupMat=dm.bad) # a dataset with duplication problems title("duplication problems") ## duprate boxplot duprateExpBoxplot(DupMat=dm) # a good looking plot duprateExpBoxplot(DupMat=dm.bad) # a dataset with duplication problems ## ----fig.width=7,fig.height=7------------------------------------------------- duprateExpDensPlot(DupMat=dm) # or, just to get the fitted model without plot fit <- duprateExpFit(DupMat=dm) cat("duprate at low read counts: ",fit$intercept,"\n", "progression of the duplication rate: ",fit$slope,fill=TRUE) ## ----eval=FALSE--------------------------------------------------------------- # ## INTERACTIVE: identify single points on screen (name="ID" column of dm) # duprateExpPlot(DupMat=dm) # a good looking plot # duprateExpIdentify(DupMat=dm) ## ----fig.width=14,fig.height=7------------------------------------------------ par(mfrow=c(1,2)) cols <- colorRampPalette(c("black","blue","green","yellow","red")) duprateExpPlot(DupMat=dm,addLegend=FALSE) duprateExpPlot(DupMat=dm.bad,addLegend=FALSE,nrpoints=10,nbin=500,colramp=cols) ## ----fig.width=7,fig.height=7------------------------------------------------- readcountExpBoxplot(DupMat=dm) ## ----fig.width=7,fig.height=7------------------------------------------------- expressionHist(DupMat=dm) ## ----------------------------------------------------------------------------- # calculate the fraction of multimappers per gene dm$mhRate <- (dm$allCountsMulti - dm$allCounts) / dm$allCountsMulti # how many genes are exclusively covered by multimappers sum(dm$mhRate == 1, na.rm=TRUE) # and how many have an RPKM (including multimappers) > 5 sum(dm$mhRate==1 & dm$RPKMMulti > 5, na.rm=TRUE) # and which are they? dm[dm$mhRate==1 & dm$RPKMMulti > 5, "ID"] ## ----fig.width=7,fig.height=7------------------------------------------------- hist(dm$mhRate, breaks=50, col="red", main="Frequency of multimapping rates", xlab="Multimapping rate per gene", ylab="Frequency") ## ----fig.width=7,fig.height=7------------------------------------------------- # comparison of multi-mapping RPK and uniquely-mapping RPK plot(log2(dm$RPK), log2(dm$RPKMulti), xlab="Reads per kb (uniquely mapping reads only)", ylab="Reads per kb (all including multimappers, non-weighted)" ) ## ---- eval=F------------------------------------------------------------------ # identify(log2(dm$RPK), # log2(dm$RPKMulti), # labels=dm$ID) ## ----eval=F------------------------------------------------------------------- # library(dupRadar) # library(biomaRt) # # ## for detailed explanations on biomaRt, please see the respective # ## vignette # # ## set up biomart connection for mouse (needs internet connection) # ensm <- useMart("ensembl") # ensm <- useDataset("mmusculus_gene_ensembl", mart=ensm) # # ## get a table which has the gene GC content for the IDs that have been # ## used to generate the table (depends on the GTF annotation that you # ## use) # tr <- getBM(attributes=c("mgi_symbol", "percentage_gc_content"), # values=TRUE, mart=ensm) # # ## create a GC vector with IDs as element names # mgi.gc <- tr$percentage_gc_content # names(mgi.gc) <- tr$mgi_symbol # # ## using dm demo duplication matrix that comes with the package # ## add GC content to our demo data and keep only subset for which we can # ## retrieve data # keep <- dm$ID %in% tr$mgi_symbol # dm.gc <- dm[keep,] # dm.gc$gc <- mgi.gc[dm.gc$ID] # # ## check distribution of annotated gene GC content (in %) # boxplot(dm.gc$gc, main="Gene GC content", ylab="% GC") ## ----eval=F------------------------------------------------------------------- # par(mfrow=c(1,2)) # # ## below median GC genes # duprateExpDensPlot(dm.gc[dm.gc$gc<=45,], main="below median GC genes") # # ## above median GC genes # duprateExpDensPlot(dm.gc[dm.gc$gc>=45,], main="above median GC genes") ## ----eval=F------------------------------------------------------------------- # #!/usr/bin/env Rscript # # ######################################## # ## # ## dupRadar shell script # ## call dupRadar R package from the shell for # ## easy integration into pipelines # ## # ## Holger Klein & Sergi Sayols # ## # ## https://github.com/ssayols/dupRadar # ## # ## input: # ## - _duplicate marked_ bam file # ## - gtf file # ## - parameters for duplication counting routine: # ## stranded, paired, outdir, threads. # ## # ######################################## # # library(dupRadar) # # #################### # ## # ## get name patterns from command line # ## # args <- commandArgs(TRUE) # # ## the bam file to analyse # bam <- args[1] # ## usually, same GTF file as used in htseq-count # gtf <- gsub("gtf=","",args[2]) # ## no|yes|reverse # stranded <- gsub("stranded=","",args[3]) # ## is a paired end experiment # paired <- gsub("paired=","",args[4]) # ## output directory # outdir <- gsub("outdir=","",args[5]) # ## number of threads to be used # threads <- as.integer(gsub("threads=","",args[6])) # # if(length(args) != 6) { # stop (paste0("Usage: ./dupRadar.sh ", # " paired=[yes|no] ", # "outdir=./ threads=1")) # } # # if(!file.exists(bam)) { # stop(paste("File",bam,"does NOT exist")) # } # # if(!file.exists(gtf)) { # stop(paste("File",gtf,"does NOT exist")) # } # # if(!file.exists(outdir)) { # stop(paste("Dir",outdir,"does NOT exist")) # } # # if(is.na(stranded) | !(grepl("no|yes|reverse",stranded))) { # stop("Stranded has to be no|yes|reverse") # } # # if(is.na(paired) | !(grepl("no|yes",paired))) { # stop("Paired has to be no|yes") # } # # if(is.na(threads)) { # stop("Threads has to be an integer number") # } # # stranded <- if(stranded == "no") 0 else if(stranded == "yes") 1 else 2 # # ## end command line parsing # ## # ######################################## # # ######################################## # ## # ## analyze duprates and create plots # ## # cat("Processing file ", bam, " with GTF ", gtf, "\n") # # ## calculate duplication rate matrix # dm <- analyzeDuprates(bam, # gtf, # stranded, # (paired == "yes"), # threads) # # ## produce plots # # ## duprate vs. expression smooth scatter # png(file=paste0(outdir,"/",gsub("(.*)\\.[^.]+","\\1",basename(bam)),"_dupRadar_drescatter.png"), # width=1000, height=1000) # duprateExpDensPlot(dm, main=basename(bam)) # dev.off() # # ## expression histogram # png(file=paste0(outdir,"/",gsub("(.*)\\.[^.]+","\\1",basename(bam)),"_dupRadar_ehist.png"), # width=1000, height=1000) # expressionHist(dm) # dev.off() # # ## duprate vs. expression boxplot # png(file=paste0(outdir,"/",gsub("(.*)\\.[^.]+","\\1",basename(bam)),"_dupRadar_drebp.png"), # width=1000, height=1000) # par(mar=c(10,4,4,2)+.1) # duprateExpBoxplot(dm, main=basename(bam)) # dev.off() ## ----citation----------------------------------------------------------------- citation("dupRadar") ## ----echo=FALSE--------------------------------------------------------------- sessionInfo()