1 Installation

Install the package from Bioconductor.

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("RcwlPipelines")

The development version is also available to download from Github.

BiocManager::install("hubentu/RcwlPipelines")

Load the package into the R session.

library(RcwlPipelines)

2 Tools and pipelines scripts

The R scripts to build the CWL tools and pipelines based on the Rcwl package are stored in the “src” folder with “tl_” and “pl_” prefix respectively. The function cwlTools can be used to collect the available scripts. The cachePath can be your existing cache directory or a new folder.

tools <- cwlTools(cachePath = tempdir())
tools
#> class: BiocFileCache
#> bfccache: /tmp/RtmpKausCT
#> bfccount: 77
#> For more information see: bfcinfo() or bfcquery()

The full paths can be pulled from the “fpath” column. The scripts can be viewed to demonstrate how the tools and pipelines were built.

library(dplyr)
bfcinfo(tools) %>% select(rname, fpath)
#> # A tibble: 77 x 2
#>    rname              fpath                                                  
#>    <chr>              <chr>                                                  
#>  1 ApplyBQSR          /tmp/RtmpEs3U2w/Rinst2cdb115cb09/RcwlPipelines/src/tl_…
#>  2 BaseRecalibrator   /tmp/RtmpEs3U2w/Rinst2cdb115cb09/RcwlPipelines/src/tl_…
#>  3 CalculateContamin… /tmp/RtmpEs3U2w/Rinst2cdb115cb09/RcwlPipelines/src/tl_…
#>  4 ColSeqArtifact     /tmp/RtmpEs3U2w/Rinst2cdb115cb09/RcwlPipelines/src/tl_…
#>  5 FilterMutectCalls  /tmp/RtmpEs3U2w/Rinst2cdb115cb09/RcwlPipelines/src/tl_…
#>  6 FilterOBias        /tmp/RtmpEs3U2w/Rinst2cdb115cb09/RcwlPipelines/src/tl_…
#>  7 Funcotator         /tmp/RtmpEs3U2w/Rinst2cdb115cb09/RcwlPipelines/src/tl_…
#>  8 GenomicsDB         /tmp/RtmpEs3U2w/Rinst2cdb115cb09/RcwlPipelines/src/tl_…
#>  9 GetPileupSummaries /tmp/RtmpEs3U2w/Rinst2cdb115cb09/RcwlPipelines/src/tl_…
#> 10 LoFreq             /tmp/RtmpEs3U2w/Rinst2cdb115cb09/RcwlPipelines/src/tl_…
#> # … with 67 more rows

The commands and docker containers from the wrapped tools are included in the metadata.

tls <- bfcinfo(tools) %>% filter(Type == "tool") %>%
    select(rname, Command, Container)
knitr::kable(tls)

rname	Command	Container
ApplyBQSR	gatk ApplyBQSR	broadinstitute/gatk:4.1.2.0
BaseRecalibrator	gatk BaseRecalibrator	broadinstitute/gatk:4.1.2.0
CalculateContamination	gatk CalculateContamination	broadinstitute/gatk:4.1.2.0
ColSeqArtifact	gatk CollectSequencingArtifactMetrics	broadinstitute/gatk:4.1.2.0
FilterMutectCalls	gatk FilterMutectCalls	broadinstitute/gatk:4.1.2.0
FilterOBias	gatk FilterByOrientationBias	broadinstitute/gatk:4.1.2.0
Funcotator	gatk Funcotator	broadinstitute/gatk:4.1.2.0
GenomicsDB	gatk GenomicsDBImport	broadinstitute/gatk:4.1.2.0
GetPileupSummaries	gatk GetPileupSummaries	broadinstitute/gatk:4.1.2.0
LoFreq	lofreq somatic	andreaswilm/lofreq:v2.1.2
MuSE	MuSEv1.0rc_submission_c039ffa call	marghoob/muse:1.0rc_c
Mutect2	gatk Mutect2	broadinstitute/gatk:4.1.2.0
PoN	gatk CreateSomaticPanelOfNormals	broadinstitute/gatk:4.1.2.0
STAR	STAR	hubentu/rcwl-rnaseq
SomaticSniper	/opt/somatic-sniper/build/bin/bam-somaticsniper	lethalfang/somaticsniper:1.0.5.0-2
VarDict	/opt/VarDict-1.5.1/bin/VarDict	lethalfang/vardictjava:1.5.1
VarScan2		serge2016/varscan:v0.1.1
VarScan2_processSomatic	java -jar /opt/varscan/VarScan.jar processSomatic	mgibio/varscan-cwl:v2.4.2-samtools1.3.1
VarScan2_somatic	java -jar /opt/varscan/VarScan.jar somatic	mgibio/varscan-cwl:v2.4.2-samtools1.3.1
VarScan2_somaticFilter	java -jar /opt/varscan/VarScan.jar somaticFilter	mgibio/varscan-cwl:v2.4.2-samtools1.3.1
bcfview	bcftools view	biocontainers/bcftools:v1.5_cv3
bgzip	bgzip -c	biocontainers/tabix:v1.3.2-2-deb_cv1
blastn	blastn	biocontainers/blast:v2.2.31_cv2
bowtie2	bowtie2	biocontainers/bowtie2:v2.2.9_cv2
bowtie2_build	bowtie2-build	biocontainers/bowtie2:v2.2.9_cv2
bwa	bwa mem	biocontainers/bwa:0.7.15_cv4
bwa_index	bwa index	biocontainers/bwa:v0.7.15_cv4
cutadapt	cutadapt	kfdrc/cutadapt
fastqc	fastqc	hubentu/rcwl-rnaseq
featureCounts	featureCounts	hubentu/rcwl-rnaseq
geneBody_coverage	geneBody_coverage.py	hubentu/rcwl-rnaseq
genePredToBed	genePredToBed	hubentu/rcwl-rnaseq
gtfToGenePred	gtfToGenePred	hubentu/rcwl-rnaseq
hisat2_align	hisat2	biocontainers/hisat2:v2.0.5-1-deb_cv1
hisat2_build	hisat2-build	biocontainers/hisat2:v2.0.5-1-deb_cv1
htseq	htseq-count	genomicpariscentre/htseq
lancet	/lancet-1.0.7/lancet	kfdrc/lancet:1.0.7
makeblastdb	makeblastdb	biocontainers/blast:v2.2.31_cv2
manta	configManta.py	cmopipeline/strelka2_manta
markdup	picard MarkDuplicates	biocontainers/picard:2.3.0
mergeBam	picard MergeSamFiles	biocontainers/picard:2.3.0
multiqc	multiqc	hubentu/rcwl-rnaseq
mvOut	R function	NA
neusomatic_call	python /opt/neusomatic/neusomatic/python/call.py	msahraeian/neusomatic
neusomatic_postprocess	python /opt/neusomatic/neusomatic/python/postprocess.py	msahraeian/neusomatic
neusomatic_preprocess	python /opt/neusomatic/neusomatic/python/preprocess.py	msahraeian/neusomatic
polysolver	bash /home/polysolver/scripts/shell_call_hla_type	sachet/polysolver:v4
read_distribution	read_distribution.py	hubentu/rcwl-rnaseq
runWDL	java	NA
salmon_index	salmon index	combinelab/salmon
salmon_quant	salmon quant	combinelab/salmon
sam2bam	samtools view	biocontainers/samtools:v1.7.0_cv3
samtools_flagstat	samtools flagstat	biocontainers/samtools:v1.7.0_cv3
samtools_index	samtools index	biocontainers/samtools:v1.7.0_cv3
samtools_mpileup	samtools mpileup	biocontainers/samtools:v1.7.0_cv3
samtools_stats	samtools stats	biocontainers/samtools:v1.7.0_cv3
sortBam	samtools sort	biocontainers/samtools:v1.7.0_cv3
starFusion	/usr/local/src/STAR-Fusion/STAR-Fusion	trinityctat/ctatfusion
strelka	configureStrelkaSomaticWorkflow.py	cmopipeline/strelka2_manta
tabix_index	tabix	biocontainers/tabix:v1.3.2-2-deb_cv1

Numbers of pipelines have been built on the wrapped tools. Here we show the built-in pipelines and their corresponding steps.

pls <- bfcinfo(tools) %>% filter(Type == "pipeline") %>% pull(rname)
pls <- setdiff(pls, "mc3")
Steps <- sapply(pls, function(x) {
    cwl <- get(x, envir = environment())
    ss <- names(steps(cwl))
    paste(ss, collapse = "+")
})

knitr::kable(data.frame(Pipelines = pls, Steps), row.names = FALSE)

Pipelines	Steps
BaseRecal	BaseRecalibrator+ApplyBQSR+samtools_index+samtools_flagstat+samtools_stats
GAlign	fqJson+fq2ubam+ubam2bamJson+align+mvOut
GPoN	GenomicsDB+PoN
Mutect2PL	Mutect2+GetPileupSummariesT+GetPileupSummariesN+CalculateContamination+FilterMutectCalls+ColSeqArtifact+FilterOBias+bcfview
RSeQC	gtfToGenePred+genePredToBed+r_distribution+gCoverage
VarScan2Somatic	mpileupT+mpileupN+somatic+processSomatic+somaticFilter
alignMerge	bwaAlign+mergeBamDup
bwaAlign	bwa+sam2bam+sortBam+idxBam
bwaMMRecal	bwaAlign+mergeBamDup+BaseRecal
bwaMRecal	bwaAlign+markdup+BaseRecal
hapCall	hapJson+HC+mvOut
jdCall	jdJson+JD+mvOut
mantaStrelka	manta+strelka
mergeBamDup	mergeBam+markdup+samtools_index+samtools_flagstat
neusomatic	preprocess+call+postprocess
rnaseq_Sf	fastqc+STAR+sortBam+samtools_index+samtools_flagstat+featureCounts+gtfToGenePred+genePredToBed+r_distribution+gCoverage

3 Build a pipeline

We can develop a pipline by utilizing the available tools. For example, a simple alignment pipelines with mapping and marking duplicates can be built from the tools.

First, we check whether the required tools (bwa, samtools and picard markduplicates) are available.

bfcquery(tools, "bwa|sam2bam|sortBam|samtools_index|markdup") %>%
    filter(Type == "tool") %>%
    select(rname, Command, Container)
#> # A tibble: 6 x 3
#>   rname          Command               Container                        
#>   <chr>          <chr>                 <chr>                            
#> 1 bwa            bwa mem               biocontainers/bwa:0.7.15_cv4     
#> 2 bwa_index      bwa index             biocontainers/bwa:v0.7.15_cv4    
#> 3 markdup        picard MarkDuplicates biocontainers/picard:2.3.0       
#> 4 sam2bam        samtools view         biocontainers/samtools:v1.7.0_cv3
#> 5 samtools_index samtools index        biocontainers/samtools:v1.7.0_cv3
#> 6 sortBam        samtools sort         biocontainers/samtools:v1.7.0_cv3

Next, we define the input parameters.

p1 <- InputParam(id = "threads", type = "int")
p2 <- InputParam(id = "RG", type = "string")
p3 <- InputParam(id = "Ref", type = "string")
p4 <- InputParam(id = "FQ1", type = "File")
p5 <- InputParam(id = "FQ2", type = "File?")

Then we define the pipeline steps, from raw fastqs to duplicates marked alignments.

## bwa
s1 <- Step(id = "bwa", run = bwa,
           In = list(threads = "threads",
                     RG = "RG",
                     Ref = "Ref",
                     FQ1 = "FQ1",
                     FQ2 = "FQ2"))
## sam to bam
s2 <- Step(id = "sam2bam", run = sam2bam,
           In = list(sam = "bwa/sam"))
## sort bam
s3 <- Step(id = "sortBam", run = sortBam,
           In = list(bam = "sam2bam/bam"))
## mark duplicates
s4 <- Step(id = "markdup", run = markdup,
           In = list(ibam = "sortBam/sbam",
                     obam = list(
                         valueFrom="$(inputs.ibam.nameroot).mdup.bam"),
                     matrix = list(
                         valueFrom="$(inputs.ibam.nameroot).markdup.txt")))
## index bam
s5 <- Step(id = "idxBam", run = samtools_index,
           In = list(bam = "markdup/mBam"))

Last, we define the outputs and connect the steps to a new pipeline.

req1 <- list(class = "StepInputExpressionRequirement")
req2 <- list(class = "InlineJavascriptRequirement")
## outputs
o1 <- OutputParam(id = "Bam", type = "File", outputSource = "markdup/mBam")
o2 <- OutputParam(id = "Idx", type = "File", outputSource = "idxBam/idx")
## stepParam
Align <- cwlStepParam(requirements = list(req1, req2),
                      inputs = InputParamList(p1, p2, p3, p4, p5),
                      outputs = OutputParamList(o1, o2))
## build pipeline
Align <- Align + s1 + s2 + s3 + s4 + s5

The pipeline is ready for use. We can plot the pipeline with plotCWL from the Rcwl package.

plotCWL(Align)

4 Pipelines summary

There are mainly 4 pipelines are collected in this package. Here is a brief introduction to these pipelines. More pipelines and tools are expected to be included in the future.

4.1 DNASeq alignment pipeline

The pipeline can be used to preprocess DNA sequences in fastq format. It can take paired fastqs, read groups from multiple batches as input.

inputs(alignMerge)
#> inputs:
#>   idBam (string):  
#>   RG (string[]):  
#>   threads (int):  
#>   Ref (File):  
#>   FQ1s (File[]):  
#>   FQ2s (File[]):

The pipeline includes two steps and several jobs will be run in each step.

bwaAlign: bwa alignment by read groups.

runs(runs(alignMerge)[[1]])
#> List of length 4
#> names(4): bwa sam2bam sortBam idxBam

bwa: To align fastqs and read groups to reference genome with bwa.
sam2bam: To convert the alignments in “sam” format to “bam” format with samtools.
sortBam: To sort the “bam” file by coordinates with samtools.
idxBam: To index “bam” file with samtools.

mergeBamDup: Merge by samples and markduplicates.

runs(runs(alignMerge)[[2]])
#> List of length 4
#> names(4): mergeBam markdup samtools_index samtools_flagstat

mergeBam: To merge bam files from multiple batches with picard.
markdup: To mark duplicates with picard.
samtools_index: To index bam file with samtools.
samtools_flagstat: To summarize flags in bam with samtools.

The final bam files after duplicates marked, bam index, duplicates matrix, and flag statistics summary will be in the output folder.

outputs(alignMerge)
#> outputs:
#> oBam:
#>   type: File
#>   outputSource: mergeBamDup/oBam
#> matrix:
#>   type: File
#>   outputSource: mergeBamDup/matrix
#> Idx:
#>   type: File
#>   outputSource: mergeBamDup/Idx
#> stat:
#>   type: File
#>   outputSource: mergeBamDup/stat

Here you can find an example to run the pipeline.

https://hubentu.github.io/others/Rcwl/application.html#dnaseq-alignment-pipeline

4.2 RNASeq pipeline

The pipeline was built with reads quality summary, STAR alignment, quantification by featureCounts and RSeQC quality control. Here are the inputs.

inputs(rnaseq_Sf)
#> inputs:
#>   in_seqfiles (File[]):  
#>   in_prefix (string):  
#>   in_genomeDir (Directory):  
#>   in_GTFfile (File):  
#>   in_runThreadN (int):  1

The pipeline includes 6 steps.

fastqc: To run quality summary for raw fastqs with fastqc.
STAR: To align fastqs with STAR.
samtools_index: To index aligned bam file.
samtools_flagstat: To summary alignment flags.
featureCounts: To quantify gene abundances.
RSeQC: Several steps included.
- gtfToGenePred: To convert GTF annotation to “genePred” format.
- genePredToBed: To convert “genePred” annotation to “bed” format.
- r_distribution: To run reads distribution over genome features.
- gCoverage: To summarize read coverage over gene body.

The outputs and logs from alignment, quantification and QC steps are collected together to the output folder. A final QC report could be generated by multiqc, which is also available in the data package.

An example to run the pipeline.

https://hubentu.github.io/others/Rcwl/application.html#rnaseq-pipeline

4.3 MC3 somatic variant calling pipeline

The Multi-Center Mutation Calling in Multiple Cancers project (MC3) pipeline was developed by the TCGA group, which was implemented with CWL and available at https://github.com/OpenGenomics/mc3. Two major steps, variant calling and merging VCFs to MAF, was imported to the mc3 pipeline in this package.

The steps of the pipeline was built on the CWL files from its github repository, which were also contained in the package. Thereforce, we need the load the pipleine by sourcing it from the script.

The pipeline requires a paired of tumor/normal BAM files and genomic annotation files as inputs.

bfcinfo(tools) %>% filter(rname == "mc3") %>% pull(rpath) %>% source
inputs(mc3)
#> inputs:
#>   tumorID (string):  
#>   normalID (string):  
#>   tumor (File):  
#>   normal (File):  
#>   bed_file (File?):  
#>   centromere (File):  
#>   cosmic (File):  
#>   dbsnp (File):  
#>   refFasta (File):  
#>   vepData (Directory):

Two steps are included.

call_variants: To call variants by 7 pipelines.
covert: To merge VCFs and convert to MAF

The merged VCF and converted MAF files will be collected to the output folder.

outputs(mc3)
#> outputs:
#> outmaf:
#>   type: File
#>   outputSource: convert/outmaf
#> outvcf:
#>   type: File
#>   outputSource: convert/vepvcf

Here is an examples to run the pipeline.
https://hubentu.github.io/others/Rcwl/application.html#mc3-somatic-variant-calling-pipeline

4.4 GATK4 germline variant calling pipeline

The GATK4 best practice pipeline for germline variant calling was implemented with Workflow Description Language (WDL). We wrapped the WDL pipeline into 3 steps with Rcwl. The details of the pipeline can be find here: https://software.broadinstitute.org/gatk/best-practices/workflow?id=11145

GAlign GATK alignment.

The fastqs, sample information and customized json files for WDL are required as inputs. Multiple steps will run in this step, including bwa alignment, mark duplicates and base quality recalibration. GATK ready BAM files will be collected to the output directory.

hapCall HaplotypeCaller.

The GATK ready BAM and customized json files are inputs in this step. The local paths of GATK bundle files are required to be modified in your json file. A “gVCF” files will be generated.

jdCall Joint variant discovery

This step will combine the “gVCF” files and then call germline variants in all samples. The paths of the local bundle files are also required to be changed in the json template file. The final VCF file of germline variants will be collected.

An example to run the pipeline.
https://hubentu.github.io/others/Rcwl/application.html#gatk4-germline-variant-calling-pipeline

4.5 GATK4 Somatic short variant pipeline

The GATK4 Mutect2 pipeline for germline variant calling was also available in WDL. The pipeline was reimplemented with Rcwl based on the best practice documents. https://software.broadinstitute.org/gatk/best-practices/workflow?id=11146

Variant calling on normal samples

First, we need to run Mutect2 in tumor-only mode for each normal sample by the tool Mutect2. The argument “–max-mnp-distance 0” is required to be added because the next step, “GenpmicsDBImport”, can’t handle MNPs.

arguments(Mutect2) <- list("--max-mnp-distance", "0")
Mutect2
#> class: cwlParam 
#>  cwlClass: CommandLineTool 
#>  cwlVersion: v1.0 
#>  baseCommand: gatk Mutect2 
#> requirements:
#> - class: DockerRequirement
#>   dockerPull: broadinstitute/gatk:4.1.2.0
#> arguments: --max-mnp-distance 0 
#> inputs:
#>   tbam (File): -I 
#>   nbam (File?): -I 
#>   Ref (File): -R 
#>   normal (string?): -normal 
#>   germline (File?): --germline-resource 
#>   pon (File?): --panel-of-normals 
#>   interval (File?): -L 
#>   out (string): -O 
#> outputs:
#> vout:
#>   type: File
#>   secondaryFiles:
#>   - .idx
#>   - .stats
#>   outputBinding:
#>     glob: $(inputs.out)

Panel of normals

This step is to create a GenomicsDB and then combine to a VCF output for the panel of normals from all the normal Mutect2 calls. A cwl pipeline GPoN was built to create the panel VCF.

runs(GPoN)
#> List of length 2
#> names(2): GenomicsDB PoN

Mutect2 and variant filtering

This pipeline includes two main steps. First we call a large set of candidate somatic variants, then filter them by estimated contamination and orientation bias artifacts. We can plot the Mutect2PL pipeline to show the details.

plotCWL(Mutect2PL)

5 SessionInfo

sessionInfo()
#> R version 3.6.1 (2019-07-05)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 18.04.3 LTS
#> 
#> Matrix products: default
#> BLAS:   /home/biocbuild/bbs-3.9-bioc/R/lib/libRblas.so
#> LAPACK: /home/biocbuild/bbs-3.9-bioc/R/lib/libRlapack.so
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#> [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
#> [8] methods   base     
#> 
#> other attached packages:
#> [1] dplyr_0.8.3          RcwlPipelines_1.0.11 BiocFileCache_1.8.0 
#> [4] dbplyr_1.4.2         Rcwl_1.0.11          S4Vectors_0.22.0    
#> [7] BiocGenerics_0.30.0  yaml_2.2.0           BiocStyle_2.12.0    
#> 
#> loaded via a namespace (and not attached):
#>  [1] bit64_0.9-7         RColorBrewer_1.1-2  progress_1.2.2     
#>  [4] httr_1.4.1          tools_3.6.1         backports_1.1.4    
#>  [7] utf8_1.1.4          R6_2.4.0            DBI_1.0.0          
#> [10] lazyeval_0.2.2      colorspace_1.4-1    withr_2.1.2        
#> [13] tidyselect_0.2.5    gridExtra_2.3       prettyunits_1.0.2  
#> [16] bit_1.1-14          curl_4.0            compiler_3.6.1     
#> [19] cli_1.1.0           influenceR_0.1.0    bookdown_0.13      
#> [22] scales_1.0.0        checkmate_1.9.4     readr_1.3.1        
#> [25] rappdirs_0.3.1      stringr_1.4.0       digest_0.6.20      
#> [28] rmarkdown_1.15      R.utils_2.9.0       pkgconfig_2.0.2    
#> [31] htmltools_0.3.6     highr_0.8           htmlwidgets_1.3    
#> [34] rlang_0.4.0         rstudioapi_0.10     RSQLite_2.1.2      
#> [37] shiny_1.3.2         visNetwork_2.0.8    jsonlite_1.6       
#> [40] BiocParallel_1.18.1 rgexf_0.15.3        R.oo_1.22.0        
#> [43] magrittr_1.5        Rcpp_1.0.2          munsell_0.5.0      
#> [46] fansi_0.4.0         viridis_0.5.1       R.methodsS3_1.7.1  
#> [49] stringi_1.4.3       grid_3.6.1          blob_1.2.0         
#> [52] promises_1.0.1      crayon_1.3.4        hms_0.5.1          
#> [55] batchtools_0.9.11   zeallot_0.1.0       knitr_1.24         
#> [58] pillar_1.4.2        igraph_1.2.4.1      base64url_1.4      
#> [61] codetools_0.2-16    XML_3.98-1.20       glue_1.3.1         
#> [64] evaluate_0.14       downloader_0.4      data.table_1.12.2  
#> [67] BiocManager_1.30.4  vctrs_0.2.0         httpuv_1.5.1       
#> [70] gtable_0.3.0        purrr_0.3.2         tidyr_0.8.3        
#> [73] assertthat_0.2.1    ggplot2_3.2.1       xfun_0.9           
#> [76] mime_0.7            xtable_1.8-4        later_0.8.0        
#> [79] viridisLite_0.3.0   tibble_2.1.3        memoise_1.1.0      
#> [82] Rook_1.1-1          DiagrammeR_1.0.1    brew_1.0-6

Rcwl Pipelines

2019-09-04