Advanced Analyses

Prepare the environment

Firstly we need to make sure the tallyFile has been downloaded correctly and we load the sample data. See the vignette h5vc for more in-depth explanations of this step.

library(h5vc)
library(rhdf5)
library(GenomicRanges)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following objects are masked from 'package:bit64':
## 
##     match, order, rank
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     Filter, Find, Map, Position, Reduce, anyDuplicated, append,
##     as.data.frame, as.vector, cbind, colnames, duplicated, eval,
##     evalq, get, intersect, is.unsorted, lapply, mapply, match,
##     mget, order, paste, pmax, pmax.int, pmin, pmin.int, rank,
##     rbind, rep.int, rownames, sapply, setdiff, sort, table,
##     tapply, union, unique, unlist
## 
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## 
## The following objects are masked from 'package:reshape':
## 
##     expand, rename
## 
## The following objects are masked from 'package:plyr':
## 
##     desc, rename
## 
## Loading required package: XVector
## 
## Attaching package: 'XVector'
## 
## The following object is masked from 'package:plyr':
## 
##     compact
tallyFile <- system.file("extdata", "example.tally.hfs5", package = "h5vcData")
sampleData <- getSampleData(tallyFile, "/ExampleStudy/16")

Calling Variants

We use the callVariantsPaired function to perform variant calling in a cohort paired tumour/normal samples. This function has the following signature:

callVariantsPaired <- function (data, sampledata, cl = vcConfParams())

Where vcConfParams() is a function used to configure the behaviour of the calling function, it returns a list of paramters.

vcConfParams()
## $minStrandCov
## [1] 5
## 
## $maxStrandCov
## [1] 200
## 
## $minStrandAltSupport
## [1] 2
## 
## $maxStrandAltSupportControl
## [1] 0
## 
## $minStrandDelSupport
## [1] 2
## 
## $maxStrandDelSupportControl
## [1] 0
## 
## $minStrandCovControl
## [1] 5
## 
## $maxStrandCovControl
## [1] 200
## 
## $bases
## [1] 5 6 7 8
## 
## $returnDataPoints
## [1] FALSE
## 
## $annotateWithBackground
## [1] FALSE
## 
## $mergeCalls
## [1] FALSE
## 
## $mergeAggregator
## function (x, ...) 
## UseMethod("mean")
## <bytecode: 0x1d47620>
## <environment: namespace:base>
## 
## $pValueAggregator
## function (..., na.rm = FALSE)  .Primitive("max")
calls16 <- h5dapply(
  filename = tallyFile,
  group = "/ExampleStudy/16",
  blocksize = 10000,
  FUN = callVariantsPaired,
  sampledata = sampleData,
  cl = vcConfParams( returnDataPoints = TRUE, annotateWithBackground = TRUE ),
  names = c( "Coverages", "Counts", "Reference" ),
  range = c(29950000,30000000)
  )

calls22 <- h5dapply(
  filename = tallyFile,
  group = "/ExampleStudy/22",
  blocksize = 10000,
  FUN = callVariantsPaired,
  sampledata = sampleData,
  cl = vcConfParams( returnDataPoints = TRUE, annotateWithBackground = TRUE ),
  names = c( "Coverages", "Counts", "Reference" ),
  range = c(39350000,39400000)
  )

calls <- rbind(do.call(rbind,calls16), do.call(rbind,calls22))

Plotting Variants

We can use the mismatchPlot function to create plots of the suspected variant regions. First we extract a set of interesting positions:

calls$Patient = sampleData$Patient[match(calls$Sample, sampleData$Sample)]
calls <- subset(calls, pValueFwd < 0.05 & pValueRev < 0.05)
calls$varID = paste(calls$seqnames, calls$start, calls$altAllele, calls$Patient)
calls <- calls[!duplicated(calls$varID), ]
calls
##                     Chrom    Start      End        Sample altAllele
## 29970000:29979999.1    16 29979628 29979628 PT8PrimaryDNA         A
## 39350000:39359999.2    22 39357400 39357400 PT5PrimaryDNA         A
##                     refAllele caseCountFwd caseCountRev caseCoverageFwd
## 29970000:29979999.1         G            3            9              19
## 39350000:39359999.2         T            9            5              18
##                     caseCoverageRev controlCountFwd controlCountRev
## 29970000:29979999.1              23               0               0
## 39350000:39359999.2              16               0               0
##                     controlCoverageFwd controlCoverageRev
## 29970000:29979999.1                 34                 25
## 39350000:39359999.2                 17                 14
##                     backgroundFrequencyFwd backgroundFrequencyRev
## 29970000:29979999.1                      0                      0
## 39350000:39359999.2                      0                      0
##                     pValueFwd pValueRev  Patient        varID
## 29970000:29979999.1         0         0 Patient8   A Patient8
## 39350000:39359999.2         0         0 Patient5   A Patient5

The remaining two calls have good significance (binom.test against the estimated background error rate, i.e. the mean of the mismatch rates of all control samples in the cohort) and we filter out double entries that come from the same patient. You will see in the plots below, that patient 5 has the variant in both the primary and the relapse tumour. Now we can generate the mismatch plots.

windowsize <- 35
for (i in seq(nrow(calls))) {
    position <- calls$Start[i]
    data <- h5dapply(filename = tallyFile, group = paste("/ExampleStudy", calls$Chrom[i], 
        sep = "/"), blocksize = windowsize * 3, names = c("Coverages", "Counts", 
        "Deletions"), range = c(position - windowsize, position + windowsize))[[1]]
    p <- mismatchPlot(data = data, sampledata = sampleData, samples = sampleData$Sample[sampleData$Patient == 
        calls$Patient[i]], windowsize = windowsize, position = position)
    print(p)
}

plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6

Plotting Arbitrary Datasets

Sometimes you want more flexibility that what mismatchPlot can provide, for this case the geom_h5vc function can be used to create an independed ggplot2 plot layer from a dataset that can be added to any ggplot object, e.g. an existing mismatchPlot or an empty plot. The example below demonstrates this.

library(h5vc)
library(ggplot2)
tallyFile <- system.file("extdata", "example.tally.hfs5", package = "h5vcData")
sampleData <- getSampleData(tallyFile, "/ExampleStudy/16")
position <- 29979629
windowsize <- 30
samples <- sampleData$Sample[sampleData$Patient == "Patient8"]
data <- h5dapply(filename = tallyFile, group = "/ExampleStudy/16", blocksize = windowsize * 
    3, names = c("Coverages", "Counts", "Deletions"), range = c(position - windowsize, 
    position + windowsize))[[1]]  #choose blocksize larger than range so that all needed data is collected as one block
# Summing up all mismatches irrespective of the alternative allele
data$CountsAggregate = colSums(data$Counts)
# Simple overview plot showing number of mismatches per position
p <- ggplot() + geom_h5vc(data = data, sampledata = sampleData, windowsize = 35, 
    position = 500, dataset = "Coverages", fill = "gray") + geom_h5vc(data = data, 
    sampledata = sampleData, windowsize = 35, position = 500, dataset = "CountsAggregate", 
    fill = "#D50000") + facet_wrap(~Sample, ncol = 2)
print(p)

plot of chunk unnamed-chunk-7

Mutation Spectrum Analysis

We can also easily perform mutation spectrum analysis by using the function mutationSpectrum which works on a set of variant calls in a data.frame form as it would be produced by a call to e.g. callVariantsPaired and a tallyFile parameter specifying hte location of a tally file as well as a context parameter. The context parameter specifies how much sequence context should be taken into account for the mutation spectrum. An example with context 1 (i.e. one base up- and one base downstream of the variant) is shown below.

We skip the calling of variants (since ittakes al ong time on larger regions) and load the corresponding example data instead, we would generate the calls like this:

# loading library and example data
library(h5vc)
tallyFile <- system.file("extdata", "example.tally.hfs5", package = "h5vcData")
sampleData <- getSampleData(tallyFile, "/ExampleStudy/16")
windowsize <- 1e+05
samples <- sampleData$Sample[sampleData$Patient == "Patient8"]
# Calling Variants
vars <- h5dapply(filename = tallyFile, group = "/ExampleStudy/16", blocksize = 10000, 
    FUN = callVariantsPaired, sampledata = sampleData, cl = vcConfParams(returnDataPoints = TRUE), 
    names = c("Coverages", "Counts", "Reference"), range = c(2.9e+07, 3e+07), 
    verbose = TRUE)

variantCalls <- do.call(rbind, vars)

vars <- h5dapply(filename = tallyFile, group = "/ExampleStudy/22", blocksize = 10000, 
    FUN = callVariantsPaired, sampledata = sampleData, cl = vcConfParams(returnDataPoints = TRUE), 
    names = c("Coverages", "Counts", "Reference"), range = c(3.8e+07, 4e+07), 
    verbose = TRUE)

variantCalls <- rbind(variantCalls, do.call(rbind, vars))
rownames(variantCalls) <- NULL

save(variantCalls, file = "example.variants.RDa")

The mutation spectrum can now be extracted from the tallyFile and variant calls like so:

data("example.variants", package = "h5vcData")
ms <- mutationSpectrum(variantCalls, tallyFile, "/ExampleStudy")
head(ms)
##   refAllele altAllele        Sample Prefix Suffix Transition Context
## 1         G         T PT5PrimaryDNA      T      T        G>T     TGT
## 2         C         A PT5PrimaryDNA      A      A        G>T     ACA
## 3         C         G PT8PrimaryDNA      C      C        C>G     CCC
## 4         A         C PT8PrimaryDNA      G      C        A>C     GAC
## 5         A         G PT5PrimaryDNA      G      G        A>G     GAG
## 6         C         T PT5PrimaryDNA      G      C        C>T     GCC

An overview is given in the following plot:

suppressPackageStartupMessages(library(ggplot2))
p <- ggplot(ms, aes(x = paste(Prefix, Suffix, sep = "."), fill = Transition)) + 
    geom_bar(width = 0.5) + facet_grid(Sample ~ Transition) + scale_y_discrete(name = "Number of Events") + 
    scale_x_discrete(name = "Mutation Context") + theme(axis.text = element_text(size = 14), 
    axis.title = element_text(size = 16), axis.text.x = element_text(angle = 90, 
        hjust = 0), strip.text = element_text(size = 14), legend.position = "none")
print(p)

plot of chunk mutationSpectrumPlot

The y axis shows the number of this kind of mutation and the x-axis shows the sequence context (of size 1 in this case). At this point one could go on to perform a NMF-based analysis of the mutation spectra as is described in Alexandrov et. al. - Signatures of mutational processes in human cancer