Firstly we need to make sure the tallyFile has been downloaded correctly and we load the sample data. See the vignette h5vc for more in-depth explanations of this step.
library(h5vc)
library(rhdf5)
library(GenomicRanges)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following objects are masked from 'package:bit64':
##
## match, order, rank
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## Filter, Find, Map, Position, Reduce, anyDuplicated, append,
## as.data.frame, as.vector, cbind, colnames, duplicated, eval,
## evalq, get, intersect, is.unsorted, lapply, mapply, match,
## mget, order, paste, pmax, pmax.int, pmin, pmin.int, rank,
## rbind, rep.int, rownames, sapply, setdiff, sort, table,
## tapply, union, unique, unlist
##
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
##
## The following objects are masked from 'package:reshape':
##
## expand, rename
##
## The following objects are masked from 'package:plyr':
##
## desc, rename
##
## Loading required package: XVector
##
## Attaching package: 'XVector'
##
## The following object is masked from 'package:plyr':
##
## compact
tallyFile <- system.file("extdata", "example.tally.hfs5", package = "h5vcData")
sampleData <- getSampleData(tallyFile, "/ExampleStudy/16")
We use the callVariantsPaired
function to perform variant calling in a cohort paired tumour/normal samples.
This function has the following signature:
callVariantsPaired <- function (data, sampledata, cl = vcConfParams())
Where vcConfParams()
is a function used to configure the behaviour of the calling function, it returns a list of paramters.
vcConfParams()
## $minStrandCov
## [1] 5
##
## $maxStrandCov
## [1] 200
##
## $minStrandAltSupport
## [1] 2
##
## $maxStrandAltSupportControl
## [1] 0
##
## $minStrandDelSupport
## [1] 2
##
## $maxStrandDelSupportControl
## [1] 0
##
## $minStrandCovControl
## [1] 5
##
## $maxStrandCovControl
## [1] 200
##
## $bases
## [1] 5 6 7 8
##
## $returnDataPoints
## [1] FALSE
##
## $annotateWithBackground
## [1] FALSE
##
## $mergeCalls
## [1] FALSE
##
## $mergeAggregator
## function (x, ...)
## UseMethod("mean")
## <bytecode: 0x1d47620>
## <environment: namespace:base>
##
## $pValueAggregator
## function (..., na.rm = FALSE) .Primitive("max")
calls16 <- h5dapply(
filename = tallyFile,
group = "/ExampleStudy/16",
blocksize = 10000,
FUN = callVariantsPaired,
sampledata = sampleData,
cl = vcConfParams( returnDataPoints = TRUE, annotateWithBackground = TRUE ),
names = c( "Coverages", "Counts", "Reference" ),
range = c(29950000,30000000)
)
calls22 <- h5dapply(
filename = tallyFile,
group = "/ExampleStudy/22",
blocksize = 10000,
FUN = callVariantsPaired,
sampledata = sampleData,
cl = vcConfParams( returnDataPoints = TRUE, annotateWithBackground = TRUE ),
names = c( "Coverages", "Counts", "Reference" ),
range = c(39350000,39400000)
)
calls <- rbind(do.call(rbind,calls16), do.call(rbind,calls22))
We can use the mismatchPlot
function to create plots of the suspected variant regions.
First we extract a set of interesting positions:
calls$Patient = sampleData$Patient[match(calls$Sample, sampleData$Sample)]
calls <- subset(calls, pValueFwd < 0.05 & pValueRev < 0.05)
calls$varID = paste(calls$seqnames, calls$start, calls$altAllele, calls$Patient)
calls <- calls[!duplicated(calls$varID), ]
calls
## Chrom Start End Sample altAllele
## 29970000:29979999.1 16 29979628 29979628 PT8PrimaryDNA A
## 39350000:39359999.2 22 39357400 39357400 PT5PrimaryDNA A
## refAllele caseCountFwd caseCountRev caseCoverageFwd
## 29970000:29979999.1 G 3 9 19
## 39350000:39359999.2 T 9 5 18
## caseCoverageRev controlCountFwd controlCountRev
## 29970000:29979999.1 23 0 0
## 39350000:39359999.2 16 0 0
## controlCoverageFwd controlCoverageRev
## 29970000:29979999.1 34 25
## 39350000:39359999.2 17 14
## backgroundFrequencyFwd backgroundFrequencyRev
## 29970000:29979999.1 0 0
## 39350000:39359999.2 0 0
## pValueFwd pValueRev Patient varID
## 29970000:29979999.1 0 0 Patient8 A Patient8
## 39350000:39359999.2 0 0 Patient5 A Patient5
The remaining two calls have good significance (binom.test
against the estimated background error rate, i.e. the mean of the mismatch rates of all control samples in the cohort) and we filter out double entries that come from the same patient. You will see in the plots below, that patient 5 has the variant in both the primary and the relapse tumour.
Now we can generate the mismatch plots.
windowsize <- 35
for (i in seq(nrow(calls))) {
position <- calls$Start[i]
data <- h5dapply(filename = tallyFile, group = paste("/ExampleStudy", calls$Chrom[i],
sep = "/"), blocksize = windowsize * 3, names = c("Coverages", "Counts",
"Deletions"), range = c(position - windowsize, position + windowsize))[[1]]
p <- mismatchPlot(data = data, sampledata = sampleData, samples = sampleData$Sample[sampleData$Patient ==
calls$Patient[i]], windowsize = windowsize, position = position)
print(p)
}
Sometimes you want more flexibility that what mismatchPlot
can provide, for this case the geom_h5vc
function can be used to create an independed ggplot2
plot layer from a dataset that can be added to any ggplot
object, e.g. an existing mismatchPlot
or an empty plot.
The example below demonstrates this.
library(h5vc)
library(ggplot2)
tallyFile <- system.file("extdata", "example.tally.hfs5", package = "h5vcData")
sampleData <- getSampleData(tallyFile, "/ExampleStudy/16")
position <- 29979629
windowsize <- 30
samples <- sampleData$Sample[sampleData$Patient == "Patient8"]
data <- h5dapply(filename = tallyFile, group = "/ExampleStudy/16", blocksize = windowsize *
3, names = c("Coverages", "Counts", "Deletions"), range = c(position - windowsize,
position + windowsize))[[1]] #choose blocksize larger than range so that all needed data is collected as one block
# Summing up all mismatches irrespective of the alternative allele
data$CountsAggregate = colSums(data$Counts)
# Simple overview plot showing number of mismatches per position
p <- ggplot() + geom_h5vc(data = data, sampledata = sampleData, windowsize = 35,
position = 500, dataset = "Coverages", fill = "gray") + geom_h5vc(data = data,
sampledata = sampleData, windowsize = 35, position = 500, dataset = "CountsAggregate",
fill = "#D50000") + facet_wrap(~Sample, ncol = 2)
print(p)
We can also easily perform mutation spectrum analysis by using the function mutationSpectrum
which works on a set of variant calls in a data.frame
form as it would be produced by a call to e.g. callVariantsPaired
and a tallyFile parameter specifying hte location of a tally file as well as a context parameter.
The context parameter specifies how much sequence context should be taken into account for the mutation spectrum. An example with context 1 (i.e. one base up- and one base downstream of the variant) is shown below.
We skip the calling of variants (since ittakes al ong time on larger regions) and load the corresponding example data instead, we would generate the calls like this:
# loading library and example data
library(h5vc)
tallyFile <- system.file("extdata", "example.tally.hfs5", package = "h5vcData")
sampleData <- getSampleData(tallyFile, "/ExampleStudy/16")
windowsize <- 1e+05
samples <- sampleData$Sample[sampleData$Patient == "Patient8"]
# Calling Variants
vars <- h5dapply(filename = tallyFile, group = "/ExampleStudy/16", blocksize = 10000,
FUN = callVariantsPaired, sampledata = sampleData, cl = vcConfParams(returnDataPoints = TRUE),
names = c("Coverages", "Counts", "Reference"), range = c(2.9e+07, 3e+07),
verbose = TRUE)
variantCalls <- do.call(rbind, vars)
vars <- h5dapply(filename = tallyFile, group = "/ExampleStudy/22", blocksize = 10000,
FUN = callVariantsPaired, sampledata = sampleData, cl = vcConfParams(returnDataPoints = TRUE),
names = c("Coverages", "Counts", "Reference"), range = c(3.8e+07, 4e+07),
verbose = TRUE)
variantCalls <- rbind(variantCalls, do.call(rbind, vars))
rownames(variantCalls) <- NULL
save(variantCalls, file = "example.variants.RDa")
The mutation spectrum can now be extracted from the tallyFile and variant calls like so:
data("example.variants", package = "h5vcData")
ms <- mutationSpectrum(variantCalls, tallyFile, "/ExampleStudy")
head(ms)
## refAllele altAllele Sample Prefix Suffix Transition Context
## 1 G T PT5PrimaryDNA T T G>T TGT
## 2 C A PT5PrimaryDNA A A G>T ACA
## 3 C G PT8PrimaryDNA C C C>G CCC
## 4 A C PT8PrimaryDNA G C A>C GAC
## 5 A G PT5PrimaryDNA G G A>G GAG
## 6 C T PT5PrimaryDNA G C C>T GCC
An overview is given in the following plot:
suppressPackageStartupMessages(library(ggplot2))
p <- ggplot(ms, aes(x = paste(Prefix, Suffix, sep = "."), fill = Transition)) +
geom_bar(width = 0.5) + facet_grid(Sample ~ Transition) + scale_y_discrete(name = "Number of Events") +
scale_x_discrete(name = "Mutation Context") + theme(axis.text = element_text(size = 14),
axis.title = element_text(size = 16), axis.text.x = element_text(angle = 90,
hjust = 0), strip.text = element_text(size = 14), legend.position = "none")
print(p)
The y axis shows the number of this kind of mutation and the x-axis shows the sequence context (of size 1
in this case). At this point one could go on to perform a NMF-based analysis of the mutation spectra as is described in Alexandrov et. al. - Signatures of mutational processes in human cancer