fastreeR 1.6.0
The goal of fastreeR is to provide functions for calculating a distance matrix, building a phylogenetic tree, or performing hierarchical clustering between samples, directly from a VCF or FASTA file.
To install the fastreeR
package:
# Bootstrap BiocManager when it is not available, then use it to install
# fastreeR.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("fastreeR")
You should allocate at least 10 kB of RAM per sample per variant for the JVM.
The more RAM you allocate, the faster the execution will be (fewer pauses
for garbage collection).
In order to allocate RAM, a special parameter needs to be passed while JVM
initializes. JVM parameters can be passed by setting java.parameters
option.
The -Xmx
parameter, followed (without space) by an integer value and a
letter, is used to tell the JVM the maximum amount of heap RAM that it can
use. The letter in the parameter (uppercase or lowercase) indicates the RAM units.
For example, parameters -Xmx1024m
or -Xmx1024M
or -Xmx1g
or -Xmx1G
, all
allocate 1 Gigabyte or 1024 Megabytes of maximum RAM for JVM.
# Request a maximum JVM heap of 1 GB through the java.parameters option.
options(java.parameters="-Xmx1G")
# NOTE(review): presumably fastreeR is unloaded and attached again so that the
# option above is already set when the package is loaded -- confirm.
unloadNamespace("fastreeR")
library(fastreeR)
library(utils)
library(ape)
library(stats)
library(grid)
library(BiocFileCache)
We download, to a temporary location, a small VCF file
from the 1000 Genomes Project, with around 150 samples and 100k variants (SNPs and INDELs).
We use BiocFileCache
for this retrieval process
so that it is not repeated needlessly.
If for any reason we cannot download, we use the small sample vcf from
fastreeR
package.
# Locate (or download and cache) the example VCF; fall back to the small
# sample VCF shipped with fastreeR when the download is not usable.
#
# Creates:
#   bfc        - the BiocFileCache handle, reused by later chunks
#   tempVcfUrl - remote location of the example VCF
#   tempVcf    - local path of the VCF file to analyse
bfc <- BiocFileCache::BiocFileCache(ask = FALSE)
tempVcfUrl <-
  paste0("https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/",
         "1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/",
         "supporting/related_samples/",
         "ALL.chrX.shapeit2_integrated_snvindels_v2a_related_samples_27022019.",
         "GRCh38.phased.vcf.gz")

# Fallback used whenever the remote file cannot be obtained.
sampleVcf <- system.file("extdata", "samples.vcf.gz", package = "fastreeR")

# rpath[1] is NA when the cache holds no record named "tempVcf".
tempVcf <- BiocFileCache::bfcquery(bfc, "tempVcf", field = "rname")$rpath[1]
if (is.na(tempVcf)) {
  # Take the value *returned* by tryCatch(): an assignment made inside a
  # handler function is local to that function and would be lost.
  tempVcf <- tryCatch(
    BiocFileCache::bfcadd(bfc, "tempVcf", fpath = tempVcfUrl)[[1]],
    error = function(cond) sampleVcf,
    warning = function(cond) sampleVcf
  )
}

# file.size() yields NA for a missing file, so accept the path only when the
# size is known to be positive; otherwise use the bundled sample.
if (is.na(tempVcf) || !isTRUE(file.size(tempVcf) > 0)) {
  tempVcf <- sampleVcf
}
We download, to a temporary location, some small bacterial genomes.
We use BiocFileCache
for this retrieval process
so that it is not repeated needlessly.
If for any reason we cannot download, we use the small sample fasta from
fastreeR
package.
# Locate (or download and cache) a few small bacterial genomes; fall back to
# the sample fasta shipped with fastreeR as soon as any of them is unusable.
#
# Uses:    bfc (BiocFileCache handle created earlier)
# Creates: tempFastasUrls - remote locations of the genomes
#          tempFastas     - list of local fasta paths, or the single path of
#                           the bundled sample fasta when a download failed
tempFastasUrls <- c(
  #Mycobacterium liflandii
  paste0("https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/",
         "Mycobacterium_liflandii/latest_assembly_versions/",
         "GCF_000026445.2_ASM2644v2/GCF_000026445.2_ASM2644v2_genomic.fna.gz"),
  #Pelobacter propionicus
  paste0("https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/",
         "Pelobacter_propionicus/latest_assembly_versions/",
         "GCF_000015045.1_ASM1504v1/GCF_000015045.1_ASM1504v1_genomic.fna.gz"),
  #Rickettsia prowazekii
  paste0("https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/",
         "Rickettsia_prowazekii/latest_assembly_versions/",
         "GCF_000022785.1_ASM2278v1/GCF_000022785.1_ASM2278v1_genomic.fna.gz"),
  #Salmonella enterica
  paste0("https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/",
         "Salmonella_enterica/reference/",
         "GCF_000006945.2_ASM694v2/GCF_000006945.2_ASM694v2_genomic.fna.gz"),
  #Staphylococcus aureus
  paste0("https://ftp.ncbi.nih.gov/genomes/refseq/bacteria/",
         "Staphylococcus_aureus/reference/",
         "GCF_000013425.1_ASM1342v1/GCF_000013425.1_ASM1342v1_genomic.fna.gz")
)

# Fallback used whenever any remote genome cannot be obtained.
sampleFasta <- system.file("extdata", "samples.fasta.gz", package = "fastreeR")

tempFastas <- vector("list", length(tempFastasUrls))
for (i in seq_along(tempFastasUrls)) {
  fastaName <- paste0("temp_fasta", i)
  # rpath[1] is NA when the cache holds no record with this name.
  fastaPath <- BiocFileCache::bfcquery(bfc, fastaName, field = "rname")$rpath[1]
  if (is.na(fastaPath)) {
    # The handlers only report failure through their return value: `break`
    # and assignments inside a handler function do not act on this loop or on
    # the variables of this script.
    fastaPath <- tryCatch(
      BiocFileCache::bfcadd(bfc, fastaName, fpath = tempFastasUrls[i])[[1]],
      error = function(cond) NA_character_,
      warning = function(cond) NA_character_
    )
  }
  # file.size() yields NA for a missing file; treat unknown, missing or empty
  # files alike and switch to the bundled sample for the whole analysis.
  if (is.na(fastaPath) || !isTRUE(file.size(fastaPath) > 0)) {
    tempFastas <- sampleFasta
    break
  }
  tempFastas[[i]] <- fastaPath
}
# Compute per-sample statistics from the VCF file and plot columns 7 to 9 of
# the result.
myVcfIstats <- fastreeR::vcf2istats(
  inputFile = tempVcf
)
plot(x = myVcfIstats[, 7:9])
Figure 1: Sample statistics from vcf file
The most time consuming process is calculating distances between samples. Assign more processors in order to speed up this operation.
# Calculate the sample distances from the VCF file using two threads, then
# draw a histogram of those distances starting the x axis at zero.
myVcfDist <- fastreeR::vcf2dist(
  inputFile = tempVcf,
  threads = 2
)
graphics::hist(
  x = myVcfDist,
  breaks = 100,
  main = NULL,
  xlab = "Distance",
  xlim = c(0, max(myVcfDist))
)
Figure 2: Histogram of distances from vcf file
We note two distinct groups of distances: one around distance value 0.05 and the second around distance value 0.065.
fastreeR::dist2tree
Notice that the generated tree is ultrametric.
# Build a tree (as text) from the precomputed distances, parse it with ape
# and draw it top-down with a scale bar and an axis on side 2.
myVcfTree <- fastreeR::dist2tree(
  inputDist = myVcfDist
)
plot(
  x = ape::read.tree(text = myVcfTree),
  direction = "down",
  cex = 0.3
)
ape::add.scale.bar()
ape::axisPhylo(side = 2)
Figure 3: Tree from vcf with fastreeR
Of course the same can be achieved directly from the vcf file, without calculating distances.
# Build the tree (as text) straight from the VCF file using two threads, then
# parse it with ape and draw it top-down with a scale bar and an axis on
# side 2.
myVcfTree <- fastreeR::vcf2tree(
  inputFile = tempVcf,
  threads = 2
)
plot(
  x = ape::read.tree(text = myVcfTree),
  direction = "down",
  cex = 0.3
)
ape::add.scale.bar()
ape::axisPhylo(side = 2)
Figure 4: Tree from vcf with fastreeR
As expected from the histogram of distances, two groups of samples also emerge in the tree. The two branches, one at height around 0.055 and the second around height 0.065, are clearly visible.
stats::hclust
For comparison, we generate a tree by using stats
package and distances
calculated by fastreeR
.
# Cluster the fastreeR distances hierarchically with stats::hclust and plot
# the result without annotations.
myVcfTreeStats <- stats::hclust(d = myVcfDist)
plot(
  x = myVcfTreeStats,
  ann = FALSE,
  cex = 0.3
)
Figure 5: Tree from vcf with stats::hclust
Although it does not initially look very similar, because it is not ultrametric, it is indeed quite the same tree. We note again the two groups (two branches) of samples and the 4 samples, possibly clones, that show very small distances between them.
We can identify the two groups of samples, apparent from the hierarchical tree,
by using dist2clusters
or vcf2clusters
or tree2clusters
.
By playing a little with the cutHeight
parameter, we find that a
value of cutHeight=0.067
cuts the tree into two branches.
The first group contains 106 samples and the second 44.
# Cut the hierarchical tree built from the distances at height 0.067.
myVcfClust <- fastreeR::dist2clusters(
  inputDist = myVcfDist,
  cutHeight = 0.067
)
#> ..done.
# When the result holds at least two elements, keep the first as the tree and
# the second as the clusters.
if (length(myVcfClust) >= 2) {
  tree <- myVcfClust[[1]]
  clusters <- myVcfClust[[2]]
  tree
  clusters
}
#> [1] "1 100 HG00096 HG00097 HG00099 HG00100 HG00101 HG00102 HG00103 HG00105 HG00106 HG00107 HG00108 HG00109 HG00110 HG00111 HG00112 HG00113 HG00114 HG00115 HG00116 HG00117 HG00118 HG00119 HG00120 HG00121 HG00122 HG00123 HG00125 HG00126 HG00127 HG00128 HG00129 HG00130 HG00131 HG00132 HG00133 HG00136 HG00137 HG00138 HG00139 HG00140 HG00141 HG00142 HG00143 HG00145 HG00146 HG00148 HG00149 HG00150 HG00151 HG00154 HG00155 HG00157 HG00158 HG00159 HG00160 HG00171 HG00173 HG00174 HG00176 HG00177 HG00178 HG00179 HG00180 HG00181 HG00182 HG00183 HG00185 HG00186 HG00187 HG00188 HG00189 HG00190 HG00231 HG00232 HG00233 HG00234 HG00235 HG00236 HG00237 HG00238 HG00239 HG00240 HG00242 HG00243 HG00244 HG00245 HG00246 HG00250 HG00251 HG00252 HG00253 HG00254 HG00255 HG00256 HG00257 HG00258 HG00259 HG00260 HG00261 HG00262"
We can perform a similar analysis when we have samples represented as sequences in a fasta file.
Use the downloaded sample fasta files:
# Calculate distances between the downloaded fasta sequences with kmer = 6.
myFastaDist <- fastreeR::fasta2dist(
  tempFastas,
  kmer = 6
)
Or use the fasta file of 48 bacterial RefSeq sequences provided by
fastreeR:
# Calculate distances (kmer = 6) from the sample fasta file bundled with
# fastreeR, replacing any earlier myFastaDist, and draw a histogram of them
# starting the x axis at zero.
myFastaDist <- fastreeR::fasta2dist(
  system.file("extdata", "samples.fasta.gz", package = "fastreeR"),
  kmer = 6
)
graphics::hist(
  x = myFastaDist,
  breaks = 100,
  main = NULL,
  xlab = "Distance",
  xlim = c(0, max(myFastaDist))
)
Figure 6: Histogram of distances from fasta file
fastreeR::dist2tree
# Build a tree (as text) from the fasta distances, parse it with ape and draw
# it top-down with a scale bar and an axis on side 2.
myFastaTree <- fastreeR::dist2tree(
  inputDist = myFastaDist
)
plot(
  x = ape::read.tree(text = myFastaTree),
  direction = "down",
  cex = 0.3
)
ape::add.scale.bar()
ape::axisPhylo(side = 2)
Figure 7: Tree from fasta with fastreeR
stats::hclust
# Cluster the fasta distances hierarchically with stats::hclust and plot the
# result without annotations.
myFastaTreeStats <- stats::hclust(d = myFastaDist)
plot(
  x = myFastaTreeStats,
  ann = FALSE,
  cex = 0.3
)
Figure 8: Tree from fasta with stats::hclust
# Report the R session details (R version, platform, attached and loaded
# packages) for this run.
utils::sessionInfo()
#> R version 4.3.1 Patched (2023-06-17 r84564)
#> Platform: x86_64-apple-darwin20 (64-bit)
#> Running under: macOS Monterey 12.6.5
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
#>
#> locale:
#> [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> time zone: America/New_York
#> tzcode source: internal
#>
#> attached base packages:
#> [1] grid stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] BiocFileCache_2.10.0 dbplyr_2.3.4 ape_5.7-1
#> [4] fastreeR_1.6.0 BiocStyle_2.30.0
#>
#> loaded via a namespace (and not attached):
#> [1] sass_0.4.7 utf8_1.2.4 generics_0.1.3
#> [4] stringi_1.7.12 RSQLite_2.3.1 lattice_0.22-5
#> [7] digest_0.6.33 magrittr_2.0.3 evaluate_0.22
#> [10] dynamicTreeCut_1.63-1 bookdown_0.36 fastmap_1.1.1
#> [13] blob_1.2.4 R.oo_1.25.0 jsonlite_1.8.7
#> [16] R.utils_2.12.2 DBI_1.1.3 BiocManager_1.30.22
#> [19] httr_1.4.7 purrr_1.0.2 fansi_1.0.5
#> [22] jquerylib_0.1.4 cli_3.6.1 rlang_1.1.1
#> [25] R.methodsS3_1.8.2 bit64_4.0.5 withr_2.5.1
#> [28] cachem_1.0.8 yaml_2.3.7 tools_4.3.1
#> [31] parallel_4.3.1 memoise_2.0.1 dplyr_1.1.3
#> [34] filelock_1.0.2 curl_5.1.0 vctrs_0.6.4
#> [37] R6_2.5.1 magick_2.8.1 lifecycle_1.0.3
#> [40] stringr_1.5.0 bit_4.0.5 pkgconfig_2.0.3
#> [43] rJava_1.0-6 pillar_1.9.0 bslib_0.5.1
#> [46] glue_1.6.2 Rcpp_1.0.11 xfun_0.40
#> [49] tibble_3.2.1 tidyselect_1.2.0 knitr_1.44
#> [52] htmltools_0.5.6.1 nlme_3.1-163 rmarkdown_2.25
#> [55] compiler_4.3.1