TileDBArray 1.8.0
TileDB implements a framework for local and remote storage of dense and sparse arrays.
We can use this as a DelayedArray backend to provide an array-level abstraction,
thus allowing the data to be used in many places where an ordinary array or matrix might be used.
The TileDBArray package implements the necessary wrappers around TileDB-R
to support read/write operations on TileDB arrays within the DelayedArray framework.
TileDBArray
Creating a TileDBArray is as easy as:
X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.10552873 0.96583437 1.87996438 . -1.4068881 1.5607851
## [2,] 0.95000428 0.72437722 -0.61028413 . -0.3067657 1.5557007
## [3,] -0.44662261 0.50404614 0.16215128 . -0.1623389 -0.2892420
## [4,] 0.09519224 0.68715952 -0.47131696 . -0.8289541 0.5522636
## [5,] -0.77444588 0.60409727 1.18988793 . -1.8451904 -0.6919793
## ... . . . . . .
## [96,] 1.5380312 0.7637399 -1.9864690 . 0.253785423 0.899167037
## [97,] 2.0037261 -0.9109348 0.3961184 . 1.701874979 0.660873165
## [98,] -0.4550960 0.5983830 1.5605912 . 0.558231633 0.003951903
## [99,] 2.0935994 -1.7197198 -0.6270500 . -0.322098907 -0.726083103
## [100,] -1.1326740 0.4682062 0.6466833 . 0.063511600 -0.038342332
Alternatively, we can use coercion methods:
as(X, "TileDBArray")
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.10552873 0.96583437 1.87996438 . -1.4068881 1.5607851
## [2,] 0.95000428 0.72437722 -0.61028413 . -0.3067657 1.5557007
## [3,] -0.44662261 0.50404614 0.16215128 . -0.1623389 -0.2892420
## [4,] 0.09519224 0.68715952 -0.47131696 . -0.8289541 0.5522636
## [5,] -0.77444588 0.60409727 1.18988793 . -1.8451904 -0.6919793
## ... . . . . . .
## [96,] 1.5380312 0.7637399 -1.9864690 . 0.253785423 0.899167037
## [97,] 2.0037261 -0.9109348 0.3961184 . 1.701874979 0.660873165
## [98,] -0.4550960 0.5983830 1.5605912 . 0.558231633 0.003951903
## [99,] 2.0935994 -1.7197198 -0.6270500 . -0.322098907 -0.726083103
## [100,] -1.1326740 0.4682062 0.6466833 . 0.063511600 -0.038342332
This process also works for sparse matrices:
Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0 0 0 . 0 0
## [2,] 0 0 0 . 0 0
## [3,] 0 0 0 . 0 0
## [4,] 0 0 0 . 0 0
## [5,] 0 0 0 . 0 0
## ... . . . . . .
## [996,] 0 0 0 . 0 0
## [997,] 0 0 0 . 0 0
## [998,] 0 2 0 . 0 0
## [999,] 0 0 0 . 0 0
## [1000,] 0 0 0 . 0 0
Logical and integer matrices are supported:
writeTileDBArray(Y > 0)
## <1000 x 1000> sparse matrix of class TileDBMatrix and type "logical":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] FALSE FALSE FALSE . FALSE FALSE
## [2,] FALSE FALSE FALSE . FALSE FALSE
## [3,] FALSE FALSE FALSE . FALSE FALSE
## [4,] FALSE FALSE FALSE . FALSE FALSE
## [5,] FALSE FALSE FALSE . FALSE FALSE
## ... . . . . . .
## [996,] FALSE FALSE FALSE . FALSE FALSE
## [997,] FALSE FALSE FALSE . FALSE FALSE
## [998,] FALSE TRUE FALSE . FALSE FALSE
## [999,] FALSE FALSE FALSE . FALSE FALSE
## [1000,] FALSE FALSE FALSE . FALSE FALSE
As are matrices with dimension names:
rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> matrix of class TileDBMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 -0.10552873 0.96583437 1.87996438 . -1.4068881 1.5607851
## GENE_2 0.95000428 0.72437722 -0.61028413 . -0.3067657 1.5557007
## GENE_3 -0.44662261 0.50404614 0.16215128 . -0.1623389 -0.2892420
## GENE_4 0.09519224 0.68715952 -0.47131696 . -0.8289541 0.5522636
## GENE_5 -0.77444588 0.60409727 1.18988793 . -1.8451904 -0.6919793
## ... . . . . . .
## GENE_96 1.5380312 0.7637399 -1.9864690 . 0.253785423 0.899167037
## GENE_97 2.0037261 -0.9109348 0.3961184 . 1.701874979 0.660873165
## GENE_98 -0.4550960 0.5983830 1.5605912 . 0.558231633 0.003951903
## GENE_99 2.0935994 -1.7197198 -0.6270500 . -0.322098907 -0.726083103
## GENE_100 -1.1326740 0.4682062 0.6466833 . 0.063511600 -0.038342332
TileDBArrays
TileDBArrays are simply DelayedArray objects and can be manipulated as such.
The usual conventions for extracting data from matrix-like objects work as expected:
out <- as(X, "TileDBArray")
dim(out)
## [1] 100 10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
## GENE_1 GENE_2 GENE_3 GENE_4 GENE_5 GENE_6
## -0.10552873 0.95000428 -0.44662261 0.09519224 -0.77444588 -0.29845731
We can also perform manipulations like subsetting and arithmetic.
Note that these operations do not affect the data in the TileDB backend;
rather, they are delayed until the values are explicitly required,
hence the creation of the DelayedMatrix object.
out[1:5,1:5]
## <5 x 5> matrix of class DelayedMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5
## GENE_1 -0.10552873 0.96583437 1.87996438 -1.64598987 1.86597605
## GENE_2 0.95000428 0.72437722 -0.61028413 0.14438003 0.58244036
## GENE_3 -0.44662261 0.50404614 0.16215128 -0.55444854 1.49721232
## GENE_4 0.09519224 0.68715952 -0.47131696 0.46214732 -0.81223839
## GENE_5 -0.77444588 0.60409727 1.18988793 1.21436610 -0.37071975
out * 2
## <100 x 10> matrix of class DelayedMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 -0.2110575 1.9316687 3.7599288 . -2.8137763 3.1215703
## GENE_2 1.9000086 1.4487544 -1.2205683 . -0.6135315 3.1114014
## GENE_3 -0.8932452 1.0080923 0.3243026 . -0.3246778 -0.5784840
## GENE_4 0.1903845 1.3743190 -0.9426339 . -1.6579082 1.1045273
## GENE_5 -1.5488918 1.2081945 2.3797759 . -3.6903808 -1.3839586
## ... . . . . . .
## GENE_96 3.0760625 1.5274797 -3.9729379 . 0.507570847 1.798334074
## GENE_97 4.0074521 -1.8218696 0.7922367 . 3.403749957 1.321746331
## GENE_98 -0.9101919 1.1967660 3.1211824 . 1.116463266 0.007903807
## GENE_99 4.1871987 -3.4394395 -1.2540999 . -0.644197814 -1.452166206
## GENE_100 -2.2653481 0.9364125 1.2933666 . 0.127023200 -0.076684664
We can also do more complex matrix operations that are supported by DelayedArray:
colSums(out)
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5 SAMP_6
## 14.44495740 -0.03267866 -3.31984759 8.76408647 -6.29207368 16.15850557
## SAMP_7 SAMP_8 SAMP_9 SAMP_10
## -5.26425348 -2.03289993 -11.00916234 9.65904450
out %*% runif(ncol(out))
## <100 x 1> matrix of class DelayedMatrix and type "double":
## y
## GENE_1 -2.2191149
## GENE_2 1.2367870
## GENE_3 1.3715926
## GENE_4 -0.8635955
## GENE_5 1.5090158
## ... .
## GENE_96 0.9121365
## GENE_97 3.9132844
## GENE_98 1.2179417
## GENE_99 2.3471072
## GENE_100 -2.2246337
We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray().
For example, the code below allows us to control the path to the backend
as well as the name of the attribute containing the data.
X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.1380843 1.0478837 1.6607065 . 0.3980768 1.1442563
## [2,] -0.1613176 -1.0691259 1.6968770 . -1.3413093 -1.0980893
## [3,] 0.9975616 -0.1167066 1.2816768 . 0.3003691 -0.3605586
## [4,] 1.6008150 -0.6482420 1.4929224 . -0.2953196 0.1088085
## [5,] 0.7474978 0.5862172 -0.1307349 . 0.7284235 0.5371106
## ... . . . . . .
## [96,] -0.39671123 -1.07345171 1.98833965 . -0.1391141 0.8928555
## [97,] -1.62099526 -0.50842914 1.46464205 . 0.2297579 0.8015005
## [98,] 1.66148158 0.82122391 1.43796032 . -1.2725264 -1.1525469
## [99,] 1.34103736 -1.05583313 0.13668815 . -0.8777210 1.5385936
## [100,] 0.04616995 0.44401936 -0.34876995 . -0.6057029 -0.5481662
As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.
path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.1380843 1.0478837 1.6607065 . 0.3980768 1.1442563
## [2,] -0.1613176 -1.0691259 1.6968770 . -1.3413093 -1.0980893
## [3,] 0.9975616 -0.1167066 1.2816768 . 0.3003691 -0.3605586
## [4,] 1.6008150 -0.6482420 1.4929224 . -0.2953196 0.1088085
## [5,] 0.7474978 0.5862172 -0.1307349 . 0.7284235 0.5371106
## ... . . . . . .
## [96,] -0.39671123 -1.07345171 1.98833965 . -0.1391141 0.8928555
## [97,] -1.62099526 -0.50842914 1.46464205 . 0.2297579 0.8015005
## [98,] 1.66148158 0.82122391 1.43796032 . -1.2725264 -1.1525469
## [99,] 1.34103736 -1.05583313 0.13668815 . -0.8777210 1.5385936
## [100,] 0.04616995 0.44401936 -0.34876995 . -0.6057029 -0.5481662
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.16-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.16-bioc/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] TileDBArray_1.8.0 DelayedArray_0.24.0 IRanges_2.32.0
## [4] S4Vectors_0.36.0 MatrixGenerics_1.10.0 matrixStats_0.62.0
## [7] BiocGenerics_0.44.0 Matrix_1.5-1 BiocStyle_2.26.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.9 bslib_0.4.0 compiler_4.2.1
## [4] BiocManager_1.30.19 jquerylib_0.1.4 tools_4.2.1
## [7] digest_0.6.30 bit_4.0.4 jsonlite_1.8.3
## [10] evaluate_0.17 lattice_0.20-45 nanotime_0.3.7
## [13] rlang_1.0.6 cli_3.4.1 RcppCCTZ_0.2.11
## [16] yaml_2.3.6 xfun_0.34 fastmap_1.1.0
## [19] stringr_1.4.1 knitr_1.40 sass_0.4.2
## [22] bit64_4.0.5 grid_4.2.1 data.table_1.14.4
## [25] R6_2.5.1 rmarkdown_2.17 bookdown_0.29
## [28] tiledb_0.16.0 magrittr_2.0.3 htmltools_0.5.3
## [31] stringi_1.7.8 cachem_1.0.6 zoo_1.8-11