TileDBArray 1.12.0
TileDB implements a framework for local and remote storage of dense and sparse arrays.
We can use this as a DelayedArray
backend to provide an array-level abstraction,
thus allowing the data to be used in many places where an ordinary array or matrix might be used.
The TileDBArray package implements the necessary wrappers around TileDB-R
to support read/write operations on TileDB arrays within the DelayedArray framework.
TileDBArray
Creating a TileDBArray
is as easy as:
X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 1.262403499 -0.396124761 0.987464844 . -0.4074979 -0.4770928
## [2,] -1.320321224 0.521717902 -0.241036080 . -0.5222599 0.6148844
## [3,] 0.250724980 0.055090704 0.066488852 . 0.7844167 0.3353531
## [4,] 1.548522094 -0.155436954 -0.005868964 . -1.6645198 -0.1071965
## [5,] 1.003796655 0.774995830 -0.790374436 . -0.5615602 -0.9337578
## ... . . . . . .
## [96,] 1.1024757 -1.0218676 -0.8637707 . -0.36370301 -1.10942658
## [97,] -0.2660647 1.5491471 -0.8221721 . -0.40592501 -0.94282985
## [98,] -0.2351233 -0.5593851 0.7258330 . 1.88453085 -0.42038591
## [99,] -0.3278127 -1.5637214 -1.2834359 . -0.10421344 0.03009067
## [100,] -1.1787443 1.7870710 -1.3740928 . -0.13521266 0.64817168
Alternatively, we can use coercion methods:
as(X, "TileDBArray")
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 1.262403499 -0.396124761 0.987464844 . -0.4074979 -0.4770928
## [2,] -1.320321224 0.521717902 -0.241036080 . -0.5222599 0.6148844
## [3,] 0.250724980 0.055090704 0.066488852 . 0.7844167 0.3353531
## [4,] 1.548522094 -0.155436954 -0.005868964 . -1.6645198 -0.1071965
## [5,] 1.003796655 0.774995830 -0.790374436 . -0.5615602 -0.9337578
## ... . . . . . .
## [96,] 1.1024757 -1.0218676 -0.8637707 . -0.36370301 -1.10942658
## [97,] -0.2660647 1.5491471 -0.8221721 . -0.40592501 -0.94282985
## [98,] -0.2351233 -0.5593851 0.7258330 . 1.88453085 -0.42038591
## [99,] -0.3278127 -1.5637214 -1.2834359 . -0.10421344 0.03009067
## [100,] -1.1787443 1.7870710 -1.3740928 . -0.13521266 0.64817168
This process works also for sparse matrices:
Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0 0 0 . 0 0
## [2,] 0 0 0 . 0 0
## [3,] 0 0 0 . 0 0
## [4,] 0 0 0 . 0 0
## [5,] 0 0 0 . 0 0
## ... . . . . . .
## [996,] 0 0 0 . 0 0
## [997,] 0 0 0 . 0 0
## [998,] 0 0 0 . 0 0
## [999,] 0 0 0 . 0 0
## [1000,] 0 0 0 . 0 0
Logical and integer matrices are supported:
writeTileDBArray(Y > 0)
## <1000 x 1000> sparse TileDBMatrix object of type "logical":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] FALSE FALSE FALSE . FALSE FALSE
## [2,] FALSE FALSE FALSE . FALSE FALSE
## [3,] FALSE FALSE FALSE . FALSE FALSE
## [4,] FALSE FALSE FALSE . FALSE FALSE
## [5,] FALSE FALSE FALSE . FALSE FALSE
## ... . . . . . .
## [996,] FALSE FALSE FALSE . FALSE FALSE
## [997,] FALSE FALSE FALSE . FALSE FALSE
## [998,] FALSE FALSE FALSE . FALSE FALSE
## [999,] FALSE FALSE FALSE . FALSE FALSE
## [1000,] FALSE FALSE FALSE . FALSE FALSE
As are matrices with dimension names:
rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 1.262403499 -0.396124761 0.987464844 . -0.4074979 -0.4770928
## GENE_2 -1.320321224 0.521717902 -0.241036080 . -0.5222599 0.6148844
## GENE_3 0.250724980 0.055090704 0.066488852 . 0.7844167 0.3353531
## GENE_4 1.548522094 -0.155436954 -0.005868964 . -1.6645198 -0.1071965
## GENE_5 1.003796655 0.774995830 -0.790374436 . -0.5615602 -0.9337578
## ... . . . . . .
## GENE_96 1.1024757 -1.0218676 -0.8637707 . -0.36370301 -1.10942658
## GENE_97 -0.2660647 1.5491471 -0.8221721 . -0.40592501 -0.94282985
## GENE_98 -0.2351233 -0.5593851 0.7258330 . 1.88453085 -0.42038591
## GENE_99 -0.3278127 -1.5637214 -1.2834359 . -0.10421344 0.03009067
## GENE_100 -1.1787443 1.7870710 -1.3740928 . -0.13521266 0.64817168
TileDBArray
sTileDBArray
s are simply DelayedArray
objects and can be manipulated as such.
The usual conventions for extracting data from matrix-like objects work as expected:
out <- as(X, "TileDBArray")
dim(out)
## [1] 100 10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
## GENE_1 GENE_2 GENE_3 GENE_4 GENE_5 GENE_6
## 1.2624035 -1.3203212 0.2507250 1.5485221 1.0037967 -0.8044394
We can also perform manipulations like subsetting and arithmetic.
Note that these operations do not affect the data in the TileDB backend;
rather, they are delayed until the values are explicitly required,
hence the creation of the DelayedMatrix
object.
out[1:5,1:5]
## <5 x 5> DelayedMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5
## GENE_1 1.262403499 -0.396124761 0.987464844 -1.268607760 -0.571359207
## GENE_2 -1.320321224 0.521717902 -0.241036080 0.075956454 -1.506010425
## GENE_3 0.250724980 0.055090704 0.066488852 -0.296052254 -0.377302526
## GENE_4 1.548522094 -0.155436954 -0.005868964 1.168682017 -1.615868211
## GENE_5 1.003796655 0.774995830 -0.790374436 -0.401337549 -0.133100859
out * 2
## <100 x 10> DelayedMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 2.52480700 -0.79224952 1.97492969 . -0.8149958 -0.9541857
## GENE_2 -2.64064245 1.04343580 -0.48207216 . -1.0445198 1.2297688
## GENE_3 0.50144996 0.11018141 0.13297770 . 1.5688334 0.6707061
## GENE_4 3.09704419 -0.31087391 -0.01173793 . -3.3290396 -0.2143931
## GENE_5 2.00759331 1.54999166 -1.58074887 . -1.1231203 -1.8675157
## ... . . . . . .
## GENE_96 2.2049514 -2.0437351 -1.7275413 . -0.72740603 -2.21885316
## GENE_97 -0.5321293 3.0982941 -1.6443443 . -0.81185003 -1.88565969
## GENE_98 -0.4702466 -1.1187702 1.4516659 . 3.76906170 -0.84077183
## GENE_99 -0.6556253 -3.1274428 -2.5668717 . -0.20842689 0.06018133
## GENE_100 -2.3574886 3.5741420 -2.7481856 . -0.27042531 1.29634336
We can also do more complex matrix operations that are supported by DelayedArray:
colSums(out)
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5 SAMP_6
## -2.8569015 -6.1197812 -0.6072298 -1.8717755 -7.6163311 -11.0881865
## SAMP_7 SAMP_8 SAMP_9 SAMP_10
## 7.2207930 -21.9970639 14.4598584 -13.5778965
out %*% runif(ncol(out))
## <100 x 1> DelayedMatrix object of type "double":
## y
## GENE_1 -0.5088240
## GENE_2 -3.5323774
## GENE_3 -0.3683187
## GENE_4 1.7427330
## GENE_5 2.9038835
## ... .
## GENE_96 -4.2789663
## GENE_97 -0.5465519
## GENE_98 1.1588719
## GENE_99 -2.6328770
## GENE_100 -2.9410181
We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray()
.
For example, the example below allows us to control the path to the backend
as well as the name of the attribute containing the data.
X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.08820191 -0.29663880 -0.17315756 . 1.2801035 -1.4613983
## [2,] -0.67024000 -1.39825191 -0.63773837 . -0.7929436 -0.1106620
## [3,] 1.18145131 -0.61667068 0.37877830 . -0.3043487 1.0485920
## [4,] 1.28831465 -0.58957164 -0.22725172 . 1.2239048 1.6915330
## [5,] -0.88802648 0.18482872 1.99692496 . 0.5815341 -2.2580003
## ... . . . . . .
## [96,] -1.5332230 -0.4782448 0.6090779 . 0.7147580 -0.6227382
## [97,] 0.3352413 1.6373325 -1.3262426 . -1.1181334 -1.0265291
## [98,] -0.5100926 -1.7461465 -2.0536734 . -0.3858809 2.2538729
## [99,] 0.8932646 -0.1227073 0.3533865 . 0.4711125 -0.3369576
## [100,] -0.7035329 1.9900629 0.2637234 . 1.3885236 -1.0335787
As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.
path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.08820191 -0.29663880 -0.17315756 . 1.2801035 -1.4613983
## [2,] -0.67024000 -1.39825191 -0.63773837 . -0.7929436 -0.1106620
## [3,] 1.18145131 -0.61667068 0.37877830 . -0.3043487 1.0485920
## [4,] 1.28831465 -0.58957164 -0.22725172 . 1.2239048 1.6915330
## [5,] -0.88802648 0.18482872 1.99692496 . 0.5815341 -2.2580003
## ... . . . . . .
## [96,] -1.5332230 -0.4782448 0.6090779 . 0.7147580 -0.6227382
## [97,] 0.3352413 1.6373325 -1.3262426 . -1.1181334 -1.0265291
## [98,] -0.5100926 -1.7461465 -2.0536734 . -0.3858809 2.2538729
## [99,] 0.8932646 -0.1227073 0.3533865 . 0.4711125 -0.3369576
## [100,] -0.7035329 1.9900629 0.2637234 . 1.3885236 -1.0335787
sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.6.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] RcppSpdlog_0.0.14 TileDBArray_1.12.0 DelayedArray_0.28.0
## [4] SparseArray_1.2.1 S4Arrays_1.2.0 abind_1.4-5
## [7] IRanges_2.36.0 S4Vectors_0.40.1 MatrixGenerics_1.14.0
## [10] matrixStats_1.0.0 BiocGenerics_0.48.1 Matrix_1.6-1.1
## [13] BiocStyle_2.30.0
##
## loaded via a namespace (and not attached):
## [1] bit_4.0.5 jsonlite_1.8.7 compiler_4.3.2
## [4] BiocManager_1.30.22 crayon_1.5.2 Rcpp_1.0.11
## [7] jquerylib_0.1.4 yaml_2.3.7 fastmap_1.1.1
## [10] lattice_0.22-5 R6_2.5.1 RcppCCTZ_0.2.12
## [13] XVector_0.42.0 tiledb_0.21.1 knitr_1.45
## [16] bookdown_0.36 bslib_0.5.1 rlang_1.1.1
## [19] cachem_1.0.8 xfun_0.41 sass_0.4.7
## [22] bit64_4.0.5 cli_3.6.1 zlibbioc_1.48.0
## [25] spdl_0.0.5 digest_0.6.33 grid_4.3.2
## [28] data.table_1.14.8 evaluate_0.23 nanotime_0.3.7
## [31] zoo_1.8-12 rmarkdown_2.25 tools_4.3.2
## [34] htmltools_0.5.7