TileDBArray 1.10.0
TileDB implements a framework for local and remote storage of dense and sparse arrays.
We can use this as a DelayedArray backend to provide an array-level abstraction,
thus allowing the data to be used in many places where an ordinary array or matrix might be used.
The TileDBArray package implements the necessary wrappers around TileDB-R
to support read/write operations on TileDB arrays within the DelayedArray framework.
Creating a TileDBArray
Creating a TileDBArray is as easy as:
X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.728736701 -0.293327749 -0.712775430 . 0.04240392 -0.54434779
## [2,] -0.636496402 -0.433369504 -1.237965461 . -1.11537543 0.01511303
## [3,] 2.060318728 -0.977855095 -0.729195484 . 1.37936891 0.27792337
## [4,] -0.422188276 0.242964758 -0.009816666 . -0.94146193 1.25431462
## [5,] 0.109277671 -1.222443678 0.165926095 . -1.01266087 2.05922611
## ... . . . . . .
## [96,] -0.69200889 -0.83121327 0.90686055 . 0.0207247 -0.0744170
## [97,] 0.38491563 -0.41987022 -0.35356830 . -0.6500773 1.5340520
## [98,] 1.23286241 -1.19327848 -0.82662468 . 1.0588625 -0.5167429
## [99,] 1.01347222 1.63372263 1.67644270 . -1.1019158 -0.3503914
## [100,] 0.01755838 -0.32177194 1.62845087 . -0.4959258 -1.9836283
Alternatively, we can use coercion methods:
as(X, "TileDBArray")
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.728736701 -0.293327749 -0.712775430 . 0.04240392 -0.54434779
## [2,] -0.636496402 -0.433369504 -1.237965461 . -1.11537543 0.01511303
## [3,] 2.060318728 -0.977855095 -0.729195484 . 1.37936891 0.27792337
## [4,] -0.422188276 0.242964758 -0.009816666 . -0.94146193 1.25431462
## [5,] 0.109277671 -1.222443678 0.165926095 . -1.01266087 2.05922611
## ... . . . . . .
## [96,] -0.69200889 -0.83121327 0.90686055 . 0.0207247 -0.0744170
## [97,] 0.38491563 -0.41987022 -0.35356830 . -0.6500773 1.5340520
## [98,] 1.23286241 -1.19327848 -0.82662468 . 1.0588625 -0.5167429
## [99,] 1.01347222 1.63372263 1.67644270 . -1.1019158 -0.3503914
## [100,] 0.01755838 -0.32177194 1.62845087 . -0.4959258 -1.9836283
This process also works for sparse matrices:
Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0 0 0 . 0 0
## [2,] 0 0 0 . 0 0
## [3,] 0 0 0 . 0 0
## [4,] 0 0 0 . 0 0
## [5,] 0 0 0 . 0 0
## ... . . . . . .
## [996,] 0.0 0.0 0.0 . 0 0
## [997,] 0.0 0.0 -1.8 . 0 0
## [998,] 0.0 0.0 0.0 . 0 0
## [999,] 0.0 0.0 0.0 . 0 0
## [1000,] 0.0 0.0 0.0 . 0 0
Logical and integer matrices are supported:
writeTileDBArray(Y > 0)
## <1000 x 1000> sparse TileDBMatrix object of type "logical":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] FALSE FALSE FALSE . FALSE FALSE
## [2,] FALSE FALSE FALSE . FALSE FALSE
## [3,] FALSE FALSE FALSE . FALSE FALSE
## [4,] FALSE FALSE FALSE . FALSE FALSE
## [5,] FALSE FALSE FALSE . FALSE FALSE
## ... . . . . . .
## [996,] FALSE FALSE FALSE . FALSE FALSE
## [997,] FALSE FALSE FALSE . FALSE FALSE
## [998,] FALSE FALSE FALSE . FALSE FALSE
## [999,] FALSE FALSE FALSE . FALSE FALSE
## [1000,] FALSE FALSE FALSE . FALSE FALSE
As are matrices with dimension names:
rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 0.728736701 -0.293327749 -0.712775430 . 0.04240392 -0.54434779
## GENE_2 -0.636496402 -0.433369504 -1.237965461 . -1.11537543 0.01511303
## GENE_3 2.060318728 -0.977855095 -0.729195484 . 1.37936891 0.27792337
## GENE_4 -0.422188276 0.242964758 -0.009816666 . -0.94146193 1.25431462
## GENE_5 0.109277671 -1.222443678 0.165926095 . -1.01266087 2.05922611
## ... . . . . . .
## GENE_96 -0.69200889 -0.83121327 0.90686055 . 0.0207247 -0.0744170
## GENE_97 0.38491563 -0.41987022 -0.35356830 . -0.6500773 1.5340520
## GENE_98 1.23286241 -1.19327848 -0.82662468 . 1.0588625 -0.5167429
## GENE_99 1.01347222 1.63372263 1.67644270 . -1.1019158 -0.3503914
## GENE_100 0.01755838 -0.32177194 1.62845087 . -0.4959258 -1.9836283
Manipulating TileDBArrays
TileDBArrays are simply DelayedArray objects and can be manipulated as such.
The usual conventions for extracting data from matrix-like objects work as expected:
out <- as(X, "TileDBArray")
dim(out)
## [1] 100 10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
## GENE_1 GENE_2 GENE_3 GENE_4 GENE_5 GENE_6
## 0.7287367 -0.6364964 2.0603187 -0.4221883 0.1092777 -0.2228720
We can also perform manipulations like subsetting and arithmetic.
Note that these operations do not affect the data in the TileDB backend;
rather, they are delayed until the values are explicitly required,
hence the creation of the DelayedMatrix object.
out[1:5,1:5]
## <5 x 5> DelayedMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5
## GENE_1 0.728736701 -0.293327749 -0.712775430 -0.990870559 0.304297827
## GENE_2 -0.636496402 -0.433369504 -1.237965461 -1.048718213 0.299819447
## GENE_3 2.060318728 -0.977855095 -0.729195484 -1.066118631 -1.552085738
## GENE_4 -0.422188276 0.242964758 -0.009816666 0.949422248 -1.050033230
## GENE_5 0.109277671 -1.222443678 0.165926095 -0.328825731 0.677068540
out * 2
## <100 x 10> DelayedMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 1.45747340 -0.58665550 -1.42555086 . 0.08480785 -1.08869557
## GENE_2 -1.27299280 -0.86673901 -2.47593092 . -2.23075086 0.03022607
## GENE_3 4.12063746 -1.95571019 -1.45839097 . 2.75873783 0.55584673
## GENE_4 -0.84437655 0.48592952 -0.01963333 . -1.88292386 2.50862925
## GENE_5 0.21855534 -2.44488736 0.33185219 . -2.02532175 4.11845222
## ... . . . . . .
## GENE_96 -1.38401779 -1.66242654 1.81372110 . 0.04144941 -0.14883399
## GENE_97 0.76983125 -0.83974044 -0.70713660 . -1.30015463 3.06810393
## GENE_98 2.46572482 -2.38655696 -1.65324937 . 2.11772496 -1.03348571
## GENE_99 2.02694444 3.26744526 3.35288540 . -2.20383169 -0.70078286
## GENE_100 0.03511675 -0.64354389 3.25690174 . -0.99185156 -3.96725658
We can also do more complex matrix operations that are supported by DelayedArray:
colSums(out)
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5 SAMP_6
## 12.7888619 -17.8435068 -4.4139384 -6.4953747 -0.4349758 6.0072143
## SAMP_7 SAMP_8 SAMP_9 SAMP_10
## -2.3241599 4.1193169 0.2879729 -4.5436386
out %*% runif(ncol(out))
## <100 x 1> DelayedMatrix object of type "double":
## y
## GENE_1 -2.5881180
## GENE_2 -1.3992015
## GENE_3 -2.1444479
## GENE_4 1.2004298
## GENE_5 0.9119064
## ... .
## GENE_96 0.3945491
## GENE_97 1.3580597
## GENE_98 1.6764055
## GENE_99 -1.3805076
## GENE_100 -3.2639756
We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray().
For example, the code below allows us to control the path to the backend
as well as the name of the attribute containing the data.
X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.9696274 2.1010170 -2.1014252 . -1.7347835 -0.6488154
## [2,] -2.6217426 -0.1932832 -0.6313562 . 0.2527910 -0.1241249
## [3,] 0.6090490 -0.6476765 -0.8201650 . 1.1376374 1.1090150
## [4,] -0.6992607 -0.7141666 0.5663899 . 0.8899380 3.3792124
## [5,] 0.3268363 -1.7860914 1.3102527 . -0.7786451 -0.4938195
## ... . . . . . .
## [96,] -0.77060523 0.04101226 -1.29514000 . -0.95086683 0.37891427
## [97,] -0.46300746 -0.11114356 1.49848964 . -0.37707878 0.09880271
## [98,] -1.65274807 1.15994836 -0.28636100 . 0.26783606 2.52204105
## [99,] 2.93153512 -0.11830202 -1.87364369 . -0.25250750 0.09863699
## [100,] 0.06608643 -0.23167067 -0.01024835 . 0.84854641 1.08222777
As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.
path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.9696274 2.1010170 -2.1014252 . -1.7347835 -0.6488154
## [2,] -2.6217426 -0.1932832 -0.6313562 . 0.2527910 -0.1241249
## [3,] 0.6090490 -0.6476765 -0.8201650 . 1.1376374 1.1090150
## [4,] -0.6992607 -0.7141666 0.5663899 . 0.8899380 3.3792124
## [5,] 0.3268363 -1.7860914 1.3102527 . -0.7786451 -0.4938195
## ... . . . . . .
## [96,] -0.77060523 0.04101226 -1.29514000 . -0.95086683 0.37891427
## [97,] -0.46300746 -0.11114356 1.49848964 . -0.37707878 0.09880271
## [98,] -1.65274807 1.15994836 -0.28636100 . 0.26783606 2.52204105
## [99,] 2.93153512 -0.11830202 -1.87364369 . -0.25250750 0.09863699
## [100,] 0.06608643 -0.23167067 -0.01024835 . 0.84854641 1.08222777
sessionInfo()
## R version 4.3.0 RC (2023-04-13 r84266)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Monterey 12.6.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] RcppSpdlog_0.0.12 TileDBArray_1.10.0 DelayedArray_0.26.2
## [4] S4Arrays_1.0.1 IRanges_2.34.0 S4Vectors_0.38.1
## [7] MatrixGenerics_1.12.0 matrixStats_0.63.0 BiocGenerics_0.46.0
## [10] Matrix_1.5-4 BiocStyle_2.28.0
##
## loaded via a namespace (and not attached):
## [1] crayon_1.5.2 cli_3.6.1 knitr_1.42
## [4] rlang_1.1.0 xfun_0.38 data.table_1.14.8
## [7] jsonlite_1.8.4 zoo_1.8-12 bit_4.0.5
## [10] htmltools_0.5.5 nanotime_0.3.7 sass_0.4.5
## [13] rmarkdown_2.21 grid_4.3.0 evaluate_0.20
## [16] jquerylib_0.1.4 fastmap_1.1.1 yaml_2.3.7
## [19] bookdown_0.33 BiocManager_1.30.20 compiler_4.3.0
## [22] Rcpp_1.0.10 RcppCCTZ_0.2.12 lattice_0.21-8
## [25] digest_0.6.31 R6_2.5.1 tiledb_0.19.0
## [28] bslib_0.4.2 bit64_4.0.5 tools_4.3.0
## [31] spdl_0.0.4 cachem_1.0.7