TileDBArray 1.12.0
TileDB implements a framework for local and remote storage of dense and sparse arrays.
We can use this as a DelayedArray backend to provide an array-level abstraction,
thus allowing the data to be used in many places where an ordinary array or matrix might be used.
The TileDBArray package implements the necessary wrappers around TileDB-R
to support read/write operations on TileDB arrays within the DelayedArray framework.
TileDBArray
Creating a TileDBArray is as easy as:
X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.94178844 -0.62700543 0.60183264 . -0.2243928 -1.1455271
## [2,] 0.04502763 -0.04181300 0.33643838 . -0.8044238 -0.1929210
## [3,] -0.52948843 -1.62683533 -0.17858431 . -1.7369557 -1.5487005
## [4,] 0.66265029 0.63687630 -1.66534052 . -2.3051074 -0.3645513
## [5,] 0.19621862 0.64994273 -0.62014965 . -0.8477423 -0.3019052
## ... . . . . . .
## [96,] -0.46412951 0.72564578 -0.21278365 . 0.3202285 -1.7422782
## [97,] 1.21965386 0.97042483 -1.20139680 . -0.3431161 1.0402541
## [98,] 0.40968960 0.66494135 -0.32323964 . -1.0033156 -0.1634495
## [99,] -1.11460384 -1.29254694 1.17182981 . -0.3346231 1.0601997
## [100,] 0.05107307 -0.68777550 -0.76848196 . -0.3479158 0.2684019
Alternatively, we can use coercion methods:
as(X, "TileDBArray")
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.94178844 -0.62700543 0.60183264 . -0.2243928 -1.1455271
## [2,] 0.04502763 -0.04181300 0.33643838 . -0.8044238 -0.1929210
## [3,] -0.52948843 -1.62683533 -0.17858431 . -1.7369557 -1.5487005
## [4,] 0.66265029 0.63687630 -1.66534052 . -2.3051074 -0.3645513
## [5,] 0.19621862 0.64994273 -0.62014965 . -0.8477423 -0.3019052
## ... . . . . . .
## [96,] -0.46412951 0.72564578 -0.21278365 . 0.3202285 -1.7422782
## [97,] 1.21965386 0.97042483 -1.20139680 . -0.3431161 1.0402541
## [98,] 0.40968960 0.66494135 -0.32323964 . -1.0033156 -0.1634495
## [99,] -1.11460384 -1.29254694 1.17182981 . -0.3346231 1.0601997
## [100,] 0.05107307 -0.68777550 -0.76848196 . -0.3479158 0.2684019
This process also works for sparse matrices:
Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0 0 0 . 0 0
## [2,] 0 0 0 . 0 0
## [3,] 0 0 0 . 0 0
## [4,] 0 0 0 . 0 0
## [5,] 0 0 0 . 0 0
## ... . . . . . .
## [996,] 0 0 0 . 0 0
## [997,] 0 0 0 . 0 0
## [998,] 0 0 0 . 0 0
## [999,] 0 0 0 . 0 0
## [1000,] 0 0 0 . 0 0
Logical and integer matrices are supported:
writeTileDBArray(Y > 0)
## <1000 x 1000> sparse TileDBMatrix object of type "logical":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] FALSE FALSE FALSE . FALSE FALSE
## [2,] FALSE FALSE FALSE . FALSE FALSE
## [3,] FALSE FALSE FALSE . FALSE FALSE
## [4,] FALSE FALSE FALSE . FALSE FALSE
## [5,] FALSE FALSE FALSE . FALSE FALSE
## ... . . . . . .
## [996,] FALSE FALSE FALSE . FALSE FALSE
## [997,] FALSE FALSE FALSE . FALSE FALSE
## [998,] FALSE FALSE FALSE . FALSE FALSE
## [999,] FALSE FALSE FALSE . FALSE FALSE
## [1000,] FALSE FALSE FALSE . FALSE FALSE
As are matrices with dimension names:
rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 -0.94178844 -0.62700543 0.60183264 . -0.2243928 -1.1455271
## GENE_2 0.04502763 -0.04181300 0.33643838 . -0.8044238 -0.1929210
## GENE_3 -0.52948843 -1.62683533 -0.17858431 . -1.7369557 -1.5487005
## GENE_4 0.66265029 0.63687630 -1.66534052 . -2.3051074 -0.3645513
## GENE_5 0.19621862 0.64994273 -0.62014965 . -0.8477423 -0.3019052
## ... . . . . . .
## GENE_96 -0.46412951 0.72564578 -0.21278365 . 0.3202285 -1.7422782
## GENE_97 1.21965386 0.97042483 -1.20139680 . -0.3431161 1.0402541
## GENE_98 0.40968960 0.66494135 -0.32323964 . -1.0033156 -0.1634495
## GENE_99 -1.11460384 -1.29254694 1.17182981 . -0.3346231 1.0601997
## GENE_100 0.05107307 -0.68777550 -0.76848196 . -0.3479158 0.2684019
TileDBArrays
TileDBArrays are simply DelayedArray objects and can be manipulated as such.
The usual conventions for extracting data from matrix-like objects work as expected:
out <- as(X, "TileDBArray")
dim(out)
## [1] 100 10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
## GENE_1 GENE_2 GENE_3 GENE_4 GENE_5 GENE_6
## -0.94178844 0.04502763 -0.52948843 0.66265029 0.19621862 1.72419100
We can also perform manipulations like subsetting and arithmetic.
Note that these operations do not affect the data in the TileDB backend;
rather, they are delayed until the values are explicitly required,
hence the creation of the DelayedMatrix object.
out[1:5,1:5]
## <5 x 5> DelayedMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5
## GENE_1 -0.94178844 -0.62700543 0.60183264 1.05735792 -0.09521169
## GENE_2 0.04502763 -0.04181300 0.33643838 0.35611017 1.43408806
## GENE_3 -0.52948843 -1.62683533 -0.17858431 -0.06553833 0.09369211
## GENE_4 0.66265029 0.63687630 -1.66534052 0.30640467 -0.79457599
## GENE_5 0.19621862 0.64994273 -0.62014965 0.49895050 -1.54576504
out * 2
## <100 x 10> DelayedMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 -1.88357689 -1.25401086 1.20366527 . -0.4487857 -2.2910542
## GENE_2 0.09005527 -0.08362600 0.67287676 . -1.6088476 -0.3858419
## GENE_3 -1.05897686 -3.25367067 -0.35716862 . -3.4739114 -3.0974009
## GENE_4 1.32530058 1.27375259 -3.33068103 . -4.6102149 -0.7291025
## GENE_5 0.39243725 1.29988546 -1.24029931 . -1.6954847 -0.6038103
## ... . . . . . .
## GENE_96 -0.9282590 1.4512916 -0.4255673 . 0.6404570 -3.4845563
## GENE_97 2.4393077 1.9408497 -2.4027936 . -0.6862323 2.0805082
## GENE_98 0.8193792 1.3298827 -0.6464793 . -2.0066312 -0.3268989
## GENE_99 -2.2292077 -2.5850939 2.3436596 . -0.6692461 2.1203994
## GENE_100 0.1021461 -1.3755510 -1.5369639 . -0.6958315 0.5368037
We can also do more complex matrix operations that are supported by DelayedArray:
colSums(out)
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5 SAMP_6
## -8.6361618 -8.0739004 -14.2228185 7.9804543 -0.7927904 11.3500122
## SAMP_7 SAMP_8 SAMP_9 SAMP_10
## -0.1774751 2.5370590 -6.0595616 -17.3880190
out %*% runif(ncol(out))
## <100 x 1> DelayedMatrix object of type "double":
## y
## GENE_1 -0.1402634
## GENE_2 1.3847311
## GENE_3 -3.6544835
## GENE_4 -2.4101736
## GENE_5 -1.1035764
## ... .
## GENE_96 -1.1255900
## GENE_97 0.7953684
## GENE_98 0.7171758
## GENE_99 0.4068600
## GENE_100 -0.2920467
We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray().
For example, the code below allows us to control the path to the backend
as well as the name of the attribute containing the data.
X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.1505235 1.2880402 1.2271147 . 0.1074502 0.9429227
## [2,] 1.2162633 1.3142932 0.9214262 . -0.4507009 -0.7955018
## [3,] -0.1409912 -0.1750126 -1.1247218 . -0.5695450 -0.1550749
## [4,] 0.5988802 0.2553403 1.6273801 . 0.2158597 0.6110915
## [5,] 0.6048747 0.1962898 -1.2006590 . 0.4174817 0.1504474
## ... . . . . . .
## [96,] 1.25991611 0.58201369 0.78091457 . -0.667439220 0.593560320
## [97,] 0.87917964 0.24314292 -0.70962381 . 1.101701899 -1.342769078
## [98,] -2.03343094 -1.01277676 -0.92313926 . 1.887865199 0.801879878
## [99,] 0.20814737 -0.93471545 -0.90667702 . -1.305145317 -0.001397843
## [100,] 0.78945758 -0.78723388 -0.05659207 . 1.534474432 -1.653283410
As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.
path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.1505235 1.2880402 1.2271147 . 0.1074502 0.9429227
## [2,] 1.2162633 1.3142932 0.9214262 . -0.4507009 -0.7955018
## [3,] -0.1409912 -0.1750126 -1.1247218 . -0.5695450 -0.1550749
## [4,] 0.5988802 0.2553403 1.6273801 . 0.2158597 0.6110915
## [5,] 0.6048747 0.1962898 -1.2006590 . 0.4174817 0.1504474
## ... . . . . . .
## [96,] 1.25991611 0.58201369 0.78091457 . -0.667439220 0.593560320
## [97,] 0.87917964 0.24314292 -0.70962381 . 1.101701899 -1.342769078
## [98,] -2.03343094 -1.01277676 -0.92313926 . 1.887865199 0.801879878
## [99,] 0.20814737 -0.93471545 -0.90667702 . -1.305145317 -0.001397843
## [100,] 0.78945758 -0.78723388 -0.05659207 . 1.534474432 -1.653283410
sessionInfo()
## R version 4.3.1 Patched (2023-06-17 r84564)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Monterey 12.6.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] RcppSpdlog_0.0.14 TileDBArray_1.12.0 DelayedArray_0.28.0
## [4] SparseArray_1.2.0 S4Arrays_1.2.0 abind_1.4-5
## [7] IRanges_2.36.0 S4Vectors_0.40.0 MatrixGenerics_1.14.0
## [10] matrixStats_1.0.0 BiocGenerics_0.48.0 Matrix_1.6-1.1
## [13] BiocStyle_2.30.0
##
## loaded via a namespace (and not attached):
## [1] bit_4.0.5 jsonlite_1.8.7 compiler_4.3.1
## [4] BiocManager_1.30.22 crayon_1.5.2 Rcpp_1.0.11
## [7] jquerylib_0.1.4 yaml_2.3.7 fastmap_1.1.1
## [10] lattice_0.22-5 R6_2.5.1 RcppCCTZ_0.2.12
## [13] XVector_0.42.0 tiledb_0.21.1 knitr_1.44
## [16] bookdown_0.36 bslib_0.5.1 rlang_1.1.1
## [19] cachem_1.0.8 xfun_0.40 sass_0.4.7
## [22] bit64_4.0.5 cli_3.6.1 zlibbioc_1.48.0
## [25] spdl_0.0.5 digest_0.6.33 grid_4.3.1
## [28] data.table_1.14.8 evaluate_0.22 nanotime_0.3.7
## [31] zoo_1.8-12 rmarkdown_2.25 tools_4.3.1
## [34] htmltools_0.5.6.1