TileDBArray 1.0.0
TileDB implements a framework for local and remote storage of dense and sparse arrays.
We can use this as a DelayedArray backend to provide an array-level abstraction,
thus allowing the data to be used in many places where an ordinary array or matrix might be used.
The TileDBArray package implements the necessary wrappers around TileDB-R
to support read/write operations on TileDB arrays within the DelayedArray framework.
TileDBArray
Creating a TileDBArray is as easy as:
X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.00917916 2.48136600 2.09949659 . 0.68091727 -0.79907317
## [2,] 0.29387642 0.23884843 -0.01743697 . 1.14146602 2.41519397
## [3,] 1.94308771 2.41017901 -2.20270019 . -0.09902853 -0.33164562
## [4,] -0.01058239 0.34992698 -0.24849602 . -0.51789395 -0.08996845
## [5,] -0.33514884 0.65252740 0.87909366 . 0.23525956 0.70809260
## ... . . . . . .
## [96,] 0.1497390 1.5086475 1.3103905 . 0.07038173 -0.63971422
## [97,] 2.3905530 -0.4222903 1.4595225 . -0.01702675 0.40246448
## [98,] -1.5994051 0.4115585 0.4044613 . 0.29091276 0.69000383
## [99,] 0.3193522 1.2549680 0.1385671 . 2.52015980 1.62887499
## [100,] 0.4665032 -2.0473783 -1.6307158 . 1.36331099 -0.77007736
Alternatively, we can use coercion methods:
as(X, "TileDBArray")
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.00917916 2.48136600 2.09949659 . 0.68091727 -0.79907317
## [2,] 0.29387642 0.23884843 -0.01743697 . 1.14146602 2.41519397
## [3,] 1.94308771 2.41017901 -2.20270019 . -0.09902853 -0.33164562
## [4,] -0.01058239 0.34992698 -0.24849602 . -0.51789395 -0.08996845
## [5,] -0.33514884 0.65252740 0.87909366 . 0.23525956 0.70809260
## ... . . . . . .
## [96,] 0.1497390 1.5086475 1.3103905 . 0.07038173 -0.63971422
## [97,] 2.3905530 -0.4222903 1.4595225 . -0.01702675 0.40246448
## [98,] -1.5994051 0.4115585 0.4044613 . 0.29091276 0.69000383
## [99,] 0.3193522 1.2549680 0.1385671 . 2.52015980 1.62887499
## [100,] 0.4665032 -2.0473783 -1.6307158 . 1.36331099 -0.77007736
This process also works for sparse matrices:
Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0 0 0 . 0 0
## [2,] 0 0 0 . 0 0
## [3,] 0 0 0 . 0 0
## [4,] 0 0 0 . 0 0
## [5,] 0 0 0 . 0 0
## ... . . . . . .
## [996,] 0 0 0 . 0 0
## [997,] 0 0 0 . 0 0
## [998,] 0 0 0 . 0 0
## [999,] 0 0 0 . 0 0
## [1000,] 0 0 0 . 0 0
Logical and integer matrices are supported:
writeTileDBArray(Y > 0)
## <1000 x 1000> sparse matrix of class TileDBMatrix and type "logical":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] FALSE FALSE FALSE . FALSE FALSE
## [2,] FALSE FALSE FALSE . FALSE FALSE
## [3,] FALSE FALSE FALSE . FALSE FALSE
## [4,] FALSE FALSE FALSE . FALSE FALSE
## [5,] FALSE FALSE FALSE . FALSE FALSE
## ... . . . . . .
## [996,] FALSE FALSE FALSE . FALSE FALSE
## [997,] FALSE FALSE FALSE . FALSE FALSE
## [998,] FALSE FALSE FALSE . FALSE FALSE
## [999,] FALSE FALSE FALSE . FALSE FALSE
## [1000,] FALSE FALSE FALSE . FALSE FALSE
As are matrices with dimension names:
rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> matrix of class TileDBMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 -0.00917916 2.48136600 2.09949659 . 0.68091727 -0.79907317
## GENE_2 0.29387642 0.23884843 -0.01743697 . 1.14146602 2.41519397
## GENE_3 1.94308771 2.41017901 -2.20270019 . -0.09902853 -0.33164562
## GENE_4 -0.01058239 0.34992698 -0.24849602 . -0.51789395 -0.08996845
## GENE_5 -0.33514884 0.65252740 0.87909366 . 0.23525956 0.70809260
## ... . . . . . .
## GENE_96 0.1497390 1.5086475 1.3103905 . 0.07038173 -0.63971422
## GENE_97 2.3905530 -0.4222903 1.4595225 . -0.01702675 0.40246448
## GENE_98 -1.5994051 0.4115585 0.4044613 . 0.29091276 0.69000383
## GENE_99 0.3193522 1.2549680 0.1385671 . 2.52015980 1.62887499
## GENE_100 0.4665032 -2.0473783 -1.6307158 . 1.36331099 -0.77007736
TileDBArrays
TileDBArrays are simply DelayedArray objects and can be manipulated as such.
The usual conventions for extracting data from matrix-like objects work as expected:
out <- as(X, "TileDBArray")
dim(out)
## [1] 100 10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
## [1] -0.00917916 0.29387642 1.94308771 -0.01058239 -0.33514884 -0.38495963
We can also perform manipulations like subsetting and arithmetic.
Note that these operations do not affect the data in the TileDB backend;
rather, they are delayed until the values are explicitly required,
hence the creation of the DelayedMatrix object.
out[1:5,1:5]
## <5 x 5> matrix of class DelayedMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5
## GENE_1 -0.00917916 2.48136600 2.09949659 0.30713033 0.17374706
## GENE_2 0.29387642 0.23884843 -0.01743697 1.29136260 0.21239100
## GENE_3 1.94308771 2.41017901 -2.20270019 0.66055155 0.05411511
## GENE_4 -0.01058239 0.34992698 -0.24849602 -0.18213800 0.44133474
## GENE_5 -0.33514884 0.65252740 0.87909366 -0.13086620 -0.27345850
out * 2
## <100 x 10> matrix of class DelayedMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 -0.01835832 4.96273200 4.19899318 . 1.3618345 -1.5981463
## GENE_2 0.58775283 0.47769686 -0.03487394 . 2.2829320 4.8303879
## GENE_3 3.88617541 4.82035801 -4.40540038 . -0.1980571 -0.6632912
## GENE_4 -0.02116478 0.69985395 -0.49699204 . -1.0357879 -0.1799369
## GENE_5 -0.67029767 1.30505481 1.75818732 . 0.4705191 1.4161852
## ... . . . . . .
## GENE_96 0.2994779 3.0172949 2.6207809 . 0.14076347 -1.27942844
## GENE_97 4.7811060 -0.8445806 2.9190449 . -0.03405349 0.80492897
## GENE_98 -3.1988102 0.8231170 0.8089226 . 0.58182552 1.38000767
## GENE_99 0.6387045 2.5099361 0.2771341 . 5.04031961 3.25774998
## GENE_100 0.9330063 -4.0947566 -3.2614317 . 2.72662198 -1.54015473
We can also do more complex matrix operations that are supported by DelayedArray:
colSums(out)
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5 SAMP_6 SAMP_7
## -4.7734066 -6.8729376 -9.5849650 22.1961669 -6.0738247 7.9665851 -7.6887666
## SAMP_8 SAMP_9 SAMP_10
## 0.8475866 0.7202236 5.2410834
out %*% runif(ncol(out))
## <100 x 1> matrix of class DelayedMatrix and type "double":
## y
## GENE_1 1.65489759
## GENE_2 2.94739512
## GENE_3 -1.01761478
## GENE_4 0.07803604
## GENE_5 1.85596877
## ... .
## GENE_96 -0.520778
## GENE_97 1.785528
## GENE_98 1.453101
## GENE_99 3.211069
## GENE_100 -2.712747
We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray().
For example, the code below allows us to control the path to the backend
as well as the name of the attribute containing the data.
X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -1.2990667 0.2937422 1.0581249 . -1.3038320 -1.3458431
## [2,] 0.5144211 -1.6260591 0.1906459 . 1.0638363 -0.6248759
## [3,] -1.2350954 1.2790956 -0.1601771 . -0.3305960 0.1744152
## [4,] 0.4001719 0.6619406 0.7911893 . -0.5830216 0.4723746
## [5,] 1.3838254 1.1398083 1.1654215 . -0.1557135 1.2571049
## ... . . . . . .
## [96,] 0.4518644 0.9719467 -0.3286256 . -0.1391583 -0.9771942
## [97,] -0.1321476 -0.4048106 0.2169297 . 0.9000114 -1.6105170
## [98,] 1.1895491 0.1596533 -1.5493773 . -0.5700465 0.9508540
## [99,] 2.0598406 0.1085602 -1.0298455 . 2.1379406 -1.1323443
## [100,] 0.5097971 -2.3404609 0.8140521 . -0.9921417 -0.5601653
As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.
path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -1.2990667 0.2937422 1.0581249 . -1.3038320 -1.3458431
## [2,] 0.5144211 -1.6260591 0.1906459 . 1.0638363 -0.6248759
## [3,] -1.2350954 1.2790956 -0.1601771 . -0.3305960 0.1744152
## [4,] 0.4001719 0.6619406 0.7911893 . -0.5830216 0.4723746
## [5,] 1.3838254 1.1398083 1.1654215 . -0.1557135 1.2571049
## ... . . . . . .
## [96,] 0.4518644 0.9719467 -0.3286256 . -0.1391583 -0.9771942
## [97,] -0.1321476 -0.4048106 0.2169297 . 0.9000114 -1.6105170
## [98,] 1.1895491 0.1596533 -1.5493773 . -0.5700465 0.9508540
## [99,] 2.0598406 0.1085602 -1.0298455 . 2.1379406 -1.1323443
## [100,] 0.5097971 -2.3404609 0.8140521 . -0.9921417 -0.5601653
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] TileDBArray_1.0.0 DelayedArray_0.16.0 IRanges_2.24.0
## [4] S4Vectors_0.28.0 MatrixGenerics_1.2.0 matrixStats_0.57.0
## [7] BiocGenerics_0.36.0 Matrix_1.2-18 BiocStyle_2.18.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.5 RcppCCTZ_0.2.9 knitr_1.30
## [4] magrittr_1.5 bit_4.0.4 nanotime_0.3.2
## [7] lattice_0.20-41 rlang_0.4.8 stringr_1.4.0
## [10] tools_4.0.3 grid_4.0.3 xfun_0.18
## [13] tiledb_0.8.2 htmltools_0.5.0 bit64_4.0.5
## [16] yaml_2.2.1 digest_0.6.27 bookdown_0.21
## [19] BiocManager_1.30.10 evaluate_0.14 rmarkdown_2.5
## [22] stringi_1.5.3 compiler_4.0.3 zoo_1.8-8