TileDBArray 1.3.1
TileDB implements a framework for local and remote storage of dense and sparse arrays.
We can use this as a DelayedArray
backend to provide an array-level abstraction,
thus allowing the data to be used in many places where an ordinary array or matrix might be used.
The TileDBArray package implements the necessary wrappers around TileDB-R
to support read/write operations on TileDB arrays within the DelayedArray framework.
TileDBArray
Creating a TileDBArray
is as easy as:
X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -1.4876942 0.9946409 0.9018698 . -0.900152175 -0.944196206
## [2,] -0.2066295 1.8965505 0.4174348 . 1.934064863 -0.734844800
## [3,] -0.3880476 -0.9267627 -1.0123803 . -1.101404107 -1.005070098
## [4,] -0.2857930 -0.7621731 -0.4664523 . -0.002923844 0.287002918
## [5,] -1.1382317 -0.4697956 1.5724830 . 1.298439953 1.001333989
## ... . . . . . .
## [96,] -0.8353525 0.1662235 -0.6860404 . 0.3474314 -2.1772387
## [97,] 0.6769341 1.7436521 0.2038298 . 1.4687745 0.5680365
## [98,] -0.7138535 0.9034176 -0.3244572 . 0.4175140 0.1923676
## [99,] -0.3030104 1.2077602 -0.4674825 . 0.8014256 -0.1469238
## [100,] -0.9994296 1.4035177 -0.6473532 . -0.6930187 0.8245946
Alternatively, we can use coercion methods:
as(X, "TileDBArray")
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -1.4876942 0.9946409 0.9018698 . -0.900152175 -0.944196206
## [2,] -0.2066295 1.8965505 0.4174348 . 1.934064863 -0.734844800
## [3,] -0.3880476 -0.9267627 -1.0123803 . -1.101404107 -1.005070098
## [4,] -0.2857930 -0.7621731 -0.4664523 . -0.002923844 0.287002918
## [5,] -1.1382317 -0.4697956 1.5724830 . 1.298439953 1.001333989
## ... . . . . . .
## [96,] -0.8353525 0.1662235 -0.6860404 . 0.3474314 -2.1772387
## [97,] 0.6769341 1.7436521 0.2038298 . 1.4687745 0.5680365
## [98,] -0.7138535 0.9034176 -0.3244572 . 0.4175140 0.1923676
## [99,] -0.3030104 1.2077602 -0.4674825 . 0.8014256 -0.1469238
## [100,] -0.9994296 1.4035177 -0.6473532 . -0.6930187 0.8245946
This process works also for sparse matrices:
Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0 0 0 . 0 0
## [2,] 0 0 0 . 0 0
## [3,] 0 0 0 . 0 0
## [4,] 0 0 0 . 0 0
## [5,] 0 0 0 . 0 0
## ... . . . . . .
## [996,] 0 0 0 . 0 0
## [997,] 0 0 0 . 0 0
## [998,] 0 0 0 . 0 0
## [999,] 0 0 0 . 0 0
## [1000,] 0 0 0 . 0 0
Logical and integer matrices are supported:
writeTileDBArray(Y > 0)
## <1000 x 1000> sparse matrix of class TileDBMatrix and type "logical":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] FALSE FALSE FALSE . FALSE FALSE
## [2,] FALSE FALSE FALSE . FALSE FALSE
## [3,] FALSE FALSE FALSE . FALSE FALSE
## [4,] FALSE FALSE FALSE . FALSE FALSE
## [5,] FALSE FALSE FALSE . FALSE FALSE
## ... . . . . . .
## [996,] FALSE FALSE FALSE . FALSE FALSE
## [997,] FALSE FALSE FALSE . FALSE FALSE
## [998,] FALSE FALSE FALSE . FALSE FALSE
## [999,] FALSE FALSE FALSE . FALSE FALSE
## [1000,] FALSE FALSE FALSE . FALSE FALSE
As are matrices with dimension names:
rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> matrix of class TileDBMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 -1.4876942 0.9946409 0.9018698 . -0.900152175 -0.944196206
## GENE_2 -0.2066295 1.8965505 0.4174348 . 1.934064863 -0.734844800
## GENE_3 -0.3880476 -0.9267627 -1.0123803 . -1.101404107 -1.005070098
## GENE_4 -0.2857930 -0.7621731 -0.4664523 . -0.002923844 0.287002918
## GENE_5 -1.1382317 -0.4697956 1.5724830 . 1.298439953 1.001333989
## ... . . . . . .
## GENE_96 -0.8353525 0.1662235 -0.6860404 . 0.3474314 -2.1772387
## GENE_97 0.6769341 1.7436521 0.2038298 . 1.4687745 0.5680365
## GENE_98 -0.7138535 0.9034176 -0.3244572 . 0.4175140 0.1923676
## GENE_99 -0.3030104 1.2077602 -0.4674825 . 0.8014256 -0.1469238
## GENE_100 -0.9994296 1.4035177 -0.6473532 . -0.6930187 0.8245946
TileDBArray
sTileDBArray
s are simply DelayedArray
objects and can be manipulated as such.
The usual conventions for extracting data from matrix-like objects work as expected:
out <- as(X, "TileDBArray")
dim(out)
## [1] 100 10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
## GENE_1 GENE_2 GENE_3 GENE_4 GENE_5 GENE_6
## -1.4876942 -0.2066295 -0.3880476 -0.2857930 -1.1382317 0.3623738
We can also perform manipulations like subsetting and arithmetic.
Note that these operations do not affect the data in the TileDB backend;
rather, they are delayed until the values are explicitly required,
hence the creation of the DelayedMatrix
object.
out[1:5,1:5]
## <5 x 5> matrix of class DelayedMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5
## GENE_1 -1.4876942 0.9946409 0.9018698 0.2765628 -0.8900061
## GENE_2 -0.2066295 1.8965505 0.4174348 1.3127971 0.8602111
## GENE_3 -0.3880476 -0.9267627 -1.0123803 1.1554082 -0.3014832
## GENE_4 -0.2857930 -0.7621731 -0.4664523 -1.2519341 0.3634616
## GENE_5 -1.1382317 -0.4697956 1.5724830 1.5117490 0.1700830
out * 2
## <100 x 10> matrix of class DelayedMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 -2.9753884 1.9892818 1.8037397 . -1.800304350 -1.888392411
## GENE_2 -0.4132590 3.7931009 0.8348695 . 3.868129726 -1.469689599
## GENE_3 -0.7760952 -1.8535253 -2.0247606 . -2.202808214 -2.010140196
## GENE_4 -0.5715861 -1.5243462 -0.9329047 . -0.005847688 0.574005836
## GENE_5 -2.2764635 -0.9395912 3.1449660 . 2.596879906 2.002667979
## ... . . . . . .
## GENE_96 -1.6707051 0.3324470 -1.3720809 . 0.6948629 -4.3544774
## GENE_97 1.3538683 3.4873042 0.4076595 . 2.9375490 1.1360729
## GENE_98 -1.4277069 1.8068352 -0.6489145 . 0.8350279 0.3847353
## GENE_99 -0.6060209 2.4155204 -0.9349650 . 1.6028513 -0.2938476
## GENE_100 -1.9988593 2.8070355 -1.2947065 . -1.3860375 1.6491892
We can also do more complex matrix operations that are supported by DelayedArray:
colSums(out)
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5 SAMP_6 SAMP_7
## -8.200481 9.750826 -16.745692 1.832169 -5.987724 -3.920892 -2.363883
## SAMP_8 SAMP_9 SAMP_10
## -11.414388 -15.822494 9.691824
out %*% runif(ncol(out))
## <100 x 1> matrix of class DelayedMatrix and type "double":
## y
## GENE_1 -0.4143653
## GENE_2 2.5382724
## GENE_3 -1.5118408
## GENE_4 -2.8317523
## GENE_5 2.0210758
## ... .
## GENE_96 -2.6404997
## GENE_97 4.4481894
## GENE_98 0.1846369
## GENE_99 1.7132315
## GENE_100 1.0276862
We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray()
.
For example, the example below allows us to control the path to the backend
as well as the name of the attribute containing the data.
X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.35343331 0.29641438 -0.96936867 . 0.9892044 0.6748587
## [2,] -0.85571114 2.29723682 -1.01650496 . 0.4915177 -0.9085020
## [3,] -0.12393038 1.09959940 -0.08045247 . 0.6301282 -0.1001589
## [4,] -1.03221849 -0.39013470 -0.56478213 . -0.1994910 -2.2906674
## [5,] -0.73694619 -0.25845930 -0.29360742 . 0.3880314 -0.1548332
## ... . . . . . .
## [96,] -0.11348848 0.60051133 -2.34905726 . 0.60939019 -1.76346369
## [97,] 0.50912445 -1.82966485 -0.65246504 . -1.07424355 -1.31265180
## [98,] -0.87829866 0.09913116 -1.46613623 . 0.03499978 0.60636647
## [99,] 0.55111251 0.41976440 0.24197894 . -1.27535453 -1.13206082
## [100,] -1.94753682 -0.32877248 2.35404588 . 1.58599482 0.25613118
As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.
path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.35343331 0.29641438 -0.96936867 . 0.9892044 0.6748587
## [2,] -0.85571114 2.29723682 -1.01650496 . 0.4915177 -0.9085020
## [3,] -0.12393038 1.09959940 -0.08045247 . 0.6301282 -0.1001589
## [4,] -1.03221849 -0.39013470 -0.56478213 . -0.1994910 -2.2906674
## [5,] -0.73694619 -0.25845930 -0.29360742 . 0.3880314 -0.1548332
## ... . . . . . .
## [96,] -0.11348848 0.60051133 -2.34905726 . 0.60939019 -1.76346369
## [97,] 0.50912445 -1.82966485 -0.65246504 . -1.07424355 -1.31265180
## [98,] -0.87829866 0.09913116 -1.46613623 . 0.03499978 0.60636647
## [99,] 0.55111251 0.41976440 0.24197894 . -1.27535453 -1.13206082
## [100,] -1.94753682 -0.32877248 2.35404588 . 1.58599482 0.25613118
sessionInfo()
## R version 4.1.0 RC (2021-05-16 r80304)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] TileDBArray_1.3.1 DelayedArray_0.19.0 IRanges_2.27.0
## [4] S4Vectors_0.31.0 MatrixGenerics_1.5.0 matrixStats_0.58.0
## [7] BiocGenerics_0.39.0 Matrix_1.3-3 BiocStyle_2.21.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.6 bslib_0.2.5.1 compiler_4.1.0
## [4] BiocManager_1.30.15 jquerylib_0.1.4 tools_4.1.0
## [7] digest_0.6.27 bit_4.0.4 jsonlite_1.7.2
## [10] evaluate_0.14 lattice_0.20-44 nanotime_0.3.2
## [13] rlang_0.4.11 RcppCCTZ_0.2.9 yaml_2.2.1
## [16] xfun_0.23 stringr_1.4.0 knitr_1.33
## [19] sass_0.4.0 bit64_4.0.5 grid_4.1.0
## [22] R6_2.5.0 rmarkdown_2.8 bookdown_0.22
## [25] tiledb_0.9.2 magrittr_2.0.1 htmltools_0.5.1.1
## [28] stringi_1.6.2 zoo_1.8-9