TileDBArray 1.8.0
TileDB implements a framework for local and remote storage of dense and sparse arrays.
We can use this as a DelayedArray
backend to provide an array-level abstraction,
thus allowing the data to be used in many places where an ordinary array or matrix might be used.
The TileDBArray package implements the necessary wrappers around TileDB-R
to support read/write operations on TileDB arrays within the DelayedArray framework.
TileDBArray
Creating a TileDBArray
is as easy as:
X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.2640003 0.5416198 -1.0856839 . -1.1016997 -0.5836639
## [2,] -1.2259816 -1.1376672 -0.3787514 . -0.6869444 -0.8165528
## [3,] 0.9445778 -0.5509675 -0.0259178 . -1.7109114 0.4861461
## [4,] 2.5489565 0.7891163 -0.5366672 . 0.6896411 -0.3069134
## [5,] 1.2701316 -0.7724357 1.4558719 . 0.8993002 -1.2343125
## ... . . . . . .
## [96,] 0.8538330 -0.7634359 -1.6978769 . -0.8730761 -0.7638128
## [97,] 1.1616570 -1.0349033 1.1520933 . -0.1717866 0.3467213
## [98,] 2.1416492 1.7457579 1.7450897 . 0.6089990 0.2954712
## [99,] -0.8861340 -0.3956813 1.3786109 . 0.5129737 0.8523028
## [100,] 0.1312684 0.6457383 0.6129041 . -1.9830656 1.8730542
Alternatively, we can use coercion methods:
as(X, "TileDBArray")
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.2640003 0.5416198 -1.0856839 . -1.1016997 -0.5836639
## [2,] -1.2259816 -1.1376672 -0.3787514 . -0.6869444 -0.8165528
## [3,] 0.9445778 -0.5509675 -0.0259178 . -1.7109114 0.4861461
## [4,] 2.5489565 0.7891163 -0.5366672 . 0.6896411 -0.3069134
## [5,] 1.2701316 -0.7724357 1.4558719 . 0.8993002 -1.2343125
## ... . . . . . .
## [96,] 0.8538330 -0.7634359 -1.6978769 . -0.8730761 -0.7638128
## [97,] 1.1616570 -1.0349033 1.1520933 . -0.1717866 0.3467213
## [98,] 2.1416492 1.7457579 1.7450897 . 0.6089990 0.2954712
## [99,] -0.8861340 -0.3956813 1.3786109 . 0.5129737 0.8523028
## [100,] 0.1312684 0.6457383 0.6129041 . -1.9830656 1.8730542
This process works also for sparse matrices:
Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0 0 0 . 0 0
## [2,] 0 0 0 . 0 0
## [3,] 0 0 0 . 0 0
## [4,] 0 0 0 . 0 0
## [5,] 0 0 0 . 0 0
## ... . . . . . .
## [996,] 0 0 0 . 0 0
## [997,] 0 0 0 . 0 0
## [998,] 0 0 0 . 0 0
## [999,] 0 0 0 . 0 0
## [1000,] 0 0 0 . 0 0
Logical and integer matrices are supported:
writeTileDBArray(Y > 0)
## <1000 x 1000> sparse matrix of class TileDBMatrix and type "logical":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] FALSE FALSE FALSE . FALSE FALSE
## [2,] FALSE FALSE FALSE . FALSE FALSE
## [3,] FALSE FALSE FALSE . FALSE FALSE
## [4,] FALSE FALSE FALSE . FALSE FALSE
## [5,] FALSE FALSE FALSE . FALSE FALSE
## ... . . . . . .
## [996,] FALSE FALSE FALSE . FALSE FALSE
## [997,] FALSE FALSE FALSE . FALSE FALSE
## [998,] FALSE FALSE FALSE . FALSE FALSE
## [999,] FALSE FALSE FALSE . FALSE FALSE
## [1000,] FALSE FALSE FALSE . FALSE FALSE
As are matrices with dimension names:
rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> matrix of class TileDBMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 -0.2640003 0.5416198 -1.0856839 . -1.1016997 -0.5836639
## GENE_2 -1.2259816 -1.1376672 -0.3787514 . -0.6869444 -0.8165528
## GENE_3 0.9445778 -0.5509675 -0.0259178 . -1.7109114 0.4861461
## GENE_4 2.5489565 0.7891163 -0.5366672 . 0.6896411 -0.3069134
## GENE_5 1.2701316 -0.7724357 1.4558719 . 0.8993002 -1.2343125
## ... . . . . . .
## GENE_96 0.8538330 -0.7634359 -1.6978769 . -0.8730761 -0.7638128
## GENE_97 1.1616570 -1.0349033 1.1520933 . -0.1717866 0.3467213
## GENE_98 2.1416492 1.7457579 1.7450897 . 0.6089990 0.2954712
## GENE_99 -0.8861340 -0.3956813 1.3786109 . 0.5129737 0.8523028
## GENE_100 0.1312684 0.6457383 0.6129041 . -1.9830656 1.8730542
TileDBArray
sTileDBArray
s are simply DelayedArray
objects and can be manipulated as such.
The usual conventions for extracting data from matrix-like objects work as expected:
out <- as(X, "TileDBArray")
dim(out)
## [1] 100 10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
## GENE_1 GENE_2 GENE_3 GENE_4 GENE_5 GENE_6
## -0.2640003 -1.2259816 0.9445778 2.5489565 1.2701316 -0.3676004
We can also perform manipulations like subsetting and arithmetic.
Note that these operations do not affect the data in the TileDB backend;
rather, they are delayed until the values are explicitly required,
hence the creation of the DelayedMatrix
object.
out[1:5,1:5]
## <5 x 5> matrix of class DelayedMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5
## GENE_1 -0.2640003 0.5416198 -1.0856839 1.1325230 -1.5142681
## GENE_2 -1.2259816 -1.1376672 -0.3787514 -1.6663522 -1.5917658
## GENE_3 0.9445778 -0.5509675 -0.0259178 0.8097697 0.3207587
## GENE_4 2.5489565 0.7891163 -0.5366672 -0.2471427 -0.2028863
## GENE_5 1.2701316 -0.7724357 1.4558719 -1.2351992 0.4061722
out * 2
## <100 x 10> matrix of class DelayedMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 -0.52800054 1.08323954 -2.17136776 . -2.2033993 -1.1673278
## GENE_2 -2.45196316 -2.27533445 -0.75750289 . -1.3738888 -1.6331056
## GENE_3 1.88915561 -1.10193497 -0.05183559 . -3.4218228 0.9722921
## GENE_4 5.09791298 1.57823266 -1.07333448 . 1.3792822 -0.6138268
## GENE_5 2.54026329 -1.54487138 2.91174373 . 1.7986005 -2.4686250
## ... . . . . . .
## GENE_96 1.7076659 -1.5268718 -3.3957538 . -1.7461523 -1.5276256
## GENE_97 2.3233140 -2.0698065 2.3041867 . -0.3435732 0.6934425
## GENE_98 4.2832985 3.4915158 3.4901794 . 1.2179981 0.5909424
## GENE_99 -1.7722681 -0.7913626 2.7572217 . 1.0259475 1.7046055
## GENE_100 0.2625367 1.2914766 1.2258081 . -3.9661313 3.7461084
We can also do more complex matrix operations that are supported by DelayedArray:
colSums(out)
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5 SAMP_6
## -0.8599570 -10.8258316 0.8952665 -18.7948858 32.0918730 7.4379815
## SAMP_7 SAMP_8 SAMP_9 SAMP_10
## -16.6303835 0.4850394 -0.4939165 -17.5287546
out %*% runif(ncol(out))
## <100 x 1> matrix of class DelayedMatrix and type "double":
## y
## GENE_1 -2.3585860
## GENE_2 -5.5310757
## GENE_3 -0.3046179
## GENE_4 3.7071711
## GENE_5 2.1400362
## ... .
## GENE_96 -2.854953
## GENE_97 2.879193
## GENE_98 2.948292
## GENE_99 1.492790
## GENE_100 3.219258
We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray()
.
For example, the example below allows us to control the path to the backend
as well as the name of the attribute containing the data.
X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.7553646 0.9674689 -0.7292148 . -1.8709681 0.9818543
## [2,] 1.2946611 0.4551860 0.4256007 . 0.8538542 -1.4380213
## [3,] 0.6373950 -0.4168922 -0.4043765 . 1.7767707 -0.5014699
## [4,] -0.5471411 2.3406304 0.5794445 . 0.2525754 3.2271998
## [5,] 1.1648167 0.6857656 1.2730973 . -0.8931647 -0.3110404
## ... . . . . . .
## [96,] 0.64174716 -1.03557301 -1.26936249 . -0.72817026 0.12432277
## [97,] 0.09014076 -0.87465835 -0.20097186 . 0.61983804 -0.26919858
## [98,] 0.41190611 0.56931257 1.61183750 . -1.42275209 -0.27383617
## [99,] -0.09850353 -0.76290121 0.86093005 . 0.57483361 -0.01492378
## [100,] 1.37658136 -0.45976734 0.12846070 . 1.12947600 -1.71125527
As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.
path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.7553646 0.9674689 -0.7292148 . -1.8709681 0.9818543
## [2,] 1.2946611 0.4551860 0.4256007 . 0.8538542 -1.4380213
## [3,] 0.6373950 -0.4168922 -0.4043765 . 1.7767707 -0.5014699
## [4,] -0.5471411 2.3406304 0.5794445 . 0.2525754 3.2271998
## [5,] 1.1648167 0.6857656 1.2730973 . -0.8931647 -0.3110404
## ... . . . . . .
## [96,] 0.64174716 -1.03557301 -1.26936249 . -0.72817026 0.12432277
## [97,] 0.09014076 -0.87465835 -0.20097186 . 0.61983804 -0.26919858
## [98,] 0.41190611 0.56931257 1.61183750 . -1.42275209 -0.27383617
## [99,] -0.09850353 -0.76290121 0.86093005 . 0.57483361 -0.01492378
## [100,] 1.37658136 -0.45976734 0.12846070 . 1.12947600 -1.71125527
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.0
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
##
## locale:
## [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] TileDBArray_1.8.0 DelayedArray_0.24.0 IRanges_2.32.0
## [4] S4Vectors_0.36.0 MatrixGenerics_1.10.0 matrixStats_0.62.0
## [7] BiocGenerics_0.44.0 Matrix_1.4-1 BiocStyle_2.26.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.9 bslib_0.3.1 compiler_4.2.1
## [4] BiocManager_1.30.18 jquerylib_0.1.4 tools_4.2.1
## [7] digest_0.6.29 bit_4.0.4 jsonlite_1.8.0
## [10] evaluate_0.15 lattice_0.20-45 nanotime_0.3.6
## [13] rlang_1.0.4 cli_3.3.0 RcppCCTZ_0.2.10
## [16] yaml_2.3.5 xfun_0.31 fastmap_1.1.0
## [19] stringr_1.4.0 knitr_1.39 sass_0.4.1
## [22] bit64_4.0.5 grid_4.2.1 data.table_1.14.2
## [25] R6_2.5.1 rmarkdown_2.14 bookdown_0.27
## [28] tiledb_0.14.1 magrittr_2.0.3 htmltools_0.5.2
## [31] stringi_1.7.8 zoo_1.8-10