TileDBArray 1.17.0
TileDB implements a framework for local and remote storage of dense and sparse arrays.
We can use this as a DelayedArray backend to provide an array-level abstraction,
thus allowing the data to be used in many places where an ordinary array or matrix might be used.
The TileDBArray package implements the necessary wrappers around TileDB-R
to support read/write operations on TileDB arrays within the DelayedArray framework.
TileDBArray
Creating a TileDBArray is as easy as:
X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 2.62959200 0.19391388 -1.08501711 . -0.02037295 0.04514482
## [2,] 0.08444157 0.35194720 1.22873045 . 2.11682745 -0.07218104
## [3,] 0.46628631 0.10434060 1.98500649 . -0.14839481 -0.96951259
## [4,] -0.58894633 1.90243972 0.68935012 . -0.30584191 -0.73782407
## [5,] -0.59900999 0.07978701 -0.69612264 . 0.20408254 0.99961509
## ... . . . . . .
## [96,] 1.93892002 1.31017812 -0.24742506 . -1.9782845 0.7270660
## [97,] 1.09796604 0.71583421 0.01756792 . 0.5871173 -0.4000176
## [98,] -0.57382668 0.31028886 -0.90668143 . 0.3385697 0.1378214
## [99,] 1.02831146 -0.88638213 1.70113455 . -0.9413263 -0.3415815
## [100,] -0.58146991 -0.28591557 -0.95861649 . -0.4080083 -0.9447406
Alternatively, we can use coercion methods:
as(X, "TileDBArray")
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 2.62959200 0.19391388 -1.08501711 . -0.02037295 0.04514482
## [2,] 0.08444157 0.35194720 1.22873045 . 2.11682745 -0.07218104
## [3,] 0.46628631 0.10434060 1.98500649 . -0.14839481 -0.96951259
## [4,] -0.58894633 1.90243972 0.68935012 . -0.30584191 -0.73782407
## [5,] -0.59900999 0.07978701 -0.69612264 . 0.20408254 0.99961509
## ... . . . . . .
## [96,] 1.93892002 1.31017812 -0.24742506 . -1.9782845 0.7270660
## [97,] 1.09796604 0.71583421 0.01756792 . 0.5871173 -0.4000176
## [98,] -0.57382668 0.31028886 -0.90668143 . 0.3385697 0.1378214
## [99,] 1.02831146 -0.88638213 1.70113455 . -0.9413263 -0.3415815
## [100,] -0.58146991 -0.28591557 -0.95861649 . -0.4080083 -0.9447406
This process also works for sparse matrices:
Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0 0 0 . 0 0
## [2,] 0 0 0 . 0 0
## [3,] 0 0 0 . 0 0
## [4,] 0 0 0 . 0 0
## [5,] 0 0 0 . 0 0
## ... . . . . . .
## [996,] 0 0 0 . 0 0
## [997,] 0 0 0 . 0 0
## [998,] 0 0 0 . 0 0
## [999,] 0 0 0 . 0 0
## [1000,] 0 0 0 . 0 0
Logical and integer matrices are supported:
writeTileDBArray(Y > 0)
## <1000 x 1000> sparse TileDBMatrix object of type "logical":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] FALSE FALSE FALSE . FALSE FALSE
## [2,] FALSE FALSE FALSE . FALSE FALSE
## [3,] FALSE FALSE FALSE . FALSE FALSE
## [4,] FALSE FALSE FALSE . FALSE FALSE
## [5,] FALSE FALSE FALSE . FALSE FALSE
## ... . . . . . .
## [996,] FALSE FALSE FALSE . FALSE FALSE
## [997,] FALSE FALSE FALSE . FALSE FALSE
## [998,] FALSE FALSE FALSE . FALSE FALSE
## [999,] FALSE FALSE FALSE . FALSE FALSE
## [1000,] FALSE FALSE FALSE . FALSE FALSE
As are matrices with dimension names:
rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 2.62959200 0.19391388 -1.08501711 . -0.02037295 0.04514482
## GENE_2 0.08444157 0.35194720 1.22873045 . 2.11682745 -0.07218104
## GENE_3 0.46628631 0.10434060 1.98500649 . -0.14839481 -0.96951259
## GENE_4 -0.58894633 1.90243972 0.68935012 . -0.30584191 -0.73782407
## GENE_5 -0.59900999 0.07978701 -0.69612264 . 0.20408254 0.99961509
## ... . . . . . .
## GENE_96 1.93892002 1.31017812 -0.24742506 . -1.9782845 0.7270660
## GENE_97 1.09796604 0.71583421 0.01756792 . 0.5871173 -0.4000176
## GENE_98 -0.57382668 0.31028886 -0.90668143 . 0.3385697 0.1378214
## GENE_99 1.02831146 -0.88638213 1.70113455 . -0.9413263 -0.3415815
## GENE_100 -0.58146991 -0.28591557 -0.95861649 . -0.4080083 -0.9447406
TileDBArrays
TileDBArrays are simply DelayedArray objects and can be manipulated as such.
The usual conventions for extracting data from matrix-like objects work as expected:
out <- as(X, "TileDBArray")
dim(out)
## [1] 100 10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
## GENE_1 GENE_2 GENE_3 GENE_4 GENE_5 GENE_6
## 2.62959200 0.08444157 0.46628631 -0.58894633 -0.59900999 0.37790015
We can also perform manipulations like subsetting and arithmetic.
Note that these operations do not affect the data in the TileDB backend;
rather, they are delayed until the values are explicitly required,
hence the creation of the DelayedMatrix object.
out[1:5,1:5]
## <5 x 5> DelayedMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5
## GENE_1 2.62959200 0.19391388 -1.08501711 0.15848963 -1.84268862
## GENE_2 0.08444157 0.35194720 1.22873045 -0.64779747 -0.02626796
## GENE_3 0.46628631 0.10434060 1.98500649 -0.01390352 1.03805390
## GENE_4 -0.58894633 1.90243972 0.68935012 -0.73790139 0.11463852
## GENE_5 -0.59900999 0.07978701 -0.69612264 -0.70542268 -0.59529513
out * 2
## <100 x 10> DelayedMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 5.2591840 0.3878278 -2.1700342 . -0.04074591 0.09028963
## GENE_2 0.1688831 0.7038944 2.4574609 . 4.23365491 -0.14436208
## GENE_3 0.9325726 0.2086812 3.9700130 . -0.29678962 -1.93902518
## GENE_4 -1.1778927 3.8048794 1.3787002 . -0.61168382 -1.47564815
## GENE_5 -1.1980200 0.1595740 -1.3922453 . 0.40816508 1.99923017
## ... . . . . . .
## GENE_96 3.87784004 2.62035623 -0.49485013 . -3.9565691 1.4541320
## GENE_97 2.19593208 1.43166841 0.03513585 . 1.1742346 -0.8000352
## GENE_98 -1.14765336 0.62057773 -1.81336286 . 0.6771394 0.2756427
## GENE_99 2.05662292 -1.77276426 3.40226910 . -1.8826527 -0.6831629
## GENE_100 -1.16293983 -0.57183114 -1.91723299 . -0.8160166 -1.8894811
We can also do more complex matrix operations that are supported by DelayedArray:
colSums(out)
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5 SAMP_6
## 17.3790908 17.4109161 -13.2008932 -11.3875180 -3.3928529 -2.0244144
## SAMP_7 SAMP_8 SAMP_9 SAMP_10
## -0.6438404 5.3244535 3.8719887 7.1839794
out %*% runif(ncol(out))
## [,1]
## GENE_1 0.208751243
## GENE_2 1.450930185
## GENE_3 -1.047259031
## GENE_4 -0.870368557
## GENE_5 0.904414240
## GENE_6 1.423835123
## GENE_7 -0.133049331
## GENE_8 -0.177677336
## GENE_9 -2.946745874
## GENE_10 -1.721469201
## GENE_11 1.478377976
## GENE_12 -0.988847202
## GENE_13 -3.571189445
## GENE_14 2.934872053
## GENE_15 0.883184134
## GENE_16 2.187632673
## GENE_17 2.039340885
## GENE_18 3.821197634
## GENE_19 -0.711113437
## GENE_20 1.411689314
## GENE_21 -0.610187951
## GENE_22 1.179790946
## GENE_23 -1.691275873
## GENE_24 1.310809976
## GENE_25 2.208029364
## GENE_26 -0.664266631
## GENE_27 -1.480142677
## GENE_28 1.441128963
## GENE_29 1.042294254
## GENE_30 1.951661629
## GENE_31 0.677215703
## GENE_32 -2.164802770
## GENE_33 -2.652049490
## GENE_34 1.430236640
## GENE_35 -0.645033355
## GENE_36 -0.710834191
## GENE_37 3.182226365
## GENE_38 -0.042564635
## GENE_39 1.320771798
## GENE_40 0.389427539
## GENE_41 -2.958582183
## GENE_42 0.741804978
## GENE_43 0.838162072
## GENE_44 2.056547325
## GENE_45 -0.973506586
## GENE_46 -0.605155913
## GENE_47 2.535685384
## GENE_48 -1.565261352
## GENE_49 1.463299088
## GENE_50 -0.096122569
## GENE_51 -2.118348961
## GENE_52 0.950281231
## GENE_53 -2.216535232
## GENE_54 4.527445104
## GENE_55 -0.230142013
## GENE_56 -3.001164990
## GENE_57 -1.894182756
## GENE_58 -0.999051184
## GENE_59 1.351586719
## GENE_60 -0.291182666
## GENE_61 -2.875271828
## GENE_62 -2.205514626
## GENE_63 -1.202502048
## GENE_64 -2.050242266
## GENE_65 -0.325488219
## GENE_66 -1.459680789
## GENE_67 -0.319592808
## GENE_68 1.698083996
## GENE_69 -0.358540783
## GENE_70 0.879953943
## GENE_71 -1.904931253
## GENE_72 1.707612377
## GENE_73 1.178914436
## GENE_74 0.198659188
## GENE_75 4.029540242
## GENE_76 1.836252305
## GENE_77 -1.348265888
## GENE_78 1.044540383
## GENE_79 2.703884464
## GENE_80 0.001503996
## GENE_81 -1.264966425
## GENE_82 -0.505925919
## GENE_83 0.408388098
## GENE_84 0.533522967
## GENE_85 -2.466729970
## GENE_86 1.595184516
## GENE_87 1.788923896
## GENE_88 -3.313879811
## GENE_89 -1.036889365
## GENE_90 -0.341767190
## GENE_91 1.670327629
## GENE_92 -0.885695266
## GENE_93 -0.344771981
## GENE_94 -2.984218887
## GENE_95 -1.577913383
## GENE_96 -0.198930097
## GENE_97 -0.120375480
## GENE_98 -2.041991092
## GENE_99 1.153590678
## GENE_100 -0.276827159
We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray().
For example, the code below allows us to control the path to the backend
as well as the name of the attribute containing the data.
X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.04598121 1.70113041 -0.22200391 . 0.646294870 -1.241378593
## [2,] 0.38883620 -1.15903317 -0.86169059 . -0.061866602 -0.265896121
## [3,] 0.49886556 1.22398692 -1.00561657 . 0.129649868 0.001448387
## [4,] 0.58535140 0.06496264 -1.06415428 . 0.325018573 0.284306193
## [5,] -0.68061355 0.01235286 -1.69959549 . -0.172873365 1.906059005
## ... . . . . . .
## [96,] 2.0132831 0.8495502 0.4666486 . 1.1238013 -1.0606060
## [97,] -0.6794967 0.2729679 -0.9874437 . -1.0005060 0.4084280
## [98,] -1.2298746 2.4890566 -1.1938685 . 1.4507118 -1.1122924
## [99,] 0.3055599 1.6021382 0.3829184 . 1.4870642 -0.4867454
## [100,] -0.5605120 -0.4450448 -0.4840729 . -0.3400393 -0.7553986
As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.
path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.04598121 1.70113041 -0.22200391 . 0.646294870 -1.241378593
## [2,] 0.38883620 -1.15903317 -0.86169059 . -0.061866602 -0.265896121
## [3,] 0.49886556 1.22398692 -1.00561657 . 0.129649868 0.001448387
## [4,] 0.58535140 0.06496264 -1.06415428 . 0.325018573 0.284306193
## [5,] -0.68061355 0.01235286 -1.69959549 . -0.172873365 1.906059005
## ... . . . . . .
## [96,] 2.0132831 0.8495502 0.4666486 . 1.1238013 -1.0606060
## [97,] -0.6794967 0.2729679 -0.9874437 . -1.0005060 0.4084280
## [98,] -1.2298746 2.4890566 -1.1938685 . 1.4507118 -1.1122924
## [99,] 0.3055599 1.6021382 0.3829184 . 1.4870642 -0.4867454
## [100,] -0.5605120 -0.4450448 -0.4840729 . -0.3400393 -0.7553986
sessionInfo()
## R Under development (unstable) (2024-11-20 r87352)
## Platform: x86_64-apple-darwin20
## Running under: macOS Monterey 12.7.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] RcppSpdlog_0.0.19 TileDBArray_1.17.0 DelayedArray_0.33.2
## [4] SparseArray_1.7.2 S4Arrays_1.7.1 IRanges_2.41.1
## [7] abind_1.4-8 S4Vectors_0.45.2 MatrixGenerics_1.19.0
## [10] matrixStats_1.4.1 BiocGenerics_0.53.3 generics_0.1.3
## [13] Matrix_1.7-1 BiocStyle_2.35.0
##
## loaded via a namespace (and not attached):
## [1] bit_4.5.0 jsonlite_1.8.9 compiler_4.5.0
## [4] BiocManager_1.30.25 crayon_1.5.3 Rcpp_1.0.13-1
## [7] nanoarrow_0.6.0 jquerylib_0.1.4 yaml_2.3.10
## [10] fastmap_1.2.0 lattice_0.22-6 R6_2.5.1
## [13] RcppCCTZ_0.2.12 XVector_0.47.0 tiledb_0.30.2
## [16] knitr_1.49 bookdown_0.41 bslib_0.8.0
## [19] rlang_1.1.4 cachem_1.1.0 xfun_0.49
## [22] sass_0.4.9 bit64_4.5.2 cli_3.6.3
## [25] zlibbioc_1.53.0 spdl_0.0.5 digest_0.6.37
## [28] grid_4.5.0 lifecycle_1.0.4 data.table_1.16.2
## [31] evaluate_1.0.1 nanotime_0.3.10 zoo_1.8-12
## [34] rmarkdown_2.29 tools_4.5.0 htmltools_0.5.8.1