TileDBArray 1.16.0
TileDB implements a framework for local and remote storage of dense and sparse arrays.
We can use this as a DelayedArray backend to provide an array-level abstraction,
thus allowing the data to be used in many places where an ordinary array or matrix might be used.
The TileDBArray package implements the necessary wrappers around TileDB-R
to support read/write operations on TileDB arrays within the DelayedArray framework.
TileDBArray
Creating a TileDBArray is as easy as:
X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.56850898 -1.36375814 -0.25715020 . 0.7601518 -0.3793802
## [2,] 1.04344255 -0.99069495 -1.33787147 . 0.2530575 1.5741640
## [3,] 0.59359346 -0.57249807 0.47100810 . 0.5245230 0.3919767
## [4,] 0.08711209 0.92657275 -0.92017220 . 1.1735586 -1.0954105
## [5,] -2.00863374 -0.55438818 -0.69237694 . 0.1940079 -1.4901530
## ... . . . . . .
## [96,] -0.3418704 -0.2809559 1.3395935 . -1.01105021 0.42355672
## [97,] -0.4221230 0.5700102 0.5566377 . -0.28866204 -1.21959121
## [98,] -1.2597082 -2.2813921 0.3555055 . 0.41708523 -0.92633485
## [99,] -0.5246781 0.9631862 1.3832180 . -0.02107582 0.22091190
## [100,] 0.5425727 -1.5705088 -1.7606574 . -0.98260776 0.14642005
Alternatively, we can use coercion methods:
as(X, "TileDBArray")
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.56850898 -1.36375814 -0.25715020 . 0.7601518 -0.3793802
## [2,] 1.04344255 -0.99069495 -1.33787147 . 0.2530575 1.5741640
## [3,] 0.59359346 -0.57249807 0.47100810 . 0.5245230 0.3919767
## [4,] 0.08711209 0.92657275 -0.92017220 . 1.1735586 -1.0954105
## [5,] -2.00863374 -0.55438818 -0.69237694 . 0.1940079 -1.4901530
## ... . . . . . .
## [96,] -0.3418704 -0.2809559 1.3395935 . -1.01105021 0.42355672
## [97,] -0.4221230 0.5700102 0.5566377 . -0.28866204 -1.21959121
## [98,] -1.2597082 -2.2813921 0.3555055 . 0.41708523 -0.92633485
## [99,] -0.5246781 0.9631862 1.3832180 . -0.02107582 0.22091190
## [100,] 0.5425727 -1.5705088 -1.7606574 . -0.98260776 0.14642005
This process also works for sparse matrices:
Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0 0 0 . 0 0
## [2,] 0 0 0 . 0 0
## [3,] 0 0 0 . 0 0
## [4,] 0 0 0 . 0 0
## [5,] 0 0 0 . 0 0
## ... . . . . . .
## [996,] 0 0 0 . 0 0
## [997,] 0 0 0 . 0 0
## [998,] 0 0 0 . 0 0
## [999,] 0 0 0 . 0 0
## [1000,] 0 0 0 . 0 0
Logical and integer matrices are supported:
writeTileDBArray(Y > 0)
## <1000 x 1000> sparse TileDBMatrix object of type "logical":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] FALSE FALSE FALSE . FALSE FALSE
## [2,] FALSE FALSE FALSE . FALSE FALSE
## [3,] FALSE FALSE FALSE . FALSE FALSE
## [4,] FALSE FALSE FALSE . FALSE FALSE
## [5,] FALSE FALSE FALSE . FALSE FALSE
## ... . . . . . .
## [996,] FALSE FALSE FALSE . FALSE FALSE
## [997,] FALSE FALSE FALSE . FALSE FALSE
## [998,] FALSE FALSE FALSE . FALSE FALSE
## [999,] FALSE FALSE FALSE . FALSE FALSE
## [1000,] FALSE FALSE FALSE . FALSE FALSE
As are matrices with dimension names:
rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 0.56850898 -1.36375814 -0.25715020 . 0.7601518 -0.3793802
## GENE_2 1.04344255 -0.99069495 -1.33787147 . 0.2530575 1.5741640
## GENE_3 0.59359346 -0.57249807 0.47100810 . 0.5245230 0.3919767
## GENE_4 0.08711209 0.92657275 -0.92017220 . 1.1735586 -1.0954105
## GENE_5 -2.00863374 -0.55438818 -0.69237694 . 0.1940079 -1.4901530
## ... . . . . . .
## GENE_96 -0.3418704 -0.2809559 1.3395935 . -1.01105021 0.42355672
## GENE_97 -0.4221230 0.5700102 0.5566377 . -0.28866204 -1.21959121
## GENE_98 -1.2597082 -2.2813921 0.3555055 . 0.41708523 -0.92633485
## GENE_99 -0.5246781 0.9631862 1.3832180 . -0.02107582 0.22091190
## GENE_100 0.5425727 -1.5705088 -1.7606574 . -0.98260776 0.14642005
TileDBArrays
TileDBArrays are simply DelayedArray objects and can be manipulated as such.
The usual conventions for extracting data from matrix-like objects work as expected:
out <- as(X, "TileDBArray")
dim(out)
## [1] 100 10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
## GENE_1 GENE_2 GENE_3 GENE_4 GENE_5 GENE_6
## 0.56850898 1.04344255 0.59359346 0.08711209 -2.00863374 2.29370808
We can also perform manipulations like subsetting and arithmetic.
Note that these operations do not affect the data in the TileDB backend;
rather, they are delayed until the values are explicitly required,
hence the creation of the DelayedMatrix object.
out[1:5,1:5]
## <5 x 5> DelayedMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5
## GENE_1 0.56850898 -1.36375814 -0.25715020 -0.99955710 -1.66815803
## GENE_2 1.04344255 -0.99069495 -1.33787147 1.23793205 -0.06103405
## GENE_3 0.59359346 -0.57249807 0.47100810 -0.61025185 -0.32165283
## GENE_4 0.08711209 0.92657275 -0.92017220 -0.73048316 1.64186874
## GENE_5 -2.00863374 -0.55438818 -0.69237694 0.03514041 -0.00171157
out * 2
## <100 x 10> DelayedMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 1.1370180 -2.7275163 -0.5143004 . 1.5203035 -0.7587604
## GENE_2 2.0868851 -1.9813899 -2.6757429 . 0.5061151 3.1483280
## GENE_3 1.1871869 -1.1449961 0.9420162 . 1.0490459 0.7839534
## GENE_4 0.1742242 1.8531455 -1.8403444 . 2.3471173 -2.1908210
## GENE_5 -4.0172675 -1.1087764 -1.3847539 . 0.3880158 -2.9803061
## ... . . . . . .
## GENE_96 -0.6837408 -0.5619117 2.6791870 . -2.02210042 0.84711343
## GENE_97 -0.8442460 1.1400204 1.1132753 . -0.57732409 -2.43918243
## GENE_98 -2.5194164 -4.5627842 0.7110110 . 0.83417046 -1.85266970
## GENE_99 -1.0493563 1.9263724 2.7664360 . -0.04215163 0.44182380
## GENE_100 1.0851453 -3.1410175 -3.5213148 . -1.96521553 0.29284009
We can also do more complex matrix operations that are supported by DelayedArray:
colSums(out)
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5 SAMP_6
## -7.7446107 0.5568634 10.3145187 4.1217396 -27.4024139 -2.6052130
## SAMP_7 SAMP_8 SAMP_9 SAMP_10
## -3.4174072 -2.6926929 2.8728173 9.4606662
out %*% runif(ncol(out))
## [,1]
## GENE_1 -1.553990043
## GENE_2 1.012335629
## GENE_3 -0.684500084
## GENE_4 2.088900630
## GENE_5 -3.504973651
## GENE_6 2.411668979
## GENE_7 0.237375516
## GENE_8 -4.065014362
## GENE_9 0.279096547
## GENE_10 1.397958479
## GENE_11 0.461795663
## GENE_12 2.008091130
## GENE_13 0.636828519
## GENE_14 -0.268182253
## GENE_15 -2.344710189
## GENE_16 3.494933371
## GENE_17 -1.050426267
## GENE_18 0.032311266
## GENE_19 0.936707507
## GENE_20 -1.643506716
## GENE_21 -0.807067649
## GENE_22 -1.739425684
## GENE_23 -1.007675717
## GENE_24 2.601765658
## GENE_25 -2.523249115
## GENE_26 -1.658982070
## GENE_27 -1.594576447
## GENE_28 0.668495232
## GENE_29 3.419205248
## GENE_30 -0.174167125
## GENE_31 -2.253925971
## GENE_32 1.286948708
## GENE_33 1.859712348
## GENE_34 -0.727357259
## GENE_35 2.101412337
## GENE_36 -2.057241220
## GENE_37 2.395097846
## GENE_38 -1.076688545
## GENE_39 0.467139735
## GENE_40 4.792492867
## GENE_41 -1.423005651
## GENE_42 0.288841094
## GENE_43 -0.682523214
## GENE_44 -0.376898019
## GENE_45 -2.616864492
## GENE_46 0.570182116
## GENE_47 -2.015054813
## GENE_48 0.062258836
## GENE_49 2.298415943
## GENE_50 2.040252422
## GENE_51 -4.940049644
## GENE_52 -1.493473810
## GENE_53 0.418837876
## GENE_54 -0.622576644
## GENE_55 -0.849166066
## GENE_56 0.145856434
## GENE_57 -0.176952944
## GENE_58 -3.509119915
## GENE_59 -0.706742848
## GENE_60 -0.084524454
## GENE_61 -2.601622235
## GENE_62 2.275805480
## GENE_63 0.746696745
## GENE_64 0.411261782
## GENE_65 3.671059824
## GENE_66 -1.768878742
## GENE_67 0.851997490
## GENE_68 0.206648148
## GENE_69 1.982931788
## GENE_70 -3.966721789
## GENE_71 1.917842107
## GENE_72 -1.819314675
## GENE_73 -1.721455856
## GENE_74 -0.782419755
## GENE_75 -1.966598098
## GENE_76 -3.913627302
## GENE_77 -2.962173163
## GENE_78 3.869578273
## GENE_79 0.012075252
## GENE_80 -0.452928218
## GENE_81 -0.025227888
## GENE_82 -1.884963459
## GENE_83 4.664388668
## GENE_84 -3.338508866
## GENE_85 0.969159328
## GENE_86 -2.260986937
## GENE_87 0.524372001
## GENE_88 -0.114263741
## GENE_89 4.646698252
## GENE_90 3.683147456
## GENE_91 0.006177958
## GENE_92 3.104297686
## GENE_93 -1.537205043
## GENE_94 -0.303645933
## GENE_95 -1.454162470
## GENE_96 0.587649678
## GENE_97 -0.798446574
## GENE_98 -4.981551947
## GENE_99 0.027584532
## GENE_100 -3.843331125
We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray().
For example, the code below allows us to control the path to the backend
as well as the name of the attribute containing the data.
X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.046325091 0.003463769 -2.424105948 . 0.54277653 -0.90928189
## [2,] -0.782092923 -0.035790854 1.055893489 . -3.63359540 -1.66973220
## [3,] 2.401938649 -0.786367810 0.515589800 . 0.71396649 0.52476644
## [4,] 0.554562017 -0.667399897 1.700398828 . -0.86435435 -0.09375093
## [5,] 0.718196354 0.403625957 0.713750526 . 0.77644644 -0.32857469
## ... . . . . . .
## [96,] 0.260293279 -0.778593014 -0.526799780 . 0.74105079 -2.24677933
## [97,] 1.520210929 0.191664927 -1.863342081 . -0.92476323 0.60382540
## [98,] -0.236721876 0.008566115 1.550973083 . -0.18074428 -1.36585494
## [99,] -1.070804386 2.326006222 0.240010195 . 1.06744136 0.23552237
## [100,] -0.920046328 0.103362132 0.607274730 . 0.05513427 -2.17684009
As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.
path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.046325091 0.003463769 -2.424105948 . 0.54277653 -0.90928189
## [2,] -0.782092923 -0.035790854 1.055893489 . -3.63359540 -1.66973220
## [3,] 2.401938649 -0.786367810 0.515589800 . 0.71396649 0.52476644
## [4,] 0.554562017 -0.667399897 1.700398828 . -0.86435435 -0.09375093
## [5,] 0.718196354 0.403625957 0.713750526 . 0.77644644 -0.32857469
## ... . . . . . .
## [96,] 0.260293279 -0.778593014 -0.526799780 . 0.74105079 -2.24677933
## [97,] 1.520210929 0.191664927 -1.863342081 . -0.92476323 0.60382540
## [98,] -0.236721876 0.008566115 1.550973083 . -0.18074428 -1.36585494
## [99,] -1.070804386 2.326006222 0.240010195 . 1.06744136 0.23552237
## [100,] -0.920046328 0.103362132 0.607274730 . 0.05513427 -2.17684009
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-apple-darwin20
## Running under: macOS Monterey 12.7.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] RcppSpdlog_0.0.18 TileDBArray_1.16.0 DelayedArray_0.32.0
## [4] SparseArray_1.6.0 S4Arrays_1.6.0 IRanges_2.40.0
## [7] abind_1.4-8 S4Vectors_0.44.0 MatrixGenerics_1.18.0
## [10] matrixStats_1.4.1 BiocGenerics_0.52.0 Matrix_1.7-1
## [13] BiocStyle_2.34.0
##
## loaded via a namespace (and not attached):
## [1] bit_4.5.0 jsonlite_1.8.9 compiler_4.4.1
## [4] BiocManager_1.30.25 crayon_1.5.3 Rcpp_1.0.13
## [7] nanoarrow_0.6.0 jquerylib_0.1.4 yaml_2.3.10
## [10] fastmap_1.2.0 lattice_0.22-6 R6_2.5.1
## [13] RcppCCTZ_0.2.12 XVector_0.46.0 tiledb_0.30.2
## [16] knitr_1.48 bookdown_0.41 bslib_0.8.0
## [19] rlang_1.1.4 cachem_1.1.0 xfun_0.48
## [22] sass_0.4.9 bit64_4.5.2 cli_3.6.3
## [25] zlibbioc_1.52.0 spdl_0.0.5 digest_0.6.37
## [28] grid_4.4.1 lifecycle_1.0.4 data.table_1.16.2
## [31] evaluate_1.0.1 nanotime_0.3.10 zoo_1.8-12
## [34] rmarkdown_2.28 tools_4.4.1 htmltools_0.5.8.1