chihaya 1.4.0
The chihaya package saves DelayedArray objects to HDF5 files for efficient, portable and stable reproduction of delayed operations in a new R session or in other programming frameworks.
Check out the specification for more details.
Make a DelayedArray object with some delayed operations:
library(DelayedArray)
x <- DelayedArray(matrix(runif(1000), ncol=10))
x <- x[11:15,] / runif(5)
x <- log2(x + 1)
x
## <5 x 10> DelayedMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 3.2307029 1.5069998 1.0892496 . 2.9317923 3.0445530
## [2,] 0.3318089 0.3964993 1.0129336 . 0.4897470 0.6263867
## [3,] 1.7775790 3.1322702 2.7085484 . 2.9777790 2.4281258
## [4,] 0.4956991 0.9881600 0.4206113 . 0.2392414 0.8861764
## [5,] 0.7538181 0.5090957 1.3103558 . 1.3456434 1.4159059
showtree(x)
## 5x10 double: DelayedMatrix object
## └─ 5x10 double: Stack of 2 unary iso op(s)
## └─ 5x10 double: Unary iso op with args
## └─ 5x10 double: Subset
## └─ 100x10 double: [seed] matrix object
Save it into an HDF5 file with saveDelayed():
library(chihaya)
tmp <- tempfile(fileext=".h5")
saveDelayed(x, tmp)
rhdf5::h5ls(tmp)
## group name otype dclass dim
## 0 / delayed H5I_GROUP
## 1 /delayed base H5I_DATASET FLOAT ( 0 )
## 2 /delayed method H5I_DATASET STRING ( 0 )
## 3 /delayed seed H5I_GROUP
## 4 /delayed/seed method H5I_DATASET STRING ( 0 )
## 5 /delayed/seed seed H5I_GROUP
## 6 /delayed/seed/seed along H5I_DATASET INTEGER ( 0 )
## 7 /delayed/seed/seed method H5I_DATASET STRING ( 0 )
## 8 /delayed/seed/seed seed H5I_GROUP
## 9 /delayed/seed/seed/seed index H5I_GROUP
## 10 /delayed/seed/seed/seed/index 0 H5I_DATASET INTEGER 5
## 11 /delayed/seed/seed/seed seed H5I_GROUP
## 12 /delayed/seed/seed/seed/seed data H5I_DATASET FLOAT 100 x 10
## 13 /delayed/seed/seed/seed/seed native H5I_DATASET INTEGER ( 0 )
## 14 /delayed/seed/seed side H5I_DATASET STRING ( 0 )
## 15 /delayed/seed/seed value H5I_DATASET FLOAT 5
## 16 /delayed/seed side H5I_DATASET STRING ( 0 )
## 17 /delayed/seed value H5I_DATASET FLOAT ( 0 )
And then load it back in later:
y <- loadDelayed(tmp)
y
## <5 x 10> DelayedMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 3.2307029 1.5069998 1.0892496 . 2.9317923 3.0445530
## [2,] 0.3318089 0.3964993 1.0129336 . 0.4897470 0.6263867
## [3,] 1.7775790 3.1322702 2.7085484 . 2.9777790 2.4281258
## [4,] 0.4956991 0.9881600 0.4206113 . 0.2392414 0.8861764
## [5,] 0.7538181 0.5090957 1.3103558 . 1.3456434 1.4159059
Of course, this is not a particularly interesting case as we end up saving the original array inside our HDF5 file anyway. The real fun begins when you have some more interesting seeds.
We can use the delayed nature of the operations to avoid breaking sparsity. For example:
library(Matrix)
x <- rsparsematrix(1000, 1000, density=0.01)
x <- DelayedArray(x) + runif(1000)
tmp <- tempfile(fileext=".h5")
saveDelayed(x, tmp)
rhdf5::h5ls(tmp)
## group name otype dclass dim
## 0 / delayed H5I_GROUP
## 1 /delayed along H5I_DATASET INTEGER ( 0 )
## 2 /delayed method H5I_DATASET STRING ( 0 )
## 3 /delayed seed H5I_GROUP
## 4 /delayed/seed data H5I_DATASET FLOAT 10000
## 5 /delayed/seed dimnames H5I_GROUP
## 6 /delayed/seed indices H5I_DATASET INTEGER 10000
## 7 /delayed/seed indptr H5I_DATASET INTEGER 1001
## 8 /delayed/seed shape H5I_DATASET INTEGER 2
## 9 /delayed side H5I_DATASET STRING ( 0 )
## 10 /delayed value H5I_DATASET FLOAT 1000
file.info(tmp)[["size"]]
## [1] 101676
# Compared to a dense array.
tmp2 <- tempfile(fileext=".h5")
out <- HDF5Array::writeHDF5Array(x, tmp2, "data")
file.info(tmp2)[["size"]]
## [1] 279722
# Loading it back in.
y <- loadDelayed(tmp)
showtree(y)
## 1000x1000 double: DelayedMatrix object
## └─ 1000x1000 double: Unary iso op with args
## └─ 1000x1000 double, sparse: [seed] dgCMatrix object
We can also store references to external files, thus avoiding data duplication:
library(HDF5Array)
test <- HDF5Array(tmp2, "data")
stuff <- log2(test + 1)
stuff
## <1000 x 1000> DelayedMatrix object of type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0.10119835 0.10119835 0.10119835 . 0.10119835 0.10119835
## [2,] 0.09285575 1.04717480 0.09285575 . 0.09285575 0.09285575
## [3,] 0.33333475 0.33333475 0.33333475 . 0.33333475 0.33333475
## [4,] 0.03761130 0.03761130 0.03761130 . 0.03761130 0.03761130
## [5,] 0.07943211 0.07943211 0.07943211 . 0.07943211 0.07943211
## ... . . . . . .
## [996,] 0.4840033 0.4840033 0.4840033 . 0.4840033 0.4840033
## [997,] 0.7988256 0.7988256 0.7988256 . 0.7988256 0.7988256
## [998,] 0.9872841 0.9872841 0.9872841 . 0.9872841 0.9872841
## [999,] 0.9816573 0.9816573 0.9816573 . 0.9816573 0.9816573
## [1000,] 0.1731113 0.1731113 0.1731113 . 0.1731113 0.1731113
tmp <- tempfile(fileext=".h5")
saveDelayed(stuff, tmp)
rhdf5::h5ls(tmp)
## group name otype dclass dim
## 0 / delayed H5I_GROUP
## 1 /delayed base H5I_DATASET FLOAT ( 0 )
## 2 /delayed method H5I_DATASET STRING ( 0 )
## 3 /delayed seed H5I_GROUP
## 4 /delayed/seed method H5I_DATASET STRING ( 0 )
## 5 /delayed/seed seed H5I_GROUP
## 6 /delayed/seed/seed dimensions H5I_DATASET INTEGER 2
## 7 /delayed/seed/seed file H5I_DATASET STRING ( 0 )
## 8 /delayed/seed/seed name H5I_DATASET STRING ( 0 )
## 9 /delayed/seed/seed sparse H5I_DATASET INTEGER ( 0 )
## 10 /delayed/seed/seed type H5I_DATASET STRING ( 0 )
## 11 /delayed/seed side H5I_DATASET STRING ( 0 )
## 12 /delayed/seed value H5I_DATASET FLOAT ( 0 )
file.info(tmp)[["size"]] # size of the delayed operations + pointer to the actual file
## [1] 49642
y <- loadDelayed(tmp)
y
## <1000 x 1000> DelayedMatrix object of type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0.10119835 0.10119835 0.10119835 . 0.10119835 0.10119835
## [2,] 0.09285575 1.04717480 0.09285575 . 0.09285575 0.09285575
## [3,] 0.33333475 0.33333475 0.33333475 . 0.33333475 0.33333475
## [4,] 0.03761130 0.03761130 0.03761130 . 0.03761130 0.03761130
## [5,] 0.07943211 0.07943211 0.07943211 . 0.07943211 0.07943211
## ... . . . . . .
## [996,] 0.4840033 0.4840033 0.4840033 . 0.4840033 0.4840033
## [997,] 0.7988256 0.7988256 0.7988256 . 0.7988256 0.7988256
## [998,] 0.9872841 0.9872841 0.9872841 . 0.9872841 0.9872841
## [999,] 0.9816573 0.9816573 0.9816573 . 0.9816573 0.9816573
## [1000,] 0.1731113 0.1731113 0.1731113 . 0.1731113 0.1731113
sessionInfo()
## R version 4.4.0 beta (2024-04-15 r86425)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.19-bioc/R/lib/libRblas.so
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/New_York
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] HDF5Array_1.32.0 rhdf5_2.48.0 chihaya_1.4.0
## [4] DelayedArray_0.30.0 SparseArray_1.4.0 S4Arrays_1.4.0
## [7] abind_1.4-5 IRanges_2.38.0 S4Vectors_0.42.0
## [10] MatrixGenerics_1.16.0 matrixStats_1.3.0 BiocGenerics_0.50.0
## [13] Matrix_1.7-0 BiocStyle_2.32.0
##
## loaded via a namespace (and not attached):
## [1] crayon_1.5.2 cli_3.6.2 knitr_1.46
## [4] rlang_1.1.3 xfun_0.43 jsonlite_1.8.8
## [7] htmltools_0.5.8.1 sass_0.4.9 rmarkdown_2.26
## [10] grid_4.4.0 evaluate_0.23 jquerylib_0.1.4
## [13] fastmap_1.1.1 Rhdf5lib_1.26.0 yaml_2.3.8
## [16] lifecycle_1.0.4 bookdown_0.39 BiocManager_1.30.22
## [19] compiler_4.4.0 Rcpp_1.0.12 rhdf5filters_1.16.0
## [22] XVector_0.44.0 lattice_0.22-6 digest_0.6.35
## [25] R6_2.5.1 bslib_0.7.0 tools_4.4.0
## [28] zlibbioc_1.50.0 cachem_1.0.8