You can install the release version of sparseMatrixStats from BioConductor:
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("sparseMatrixStats")
The sparseMatrixStats package implements a number of summary functions for sparse matrices from the Matrix package.
Let us load the package and create a synthetic sparse matrix.
library(sparseMatrixStats)
# Matrix defines the sparse Matrix class
# dgCMatrix that we will use
library(Matrix)
# For reproducibility
set.seed(1)
Create a synthetic table with customers, items, and how often they bought that item.
customer_ids <- seq_len(100)
item_ids <- seq_len(30)
n_transactions <- 1000
customer <- sample(customer_ids, size = n_transactions, replace = TRUE,
prob = runif(100))
item <- sample(item_ids, size = n_transactions, replace = TRUE,
prob = runif(30))
tmp <- table(paste0(customer, "-", item))
tmp2 <- strsplit(names(tmp), "-")
purchase_table <- data.frame(
customer = as.numeric(sapply(tmp2, function(x) x[1])),
item = as.numeric(sapply(tmp2, function(x) x[2])),
n = as.numeric(tmp)
)
head(purchase_table, n = 10)
#> customer item n
#> 1 1 15 2
#> 2 1 20 1
#> 3 1 22 1
#> 4 1 23 1
#> 5 1 30 1
#> 6 100 11 1
#> 7 100 15 3
#> 8 100 17 1
#> 9 100 19 1
#> 10 100 20 2
Let us turn the table into a matrix to simplify the analysis:
purchase_matrix <- sparseMatrix(purchase_table$customer, purchase_table$item,
x = purchase_table$n,
dims = c(100, 30),
dimnames = list(customer = paste0("Customer_", customer_ids),
item = paste0("Item_", item_ids)))
purchase_matrix[1:10, 1:15]
#> 10 x 15 sparse Matrix of class "dgCMatrix"
#> [[ suppressing 15 column names 'Item_1', 'Item_2', 'Item_3' ... ]]
#>
#> Customer_1 . . . . . . . . . . . . . . 2
#> Customer_2 . . 2 . . . . . . . . . . . 1
#> Customer_3 . . 1 . . . . . 1 . . 1 . . 1
#> Customer_4 . . . . . 1 . . 1 1 . 1 . 1 3
#> Customer_5 . . . . . . . . . . . 1 . . .
#> Customer_6 . . . 1 . 1 1 . . 1 1 . . . 1
#> Customer_7 2 . 1 . . 1 . 1 1 . 1 2 . . 1
#> Customer_8 . . . . . . . . 1 . . . . . .
#> Customer_9 . . . . . 2 . . 1 . . . . 1 .
#> Customer_10 . . . . . . . . . . . . . . .
We can see that some customers did not buy anything, where as some bought a lot.
sparseMatrixStats
can help us to identify interesting patterns in this data:
# How often was each item bough in total?
colSums2(purchase_matrix)
#> [1] 34 13 30 10 12 34 29 31 71 28 20 40 0 48 52 27 30 55 14 46 8 41 40 73 56
#> [26] 83 20 18 13 24
# What is the range of number of items each
# customer bought?
head(rowRanges(purchase_matrix))
#> [,1] [,2]
#> [1,] 0 2
#> [2,] 0 2
#> [3,] 0 3
#> [4,] 0 3
#> [5,] 0 1
#> [6,] 0 4
# What is the variance in the number of items
# each customer bought?
head(rowVars(purchase_matrix))
#> [1] 0.23448276 0.40919540 0.39195402 0.74022989 0.06436782 0.81034483
# How many items did a customer not buy at all, one time, 2 times,
# or exactly 4 times?
head(rowTabulates(purchase_matrix, values = c(0, 1, 2, 4)))
#> 0 1 2 4
#> Customer_1 25 4 1 0
#> Customer_2 25 2 3 0
#> Customer_3 25 4 0 0
#> Customer_4 19 8 1 0
#> Customer_5 28 2 0 0
#> Customer_6 20 7 2 1
In the previous section, I demonstrated how to create a sparse matrix from scratch using the sparseMatrix()
function.
However, often you already have an existing matrix and want to convert it to a sparse representation.
mat <- matrix(0, nrow=10, ncol=6)
mat[sample(seq_len(60), 4)] <- 1:4
# Convert dense matrix to sparse matrix
sparse_mat <- as(mat, "dgCMatrix")
sparse_mat
#> 10 x 6 sparse Matrix of class "dgCMatrix"
#>
#> [1,] . . . . . .
#> [2,] . . . . 4 .
#> [3,] . . . . . .
#> [4,] . . . . . .
#> [5,] . . . . . .
#> [6,] 1 . . 2 . .
#> [7,] . . 3 . . .
#> [8,] . . . . . .
#> [9,] . . . . . .
#> [10,] . . . . . .
The sparseMatrixStats package is a derivative of the matrixStats package and implements it’s API for
sparse matrices. For example, to calculate the variance for each column of mat
you can do
apply(mat, 2, var)
#> [1] 0.1 0.0 0.9 0.4 1.6 0.0
However, this is quite inefficient and matrixStats provides the direct function
matrixStats::colVars(mat)
#> [1] 0.1 0.0 0.9 0.4 1.6 0.0
Now for sparse matrices, you can also just call
sparseMatrixStats::colVars(sparse_mat)
#> [1] 0.1 0.0 0.9 0.4 1.6 0.0
If you have a large matrix with many exact zeros, working on the sparse representation can considerably speed up the computations.
I generate a dataset with 10,000 rows and 50 columns that is 99% empty
big_mat <- matrix(0, nrow=1e4, ncol=50)
big_mat[sample(seq_len(1e4 * 50), 5000)] <- rnorm(5000)
# Convert dense matrix to sparse matrix
big_sparse_mat <- as(big_mat, "dgCMatrix")
I use the bench package to benchmark the performance difference:
bench::mark(
sparseMatrixStats=sparseMatrixStats::colMedians(big_sparse_mat),
matrixStats=matrixStats::colMedians(big_mat),
apply=apply(big_mat, 2, median)
)
#> # A tibble: 3 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 sparseMatrixStats 52.16µs 57.68µs 15640. NA 5.76
#> 2 matrixStats 5.17ms 5.41ms 170. NA 0
#> 3 apply 12.34ms 12.8ms 74.8 NA 74.8
As you can see sparseMatrixStats is more than 60 times fast than matrixStats, which in turn is 7 times faster than the apply()
version.
sessionInfo()
#> R version 4.0.0 (2020-04-24)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 18.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /home/biocbuild/bbs-3.11-bioc/R/lib/libRblas.so
#> LAPACK: /home/biocbuild/bbs-3.11-bioc/R/lib/libRlapack.so
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] Matrix_1.2-18 sparseMatrixStats_1.0.5 BiocStyle_2.16.0
#>
#> loaded via a namespace (and not attached):
#> [1] Rcpp_1.0.4.6 knitr_1.28 magrittr_1.5
#> [4] lattice_0.20-41 rlang_0.4.6 fansi_0.4.1
#> [7] stringr_1.4.0 tools_4.0.0 grid_4.0.0
#> [10] xfun_0.14 utf8_1.1.4 cli_2.0.2
#> [13] htmltools_0.4.0 ellipsis_0.3.1 matrixStats_0.56.0
#> [16] assertthat_0.2.1 yaml_2.2.1 digest_0.6.25
#> [19] tibble_3.0.1 lifecycle_0.2.0 crayon_1.3.4
#> [22] bookdown_0.19 BiocManager_1.30.10 vctrs_0.3.0
#> [25] glue_1.4.1 evaluate_0.14 rmarkdown_2.1
#> [28] bench_1.1.1 stringi_1.4.6 compiler_4.0.0
#> [31] pillar_1.4.4 MatrixGenerics_1.0.2 pkgconfig_2.0.3