## ----eval=FALSE------------------------------------------------------------ # BiocManager::install('ccfindR') ## -------------------------------------------------------------------------- library(ccfindR) ## ---- echo=T--------------------------------------------------------------- # A toy matrix for count data set.seed(1) mat <- matrix(rpois(n = 80, lambda = 2), nrow = 4, ncol = 20) ABC <- LETTERS[1:4] abc <- letters[1:20] rownames(mat) <- ABC colnames(mat) <- abc ## ---- echo=T--------------------------------------------------------------- # create scNMFSet object sc <- scNMFSet(count = mat) ## ---- echo=T--------------------------------------------------------------- # create scNMFSet object sc <- scNMFSet(assays = list(counts = mat)) ## -------------------------------------------------------------------------- # set row and column names suppressMessages(library(S4Vectors)) genes <- DataFrame(ABC) rownames(genes) <- ABC cells <- DataFrame(abc) rownames(cells) <- abc sc <- scNMFSet(count = mat, rowData = genes, colData = cells) sc ## -------------------------------------------------------------------------- # read sparse matrix dir <- system.file('extdata', package = 'ccfindR') mat <- Matrix::readMM(paste0(dir,'/matrix.mtx')) rownames(mat) <- 1:nrow(mat) colnames(mat) <- 1:ncol(mat) sc <- scNMFSet(count = mat, rowData = DataFrame(1:nrow(mat)), colData = DataFrame(1:ncol(mat))) sc ## -------------------------------------------------------------------------- # read 10x files sc <- read_10x(dir = dir, count = 'matrix.mtx', genes = 'genes.tsv', barcodes = 'barcodes.tsv') sc ## -------------------------------------------------------------------------- # slots and subsetting counts(sc)[1:7,1:3] head(rowData(sc)) head(colData(sc)) sc2 <- sc[1:20,1:70] # subsetting of object sc2 <- remove_zeros(sc2) # remove empty rows/columns sc2 ## ----cells, cache=T, fig.small=TRUE,fig.cap="Quality control filtering of cells. Histogram of UMI counts is shown. Cells can be selected (red) by setting lower and upper thresholds of the UMI count."---- sc <- filter_cells(sc, umi.min = 10^2.6, umi.max = 10^3.4) ## ----genes, cache=T, fig.small=TRUE, fig.cap="Selection of genes for clustering. The scatter plot shows distributions of expression variance to mean ratio (VMR) and the number of cells expressed. Minimum VMR and a range of cell number can be set to select genes (red). Symbols in orange are marker genes provided as input, selected irrespective of expression variance."---- markers <- c('CD4','CD8A','CD8B','CD19','CD3G','CD3D', 'CD3Z','CD14') sc0 <- filter_genes(sc, markers = markers, vmr.min = 1.5, min.cells.expressed = 50, rescue.genes = FALSE) ## ----rescue, cache=T, echo=T, fig.small=TRUE, fig.cap='Additional selection of genes with modes at nonzero counts. Symbols in blue represent genes rescued.'---- sc_rescue <- filter_genes(sc, markers = markers, vmr.min = 1.5, min.cells.expressed = 50, rescue.genes = TRUE, progress.bar = FALSE) ## ---- cache=T-------------------------------------------------------------- rownames(sc_rescue) <- rowData(sc_rescue)[,2] sc <- sc_rescue ## ---- cache=T-------------------------------------------------------------- set.seed(1) sc <- factorize(sc, ranks = 3, nrun = 1, ncnn.step = 1, criterion='connectivity', verbose = 3) ## ---- cache=T-------------------------------------------------------------- sc <- factorize(sc, ranks = 3, nrun = 5, verbose = 2) ## ---- cache=T-------------------------------------------------------------- sc <- factorize(sc, ranks = seq(3,7), nrun = 5, verbose = 1, progress.bar = FALSE) ## -------------------------------------------------------------------------- measure(sc) ## ----measure, fig.large=TRUE, fig.cap='Factorization quality measures as functions of the rank. Dispersion measures the degree of bimodality in consistency matrix. Cophenetic correlation measures the degree of agreement between consistency matrix and hierarchical clustering.'---- plot(sc) ## ---- cache=T-------------------------------------------------------------- sb <- sc_rescue set.seed(2) sb <- vb_factorize(sb, ranks =3, verbose = 3, Tol = 2e-4, hyper.update.n0 = 5) ## ---- cache=T-------------------------------------------------------------- sb <- vb_factorize(sb, ranks = seq(2,7), nrun = 5, verbose = 1, Tol = 1e-4, progress.bar = FALSE) ## -------------------------------------------------------------------------- measure(sb) ## ----lml, fig.small=TRUE, fig.cap='Dependence of log ML with rank.'-------- plot(sb) ## -------------------------------------------------------------------------- ranks(sb) head(basis(sb)[ranks(sb)==5][[1]]) # basis matrix W for rank 5 ## ----sb, cache=T, fig.small=TRUE, fig.cap='Heatmap of basis matrix elements. Marker genes selected in rows, other than those provided as input, are based on the degree to which each features strongly in a particular cluster only and not in the rest. Columns represent the clusters.'---- feature_map(sb, markers = markers, rank = 5, max.per.cluster = 4, gene.name = rowData(sb)[,2], cexRow = 0.7) ## -------------------------------------------------------------------------- cell_type <- c('B_cell','CD8+_T','CD4+_T','Monocytes','NK') colnames(basis(sb)[ranks(sb) == 5][[1]]) <- cell_type rownames(coeff(sb)[ranks(sb) == 5][[1]]) <- cell_type ## ----sc, fig.small=TRUE, fig.cap = 'Heatmap of cluster coefficient matrix elements. Rows indicate clusters and columns the cells.'---- cell_map(sb, rank = 5) ## ----tsne, fig.cap='tSNE-based visualization of clustering. Coefficient matrix elements of cells were used as input with colors indicating predicted cluster assignment. The bar plot shows the cell counts of each cluster.'---- visualize_clusters(sb, rank = 5, cex = 0.7) ## ----tree, fig.small=TRUE, fig.cap = "Hierarchical tree of clusters derived from varying ranks. The rank increases from 2 to 5 horizontally and nodes are labeled by cluster IDs which bifurcated in each rank."---- tree <- build_tree(sb, rmax = 5) tree <- rename_tips(tree, rank = 5, tip.labels = cell_type) plot_tree(tree, cex = 0.8, show.node.label = TRUE)