HDCytoData 1.20.0
This vignette demonstrates several examples and use cases for the datasets in the HDCytoData
package.
Using the clustering datasets, we can generate dimension reduction plots with colors indicating the ground truth cell population labels. This provides a visual representation of the cell population structure in these datasets, which is useful during exploratory data analysis and for representing the output of clustering or other downstream analysis algorithms.
Below, we compare three different dimension reduction algorithms (principal component analysis [PCA], t-distributed stochastic neighbor embedding [tSNE], and uniform manifold approximation and projection [UMAP]), for one of the datasets (Levine_32dim
). This dataset contains ground truth cell population labels for 14 immune cell populations.
suppressPackageStartupMessages(library(HDCytoData))
suppressPackageStartupMessages(library(SummarizedExperiment))
suppressPackageStartupMessages(library(Rtsne))
suppressPackageStartupMessages(library(umap))
suppressPackageStartupMessages(library(ggplot2))
# ---------
# Load data
# ---------
d_SE <- Levine_32dim_SE()
## see ?HDCytoData and browseVignettes('HDCytoData') for documentation
## loading from cache
# -------------
# Preprocessing
# -------------
# select 'cell type' marker columns for defining clusters
d_sub <- assay(d_SE[, colData(d_SE)$marker_class == "type"])
# extract cell population labels
population <- rowData(d_SE)$population_id
dim(d_sub)
## [1] 265627 32
stopifnot(nrow(d_sub) == length(population))
# transform data using asinh with cofactor 5
cofactor <- 5
d_sub <- asinh(d_sub / cofactor)
summary(d_sub)
## CD45RA CD133 CD19 CD22
## Min. :-0.05731 Min. :-0.05808 Min. :-0.05809 Min. :-0.05734
## 1st Qu.: 0.20463 1st Qu.:-0.02294 1st Qu.:-0.01884 1st Qu.:-0.02069
## Median : 0.54939 Median : 0.02535 Median : 0.07521 Median : 0.05879
## Mean : 0.68813 Mean : 0.14596 Mean : 0.50930 Mean : 0.39732
## 3rd Qu.: 1.03120 3rd Qu.: 0.22430 3rd Qu.: 0.54839 3rd Qu.: 0.38648
## Max. : 6.69120 Max. : 5.52750 Max. : 4.99008 Max. : 5.16048
## CD11b CD4 CD8 CD34
## Min. :-0.058236 Min. :-0.05775 Min. :-0.05800 Min. :-0.05801
## 1st Qu.:-0.000294 1st Qu.:-0.01259 1st Qu.:-0.01732 1st Qu.:-0.01117
## Median : 0.257923 Median : 0.13122 Median : 0.07363 Median : 0.11071
## Mean : 0.710319 Mean : 0.36760 Mean : 0.56522 Mean : 0.33989
## 3rd Qu.: 0.923517 3rd Qu.: 0.57812 3rd Qu.: 0.48642 3rd Qu.: 0.39281
## Max. : 5.260789 Max. : 6.58176 Max. : 4.69369 Max. : 5.14800
## Flt3 CD20 CXCR4 CD235ab
## Min. :-0.057884 Min. :-0.05813 Min. :-0.05704 Min. :-0.05761
## 1st Qu.:-0.007793 1st Qu.:-0.02207 1st Qu.: 0.25290 1st Qu.: 0.23100
## Median : 0.110317 Median : 0.03382 Median : 0.66539 Median : 0.54043
## Mean : 0.229768 Mean : 0.38441 Mean : 0.79247 Mean : 0.63189
## 3rd Qu.: 0.336117 3rd Qu.: 0.32551 3rd Qu.: 1.20168 3rd Qu.: 0.92358
## Max. : 7.117323 Max. : 6.05141 Max. : 5.69667 Max. : 6.64670
## CD45 CD123 CD321 CD14
## Min. :2.040 Min. :-0.05800 Min. :-0.05355 Min. :-0.057954
## 1st Qu.:5.116 1st Qu.:-0.01162 1st Qu.: 1.32346 1st Qu.:-0.026326
## Median :5.645 Median : 0.09602 Median : 1.90479 Median :-0.005379
## Mean :5.408 Mean : 0.37241 Mean : 1.93542 Mean : 0.077030
## 3rd Qu.:5.939 3rd Qu.: 0.41310 3rd Qu.: 2.51781 3rd Qu.: 0.089789
## Max. :7.238 Max. : 6.64063 Max. : 6.86739 Max. : 5.006121
## CD33 CD47 CD11c CD7
## Min. :-0.05808 Min. :-0.05509 Min. :-0.058053 Min. :-0.05816
## 1st Qu.:-0.01813 1st Qu.: 2.08788 1st Qu.:-0.002711 1st Qu.:-0.01567
## Median : 0.06107 Median : 2.71442 Median : 0.212063 Median : 0.13002
## Mean : 0.30792 Mean : 2.65608 Mean : 0.703504 Mean : 0.81384
## 3rd Qu.: 0.34147 3rd Qu.: 3.27654 3rd Qu.: 0.861448 3rd Qu.: 1.37083
## Max. : 5.61247 Max. : 6.40249 Max. : 6.520939 Max. : 6.31922
## CD15 CD16 CD44 CD38
## Min. :-0.05808 Min. :-0.05778 Min. :0.02606 Min. :-0.05719
## 1st Qu.:-0.01502 1st Qu.:-0.02255 1st Qu.:3.12712 1st Qu.: 0.40198
## Median : 0.09355 Median : 0.01424 Median :3.87967 Median : 1.02032
## Mean : 0.23136 Mean : 0.16123 Mean :3.76018 Mean : 1.47781
## 3rd Qu.: 0.38331 3rd Qu.: 0.16077 3rd Qu.:4.47392 3rd Qu.: 2.19146
## Max. : 1.53415 Max. : 5.33831 Max. :7.40456 Max. : 7.29308
## CD13 CD3 CD61 CD117
## Min. :-0.05773 Min. :-0.05824 Min. :-0.05764 Min. :-0.05767
## 1st Qu.: 0.02110 1st Qu.: 0.08495 1st Qu.:-0.01285 1st Qu.:-0.02396
## Median : 0.18706 Median : 0.60376 Median : 0.09569 Median :-0.00041
## Mean : 0.36856 Mean : 2.16576 Mean : 0.34446 Mean : 0.13120
## 3rd Qu.: 0.53550 3rd Qu.: 4.66522 3rd Qu.: 0.41579 3rd Qu.: 0.15474
## Max. : 6.98119 Max. : 6.74836 Max. : 7.74850 Max. : 5.50213
## CD49d HLA-DR CD64 CD41
## Min. :-0.05806 Min. :-0.05797 Min. :-0.05820 Min. :-0.05824
## 1st Qu.: 0.28301 1st Qu.: 0.05771 1st Qu.:-0.01058 1st Qu.:-0.02017
## Median : 0.67721 Median : 0.61133 Median : 0.12249 Median : 0.05223
## Mean : 0.79494 Mean : 1.52181 Mean : 0.55151 Mean : 0.26175
## 3rd Qu.: 1.19079 3rd Qu.: 2.88824 3rd Qu.: 0.60413 3rd Qu.: 0.30559
## Max. : 5.15344 Max. : 7.05251 Max. : 4.51784 Max. : 7.71829
# subsample cells for faster runtimes in vignette
n <- 2000
set.seed(123)
ix <- sample(seq_len(nrow(d_sub)), n)
d_sub <- d_sub[ix, ]
population <- population[ix]
dim(d_sub)
## [1] 2000 32
stopifnot(nrow(d_sub) == length(population))
# remove any near-duplicate rows (required by Rtsne)
dups <- duplicated(d_sub)
d_sub <- d_sub[!dups, ]
population <- population[!dups]
dim(d_sub)
## [1] 1998 32
stopifnot(nrow(d_sub) == length(population))
# ------------------------
# Dimension reduction: PCA
# ------------------------
n_dims <- 2
# run PCA
# (note: no scaling, since asinh-transformed dimensions are already comparable)
out_PCA <- prcomp(d_sub, center = TRUE, scale. = FALSE)
dims_PCA <- out_PCA$x[, seq_len(n_dims)]
colnames(dims_PCA) <- c("PC_1", "PC_2")
head(dims_PCA)
## PC_1 PC_2
## [1,] 1.450702 3.1573053
## [2,] 2.453109 -0.9381139
## [3,] -2.705226 0.7090551
## [4,] 2.718284 -2.2801305
## [5,] -2.714230 -0.1954170
## [6,] -3.003650 -0.1938087
stopifnot(nrow(dims_PCA) == length(population))
colnames(dims_PCA) <- c("dimension_x", "dimension_y")
dims_PCA <- cbind(as.data.frame(dims_PCA), population, type = "PCA")
head(dims_PCA)
## dimension_x dimension_y population type
## 1 1.450702 3.1573053 unassigned PCA
## 2 2.453109 -0.9381139 unassigned PCA
## 3 -2.705226 0.7090551 unassigned PCA
## 4 2.718284 -2.2801305 unassigned PCA
## 5 -2.714230 -0.1954170 unassigned PCA
## 6 -3.003650 -0.1938087 unassigned PCA
str(dims_PCA)
## 'data.frame': 1998 obs. of 4 variables:
## $ dimension_x: num 1.45 2.45 -2.71 2.72 -2.71 ...
## $ dimension_y: num 3.157 -0.938 0.709 -2.28 -0.195 ...
## $ population : Factor w/ 15 levels "Basophils","CD16-_NK_cells",..: 15 15 15 15 15 15 15 9 10 10 ...
## $ type : chr "PCA" "PCA" "PCA" "PCA" ...
# generate plot
d_plot <- dims_PCA
str(d_plot)
## 'data.frame': 1998 obs. of 4 variables:
## $ dimension_x: num 1.45 2.45 -2.71 2.72 -2.71 ...
## $ dimension_y: num 3.157 -0.938 0.709 -2.28 -0.195 ...
## $ population : Factor w/ 15 levels "Basophils","CD16-_NK_cells",..: 15 15 15 15 15 15 15 9 10 10 ...
## $ type : chr "PCA" "PCA" "PCA" "PCA" ...
colors <- c(rainbow(14), "gray75")
ggplot(d_plot, aes(x = dimension_x, y = dimension_y, color = population)) +
facet_wrap(~ type, scales = "free") +
geom_point(size = 0.7, alpha = 0.5) +
scale_color_manual(values = colors) +
labs(x = "dimension x", y = "dimension y") +
theme_bw() +
theme(aspect.ratio = 1,
legend.key.height = unit(4, "mm"))
# -------------------------
# Dimension reduction: tSNE
# -------------------------
# run Rtsne
set.seed(123)
out_Rtsne <- Rtsne(as.matrix(d_sub), dims = n_dims)
dims_Rtsne <- out_Rtsne$Y
colnames(dims_Rtsne) <- c("tSNE_1", "tSNE_2")
head(dims_Rtsne)
## tSNE_1 tSNE_2
## [1,] 21.160789 -9.879256
## [2,] -8.307273 28.789121
## [3,] -4.444776 -29.266588
## [4,] -5.729179 30.125150
## [5,] -19.601833 -16.635593
## [6,] -6.028896 -17.779414
stopifnot(nrow(dims_Rtsne) == length(population))
colnames(dims_Rtsne) <- c("dimension_x", "dimension_y")
dims_Rtsne <- cbind(as.data.frame(dims_Rtsne), population, type = "tSNE")
head(dims_Rtsne)
## dimension_x dimension_y population type
## 1 21.160789 -9.879256 unassigned tSNE
## 2 -8.307273 28.789121 unassigned tSNE
## 3 -4.444776 -29.266588 unassigned tSNE
## 4 -5.729179 30.125150 unassigned tSNE
## 5 -19.601833 -16.635593 unassigned tSNE
## 6 -6.028896 -17.779414 unassigned tSNE
str(dims_Rtsne)
## 'data.frame': 1998 obs. of 4 variables:
## $ dimension_x: num 21.16 -8.31 -4.44 -5.73 -19.6 ...
## $ dimension_y: num -9.88 28.79 -29.27 30.13 -16.64 ...
## $ population : Factor w/ 15 levels "Basophils","CD16-_NK_cells",..: 15 15 15 15 15 15 15 9 10 10 ...
## $ type : chr "tSNE" "tSNE" "tSNE" "tSNE" ...
# generate plot
d_plot <- dims_Rtsne
ggplot(d_plot, aes(x = dimension_x, y = dimension_y, color = population)) +
facet_wrap(~ type, scales = "free") +
geom_point(size = 0.7, alpha = 0.5) +
scale_color_manual(values = colors) +
labs(x = "dimension x", y = "dimension y") +
theme_bw() +
theme(aspect.ratio = 1,
legend.key.height = unit(4, "mm"))