Contents

Compiled date: 2022-04-26

Last edited: 2021-12-12

License: GPL-3

1 Installation

Run the following code to install the Bioconductor version of the package.

# Install BiocManager from CRAN only when it is not already available,
# then use it to install POMA from Bioconductor.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("POMA")

2 Load Packages

library(POMA)
library(SummarizedExperiment)
library(ggplot2)
library(patchwork)

3 Load Data and Imputation

Let’s create a cleaned SummarizedExperiment object from the sample st000336 data to explore the normalization effects.

# Load the st000336 example dataset
data("st000336")

# Impute the data. No `method` argument is supplied, so PomaImpute() falls
# back to KNN, as reported by the message printed below.
example_data <- st000336 %>% PomaImpute()
> method argument is empty! KNN will be used
# Print the imputed object to inspect its class, dimensions and annotations
example_data
> class: SummarizedExperiment 
> dim: 30 57 
> metadata(0):
> assays(1): ''
> rownames(30): x1_methylhistidine x3_methylhistidine ... pyruvate
>   succinate
> rowData names(0):
> colnames(57): DMD004.1.U02 DMD005.1.U02 ... DMD167.5.U02 DMD173.1.U02
> colData names(2): group steroids

4 Normalization

Here we will evaluate ALL normalization methods that POMA offers on the same SummarizedExperiment object to compare them (Berg et al. 2006).

# Apply each PomaNorm() method to the same imputed object so that the
# results can be compared side by side in the following sections.
none <- example_data %>%
  PomaNorm(method = "none")
auto_scaling <- example_data %>%
  PomaNorm(method = "auto_scaling")
level_scaling <- example_data %>%
  PomaNorm(method = "level_scaling")
log_scaling <- example_data %>%
  PomaNorm(method = "log_scaling")
log_transformation <- example_data %>%
  PomaNorm(method = "log_transformation")
vast_scaling <- example_data %>%
  PomaNorm(method = "vast_scaling")
log_pareto <- example_data %>%
  PomaNorm(method = "log_pareto")

4.1 Normalization effect on data dimensions

When we check the dimensions of the data after normalization, we can see that ALL methods have the same effect on them. PomaNorm only changes the data dimensions when the data contain features that consist only of zeros or features with zero variance. Only in these two cases will PomaNorm remove features from the data, changing its dimensions.

# Dimensions of the assay matrix after each normalization method.
# SummarizedExperiment is already attached, so assay() is called unqualified.
dim(assay(none))
> [1] 30 57
dim(assay(auto_scaling))
> [1] 30 57
dim(assay(level_scaling))
> [1] 30 57
dim(assay(log_scaling))
> [1] 30 57
dim(assay(log_transformation))
> [1] 30 57
dim(assay(vast_scaling))
> [1] 30 57
dim(assay(log_pareto))
> [1] 30 57

4.2 Normalization effect on samples

Here we can evaluate the different normalization effects on samples (Berg et al. 2006).

# Boxplots grouped by sample, one per normalization method.
# Plot objects get descriptive names: the original single-letter names
# included `c`, which masks base::c() in the workspace.

# Theme shared by the six grid panels: drop the x-axis tick labels and the
# legend so the plots stay readable when tiled together.
compact_theme <- theme(axis.text.x = element_blank(),
                       legend.position = "none")

# The un-normalized data is drawn on its own, keeping axis text and legend.
box_none <- PomaBoxplots(none, group = "samples", jitter = FALSE) +
  ggtitle("Not Normalized")

box_auto <- PomaBoxplots(auto_scaling, group = "samples", jitter = FALSE) +
  ggtitle("Auto Scaling") +
  compact_theme

box_level <- PomaBoxplots(level_scaling, group = "samples", jitter = FALSE) +
  ggtitle("Level Scaling") +
  compact_theme

box_log_scaling <- PomaBoxplots(log_scaling, group = "samples",
                                jitter = FALSE) +
  ggtitle("Log Scaling") +
  compact_theme

box_log_trans <- PomaBoxplots(log_transformation, group = "samples",
                              jitter = FALSE) +
  ggtitle("Log Transformation") +
  compact_theme

box_vast <- PomaBoxplots(vast_scaling, group = "samples", jitter = FALSE) +
  ggtitle("Vast Scaling") +
  compact_theme

box_pareto <- PomaBoxplots(log_pareto, group = "samples", jitter = FALSE) +
  ggtitle("Log Pareto") +
  compact_theme

box_none

# patchwork layout: two rows of three panels
(box_auto + box_level + box_log_scaling) /
  (box_log_trans + box_vast + box_pareto)

4.3 Normalization effect on features

Here we can evaluate the different normalization effects on features.

# Density plots grouped by feature, one per normalization method.

# Theme shared by the six grid panels: remove both axis titles so the
# tiled plots are not cluttered with repeated labels.
no_axis_titles <- theme(axis.title.x = element_blank(),
                        axis.title.y = element_blank())

# The un-normalized data is drawn on its own, keeping its axis titles.
dens_none <- PomaDensity(none, group = "features") +
  ggtitle("Not Normalized")

dens_auto <- PomaDensity(auto_scaling, group = "features") +
  ggtitle("Auto Scaling") +
  no_axis_titles

dens_level <- PomaDensity(level_scaling, group = "features") +
  ggtitle("Level Scaling") +
  no_axis_titles

dens_log_scaling <- PomaDensity(log_scaling, group = "features") +
  ggtitle("Log Scaling") +
  no_axis_titles

dens_log_trans <- PomaDensity(log_transformation, group = "features") +
  ggtitle("Log Transformation") +
  no_axis_titles

dens_vast <- PomaDensity(vast_scaling, group = "features") +
  ggtitle("Vast Scaling") +
  no_axis_titles

dens_pareto <- PomaDensity(log_pareto, group = "features") +
  ggtitle("Log Pareto") +
  no_axis_titles

dens_none

# patchwork layout: two rows of three panels
(dens_auto + dens_level + dens_log_scaling) /
  (dens_log_trans + dens_vast + dens_pareto)

5 Session Information

# Record the R version, platform, locale and package versions used to
# build this document.
sessionInfo()
> R version 4.2.0 RC (2022-04-19 r82224)
> Platform: x86_64-pc-linux-gnu (64-bit)
> Running under: Ubuntu 20.04.4 LTS
> 
> Matrix products: default
> BLAS:   /home/biocbuild/bbs-3.15-bioc/R/lib/libRblas.so
> LAPACK: /home/biocbuild/bbs-3.15-bioc/R/lib/libRlapack.so
> 
> locale:
>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
>  [3] LC_TIME=en_GB              LC_COLLATE=C              
>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
> 
> attached base packages:
> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
> [8] base     
> 
> other attached packages:
>  [1] patchwork_1.1.1             SummarizedExperiment_1.26.0
>  [3] Biobase_2.56.0              GenomicRanges_1.48.0       
>  [5] GenomeInfoDb_1.32.0         IRanges_2.30.0             
>  [7] S4Vectors_0.34.0            BiocGenerics_0.42.0        
>  [9] MatrixGenerics_1.8.0        matrixStats_0.62.0         
> [11] tibble_3.1.6                dplyr_1.0.8                
> [13] knitr_1.38                  plotly_4.10.0              
> [15] ggraph_2.0.5                ggplot2_3.3.5              
> [17] POMA_1.6.0                  BiocStyle_2.24.0           
> 
> loaded via a namespace (and not attached):
>   [1] backports_1.4.1        circlize_0.4.14        plyr_1.8.7            
>   [4] igraph_1.3.1           lazyeval_0.2.2         splines_4.2.0         
>   [7] gmp_0.6-5              BiocParallel_1.30.0    listenv_0.8.0         
>  [10] digest_0.6.29          foreach_1.5.2          htmltools_0.5.2       
>  [13] magick_2.7.3           viridis_0.6.2          fansi_1.0.3           
>  [16] magrittr_2.0.3         cluster_2.1.3          doParallel_1.0.17     
>  [19] limma_3.52.0           graphlayouts_0.8.0     recipes_0.2.0         
>  [22] ComplexHeatmap_2.12.0  globals_0.14.0         gower_1.0.0           
>  [25] rARPACK_0.11-0         hardhat_0.2.0          colorspace_2.0-3      
>  [28] ggrepel_0.9.1          xfun_0.30              crayon_1.5.1          
>  [31] RCurl_1.98-1.6         jsonlite_1.8.0         impute_1.70.0         
>  [34] survival_3.3-1         iterators_1.0.14       glue_1.6.2            
>  [37] polyclip_1.10-0        gtable_0.3.0           ipred_0.9-12          
>  [40] zlibbioc_1.42.0        XVector_0.36.0         GetoptLong_1.0.5      
>  [43] DelayedArray_0.22.0    RankProd_3.22.0        future.apply_1.9.0    
>  [46] shape_1.4.6            Rmpfr_0.8-7            scales_1.2.0          
>  [49] DBI_1.1.2              Rcpp_1.0.8.3           viridisLite_0.4.0     
>  [52] clue_0.3-60            proxy_0.4-26           lava_1.6.10           
>  [55] prodlim_2019.11.13     glmnet_4.1-4           httr_1.4.2            
>  [58] htmlwidgets_1.5.4      RColorBrewer_1.1-3     ellipsis_0.3.2        
>  [61] farver_2.1.0           pkgconfig_2.0.3        nnet_7.3-17           
>  [64] sass_0.4.1             utf8_1.2.2             caret_6.0-92          
>  [67] labeling_0.4.2         tidyselect_1.1.2       rlang_1.0.2           
>  [70] reshape2_1.4.4         munsell_0.5.0          tools_4.2.0           
>  [73] cli_3.3.0              generics_0.1.2         broom_0.8.0           
>  [76] evaluate_0.15          stringr_1.4.0          fastmap_1.1.0         
>  [79] yaml_2.3.5             ModelMetrics_1.2.2.2   tidygraph_1.2.1       
>  [82] purrr_0.3.4            randomForest_4.7-1     glasso_1.11           
>  [85] future_1.25.0          nlme_3.1-157           compiler_4.2.0        
>  [88] png_0.1-7              e1071_1.7-9            tweenr_1.0.2          
>  [91] bslib_0.3.1            stringi_1.7.6          highr_0.9             
>  [94] RSpectra_0.16-1        lattice_0.20-45        Matrix_1.4-1          
>  [97] vegan_2.6-2            permute_0.9-7          vctrs_0.4.1           
> [100] pillar_1.7.0           lifecycle_1.0.1        BiocManager_1.30.17   
> [103] jquerylib_0.1.4        GlobalOptions_0.1.2    data.table_1.14.2     
> [106] bitops_1.0-7           corpcor_1.6.10         R6_2.5.1              
> [109] bookdown_0.26          gridExtra_2.3          parallelly_1.31.1     
> [112] codetools_0.2-18       MASS_7.3-57            assertthat_0.2.1      
> [115] rjson_0.2.21           withr_2.5.0            GenomeInfoDbData_1.2.8
> [118] mgcv_1.8-40            parallel_4.2.0         mixOmics_6.20.0       
> [121] grid_4.2.0             rpart_4.1.16           timeDate_3043.102     
> [124] tidyr_1.2.0            class_7.3-20           rmarkdown_2.14        
> [127] Cairo_1.5-15           ggforce_0.3.3          pROC_1.18.0           
> [130] lubridate_1.8.0        ellipse_0.4.2

References

Berg, Robert A van den, Huub CJ Hoefsloot, Johan A Westerhuis, Age K Smilde, and Mariët J van der Werf. 2006. “Centering, Scaling, and Transformations: Improving the Biological Information Content of Metabolomics Data.” BMC Genomics 7 (1): 142.