###Differential Methylation Analysis with Whole Genome Bisulfite Sequencing (WGBS) Data in TCGA
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
library(BiocManager)
BiocManager::install("tcgaWGBSData.hg19", version = "devel")
Data can be extracted in the following way:
tcga_data <- eh[["EH1661"]]
TCGA_bs <- eh[["EH1662"]]
tcga_eh_directory <- dirname(tcga_data)
assay_file <- tcga_data
rds_file <- paste0(tcga_eh_directory, '/1662')
file.rename(from=assay_file, to=paste0(tcga_eh_directory, '/assays.h5'))
file.rename(from=rds_file, to=paste0(tcga_eh_directory, '/se.rds'))
TCGA_bs <- HDF5Array::loadHDF5SummarizedExperiment(tcga_eh_directory)
Phenotypic data can be extracted by
Methylation Comparison between normal and tumor sample
library(ggplot2)
cov_matrix <- bsseq::getCoverage(TCGA_bs)
meth_matrix <- bsseq::getCoverage(TCGA_bs, type='M')
meth_matrix <- meth_matrix/cov_matrix
# Get total CpG coverage
totCov <- colSums(cov_matrix>0)
# Restrict to CpGs with minimum read covergae of 10
meth_matrix[cov_matrix<10] <- NA
meanMethylation <- DelayedArray::colMeans(meth_matrix, na.rm=TRUE)
sampleType <- phenoData$sample.type
Df <- data.frame('mean-methylation' = meanMethylation, 'type' = sampleType)
g <- ggplot2::ggplot()
g <- g + ggplot2::geom_boxplot(data=Df,ggplot2::aes(x=type,y=mean.methylation))
g <- g + ggplot2::xlab('sample type') + ggplot2::ylab('Methylation')
g <- g + ggplot2::theme(axis.text.x = element_text(angle = 0, hjust = 1))
g
As expected overall methylation is lower in tumor samples compared to the normal samples.
In this analysis we are using functions from bsseq package to conduct the differntial methylation analysis.
# In this analysis we restrict the data to just chromosome 22
chrIndex <- seqnames(TCGA_bs) == 'chr22'
# Also we are only conducting the analysis for 3 normal and tumor pairs
group1 <- c(11, 6, 23) # normal samples
group2 <- c(20, 26, 25) # Tumor samples
subSample <- c(group1, group2)
TCGA_bs_sub <- updateObject(TCGA_bs[chrIndex, subSample])
TCGA_bs_sub.fit <- bsseq::BSmooth(TCGA_bs_sub, mc.cores = 2, verbose = TRUE)
TCGA_bs_sub.tstat <- bsseq::BSmooth.tstat(TCGA_bs_sub.fit,
group1 = c(1, 2, 3),
group2 = c(4, 5, 6),
estimate.var = "group2",
local.correct = TRUE,
verbose = TRUE)
plot(TCGA_bs_sub.tstat)
dmrs0 <- bsseq::dmrFinder(TCGA_bs_sub.tstat, cutoff = c(-4.6, 4.6))
dmrs <- subset(dmrs0, n >= 3 & abs(meanDiff) >= 0.1)
nrow(dmrs)
head(dmrs, n = 3)
pData <- pData(TCGA_bs_sub.fit)
pData$col <- rep(c("red", "blue"), each = 3)
pData(TCGA_bs_sub.fit) <- pData
bsseq::plotRegion(TCGA_bs_sub.fit, dmrs[1,], extend = 5000, addRegions = dmrs)
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.13-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.13-bioc/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.27 R6_2.5.0 jsonlite_1.7.2 magrittr_2.0.1
## [5] evaluate_0.14 rlang_0.4.11 stringi_1.6.2 jquerylib_0.1.4
## [9] bslib_0.2.5.1 rmarkdown_2.8 tools_4.1.0 stringr_1.4.0
## [13] xfun_0.23 yaml_2.2.1 compiler_4.1.0 htmltools_0.5.1.1
## [17] knitr_1.33 sass_0.4.0