tcgaWGBSData.hg19 Vignette

Divy S. Kangeyan

2020-05-07

###Differential Methylation Analysis with Whole Genome Bisulfite Sequencing (WGBS) Data in TCGA

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
library(BiocManager)
BiocManager::install("tcgaWGBSData.hg19", version = "devel")
library(ExperimentHub)
eh = ExperimentHub()
query(eh, "tcgaWGBSData.hg19")

Data can be extracted in the following way:

tcga_data <- eh[["EH1661"]]
TCGA_bs <- eh[["EH1662"]]

tcga_eh_directory <- dirname(tcga_data)
assay_file <- tcga_data
rds_file <- paste0(tcga_eh_directory, '/1662')

file.rename(from=assay_file, to=paste0(tcga_eh_directory, '/assays.h5'))
file.rename(from=rds_file, to=paste0(tcga_eh_directory, '/se.rds'))
TCGA_bs <- HDF5Array::loadHDF5SummarizedExperiment(tcga_eh_directory)

Phenotypic data can be extracted by

library(Biobase)
phenoData <- Biobase::pData(TCGA_bs)
head(phenoData)

Methylation Comparison between normal and tumor sample

library(ggplot2)

cov_matrix <- bsseq::getCoverage(TCGA_bs)
meth_matrix <- bsseq::getCoverage(TCGA_bs, type='M')
meth_matrix <- meth_matrix/cov_matrix

# Get total CpG coverage
totCov <- colSums(cov_matrix>0)

# Restrict to CpGs with minimum read covergae of 10
meth_matrix[cov_matrix<10] <- NA 

meanMethylation <- DelayedArray::colMeans(meth_matrix, na.rm=TRUE)
sampleType <- phenoData$sample.type
Df <- data.frame('mean-methylation' = meanMethylation, 'type' = sampleType)

g <- ggplot2::ggplot() 
g <- g + ggplot2::geom_boxplot(data=Df,ggplot2::aes(x=type,y=mean.methylation))
g <- g + ggplot2::xlab('sample type') + ggplot2::ylab('Methylation') 
g <- g + ggplot2::theme(axis.text.x = element_text(angle = 0, hjust = 1))
g

As expected overall methylation is lower in tumor samples compared to the normal samples.

Differential methylation analysis of tumor samples

In this analysis we are using functions from bsseq package to conduct the differntial methylation analysis.

# In this analysis we restrict the data to just chromosome 22
chrIndex <- seqnames(TCGA_bs) == 'chr22'

# Also we are only conducting the analysis for 3 normal and tumor pairs
group1 <- c(11, 6, 23) # normal samples
group2 <- c(20, 26, 25) # Tumor samples
subSample <- c(group1, group2)

TCGA_bs_sub <- updateObject(TCGA_bs[chrIndex, subSample])
TCGA_bs_sub.fit <- bsseq::BSmooth(TCGA_bs_sub, mc.cores = 2, verbose = TRUE)
TCGA_bs_sub.tstat <- bsseq::BSmooth.tstat(TCGA_bs_sub.fit, 
                                group1 = c(1, 2, 3),
                                group2 = c(4, 5, 6), 
                                estimate.var = "group2",
                                local.correct = TRUE,
                                verbose = TRUE)
plot(TCGA_bs_sub.tstat)
dmrs0 <- bsseq::dmrFinder(TCGA_bs_sub.tstat, cutoff = c(-4.6, 4.6))
dmrs <- subset(dmrs0, n >= 3 & abs(meanDiff) >= 0.1)
nrow(dmrs)
head(dmrs, n = 3)

pData <- pData(TCGA_bs_sub.fit)
pData$col <- rep(c("red", "blue"), each = 3)
pData(TCGA_bs_sub.fit) <- pData

bsseq::plotRegion(TCGA_bs_sub.fit, dmrs[1,], extend = 5000, addRegions = dmrs)
sessionInfo()
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.11-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.11-bioc/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_4.0.0  magrittr_1.5    htmltools_0.4.0 tools_4.0.0    
##  [5] yaml_2.2.1      Rcpp_1.0.4.6    stringi_1.4.6   rmarkdown_2.1  
##  [9] knitr_1.28      stringr_1.4.0   digest_0.6.25   xfun_0.13      
## [13] rlang_0.4.6     evaluate_0.14