## ----setup, include = FALSE---------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----install_biocmanager,eval=FALSE-------------------------------------- # if (!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # BiocManager::install("curatedAdipoChIP") ## ----loading_libraries, message=FALSE------------------------------------ # loading required libraries library(ExperimentHub) library(SummarizedExperiment) library(S4Vectors) library(fastqcr) library(DESeq2) library(dplyr) library(tidyr) library(ggplot2) ## ----loading_data-------------------------------------------------------- # query package resources on ExperimentHub eh <- ExperimentHub() query(eh, "curatedAdipoChIP") # load data from ExperimentHub peak_counts <- query(eh, "curatedAdipoChIP")[[1]] # print object peak_counts ## ----peak_counts--------------------------------------------------------- # print count matrix assay(peak_counts)[1:5, 1:5] ## ----colData------------------------------------------------------------- # names of the coldata object names(colData(peak_counts)) # table of times column table(colData(peak_counts)$time) # table of stage column table(colData(peak_counts)$stage) # table of factor column table(colData(peak_counts)$factor) ## ----rowRanges----------------------------------------------------------- # print GRanges object rowRanges(peak_counts) ## ----metadata, message=FALSE--------------------------------------------- # print data of first study metadata(peak_counts)$studies[1,] ## ----summary_table,echo=FALSE-------------------------------------------- # generate a study summary table colData(peak_counts)[, c(-2, -12)] %>% as_tibble() %>% filter(!is.na(control_id)) %>% group_by(study) %>% summarise(pmid = unique(pmid), nsamples = n(), time = paste(unique(time), collapse = '/ '), stages = paste(unique(stage), collapse = '/'), factor = paste(unique(factor), collapse = '/ ')) %>% knitr::kable(align = 'cccccc', col.names = c('GEO series ID', 'PubMed ID', 'Num. of Samples', 'Time (hr)', 'Differentiation Stage', 'Factor')) ## ----control_samples----------------------------------------------------- # show the number of control samples table(is.na(peak_counts$control_id)) ## ----subset object------------------------------------------------------- # subset peaks_count object # select samples for the factor CEBPB and stage 0 and 1 sample_ind <- (peak_counts$factor == 'CEBPB') & (peak_counts$stage %in% c(0, 1)) sample_ids <- colnames(peak_counts)[sample_ind] # select peaks from the selected samples peak_ind <- lapply(mcols(peak_counts)$peak, function(x) sum(sample_ids %in% x)) peak_ind <- unlist(peak_ind) > 2 # subset the object se <- peak_counts[peak_ind, sample_ind] # show the number of samples in each group table(se$stage) # show the number of peak in each sample table(unlist(mcols(se)$peak))[sample_ids] ## ----filtering_samples--------------------------------------------------- # check the number of files in qc qc <- se$qc table(lengths(qc)) # flattening qc list qc <- unlist(qc, recursive = FALSE) length(qc) ## ----per_base_scores----------------------------------------------------- # extracting per_base_sequence_quality per_base <- lapply(qc, function(x) { df <- x[['per_base_sequence_quality']] df %>% dplyr::select(Base, Mean) %>% transform(Base = strsplit(as.character(Base), '-')) %>% unnest(Base) %>% mutate(Base = as.numeric(Base)) }) %>% bind_rows(.id = 'run') ## ----score_summary------------------------------------------------------- # a quick look at quality scores summary(per_base) ## ----finding_low_scores,fig.align='centre',fig.height=4,fig.width=4------ # find low quality runs per_base <- per_base %>% group_by(run) %>% mutate(run_average = mean(Mean) > 34) # plot average per base quality per_base %>% ggplot(aes(x = Base, y = Mean, group = run, color = run_average)) + geom_line() + lims(y = c(0,40)) ## ----remove_lowscore----------------------------------------------------- # get run ids of low quality samples bad_runs <- unique(per_base$run[per_base$run_average == FALSE]) bad_samples <- lapply(se$runs, function(x) sum(x$run %in% bad_runs)) # subset the counts object se2 <- se[, unlist(bad_samples) == 0] # show remaining samples by stage table(se2$stage) ## ----remove_low_counts--------------------------------------------------- # filtering low count genes low_counts <- apply(assay(se2), 1, function(x) length(x[x>10])>=2) table(low_counts) # subsetting the count object se3 <- se2[low_counts,] ## ----sampel_correlations,fig.align='centre',fig.height=6,fig.width=7----- # plot scatters of samples from each group par(mfrow = c(2,3)) lapply(split(colnames(se3), se3$stage), function(x) { # get counts of three samples y1 <- assay(se3)[, x[1]] y2 <- assay(se3)[, x[2]] y3 <- assay(se3)[, x[3]] # plot scatters of pairs of samples plot(log10(y1 + 1), log10(y2 + 1), xlab = x[1], ylab = x[2]) plot(log10(y1 + 1), log10(y3 + 1), xlab = x[1], ylab = x[3]) plot(log10(y2 + 1), log10(y3 + 1), xlab = x[2], ylab = x[3]) }) ## ----differential_binding------------------------------------------------ # differential binding analysis colData(se3) <- colData(se3)[, -2] se3$stage <- factor(se3$stage) dds <- DESeqDataSet(se3, ~stage) dds <- DESeq(dds) res <- results(dds) table(res$padj < .1) ## ----pca,fig.align='centre',fig.height=4,fig.width=4--------------------- # explaining variabce plotPCA(rlog(dds), intgroup = 'stage') plotPCA(rlog(dds), intgroup = 'study') ## ----studies_keys-------------------------------------------------------- # keys of the studies in this subset of the data unique(se3$bibtexkey) ## ----citation, warning=FALSE--------------------------------------------- # citing the package citation("curatedAdipoChIP") ## ----session_info-------------------------------------------------------- devtools::session_info()