Perform the zFPKM transform on RNA-seq FPKM data. This algorithm is based on the publication by Hart et al., 2013 (PubMed ID 24215113). The reference recommends using zFPKM > -3 to select expressed genes. The method was validated against ENCODE open/closed promoter chromatin structure epigenetic data on six of the ENCODE cell lines. It works well for gene-level data using FPKM or TPM, but does not appear to calibrate well for transcript-level data.
We calculate zFPKM from the existing FPKM values of GEO series GSE94802.
library(dplyr)
library(GEOquery)
library(stringr)
library(SummarizedExperiment)
library(tidyr)
#' Download a gzipped CSV supplementary table and read it into a data.frame.
#'
#' @param url Location of the gzipped CSV file; handed to `download.file()`.
#' @return A data.frame whose row names are taken from the first CSV column and
#'   whose column names are kept exactly as written in the file. The
#'   `MGI_Symbol` column is removed when present.
getSpecificGEOSupp <- function(url) {
  temp <- tempfile(fileext=".csv.gz")
  # Delete the downloaded copy on exit (success or error) so repeated calls do
  # not accumulate files in the session temp directory.
  on.exit(unlink(temp), add=TRUE)

  # Binary mode keeps the gzip stream intact regardless of platform.
  status <- download.file(url, temp, mode="wb")
  if (!identical(as.integer(status), 0L)) {
    stop("Download failed (status ", status, ") for ", url, call.=FALSE)
  }

  out <- read.csv(gzfile(temp), row.names=1, check.names=FALSE)
  # Drop the gene-symbol annotation so only per-sample columns remain;
  # assigning NULL is a no-op when the column is absent.
  out[["MGI_Symbol"]] <- NULL
  out
}
# Supplementary expression tables published with the GEO series.
gse94802_fpkm <- "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE94nnn/GSE94802/suppl/GSE94802_Minkina_etal_normalized_FPKM.csv.gz"
gse94802_counts <- "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE94nnn/GSE94802/suppl/GSE94802_Minkina_etal_raw_counts.csv.gz"

# Use the serialized series in the working directory when it exists;
# otherwise query GEO.
esetlist <- if (file.exists("gse94802.rds")) {
  readRDS("gse94802.rds")
} else {
  getGEO("gse94802")
}

# Phenotype table of the first element of the series list.
doe <- pData(esetlist[[1]])

# The part of each title after "rep<digit>_" is used both as the sample_id
# column and as the row names of the sample annotation.
sample_ids <- str_match(doe$title, "rep\\d_(.+)")[, 2]
# Titles containing "control" (case-insensitive) are controls; all other
# samples are labelled mutant.
is_control <- str_detect(doe$title, regex("control", ignore_case=TRUE))
sample_annotation <- DataFrame(
  condition=ifelse(is_control, "control", "mutant"),
  sample_id=sample_ids,
  row.names=sample_ids)

# Both supplementary tables become assays of one SummarizedExperiment.
expression_tables <- SimpleList(
  fpkm=getSpecificGEOSupp(gse94802_fpkm),
  counts=getSpecificGEOSupp(gse94802_counts))
se <- SummarizedExperiment(assays=expression_tables, colData=sample_annotation)

# clear namespace: only `se` is kept for the following steps
rm(esetlist, gse94802_fpkm, gse94802_counts, doe, getSpecificGEOSupp,
   sample_ids, is_control, sample_annotation, expression_tables)
We compute zFPKM.
library(zFPKM)
# Run the zFPKM transform on `se` and store the result as a new "zfpkm" assay
# next to the existing "fpkm" and "counts" assays.
assay(se, "zfpkm") <- zFPKM(se)
We can also plot the Gaussian fit to the FPKM data on which the z-scores are based.
# Plot the fit behind the zFPKM transform for the samples in `se`.
zFPKMPlot(se)
To determine which genes are active, we compute the median expression within each group.
# One-column table (`gene`) of genes whose median zFPKM is above -3 in at
# least one condition. Intermediates live inside local() so no extra names
# are left in the workspace.
activeGenes <- local({
  zfpkm_by_sample <- assay(se, "zfpkm")
  # Lookup table mapping each sample to its condition.
  sample_conditions <- select(as.data.frame(colData(se)), sample_id, condition)

  zfpkm_by_sample %>%
    mutate(gene=rownames(zfpkm_by_sample)) %>%
    # Reshape to one row per gene/sample pair.
    gather(sample_id, zfpkm, -gene) %>%
    left_join(sample_conditions, by="sample_id") %>%
    group_by(gene, condition) %>%
    summarize(median_zfpkm=median(zfpkm)) %>%
    ungroup() %>%
    # A gene with one qualifying condition is enough; distinct() collapses
    # genes that qualify in several conditions to a single row.
    filter(median_zfpkm > -3) %>%
    distinct(gene)
})

# Same samples as `se`, restricted to the raw counts of the active genes.
seActive <- local({
  active_counts <- as.matrix(assay(se, "counts")[activeGenes[["gene"]], ])
  SummarizedExperiment(
    assays=SimpleList(counts=active_counts),
    colData=colData(se))
})
In the following DE analysis, we only use genes that were active in either group.
library(limma)
library(edgeR)
# Differential expression on the counts of the retained (active) genes only:
# normalization factors are computed AFTER the filtering step that produced
# `seActive`.
dge <- calcNormFactors(DGEList(counts=assay(seActive, "counts")))

# No-intercept design: one coefficient per level of `condition`, so the
# comparison of interest is spelled out as an explicit contrast
# (control minus mutant).
design <- model.matrix(~ 0 + condition, data=colData(seActive))
contrastMatrix <- makeContrasts(conditioncontrol - conditionmutant, levels=design)

# voom transformation with sample quality weights (diagnostic plot enabled).
vq <- voomWithQualityWeights(dge, design, plot=TRUE)

# Linear model fit -> contrast -> robust empirical Bayes moderation.
fit <- lmFit(vq, design) %>%
  contrasts.fit(contrastMatrix) %>%
  eBayes(robust=TRUE)

# Full result table, no cap on the number of rows returned.
deGenes <- topTable(fit, number=Inf)
Hart T, Komori HK, LaMere S, Podshivalova K, Salomon DR. Finding the active genes in deep RNA-seq gene expression studies. BMC Genomics. 2013 Nov 11;14:778. doi: 10.1186/1471-2164-14-778.
# Print the R version, platform and package versions used for this run.
sessionInfo()
## R version 3.4.2 (2017-09-28)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.3 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.6-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.6-bioc/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] edgeR_3.20.0 limma_3.34.0
## [3] bindrcpp_0.2 zFPKM_1.0.0
## [5] tidyr_0.7.2 SummarizedExperiment_1.8.0
## [7] DelayedArray_0.4.0 matrixStats_0.52.2
## [9] GenomicRanges_1.30.0 GenomeInfoDb_1.14.0
## [11] IRanges_2.12.0 S4Vectors_0.16.0
## [13] stringr_1.2.0 GEOquery_2.46.0
## [15] Biobase_2.38.0 BiocGenerics_0.24.0
## [17] dplyr_0.7.4 printr_0.1
##
## loaded via a namespace (and not attached):
## [1] statmod_1.4.30 locfit_1.5-9.1
## [3] tidyselect_0.2.2 purrr_0.2.4
## [5] lattice_0.20-35 colorspace_1.3-2
## [7] htmltools_0.3.6 yaml_2.1.14
## [9] XML_3.98-1.9 rlang_0.1.2
## [11] glue_1.2.0 GenomeInfoDbData_0.99.1
## [13] bindr_0.1 plyr_1.8.4
## [15] zlibbioc_1.24.0 munsell_0.4.3
## [17] gtable_0.2.0 evaluate_0.10.1
## [19] labeling_0.3 knitr_1.17
## [21] Rcpp_0.12.13 readr_1.1.1
## [23] backports_1.1.1 scales_0.5.0
## [25] checkmate_1.8.5 XVector_0.18.0
## [27] ggplot2_2.2.1 hms_0.3
## [29] digest_0.6.12 stringi_1.1.5
## [31] rprojroot_1.2 grid_3.4.2
## [33] tools_3.4.2 bitops_1.0-6
## [35] magrittr_1.5 RCurl_1.95-4.8
## [37] lazyeval_0.2.1 tibble_1.3.4
## [39] pkgconfig_2.0.1 Matrix_1.2-11
## [41] xml2_1.1.1 assertthat_0.2.0
## [43] rmarkdown_1.6 httr_1.3.1
## [45] R6_2.2.2 compiler_3.4.2