Introduction to zFPKM Transformation

Identifying active genes for subsequent analysis

We calculate zFPKM values from the existing FPKM data of GEO series GSE94802.

library(dplyr)
library(GEOquery)
library(stringr)
library(SummarizedExperiment)
library(tidyr)

#' Download one gzip-compressed supplementary CSV and return it as a data frame.
#'
#' @param url URL of a gzip-compressed CSV file. The first column is used as
#'   row names and the file must contain an `MGI_Symbol` column.
#' @return A data.frame with row names taken from the first CSV column, the
#'   `MGI_Symbol` column removed, and all other column names left unmodified
#'   (`check.names=FALSE`).
getSpecificGEOSupp <- function(url) {
  temp <- tempfile()
  # Remove the downloaded file when the function exits (including on error)
  # so repeated calls do not accumulate files in the session temp directory.
  on.exit(unlink(temp), add=TRUE)

  # The payload is gzip-compressed, so request a binary transfer explicitly.
  download.file(url, temp, mode="wb")
  out <- read.csv(gzfile(temp), row.names=1, check.names=FALSE)

  # Keep failing loudly when the expected column is absent.
  stopifnot("MGI_Symbol" %in% colnames(out))

  # Base subsetting instead of a bare select(): the result does not depend on
  # which attached package's `select` happens to be first on the search path.
  out[, colnames(out) != "MGI_Symbol", drop=FALSE]
}

# Supplementary tables attached to the GEO series: normalized FPKM values and
# raw read counts.
gse94802_fpkm <- "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE94nnn/GSE94802/suppl/GSE94802_Minkina_etal_normalized_FPKM.csv.gz"
gse94802_counts <- "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE94nnn/GSE94802/suppl/GSE94802_Minkina_etal_raw_counts.csv.gz"

# Use a locally saved copy of the series when one is present in the working
# directory; otherwise fetch it from GEO.
esetlist <- if (file.exists("gse94802.rds")) {
  readRDS("gse94802.rds")
} else {
  getGEO("gse94802")
}

# Phenotype table of the first element of the GEO result.
doe <- pData(esetlist[[1]])

# Per-sample annotation derived from the sample titles: a title containing
# "control" (any case) is a control, everything else a mutant; the sample id
# is whatever follows "rep<digit>_" in the title and doubles as the row name.
colData <- local({
  isControl <- str_detect(doe$title, regex("control", ignore_case=TRUE))
  sampleId <- str_match(doe$title, "rep\\d_(.+)")[, 2]
  DataFrame(
    condition=ifelse(isControl, "control", "mutant"),
    sample_id=sampleId,
    row.names=sampleId)
})

# Bundle both supplementary tables with the sample annotation.
se <- SummarizedExperiment(
  assays=SimpleList(
    fpkm=getSpecificGEOSupp(gse94802_fpkm),
    counts=getSpecificGEOSupp(gse94802_counts)),
  colData=colData)

# Remove the intermediate objects created above; the code that follows works
# from `se`.
rm(list=c("esetlist", "gse94802_fpkm", "gse94802_counts", "doe", "colData",
          "getSpecificGEOSupp"))

We compute zFPKM.

library(zFPKM)
# Store the result of zFPKM() as an additional assay named "zfpkm" on `se`.
# NOTE(review): no assay name is passed, so zFPKM() presumably reads the
# "fpkm" assay by default — confirm against the zFPKM documentation.
assay(se, "zfpkm") <- zFPKM(se)

We can also plot the Gaussian fit to the FPKM data on which the z-scores are based.

# Diagnostic plot for the data in `se`; called for its plotting side effect,
# the return value is not kept.
zFPKMPlot(se)

To determine which genes are active, we compute the median zFPKM within each condition and call a gene active if that median exceeds -3 in at least one condition.

# One-column table (`gene`) of the genes whose median zFPKM exceeds -3 in at
# least one condition.
# Steps: reshape the gene x sample zFPKM table to long form, attach each
# sample's condition, take the median per gene and condition, then keep every
# gene that clears the threshold in any condition.
activeGenes <- assay(se, "zfpkm") %>%
  mutate(gene=rownames(assay(se, "zfpkm"))) %>%
  # pivot_longer() is the current replacement for the superseded gather().
  pivot_longer(cols=-gene, names_to="sample_id", values_to="zfpkm") %>%
  left_join(select(as.data.frame(colData(se)), sample_id, condition), by="sample_id") %>%
  group_by(gene, condition) %>%
  # .groups="drop" returns an ungrouped table directly, so no separate
  # ungroup() is needed and summarize() emits no regrouping message.
  summarize(median_zfpkm=median(zfpkm), .groups="drop") %>%
  # A gene appears once per condition here; distinct() collapses genes that
  # pass in more than one condition.
  filter(median_zfpkm > -3) %>%
  distinct(gene)

# Counts-only experiment restricted to the active genes, keeping the same
# sample annotation as `se`.
seActive <- local({
  activeCounts <- assay(se, "counts")[activeGenes$gene, ]
  SummarizedExperiment(
    assays=SimpleList(counts=as.matrix(activeCounts)),
    colData=colData(se))
})

In the following differential expression (DE) analysis, we only use genes that were active in at least one group.

library(limma)
library(edgeR)

# limma-voom analysis on the active genes only. Normalization factors are
# computed after the filtering above, so they are based solely on the genes
# carried into the model.
dge <- calcNormFactors(DGEList(counts=assay(seActive, "counts")))

# No-intercept design: one coefficient per level of `condition`.
design <- model.matrix(~ 0 + condition, data=colData(seActive))
vq <- voomWithQualityWeights(dge, design, plot=TRUE)

# The contrast is control minus mutant, so a positive logFC means the control
# coefficient is the larger one.
contrastMatrix <- makeContrasts(conditioncontrol - conditionmutant, levels=design)
fit <- lmFit(vq, design) %>%
  contrasts.fit(contrastMatrix) %>%
  eBayes(robust=TRUE)

# number=Inf returns every gene rather than only the top hits.
deGenes <- topTable(fit, number=Inf)

References

Hart T, Komori HK, LaMere S, Podshivalova K, Salomon DR. Finding the active genes in deep RNA-seq gene expression studies. BMC Genomics. 2013 Nov 11;14:778. doi: 10.1186/1471-2164-14-778.

sessionInfo()

## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.18-bioc/R/lib/libRblas.so 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB              LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/New_York
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] edgeR_4.0.0                 limma_3.58.0               
##  [3] zFPKM_1.24.0                tidyr_1.3.0                
##  [5] SummarizedExperiment_1.32.0 GenomicRanges_1.54.0       
##  [7] GenomeInfoDb_1.38.0         IRanges_2.36.0             
##  [9] S4Vectors_0.40.0            MatrixGenerics_1.14.0      
## [11] matrixStats_1.0.0           stringr_1.5.0              
## [13] GEOquery_2.70.0             Biobase_2.62.0             
## [15] BiocGenerics_0.48.0         dplyr_1.1.3                
## [17] printr_0.3                 
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.4            ggplot2_3.4.4           xfun_0.40              
##  [4] bslib_0.5.1             lattice_0.22-5          tzdb_0.4.0             
##  [7] vctrs_0.6.4             tools_4.3.1             bitops_1.0-7           
## [10] generics_0.1.3          tibble_3.2.1            fansi_1.0.5            
## [13] pkgconfig_2.0.3         Matrix_1.6-1.1          data.table_1.14.8      
## [16] checkmate_2.2.0         lifecycle_1.0.3         GenomeInfoDbData_1.2.11
## [19] farver_2.1.1            compiler_4.3.1          munsell_0.5.0          
## [22] statmod_1.5.0           htmltools_0.5.6.1       sass_0.4.7             
## [25] RCurl_1.98-1.12         yaml_2.3.7              pillar_1.9.0           
## [28] crayon_1.5.2            jquerylib_0.1.4         cachem_1.0.8           
## [31] DelayedArray_0.28.0     abind_1.4-5             locfit_1.5-9.8         
## [34] tidyselect_1.2.0        digest_0.6.33           stringi_1.7.12         
## [37] purrr_1.0.2             labeling_0.4.3          fastmap_1.1.1          
## [40] grid_4.3.1              colorspace_2.1-0        cli_3.6.1              
## [43] SparseArray_1.2.0       magrittr_2.0.3          S4Arrays_1.2.0         
## [46] utf8_1.2.4              readr_2.1.4             withr_2.5.1            
## [49] scales_1.2.1            backports_1.4.1         rmarkdown_2.25         
## [52] XVector_0.42.0          hms_1.1.3               evaluate_0.22          
## [55] knitr_1.44              rlang_1.1.1             Rcpp_1.0.11            
## [58] glue_1.6.2              xml2_1.3.5              jsonlite_1.8.7         
## [61] R6_2.5.1                zlibbioc_1.48.0

Introduction to zFPKM Transformation

Ron Ammar

2023-10-24

Summary

Identifying active genes for subsequent analysis

References