1 Installation

# Make sure BiocManager is present, then use it to install EpiTxDb together
# with the human hg38 annotation package used throughout this vignette.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

BiocManager::install(c("EpiTxDb", "EpiTxDb.Hs.hg38"))

2 Introduction

The epitranscriptome includes all post-transcriptional modifications of the RNA and describes an additional layer of information encoded on RNA. Like the term epigenome, it is not about a change in nucleotide sequences, but about the addition of functional elements through modifications.

With the development of high-throughput detection strategies for specific RNA modifications, such as miCLIP and Pseudo-Seq amongst others, a large number of modified positions have been identified and were summarized via the RMBase project (Xuan et al. 2017; Sun et al. 2015).

To make this information available within the Bioconductor universe, EpiTxDb was developed, which facilitates the storage of epitranscriptomic information. More specifically, it can keep track of modification identity, position, the enzyme for introducing it on the RNA, a specifier which determines the position on the RNA to be modified and the literature references each modification is associated with.

3 Getting started

library(EpiTxDb)
library(EpiTxDb.Hs.hg38)

The class EpiTxDb is the class for storing the epitranscriptomic data. It inherits the inner workings of the AnnotationDb class from the AnnotationDbi package.

As an example for the vignette the snoRNAdb data (Lestrade and Weber 2006) from the EpiTxDb.Hs.hg38 package will be used. The data is stored in the AnnotationHub and is downloaded and cached upon the first request.

# Load the snoRNAdb-based EpiTxDb object for hg38. As described in the text,
# the data lives in the AnnotationHub and is downloaded and cached on the
# first request.
etdb <- EpiTxDb.Hs.hg38.snoRNAdb()
## snapshotDate(): 2020-10-27
## loading from cache
# Printing the object shows its metadata summary (data source, organism,
# genome, coordinate system, number of modifications, ...).
etdb
## EpiTxDb object:
## # Db type: EpiTxDb
## # Supporting package: EpiTxDb
## # Data source: snoRNAdb
## # Organism: Homo sapiens
## # Genome: hg38
## # Coordinates: per Transcript
## # Nb of modifications: 235
## # Db created by: EpiTxDb package from Bioconductor
## # Creation time: 2020-02-26 10:34:30 +0100 (Wed, 26 Feb 2020)
## # EpiTxDb version at creation time: 0.99.0
## # RSQLite version at creation time: 2.2.0
## # DBSCHEMAVERSION: 1.0

As expected for an AnnotationDb class the general accessors are available.

# Key types that can be used to look up records in the database.
keytypes(etdb)
##  [1] "MODID"            "MODNAME"          "MODSTRAND"        "MODTYPE"         
##  [5] "REF"              "REFTYPE"          "RXENSEMBL"        "RXENSEMBLTRANS"  
##  [9] "RXENTREZID"       "RXGENENAME"       "SNID"             "SNNAME"          
## [13] "SPECENSEMBL"      "SPECENSEMBLTRANS" "SPECENTREZID"     "SPECGENENAME"    
## [17] "SPECTYPE"
# Columns that can be retrieved from the database.
columns(etdb)
##  [1] "MODEND"           "MODID"            "MODNAME"          "MODSTART"        
##  [5] "MODSTRAND"        "MODTYPE"          "REF"              "REFID"           
##  [9] "REFTYPE"          "RXENSEMBL"        "RXENSEMBLTRANS"   "RXENTREZID"      
## [13] "RXGENENAME"       "RXID"             "RXRANK"           "SNID"            
## [17] "SNNAME"           "SPECENSEMBL"      "SPECENSEMBLTRANS" "SPECENTREZID"    
## [21] "SPECGENENAME"     "SPECID"           "SPECTYPE"
# First few keys of type "MODID" (the modification ids).
head(keys(etdb, "MODID"))
## [1] "1" "2" "3" "4" "5" "6"
# Fetch modification, sequence, reaction (RX*) and specifier (SPEC*) columns
# for the single modification with MODID "1".
select(etdb, keys = "1",
       columns = c("MODNAME","MODTYPE","MODSTART","MODSTRAND","SNNAME",
                   "RXGENENAME","SPECTYPE","SPECGENENAME"),
       keytype = "MODID")
## 'select()' returned 1:1 mapping between keys and columns

The columns with the prefix RX or SPEC reference the reaction enzyme and the location specifier. This can be the same information, but for ribosomal modifications from the snoRNAdb it is of course fibrillarin and a snoRNA.

In addition, the following accessors for metadata are available as well.

# Metadata accessors: species and organism both report "Homo sapiens" for
# this database.
species(etdb)
## [1] "Homo sapiens"
organism(etdb)
## [1] "Homo sapiens"
# Sequence names stored in the database; the modification coordinates shown
# later refer to these.
seqlevels(etdb)
## [1] "NR_003285" "NR_003286" "NR_003287" "NR_004430" "NR_002716" "NR_003925"
## [7] "NR_002756" "NR_004394" "NR_029422"

4 Accessing RNA modifications

The specialized accessors are modifications() and modificationsBy(). modifications() allows for filtering results, whereas modificationsBy() returns all the modifications in batches separated by certain information.

# Retrieve the modifications with mod_id 1 to 3 as a GRanges object, with the
# requested columns attached as metadata columns.
modifications(etdb, columns = c("mod_id","mod_type","mod_name",
                                "rx_genename","spec_genename",
                                "ref_type","ref"),
              filter = list(mod_id = 1:3))
## GRanges object with 3 ranges and 7 metadata columns:
##        seqnames    ranges strand |    mod_id    mod_type    mod_name
##           <Rle> <IRanges>  <Rle> | <integer> <character> <character>
##   [1] NR_003285        14      + |         1          Um       Um_14
##   [2] NR_003285        55      + |         2           Y        Y_55
##   [3] NR_003285        69      + |         3           Y        Y_69
##                  rx_genename   spec_genename        ref_type             ref
##              <CharacterList> <CharacterList> <CharacterList> <CharacterList>
##   [1]            fibrillarin         SNORD71            PMID        16381836
##   [2] dyskerin pseudouridi..         SNORA72            PMID        16381836
##   [3] dyskerin pseudouridi..         SNORA69            PMID        16381836
##   -------
##   seqinfo: 9 sequences from hg38 genome; no seqlengths
# split by sequence name, usually a transcript identifier
modificationsBy(etdb, by = "seqnames")
## GRangesList object of length 9:
## $NR_003285
## GRanges object with 4 ranges and 3 metadata columns:
##        seqnames    ranges strand |    mod_id         mod    mod_name
##           <Rle> <IRanges>  <Rle> | <integer> <character> <character>
##   [1] NR_003285        14      + |         1          Um       Um_14
##   [2] NR_003285        55      + |         2           Y        Y_55
##   [3] NR_003285        69      + |         3           Y        Y_69
##   [4] NR_003285        75      + |         4          Gm       Gm_75
##   -------
##   seqinfo: 9 sequences from hg38 genome; no seqlengths
## 
## ...
## <8 more elements>
# split by modification type
modificationsBy(etdb, by = "modtype")
## GRangesList object of length 5:
## $Am
## GRanges object with 39 ranges and 3 metadata columns:
##         seqnames    ranges strand |         mod    mod_id    mod_name
##            <Rle> <IRanges>  <Rle> | <character> <integer> <character>
##    [1] NR_003286        27      + |          Am         5       Am_27
##    [2] NR_003286        99      + |          Am         9       Am_99
##    [3] NR_003286       159      + |          Am        15      Am_159
##    [4] NR_003286       166      + |          Am        16      Am_166
##    [5] NR_003286       468      + |          Am        25      Am_468
##    ...       ...       ...    ... .         ...       ...         ...
##   [35] NR_002716        30      + |          Am       207       Am_30
##   [36] NR_003925        65      + |          Am       216       Am_65
##   [37] NR_004394        47      + |          Am       226       Am_47
##   [38] NR_004394        53      + |          Am       227       Am_53
##   [39] NR_004394        53      + |          Am       228       Am_53
##   -------
##   seqinfo: 9 sequences from hg38 genome; no seqlengths
## 
## ...
## <4 more elements>

5 Shifting coordinates from genomic to transcriptomic

Since epitranscriptomic modifications by their nature can have a different meaning for each of the individual transcript variants, this can introduce conflicts in how epitranscriptomic coordinates are saved. In the example above the coordinates are given per transcript, because of the origin of the data.

However, not all sources report transcript coordinates. It might be of interest to shift the genomic coordinates to transcript coordinates and at the same time take care, that the transcript maturation processes can lead to more than one result: From one genomic coordinate, multiple transcriptomic coordinates can be derived.

Whether this is biologically relevant or whether biological evidence does exist for each modification on each transcript cannot be guaranteed or differentiated technically depending on the methods used. This might change with the arrival of new techniques allowing for detection of modified nucleotides per individual transcript variant.

# Transcript annotation and genome sequence packages for hg38.
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(BSgenome.Hsapiens.UCSC.hg38)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
# Restrict the transcript annotation to chr1 to keep the example small.
seqlevels(txdb) <- "chr1"
# NOTE(review): `bs` is not referenced in the remainder of the code shown
# here - confirm whether it is still needed.
bs <- BSgenome.Hsapiens.UCSC.hg38

# Replace the snoRNAdb object from above with the RMBase-based database.
etdb <- EpiTxDb.Hs.hg38.RMBase()
## snapshotDate(): 2020-10-27
## loading from cache
# Exons from the chr1-restricted annotation; presumably grouped per transcript
# (the default grouping of exonsBy()) - TODO confirm.
tx <- exonsBy(txdb)
# Only modifications whose sequence name is "chr1".
mod <- modifications(etdb, filter = list(sn_name = "chr1"))
length(mod)
## [1] 47275

In the following example we will focus on shifting the coordinates to individual mature transcripts. However, keep in mind that premature transcripts might be of interest as well; this can be controlled via the tx argument of shiftGenomicToTranscript().

# Shift the genomic modification coordinates in `mod` onto the transcripts in
# `tx`. Ranges for which no coordinates are found are reported in the warning
# below.
mod_tx <- shiftGenomicToTranscript(mod, tx)
## Warning: Coordinates for 905 ranges of 'subject' not found:
## 'chr1:14662:-','chr1:14668:-','chr1:14766:-','chr1:139005:-','chr1:139020:-','chr1:139040:-','chr1:139127:-','chr1:629454:+','chr1:629456:+','chr1:629465:+'
## and more ...
# The result holds more entries than `mod` (157694 vs. 47275).
length(mod_tx)
## [1] 157694

Due to multiple matches for each transcript variant the number of modifications has increased.

With these, we can plot the relative positions of modifications by type on chr1 transcripts.

# Group the shifted modifications by the sequence they were placed on and
# keep only the transcripts present in both the modifications and the exon
# annotation.
mod_tx <- split(mod_tx, seqnames(mod_tx))
names <- intersect(names(mod_tx), names(tx))
tx <- tx[names]
mod_tx <- mod_tx[names]

# 5'-UTR and 3'-UTR annotations, limited to the shared transcripts.
# Transcripts without a UTR entry simply drop out of the index vectors.
fp <- fiveUTRsByTranscript(txdb)
fp_m <- match(names, names(fp), nomatch = 0L)
fp_m <- fp_m[fp_m != 0L]
fp <- fp[fp_m]

tp <- threeUTRsByTranscript(txdb)
tp_m <- match(names, names(tp), nomatch = 0L)
tp_m <- tp_m[tp_m != 0L]
tp <- tp[tp_m]

# Total exon length per transcript, plus the UTR lengths; a transcript
# without an annotated UTR keeps a length of 0
tx_lengths <- sum(width(tx))
fp_lengths <- structure(integer(length(tx)), names = names)
fp_lengths[names(fp)] <- sum(width(fp))
tp_lengths <- structure(integer(length(tx)), names = names)
tp_lengths[names(tp)] <- sum(width(tp))

# Rescale the modification positions so that, for every transcript, the CDS
# start maps to position 1L and the CDS end to position 1000L
from <- IRanges(start = fp_lengths + 1L, end = tx_lengths - tp_lengths)
to <- IRanges(start = 1L, end = 1000L)
mod_rescale <- rescale(mod_tx, to, from)

# One row per modification: its type and its rescaled start position
rel_pos <- data.frame(
  mod_type = unlist(mcols(mod_rescale, level = "within")[, "mod_type"]),
  rel_pos = unlist(start(mod_rescale))
)
# Keep positions inside the window (-500, 1500) around the rescaled CDS
rel_pos <- rel_pos[rel_pos$rel_pos > -500 & rel_pos$rel_pos < 1500, ]

# Density of the relative positions for three selected modification types
library(ggplot2)
ggplot(
  rel_pos[rel_pos$mod_type %in% c("m6A", "m1A", "Y"), ],
  aes(x = rel_pos, colour = mod_type)
) +
  geom_density()