## ----warning=FALSE, message=FALSE---------------------------------------- library(EnsDb.Hsapiens.v75) ## Making a "short cut" edb <- EnsDb.Hsapiens.v75 ## print some informations for this package edb ## for what organism was the database generated? organism(edb) ## ------------------------------------------------------------------------ Tx <- transcripts(edb, filter = list(GenenameFilter("BCL2L11"))) Tx ## as this is a GRanges object we can access e.g. the start coordinates with head(start(Tx)) ## or extract the biotype with head(Tx$tx_biotype) ## ------------------------------------------------------------------------ ## list all database tables along with their columns listTables(edb) ## list columns from a specific table listColumns(edb, "tx") ## ------------------------------------------------------------------------ Tx <- transcripts(edb, columns = c(listColumns(edb , "tx"), "gene_name"), filter = TxbiotypeFilter("nonsense_mediated_decay"), return.type = "DataFrame") nrow(Tx) Tx ## ------------------------------------------------------------------------ yCds <- cdsBy(edb, filter = SeqnameFilter("Y")) yCds ## ------------------------------------------------------------------------ ## Define the filter grf <- GRangesFilter(GRanges("11", ranges = IRanges(114000000, 114000050), strand="+"), condition = "overlapping") ## Query genes: gn <- genes(edb, filter = grf) gn ## Next we retrieve all transcripts for that gene so that we can plot them. txs <- transcripts(edb, filter = GenenameFilter(gn$gene_name)) ## ----tx-for-zbtb16, message=FALSE, fig.align='center', fig.width=7.5, fig.height=5---- plot(3, 3, pch = NA, xlim = c(start(gn), end(gn)), ylim = c(0, length(txs)), yaxt = "n", ylab = "") ## Highlight the GRangesFilter region rect(xleft = start(grf), xright = end(grf), ybottom = 0, ytop = length(txs), col = "red", border = "red") for(i in 1:length(txs)) { current <- txs[i] rect(xleft = start(current), xright = end(current), ybottom = i-0.975, ytop = i-0.125, border = "grey") text(start(current), y = i-0.5, pos = 4, cex = 0.75, labels = current$tx_id) } ## ------------------------------------------------------------------------ transcripts(edb, filter = grf) ## ------------------------------------------------------------------------ ## Get all gene biotypes from the database. The GenebiotypeFilter ## allows to filter on these values. listGenebiotypes(edb) ## Get all transcript biotypes from the database. listTxbiotypes(edb) ## ------------------------------------------------------------------------ ## We're going to fetch all genes which names start with BCL. To this end ## we define a GenenameFilter with partial matching, i.e. condition "like" ## and a % for any character/string. BCLs <- genes(edb, columns = c("gene_name", "entrezid", "gene_biotype"), filter = list(GenenameFilter("BCL%", condition = "like")), return.type = "DataFrame") nrow(BCLs) BCLs ## ------------------------------------------------------------------------ ## determine the average length of snRNA, snoRNA and rRNA genes encoded on ## chromosomes X and Y. mean(lengthOf(edb, of = "tx", filter = list(GenebiotypeFilter(c("snRNA", "snoRNA", "rRNA")), SeqnameFilter(c("X", "Y"))))) ## determine the average length of protein coding genes encoded on the same ## chromosomes. mean(lengthOf(edb, of = "tx", filter = list(GenebiotypeFilter("protein_coding"), SeqnameFilter(c("X", "Y"))))) ## ------------------------------------------------------------------------ ## Extract all exons 1 and (if present) 2 for all genes encoded on the ## Y chromosome exons(edb, columns = c("tx_id", "exon_idx"), filter = list(SeqnameFilter("Y"), ExonrankFilter(3, condition = "<"))) ## ------------------------------------------------------------------------ TxByGns <- transcriptsBy(edb, by = "gene", filter = list(SeqnameFilter(c("X", "Y"))) ) TxByGns ## ----eval=FALSE---------------------------------------------------------- ## ## will just get exons for all genes on chromosomes 1 to 22, X and Y. ## ## Note: want to get rid of the "LRG" genes!!! ## EnsGenes <- exonsBy(edb, by = "gene", ## filter = list(SeqnameFilter(c(1:22, "X", "Y")), ## GeneidFilter("ENSG%", "like"))) ## ----eval=FALSE---------------------------------------------------------- ## ## Transforming the GRangesList into a data.frame in SAF format ## EnsGenes.SAF <- toSAF(EnsGenes) ## ----eval=FALSE---------------------------------------------------------- ## ## Create a GRanges of non-overlapping exon parts. ## DJE <- disjointExons(edb, ## filter = list(SeqnameFilter(c(1:22, "X", "Y")), ## GeneidFilter("ENSG%", "like"))) ## ----eval=FALSE---------------------------------------------------------- ## library(EnsDb.Hsapiens.v75) ## library(Rsamtools) ## edb <- EnsDb.Hsapiens.v75 ## ## ## Get the FaFile with the genomic sequence matching the Ensembl version ## ## using the AnnotationHub package. ## Dna <- getGenomeFaFile(edb) ## ## ## Get start/end coordinates of all genes. ## genes <- genes(edb) ## ## Subset to all genes that are encoded on chromosomes for which ## ## we do have DNA sequence available. ## genes <- genes[seqnames(genes) %in% seqnames(seqinfo(Dna))] ## ## ## Get the gene sequences, i.e. the sequence including the sequence of ## ## all of the gene's exons and introns. ## geneSeqs <- getSeq(Dna, genes) ## ----eval=FALSE---------------------------------------------------------- ## ## get all exons of all transcripts encoded on chromosome Y ## yTx <- exonsBy(edb, filter = SeqnameFilter("Y")) ## ## ## Retrieve the sequences for these transcripts from the FaFile. ## library(GenomicFeatures) ## yTxSeqs <- extractTranscriptSeqs(Dna, yTx) ## yTxSeqs ## ## ## Extract the sequences of all transcripts encoded on chromosome Y. ## yTx <- extractTranscriptSeqs(Dna, edb, filter = SeqnameFilter("Y")) ## ## ## Along these lines, we could use the method also to retrieve the coding sequence ## ## of all transcripts on the Y chromosome. ## cdsY <- cdsBy(edb, filter = SeqnameFilter("Y")) ## extractTranscriptSeqs(Dna, cdsY) ## ----message=FALSE------------------------------------------------------- ## Change the seqlevels style form Ensembl (default) to UCSC: seqlevelsStyle(edb) <- "UCSC" ## Now we can use UCSC style seqnames in SeqnameFilters or GRangesFilter: genesY <- genes(edb, filter = SeqnameFilter("chrY")) ## The seqlevels of the returned GRanges are also in UCSC style seqlevels(genesY) ## ------------------------------------------------------------------------ seqlevelsStyle(edb) <- "UCSC" ## Getting the default option: getOption("ensembldb.seqnameNotFound") ## Listing all seqlevels in the database. seqlevels(edb)[1:30] ## Setting the option to NA, thus, for each seqname for which no mapping is available, ## NA is returned. options(ensembldb.seqnameNotFound=NA) seqlevels(edb)[1:30] ## Resetting the option. options(ensembldb.seqnameNotFound = "ORIGINAL") ## ----warning=FALSE, message=FALSE---------------------------------------- library(BSgenome.Hsapiens.UCSC.hg19) bsg <- BSgenome.Hsapiens.UCSC.hg19 ## Get the genome version unique(genome(bsg)) unique(genome(edb)) ## Although differently named, both represent genome build GRCh37. ## Extract the full transcript sequences. yTxSeqs <- extractTranscriptSeqs(bsg, exonsBy(edb, "tx", filter = SeqnameFilter("chrY"))) yTxSeqs ## Extract just the CDS Test <- cdsBy(edb, "tx", filter = SeqnameFilter("chrY")) yTxCds <- extractTranscriptSeqs(bsg, cdsBy(edb, "tx", filter = SeqnameFilter("chrY"))) yTxCds ## ------------------------------------------------------------------------ seqlevelsStyle(edb) <- "Ensembl" ## ----gviz-plot, message=FALSE, fig.align='center', fig.width=7.5, fig.height=2.25---- ## Loading the Gviz library library(Gviz) library(EnsDb.Hsapiens.v75) edb <- EnsDb.Hsapiens.v75 ## Retrieving a Gviz compatible GRanges object with all genes ## encoded on chromosome Y. gr <- getGeneRegionTrackForGviz(edb, chromosome = "Y", start = 20400000, end = 21400000) ## Define a genome axis track gat <- GenomeAxisTrack() ## We have to change the ucscChromosomeNames option to FALSE to enable Gviz usage ## with non-UCSC chromosome names. options(ucscChromosomeNames = FALSE) plotTracks(list(gat, GeneRegionTrack(gr))) options(ucscChromosomeNames = TRUE) ## ----message=FALSE------------------------------------------------------- seqlevelsStyle(edb) <- "UCSC" ## Retrieving the GRanges objects with seqnames corresponding to UCSC chromosome names. gr <- getGeneRegionTrackForGviz(edb, chromosome = "chrY", start = 20400000, end = 21400000) seqnames(gr) ## Define a genome axis track gat <- GenomeAxisTrack() plotTracks(list(gat, GeneRegionTrack(gr))) ## ----gviz-separate-tracks, message=FALSE, warning=FALSE, fig.align='center', fig.width=7.5, fig.height=2.25---- protCod <- getGeneRegionTrackForGviz(edb, chromosome = "chrY", start = 20400000, end = 21400000, filter = GenebiotypeFilter("protein_coding")) lincs <- getGeneRegionTrackForGviz(edb, chromosome = "chrY", start = 20400000, end = 21400000, filter = GenebiotypeFilter("lincRNA")) plotTracks(list(gat, GeneRegionTrack(protCod, name = "protein coding"), GeneRegionTrack(lincs, name = "lincRNAs")), transcriptAnnotation = "symbol") ## At last we change the seqlevels style again to Ensembl seqlevelsStyle <- "Ensembl" ## ------------------------------------------------------------------------ library(EnsDb.Hsapiens.v75) edb <- EnsDb.Hsapiens.v75 ## List all available columns in the database. columns(edb) ## Note that these do *not* correspond to the actual column names ## of the database that can be passed to methods like exons, genes, ## transcripts etc. These column names can be listed with the listColumns ## method. listColumns(edb) ## List all of the supported key types. keytypes(edb) ## Get all gene ids from the database. gids <- keys(edb, keytype = "GENEID") length(gids) ## Get all gene names for genes encoded on chromosome Y. gnames <- keys(edb, keytype = "GENENAME", filter = SeqnameFilter("Y")) head(gnames) ## ----warning=FALSE------------------------------------------------------- ## Use the /standard/ way to fetch data. select(edb, keys = c("ZBTB16", "BCL2", "BCL2L11"), keytype = "GENENAME", columns = c("GENEID", "GENENAME", "TXID", "TXBIOTYPE")) ## Use the filtering system of ensembldb select(edb, keys = list(GenenameFilter(c("BCL2", "BCL2L11")), TxbiotypeFilter("protein_coding")), columns = c("GENEID", "GENENAME", "TXID", "TXBIOTYPE")) ## ------------------------------------------------------------------------ ## Use the default method, which just returns the first value for multi mappings. mapIds(edb, keys = c("BCL2", "BCL2L11"), column = "TXID", keytype = "GENENAME") ## Alternatively, specify multiVals="list" to return all mappings. mapIds(edb, keys = c("BCL2", "BCL2L11"), column = "TXID", keytype = "GENENAME", multiVals = "list") ## And, just like before, we can use filters to map only to protein coding transcripts. mapIds(edb, keys = list(GenenameFilter(c("BCL2", "BCL2L11")), TxbiotypeFilter("protein_coding")), column = "TXID", multiVals = "list") ## ----eval=FALSE---------------------------------------------------------- ## library(ensembldb) ## ## ## get all human gene/transcript/exon annotations from Ensembl (75) ## ## the resulting tables will be stored by default to the current working ## ## directory ## fetchTablesFromEnsembl(75, species = "human") ## ## ## These tables can then be processed to generate a SQLite database ## ## containing the annotations (again, the function assumes the required ## ## txt files to be present in the current working directory) ## DBFile <- makeEnsemblSQLiteFromTables() ## ## ## and finally we can generate the package ## makeEnsembldbPackage(ensdb = DBFile, version = "0.99.12", ## maintainer = "Johannes Rainer ", ## author = "J Rainer") ## ----eval=FALSE---------------------------------------------------------- ## ## Load the AnnotationHub data. ## library(AnnotationHub) ## ah <- AnnotationHub() ## ## ## Query all available files for Ensembl release 77 for ## ## Mus musculus. ## query(ah, c("Mus musculus", "release-77")) ## ## ## Get the resource for the gtf file with the gene/transcript definitions. ## Gtf <- ah["AH28822"] ## ## Create a EnsDb database file from this. ## DbFile <- ensDbFromAH(Gtf) ## ## We can either generate a database package, or directly load the data ## edb <- EnsDb(DbFile) ## ## ## ## Identify and get the FaFile object with the genomic DNA sequence matching ## ## the EnsDb annotation. ## Dna <- getGenomeFaFile(edb) ## library(Rsamtools) ## ## We next retrieve the sequence of all exons on chromosome Y. ## exons <- exons(edb, filter = SeqnameFilter("Y")) ## exonSeq <- getSeq(Dna, exons) ## ## ## Alternatively, look up and retrieve the toplevel DNA sequence manually. ## Dna <- ah[["AH22042"]] ## ----message=FALSE------------------------------------------------------- ## Generate a sqlite database from a GRanges object specifying ## genes encoded on chromosome Y load(system.file("YGRanges.RData", package = "ensembldb")) Y DB <- ensDbFromGRanges(Y, path=tempdir(), version = 75, organism = "Homo_sapiens") edb <- EnsDb(DB) edb ## As shown in the example below, we could make an EnsDb package on ## this DB object using the makeEnsembldbPackage function. ## ----eval=FALSE---------------------------------------------------------- ## library(ensembldb) ## ## ## the GTF file can be downloaded from ## ## ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/ ## gtffile <- "Homo_sapiens.GRCh37.75.gtf.gz" ## ## generate the SQLite database file ## DB <- ensDbFromGtf(gtf = gtffile) ## ## ## load the DB file directly ## EDB <- EnsDb(DB) ## ## ## alternatively, build the annotation package ## ## and finally we can generate the package ## makeEnsembldbPackage(ensdb = DB, version = "0.99.12", ## maintainer = "Johannes Rainer ", ## author = "J Rainer")