## ----warning=FALSE, message=FALSE----------------------------------------
library(EnsDb.Hsapiens.v75)

## Making a "short cut"
edb <- EnsDb.Hsapiens.v75
## print some informations for this package
edb

## for what organism was the database generated?
organism(edb)

## ------------------------------------------------------------------------
Tx <- transcripts(edb, filter = list(GenenameFilter("BCL2L11")))

Tx

## as this is a GRanges object we can access e.g. the start coordinates with
head(start(Tx))

## or extract the biotype with
head(Tx$tx_biotype)

## ------------------------------------------------------------------------
## list all database tables along with their columns
listTables(edb)

## list columns from a specific table
listColumns(edb, "tx")

## ------------------------------------------------------------------------
Tx <- transcripts(edb,
		  columns = c(listColumns(edb , "tx"), "gene_name"),
		  filter = TxbiotypeFilter("nonsense_mediated_decay"),
		  return.type = "DataFrame")
nrow(Tx)
Tx

## ------------------------------------------------------------------------
yCds <- cdsBy(edb, filter = SeqnameFilter("Y"))
yCds

## ------------------------------------------------------------------------
## Define the filter
grf <- GRangesFilter(GRanges("11", ranges = IRanges(114000000, 114000050),
			     strand="+"), condition = "overlapping")

## Query genes:
gn <- genes(edb, filter = grf)
gn

## Next we retrieve all transcripts for that gene so that we can plot them.
txs <- transcripts(edb, filter = GenenameFilter(gn$gene_name))

## ----tx-for-zbtb16, message=FALSE, fig.align='center', fig.width=7.5, fig.height=5----
plot(3, 3, pch = NA, xlim = c(start(gn), end(gn)), ylim = c(0, length(txs)),
     yaxt = "n", ylab = "")
## Highlight the GRangesFilter region
rect(xleft = start(grf), xright = end(grf), ybottom = 0, ytop = length(txs),
     col = "red", border = "red")
for(i in 1:length(txs)) {
    current <- txs[i]
    rect(xleft = start(current), xright = end(current), ybottom = i-0.975,
	 ytop = i-0.125, border = "grey")
    text(start(current), y = i-0.5, pos = 4, cex = 0.75, labels = current$tx_id)
}

## ------------------------------------------------------------------------
transcripts(edb, filter = grf)

## ------------------------------------------------------------------------
## Get all gene biotypes from the database. The GenebiotypeFilter
## allows to filter on these values.
listGenebiotypes(edb)

## Get all transcript biotypes from the database.
listTxbiotypes(edb)

## ------------------------------------------------------------------------
## We're going to fetch all genes which names start with BCL. To this end
## we define a GenenameFilter with partial matching, i.e. condition "like"
## and a % for any character/string.
BCLs <- genes(edb,
	      columns = c("gene_name", "entrezid", "gene_biotype"),
	      filter = list(GenenameFilter("BCL%", condition = "like")),
	      return.type = "DataFrame")
nrow(BCLs)
BCLs

## ------------------------------------------------------------------------
## determine the average length of snRNA, snoRNA and rRNA genes encoded on
## chromosomes X and Y.
mean(lengthOf(edb, of = "tx",
	      filter = list(GenebiotypeFilter(c("snRNA", "snoRNA", "rRNA")),
			    SeqnameFilter(c("X", "Y")))))

## determine the average length of protein coding genes encoded on the same
## chromosomes.
mean(lengthOf(edb, of = "tx",
	      filter = list(GenebiotypeFilter("protein_coding"),
			    SeqnameFilter(c("X", "Y")))))

## ------------------------------------------------------------------------
## Extract all exons 1 and (if present) 2 for all genes encoded on the
## Y chromosome
exons(edb, columns = c("tx_id", "exon_idx"),
      filter = list(SeqnameFilter("Y"),
		    ExonrankFilter(3, condition = "<")))

## ------------------------------------------------------------------------
TxByGns <- transcriptsBy(edb, by = "gene",
			 filter = list(SeqnameFilter(c("X", "Y")))
			 )
TxByGns

## ----eval=FALSE----------------------------------------------------------
## ## will just get exons for all genes on chromosomes 1 to 22, X and Y.
## ## Note: want to get rid of the "LRG" genes!!!
## EnsGenes <- exonsBy(edb, by = "gene",
## 		    filter = list(SeqnameFilter(c(1:22, "X", "Y")),
## 				  GeneidFilter("ENSG%", "like")))

## ----eval=FALSE----------------------------------------------------------
## ## Transforming the GRangesList into a data.frame in SAF format
## EnsGenes.SAF <- toSAF(EnsGenes)

## ----eval=FALSE----------------------------------------------------------
## ## Create a GRanges of non-overlapping exon parts.
## DJE <- disjointExons(edb,
## 		     filter = list(SeqnameFilter(c(1:22, "X", "Y")),
## 				   GeneidFilter("ENSG%", "like")))

## ----eval=FALSE----------------------------------------------------------
## library(EnsDb.Hsapiens.v75)
## library(Rsamtools)
## edb <- EnsDb.Hsapiens.v75
## 
## ## Get the FaFile with the genomic sequence matching the Ensembl version
## ## using the AnnotationHub package.
## Dna <- getGenomeFaFile(edb)
## 
## ## Get start/end coordinates of all genes.
## genes <- genes(edb)
## ## Subset to all genes that are encoded on chromosomes for which
## ## we do have DNA sequence available.
## genes <- genes[seqnames(genes) %in% seqnames(seqinfo(Dna))]
## 
## ## Get the gene sequences, i.e. the sequence including the sequence of
## ## all of the gene's exons and introns.
## geneSeqs <- getSeq(Dna, genes)

## ----eval=FALSE----------------------------------------------------------
## ## get all exons of all transcripts encoded on chromosome Y
## yTx <- exonsBy(edb, filter = SeqnameFilter("Y"))
## 
## ## Retrieve the sequences for these transcripts from the FaFile.
## library(GenomicFeatures)
## yTxSeqs <- extractTranscriptSeqs(Dna, yTx)
## yTxSeqs
## 
## ## Extract the sequences of all transcripts encoded on chromosome Y.
## yTx <- extractTranscriptSeqs(Dna, edb, filter = SeqnameFilter("Y"))
## 
## ## Along these lines, we could use the method also to retrieve the coding sequence
## ## of all transcripts on the Y chromosome.
## cdsY <- cdsBy(edb, filter = SeqnameFilter("Y"))
## extractTranscriptSeqs(Dna, cdsY)

## ----message=FALSE-------------------------------------------------------
## Change the seqlevels style form Ensembl (default) to UCSC:
seqlevelsStyle(edb) <- "UCSC"

## Now we can use UCSC style seqnames in SeqnameFilters or GRangesFilter:
genesY <- genes(edb, filter = SeqnameFilter("chrY"))
## The seqlevels of the returned GRanges are also in UCSC style
seqlevels(genesY)

## ------------------------------------------------------------------------
seqlevelsStyle(edb) <- "UCSC"

## Getting the default option:
getOption("ensembldb.seqnameNotFound")

## Listing all seqlevels in the database.
seqlevels(edb)[1:30]

## Setting the option to NA, thus, for each seqname for which no mapping is available,
## NA is returned.
options(ensembldb.seqnameNotFound=NA)
seqlevels(edb)[1:30]

## Resetting the option.
options(ensembldb.seqnameNotFound = "ORIGINAL")

## ----warning=FALSE, message=FALSE----------------------------------------
library(BSgenome.Hsapiens.UCSC.hg19)
bsg <- BSgenome.Hsapiens.UCSC.hg19

## Get the genome version
unique(genome(bsg))
unique(genome(edb))
## Although differently named, both represent genome build GRCh37.

## Extract the full transcript sequences.
yTxSeqs <- extractTranscriptSeqs(bsg, exonsBy(edb, "tx", filter = SeqnameFilter("chrY")))

yTxSeqs

## Extract just the CDS
Test <- cdsBy(edb, "tx", filter = SeqnameFilter("chrY"))
yTxCds <- extractTranscriptSeqs(bsg, cdsBy(edb, "tx", filter = SeqnameFilter("chrY")))
yTxCds

## ------------------------------------------------------------------------
seqlevelsStyle(edb) <- "Ensembl"

## ----gviz-plot, message=FALSE, fig.align='center', fig.width=7.5, fig.height=2.25----
## Loading the Gviz library
library(Gviz)
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75

## Retrieving a Gviz compatible GRanges object with all genes
## encoded on chromosome Y.
gr <- getGeneRegionTrackForGviz(edb, chromosome = "Y", start = 20400000, end = 21400000)
## Define a genome axis track
gat <- GenomeAxisTrack()

## We have to change the ucscChromosomeNames option to FALSE to enable Gviz usage
## with non-UCSC chromosome names.
options(ucscChromosomeNames = FALSE)

plotTracks(list(gat, GeneRegionTrack(gr)))

options(ucscChromosomeNames = TRUE)

## ----message=FALSE-------------------------------------------------------
seqlevelsStyle(edb) <- "UCSC"
## Retrieving the GRanges objects with seqnames corresponding to UCSC chromosome names.
gr <- getGeneRegionTrackForGviz(edb, chromosome = "chrY", start = 20400000, end = 21400000)
seqnames(gr)
## Define a genome axis track
gat <- GenomeAxisTrack()
plotTracks(list(gat, GeneRegionTrack(gr)))

## ----gviz-separate-tracks, message=FALSE, warning=FALSE, fig.align='center', fig.width=7.5, fig.height=2.25----
protCod <- getGeneRegionTrackForGviz(edb, chromosome = "chrY",
				     start = 20400000, end = 21400000,
				     filter = GenebiotypeFilter("protein_coding"))
lincs <- getGeneRegionTrackForGviz(edb, chromosome = "chrY",
				   start = 20400000, end = 21400000,
				   filter = GenebiotypeFilter("lincRNA"))

plotTracks(list(gat, GeneRegionTrack(protCod, name = "protein coding"),
		GeneRegionTrack(lincs, name = "lincRNAs")), transcriptAnnotation = "symbol")

## At last we change the seqlevels style again to Ensembl
seqlevelsStyle <- "Ensembl"

## ------------------------------------------------------------------------
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75

## List all available columns in the database.
columns(edb)

## Note that these do *not* correspond to the actual column names
## of the database that can be passed to methods like exons, genes,
## transcripts etc. These column names can be listed with the listColumns
## method.
listColumns(edb)

## List all of the supported key types.
keytypes(edb)

## Get all gene ids from the database.
gids <- keys(edb, keytype = "GENEID")
length(gids)

## Get all gene names for genes encoded on chromosome Y.
gnames <- keys(edb, keytype = "GENENAME", filter = SeqnameFilter("Y"))
head(gnames)

## ----warning=FALSE-------------------------------------------------------
## Use the /standard/ way to fetch data.
select(edb, keys = c("ZBTB16", "BCL2", "BCL2L11"), keytype = "GENENAME",
       columns = c("GENEID", "GENENAME", "TXID", "TXBIOTYPE"))

## Use the filtering system of ensembldb
select(edb, keys = list(GenenameFilter(c("BCL2", "BCL2L11")),
			TxbiotypeFilter("protein_coding")),
       columns = c("GENEID", "GENENAME", "TXID", "TXBIOTYPE"))

## ------------------------------------------------------------------------
## Use the default method, which just returns the first value for multi mappings.
mapIds(edb, keys = c("BCL2", "BCL2L11"), column = "TXID", keytype = "GENENAME")

## Alternatively, specify multiVals="list" to return all mappings.
mapIds(edb, keys = c("BCL2", "BCL2L11"), column = "TXID", keytype = "GENENAME",
       multiVals = "list")

## And, just like before, we can use filters to map only to protein coding transcripts.
mapIds(edb, keys = list(GenenameFilter(c("BCL2", "BCL2L11")),
			TxbiotypeFilter("protein_coding")), column = "TXID",
       multiVals = "list")

## ----eval=FALSE----------------------------------------------------------
## library(ensembldb)
## 
## ## get all human gene/transcript/exon annotations from Ensembl (75)
## ## the resulting tables will be stored by default to the current working
## ## directory
## fetchTablesFromEnsembl(75, species = "human")
## 
## ## These tables can then be processed to generate a SQLite database
## ## containing the annotations (again, the function assumes the required
## ## txt files to be present in the current working directory)
## DBFile <- makeEnsemblSQLiteFromTables()
## 
## ## and finally we can generate the package
## makeEnsembldbPackage(ensdb = DBFile, version = "0.99.12",
## 		     maintainer = "Johannes Rainer <johannes.rainer@eurac.edu>",
## 		     author = "J Rainer")

## ----eval=FALSE----------------------------------------------------------
## ## Load the AnnotationHub data.
## library(AnnotationHub)
## ah <- AnnotationHub()
## 
## ## Query all available files for Ensembl release 77 for
## ## Mus musculus.
## query(ah, c("Mus musculus", "release-77"))
## 
## ## Get the resource for the gtf file with the gene/transcript definitions.
## Gtf <- ah["AH28822"]
## ## Create a EnsDb database file from this.
## DbFile <- ensDbFromAH(Gtf)
## ## We can either generate a database package, or directly load the data
## edb <- EnsDb(DbFile)
## 
## 
## ## Identify and get the FaFile object with the genomic DNA sequence matching
## ## the EnsDb annotation.
## Dna <- getGenomeFaFile(edb)
## library(Rsamtools)
## ## We next retrieve the sequence of all exons on chromosome Y.
## exons <- exons(edb, filter = SeqnameFilter("Y"))
## exonSeq <- getSeq(Dna, exons)
## 
## ## Alternatively, look up and retrieve the toplevel DNA sequence manually.
## Dna <- ah[["AH22042"]]

## ----message=FALSE-------------------------------------------------------
## Generate a sqlite database from a GRanges object specifying
## genes encoded on chromosome Y
load(system.file("YGRanges.RData", package = "ensembldb"))
Y

DB <- ensDbFromGRanges(Y, path=tempdir(), version = 75,
		       organism = "Homo_sapiens")

edb <- EnsDb(DB)
edb

## As shown in the example below, we could make an EnsDb package on
## this DB object using the makeEnsembldbPackage function.

## ----eval=FALSE----------------------------------------------------------
## library(ensembldb)
## 
## ## the GTF file can be downloaded from
## ## ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/
## gtffile <- "Homo_sapiens.GRCh37.75.gtf.gz"
## ## generate the SQLite database file
## DB <- ensDbFromGtf(gtf = gtffile)
## 
## ## load the DB file directly
## EDB <- EnsDb(DB)
## 
## ## alternatively, build the annotation package
## ## and finally we can generate the package
## makeEnsembldbPackage(ensdb = DB, version = "0.99.12",
## 		     maintainer = "Johannes Rainer <johannes.rainer@eurac.edu>",
## 		     author = "J Rainer")