---
title: Reproducing Variant Annotation Results
output:
  BiocStyle::html_document
---

<!-- Copyright 2014 Google Inc. All rights reserved. -->

<!-- Licensed under the Apache License, Version 2.0 (the "License"); -->
<!-- you may not use this file except in compliance with the License. -->
<!-- You may obtain a copy of the License at -->

<!--     http://www.apache.org/licenses/LICENSE-2.0 -->

<!-- Unless required by applicable law or agreed to in writing, software -->
<!-- distributed under the License is distributed on an "AS IS" BASIS, -->
<!-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -->
<!-- See the License for the specific language governing permissions and -->
<!-- limitations under the License. -->

<!--
%% \VignetteEngine{knitr::rmarkdown}
%% \VignetteIndexEntry{Reproducing Variant Annotation Results}
-->

```{r style, echo = FALSE, results = 'asis'}
BiocStyle::markdown()
```

```{r, echo = FALSE}
apiKey <- Sys.getenv("GOOGLE_API_KEY")
if (nchar(apiKey) == 0) {
  warning(paste("To build this vignette, please setup the environment variable",
                "GOOGLE_API_KEY with the public API key from your Google",
                "Developer Console before loading the GoogleGenomics package,",
                "or run GoogleGenomics::authenticate."))
  knitr::knit_exit()
}
```

# Reproducing Variant Annotation Results

Below we compare the results of annotating variants via `r Biocpkg("VariantAnnotation")`.  We compare using data from 1,000 Genomes Phase 1 Variants:
* as parsed from a VCF file
* retrieved from the Google Genomics API

### VCF Data

First we read in the data from the VCF file:
```{r}
suppressPackageStartupMessages(library(VariantAnnotation))
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
vcf <- renameSeqlevels(vcf, c("22"="chr22"))
vcf
```

The file `chr22.vcf.gz` within package VariantAnnotation holds data for 5 of the 1,092 individuals in 1,000 Genomes, starting at position 50300078 and ending at position 50999964.

`HG00096 HG00097 HG00099 HG00100 HG00101`

### Google Genomics Data

Important data differences to note:
* VCF data uses 1-based coordinates but data from the GA4GH APIs is 0-based.
* There are two variants in the Google Genomics copy of 1,000 Genomes phase 1 variants that are not in `chr22.vcf.gz`.  They are the only two variants within the genomic range with `ALT == <DEL>`.

```{r}
# Authenticated on package load from the env variable GOOGLE_API_KEY.
suppressPackageStartupMessages(library(GoogleGenomics))

# TODO Right now we're just getting a few variants.  Later update this to retrieve them all.
system.time({
granges <- getVariants(datasetId="10473108253681171589",
                       chromosome="22",
                       start=50300077,
                       end=50303000,         # TODO end=50999964
                       converter=variantsToGRanges)
})
```

### Compare the Loaded Data
Ensure that the data retrieved by each matches:
```{r}
vcf <- vcf[1:length(granges)] # Truncate the VCF data

suppressPackageStartupMessages(library(testthat))
expect_equal(start(granges), start(vcf))
expect_equal(end(granges), end(vcf))
expect_equal(as.character(granges$REF), as.character(ref(vcf)))
expect_equal(as.character(unlist(granges$ALT)), as.character(unlist(alt(vcf))))
expect_equal(granges$QUAL, qual(vcf))
expect_equal(granges$FILTER, filt(vcf))
```

### Compare the Annotations

Now locate the protein coding variants in each:
```{r}
suppressPackageStartupMessages(library(TxDb.Hsapiens.UCSC.hg19.knownGene))
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene


rd <- rowData(vcf)
vcf_locations <- locateVariants(rd, txdb, CodingVariants())
vcf_locations

granges_locations <- locateVariants(granges, txdb, CodingVariants())
granges_locations

expect_equal(granges_locations, vcf_locations)
```

And predict the effect of the protein coding variants:
```{r}
suppressPackageStartupMessages(library(BSgenome.Hsapiens.UCSC.hg19))
vcf_coding <- predictCoding(vcf, txdb, seqSource=Hsapiens)
vcf_coding

granges_coding <- predictCoding(rep(granges, elementLengths(granges$ALT)),
                                txdb,
                                seqSource=Hsapiens,
                                varAllele=unlist(granges$ALT, use.names=FALSE))

granges_coding

expect_equal(as.matrix(granges_coding$REFCODON), as.matrix(vcf_coding$REFCODON))
expect_equal(as.matrix(granges_coding$VARCODON), as.matrix(vcf_coding$VARCODON))
expect_equal(granges_coding$GENEID, vcf_coding$GENEID)
expect_equal(granges_coding$CONSEQUENCE, vcf_coding$CONSEQUENCE)

```

Add gene information:
```{r}
suppressPackageStartupMessages(library(org.Hs.eg.db))
annots <- select(org.Hs.eg.db,
                 keys=granges_coding$GENEID,
                 keytype="ENTREZID",
                 columns=c("SYMBOL", "GENENAME","ENSEMBL"))
cbind(elementMetadata(granges_coding), annots)
```

### Provenance
Package versions used:
```{r}
sessionInfo()
```

