%\VignetteIndexEntry{HIBAG vignette pdf}
%\VignetteKeywords{HLA, MHC, Imputation, SNP, GWAS}
%\VignettePackage{HIBAG}
%\VignetteEngine{knitr::knitr}

\documentclass[11pt]{article}

<<style, eval=TRUE, echo=FALSE, results="asis">>=
BiocStyle::latex()
@ 

\usepackage{longtable}

\begin{document}

\bioctitle[HLA Allele Imputation]{
    HIBAG -- an R Package for HLA Genotype Imputation with Attribute Bagging
}
\author{
    Xiuwen Zheng
}
\date{Apr 10, 2015}


\maketitle
\tableofcontents

%\SweaveOpts{keep.source=TRUE, eps=FALSE}

% ---- Overview ----
\section{Overview}

The human leukocyte antigen (HLA) system, located in the major
histocompatibility complex (MHC) on chromosome 6p21.3, is highly polymorphic.
This region has been shown to be important in human disease, adverse drug
reactions and organ transplantation \cite{Shiina09}. HLA genes play a
role in the immune system and autoimmunity as they are central to the
presentation of antigens for recognition by T cells. Since they have to
provide defense against a great diversity of environmental microbes, HLA
genes must be able to present a wide range of peptides. Evolutionary pressure
at these loci have given rise to a great deal of functional diversity. For
example, the {\em HLA--B} locus has 1,898 four-digit alleles listed in the
April 2012 release of the IMGT-HLA Database
\cite{Robinson13}
(\url{http://www.ebi.ac.uk/imgt/hla/}).

Classical HLA genotyping methodologies have been predominantly developed
for tissue typing purposes, with sequence based typing (SBT) approaches
currently considered the gold standard. While there is widespread availability
of vendors offering HLA genotyping services, the complexities involved in
performing this to the standard required for diagnostic purposes make using a
SBT approach time-consuming and cost-prohibitive for most research studies
wishing to look in detail at the involvement of classical HLA genes in disease.

Here we introduce a new prediction method for {\bf H}LA {\bf I}mputation using
attribute {\bf BAG}ging, HIBAG, that is highly accurate, computationally
tractable, and can be used with published parameter estimates, eliminating
the need to access large training samples
\cite{Zheng13}.
It relies on a training set with known HLA and SNP genotypes, and combines
the concepts of attribute bagging with haplotype inference from unphased SNPs
and HLA types. Attribute bagging is a technique for improving the accuracy and
stability of classifier ensembles using bootstrap aggregating and random
variable selection \cite{Breiman96,
Breiman01, Bryll03}. In this case, individual
classifiers are created which utilize a subset of SNPs to predict HLA types
and haplotype frequencies estimated from a training data set of SNPs and HLA
types. Each of the classifiers employs a variable selection algorithm with a
random component to select a subset of the SNPs. HLA type predictions are
determined by maximizing the average posterior probabilities from all
classifiers.



% ---- Features ----
\section{Features}

\begin{enumerate}
    \item HIBAG can be used by researchers with published parameter estimates
        (\url{http://www.biostat.washington.edu/~bsweir/HIBAG/}) instead of
        requiring access to large training sample datasets.
    \item A typical HIBAG parameter file contains only haplotype frequencies
        at different SNP subsets rather than individual training genotypes.
    \item SNPs within the xMHC region (chromosome 6) are used for imputation.
    \item HIBAG employs unphased genotypes of unrelated individuals as a
        training set.
    \item HIBAG supports parallel computing with R.
\end{enumerate}



% ---- Examples ----
\section{Examples}

<<library>>=
library(HIBAG)
@

%     ---- Pre-fit HIBAG models for HLA imputation ----
\subsection{Pre-fit HIBAG models for HLA imputation}

<<plot, fig.width=8, fig.height=4, fig.align='center'>>=
# load the published parameter estimates from European ancestry
#   e.g., filename <- "HumanOmniExpressExome-European-HLA4-hg19.RData"
#   here, we use example data in the package
filename <- system.file("extdata", "ModelList.RData", package="HIBAG")
model.list <- get(load(filename))

# HLA imputation at HLA-A
hla.id <- "A"
model <- hlaModelFromObj(model.list[[hla.id]])
summary(model)

# SNPs in the model
head(model$snp.id)

head(model$snp.position)

# plot SNP information
plot(model)
@

<<eval=FALSE>>=
#########################################################################
# Import your PLINK BED file
#
yourgeno <- hlaBED2Geno(bed.fn=".bed", fam.fn=".fam", bim.fn=".bim")
summary(yourgeno)

# best-guess genotypes and all posterior probabilities
pred.guess <- predict(model, yourgeno, type="response+prob")
summary(pred.guess)

pred.guess$value
pred.guess$postprob
@


%     ---- Build a HIBAG Model for HLA Genotype Imputation ----
\subsection{Build a HIBAG Model for HLA Genotype Imputation}

<<build-model, eval=FALSE>>=
library(HIBAG)

# load HLA types and SNP genotypes in the package
data(HLA_Type_Table, package="HIBAG")
data(HapMap_CEU_Geno, package="HIBAG")

# a list of HLA genotypes
# e.g., train.HLA <- hlaAllele(sample.id, H1=c("01:02", "05:01", ...),
#                        H2=c("02:01", "03:01", ...), locus="A")
#     the HLA type of the first individual is 01:02/02:01,
#                 the second is 05:01/03:01, ...
hla.id <- "A"
train.HLA <- hlaAllele(HLA_Type_Table$sample.id,
    H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")],
    H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")],
    locus=hla.id, assembly="hg19")

# selected SNPs:
#    the flanking region of 500kb on each side,
#    or an appropriate flanking size without sacrificing predictive accuracy
region <- 500   # kb
snpid <- hlaFlankingSNP(HapMap_CEU_Geno$snp.id,
    HapMap_CEU_Geno$snp.position, hla.id, region*1000, assembly="hg19")
length(snpid)

# training genotypes
#   import your PLINK BED file,
#   e.g., train.geno <- hlaBED2Geno(bed.fn=".bed", fam.fn=".fam", bim.fn=".bim")
#   or,
train.geno <- hlaGenoSubset(HapMap_CEU_Geno,
    snp.sel = match(snpid, HapMap_CEU_Geno$snp.id))
summary(train.geno)

# train a HIBAG model
set.seed(100)
model <- hlaAttrBagging(train.HLA, train.geno, nclassifier=100)
@
<<echo=FALSE>>=
mobj <- get(load(system.file("extdata", "ModelList.RData", package="HIBAG")))
model <- hlaModelFromObj(mobj$A)
@
<<>>=
summary(model)
@


%     ---- Build and Predict in Parallel ----
\subsection{Build and Predict in Parallel}

<<build-model-parallel, eval=FALSE>>=
library(HIBAG)
library(parallel)

# load HLA types and SNP genotypes in the package
data(HLA_Type_Table, package="HIBAG")
data(HapMap_CEU_Geno, package="HIBAG")

# a list of HLA genotypes
# e.g., train.HLA <- hlaAllele(sample.id, H1=c("01:02", "05:01", ...),
#                        H2=c("02:01", "03:01", ...), locus="A")
#     the HLA type of the first individual is 01:02/02:01,
#                 the second is 05:01/03:01, ...
hla.id <- "A"
train.HLA <- hlaAllele(HLA_Type_Table$sample.id,
    H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")],
    H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")],
    locus=hla.id, assembly="hg19")

# selected SNPs:
#    the flanking region of 500kb on each side,
#    or an appropriate flanking size without sacrificing predictive accuracy
region <- 500   # kb
snpid <- hlaFlankingSNP(HapMap_CEU_Geno$snp.id,
    HapMap_CEU_Geno$snp.position, hla.id, region*1000, assembly="hg19")
length(snpid)

# training genotypes
#   import your PLINK BED file,
#   e.g., train.geno <- hlaBED2Geno(bed.fn=".bed", fam.fn=".fam", bim.fn=".bim")
#   or,
train.geno <- hlaGenoSubset(HapMap_CEU_Geno,
    snp.sel = match(snpid, HapMap_CEU_Geno$snp.id))
summary(train.geno)


# Create an environment with an appropriate cluster size,
#   e.g., 2 -- # of (CPU) cores
cl <- makeCluster(2)

# Building ...
set.seed(1000)
hlaParallelAttrBagging(cl, train.HLA, train.geno, nclassifier=100,
    auto.save="AutoSaveModel.RData", stop.cluster=TRUE)
model.obj <- get(load("AutoSaveModel.RData"))
model <- hlaModelFromObj(model.obj)
summary(model)
@

<<echo=FALSE>>=
unlink("AutoSaveModel.RData", force=TRUE)
@


%     ---- Evaluate Overall Accuracy, Sensitivity, Specificity, etc ----
\subsection{Evaluate Overall Accuracy, Sensitivity, Specificity, etc}

The function \Rfunction{hlaReport()} can be used to automatically generate
a tex or HTML report when a validation dataset is available.

<<evaluate>>=
library(HIBAG)

# load HLA types and SNP genotypes in the package
data(HLA_Type_Table, package="HIBAG")
data(HapMap_CEU_Geno, package="HIBAG")

# make a list of HLA types
hla.id <- "A"
hla <- hlaAllele(HLA_Type_Table$sample.id,
    H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")],
    H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")],
    locus=hla.id, assembly="hg19")

# divide HLA types randomly
set.seed(100)
hlatab <- hlaSplitAllele(hla, train.prop=0.5)
names(hlatab)
summary(hlatab$training)
summary(hlatab$validation)

# SNP predictors within the flanking region on each side
region <- 500   # kb
snpid <- hlaFlankingSNP(HapMap_CEU_Geno$snp.id,
    HapMap_CEU_Geno$snp.position, hla.id, region*1000, assembly="hg19")
length(snpid)

# training and validation genotypes
train.geno <- hlaGenoSubset(HapMap_CEU_Geno,
    snp.sel = match(snpid, HapMap_CEU_Geno$snp.id),
    samp.sel = match(hlatab$training$value$sample.id,
        HapMap_CEU_Geno$sample.id))
summary(train.geno)

test.geno <- hlaGenoSubset(HapMap_CEU_Geno, samp.sel=match(
    hlatab$validation$value$sample.id, HapMap_CEU_Geno$sample.id))
@
<<eval=FALSE>>=
# train a HIBAG model
set.seed(100)
model <- hlaAttrBagging(hlatab$training, train.geno, nclassifier=100)
@
<<echo=FALSE>>=
mobj <- get(load(system.file("extdata", "OutOfBag.RData", package="HIBAG")))
model <- hlaModelFromObj(mobj)
@

<<>>=
summary(model)

# validation
pred <- predict(model, test.geno)

# compare
comp <- hlaCompareAllele(hlatab$validation, pred, allele.limit=model,
    call.threshold=0)
comp$overall
@

Output to plain text format:
<<out-text>>=
# report overall accuracy, per-allele sensitivity, specificity, etc
hlaReport(comp, type="txt")
@

\setlength{\LTcapwidth}{7in}
Output to a tex file, and please add ``\textbackslash usepackage\{longtable\}''
to your tex file:
<<out-tex, results="asis">>=
# report overall accuracy, per-allele sensitivity, specificity, etc
hlaReport(comp, type="tex", header=FALSE)
@


%     ---- Release HIBAG Models without Confidential Information ----
\subsection{Release HIBAG Models without Confidential Information}

<<release-model, eval=FALSE>>=
library(HIBAG)

# make a list of HLA types
hla.id <- "DQA1"
hla <- hlaAllele(HLA_Type_Table$sample.id,
    H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")],
    H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")],
    locus=hla.id, assembly="hg19")

# training genotypes
region <- 500   # kb
snpid <- hlaFlankingSNP(HapMap_CEU_Geno$snp.id, HapMap_CEU_Geno$snp.position,
    hla.id, region*1000, assembly="hg19")
train.geno <- hlaGenoSubset(HapMap_CEU_Geno,
    snp.sel = match(snpid, HapMap_CEU_Geno$snp.id),
    samp.sel = match(hla$value$sample.id, HapMap_CEU_Geno$sample.id))

set.seed(1000)
model <- hlaAttrBagging(hla, train.geno, nclassifier=100)
summary(model)

# remove unused SNPs and sample IDs from the model
mobj <- hlaPublish(model,
    platform = "Illumina 1M Duo",
    information = "Training set -- HapMap Phase II",
    warning = NULL,
    rm.unused.snp=TRUE, anonymize=TRUE)

save(mobj, file="Your_HIBAG_Model.RData")
@

<<echo=FALSE>>=
unlink("Your_HIBAG_Model.RData", force=TRUE)
@


%     ---- Release a Collection of HIBAG Models ----
\subsection{Release a Collection of HIBAG Models}

<<eval=FALSE>>=
# assume the HIBAG models are stored in R objects: mobj.A, mobj.B, ...

ModelList <- list()
ModelList[["A"]] <- mobj.A
ModelList[["B"]] <- mobj.B
...

# save to an R data file
save(ModelList, file="HIBAG_Model_List.RData")
@


% ---- Resources ----
\section{Resources}

\begin{enumerate}
    \item Allele Frequency Net Database (AFND):
        \url{http://www.allelefrequencies.net}.
    \item IMGT/HLA Database: \url{http://www.ebi.ac.uk/imgt/hla}.
    \item HLA Nomenclature: G Codes
        (\url{http://hla.alleles.org/alleles/g_groups.html}) and P Codes
        (\url{http://hla.alleles.org/alleles/p_groups.html}) for reporting
        of ambiguous allele typings.
\end{enumerate}


\section{Session Info}

<<sessioninfo, results="asis">>=
toLatex(sessionInfo())
@


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\bibliography{HIBAG_Ref}


\end{document}
