\documentclass[a4paper]{article}
\usepackage{natbib}
\usepackage[latin1]{inputenc}
\usepackage{authblk}
\let\proglang=\textsf
\let\pkg=\textsf
% \VignetteIndexEntry{RDAVIDWebService: a versatile R interface to DAVID}
% \VignetteKeyword{DAVID Bioinformatics}
% \VignetteKeyword{web service}

\title{\pkg{RDAVIDWebService}: a versatile \proglang{R} interface to DAVID}
\author[1,2]{Crist\'{o}bal Fresno}
\author[1,2]{Elmer A. Fern\'{a}ndez}
\affil[1]{Bio-science Data Mining Group, Catholic University of C\'{o}rdoba, 
  C\'{o}rdoba, Argentina.}
\affil[2]{CONICET, Argentina}

\begin{document}
\SweaveOpts{concordance=TRUE}
\maketitle
\begin{abstract}
\textbf{Summary}: the \pkg{RDAVIDWebService} package provides a class-based 
interface from \proglang{R} programs/scripts to fully access/control the
Database for Annotation, Visualization and Integrated Discovery (DAVID), without
the need for human interaction on its website (david.abcc.ncifcrf.gov). The
library enhances DAVID capabilities for Gene Ontology analysis by means of
\pkg{GOstats}-based directed acyclic graph conversion methods, in addition to the
usual many-genes-to-many-terms visualization.
\\
\textbf{Availability}: \pkg{RDAVIDWebService} is available as an \proglang{R} 
package from the Bioconductor project (www.bioconductor.org) and on the authors'
website (www.bdmg.com.ar).
\\
\textbf{Contact}: cfresno@bdmg.com.ar or efernandez@bdmg.com.ar
\end{abstract}

\section{Introduction}
One of the most accessed systems for functional genomics/proteomics analysis is 
the database for annotation, visualization and integrated discovery (DAVID), a
web-based online bioinformatics resource (http://david.abcc.ncifcrf.gov) that
aims to provide tools for functional interpretation of large lists of
genes/proteins \citep{DAVID}. It is accessed mainly through its website.
There is also a uniform resource locator (URL)-based application programming
interface (API) to query DAVID programmatically, accessible through the
\pkg{DAVIDQuery} \proglang{R} package \citep{DAVIDQuery}. However, the URL-API
has limited capabilities: it is constrained by the maximum URL length and only
works with the default settings. In the year 2012, a web service interface was made available allowing
full access and control on all its functionalities except visualization
\citep{DAVIDWS}. Although it is possible to handle DAVID web services (DWS)
through \proglang{R}, it requires advanced programming skills. In addition, query results are
very difficult to manage since they are XML (\pkg{SSOAP} package, \cite{SSOAP})
or \proglang{Java} objects (\pkg{rJava} package, \cite{rJava}).
\\
Here we provide a versatile class-based \proglang{R} interface to access DAVID. 
It is an \proglang{R} wrapper to all DWS functionalities, with several new
features such as off-line processing (allows using previously queried saved
reports) and native \proglang{R} class data types. Additionally it overcomes
DWS visualization constraints, providing the usual many-genes-to-many-terms
feature, and enhances DAVID capabilities for Gene Ontology (GO, \cite{GO})
analysis by means of \pkg{GOstats}-based \citep{GOstats} directed acyclic graph
(DAG) conversion methods. Therefore, it expands DAVID features by allowing new
developments through one of the most used computer languages in Bioinformatics,
R \citep{R}.

\section{Implementation}
The package implements \emph{RDAVIDWebService}, a reference class object by
means of R5 paradigm, for DWS communication through a \proglang{Java} client
(\emph{RDAVIDWebServiceStub}). This allows the establishment of a unique user
access point (see Figure \ref{fig:RtoJava}).
\begin{figure}[!hb]
  \begin{center}
    \includegraphics[width=\textwidth]{RtoJava}
  \end{center}
  \caption{\proglang{R}-to-\proglang{Java} class diagram.
\texttt{RDAVIDWebService} uses \pkg{rJava} to access/control DAVID web service
through \texttt{RDAVIDWebServiceStub} and parse the reports back to \proglang{R}
by \texttt{DAVIDParser}. Note that operation signatures have been omitted for
simplicity.}
  \label{fig:RtoJava}
\end{figure}
\\
In order to reduce \proglang{Java}-to-\proglang{R} handshaking due to parsing 
data structures (a time consuming computational task), the provided Java-based
file report client was enhanced (\emph{DAVIDParser}) to allow formatting of all
the DAVID outputs into appropriate \pkg{RDAVIDWebService} S4 \proglang{R}
classes (see Figure~\ref{fig:Rclasses} and Section~\ref{sec:packageoverview}).
This speeds up the bottleneck data importation process. In addition, it allows
DAVID query results to be saved locally to file for further analysis, as well as
the use of website-generated reports. Thus, it permits using web service and
website query results interchangeably.
\begin{figure}[!ht]
  \begin{center}
    \includegraphics[width=\textwidth]{inR}
  \end{center}
  \caption{Hierarchy \proglang{R} class diagram. Note that operation signatures
have been omitted for simplicity.}
  \label{fig:Rclasses}
\end{figure}

\section{Features}
\begin{enumerate}
  \item \emph{Ease of Use}: it provides a uniform framework to access DAVID
analysis straight from \proglang{R} without the need for ad hoc parsing of
queried reports.
  \item \emph{Data import/export}: results from DAVID can be accessed through 
\proglang{R} or also generated on the website. In both cases they are stored in
the same format for later use. This permits on/off-line processing capabilities
within \proglang{R}. Hence, queried reports generated anytime and anywhere can
be processed later, without the need to redo the upload to DAVID.
  \item \emph{Visualization}: customizable many-genes-to-many-terms 2D 
relationship views are also available with the \pkg{ggplot2} package
\citep{ggplot2}.
  \item \emph{Gene Ontology structure}: DAVID set enrichment analysis (SEA) or 
modular enrichment (MEA) results can be mapped into \pkg{GOstats}-based directed
acyclic graphs. This enables visualization of EASE score-based enriched
biological process (BP), molecular function (MF) and cellular component (CC) GO
terms in the DAGs. Thus, the exploration and analysis of blurred pattern
presence is facilitated, compared to the usual tabular format.
\end{enumerate}

\section{Package overview}
\label{sec:packageoverview}
The package can be conceptually divided into two parts: 
\begin{itemize}
  \item \textbf{Connectivity}: a wrapper to DAVID web service for the basic
workflow: (1) upload gene/protein ids as gene/background list/s; (2) check
DAVID's status for mapped/unmapped genes in the uploaded list/s or look up the
available categories, etc.; (3) select the background/species and categories to
use in the present analysis; and (4) get the different reports, which include
Functional Annotation Chart/Table/Clustering and so on.
  \item \textbf{Exploration}: on/off-line report import of the different
results into an \proglang{R} object hierarchy (see Figure \ref{fig:Rclasses}), ready to use with the user's favourite CRAN \citep{CRAN} or Bioconductor \citep{Bioconductor} package/s. In addition, DWS capabilities are enhanced with the incorporation of the usual many-genes-to-many-terms 2D relationship visualization available at DAVID's website, and the new feature which generates the induced Gene Ontology \pkg{GOstats} directed acyclic graph, in order to get the big biological picture, as we will show in section \ref{sec:exploration}.
\end{itemize}

\subsection{Connectivity example}
\pkg{RDAVIDWebService} requires a registered DAVID user (this is a prerequisite 
to use DWS). By means of the registered institutional e-mail, the user can build
a \texttt{DAVIDWebService} object and establish a connection. Then, a gene list
should be uploaded providing a name and type of list. Here, the one provided in
the DAVID website is used (\texttt{demoList1} with Affymetrix\textregistered{}
identifiers).
\\
\textbf{Note}: the following code will not run unless you replace the \texttt{"user@inst.org"} e-mail with the one of your registered DAVID account.

\begin{Schunk}
\begin{Sinput}
R> library("RDAVIDWebService")
R> david<-DAVIDWebService$new(email="user@inst.org")
R> data(demoList1)
R> result<-addList(david, demoList1,
+ idType="AFFYMETRIX_3PRIME_IVT_ID",
+ listName="demoList1", listType="Gene")
R> result
\end{Sinput}
\begin{Soutput}
$inDavid
[1] 0.9695122

$unmappedIds
[1] "34902_at"   "1937_at"    "35996_at"   "32163_f_at" "32407_f_at"
\end{Soutput}
\end{Schunk}

The \texttt{result} output shows that 96.95\% of the identifiers in
\texttt{demoList1} are recognized by DAVID (\texttt{\$inDavid}). In addition, this object
also contains the five \texttt{\$unmappedIds}. On the other hand, the status of
the connection is saved in the \texttt{david} object.

\newpage

\begin{Schunk}
\begin{Sinput}
R> david
\end{Sinput}
\begin{Soutput}
DAVIDWebService object to access DAVID's website.
User email: user@inst.org
Available Gene List/s:
       Name Using
1 demoList1     *
Available Specie/s:
               Name Using
1 Homo sapiens(155)     *
Available Background List/s:
          Name Using
1 Homo sapiens     *
\end{Soutput}
\end{Schunk}

In this example,  155 genes corresponding to \emph{Homo sapiens} are present in 
\texttt{demoList1}. The complete genome is selected as the default background,
but the user can upload their own by setting \texttt{listType="Background"}. If
required, the user can select which annotation category to use, e.g.
\texttt{GOTERM\_BP\_ALL}, \texttt{GOTERM\_MF\_ALL} and \texttt{GOTERM\_CC\_ALL}.

\begin{Schunk}
\begin{Sinput}
R> setAnnotationCategories(david, c("GOTERM_BP_ALL", 
+ "GOTERM_MF_ALL", "GOTERM_CC_ALL"))
\end{Sinput}
\end{Schunk}

Now that everything is in order, the user can get the different reports to use
right away or to save into a file for future recall. For example, the
Functional Annotation Clustering can be obtained on-line as the \texttt{termCluster}
object, or as the \texttt{termClusterReport1.tab} file, by invoking:

\begin{Schunk}
\begin{Sinput}
R> termCluster<-getClusterReport(david, type="Term")
R> getClusterReportFile(david, type="Term",
+ fileName="termClusterReport1.tab")
\end{Sinput}
\end{Schunk}

\subsection{Exploration example}
\label{sec:exploration}
<<General R options for Sweave, echo=false, results=hide>>=
options(prompt="R> ", continue="+  ", width=70, useFancyQuotes=FALSE, digits=4)
@

Hereafter, we assume that at some point \texttt{demoList1} has been used and every report has been saved into files (data available in the package).
\\
A user can obtain the functional annotation cluster report of \texttt{demoList1}
and inspect the results using the following code:

<<Loading library, echo=false, results=hide>>=
suppressMessages(library("RDAVIDWebService"))
@

<<TermCluster1, echo=true>>=
library("RDAVIDWebService")
fileName<-system.file("files/termClusterReport1.tab.tar.gz",
  package="RDAVIDWebService")
untar(fileName)
termCluster<-DAVIDTermCluster(untar(fileName, list=TRUE))
termCluster
head(summary(termCluster))
@

Here, \texttt{termCluster} is an object from class \texttt{DAVIDTermCluster}
with the corresponding \texttt{AnnotationCluster} report data of 
\texttt{demoList1}, where \Sexpr{length(cluster(termCluster))} clusters are
found. Then, \texttt{head(summary(termCluster))} can provide a superficial view
of the \texttt{Enrichment} Score reached in each cluster and how many
\texttt{Members} are present. The user can visually explore the 2D view of a
particular cluster (e.g. the second on Figure \ref{fig:2Dview}).  

\begin{figure}[!hbp]
<<plot2Dview, echo=true, results=hide, fig=TRUE, include=false>>=
clustNumber<-2
plot2D(termCluster, clustNumber)
@
\begin{center}
  \includegraphics[width=0.95\textwidth]{RDavidWS-vignette-plot2Dview}
\end{center}
  \caption{Functional annotation cluster exploration, using a 2D-view of the 
evidence of the four term/category members present in the second cluster.}
  \label{fig:2Dview}
\end{figure}

However, in Figure \ref{fig:2Dview}, the four term/category members of this 
cluster share all the ids at ``extracellular region part'' (upper row). But, as
we go down towards the bottom row (structural molecule activity) only nine ids
have evidence related to it. In this view, the hierarchical structure of GO is
not considered nor the members that are enriched or not (default option).
Therefore, the user can extend DAVID's features obtaining the associated induced
DAG structure of the cluster (\texttt{DAVIDGODag}) and contextualize it using
\pkg{GOstats} functionalities (\texttt{plotGOTermGraph}, see Figure
\ref{fig:GO}).

\begin{figure}[!ht]
<<plotGO, echo=true, results=hide, fig=TRUE, include=false>>=
davidGODag<-DAVIDGODag(members(termCluster)[[clustNumber]], 
  pvalueCutoff=0.1, "CC")
plotGOTermGraph(g=goDag(davidGODag),
  r=davidGODag, max.nchar=40, node.shape="ellipse")
@
\begin{center}
  \includegraphics[width=0.9\textwidth]{RDavidWS-vignette-plotGO}
\end{center}
  \caption{Gene Ontology directed acyclic graph induced by cluster two in Figure
\ref{fig:2Dview}. Terms with an EASE score $<$ 0.1 are shown in grey. In addition,
the ratio between genes on the list vs. background reference is displayed. Since
no information is available regarding the other terms into the mapped GO
structure from the cluster, NAs (not available) are introduced when required.}
  \label{fig:GO}
\end{figure}

\section{Troubleshooting}
Sometimes Apache Axis' default parameters need to be changed in order to use
\pkg{RDAVIDWebService}. For example, if the DAVID web service is busy, the default
timeout may not be enough. Then you can inspect it and change it by calling:

\begin{Schunk}
\begin{Sinput}
R> getTimeout(david)
\end{Sinput}
\begin{Soutput}
[1] 30000
\end{Soutput}
\begin{Sinput}
R> setTimeout(david, 50000)
R> getTimeout(david)
\end{Sinput}
\begin{Soutput}
[1] 50000
\end{Soutput}
\end{Schunk}

Another reported parameter that might need to be changed is the transport
protocol, if you get the following error when uploading a gene list:

\begin{Schunk}
\begin{Soutput}
org.apache.axis2.AxisFault: Transport error: 501 Error: Not Implemented
\end{Soutput}
\end{Schunk}

Then you can change the appropriate parameter value on the client side:

\begin{Schunk}
\begin{Sinput}
R> setHttpProtocolVersion(david, "HTTP/1.0")
R> getHttpProtocolVersion(david)
\end{Sinput}
\begin{Soutput}
[1] "HTTP/1.0"
\end{Soutput}
\end{Schunk}


\section*{Acknowledgements}
\emph{Funding}: this work was supported by the National University of Villa 
Maria [31/0186 to E.F. and 31/0187 to E.F.] and Catholic University of
C\'{o}rdoba, Argentina.

\bibliographystyle{apalike} 
\bibliography{RDavidWS}

\section*{Session Info}

<<Session Info, echo=true>>=
sessionInfo()
@

\end{document}
