## ----style, echo = FALSE, results = 'asis'------------------------------------
BiocStyle::markdown()

## ----global_options, include=FALSE--------------------------------------------------------------------------
knitr::opts_chunk$set(fig.width=10, fig.height=7, warning=FALSE, message=FALSE)
.libPaths(c(.libPaths(), "/r1/people/steffi_grote/R/x86_64-pc-linux-gnu-library/3.3"))
options(width=110)
set.seed(123)

## ---- eval=FALSE--------------------------------------------------------------------------------------------
#  ## install annotation package 'Homo.sapiens' from bioconductor
#  if (!requireNamespace("BiocManager", quietly=TRUE))
#      install.packages("BiocManager")
#  BiocManager::install('Homo.sapiens')

## -----------------------------------------------------------------------------------------------------------
## load GOfuncR package
library(GOfuncR)
## create input dataframe with candidate genes
gene_ids = c('NCAPG', 'APOL4', 'NGFR', 'NXPH4', 'C21orf59', 'CACNG2', 'AGTR1',
    'ANO1', 'BTBD3', 'MTUS1', 'CALB1', 'GYG1', 'PAX2')
input_hyper = data.frame(gene_ids, is_candidate=1)
input_hyper

## ---- results='hide'----------------------------------------------------------------------------------------
## run enrichment analysis (n_randets=100 lowers compuation time
## compared to default 1000)
res_hyper = go_enrich(input_hyper, n_randset=100)

## -----------------------------------------------------------------------------------------------------------
## first element of go_enrich result has the stats
stats = res_hyper[[1]]
## top-GO categories
head(stats)
## top GO-categories per domain
by(stats, stats$ontology, head, n=3)

## -----------------------------------------------------------------------------------------------------------
## all valid input genes
head(res_hyper[[2]])

## -----------------------------------------------------------------------------------------------------------
## annotation package used (default='Homo.sapiens') and GO-graph version
res_hyper[[3]]

## -----------------------------------------------------------------------------------------------------------
## minimum p-values from randomsets
head(res_hyper[[4]])

## -----------------------------------------------------------------------------------------------------------
## create input dataframe with candidate and background genes
candi_gene_ids = c('NCAPG', 'APOL4', 'NGFR', 'NXPH4', 'C21orf59', 'CACNG2', 
    'AGTR1', 'ANO1', 'BTBD3', 'MTUS1', 'CALB1', 'GYG1', 'PAX2')
bg_gene_ids = c('FGR', 'NPHP1', 'DRD2', 'ABCC10', 'PTBP2', 'JPH4', 'SMARCC2',
    'FN1', 'NODAL', 'CYP1A2', 'ACSS1', 'CDHR1', 'SLC25A36', 'LEPR', 'PRPS2',
    'TNFAIP3', 'NKX3-1', 'LPAR2', 'PGAM2')
is_candidate = c(rep(1,length(candi_gene_ids)), rep(0,length(bg_gene_ids)))
input_hyper_bg = data.frame(gene_ids = c(candi_gene_ids, bg_gene_ids),
    is_candidate)
head(input_hyper_bg)
tail(input_hyper_bg)

## ---- results='hide'----------------------------------------------------------------------------------------
res_hyper_bg = go_enrich(input_hyper_bg, n_randsets=100)


## -----------------------------------------------------------------------------------------------------------
head(res_hyper_bg[[1]])

## ---- eval=FALSE--------------------------------------------------------------------------------------------
#  ## test input genes again with correction for gene length
#  res_hyper_len = go_enrich(input_hyper, gene_len=TRUE)

## -----------------------------------------------------------------------------------------------------------
## create input vector with a candidate region on chromosome 8
## and background regions on chromosome 7, 8 and 9
regions = c('8:81000000-83000000', '7:1300000-56800000', '7:74900000-148700000',
    '8:7400000-44300000', '8:47600000-146300000', '9:0-39200000',
    '9:69700000-140200000')
is_candidate = c(1, rep(0,6))
input_regions = data.frame(regions, is_candidate)
input_regions

## ----results='hide'-----------------------------------------------------------------------------------------
## run GO-enrichment analysis for genes in the candidate region
res_region = go_enrich(input_regions, n_randsets=100, regions=TRUE)

## -----------------------------------------------------------------------------------------------------------
stats_region = res_region[[1]]
head(stats_region)
## see which genes are located in the candidate region
input_genes = res_region[[2]]
candidate_genes = input_genes[input_genes[,2]==1, 1]
candidate_genes

## -----------------------------------------------------------------------------------------------------------
## create input dataframe with scores in second column
high_score_genes = c('GCK', 'CALB1', 'PAX2', 'GYS1','SLC2A8', 'UGP2', 'BTBD3',
    'MTUS1', 'SYP', 'PSEN1')
low_score_genes = c('CACNG2', 'ANO1', 'ZWINT', 'ENGASE', 'HK2', 'PYGL', 'GYG1')
gene_scores = c(runif(length(high_score_genes), 0.5, 1),
    runif(length(low_score_genes), 0, 0.5))
input_willi = data.frame(gene_ids = c(high_score_genes, low_score_genes),
    gene_scores)
head(input_willi)

## ---- results='hide'----------------------------------------------------------------------------------------
res_willi = go_enrich(input_willi, test='wilcoxon', n_randsets=100)

## -----------------------------------------------------------------------------------------------------------
head(res_willi[[1]])

## -----------------------------------------------------------------------------------------------------------
## create a toy example dataset with two counts per gene
high_A_genes = c('G6PD', 'GCK', 'GYS1', 'HK2', 'PYGL', 'SLC2A8', 'UGP2',
    'ZWINT', 'ENGASE')
low_A_genes = c('CACNG2', 'AGTR1', 'ANO1', 'BTBD3', 'MTUS1', 'CALB1', 'GYG1',
    'PAX2')
A_counts = c(sample(20:30, length(high_A_genes)),
    sample(5:15, length(low_A_genes)))
B_counts = c(sample(5:15, length(high_A_genes)),
    sample(20:30, length(low_A_genes)))
input_binom = data.frame(gene_ids=c(high_A_genes, low_A_genes), A_counts,
    B_counts)
head(input_binom)

## -----------------------------------------------------------------------------------------------------------
## run binomial test, excluding the 'biological_process' domain,
## suppress output to screen
res_binom = go_enrich(input_binom, test='binomial', n_randsets=100,
    silent=TRUE, domains=c('molecular_function', 'cellular_component'))
head(res_binom[[1]])

## -----------------------------------------------------------------------------------------------------------
## create a toy example with four counts per gene
high_substi_genes = c('G6PD', 'GCK', 'GYS1', 'HK2', 'PYGL', 'SLC2A8', 'UGP2',
    'ZWINT', 'ENGASE')
low_substi_genes = c('CACNG2', 'AGTR1', 'ANO1', 'BTBD3', 'MTUS1', 'CALB1',
    'GYG1', 'PAX2', 'C21orf59')
subs_non_syn = c(sample(5:15, length(high_substi_genes), replace=TRUE),
    sample(0:5, length(low_substi_genes), replace=TRUE))
subs_syn = sample(5:15, length(c(high_substi_genes, low_substi_genes)),
    replace=TRUE)
vari_non_syn = c(sample(0:5, length(high_substi_genes), replace=TRUE),
    sample(0:10, length(low_substi_genes), replace=TRUE))
vari_syn = sample(5:15, length(c(high_substi_genes, low_substi_genes)),
    replace=TRUE)
input_conti = data.frame(gene_ids=c(high_substi_genes, low_substi_genes),
    subs_non_syn, subs_syn, vari_non_syn, vari_syn)
head(input_conti)

## the corresponding contingency table for the first gene would be:
matrix(input_conti[1, 2:5], ncol=2,
    dimnames=list(c('non-synonymous', 'synonymous'),
    c('substitution','variable')))

## ---- results='hide'----------------------------------------------------------------------------------------
res_conti = go_enrich(input_conti, test='contingency', n_randset=100)

## -----------------------------------------------------------------------------------------------------------
head(res_conti[[1]])

## ---- eval=FALSE--------------------------------------------------------------------------------------------
#  ## perform enrichment analysis for mouse genes
#  ## ('Mus.musculus' has to be installed)
#  mouse_gene_ids = c('Gck', 'Gys1', 'Hk2', 'Pygl', 'Slc2a8', 'Ugp2', 'Zwint',
#      'Engase')
#  input_hyper_mouse = data.frame(mouse_gene_ids, is_candidate=1)
#  res_hyper_mouse = go_enrich(input_hyper_mouse, organismDb='Mus.musculus')
#  

## ---- eval=FALSE--------------------------------------------------------------------------------------------
#  ## perform enrichment analysis for chimp genes
#  ## ('org.Pt.eg.db' has to be installed)
#  chimp_gene_ids = c('SIAH1', 'MIIP', 'ELP3', 'CFB', 'ADARB1', 'TRNT1',
#      'DEFB124', 'OR1A1', 'TYR', 'HOXA7')
#  input_hyper_chimp = data.frame(chimp_gene_ids, is_candidate=1)
#  res_hyper_chimp = go_enrich(input_hyper_chimp, orgDb='org.Pt.eg.db')
#  

## ---- eval=FALSE--------------------------------------------------------------------------------------------
#  ## perform enrichment analysis for chimp genes
#  ## and account for gene-length in FWER
#  ## (needs 'org.Pt.eg.db' and 'TxDb.Ptroglodytes.UCSC.panTro4.refGene' installed)
#  res_hyper_chimp_genelen = go_enrich(input_hyper_chimp, gene_len=TRUE,
#      orgDb='org.Pt.eg.db', txDb='TxDb.Ptroglodytes.UCSC.panTro4.refGene')
#  

## ----eval=FALSE---------------------------------------------------------------------------------------------
#  ## run GO-enrichment analysis for genes in the candidate region
#  ## using hg38 gene-coordinates
#  ## (needs 'org.Hs.eg.db' and 'TxDb.Hsapiens.UCSC.hg38.knownGene' installed)
#  res_region_hg38 = go_enrich(input_regions, regions=TRUE,
#      orgDb='org.Hs.eg.db', txDb='TxDb.Hsapiens.UCSC.hg38.knownGene')

## ----echo=FALSE---------------------------------------------------------------------------------------------
custom_anno = get_anno_categories(input_hyper_bg[,1])
# to have several genes in head
custom_anno = custom_anno[6:nrow(custom_anno),1:2]
rownames(custom_anno) = 1:nrow(custom_anno)

## -----------------------------------------------------------------------------------------------------------
## example for a dataframe with custom annotations
head(custom_anno)

## ----eval=FALSE---------------------------------------------------------------------------------------------
#  ## run enrichment analysis with custom annotations
#  res_hyper_anno = go_enrich(input_hyper, annotations=custom_anno)
#  

## ----echo=FALSE---------------------------------------------------------------------------------------------
gene = c('NCAPG','APOL4','NGFR','NXPH4','C21orf59','CACNG2')
chr = c('chr4', 'chr22', 'chr17', 'chr12', 'chr21', 'chr22')
start = c(17812436, 36585176, 47572655, 57610578, 33954510, 36956916)
end = c(17846487, 36600879, 47592382, 57620232, 33984918, 37098690)
custom_coords = data.frame(gene, chr, start, end, stringsAsFactors=FALSE)

## -----------------------------------------------------------------------------------------------------------
## example for a dataframe with custom gene-coordinates
head(custom_coords)

## ----eval=FALSE---------------------------------------------------------------------------------------------
#  ## use correction for gene-length based on custom gene-coordinates
#  res_hyper_cc = go_enrich(input_hyper, gene_len=TRUE, gene_coords=custom_coords)

## ----eval=FALSE---------------------------------------------------------------------------------------------
#  ## run enrichment with custom GO-graph
#  go_path = '/home/user/go_graphs/2018-11-24/'
#  res_hyper = go_enrich(input_hyper, godir=go_path)
#  

## -----------------------------------------------------------------------------------------------------------
## hypergeometric test
top_gos_hyper = res_hyper[[1]][1:5, 'node_id']
# GO-categories with a high proportion of candidate genes
top_gos_hyper
plot_anno_scores(res_hyper, top_gos_hyper)

## -----------------------------------------------------------------------------------------------------------
## hypergeometric test with defined background
top_gos_hyper_bg = res_hyper_bg[[1]][1:5, 'node_id']
top_gos_hyper_bg
plot_stats = plot_anno_scores(res_hyper_bg, top_gos_hyper_bg)
plot_stats

## -----------------------------------------------------------------------------------------------------------
## scores used for wilcoxon rank-sum test
top_gos_willi = res_willi[[1]][1:5, 'node_id']
# GO-categories with high scores
top_gos_willi
plot_anno_scores(res_willi, top_gos_willi)

## -----------------------------------------------------------------------------------------------------------
## counts used for the binomial test
top_gos_binom = res_binom[[1]][1:5, 'node_id']
# GO-categories with high proportion of A
top_gos_binom
plot_anno_scores(res_binom, top_gos_binom)

## -----------------------------------------------------------------------------------------------------------
## counts used for the 2x2 contingency table test
top_gos_conti = res_conti[[1]][1:5, 'node_id']
# GO-categories with high A/B compared to C/D
top_gos_conti
plot_anno_scores(res_conti, top_gos_conti)

## -----------------------------------------------------------------------------------------------------------
## get the parent nodes (higher level GO-categories) of two GO-IDs
get_parent_nodes(c('GO:0051082', 'GO:0042254'))

## get the child nodes (sub-categories) of two GO-IDs
get_child_nodes(c('GO:0090070', 'GO:0000112'))

## -----------------------------------------------------------------------------------------------------------
## get the full names and domains of two GO-IDs
get_names(c('GO:0090070', 'GO:0000112'))

## -----------------------------------------------------------------------------------------------------------
## get GO-IDs of categories that contain 'blood-brain barrier' in their names
bbb = get_ids('blood-brain barrier')
head(bbb)

## -----------------------------------------------------------------------------------------------------------
## find all genes that are annotated to GO:0000109 
## using default database 'Homo.sapiens'
head(get_anno_genes(go_ids='GO:0000109'))

## find out wich genes from a set of genes
## are annotated to some GO-categories
genes = c('AGTR1', 'ANO1', 'CALB1', 'GYG1', 'PAX2')
gos = c('GO:0001558', 'GO:0005536', 'GO:0072205', 'GO:0006821')
anno_genes = get_anno_genes(go_ids=gos, genes=genes)
# add the names and domains of the GO-categories
cbind(anno_genes, get_names(anno_genes$go_id)[,2:3])

## ---- eval=FALSE--------------------------------------------------------------------------------------------
#  ## find all mouse-gene annotations to two GO-categories
#  ## ('Mus.musculus' has to be installed)
#  gos = c('GO:0072205', 'GO:0000109')
#  get_anno_genes(go_ids=gos, database='Mus.musculus')

## -----------------------------------------------------------------------------------------------------------
## get the GO-annotations for two random genes
anno = get_anno_categories(c('BTC', 'SPAG5'))
head(anno)

## ---- eval=FALSE--------------------------------------------------------------------------------------------
#  ## get the GO-annotations for two mouse genes
#  ## ('Mus.musculus' has to be installed)
#  anno_mus = get_anno_categories(c('Mus81', 'Papola'), database='Mus.musculus')

## -----------------------------------------------------------------------------------------------------------
# get all direct annotations of NXPH4
direct_anno = get_anno_categories('NXPH4')
direct_anno
# get parent nodes of directly annotated GO-categories
parent_ids = unique(get_parent_nodes(direct_anno$go_id)[,2])
# add GO-domain
full_anno = get_names(parent_ids)
head(full_anno)

## ---- results='hide'----------------------------------------------------------------------------------------
## perform enrichment analysis for some genes
gene_ids = c('NCAPG', 'APOL4', 'NGFR', 'NXPH4', 'C21orf59', 'CACNG2', 'AGTR1',
    'ANO1', 'BTBD3', 'MTUS1', 'CALB1', 'GYG1', 'PAX2')
input_hyper = data.frame(gene_ids, is_candidate=1)
res_hyper = go_enrich(input_hyper, n_randset=100)

## perform refinement for categories with FWER < 0.1
refined = refine(res_hyper, fwer=0.1)

## -----------------------------------------------------------------------------------------------------------
## the result shows p-values and significance before and after refinement
refined

## -----------------------------------------------------------------------------------------------------------
sessionInfo()