In the Bioconductor annotation ecosystem, there are TxDb.* packages which provide gene annotations (and therefore TSS information) for many organisms; rGREAT uses them together with Gene Ontology gene sets. The TxDb.* packages supported in rGREAT are:
library(rGREAT)
rGREAT:::BIOC_ANNO_PKGS$txdb
## [1] "TxDb.Hsapiens.UCSC.hg18.knownGene"
## [2] "TxDb.Hsapiens.UCSC.hg19.knownGene"
## [3] "TxDb.Hsapiens.UCSC.hg38.knownGene"
## [4] "TxDb.Hsapiens.UCSC.hg38.refGene"
## [5] "TxDb.Mmusculus.UCSC.mm10.knownGene"
## [6] "TxDb.Mmusculus.UCSC.mm10.ensGene"
## [7] "TxDb.Mmusculus.UCSC.mm39.refGene"
## [8] "TxDb.Mmusculus.UCSC.mm9.knownGene"
## [9] "TxDb.Rnorvegicus.UCSC.rn4.ensGene"
## [10] "TxDb.Rnorvegicus.UCSC.rn5.refGene"
## [11] "TxDb.Rnorvegicus.UCSC.rn6.refGene"
## [12] "TxDb.Rnorvegicus.UCSC.rn7.refGene"
## [13] "TxDb.Ggallus.UCSC.galGal4.refGene"
## [14] "TxDb.Ggallus.UCSC.galGal5.refGene"
## [15] "TxDb.Ggallus.UCSC.galGal6.refGene"
## [16] "TxDb.Mmulatta.UCSC.rheMac10.refGene"
## [17] "TxDb.Mmulatta.UCSC.rheMac3.refGene"
## [18] "TxDb.Mmulatta.UCSC.rheMac8.refGene"
## [19] "TxDb.Celegans.UCSC.ce11.refGene"
## [20] "TxDb.Celegans.UCSC.ce11.ensGene"
## [21] "TxDb.Cfamiliaris.UCSC.canFam3.refGene"
## [22] "TxDb.Cfamiliaris.UCSC.canFam4.refGene"
## [23] "TxDb.Cfamiliaris.UCSC.canFam5.refGene"
## [24] "TxDb.Sscrofa.UCSC.susScr11.refGene"
## [25] "TxDb.Sscrofa.UCSC.susScr3.refGene"
## [26] "TxDb.Scerevisiae.UCSC.sacCer2.sgdGene"
## [27] "TxDb.Scerevisiae.UCSC.sacCer3.sgdGene"
## [28] "TxDb.Ptroglodytes.UCSC.panTro4.refGene"
## [29] "TxDb.Ptroglodytes.UCSC.panTro5.refGene"
## [30] "TxDb.Ptroglodytes.UCSC.panTro6.refGene"
## [31] "TxDb.Dmelanogaster.UCSC.dm3.ensGene"
## [32] "TxDb.Dmelanogaster.UCSC.dm6.ensGene"
## [33] "TxDb.Drerio.UCSC.danRer10.refGene"
## [34] "TxDb.Drerio.UCSC.danRer11.refGene"
## [35] "TxDb.Btaurus.UCSC.bosTau8.refGene"
## [36] "TxDb.Btaurus.UCSC.bosTau9.refGene"
## [37] "TxDb.Athaliana.BioMart.plantsmart51"
## [38] "TxDb.Athaliana.BioMart.plantsmart22"
## [39] "TxDb.Athaliana.BioMart.plantsmart25"
## [40] "TxDb.Athaliana.BioMart.plantsmart28"
To perform GREAT analysis with GO gene sets for other organisms, you can either specify the genome version:
great(gr, "GO:BP", "galGal6")
or specify the full name of the corresponding TxDb package:
great(gr, "GO:BP", "TxDb.Ggallus.UCSC.galGal6.refGene")
These two are internally the same.
You can also specify a BioMart dataset (which corresponds to a specific organism), e.g.:
# Giant panda
great(gr, "GO:BP", biomart_dataset = "amelanoleuca_gene_ensembl")
A full list of supported BioMart datasets (organisms) can be found with the function `BioMartGOGeneSets::supportedOrganisms()`.
MSigDB contains gene sets only for human, but it can be extended to other organisms by mapping to the homologous genes. The package msigdbr has already mapped genes to many other organisms. A full list of supported organisms in msigdbr can be obtained by:
library(msigdbr)
msigdbr_species()
## # A tibble: 20 × 2
## species_name species_common_name
## <chr> <chr>
## 1 Anolis carolinensis Carolina anole, green anole
## 2 Bos taurus bovine, cattle, cow, dairy cow, domestic cat…
## 3 Caenorhabditis elegans <NA>
## 4 Canis lupus familiaris dog, dogs
## 5 Danio rerio leopard danio, zebra danio, zebra fish, zebr…
## 6 Drosophila melanogaster fruit fly
## 7 Equus caballus domestic horse, equine, horse
## 8 Felis catus cat, cats, domestic cat
## 9 Gallus gallus bantam, chicken, chickens, Gallus domesticus
## 10 Homo sapiens human
## 11 Macaca mulatta rhesus macaque, rhesus macaques, Rhesus monk…
## 12 Monodelphis domestica gray short-tailed opossum
## 13 Mus musculus house mouse, mouse
## 14 Ornithorhynchus anatinus duck-billed platypus, duckbill platypus, pla…
## 15 Pan troglodytes chimpanzee
## 16 Rattus norvegicus brown rat, Norway rat, rat, rats
## 17 Saccharomyces cerevisiae baker's yeast, brewer's yeast, S. cerevisiae
## 18 Schizosaccharomyces pombe 972h- <NA>
## 19 Sus scrofa pig, pigs, swine, wild boar
## 20 Xenopus tropicalis tropical clawed frog, western clawed frog
To obtain gene sets for a non-human organism, e.g. the hallmark gene sets for chimpanzee:
h_gene_sets = msigdbr(species = "chimpanzee", category = "H")
head(h_gene_sets)
## # A tibble: 6 × 18
## gs_cat gs_subcat gs_name gene_symbol entrez_gene ensembl_gene human_gene_symb…
## <chr> <chr> <chr> <chr> <int> <chr> <chr>
## 1 H "" HALLMA… ABCA1 464630 ENSPTRG0000… ABCA1
## 2 H "" HALLMA… ABCB8 463892 ENSPTRG0000… ABCB8
## 3 H "" HALLMA… ACAA2 455414 ENSPTRG0000… ACAA2
## 4 H "" HALLMA… ACADL 459914 ENSPTRG0000… ACADL
## 5 H "" HALLMA… ACADM 469356 ENSPTRG0000… ACADM
## 6 H "" HALLMA… ACADS 742921 ENSPTRG0000… ACADS
## # … with 11 more variables: human_entrez_gene <int>, human_ensembl_gene <chr>,
## # gs_id <chr>, gs_pmid <chr>, gs_geoid <chr>, gs_exact_source <chr>,
## # gs_url <chr>, gs_description <chr>, taxon_id <int>, ortholog_sources <chr>,
## # num_ortholog_sources <dbl>
If the organism you selected has a corresponding TxDb package available which provides TSS information, you need to make sure the gene sets use Entrez gene ID as the identifier (most TxDb packages use Entrez ID as the primary ID; you can check the variable `rGREAT:::BIOC_ANNO_PKGS`).
# convert to a list of gene sets
h_gene_sets = split(h_gene_sets$entrez_gene, h_gene_sets$gs_name)
h_gene_sets = lapply(h_gene_sets, as.character) # just to make sure gene IDs are all in character.
h_gene_sets[1:2]
## $HALLMARK_ADIPOGENESIS
## [1] "464630" "463892" "455414" "459914" "469356" "742921"
## [7] "454672" "104003784" "454895" "451866" "737339" "471032"
## [13] "451742" "737305" "100615914" "456723" "107967644" "454362"
## [19] "464334" "743667" "741867" "449586" "100614256" "741708"
## [25] "459164" "746692" "473976" "452433" "468889" "745443"
## [31] "460926" "455644" "451116" "454684" "744890" "461229"
## [37] "740513" "104005232" "463949" "469319" "748673" "450673"
## [43] "468605" "471455" "456837" "464611" "452659" "472079"
## [49] "452307" "454118" "100616508" "465727" "742828" "737945"
## [55] "107976794" "107976794" "746229" "472893" "456557" "457056"
## [61] "747265" "736777" "464460" "451393" "745691" "454512"
## [67] "466780" "463861" "744984" "452566" "457117" NA
## [73] "747936" "459360" "461436" "464353" "464074" "466651"
## [79] "451984" "456243" "464255" "467738" "466732" "461244"
## [85] "456929" "460520" "450562" "450738" "464140" "459670"
## [91] "452976" "471703" "741876" "471135" "461424" "459828"
## [97] "452295" "460113" "453565" "741179" "747276" "470423"
## [103] "451967" "450290" "473975" "473975" "460157" "462946"
## [109] "449638" "738797" "456076" "451807" "464031" "739986"
## [115] "459173" "460872" "463484" "462853" "739167" "457477"
## [121] "742027" "746245" "472764" "747387" "744096" "101057233"
## [127] "744811" "463686" "744435" "468748" "451175" "460227"
## [133] "454744" "739996" NA "450735" "454478" "457929"
## [139] "738397" "458602" "456908" "451591" "450310" "107970333"
## [145] "465012" "463481" "463481" "460178" "470365" "742092"
## [151] "741184" "459094" "459374" "456940" "745779" "454531"
## [157] "737918" "107973114" "742100" "470420" "468499" "467657"
## [163] "100608935" "462416" "451281" "470281" "470281" "452359"
## [169] "456862" "456526" "747462" "474051" "456155" "458647"
## [175] "744390" "455841" "459096" "459031" "450574" "449637"
## [181] "450628" "470477" "471247" "453405" "739128" "454681"
## [187] "464707" "470417" "450933" "459685" "460443" "468406"
## [193] "458803" "467151" "464550" "745004" "451416" "735808"
## [199] "743144" "460348" "107974864" "471631" "741897" "463489"
##
## $HALLMARK_ALLOGRAFT_REJECTION
## [1] "454210" "461523" "450363" "100609296" "459646" "740898"
## [7] "466415" "450170" "465345" "456984" "744209" "449497"
## [13] "100608992" "459361" "741390" "468208" "748142" "473220"
## [19] "748205" "736543" "454593" "747004" "454579" "747123"
## [25] "462689" "460323" "740071" "450128" "469524" "449512"
## [31] "748272" "470617" "451584" "742330" "451585" "450124"
## [37] "100615583" "473802" "460569" "745293" "462191" "740560"
## [43] "470892" "470900" "735755" "470426" "460577" "465021"
## [49] "465607" "736196" "457127" "453745" "457277" "738275"
## [55] "471200" NA "459634" "457770" "469142" "463415"
## [61] "466216" "458797" "453714" "469204" "750603" NA
## [67] "100615835" "740028" "451695" "451158" "471510" "738331"
## [73] "469584" "739516" "465940" "461906" "468521" "457003"
## [79] "472959" "457020" "467610" "461873" "452825" "460623"
## [85] "463280" "746195" "750725" "100608816" "449592" "471979"
## [91] "471977" "471974" "462591" "462540" "494187" "450196"
## [97] "474132" "473965" "449517" "747276" "470077" "743102"
## [103] "472749" "469657" "736204" "460816" "471723" "455851"
## [109] "449564" "737808" "449644" "739011" "744277" "450200"
## [115] "461472" "456370" "450884" "470203" "101059843" "449565"
## [121] "454005" "463288" "464245" "745517" "463371" "470524"
## [127] "462386" "450927" "454294" "454045" "458607" "735556"
## [133] "464979" "450156" "738375" "456715" "471734" "736309"
## [139] "744486" "459682" "745667" "472771" "462888" "748652"
## [145] "449582" "458294" "460699" "459239" "741196" "460720"
## [151] "100322885" "469743" "455026" "740704" "740477" "450512"
## [157] "453993" "456276" "743176" "748032" "457607" "462249"
## [163] "464277" "737451" "746600" "737526" "456065" "461536"
## [169] "107966305" "107966305" "746721" "737070" "459209" "451169"
## [175] "450503" "461971" "461023" "459834" "100610925" "471978"
## [181] "746399" "746814" "456060" "457742" "451611" "107971092"
## [187] "461167" "471325" "471374" "471167" "494186" "748737"
## [193] "464876" "741922" "745141" "452125" "453161" "743187"
## [199] "459427"
Now we can perform the local GREAT analysis.
great(gr, h_gene_sets, "panTro6")
Since `great()` allows both self-defined TSS and gene sets, this means `great()` can be independent of organisms.
Please refer to the vignette “Analyze with local GREAT” to find out how to manually set both TSS and gene sets.