BioPlex 1.8.0
library(BioPlex)
library(AnnotationDbi)
library(AnnotationHub)
library(graph)
Connect to AnnotationHub:
ah <- AnnotationHub::AnnotationHub()
Connect to ExperimentHub:
eh <- ExperimentHub::ExperimentHub()
OrgDb package for human:
orgdb <- AnnotationHub::query(ah, c("orgDb", "Homo sapiens"))
orgdb <- orgdb[[1]]
orgdb
#> OrgDb object:
#> | DBSCHEMAVERSION: 2.1
#> | Db type: OrgDb
#> | Supporting package: AnnotationDbi
#> | DBSCHEMA: HUMAN_DB
#> | ORGANISM: Homo sapiens
#> | SPECIES: Human
#> | EGSOURCEDATE: 2023-Sep11
#> | EGSOURCENAME: Entrez Gene
#> | EGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
#> | CENTRALID: EG
#> | TAXID: 9606
#> | GOSOURCENAME: Gene Ontology
#> | GOSOURCEURL: http://current.geneontology.org/ontology/go-basic.obo
#> | GOSOURCEDATE: 2023-07-27
#> | GOEGSOURCEDATE: 2023-Sep11
#> | GOEGSOURCENAME: Entrez Gene
#> | GOEGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
#> | KEGGSOURCENAME: KEGG GENOME
#> | KEGGSOURCEURL: ftp://ftp.genome.jp/pub/kegg/genomes
#> | KEGGSOURCEDATE: 2011-Mar15
#> | GPSOURCENAME: UCSC Genome Bioinformatics (Homo sapiens)
#> | GPSOURCEURL:
#> | GPSOURCEDATE: 2023-Aug20
#> | ENSOURCEDATE: 2023-May10
#> | ENSOURCENAME: Ensembl
#> | ENSOURCEURL: ftp://ftp.ensembl.org/pub/current_fasta
#> | UPSOURCENAME: Uniprot
#> | UPSOURCEURL: http://www.UniProt.org/
#> | UPSOURCEDATE: Mon Sep 18 16:12:39 2023
keytypes(orgdb)
#> [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
#> [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
#> [11] "GENETYPE" "GO" "GOALL" "IPI" "MAP"
#> [16] "OMIM" "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM"
#> [21] "PMID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
#> [26] "UNIPROT"
Get core set of complexes:
core <- getCorum(set = "core", organism = "Human")
Turn the CORUM complexes into a list of graph instances, where all nodes of a complex are connected to all other nodes of that complex with undirected edges.
core.glist <- corum2graphlist(core, subunit.id.type = "UNIPROT")
Identify complexes that have a subunit of interest:
has.cdk2 <- hasSubunit(core.glist,
subunit = "CDK2",
id.type = "SYMBOL")
Check the answer:
table(has.cdk2)
#> has.cdk2
#> FALSE TRUE
#> 2408 9
cdk2.glist <- core.glist[has.cdk2]
lapply(cdk2.glist, function(g) unlist(graph::nodeData(g, attr = "SYMBOL")))
#> $CORUM311_Cell_cycle_kinase_complex_CDK2
#> P12004 P24385 P24941 P38936
#> "PCNA" "CCND1" "CDK2" "CDKN1A"
#>
#> $`CORUM1003_RC_complex_(Replication_competent_complex)`
#> P09884 P20248 P24941 P35249 P35250 P35251 P40937 P40938 Q14181
#> "POLA1" "CCNA2" "CDK2" "RFC4" "RFC2" "RFC1" "RFC5" "RFC3" "POLA2"
#>
#> $`CORUM1004_RC_complex_during_S-phase_of_cell_cycle`
#> P09874 P09884 P11387 P15927 P18858 P20248 P24941 P27694 P28340 P35244
#> "PARP1" "POLA1" "TOP1" "RPA2" "LIG1" "CCNA2" "CDK2" "RPA1" "POLD1" "RPA3"
#> P35250 P35251 Q07864
#> "RFC2" "RFC1" "POLE"
#>
#> $`CORUM1656_p27-cyclinE-CDK2_complex`
#> P24864 P24941 P46527
#> "CCNE1" "CDK2" "CDKN1B"
#>
#> $`CORUM3015_p27-cyclinE-Cdk2_-_Ubiquitin_E3_ligase_(SKP1A,_SKP2,_CUL1,_CKS1B,_RBX1)_complex`
#> P24864 P24941 P46527 P61024 P62877 P63208 Q13309 Q13616
#> "CCNE1" "CDK2" "CDKN1B" "CKS1B" "RBX1" "SKP1" "SKP2" "CUL1"
#>
#> $`CORUM5556_CDK2-CCNA2_complex`
#> P20248 P24941
#> "CCNA2" "CDK2"
#>
#> $`CORUM5559_CDC2-CCNA2-CDK2_complex`
#> P06493 P20248 P24941
#> "CDK1" "CCNA2" "CDK2"
#>
#> $`CORUM5560_CDK2-CCNE1_complex`
#> P24864 P24941
#> "CCNE1" "CDK2"
#>
#> $`CORUM6589_E2F-1-DP-1-cyclinA-CDK2_complex`
#> P24941 P78396 Q01094 Q14186
#> "CDK2" "CCNA1" "E2F1" "TFDP1"
We can then also inspect the graph with plotting utilities from the Rgraphviz package:
plot(cdk2.glist[[1]], main = names(cdk2.glist)[1])
Get the latest version of the 293T PPI network:
bp.293t <- getBioPlex(cell.line = "293T", version = "3.0")
#> Using cached version from 2023-04-25 14:56:45
Turn the BioPlex PPI network into one big graph where bait and prey relationship are represented by directed edges from bait to prey.
bp.gr <- bioplex2graph(bp.293t)
Now we can also easily pull out a BioPlex subnetwork for a CORUM complex of interest:
n <- graph::nodes(cdk2.glist[[1]])
bp.sgr <- graph::subGraph(n, bp.gr)
bp.sgr
#> A graphNEL graph with directed edges
#> Number of Nodes = 4
#> Number of Edges = 5
Add PFAM domain annotations to the node metadata:
bp.gr <- BioPlex::annotatePFAM(bp.gr, orgdb)
Create a map from PFAM to UNIPROT:
unip2pfam <- graph::nodeData(bp.gr, graph::nodes(bp.gr), "PFAM")
pfam2unip <- stack(unip2pfam)
pfam2unip <- split(as.character(pfam2unip$ind), pfam2unip$values)
head(pfam2unip, 2)
#> $PF00001
#> [1] "P28566" "P25106" "P23945" "Q9HBX9" "P16473" "P04201" "Q9HC97" "P30968"
#> [9] "Q9Y2T6" "Q14330" "P46089" "Q15391" "Q9BXA5" "Q13304" "P61073" "P21462"
#> [17] "P25090" "Q99679" "P21730" "P30556" "P43088" "P32246" "P32249" "Q9Y2T5"
#> [25] "Q7Z602" "P43657" "O00398" "Q9H244" "Q86VZ1" "Q9NPB9" "Q99788" "P51684"
#> [33] "P35414" "O00590" "Q9H1Y3" "P55085" "O15218" "Q9GZQ4" "P25101" "Q9NS66"
#> [41] "Q9NQS5" "P21453" "P14416" "P24530" "P32239" "Q16581" "O00421" "Q9UHM6"
#> [49] "Q8N6U8" "P20309" "O15354" "Q9BXC0" "P47775" "P30550" "P49146" "P47900"
#> [57] "Q8TDU9" "P25103" "P35372" "P41597" "Q9P296" "P28335" "O95136" "P08173"
#> [65] "P29371" "P41146" "P43119" "O95977" "Q99677" "Q9BXB1" "O43193" "P30989"
#> [73] "Q8NGU9" "P47901" "P22888" "Q9GZN0" "P21917" "O60755" "Q9NS67" "P08912"
#> [81] "Q9UPC5" "Q8TDV2" "Q92633" "Q9NQ55" "Q13585" "Q9UBY5" "Q9H228" "P28222"
#>
#> $PF00002
#> [1] "Q8IZP9" "P41587" "Q8IZF4" "P49190" "P32241" "P47871" "P48960" "Q8IZF5"
#> [9] "O14514" "Q03431" "Q9NYQ6" "Q8WXG9" "Q9NYQ7" "O60242" "O60241" "Q9HAR2"
#> [17] "O94910" "Q8IWK6" "O95490" "Q96PE1" "Q86SQ4"
Let’s focus on PF02023, corresponding to the zinc finger-associated SCAN domain. For each protein containing the SCAN domain, we now extract PFAM domains connected to the SCAN domain by an edge in the BioPlex network.
scan.unip <- pfam2unip[["PF02023"]]
getIAPfams <- function(n) graph::nodeData(bp.gr, graph::edges(bp.gr)[[n]], "PFAM")
unip2iapfams <- lapply(scan.unip, getIAPfams)
unip2iapfams <- lapply(unip2iapfams, unlist)
names(unip2iapfams) <- scan.unip
Looking at the top 5 PFAM domains most frequently connected to the SCAN domain by an edge in the BioPlex network …
pfam2iapfams <- unlist(unip2iapfams)
sort(table(pfam2iapfams), decreasing = TRUE)[1:5]
#> pfam2iapfams
#> PF02023 PF00096 PF01352 PF06467 PF13465
#> 208 169 100 14 10
… we find PF02023, the SCAN domain itself, and PF00096, a C2H2 type zinc finger domain. This finding is consistent with results reported in the BioPlex 3.0 publication.
See also the PFAM domain-domain association analysis vignette for a more comprehensive analysis of PFAM domain associations in the BioPlex network.
Genomic data from whole-genome sequencing for six different lineages of the human embryonic kidney HEK293 cell line can be obtained from hek293genome.org.
This includes copy number variation (CNV) data for the 293T cell line. Available CNV tracks include (i) CNV regions inferred from sequencing read-depth analysis, and (ii) CNV regions inferred from Illumina SNP arrays.
Here, we check agreement between inferred copy numbers from both assay types.
We start by obtaining genomic coordinates and copy number scores from the sequencing track …
cnv.hmm <- getHEK293GenomeTrack(track = "cnv.hmm", cell.line = "293T")
#> Using cached version from 2023-04-25 14:57:00
cnv.hmm
#> GRanges object with 12382 ranges and 1 metadata column:
#> seqnames ranges strand | score
#> <Rle> <IRanges> <Rle> | <numeric>
#> [1] chr1 823231-829231 * | 3.26
#> [2] chr1 835231-913231 * | 3.08
#> [3] chr1 923231-1063231 * | 3.20
#> [4] chr1 1079231-1213231 * | 3.21
#> [5] chr1 1223231-1399231 * | 3.27
#> ... ... ... ... . ...
#> [12378] chrX 154750237-154762237 * | 3.96
#> [12379] chrX 154778237-154780237 * | 4.02
#> [12380] chrX 154802237-154822237 * | 3.79
#> [12381] chrX 154842237-154846237 * | 3.85
#> [12382] chrM 0-12000 * | 8.82
#> -------
#> seqinfo: 24 sequences from hg18 genome; no seqlengths
… and from the SNP track.
cnv.snp <- getHEK293GenomeTrack(track = "cnv.snp", cell.line = "293T")
#> Using cached version from 2023-04-25 14:57:03
cnv.snp
#> GRanges object with 204 ranges and 1 metadata column:
#> seqnames ranges strand | score
#> <Rle> <IRanges> <Rle> | <integer>
#> [1] chr1 742429-5117504 * | 3
#> [2] chr1 5126548-27691288 * | 4
#> [3] chr1 27696717-33934376 * | 6
#> [4] chr1 33946560-48944728 * | 4
#> [5] chr1 48956914-50523337 * | 2
#> ... ... ... ... . ...
#> [200] chrX 106780442-127925246 * | 5
#> [201] chrX 127931857-131443550 * | 6
#> [202] chrX 131541633-136116260 * | 5
#> [203] chrX 136125141-147982191 * | 4
#> [204] chrX 147983995-154582606 * | 5
#> -------
#> seqinfo: 23 sequences from hg18 genome; no seqlengths
We reduce the check for agreement between both CNV tracks by transferring copy numbers to overlapping genes, and subsequently, assess the agreement between the resulting gene copy numbers for both tracks.
As the genomic coordinates from both CNV tracks is based on the hg18 human genome assembly, we obtain gene coordinates for hg18 from AnnotationHub:
AnnotationHub::query(ah, c("TxDb", "Homo sapiens"))
#> AnnotationHub with 42 records
#> # snapshotDate(): 2023-10-19
#> # $dataprovider: GENCODE, UCSC, NCBI, tRNAdb, snoRNAdb, RMBase v2.0
#> # $species: Homo sapiens
#> # $rdataclass: TxDb, SQLiteFile, ChainFile, FaFile
#> # additional mcols(): taxonomyid, genome, description,
#> # coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
#> # rdatapath, sourceurl, sourcetype
#> # retrieve records with, e.g., 'object[["AH52256"]]'
#>
#> title
#> AH52256 | TxDb.Hsapiens.BioMart.igis.sqlite
#> AH52257 | TxDb.Hsapiens.UCSC.hg18.knownGene.sqlite
#> AH52258 | TxDb.Hsapiens.UCSC.hg19.knownGene.sqlite
#> AH52259 | TxDb.Hsapiens.UCSC.hg19.lincRNAsTranscripts.sqlite
#> AH52260 | TxDb.Hsapiens.UCSC.hg38.knownGene.sqlite
#> ... ...
#> AH100419 | TxDb.Hsapiens.UCSC.hg38.refGene.sqlite
#> AH107068 | TxDb.Hsapiens.UCSC.hg38.knownGene.sqlite
#> AH111585 | TxDb.Hsapiens.UCSC.hg38.knownGene.sqlite
#> AH114093 | TxDb.Hsapiens.UCSC.hg38.knownGene.sqlite
#> AH114094 | TxDb.Hsapiens.UCSC.hg38.refGene.sqlite
txdb <- ah[["AH52257"]]
#> loading from cache
#> Loading required package: GenomicFeatures
gs <- GenomicFeatures::genes(txdb)
#> 379 genes were dropped because they have exons located on both strands
#> of the same reference sequence or on more than one reference sequence,
#> so cannot be represented by a single genomic range.
#> Use 'single.strand.genes.only=FALSE' to get all the genes in a
#> GRangesList object, or use suppressMessages() to suppress this message.
gs
#> GRanges object with 19742 ranges and 1 metadata column:
#> seqnames ranges strand | gene_id
#> <Rle> <IRanges> <Rle> | <character>
#> 1 chr19 63549984-63565932 - | 1
#> 10 chr8 18293035-18303003 + | 10
#> 100 chr20 42681577-42713790 - | 100
#> 1000 chr18 23784933-24011189 - | 1000
#> 10000 chr1 241718158-242073207 - | 10000
#> ... ... ... ... . ...
#> 9991 chr9 114020538-114135733 - | 9991
#> 9992 chr21 34658193-34665310 + | 9992
#> 9993 chr22 17403795-17489967 - | 9993
#> 9994 chr6 90596334-90640876 + | 9994
#> 9997 chr22 49308863-49310900 - | 9997
#> -------
#> seqinfo: 49 sequences (1 circular) from hg18 genome
We then transfer SNP-inferred copy numbers to genes by overlap …
olaps <- GenomicRanges::findOverlaps(gs, cnv.snp)
joined <- gs[S4Vectors::queryHits(olaps)]
joined$score <- cnv.snp$score[S4Vectors::subjectHits(olaps)]
joined
#> GRanges object with 19555 ranges and 2 metadata columns:
#> seqnames ranges strand | gene_id score
#> <Rle> <IRanges> <Rle> | <character> <integer>
#> 1 chr19 63549984-63565932 - | 1 4
#> 10 chr8 18293035-18303003 + | 10 2
#> 100 chr20 42681577-42713790 - | 100 4
#> 1000 chr18 23784933-24011189 - | 1000 3
#> 10000 chr1 241718158-242073207 - | 10000 6
#> ... ... ... ... . ... ...
#> 9991 chr9 114020538-114135733 - | 9991 3
#> 9992 chr21 34658193-34665310 + | 9992 4
#> 9993 chr22 17403795-17489967 - | 9993 5
#> 9994 chr6 90596334-90640876 + | 9994 3
#> 9997 chr22 49308863-49310900 - | 9997 4
#> -------
#> seqinfo: 49 sequences (1 circular) from hg18 genome
… and, analogously, transfer sequencing-inferred copy numbers to genes by overlap.
olaps <- GenomicRanges::findOverlaps(gs, cnv.hmm)
joined2 <- gs[S4Vectors::queryHits(olaps)]
joined2$score <- cnv.hmm$score[S4Vectors::subjectHits(olaps)]
joined2
#> GRanges object with 22366 ranges and 2 metadata columns:
#> seqnames ranges strand | gene_id score
#> <Rle> <IRanges> <Rle> | <character> <numeric>
#> 1 chr19 63549984-63565932 - | 1 3.18
#> 10 chr8 18293035-18303003 + | 10 1.96
#> 100 chr20 42681577-42713790 - | 100 2.98
#> 1000 chr18 23784933-24011189 - | 1000 2.94
#> 10000 chr1 241718158-242073207 - | 10000 5.92
#> ... ... ... ... . ... ...
#> 9990 chr15 32309489-32417557 - | 9990 2.26
#> 9991 chr9 114020538-114135733 - | 9991 3.45
#> 9993 chr22 17403795-17489967 - | 9993 4.48
#> 9994 chr6 90596334-90640876 + | 9994 3.14
#> 9997 chr22 49308863-49310900 - | 9997 3.35
#> -------
#> seqinfo: 49 sequences (1 circular) from hg18 genome
We then restrict both tracks to common genes.
isect <- intersect(names(joined), names(joined2))
joined <- joined[isect]
joined2 <- joined2[isect]
Now, can assess agreement by testing the correlation between SNP-inferred gene copy numbers and the corresponding sequencing-inferred gene copy numbers.
cor(joined$score, joined2$score)
#> [1] 0.7953383
cor.test(joined$score, joined2$score)
#>
#> Pearson's product-moment correlation
#>
#> data: joined$score and joined2$score
#> t = 176.07, df = 18008, p-value < 2.2e-16
#> alternative hypothesis: true correlation is not equal to 0
#> 95 percent confidence interval:
#> 0.7899089 0.8006431
#> sample estimates:
#> cor
#> 0.7953383
We also inspect the correlation via a boxplot.
spl <- split(joined2$score, joined$score)
boxplot(spl, xlab = "SNP-inferred copy number", ylab = "sequencing-inferred copy number")
rho <- cor(joined$score, joined2$score)
rho <- paste("cor", round(rho, digits=3), sep=" = ")
p <- cor.test(joined$score, joined2$score)
p <- p$p.value
p <- " p < 2.2e-16"
legend("topright", legend=c(rho, p))
Get RNA-seq data for HEK293 cells from GEO: GSE122425
se <- getGSE122425()
#> Using cached version from 2023-04-25 14:57:15
se
#> class: SummarizedExperiment
#> dim: 57905 6
#> metadata(0):
#> assays(2): raw rpkm
#> rownames(57905): ENSG00000223972 ENSG00000227232 ... ENSG00000231514
#> ENSG00000235857
#> rowData names(4): SYMBOL KO GO length
#> colnames(6): GSM3466389 GSM3466390 ... GSM3466393 GSM3466394
#> colData names(41): title geo_accession ... passages.ch1 strain.ch1
Inspect expression of prey genes:
bait <- unique(bp.293t$SymbolA)
length(bait)
#> [1] 8995
prey <- unique(bp.293t$SymbolB)
length(prey)
#> [1] 10419
ind <- match(prey, rowData(se)$SYMBOL)
par(las = 2)
boxplot(log2(assay(se, "rpkm") + 0.5)[ind,],
names = se$title,
ylab = "log2 RPKM")
How many prey genes are expressed (raw read count > 0) in all 3 WT reps:
# background: how many genes in total are expressed in all three WT reps
gr0 <- rowSums(assay(se)[,1:3] > 0)
table(gr0 == 3)
#>
#> FALSE TRUE
#> 33842 24063
# prey: expressed in all three WT reps
table(gr0[ind] == 3)
#>
#> FALSE TRUE
#> 599 9346
# prey: expressed in at least one WT rep
table(gr0[ind] > 0)
#>
#> FALSE TRUE
#> 305 9640
Are prey genes overrepresented in the expressed genes?
exprTable <-
matrix(c(9346, 1076, 14717, 32766),
nrow = 2,
dimnames = list(c("Expressed", "Not.expressed"),
c("In.prey.set", "Not.in.prey.set")))
exprTable
#> In.prey.set Not.in.prey.set
#> Expressed 9346 14717
#> Not.expressed 1076 32766
Test using hypergeometric test (i.e. one-sided Fisher’s exact test):
fisher.test(exprTable, alternative = "greater")
#>
#> Fisher's Exact Test for Count Data
#>
#> data: exprTable
#> p-value < 2.2e-16
#> alternative hypothesis: true odds ratio is greater than 1
#> 95 percent confidence interval:
#> 18.29105 Inf
#> sample estimates:
#> odds ratio
#> 19.34726
Alternatively: permutation test, i.e. repeatedly sample number of prey genes from the background, and assess how often we have as many or more than 9346 genes expressed:
permgr0 <- function(gr0, nr.genes = length(prey))
{
ind <- sample(seq_along(gr0), nr.genes)
sum(gr0[ind] == 3)
}
perms <- replicate(permgr0(gr0), 1000)
summary(perms)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 1000 1000 1000 1000 1000 1000
(sum(perms >= 9346) + 1) / 1001
#> [1] 0.000999001
Check which genes turn up most frequently as prey:
prey.freq <- sort(table(bp.293t$SymbolB), decreasing = TRUE)
preys <- names(prey.freq)
prey.freq <- as.vector(prey.freq)
names(prey.freq) <- preys
head(prey.freq)
#> HSPA5 HSPA8 TUBB8 UBB YBX1 YWHAH
#> 199 192 176 173 139 132
summary(prey.freq)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 1.00 2.00 6.00 11.34 16.00 199.00
hist(prey.freq, breaks = 50, main = "", xlab = "Number of PPIs", ylab = "Number of genes")