# This file contains step-by-step instructions on how to obtain the data and
# deconvolute phenotypes from pathogen infection screens, as shown in
# "gespeR: A statistical model for deconvoluting off-target-confounded RNA
# interference screens" (Schmich et al., 2015). For full compatibility, please
# install the development version of gespeR from
# https://github.com/fschmich/gespeR
# Download the siRNA-to-gene target relation files into /tmp. download.file()
# reports failure through a non-zero status code (it does not always raise an
# error), so the codes are collected and checked before continuing.
download.status <- vapply(c("QIAGEN", "DHARMACON", "VALIDATION"), function(x) {
  as.integer(
    download.file(url = sprintf("http://n.ethz.ch/~fschmich/gespeR/%s.rds", x),
                  destfile = sprintf("/tmp/%s.rds", x), quiet = FALSE, mode = "wb")
  )
}, integer(1))
if (any(download.status != 0L)) {
  stop("Download failed for: ",
       paste(names(download.status)[download.status != 0L], collapse = ", "),
       call. = FALSE)
}
# library() stops with an error if a package is missing, whereas require()
# would only return FALSE and let the script fail later.
library(Matrix)
library(gespeR)
# Construct the TargetRelations object for the Qiagen library
Q <- TargetRelations("/tmp/QIAGEN.rds")
show(Q)
## 91003 x 27240 siRNA-to-gene relations.
## 10 x 5 sparse Matrix of class "dgCMatrix"
## 1 2 3 9 10
## SI00000007 . 0.75 . . .
## SI00000035 . . . . .
## SI00000063 . . . 0.10992427 .
## SI00000070 . . . . .
## SI00000077 . . . . .
## SI00000084 . . . . .
## SI00000105 . . . . .
## SI00000112 . . . 0.09310967 .
## SI00000119 . . . . .
## SI00000133 . . . . .
## ...
# library() stops with an error if a package is missing, whereas require()
# would only return FALSE and let the script fail later.
library(dplyr)
library(reshape2)
library(gespeR)
# Read phenotypes from the comma-separated file and bring them into long
# format with the columns SID, GeneID, siRNASet, Pathogen and Phenotype.
phenotypes <- read.delim("/tmp/phenotypes.csv",
                         sep = ",", stringsAsFactors = FALSE) %>%
  tbl_df() %>%
  # Keep the identifiers and every column whose name contains "Infectivity"
  select(SID, GeneID = NCBI.Gene.ID, siRNASet = SIRNA_SET, contains("Infectivity")) %>%
  # Wide to long: one row per input row and Infectivity column
  melt(id.vars = c("SID", "siRNASet", "GeneID")) %>%
  tbl_df() %>%
  select(SID = SID, GeneID, siRNASet, Pathogen = variable, Phenotype = value) %>%
  # Strip the column-name prefix so that only the pathogen name remains;
  # SID becomes character so it can be joined against the ID mapping
  mutate(Pathogen = gsub("Infectivity_", "", Pathogen),
         SID = as.character(SID)) %>%
  filter(!is.na(Phenotype)) %>% # Artifact of how data is deposited in Pubchem
  arrange(SID)
head(phenotypes)
## Source: local data frame [6 x 5]
##
## SID GeneID siRNASet Pathogen Phenotype
## 1 249376050 53 5 Salmonella 0.354975
## 2 249376051 53 5 Salmonella 1.588140
## 3 249376052 53 5 Salmonella 2.014610
## 4 249376053 373 5 Salmonella -1.078850
## 5 249376054 379 5 Salmonella -1.452680
## 6 249376055 9275 5 Salmonella 0.342689
# Read ID mapping between SIDs and Vendor IDs.
# Only every second line of the file (starting with line 2) is used; each such
# line is split on the "SID: " and " InfectX Consortium: " markers, and the
# second and third pieces are taken as the SID and the vendor ID.
map <- read.delim("/tmp/mapping.txt", header = FALSE, stringsAsFactors = FALSE)
# Select the first column explicitly instead of relying on the implicit
# one-column drop of map[rows, ]
map <- map[seq(2, nrow(map), by = 2), 1]
# vapply() guarantees a 2 x n character matrix (row 1: SID, row 2: vendor ID)
map <- vapply(map, function(x) {
  unlist(strsplit(x, split = "SID: | InfectX Consortium: "))[2:3]
}, character(2), USE.NAMES = FALSE)
map <- data.frame(SID = map[1, ], VendorID = map[2, ],
                  stringsAsFactors = FALSE) %>% tbl_df()
# Map IDs: attach the vendor ID to every phenotype row via its SID
phenotypes <- left_join(phenotypes, map, by = "SID") %>%
  tbl_df() %>%
  select(SID, VendorID, siRNASet, GeneID, Pathogen, Phenotype)
head(phenotypes)
## Source: local data frame [6 x 6]
##
## SID VendorID siRNASet GeneID Pathogen Phenotype
## 1 249376050 10031 5 53 Salmonella 0.354975
## 2 249376051 10122 5 53 Salmonella 1.588140
## 3 249376052 10210 5 53 Salmonella 2.014610
## 4 249376053 10235 5 373 Salmonella -1.078850
## 5 249376054 10240 5 379 Salmonella -1.452680
## 6 249376055 10268 5 9275 Salmonella 0.342689
# Construct Phenotypes objects for each (Qiagen) library + pathogen combination.
# obs.ssp holds the siRNA-level phenotypes, obs.gsp the per-gene mean
# phenotypes; both are lists indexed by pathogen name and then by siRNA set.
phenotypes <- split(phenotypes, phenotypes$Pathogen)
obs.ssp <- list()
obs.gsp <- list()

# Builds the siRNA-level and the gene-level Phenotypes object for the rows of
# one siRNA set.
make.set.phenotypes <- function(hits) {
  sirna.level <- Phenotypes(phenotypes = Matrix(hits$Phenotype),
                            ids = hits$VendorID,
                            pnames = c("Infectivity"),
                            type = "SSP")
  # Average the phenotypes of all siRNAs annotated with the same gene; rows
  # without a gene annotation and NaN means are discarded
  gene.means <- hits %>%
    filter(!is.na(GeneID)) %>%
    group_by(GeneID) %>%
    summarise(Phenotype = mean(Phenotype, na.rm = TRUE)) %>%
    filter(!is.nan(Phenotype))
  # We need gene-based Phenotypes objects for concordance evaluation
  gene.level <- Phenotypes(phenotypes = Matrix(gene.means$Phenotype),
                           ids = as.character(gene.means$GeneID),
                           pnames = c("Infectivity"),
                           type = "GSP")
  list(ssp = sirna.level, gsp = gene.level)
}

for (pathogen in names(phenotypes)) {
  per.set <- lapply(seq_len(4), function(set.id) {
    # Restrict to the current siRNA set and to siRNAs known to Q
    make.set.phenotypes(filter(phenotypes[[pathogen]],
                               siRNASet == set.id,
                               VendorID %in% Q@siRNAs))
  })
  obs.ssp[[pathogen]] <- lapply(per.set, `[[`, "ssp")
  obs.gsp[[pathogen]] <- lapply(per.set, `[[`, "gsp")
}
show(obs.ssp$Bartonella[[1]])
## 20087 SSP Phenotypes
##
## Source: local data frame [20,087 x 2]
##
## ID Infectivity
## 1 SI00000035 -1.2441600
## 2 SI00000077 2.0732300
## 3 SI00000112 -1.0704600
## 4 SI00000168 -1.7540600
## 5 SI00000266 -0.8546760
## 6 SI00000399 -0.4619630
## 7 SI00000420 1.3742000
## 8 SI00000476 0.4220430
## 9 SI00000518 -1.7471500
## 10 SI00000567 0.0507219
## .. ... ...
# library() stops with an error if a package is missing, whereas require()
# would only return FALSE and let the script fail later.
library(gespeR)
# Fit gespeR models: one model per pathogen and siRNA set (1 to 4), each fitted
# on the siRNA-level phenotypes with the target relations Q.
ans.cv <- list()
for (pathogen in c("Bartonella", "Brucella", "Salmonella")) {
  ans.cv[[pathogen]] <- list()
  for (s in 1:4) {
    # Progress message
    cat(sprintf("set: %d, pathogen: %s\n", s, pathogen))
    ges <- gespeR(phenotypes = obs.ssp[[pathogen]][[s]],
                  target.relations = Q,
                  mode = "cv",
                  alpha = 0.5,
                  ncores = 1)
    # Keep the fitted model after passing it through unloadValues()
    ans.cv[[pathogen]][[s]] <- unloadValues(ges, writeValues = FALSE)
  }
}
# Obtain gene-specific phenotypes (GSPs), keeping the pathogen -> set nesting
ges.gsp <- lapply(ans.cv, function(x) {
  lapply(x, gsp)
})
library(gespeR)
library(ggplot2)
# Computes the concordance between all pairs of phenotype lists, separately for
# every pathogen. Measures used are Spearman's correlation, rank-biased overlap
# and the Jaccard index.
#
# Arguments:
#   phen  Named list (one element per pathogen); each element is a list of
#         Phenotypes objects with an "Infectivity" phenotype.
#   cut   NULL, or a list with the same structure as `phen`. If given, every
#         list in `phen` is cut to the length of the matching list in `cut`,
#         respecting its number of positive and negative phenotypes, in order
#         to guarantee a fair comparison (observed phenotypes vs. gespeR GSPs).
#
# Value:
#   The row-bound long-format concordance tables of all pathogens, with the
#   columns Method ("gespeR" without `cut`, "SSP" with `cut`), Pathogen,
#   Measure and value.
get.conc <- function(phen, cut = NULL) {
  # Fixed settings handed to concordance()
  min.overlap <- 10
  rbo.k <- 1000
  rbo.p <- 1 - 1e-3
  rbo.mid <- 0
  cor.method <- "spearman"
  uneven.lengths <- TRUE

  # Runs concordance() on one pathogen's phenotype lists and reshapes the
  # result into long format, labelled with method and pathogen.
  conc.long <- function(plist, method.label, pathogen.label) {
    concordance(plist,
                min.overlap = min.overlap,
                rbo.k = rbo.k,
                rbo.p = rbo.p,
                cor.method = cor.method,
                rbo.mid = rbo.mid,
                uneven.lengths = uneven.lengths) %>%
      data.frame %>%
      select(-lisect) %>%
      melt(id.vars = c("test.pair", "phen")) %>%
      mutate(Method = method.label, Pathogen = pathogen.label) %>%
      select(-test.pair, Method, Pathogen, Measure = variable, value) %>%
      tbl_df()
  }

  # Keeps the first n.pos and the last n.neg rows of a table sorted by
  # decreasing phenotype. seq_len() yields no index for a count of zero
  # (1:0 would select rows 1 and 0), and the counts are clamped so that the
  # two ends never run past the table or overlap.
  trim.ranked <- function(ranked, n.pos, n.neg) {
    n <- nrow(ranked)
    n.pos <- min(n.pos, n)
    n.neg <- min(n.neg, n - n.pos)
    ranked[c(seq_len(n.pos), n - n.neg + seq_len(n.neg)), ]
  }

  per.pathogen <- lapply(names(phen), function(x) {
    if (is.null(cut)) {
      return(conc.long(phen[[x]], "gespeR", x))
    }
    # Number of positive / negative phenotypes in each reference list
    l <- lapply(cut[[x]], function(z) {
      ans <- as.data.frame(z)
      list(pos = length(which(ans$Infectivity > 0)),
           neg = length(which(ans$Infectivity < 0)))
    })
    # Phenotype lists as tables, without NAs, sorted by decreasing phenotype
    phencut <- lapply(phen[[x]], function(y) {
      as.data.frame(y) %>%
        tbl_df() %>%
        mutate(ID = as.character(ID)) %>%
        filter(!is.na(Infectivity)) %>%
        arrange(desc(Infectivity))
    })
    stopifnot(length(l) == length(phencut))
    for (lib in seq_along(l)) { # cut longer ranked lists to gespeR's lengths
      trimmed <- trim.ranked(phencut[[lib]], l[[lib]]$pos, l[[lib]]$neg)
      # NOTE(review): the file calls this with gene-level phenotypes, yet the
      # cut lists are tagged type = "SSP" - kept as is, confirm it is intended
      phencut[[lib]] <- Phenotypes(phenotypes = Matrix(trimmed$Infectivity),
                                   ids = trimmed$ID,
                                   pnames = c("Infectivity"),
                                   type = "SSP")
    }
    conc.long(phencut, "SSP", x)
  })
  per.pathogen %>% do.call("rbind", .)
}
# Concordance among the gespeR GSPs, and among the observed phenotypes after
# cutting them to the lengths of the gespeR GSP lists
conc.gespeR <- get.conc(ges.gsp)
conc.obs <- get.conc(obs.gsp, cut = ges.gsp)
# Visualisation of concordance measures: stack both tables and turn the
# grouping columns into factors with display labels
dat <- tbl_df(rbind(conc.gespeR, conc.obs))
dat <- mutate(dat,
              Pathogen = factor(Pathogen,
                                levels = c("Brucella", "Bartonella", "Salmonella"),
                                labels = c("B. abortus", "B. henselae", "S. typhimurium")),
              Method = factor(Method,
                              levels = c("gespeR", "SSP"),
                              labels = c("gespeR", "Observed")))
# Measure labels are plotmath expressions; label_parsed renders them in the
# facet strips
dat$Measure <- factor(dat$Measure,
                      levels = c("cor", "rbo.top", "rbo.bottom", "jaccard"),
                      labels = c(expression(rho),
                                 expression(rbo["" %down% ""]),
                                 expression(rbo["" %up% ""]),
                                 expression("J")))
# One facet per measure, one box per pathogen and method
ggplot(dat, aes(x = Pathogen, y = value, colour = Method)) +
  geom_boxplot(width = 0.8, outlier.size = 0) +
  facet_grid(. ~ Measure, labeller = label_parsed) +
  labs(x = "", y = "") +
  scale_colour_manual(name = "", values = c("#d7191c", "#525252"), drop = FALSE) +
  # ylim() sets the scale limits, so values outside [0, 1] are removed before
  # the boxplots are computed
  ylim(0, 1) +
  theme_bw(base_size = 12, base_family = "Helvetica") +
  theme(
    # Text elements
    axis.text = element_text(size = rel(1.0)),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = rel(1.0), face = "bold"),
    strip.text = element_text(size = rel(1.0), face = "bold"),
    legend.text = element_text(size = rel(1.0)),
    legend.title = element_text(size = rel(1.0), face = "bold"),
    # Lines, panels and backgrounds
    axis.ticks = element_line(colour = "black"),
    legend.key = element_rect(colour = NA),
    panel.background = element_rect(fill = "white", colour = NA),
    panel.border = element_rect(fill = NA, colour = "grey50"),
    panel.grid.major = element_line(colour = "grey90", size = 0.2),
    panel.grid.minor = element_line(colour = "grey98", size = 0.5),
    strip.background = element_blank()
  )
# Record the R and package versions used for this analysis
sessionInfo()
## R version 3.2.0 (2015-04-16)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.9.5 (Mavericks)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] reshape2_1.4.1 dplyr_0.4.2 gespeR_1.1.1 ggplot2_1.0.1
## [5] Matrix_1.2-1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.11.6 mvtnorm_1.0-2 lattice_0.20-31
## [4] assertthat_0.1 glmnet_2.0-2 digest_0.6.8
## [7] foreach_1.4.2 R6_2.0.1 GenomeInfoDb_1.4.1
## [10] plyr_1.8.3 stats4_3.2.0 pcaPP_1.9-60
## [13] RSQLite_1.0.0 evaluate_0.7 BiocInstaller_1.18.3
## [16] zlibbioc_1.14.0 lazyeval_0.1.10 annotate_1.46.0
## [19] S4Vectors_0.6.0 preprocessCore_1.30.0 rmarkdown_0.7
## [22] labeling_0.3 proto_0.3-10 splines_3.2.0
## [25] stringr_1.0.0 RCurl_1.95-4.6 biomaRt_2.24.0
## [28] munsell_0.4.2 BiocGenerics_0.14.0 htmltools_0.2.6
## [31] IRanges_2.2.4 codetools_0.2-11 XML_3.98-1.2
## [34] rrcov_1.3-8 MASS_7.3-41 bitops_1.0-6
## [37] grid_3.2.0 RBGL_1.44.0 prada_1.44.0
## [40] xtable_1.7-4 GSEABase_1.30.2 gtable_0.1.2
## [43] affy_1.46.1 DBI_0.3.1 magrittr_1.5
## [46] formatR_1.2 scales_0.2.5 graph_1.46.0
## [49] stringi_0.5-2 genefilter_1.50.0 affyio_1.36.0
## [52] doParallel_1.0.8 limma_3.24.10 robustbase_0.92-4
## [55] RColorBrewer_1.1-2 iterators_1.0.7 tools_3.2.0
## [58] Biobase_2.28.0 Category_2.34.2 DEoptimR_1.0-2
## [61] parallel_3.2.0 survival_2.38-2 yaml_2.1.13
## [64] AnnotationDbi_1.30.1 colorspace_1.2-6 cluster_2.0.2
## [67] vsn_3.36.0 cellHTS2_2.32.0 knitr_1.10.5