This vignette contains code snippets to display results of the NestLink project (p1875). Briefly, it analyzes the amno acid frequencies of the in-silico composed and measured flycodes.
NestLink 1.16.0
library(NestLink)
## Warning: replacing previous import 'utils::findMatches' by
## 'S4Vectors::findMatches' when loading 'AnnotationDbi'
ESP_Prediction was generated using an application https://genepattern.broadinstitute.org (???).
library(ggplot2)
ESP <- rbind(getFC(), getNB())
## see ?NestLink and browseVignettes('NestLink') for documentation
## loading from cache
## see ?NestLink and browseVignettes('NestLink') for documentation
## loading from cache
p <- ggplot(ESP, aes(x = ESP_Prediction, fill = cond)) +
geom_histogram(bins = 50, alpha = 0.4, position="identity") +
labs(x = "detectability in LC-MS (ESP prediction)") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"))
print(p)
ESP <- rbind(getFC(), NB.unambiguous(getNB()))
## see ?NestLink and browseVignettes('NestLink') for documentation
## loading from cache
## see ?NestLink and browseVignettes('NestLink') for documentation
## loading from cache
p <- ggplot(ESP, aes(x = ESP_Prediction, fill = cond)) +
geom_histogram(bins = 50, alpha = 0.4, position="identity") +
labs(x = "detectability in LC-MS (ESP prediction)") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"))
print(p)
ESP <- rbind(getFC(), NB.unique(getNB()))
## see ?NestLink and browseVignettes('NestLink') for documentation
## loading from cache
## see ?NestLink and browseVignettes('NestLink') for documentation
## loading from cache
p <- ggplot(ESP, aes(x = ESP_Prediction, fill = cond)) +
geom_histogram(bins = 50, alpha = 0.4, position="identity") +
labs(x = "detectability in LC-MS (ESP prediction)") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"))
print(p)
ESP <- rbind(getNB(), NB.unambiguous(getNB()), NB.unique(getNB()))
## see ?NestLink and browseVignettes('NestLink') for documentation
## loading from cache
## see ?NestLink and browseVignettes('NestLink') for documentation
## loading from cache
## see ?NestLink and browseVignettes('NestLink') for documentation
## loading from cache
p <- ggplot(ESP, aes(x = ESP_Prediction, fill = cond)) +
geom_histogram(bins = 50, alpha = 0.4, position="identity") +
labs(x = "detectability in LC-MS (ESP prediction)") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"))
print(p)
table(ESP$cond)
##
## NB NB.unambiguous NB.unique
## 27681 5832 7705
# library(ExperimentHub)
# eh <- ExperimentHub();
# load(query(eh, c("NestLink", "WU160118.RData"))[[1]])
load(getExperimentHubFilename("WU160118.RData"))
WU <- WU160118
filtering
PATTERN <- "^GS[ASTNQDEFVLYWGP]{7}(WR|WLTVR|WQEGGR|WLR|WQSR)$"
idx <- grepl(PATTERN, WU$pep_seq)
WU <- WU[idx & WU$pep_score > 25,]
determine charge state frequency through counting
WU$chargeInt <- as.integer(substr(WU$query_charge, 0, 1))
table(WU$chargeInt)
##
## 2 3 4
## 33392 157 1
in percent
round(100 * (table(WU$chargeInt) / nrow(WU)), 1)
##
## 2 3 4
## 99.5 0.5 0.0
as histograms
library(scales)
ggplot(WU, aes(x = moverz * chargeInt, fill = (query_charge),
colour = (query_charge))) +
facet_wrap(~ datfilename, ncol = 2) +
geom_histogram(binwidth= 10, alpha=.3, position="identity") +
labs(title = "Precursor mass to charge frequency plot",
subtitle = "Plotting frequency of precursor masses for each charge state",
x = "Precursor mass [neutral mass]",
y = "Frequency [counts]",
fill = "Charge State",
colour = "Charge State") +
scale_x_continuous(breaks = pretty_breaks(8)) +
theme_light()
We confirmed this prediction by experimental data and found that 99.9 percent of flycode precursor ions correspond to doubly charge species (data not shown). The omission of positively charged residues is also critical in order to render trypsin a site-specific protease.
WU$suffix <- substr(WU$pep_seq, 10, 100)
ggplot(WU, aes(x = moverz * chargeInt, fill = suffix, colour = suffix)) +
geom_histogram(binwidth= 10, alpha=.2, position="identity") +
#facet_wrap(~ substr(pep_seq, 10, 100)) +
theme_light()
ggplot(WU, aes(x = moverz * chargeInt, fill = suffix)) +
geom_histogram(binwidth= 10, alpha=.9, position="identity") +
facet_wrap(~ substr(pep_seq, 10, 100)) +
theme_light()
ggplot(WU, aes(x = pep_score, fill = query_charge, colour = query_charge)) +
geom_histogram(binwidth= 2, alpha=.5, position="identity") +
facet_wrap(~ substr(pep_seq, 10, 100)) +
theme_light()
Reads the flycodes from p1875 db10 fasta file
# FC <- read.table(system.file("extdata/FC.tryptic", package = "NestLink"),
# header = TRUE)
FC <- getFC()
## see ?NestLink and browseVignettes('NestLink') for documentation
## loading from cache
FC$peptide <- (as.character(FC$peptide))
idx <- grep (PATTERN, FC$peptide)
FC$cond <- "FC"
FC$pim <- parentIonMass(FC$peptide)
FC <- FC[nchar(FC$peptide) >2, ]
FC$ssrc <- sapply(FC$peptide, ssrc)
FC$peptideLength <- nchar(as.character(FC$peptide))
FC <- FC[idx,]
head(FC)
## peptide ESP_Prediction cond pim ssrc peptideLength
## 120 GSAAAAADSWLTVR 0.75450 FC 1375.696 27.80445 14
## 121 GSAAAAATDWLTVR 0.76422 FC 1389.712 29.00445 14
## 122 GSAAAAATGWLTVR 0.65522 FC 1331.707 28.60445 14
## 123 GSAAAAATVWLR 0.65496 FC 1173.637 29.10445 12
## 124 GSAAAAAYEWLTVR 0.72754 FC 1465.743 33.10445 14
## 125 GSAAAADAAWQEGGR 0.53588 FC 1417.645 11.70445 15
aa_pool_x7 <- c(rep('A', 18), rep('S', 6), rep('T', 12), rep('N', 1),
rep('Q', 1), rep('D', 11), rep('E', 11), rep('V', 12), rep('L', 2),
rep('F', 1), rep('Y', 4), rep('W', 1), rep('G', 8), rep('P', 12))
aa_pool_x7.post <- c(rep('WR', 20), rep('WLTVR', 20), rep('WQEGGR', 20),
rep('WLR', 20), rep('WQSR', 20))
FC.pep_seq.freq <- table(unlist(strsplit(substr(FC$peptide, 3, 9), "")))
FC.freq <- round(FC.pep_seq.freq / sum(FC.pep_seq.freq) * 100, 3)
FC.pep_seq.freq.post <- table(unlist((substr(FC$peptide, 10, 100))))
FC.freq.post <- round(FC.pep_seq.freq.post / sum(FC.pep_seq.freq.post) * 100, 3)
WU.pep_seq.freq <- table(unlist(strsplit(substr(WU$pep_seq, 3, 9), "")))
WU.freq <- round(WU.pep_seq.freq / sum(WU.pep_seq.freq) * 100,3)
WU.pep_seq.freq.post <- table(unlist(substr(WU$pep_seq, 10, 100)))
WU.freq.post <- round(WU.pep_seq.freq.post / sum(WU.pep_seq.freq.post) *100,3)
table(aa_pool_x7)
## aa_pool_x7
## A D E F G L N P Q S T V W Y
## 18 11 11 1 8 2 1 12 1 6 12 12 1 4
FC.freq
##
## A D E F G L N P Q S T
## 16.027 13.452 10.097 0.872 6.918 1.658 0.618 10.662 1.142 5.637 10.663
## V W Y
## 16.533 1.245 4.478
WU.freq
##
## A D E F G L N P Q S T
## 15.434 16.983 11.939 0.532 7.077 1.165 0.784 11.813 1.484 5.748 10.532
## V W Y
## 12.645 0.773 3.090
AAfreq1 <- data.frame(aa = table(aa_pool_x7))
names(AAfreq1) <- c('AA', 'freq')
AAfreq1 <-rbind(AAfreq1, df<-data.frame(AA=c('C','H','I','M','K','R'), freq=rep(0,6)))
AAfreq1$cond <- 'theoretical frequency'
AAfreq2 <- data.frame(aa=FC.freq)
names(AAfreq2) <- c('AA', 'freq')
AAfreq2$cond <- "db10.fasta"
AAfreq3 <- data.frame(aa=WU.freq)
names(AAfreq3) <- c('AA', 'freq')
AAfreq3$cond <- "WU"
#FC.all <- read.table(system.file("extdata/FC.tryptic", package = "NestLink"),
# header = TRUE)
FC.all <- getFC()
## see ?NestLink and browseVignettes('NestLink') for documentation
## loading from cache
PATTERN.all <- "^GS[A-Z]{7}(WR|WLTVR|WQEGGR|WLR|WQSR)$"
FC.all <- as.character(FC.all$peptide)[grep(PATTERN.all, as.character(FC.all$peptide))]
FC.all.pep_seq.freq <- table(unlist(strsplit(substr(FC.all, 3, 9), "")))
FC.all.freq <- round(FC.all.pep_seq.freq / sum(FC.all.pep_seq.freq) * 100, 3)
FC.all.freq[20] <- 0
names(FC.all.freq) <- c(names(FC.all.freq)[1:19], "K")
AAfreq4 <- data.frame(AA=names(FC.all.freq), freq=as.numeric(FC.all.freq))
AAfreq4$cond <- "sequenced frequency"
AAfreq <- do.call('rbind', list(AAfreq1, AAfreq2, AAfreq3, AAfreq4))
AAfreq$freq <- as.numeric(AAfreq$freq)
ggplot(data=AAfreq, aes(x=AA, y=freq, fill=cond)) +
geom_bar(stat="identity", position="dodge") +
labs(title = "amino acid frequency plot") +
theme_light()
## Warning: Removed 5 rows containing missing values (`geom_bar()`).
AAfreq1.post <- data.frame(aa=table(aa_pool_x7.post))
names(AAfreq1.post) <- c('AA', 'freq')
AAfreq1.post$cond <- "aa_pool_x7"
AAfreq2.post <- data.frame(aa=FC.freq.post)
names(AAfreq2.post) <- c('AA', 'freq')
AAfreq2.post$cond <- "db10.fasta"
AAfreq3.post <- data.frame(aa=WU.freq.post)
names(AAfreq3.post) <- c('AA', 'freq')
AAfreq3.post$cond <- "WU"
AAfreq.post <- do.call('rbind', list(AAfreq1.post, AAfreq2.post, AAfreq3.post))
df<-AAfreq[(AAfreq$cond != 'WU' & AAfreq$cond != 'db10.fasta'), ]
df$freq<- as.numeric(df$freq)
ggplot(data=df, aes(x=AA, y=freq, fill=cond)) +
geom_bar(stat="identity", position="dodge") +
labs(title = "Amino acid frequency plot") +
ylab("Frequency [%]") +
theme_light()
## Warning: Removed 5 rows containing missing values (`geom_bar()`).
FCfreq <- data.frame(table(unlist(strsplit(substr(FC$peptide, 3, 9), ""))))
colnames(FCfreq) <- c('AA', 'freq')
ggplot(data=FCfreq, aes(x=AA, y=freq)) +
geom_bar(stat="identity", position="dodge") +
labs(title = "Amino acid frequency plot") +
theme_light()
ggplot(data=AAfreq.post, aes(x=AA, y=freq, fill=cond)) +
geom_bar(stat="identity", position="dodge") +
labs(title = "suffix frequency plot") +
theme_light()
The RT is predicted using the method described in Krokhin et al. (2004) and implemented using the specL package Panse et al. (2015).
WU$ssrc <- sapply(as.character(WU$pep_seq), ssrc)
WU$suffix <- substr(WU$pep_seq, 10,100)
ggplot(WU, aes(x = RTINSECONDS, y = ssrc)) +
geom_point(aes(alpha=pep_score, colour = datfilename)) +
facet_wrap(~ datfilename * suffix, ncol = 5, nrow = 8) +
geom_smooth(method = 'lm')
## `geom_smooth()` using formula = 'y ~ x'
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
cor(WU$RTINSECONDS, WU$ssrc, method = 'spearman')
## [1] 0.9056695
Considers only Mascot Result File F255737
WU <- WU160118
PATTERN <- "^GS[ASTNQDEFVLYWGP]{7}(WR|WLTVR|WQEGGR|WLR|WQSR)$"
idx <- grepl(PATTERN, WU$pep_seq)
# Mascot score cut-off valye 25
WU <- WU[idx & WU$pep_score > 25,]
WU$suffix <- substr(WU$pep_seq, 10, 100)
WU$ssrc <- sapply(as.character(WU$pep_seq), ssrc)
ggplot(WU, aes(x = RTINSECONDS, fill = datfilename)) +
geom_histogram(bins = 50)
ggplot(WU, aes(x = ssrc, fill = datfilename)) +
geom_histogram(bins = 50)
ggplot(WU, aes(x = RTINSECONDS, fill = suffix)) +
geom_histogram(bins = 50)
ggplot(WU, aes(x = ssrc, fill = suffix)) +
geom_histogram(bins = 50)
WU <- WU[WU$datfilename == "F255737",]
WU <- aggregate(WU$RTINSECONDS ~ WU$pep_seq, FUN = min)
names(WU) <-c("pep_seq", "RTINSECONDS")
WU$suffix <- substr(WU$pep_seq, 10, 100)
WU$peptide_mass <- parentIonMass(as.character(WU$pep_seq))
WU$ssrc <- sapply(as.character(WU$pep_seq), ssrc)
WU$datfilename <- "F255737"
ggplot(WU, aes(x = RTINSECONDS, y = peptide_mass)) +
geom_point(aes(colour = suffix)) +
facet_wrap(~ datfilename * suffix, ncol=5)
ggplot(WU, aes(x = RTINSECONDS, fill = suffix)) +
geom_histogram(bins = 20) +
facet_wrap(~ datfilename * suffix, ncol=5)
ggplot(WU, aes(x = ssrc, y = peptide_mass)) +
geom_point(aes(colour = suffix)) +
facet_wrap(~ datfilename * suffix, ncol = 5)
ggplot(WU, aes(x = ssrc,fill = suffix)) +
geom_histogram(bins = 20) +
facet_wrap(~ datfilename * suffix, ncol=5)
ggplot(WU, aes(x = RTINSECONDS, y = peptide_mass)) +
geom_point(aes(colour = suffix), size = 3.0, alpha = 0.1)
ggplot(WU, aes(x = ssrc, y = peptide_mass)) +
geom_point(aes(colour = suffix), size = 3.0, alpha = 0.1)
WU <- do.call('rbind', lapply(c(25,35,45), function(cutoff){
WU <- WU160118
PATTERN <- "^GS[ASTNQDEFVLYWGP]{7}(WR|WLTVR|WQEGGR|WLR|WQSR)$"
idx <- grepl(PATTERN, WU$pep_seq)
WU <- WU[idx & WU$pep_score > cutoff, ]
WU <- WU[WU$datfilename == "F255737",]
WU <- aggregate(WU$RTINSECONDS ~ WU$pep_seq, FUN=min)
names(WU) <-c("pep_seq","RTINSECONDS")
WU$suffix <- substr(WU$pep_seq, 10, 100)
WU$peptide_mass <- parentIonMass(as.character(WU$pep_seq))
WU$ssrc <- sapply(as.character(WU$pep_seq), ssrc)
WU$datfilename <- "F255737"
WU$mascotScoreCutOff <- cutoff
WU}))
ggplot(WU, aes(x = RTINSECONDS, y = peptide_mass)) +
geom_point(aes(colour = suffix), size = 3.0, alpha = 0.1) +
facet_wrap(~ mascotScoreCutOff, ncol = 1)
ggplot(WU, aes(x = ssrc, y = peptide_mass)) +
geom_point(aes(colour = suffix), size = 3.0, alpha = 0.1) +
facet_wrap(~ mascotScoreCutOff, ncol = 1 )
Here is the output of the sessionInfo()
command.
## R version 4.3.0 RC (2023-04-13 r84269)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.17-bioc/R/lib/libRblas.so
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/New_York
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] scales_1.2.1 ggplot2_3.4.2
## [3] NestLink_1.16.0 ShortRead_1.58.0
## [5] GenomicAlignments_1.36.0 SummarizedExperiment_1.30.0
## [7] Biobase_2.60.0 MatrixGenerics_1.12.0
## [9] matrixStats_0.63.0 Rsamtools_2.16.0
## [11] GenomicRanges_1.52.0 BiocParallel_1.34.0
## [13] protViz_0.7.7 gplots_3.1.3
## [15] Biostrings_2.68.0 GenomeInfoDb_1.36.0
## [17] XVector_0.40.0 IRanges_2.34.0
## [19] S4Vectors_0.38.0 ExperimentHub_2.8.0
## [21] AnnotationHub_3.8.0 BiocFileCache_2.8.0
## [23] dbplyr_2.3.2 BiocGenerics_0.46.0
## [25] BiocStyle_2.28.0
##
## loaded via a namespace (and not attached):
## [1] DBI_1.1.3 bitops_1.0-7
## [3] deldir_1.0-6 rlang_1.1.0
## [5] magrittr_2.0.3 compiler_4.3.0
## [7] RSQLite_2.3.1 mgcv_1.8-42
## [9] png_0.1-8 vctrs_0.6.2
## [11] pkgconfig_2.0.3 crayon_1.5.2
## [13] fastmap_1.1.1 magick_2.7.4
## [15] ellipsis_0.3.2 labeling_0.4.2
## [17] caTools_1.18.2 utf8_1.2.3
## [19] promises_1.2.0.1 rmarkdown_2.21
## [21] purrr_1.0.1 bit_4.0.5
## [23] xfun_0.39 zlibbioc_1.46.0
## [25] cachem_1.0.7 jsonlite_1.8.4
## [27] blob_1.2.4 highr_0.10
## [29] later_1.3.0 DelayedArray_0.26.0
## [31] interactiveDisplayBase_1.38.0 jpeg_0.1-10
## [33] parallel_4.3.0 R6_2.5.1
## [35] bslib_0.4.2 RColorBrewer_1.1-3
## [37] jquerylib_0.1.4 Rcpp_1.0.10
## [39] bookdown_0.33 knitr_1.42
## [41] splines_4.3.0 httpuv_1.6.9
## [43] Matrix_1.5-4 tidyselect_1.2.0
## [45] yaml_2.3.7 codetools_0.2-19
## [47] hwriter_1.3.2.1 curl_5.0.0
## [49] lattice_0.21-8 tibble_3.2.1
## [51] shiny_1.7.4 withr_2.5.0
## [53] KEGGREST_1.40.0 evaluate_0.20
## [55] pillar_1.9.0 BiocManager_1.30.20
## [57] filelock_1.0.2 KernSmooth_2.23-20
## [59] generics_0.1.3 RCurl_1.98-1.12
## [61] BiocVersion_3.17.1 munsell_0.5.0
## [63] gtools_3.9.4 xtable_1.8-4
## [65] glue_1.6.2 tools_4.3.0
## [67] interp_1.1-4 grid_4.3.0
## [69] latticeExtra_0.6-30 AnnotationDbi_1.62.0
## [71] colorspace_2.1-0 nlme_3.1-162
## [73] GenomeInfoDbData_1.2.10 cli_3.6.1
## [75] rappdirs_0.3.3 fansi_1.0.4
## [77] dplyr_1.1.2 gtable_0.3.3
## [79] sass_0.4.5 digest_0.6.31
## [81] farver_2.1.1 memoise_2.0.1
## [83] htmltools_0.5.5 lifecycle_1.0.3
## [85] httr_1.4.5 mime_0.12
## [87] bit64_4.0.5
Krokhin, O. V., R. Craig, V. Spicer, W. Ens, K. G. Standing, R. C. Beavis, and J. A. Wilkins. 2004. “An improved model for prediction of retention times of tryptic peptides in ion pair reversed-phase HPLC: its application to protein peptide mapping by off-line HPLC-MALDI MS.” Mol. Cell Proteomics 3 (9): 908–19.
Panse, C., C. Trachsel, J. Grossmann, and R. Schlapbach. 2015. “specL–an R/Bioconductor package to prepare peptide spectrum matches for use in targeted proteomics.” Bioinformatics 31 (13): 2228–31.