Contents

1 Load requiered R packages

library(NestLink)

2 Display ESP

ESP_Prediction was generated using an application https://genepattern.broadinstitute.org (???).

library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
ESP <- rbind(getFC(), getNB())
## snapshotDate(): 2019-04-29
## see ?NestLink and browseVignettes('NestLink') for documentation
## downloading 0 resources
## loading from cache 
##     'EH2064 : 2064'
## snapshotDate(): 2019-04-29
## see ?NestLink and browseVignettes('NestLink') for documentation
## downloading 0 resources
## loading from cache 
##     'EH2065 : 2065'
p <- ggplot(ESP, aes(x = ESP_Prediction, fill = cond)) +
  geom_histogram(bins = 50, alpha = 0.4, position="identity") +
  labs(x = "detectability in LC-MS (ESP prediction)") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
      panel.background = element_blank(), axis.line = element_line(colour = "black"))
print(p)

ESP <- rbind(getFC(), NB.unambiguous(getNB()))
## snapshotDate(): 2019-04-29
## see ?NestLink and browseVignettes('NestLink') for documentation
## downloading 0 resources
## loading from cache 
##     'EH2064 : 2064'
## snapshotDate(): 2019-04-29
## see ?NestLink and browseVignettes('NestLink') for documentation
## downloading 0 resources
## loading from cache 
##     'EH2065 : 2065'
p <- ggplot(ESP, aes(x = ESP_Prediction, fill = cond)) +
  geom_histogram(bins = 50, alpha = 0.4, position="identity") +
  labs(x = "detectability in LC-MS (ESP prediction)") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"))
print(p)

ESP <- rbind(getFC(), NB.unique(getNB()))
## snapshotDate(): 2019-04-29
## see ?NestLink and browseVignettes('NestLink') for documentation
## downloading 0 resources
## loading from cache 
##     'EH2064 : 2064'
## snapshotDate(): 2019-04-29
## see ?NestLink and browseVignettes('NestLink') for documentation
## downloading 0 resources
## loading from cache 
##     'EH2065 : 2065'
p <- ggplot(ESP, aes(x = ESP_Prediction, fill = cond)) +
  geom_histogram(bins = 50, alpha = 0.4, position="identity") +
  labs(x = "detectability in LC-MS (ESP prediction)") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"))
print(p)

ESP <- rbind(getNB(), NB.unambiguous(getNB()), NB.unique(getNB()))
## snapshotDate(): 2019-04-29
## see ?NestLink and browseVignettes('NestLink') for documentation
## downloading 0 resources
## loading from cache 
##     'EH2065 : 2065'
## snapshotDate(): 2019-04-29
## see ?NestLink and browseVignettes('NestLink') for documentation
## downloading 0 resources
## loading from cache 
##     'EH2065 : 2065'
## snapshotDate(): 2019-04-29
## see ?NestLink and browseVignettes('NestLink') for documentation
## downloading 0 resources
## loading from cache 
##     'EH2065 : 2065'
p <- ggplot(ESP, aes(x = ESP_Prediction, fill = cond)) +
  geom_histogram(bins = 50, alpha = 0.4, position="identity") +
  labs(x = "detectability in LC-MS (ESP prediction)") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"))
print(p)

table(ESP$cond)
## 
##             NB NB.unambiguous      NB.unique 
##          27681           5832           7705

3 Charge State Frequency

3.1 Load workunit 160118

# library(ExperimentHub)                               
# eh <- ExperimentHub();                                
# load(query(eh, c("NestLink", "WU160118.RData"))[[1]])

load(getExperimentHubFilename("WU160118.RData"))
WU <- WU160118

filtering

PATTERN <- "^GS[ASTNQDEFVLYWGP]{7}(WR|WLTVR|WQEGGR|WLR|WQSR)$"

idx <- grepl(PATTERN, WU$pep_seq)
WU <- WU[idx & WU$pep_score > 25,]

determine charge state frequency through counting

WU$chargeInt <- as.integer(substr(WU$query_charge, 0, 1))
table(WU$chargeInt)
## 
##     2     3     4 
## 33392   157     1

in percent

round(100 * (table(WU$chargeInt) / nrow(WU)), 1)
## 
##    2    3    4 
## 99.5  0.5  0.0

as histograms

library(scales)
ggplot(WU, aes(x = moverz * chargeInt, fill = (query_charge),
               colour = (query_charge))) +
    facet_wrap(~ datfilename, ncol = 2) +
    geom_histogram(binwidth= 10, alpha=.3, position="identity") +
    labs(title = "Precursor mass to charge frequency plot",
      subtitle = "Plotting frequency of precursor masses for each charge state",
      x = "Precursor mass [neutral mass]", 
      y = "Frequency [counts]",
      fill = "Charge State",
      colour = "Charge State") +
    scale_x_continuous(breaks = pretty_breaks(8)) +
    theme_light()

We confirmed this prediction by experimental data and found that 99.9 percent of flycode precursor ions correspond to doubly charge species (data not shown). The omission of positively charged residues is also critical in order to render trypsin a site-specific protease.

3.2 FlyCode Mass Distribution

WU$suffix <- substr(WU$pep_seq, 10, 100)

ggplot(WU, aes(x = moverz * chargeInt, fill = suffix, colour = suffix)) +
    geom_histogram(binwidth= 10, alpha=.2, position="identity") +
    #facet_wrap(~ substr(pep_seq, 10, 100)) +
   theme_light()

ggplot(WU, aes(x = moverz * chargeInt, fill = suffix)) +
    geom_histogram(binwidth= 10, alpha=.9, position="identity") +
    facet_wrap(~ substr(pep_seq, 10, 100)) +
   theme_light()

3.3 Mascot Ion Score Distribution

ggplot(WU, aes(x = pep_score, fill = query_charge, colour = query_charge)) +
    geom_histogram(binwidth= 2, alpha=.5, position="identity") +
    facet_wrap(~ substr(pep_seq, 10, 100)) +
   theme_light()

4 Compare Real and in-silico FlyCodes

Reads the flycodes from p1875 db10 fasta file

# FC <- read.table(system.file("extdata/FC.tryptic", package = "NestLink"),
#                header = TRUE)
FC <- getFC()
## snapshotDate(): 2019-04-29
## see ?NestLink and browseVignettes('NestLink') for documentation
## downloading 0 resources
## loading from cache 
##     'EH2064 : 2064'
FC$peptide <- (as.character(FC$peptide))
idx <- grep (PATTERN, FC$peptide)

FC$cond <- "FC"
FC$pim <- parentIonMass(FC$peptide)
FC <- FC[nchar(FC$peptide) >2, ]
FC$ssrc <- sapply(FC$peptide, ssrc)
FC$peptideLength <- nchar(as.character(FC$peptide))
FC <- FC[idx,]
head(FC)    
##             peptide ESP_Prediction cond      pim     ssrc peptideLength
## 120  GSAAAAADSWLTVR        0.75450   FC 1375.696 27.80445            14
## 121  GSAAAAATDWLTVR        0.76422   FC 1389.712 29.00445            14
## 122  GSAAAAATGWLTVR        0.65522   FC 1331.707 28.60445            14
## 123    GSAAAAATVWLR        0.65496   FC 1173.637 29.10445            12
## 124  GSAAAAAYEWLTVR        0.72754   FC 1465.743 33.10445            14
## 125 GSAAAADAAWQEGGR        0.53588   FC 1417.645 11.70445            15

4.1 AA design frequency

aa_pool_x7 <- c(rep('A', 18), rep('S', 6), rep('T', 12), rep('N', 1),
  rep('Q', 1), rep('D', 11),  rep('E', 11), rep('V', 12), rep('L', 2),
  rep('F', 1), rep('Y', 4), rep('W', 1), rep('G', 8), rep('P', 12))

aa_pool_x7.post <- c(rep('WR', 20),  rep('WLTVR', 20), rep('WQEGGR', 20), 
  rep('WLR', 20), rep('WQSR', 20))
FC.pep_seq.freq <- table(unlist(strsplit(substr(FC$peptide, 3, 9), "")))
FC.freq <- round(FC.pep_seq.freq / sum(FC.pep_seq.freq) * 100, 3)


FC.pep_seq.freq.post <- table(unlist((substr(FC$peptide, 10, 100))))
FC.freq.post <- round(FC.pep_seq.freq.post / sum(FC.pep_seq.freq.post) * 100, 3)


WU.pep_seq.freq <- table(unlist(strsplit(substr(WU$pep_seq, 3, 9), "")))
WU.freq <- round(WU.pep_seq.freq / sum(WU.pep_seq.freq) * 100,3)


WU.pep_seq.freq.post <- table(unlist(substr(WU$pep_seq, 10, 100)))
WU.freq.post <- round(WU.pep_seq.freq.post / sum(WU.pep_seq.freq.post) *100,3)
table(aa_pool_x7)
## aa_pool_x7
##  A  D  E  F  G  L  N  P  Q  S  T  V  W  Y 
## 18 11 11  1  8  2  1 12  1  6 12 12  1  4
FC.freq
## 
##      A      D      E      F      G      L      N      P      Q      S      T 
## 16.027 13.452 10.097  0.872  6.918  1.658  0.618 10.662  1.142  5.637 10.663 
##      V      W      Y 
## 16.533  1.245  4.478
WU.freq
## 
##      A      D      E      F      G      L      N      P      Q      S      T 
## 15.434 16.983 11.939  0.532  7.077  1.165  0.784 11.813  1.484  5.748 10.532 
##      V      W      Y 
## 12.645  0.773  3.090
AAfreq1 <- data.frame(aa = table(aa_pool_x7))
names(AAfreq1) <- c('AA', 'freq')

AAfreq1 <-rbind(AAfreq1, df<-data.frame(AA=c('C','H','I','M','K','R'), freq=rep(0,6)))

AAfreq1$cond <- 'theoretical frequency'

AAfreq2 <- data.frame(aa=FC.freq)
names(AAfreq2) <- c('AA', 'freq')
AAfreq2$cond <- "db10.fasta"

AAfreq3 <- data.frame(aa=WU.freq)
names(AAfreq3) <- c('AA', 'freq')
AAfreq3$cond <- "WU"

#FC.all <- read.table(system.file("extdata/FC.tryptic", package = "NestLink"),
#                 header = TRUE)
FC.all <- getFC()
## snapshotDate(): 2019-04-29
## see ?NestLink and browseVignettes('NestLink') for documentation
## downloading 0 resources
## loading from cache 
##     'EH2064 : 2064'
PATTERN.all <- "^GS[A-Z]{7}(WR|WLTVR|WQEGGR|WLR|WQSR)$"
FC.all <- as.character(FC.all$peptide)[grep(PATTERN.all, as.character(FC.all$peptide))]

FC.all.pep_seq.freq <- table(unlist(strsplit(substr(FC.all, 3, 9), "")))
FC.all.freq <- round(FC.all.pep_seq.freq / sum(FC.all.pep_seq.freq) * 100, 3)
FC.all.freq[20] <- 0
names(FC.all.freq) <- c(names(FC.all.freq)[1:19], "K")
AAfreq4 <- data.frame(AA=names(FC.all.freq), freq=as.numeric(FC.all.freq))

AAfreq4$cond <- "sequenced frequency"

AAfreq <- do.call('rbind', list(AAfreq1, AAfreq2, AAfreq3, AAfreq4))
AAfreq$freq <- as.numeric(AAfreq$freq)
ggplot(data=AAfreq, aes(x=AA, y=freq, fill=cond)) + 
  geom_bar(stat="identity", position="dodge") + 
  labs(title = "amino acid frequency plot") +
  theme_light()
## Warning: Removed 5 rows containing missing values (geom_bar).

AAfreq1.post <- data.frame(aa=table(aa_pool_x7.post))
names(AAfreq1.post) <- c('AA', 'freq')
AAfreq1.post$cond <- "aa_pool_x7"

AAfreq2.post <- data.frame(aa=FC.freq.post)
names(AAfreq2.post) <- c('AA', 'freq')
AAfreq2.post$cond <- "db10.fasta"

AAfreq3.post <- data.frame(aa=WU.freq.post)
names(AAfreq3.post) <- c('AA', 'freq')
AAfreq3.post$cond <- "WU"

AAfreq.post <- do.call('rbind', list(AAfreq1.post, AAfreq2.post, AAfreq3.post))
df<-AAfreq[(AAfreq$cond != 'WU' & AAfreq$cond != 'db10.fasta'), ]
df$freq<- as.numeric(df$freq)
ggplot(data=df, aes(x=AA, y=freq, fill=cond)) + 
  geom_bar(stat="identity", position="dodge") + 
  labs(title = "Amino acid frequency plot") +
  ylab("Frequency [%]") +
  theme_light()
## Warning: Removed 5 rows containing missing values (geom_bar).

FCfreq <- data.frame(table(unlist(strsplit(substr(FC$peptide, 3, 9), ""))))
colnames(FCfreq) <- c('AA', 'freq')

ggplot(data=FCfreq, aes(x=AA, y=freq)) + 
  geom_bar(stat="identity", position="dodge") + 
  labs(title = "Amino acid frequency plot") +
  theme_light()

ggplot(data=AAfreq.post, aes(x=AA, y=freq, fill=cond)) + 
  geom_bar(stat="identity", position="dodge") + 
  labs(title = "suffix frequency plot") +
  theme_light()

4.2 SSRC Prediction vs. Measured Retention Time

The RT is predicted using the method described in Krokhin et al. (2004) and implemented using the specL package Panse et al. (2015).

WU$ssrc <- sapply(as.character(WU$pep_seq), ssrc)
WU$suffix <- substr(WU$pep_seq, 10,100)
ggplot(WU, aes(x = RTINSECONDS, y = ssrc)) +
  geom_point(aes(alpha=pep_score, colour = datfilename)) +
  facet_wrap(~ datfilename * suffix, ncol = 5, nrow = 8) +
  geom_smooth(method = 'lm') 
## Warning in qt((1 - level)/2, df): NaNs produced

cor(WU$RTINSECONDS, WU$ssrc, method = 'spearman')
## [1] 0.9056695

4.3 SSRC and Retention Time Distribution

Considers only Mascot Result File F255737

WU <- WU160118
PATTERN <- "^GS[ASTNQDEFVLYWGP]{7}(WR|WLTVR|WQEGGR|WLR|WQSR)$"
idx <- grepl(PATTERN, WU$pep_seq)

# Mascot score cut-off valye 25
WU <- WU[idx & WU$pep_score > 25,]
WU$suffix <- substr(WU$pep_seq, 10, 100)
WU$ssrc <- sapply(as.character(WU$pep_seq), ssrc)

ggplot(WU, aes(x = RTINSECONDS, fill = datfilename)) + 
  geom_histogram(bins = 50)

ggplot(WU, aes(x = ssrc, fill = datfilename)) + 
  geom_histogram(bins = 50)

ggplot(WU, aes(x = RTINSECONDS, fill = suffix)) + 
  geom_histogram(bins = 50)

ggplot(WU, aes(x = ssrc, fill = suffix)) + 
  geom_histogram(bins = 50)

WU <- WU[WU$datfilename == "F255737",]

WU <- aggregate(WU$RTINSECONDS ~ WU$pep_seq, FUN = min)
names(WU) <-c("pep_seq", "RTINSECONDS")
WU$suffix <- substr(WU$pep_seq, 10, 100)
WU$peptide_mass <- parentIonMass(as.character(WU$pep_seq))
WU$ssrc <- sapply(as.character(WU$pep_seq), ssrc)
WU$datfilename <- "F255737"
ggplot(WU, aes(x = RTINSECONDS, y = peptide_mass)) +
  geom_point(aes(colour = suffix)) +
  facet_wrap(~ datfilename * suffix, ncol=5) 

ggplot(WU, aes(x = RTINSECONDS, fill = suffix)) + 
  geom_histogram(bins = 20) + 
  facet_wrap(~ datfilename * suffix, ncol=5)

ggplot(WU, aes(x = ssrc, y = peptide_mass)) +
  geom_point(aes(colour = suffix)) +
  facet_wrap(~ datfilename * suffix,  ncol = 5)

ggplot(WU, aes(x = ssrc,fill = suffix)) + 
  geom_histogram(bins = 20) + 
  facet_wrap(~ datfilename * suffix, ncol=5)

ggplot(WU, aes(x = RTINSECONDS, y = peptide_mass)) +
  geom_point(aes(colour = suffix), size = 3.0, alpha = 0.1) 

ggplot(WU, aes(x = ssrc, y = peptide_mass)) +
  geom_point(aes(colour = suffix),  size = 3.0, alpha = 0.1) 

4.4 Mass vs. RT Panels - Mascot Ion Score cut-off

WU <-  do.call('rbind', lapply(c(25,35,45), function(cutoff){
  WU <- WU160118 
  PATTERN <- "^GS[ASTNQDEFVLYWGP]{7}(WR|WLTVR|WQEGGR|WLR|WQSR)$"
  idx <- grepl(PATTERN, WU$pep_seq)
  
  WU <- WU[idx & WU$pep_score > cutoff, ]
  
  WU <- WU[WU$datfilename == "F255737",]
  
  WU <- aggregate(WU$RTINSECONDS ~ WU$pep_seq, FUN=min)
  names(WU) <-c("pep_seq","RTINSECONDS")
  WU$suffix <- substr(WU$pep_seq, 10, 100)
  WU$peptide_mass <- parentIonMass(as.character(WU$pep_seq))
  WU$ssrc <- sapply(as.character(WU$pep_seq), ssrc)
  WU$datfilename <- "F255737"
  WU$mascotScoreCutOff <- cutoff
  WU}))


ggplot(WU, aes(x = RTINSECONDS, y = peptide_mass)) +
  geom_point(aes(colour = suffix),  size = 3.0, alpha = 0.1) +
  facet_wrap(~ mascotScoreCutOff, ncol = 1)

ggplot(WU, aes(x = ssrc, y = peptide_mass)) +
  geom_point(aes(colour = suffix),  size = 3.0, alpha = 0.1) +
  facet_wrap(~ mascotScoreCutOff, ncol = 1 )

5 Session info

Here is the output of the sessionInfo() command.

## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.2 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.9-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.9-bioc/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] scales_1.0.0                ggplot2_3.1.1              
##  [3] NestLink_1.0.0              ShortRead_1.42.0           
##  [5] GenomicAlignments_1.20.0    SummarizedExperiment_1.14.0
##  [7] DelayedArray_0.10.0         matrixStats_0.54.0         
##  [9] Biobase_2.44.0              Rsamtools_2.0.0            
## [11] GenomicRanges_1.36.0        GenomeInfoDb_1.20.0        
## [13] BiocParallel_1.18.0         protViz_0.4.0              
## [15] gplots_3.0.1.1              Biostrings_2.52.0          
## [17] XVector_0.24.0              IRanges_2.18.0             
## [19] S4Vectors_0.22.0            ExperimentHub_1.10.0       
## [21] AnnotationHub_2.16.0        BiocFileCache_1.8.0        
## [23] dbplyr_1.4.0                BiocGenerics_0.30.0        
## [25] BiocStyle_2.12.0           
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.0                    bit64_0.9-7                  
##  [3] gtools_3.8.1                  shiny_1.3.2                  
##  [5] assertthat_0.2.1              interactiveDisplayBase_1.22.0
##  [7] BiocManager_1.30.4            latticeExtra_0.6-28          
##  [9] blob_1.1.1                    GenomeInfoDbData_1.2.1       
## [11] yaml_2.2.0                    pillar_1.3.1                 
## [13] RSQLite_2.1.1                 lattice_0.20-38              
## [15] glue_1.3.1                    digest_0.6.18                
## [17] RColorBrewer_1.1-2            promises_1.0.1               
## [19] colorspace_1.4-1              plyr_1.8.4                   
## [21] htmltools_0.3.6               httpuv_1.5.1                 
## [23] Matrix_1.2-17                 pkgconfig_2.0.2              
## [25] bookdown_0.9                  zlibbioc_1.30.0              
## [27] purrr_0.3.2                   xtable_1.8-4                 
## [29] gdata_2.18.0                  later_0.8.0                  
## [31] tibble_2.1.1                  withr_2.1.2                  
## [33] lazyeval_0.2.2                magrittr_1.5                 
## [35] crayon_1.3.4                  mime_0.6                     
## [37] memoise_1.1.0                 evaluate_0.13                
## [39] hwriter_1.3.2                 tools_3.6.0                  
## [41] stringr_1.4.0                 munsell_0.5.0                
## [43] AnnotationDbi_1.46.0          compiler_3.6.0               
## [45] caTools_1.17.1.2              rlang_0.3.4                  
## [47] grid_3.6.0                    RCurl_1.95-4.12              
## [49] rappdirs_0.3.1                labeling_0.3                 
## [51] bitops_1.0-6                  rmarkdown_1.12               
## [53] gtable_0.3.0                  codetools_0.2-16             
## [55] DBI_1.0.0                     curl_3.3                     
## [57] R6_2.4.0                      knitr_1.22                   
## [59] dplyr_0.8.0.1                 bit_1.1-14                   
## [61] KernSmooth_2.23-15            stringi_1.4.3                
## [63] Rcpp_1.0.1                    tidyselect_0.2.5             
## [65] xfun_0.6

References

Krokhin, O. V., R. Craig, V. Spicer, W. Ens, K. G. Standing, R. C. Beavis, and J. A. Wilkins. 2004. “An improved model for prediction of retention times of tryptic peptides in ion pair reversed-phase HPLC: its application to protein peptide mapping by off-line HPLC-MALDI MS.” Mol. Cell Proteomics 3 (9):908–19.

Panse, C., C. Trachsel, J. Grossmann, and R. Schlapbach. 2015. “specL–an R/Bioconductor package to prepare peptide spectrum matches for use in targeted proteomics.” Bioinformatics 31 (13):2228–31.