Integration of rawDiag and rawrr into the Spectra ecosystem (by courtesy of Johannes Rainer).

Figure 1: Integration of rawDiag and rawrr into the Spectra ecosystem (by courtesy of Johannes Rainer)

1 Requirements

suppressMessages(
  stopifnot(require(Spectra),
            require(MsBackendRawFileReader),
            require(tartare),
            require(BiocParallel))
)

assemblies aka Common Intermediate Language bytecode The download and install can be done on all platforms using the command: rawrr::installRawFileReaderDLLs()

if (isFALSE(rawrr::.checkDllInMonoPath())){
  rawrr::installRawFileReaderDLLs()
}
## removing files in directory '/home/biocbuild/.cache/R/rawrr/rawrrassembly'
##                   ThermoFisher.CommonCore.Data.dll 
##                                                  0 
## ThermoFisher.CommonCore.MassPrecisionEstimator.dll 
##                                                  0 
##          ThermoFisher.CommonCore.RawFileReader.dll 
##                                                  0
if (isFALSE(file.exists(rawrr:::.rawrrAssembly()))){
 rawrr::installRawrrExe(sourceUrl = "http://fgcz-ms.uzh.ch/~cpanse/rawrr/rawrr.1.1.12.exe")
}
## MD5 6d5f6d19512eaba73e92d2bfb474e6f2 /home/biocbuild/.cache/R/rawrr/rawrrassembly/rawrr.exe
## [1] 0

2 Load data

# fetch via ExperimentHub
library(ExperimentHub)
eh <- ExperimentHub::ExperimentHub()
query(eh, c('tartare'))
## ExperimentHub with 5 records
## # snapshotDate(): 2021-10-18
## # $dataprovider: Functional Genomics Center Zurich (FGCZ)
## # $species: NA
## # $rdataclass: Spectra
## # additional mcols(): taxonomyid, genome, description,
## #   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
## #   rdatapath, sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["EH3219"]]' 
## 
##            title                     
##   EH3219 | Q Exactive HF-X mzXML     
##   EH3220 | Q Exactive HF-X raw       
##   EH3221 | Fusion Lumos mzXML        
##   EH3222 | Fusion Lumos raw          
##   EH4547 | Q Exactive HF Orbitrap raw

The RawFileReader libraries require a file extension ending with .raw.

EH3220 <- normalizePath(eh[["EH3220"]])
(rawfileEH3220 <- paste0(EH3220, ".raw"))
## [1] "/home/biocbuild/.cache/R/ExperimentHub/a4e016e095565_3236.raw"
if (!file.exists(rawfileEH3220)){
  file.link(EH3220, rawfileEH3220)
}

EH3222 <- normalizePath(eh[["EH3222"]])
(rawfileEH3222 <- paste0(EH3222, ".raw"))
## [1] "/home/biocbuild/.cache/R/ExperimentHub/a4e0178f2c78c_3238.raw"
if (!file.exists(rawfileEH3222)){
  file.link(EH3222, rawfileEH3222)
}

EH4547  <- normalizePath(eh[["EH4547"]])
(rawfileEH4547  <- paste0(EH4547 , ".raw"))
## [1] "/home/biocbuild/.cache/R/ExperimentHub/279b53a6a5276_4590.raw"
if (!file.exists(rawfileEH4547 )){
  file.link(EH4547 , rawfileEH4547 )
}

3 Usage

Call the constructor

beRaw <- Spectra::backendInitialize(
  MsBackendRawFileReader::MsBackendRawFileReader(),
  files = c(rawfileEH3220, rawfileEH3222, rawfileEH4547))

Call the print method

beRaw
## MsBackendRawFileReader with 32497 spectra
##         msLevel     rtime scanIndex
##       <integer> <numeric> <integer>
## 1             1     0.215         1
## 2             1     0.714         2
## 3             1     1.108         3
## 4             1     1.503         4
## 5             1     1.897         5
## ...         ...       ...       ...
## 32493         2   2099.70     21876
## 32494         2   2099.78     21877
## 32495         2   2099.87     21878
## 32496         2   2099.95     21879
## 32497         2   2100.04     21880
##  ... 20 more variables/columns.
## 
## file(s):
## a4e016e095565_3236.raw
## a4e0178f2c78c_3238.raw
## 279b53a6a5276_4590.raw

4 Application example

Here we reproduce the Figure 2 of Kockmann and Panse (2021) rawrr. The MsBackendRawFileReader ships with a filterScan method using functionality provided by the C# libraries by Thermo Fisher Scientific Shofstahl (2016).

(S <- (beRaw |>  
   filterScan("FTMS + c NSI Full ms2 487.2567@hcd27.00 [100.0000-1015.0000]") )[437]) |> 
  plotSpectra()

# supposed to be scanIndex 9594
S
## MsBackendRawFileReader with 1 spectra
##     msLevel     rtime scanIndex
##   <integer> <numeric> <integer>
## 1         2   925.225      9594
##  ... 20 more variables/columns.
## 
## file(s):
## 279b53a6a5276_4590.raw
# add yIonSeries to the plot
(yIonSeries <- protViz::fragmentIon("LGGNEQVTR")[[1]]$y[1:8])
## [1] 175.1190 276.1666 375.2350 503.2936 632.3362 746.3791 803.4006 860.4221
names(yIonSeries) <- paste0("y", seq(1, length(yIonSeries)))
abline(v = yIonSeries, col='#DDDDDD88', lwd=5)
axis(3, yIonSeries, names(yIonSeries))
Peptide spectrum match. The vertical grey lines indicate the *in-silico* computed y-ions of the peptide precusor LGGNEQVTR++ as calculated by the [protViz]( https://CRAN.R-project.org/package=protViz) package.

Figure 2: Peptide spectrum match
The vertical grey lines indicate the in-silico computed y-ions of the peptide precusor LGGNEQVTR++ as calculated by the protViz package.

5 Extend the class

For demonstration reasons, we extent the MsBackend class by a filter method. The filterIons function returns spectra if and only if all fragment ions, given as argument, match. We use protViz::findNN binary search method for determining the nearest mZ peak for each ion. If the mass error between an ion and an mz value is less than the given mass tolerance, an ion is considered a hit.

setGeneric("filterIons", function(object, ...) standardGeneric("filterIons"))
## [1] "filterIons"
setMethod("filterIons", "MsBackend",
  function(object, mZ=numeric(), tol=numeric(), ...) {
    
    keep <- lapply(peaksData(object, BPPARAM = bpparam()),
                   FUN=function(x){
       NN <- protViz::findNN(mZ, x[, 1])
       hit <- (error <- mZ - x[NN, 1]) < tol & x[NN, 2] >= quantile(x[, 2], .9)
       if (sum(hit) == length(mZ))
         TRUE
       else
         FALSE
                   })
    object[unlist(keep)]
  })

The lines below implement a simple targeted peptide search engine. The R code snippet takes as input a MsBackendRawFileReader object containing 32497 spectra and y-fragment-ion mZ values determined for LGGNEQVTR++.

start_time <- Sys.time()
X <- beRaw |> 
  MsBackendRawFileReader::filterScan("FTMS + c NSI Full ms2 487.2567@hcd27.00 [100.0000-1015.0000]") |>
  filterIons(yIonSeries, tol = 0.005) |> 
  Spectra::Spectra() |>
  Spectra::peaksData() 
end_time <- Sys.time()

The defined filterIons method runs on 995 input spectra and returns 4 spectra.

The runtime is shown below.

end_time - start_time
## Time difference of 4.148496 secs

Next, we define and apply a method for graphing LGGNEQVTR peptide spectrum matches. Also, the function returns some statistics of the match.

.plot.LGGNEQVTR <- function(x){
  
  yIonSeries <- protViz::fragmentIon("LGGNEQVTR")[[1]]$y[1:8]
  names(yIonSeries) <- paste0("y", seq(1, length(yIonSeries)))
  
  plot(x, type ='h', xlim=range(yIonSeries))
  abline(v = yIonSeries, col='#DDDDDD88', lwd=5)
  axis(3, yIonSeries, names(yIonSeries))
  
  # find nearest mZ value
  idx <- protViz::findNN(yIonSeries, x[,1])
  
  data.frame(
    ion = names(yIonSeries),
    mZ.yIon = yIonSeries,
    mZ = x[idx, 1],
    intensity = x[idx, 2]
  )
}
Visualizing of the LGGNEQVTR spectrum matches.

Figure 3: Visualizing of the LGGNEQVTR spectrum matches

stats::aggregate(formula = mZ  ~ ion, FUN = base::mean, data=rv)
##   ion       mZ
## 1  y1 175.1190
## 2  y2 276.1665
## 3  y3 375.2349
## 4  y4 503.2936
## 5  y5 632.3362
## 6  y6 746.3791
## 7  y7 803.4003
## 8  y8 860.4216
stats::aggregate(formula = intensity ~ ion, FUN = base::max, data=rv)
##   ion intensity
## 1  y1   1505214
## 2  y2   2583122
## 3  y3   2364014
## 4  y4   3179124
## 5  y5   2286947
## 6  y6   1236341
## 7  y7   4586484
## 8  y8  12894520

For the sake of demonstration we apply the Spectra::combinePeaks method and aggregate the 4 spectra into a singe peak matrix.

The statistics returned by .plot.LGGNEQVTR() should be identical with the output of the aggregation code snippet above.

X |>
  Spectra::combinePeaks(ppm=10, intensityFun=base::max) |>
  .plot.LGGNEQVTR()
Combined LGGNEQVTR peptide spectrum match plot.

Figure 4: Combined LGGNEQVTR peptide spectrum match plot

##    ion  mZ.yIon       mZ intensity
## y1  y1 175.1190 175.1190   1505214
## y2  y2 276.1666 276.1665   2583122
## y3  y3 375.2350 375.2349   2364014
## y4  y4 503.2936 503.2936   3179124
## y5  y5 632.3362 632.3362   2286947
## y6  y6 746.3791 746.3791   1236341
## y7  y7 803.4006 803.4003   4586484
## y8  y8 860.4221 860.4216  12894520

6 Evaluation

6.1 Efficiency - I/O Benchmark

When reading spectra the MsBackendRawFileReader:::.RawFileReader_read_peaks method is calling the rawrr::readSpectrum method.

The figure below displays the time performance for reading a single spectrum in dependency from the chunk size (how many spectra are read in one function call) for reading different numbers of overall spectra.

ioBm <- file.path(system.file(package = 'MsBackendRawFileReader'),
               'extdata', 'specs.csv') |>
  read.csv2(header=TRUE)

# perform and include a local IO benchmark
ioBmLocal <- ioBenchmark(1000, c(32, 64, 128, 256), rawfile = rawfileEH4547)


lattice::xyplot((1 / as.numeric(time)) * workers ~ size | factor(n) ,
                group = host,
                data = rbind(ioBm, ioBmLocal),
                horizontal = FALSE,
        scales=list(y = list(log = 10)),
                auto.key = TRUE,
                layout = c(3, 1),
                ylab = 'spectra read in one second',
                xlab = 'number of spectra / file')
I/O Benchmark. The XY plot graphs how many spectra the backend can read in one second versus the chunk size of the rawrr::readSpectrum method for different compute architectures.

Figure 5: I/O Benchmark
The XY plot graphs how many spectra the backend can read in one second versus the chunk size of the rawrr::readSpectrum method for different compute architectures.

6.2 Effectiveness

We compare the output of the Thermo Fischer Scientific raw files versus their corresponding mzXML files using Spectra::MsBackendMzR relying on the mzR package.

mzXMLEH3219 <- normalizePath(eh[["EH3219"]])
## see ?tartare and browseVignettes('tartare') for documentation
## loading from cache
mzXMLEH3221 <- normalizePath(eh[["EH3221"]])
## see ?tartare and browseVignettes('tartare') for documentation
## loading from cache
if (require(mzR)){
  beMzXML <- Spectra::backendInitialize(
    Spectra::MsBackendMzR(),
    files = c(mzXMLEH3219))
  
  beRaw <- Spectra::backendInitialize(
    MsBackendRawFileReader::MsBackendRawFileReader(),
    files = c(rawfileEH3220))
  
  intensity.xml <- sapply(intensity(beMzXML[1:100]), sum)
  intensity.raw <- sapply(intensity(beRaw[1:100]), sum)
  
  plot(intensity.xml ~ intensity.raw, log = 'xy', asp = 1,
    pch = 16, col = rgb(0.5, 0.5, 0.5, alpha=0.5), cex=2)
  abline(lm(intensity.xml ~ intensity.raw), 
    col='red')
}
Aggregated intensities mzXML versus raw of the 1st 100 spectra.

Figure 6: Aggregated intensities mzXML versus raw of the 1st 100 spectra

Are all scans of the raw file in the mzXML file?

if (require(mzR)){
  table(scanIndex(beRaw) %in% scanIndex(beMzXML))
}
## 
## FALSE  TRUE 
##   112  1764

Session information

sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.14-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.14-bioc/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB              LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] mzR_2.28.0                   Rcpp_1.0.7                  
##  [3] tartare_1.7.2                ExperimentHub_2.2.0         
##  [5] AnnotationHub_3.2.0          BiocFileCache_2.2.0         
##  [7] dbplyr_2.1.1                 MsBackendRawFileReader_1.0.0
##  [9] Spectra_1.4.0                ProtGenerics_1.26.0         
## [11] BiocParallel_1.28.0          S4Vectors_0.32.0            
## [13] BiocGenerics_0.40.0          BiocStyle_2.22.0            
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-7                  fs_1.5.0                     
##  [3] bit64_4.0.5                   filelock_1.0.2               
##  [5] httr_1.4.2                    GenomeInfoDb_1.30.0          
##  [7] tools_4.1.1                   bslib_0.3.1                  
##  [9] utf8_1.2.2                    R6_2.5.1                     
## [11] DBI_1.1.1                     withr_2.4.2                  
## [13] tidyselect_1.1.1              bit_4.0.4                    
## [15] curl_4.3.2                    compiler_4.1.1               
## [17] Biobase_2.54.0                bookdown_0.24                
## [19] sass_0.4.0                    rappdirs_0.3.3               
## [21] stringr_1.4.0                 digest_0.6.28                
## [23] rmarkdown_2.11                XVector_0.34.0               
## [25] pkgconfig_2.0.3               htmltools_0.5.2              
## [27] fastmap_1.1.0                 highr_0.9                    
## [29] rlang_0.4.12                  RSQLite_2.2.8                
## [31] shiny_1.7.1                   jquerylib_0.1.4              
## [33] generics_0.1.1                jsonlite_1.7.2               
## [35] dplyr_1.0.7                   RCurl_1.98-1.5               
## [37] magrittr_2.0.1                GenomeInfoDbData_1.2.7       
## [39] fansi_0.5.0                   MsCoreUtils_1.6.0            
## [41] lifecycle_1.0.1               stringi_1.7.5                
## [43] yaml_2.2.1                    MASS_7.3-54                  
## [45] zlibbioc_1.40.0               grid_4.1.1                   
## [47] blob_1.2.2                    parallel_4.1.1               
## [49] promises_1.2.0.1              crayon_1.4.1                 
## [51] lattice_0.20-45               Biostrings_2.62.0            
## [53] KEGGREST_1.34.0               magick_2.7.3                 
## [55] knitr_1.36                    pillar_1.6.4                 
## [57] codetools_0.2-18              rawrr_1.2.0                  
## [59] glue_1.4.2                    BiocVersion_3.14.0           
## [61] evaluate_0.14                 BiocManager_1.30.16          
## [63] png_0.1-7                     vctrs_0.3.8                  
## [65] httpuv_1.6.3                  purrr_0.3.4                  
## [67] clue_0.3-60                   assertthat_0.2.1             
## [69] cachem_1.0.6                  xfun_0.27                    
## [71] mime_0.12                     protViz_0.7.0                
## [73] xtable_1.8-4                  later_1.3.0                  
## [75] ncdf4_1.17                    tibble_3.1.5                 
## [77] AnnotationDbi_1.56.0          memoise_2.0.0                
## [79] IRanges_2.28.0                cluster_2.1.2                
## [81] ellipsis_0.3.2                interactiveDisplayBase_1.32.0

References

Kockmann, Tobias, and Christian Panse. 2021. “The Rawrr R Package: Direct Access to Orbitrap Data and Beyond.” Journal of Proteome Research. https://doi.org/10.1021/acs.jproteome.0c00866.

Shofstahl, Jim. 2016. “New Rawfilereader from Thermo Fisher Scientific.” 2016. https://planetorbitrap.com/rawfilereader.