!!!Caution work in progress!!!

1 Introduction

Function optimizes Extraction windows so we have the same number of precursor per window. To do it uses spectral library or nonredundant blib.

2 Prerequisites

specL contains a function specL::cdsw.

## Loading required package: DBI

## Loading required package: protViz

## Loading required package: RSQLite

## Loading required package: seqinr

## 
## Attaching package: 'specL'

## The following objects are masked from 'package:protViz':
## 
##     plot.psm, plot.psmSet, summary.psmSet

3 Classical Method based on `quantile`

# moves the windows start and end to regions where no peaks are observed
.makenewfromto <- function( windfrom, empty , isfrom=TRUE){
  newfrom <- NULL
  for(from in windfrom){
    idx <- which.min(abs(from - empty))
    startmass <- 0
    if(isfrom){
      if(empty[idx] < from) {
        startmass <- empty[idx]
      } else {
        startmass <- empty[idx-1]
      }
    }else{
      if(empty[idx] > from) {
        startmass <- empty[idx]
      } else {
        startmass <- empty[idx+1]
      }
    }
    newfrom <- c(newfrom, round(startmass,digits=1))
  }
  return(newfrom)
}

.cdsw_compute_breaks <- 
  function(xx, nbins){
    q <- quantile(xx, seq(0, 1, length = nbins + 1))
    q[1] <- q[1] - 0.5
    q[length(q)] <- q[length(q)] + 0.5
    q <- round(q)
  }


cdsw <-
  function(x, massrange = c(300,1250), n = 20, overlap = 1.0, FUN, ...) {
    if (class(x) == "psmSet") {
      x <- unlist(lapply(x, function(x) {
        x$pepmass
      }))
    } else if (class(x) == 'specLSet') {
      x <- unlist(lapply(x@ionlibrary, function(xx) {
        xx@q1
      }))
    }
    # x should be numeric
    if (class(x) != "numeric") {
      warning("can not compute quantils. 'x' is not numeric.")
      return (NULL)
    }
    
    x <- x[x > massrange[1] & x < massrange[2]]
    
    q <- FUN(xx=x, nbins=n)
    
    idx <- 1:n
    from <- q[idx] - overlap * 0.5
    to <- q[idx + 1] + overlap * 0.5
    width <- 0.5 * (to - from)
    mid <- from + width
    h <- hist(x, breaks = q, ...)
    data.frame(from, to, mid, width, counts = h$counts)
    
    
  }

cdsw(exampledata, 
     freq=TRUE, 
     overlap = 0, 
     main = "peptideStd", xlab='pepmass', FUN=.cdsw_compute_breaks)

##     from   to    mid width counts
## 0%   301  384  342.5  41.5    586
## 5%   384  420  402.0  18.0    591
## 10%  420  449  434.5  14.5    583
## 15%  449  476  462.5  13.5    599
## 20%  476  500  488.0  12.0    565
## 25%  500  524  512.0  12.0    604
## 30%  524  547  535.5  11.5    566
## 35%  547  572  559.5  12.5    598
## 40%  572  598  585.0  13.0    591
## 45%  598  625  611.5  13.5    577
## 50%  625  651  638.0  13.0    603
## 55%  651  682  666.5  15.5    575
## 60%  682  713  697.5  15.5    594
## 65%  713  745  729.0  16.0    574
## 70%  745  783  764.0  19.0    595
## 75%  783  825  804.0  21.0    572
## 80%  825  881  853.0  28.0    592
## 85%  881  948  914.5  33.5    583
## 90%  948 1045  996.5  48.5    592
## 95% 1045 1250 1147.5 102.5    586

4 Iterative Distribution Mixing based cdsw

4.1 Requirements

Mass range can be specified (mass_range)
Maximal window size can be specified (max_window_size). This is because windows should not be to large because of optimal collision energy (personal communication by Bernd R.).
Minimal window size can be specified (min_window_size).
target number of windows can be specified (nr_windows).
boundaries between windows are placed in regions were no precursors are observed.

4.2 Contrains and Objective Function

.cdsw_objective <- function(splits, data){
  counts <- hist(data, breaks=splits,plot=FALSE)$counts
  nbins<-length(splits)-1
  optimumN <- length(data)/(length(splits)-1)
  optimumN<-rep(optimumN,nbins)
  
  score2 <-sqrt(sum((counts - optimumN)^2))
  score1 <- sum(abs(counts - round(optimumN)))
  return(list(score1=score1,score2 = score2, counts=counts, optimumN=round(optimumN)))
}

.cdsw_hardconstrain <- function(splits, minwindow = 5, maxwindow=50){
  
  difsp<-diff(splits)
  return(sum(difsp >= minwindow) == length(difsp) & sum(difsp <= maxwindow) == length(difsp))
}

4.3 Construction Heuristic

.cdsw_compute_sampling_breaks <- function(xx, nbins=35, maxwindow=150, minwindow = 5, plot=TRUE){
  breaks <- nbins+1
  #xx <- x
  
  #xx<-xx[xx >=310 & xx<1250]
  # TODO(wew): there is something insconsitent with the nbins parameter
  qqs <- quantile(xx,probs = seq(0,1,by=1/(nbins)))
  
  plot(1:breaks, qqs, type="b" , 
       sub=".cdsw_compute_sampling_breaks")
  legend("topleft", legend = c(paste("maxwindow = ", maxwindow),
                               paste("nbins = ", breaks) ))
  
  
  # equidistant spaced bins
  unif <- seq(min(xx),max(xx),length=(breaks))
  lines(1:breaks,unif,col=2,type="b")
  
  if(!.cdsw_hardconstrain(unif,minwindow = 5, maxwindow)){
    warning("there is no way to generate bins given minwindow " , minwindow, "maxwindow", maxwindow, " breaks" , breaks, "\n")
  }else{
    .cdsw_hardconstrain(qqs,minwindow = 5, maxwindow)
  }
  
  mixeddata <- xx
  it_count <- 0
  error <- 0
  
  while(!.cdsw_hardconstrain(qqs,minwindow = 5, maxwindow)){
    it_count <- it_count + 1
    uniformdata<-runif(round(length(xx)/20), min=min(xx), max=max(xx))
    mixeddata<-c(mixeddata,uniformdata)
    
    qqs <- quantile(mixeddata,probs = seq(0,1,by=1/(nbins)))
    lines(1:breaks,qqs,type="l", col="#00DD00AA")
    error[it_count] <-.cdsw_objective(qqs, xx)$score1
    
  }
  
  lines(1:breaks,qqs,type="b", col="#FF1111AA")
  plot(error, xlab="number of iteration", sub=".cdsw_compute_sampling_breaks")
  
  
  qqs <- as.numeric(sort(round(qqs)))
  qqs[1]  <- qqs[1] - 0.5
  qqs[length(qqs)]  <- qqs[length(qqs)] + 0.5
  
  round(qqs, 1)
}

4.4 Evaluation

4.4.1 Comparizon to Classical Approach

op <- par(mfrow=c(2,2))
par(mfrow=c(3,1))
wind <- cdsw(exampledata, 
             freq=TRUE,
             plot=TRUE,
             overlap = 0, 
             n=35,
             massrange = c(350,1250),
             sub='sampling based',
             main = "peptideStd", xlab='pepmass', FUN=function(...){.cdsw_compute_sampling_breaks(...,maxwindow = 50)})

## Warning in plot.histogram(r, freq = freq1, col = col, border = border, angle
## = angle, : the AREAS in the plot are wrong -- rather use 'freq = FALSE'

readjustWindows <- function(wind ,ms1data, breaks=10000, maxbin = 5){
  res <- hist(ms1data, breaks=breaks)
  abline(v=wind$from,col=2,lty=2)
  abline(v=wind$to,col=3,lty=2)
  
  empty <- res$mids[which(res$counts < maxbin )]
  newfrom <- .makenewfromto(wind$from , empty)
  newto <- .makenewfromto(wind$to , empty , isfrom=FALSE )
  
  plot(res,xlim=c(1060,1065))
  abline(v = newfrom,lwd=0.5,col="red")
  abline(v = newto , lwd=0.5,col="green")
  plot(res,xlim=c(520,550))
  abline(v = newfrom,lwd=0.5,col="red")
  abline(v = newto , lwd=0.5,col="green")
  
  width <- (newto - newfrom) * 0.5
  mid <- (newfrom + newto)*0.5
  newCounts <- NULL
  for(i in 1:length(newfrom))
  {
    newCounts <- c(newCounts,sum(ms1data >= newfrom[i] & ms1data <= newto[i]))
  }
  data.frame(newfrom, newto, mid, width, counts =newCounts)

}
readjustWindows(wind,exampledata)

##    newfrom  newto     mid width counts
## 1    349.5  377.1  363.30 13.80    282
## 2    377.0  401.1  389.05 12.05    346
## 3    401.0  423.1  412.05 11.05    381
## 4    423.0  441.1  432.05  9.05    382
## 5    441.0  461.1  451.05 10.05    401
## 6    461.0  479.1  470.05  9.05    424
## 7    479.0  496.1  487.55  8.55    414
## 8    496.0  513.0  504.50  8.50    423
## 9    513.0  530.0  521.50  8.50    402
## 10   530.0  547.0  538.50  8.50    431
## 11   547.0  565.0  556.00  9.00    434
## 12   565.0  582.0  573.50  8.50    397
## 13   582.0  600.0  591.00  9.00    405
## 14   600.0  619.0  609.50  9.50    413
## 15   619.0  636.0  627.50  8.50    384
## 16   636.0  655.0  645.50  9.50    422
## 17   655.0  677.0  666.00 11.00    389
## 18   677.0  696.0  686.50  9.50    398
## 19   696.0  717.0  706.50 10.50    383
## 20   717.0  738.0  727.50 10.50    371
## 21   738.0  759.0  748.50 10.50    376
## 22   759.0  784.2  771.60 12.60    359
## 23   784.0  810.0  797.00 13.00    362
## 24   810.0  837.0  823.50 13.50    338
## 25   837.0  867.0  852.00 15.00    313
## 26   867.0  898.0  882.50 15.50    285
## 27   898.0  929.1  913.55 15.55    273
## 28   929.0  960.1  944.55 15.55    261
## 29   960.0  995.1  977.55 17.55    218
## 30   995.0 1031.0 1013.00 18.00    200
## 31  1031.0 1069.0 1050.00 19.00    192
## 32  1069.0 1108.0 1088.50 19.50    174
## 33  1108.0 1152.0 1130.00 22.00    121
## 34  1152.0 1201.0 1176.50 24.50    103
## 35  1201.0 1249.5 1225.25 24.25     69

cdsw(exampledata, 
     freq=TRUE,
     plot=TRUE,
     n=35,
     overlap = 0, 
     sub='quantile based',
     main = "peptideStd", xlab='pepmass', FUN=.cdsw_compute_breaks)

## Warning in plot.histogram(r, freq = freq1, col = col, border = border, angle
## = angle, : the AREAS in the plot are wrong -- rather use 'freq = FALSE'

##           from   to    mid width counts
## 0%         301  363  332.0  31.0    332
## 2.857143%  363  390  376.5  13.5    343
## 5.714286%  390  410  400.0  10.0    328
## 8.571429%  410  429  419.5   9.5    331
## 11.42857%  429  445  437.0   8.0    339
## 14.28571%  445  462  453.5   8.5    347
## 17.14286%  462  476  469.0   7.0    339
## 20%        476  489  482.5   6.5    315
## 22.85714%  489  503  496.0   7.0    333
## 25.71429%  503  517  510.0   7.0    353
## 28.57143%  517  531  524.0   7.0    338
## 31.42857%  531  544  537.5   6.5    316
## 34.28571%  544  557  550.5   6.5    337
## 37.14286%  557  572  564.5   7.5    341
## 40%        572  586  579.0   7.0    337
## 42.85714%  586  601  593.5   7.5    324
## 45.71429%  601  617  609.0   8.0    350
## 48.57143%  617  632  624.5   7.5    332
## 51.42857%  632  646  639.0   7.0    321
## 54.28571%  646  663  654.5   8.5    342
## 57.14286%  663  682  672.5   9.5    340
## 60%        682  698  690.0   8.0    336
## 62.85714%  698  716  707.0   9.0    323
## 65.71429%  716  736  726.0  10.0    350
## 68.57143%  736  754  745.0   9.0    328
## 71.42857%  754  776  765.0  11.0    335
## 74.28571%  776  800  788.0  12.0    336
## 77.14286%  800  825  812.5  12.5    327
## 80%        825  855  840.0  15.0    344
## 82.85714%  855  891  873.0  18.0    330
## 85.71429%  891  928  909.5  18.5    334
## 88.57143%  928  969  948.5  20.5    340
## 91.42857%  969 1029  999.0  30.0    337
## 94.28571% 1029 1097 1063.0  34.0    332
## 97.14286% 1097 1250 1173.5  76.5    336

4.4.2 Comparizon using different MaxWindowSize

op <-par(mfrow=c(4,3))
res <- lapply(c(75,150,300, 800), function(mws){
  cdsw(exampledata, 
       freq=TRUE,
       plot=TRUE,
       overlap = 0, 
       main = paste("max window size", mws), xlab='pepmass', 
       FUN=function(...){
         .cdsw_compute_sampling_breaks(...,maxwindow = mws)
       })
})

5 Chaning nr bins

op <-par(mfrow=c(4,3))
res <- lapply(c(20,25,30, 40), function(nbins){
  cdsw(exampledata, 
       freq=TRUE,
       plot=TRUE,
       n=nbins,
       overlap = 0, 
       main = paste("nr bins", nbins), xlab='pepmass', 
       FUN=function(...){
         .cdsw_compute_sampling_breaks(...,maxwindow = 100)
       })
})

6 Session info

Here is the output of sessionInfo() on the system on which this document was compiled:

## R version 3.4.2 (2017-09-28)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.3 LTS
## 
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.6-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.6-bioc/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] specL_1.12.0    seqinr_3.4-5    RSQLite_2.0     protViz_0.2.37 
## [5] DBI_0.7         BiocStyle_2.6.0
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.13    bookdown_0.5    digest_0.6.12   rprojroot_1.2  
##  [5] backports_1.1.1 magrittr_1.5    evaluate_0.10.1 rlang_0.1.2    
##  [9] stringi_1.1.5   blob_1.1.0      rmarkdown_1.6   tools_3.4.2    
## [13] ade4_1.7-8      bit64_0.9-7     stringr_1.2.0   bit_1.1-12     
## [17] yaml_2.1.14     compiler_3.4.2  memoise_1.1.0   htmltools_0.3.6
## [21] knitr_1.17      tibble_1.3.4

On Computing Dynamic SWATH Windows

30 October 2017

Package

Contents

1 Introduction

2 Prerequisites

3 Classical Method based on `quantile`

4 Iterative Distribution Mixing based cdsw

4.1 Requirements

4.2 Contrains and Objective Function

4.3 Construction Heuristic

4.4 Evaluation

4.4.1 Comparizon to Classical Approach

4.4.2 Comparizon using different MaxWindowSize

5 Chaning nr bins

6 Session info

On Computing Dynamic SWATH Windows

30 October 2017

Package

Contents

1 Introduction

2 Prerequisites

3 Classical Method based on quantile

4 Iterative Distribution Mixing based cdsw

4.1 Requirements

4.2 Contrains and Objective Function

4.3 Construction Heuristic

4.4 Evaluation

4.4.1 Comparizon to Classical Approach

4.4.2 Comparizon using different MaxWindowSize

5 Chaning nr bins

6 Session info

3 Classical Method based on `quantile`