!!!Caution work in progress!!!
Function optimizes Extraction windows so we have the same number of precursor per window. To do it uses spectral library or nonredundant blib.
specL contains a function specL::cdsw
.
## Loading required package: DBI
## Loading required package: RSQLite
## Loading required package: seqinr
## Loading required package: protViz
quantile
# moves the windows start and end to regions where no peaks are observed
.makenewfromto <- function( windfrom, empty , isfrom=TRUE){
newfrom <- NULL
for(from in windfrom){
idx <- which.min(abs(from - empty))
startmass <- 0
if(isfrom){
if(empty[idx] < from) {
startmass <- empty[idx]
} else {
startmass <- empty[idx-1]
}
}else{
if(empty[idx] > from) {
startmass <- empty[idx]
} else {
startmass <- empty[idx+1]
}
}
newfrom <- c(newfrom, round(startmass,digits=1))
}
return(newfrom)
}
.cdsw_compute_breaks <-
function(xx, nbins){
q <- quantile(xx, seq(0, 1, length = nbins + 1))
q[1] <- q[1] - 0.5
q[length(q)] <- q[length(q)] + 0.5
q <- round(q)
}
cdsw <-
function(x, massrange = c(300,1250), n = 20, overlap = 1.0, FUN, ...) {
if (class(x) == "psmSet") {
x <- unlist(lapply(x, function(x) {
x$pepmass
}))
} else if (class(x) == 'specLSet') {
x <- unlist(lapply(x@ionlibrary, function(xx) {
xx@q1
}))
}
# x should be numeric
if (class(x) != "numeric") {
warning("can not compute quantils. 'x' is not numeric.")
return (NULL)
}
x <- x[x > massrange[1] & x < massrange[2]]
q <- FUN(xx=x, nbins=n)
idx <- 1:n
from <- q[idx] - overlap * 0.5
to <- q[idx + 1] + overlap * 0.5
width <- 0.5 * (to - from)
mid <- from + width
h <- hist(x, breaks = q, ...)
data.frame(from, to, mid, width, counts = h$counts)
}
cdsw(exampledata,
freq=TRUE,
overlap = 0,
main = "peptideStd", xlab='pepmass', FUN=.cdsw_compute_breaks)
## from to mid width counts
## 0% 301 384 342.5 41.5 586
## 5% 384 420 402.0 18.0 591
## 10% 420 449 434.5 14.5 583
## 15% 449 476 462.5 13.5 599
## 20% 476 500 488.0 12.0 565
## 25% 500 524 512.0 12.0 604
## 30% 524 547 535.5 11.5 566
## 35% 547 572 559.5 12.5 598
## 40% 572 598 585.0 13.0 591
## 45% 598 625 611.5 13.5 577
## 50% 625 651 638.0 13.0 603
## 55% 651 682 666.5 15.5 575
## 60% 682 713 697.5 15.5 594
## 65% 713 745 729.0 16.0 574
## 70% 745 783 764.0 19.0 595
## 75% 783 825 804.0 21.0 572
## 80% 825 881 853.0 28.0 592
## 85% 881 948 914.5 33.5 583
## 90% 948 1045 996.5 48.5 592
## 95% 1045 1250 1147.5 102.5 586
.cdsw_objective <- function(splits, data){
counts <- hist(data, breaks=splits,plot=FALSE)$counts
nbins<-length(splits)-1
optimumN <- length(data)/(length(splits)-1)
optimumN<-rep(optimumN,nbins)
score2 <-sqrt(sum((counts - optimumN)^2))
score1 <- sum(abs(counts - round(optimumN)))
return(list(score1=score1,score2 = score2, counts=counts, optimumN=round(optimumN)))
}
.cdsw_hardconstrain <- function(splits, minwindow = 5, maxwindow=50){
difsp<-diff(splits)
return(sum(difsp >= minwindow) == length(difsp) & sum(difsp <= maxwindow) == length(difsp))
}
.cdsw_compute_sampling_breaks <- function(xx, nbins=35, maxwindow=150, minwindow = 5, plot=TRUE){
breaks <- nbins+1
#xx <- x
#xx<-xx[xx >=310 & xx<1250]
# TODO(wew): there is something insconsitent with the nbins parameter
qqs <- quantile(xx,probs = seq(0,1,by=1/(nbins)))
plot(1:breaks, qqs, type="b" ,
sub=".cdsw_compute_sampling_breaks")
legend("topleft", legend = c(paste("maxwindow = ", maxwindow),
paste("nbins = ", breaks) ))
# equidistant spaced bins
unif <- seq(min(xx),max(xx),length=(breaks))
lines(1:breaks,unif,col=2,type="b")
if(!.cdsw_hardconstrain(unif,minwindow = 5, maxwindow)){
warning("there is no way to generate bins given minwindow " , minwindow, "maxwindow", maxwindow, " breaks" , breaks, "\n")
}else{
.cdsw_hardconstrain(qqs,minwindow = 5, maxwindow)
}
mixeddata <- xx
it_count <- 0
error <- 0
while(!.cdsw_hardconstrain(qqs,minwindow = 5, maxwindow)){
it_count <- it_count + 1
uniformdata<-runif(round(length(xx)/20), min=min(xx), max=max(xx))
mixeddata<-c(mixeddata,uniformdata)
qqs <- quantile(mixeddata,probs = seq(0,1,by=1/(nbins)))
lines(1:breaks,qqs,type="l", col="#00DD00AA")
error[it_count] <-.cdsw_objective(qqs, xx)$score1
}
lines(1:breaks,qqs,type="b", col="#FF1111AA")
plot(error, xlab="number of iteration", sub=".cdsw_compute_sampling_breaks")
qqs <- as.numeric(sort(round(qqs)))
qqs[1] <- qqs[1] - 0.5
qqs[length(qqs)] <- qqs[length(qqs)] + 0.5
round(qqs, 1)
}
op <- par(mfrow=c(2,2))
par(mfrow=c(3,1))
wind <- cdsw(exampledata,
freq=TRUE,
plot=TRUE,
overlap = 0,
n=35,
massrange = c(350,1250),
sub='sampling based',
main = "peptideStd", xlab='pepmass', FUN=function(...){.cdsw_compute_sampling_breaks(...,maxwindow = 50)})
## Warning in plot.histogram(r, freq = freq1, col = col, border = border,
## angle = angle, : the AREAS in the plot are wrong -- rather use 'freq =
## FALSE'
readjustWindows <- function(wind ,ms1data, breaks=10000, maxbin = 5){
res <- hist(ms1data, breaks=breaks)
abline(v=wind$from,col=2,lty=2)
abline(v=wind$to,col=3,lty=2)
empty <- res$mids[which(res$counts < maxbin )]
newfrom <- .makenewfromto(wind$from , empty)
newto <- .makenewfromto(wind$to , empty , isfrom=FALSE )
plot(res,xlim=c(1060,1065))
abline(v = newfrom,lwd=0.5,col="red")
abline(v = newto , lwd=0.5,col="green")
plot(res,xlim=c(520,550))
abline(v = newfrom,lwd=0.5,col="red")
abline(v = newto , lwd=0.5,col="green")
width <- (newto - newfrom) * 0.5
mid <- (newfrom + newto)*0.5
newCounts <- NULL
for(i in 1:length(newfrom))
{
newCounts <- c(newCounts,sum(ms1data >= newfrom[i] & ms1data <= newto[i]))
}
data.frame(newfrom, newto, mid, width, counts =newCounts)
}
readjustWindows(wind,exampledata)
## newfrom newto mid width counts
## 1 349.5 379.1 364.30 14.80 305
## 2 379.0 404.1 391.55 12.55 383
## 3 404.0 424.1 414.05 10.05 338
## 4 424.0 443.1 433.55 9.55 406
## 5 443.0 463.1 453.05 10.05 403
## 6 463.0 481.1 472.05 9.05 433
## 7 481.0 498.1 489.55 8.55 404
## 8 498.0 515.0 506.50 8.50 430
## 9 515.0 532.0 523.50 8.50 418
## 10 531.9 549.0 540.45 8.55 423
## 11 548.9 566.0 557.45 8.55 413
## 12 566.0 583.0 574.50 8.50 408
## 13 582.9 602.0 592.45 9.55 415
## 14 602.0 622.0 612.00 10.00 425
## 15 622.0 639.0 630.50 8.50 393
## 16 639.0 658.0 648.50 9.50 417
## 17 658.0 679.0 668.50 10.50 361
## 18 679.0 698.2 688.60 9.60 412
## 19 698.0 720.0 709.00 11.00 385
## 20 720.0 741.0 730.50 10.50 384
## 21 741.0 762.0 751.50 10.50 372
## 22 762.0 786.0 774.00 12.00 339
## 23 785.9 811.0 798.45 12.55 344
## 24 811.0 838.0 824.50 13.50 343
## 25 837.9 865.0 851.45 13.55 282
## 26 865.0 896.0 880.50 15.50 285
## 27 896.0 927.1 911.55 15.55 281
## 28 927.0 958.1 942.55 15.55 264
## 29 958.0 993.1 975.55 17.55 221
## 30 993.0 1030.0 1011.50 18.50 206
## 31 1030.0 1068.0 1049.00 19.00 193
## 32 1068.0 1109.0 1088.50 20.50 181
## 33 1109.0 1153.0 1131.00 22.00 121
## 34 1153.0 1199.0 1176.00 23.00 98
## 35 1199.0 1249.5 1224.25 25.25 71
cdsw(exampledata,
freq=TRUE,
plot=TRUE,
n=35,
overlap = 0,
sub='quantile based',
main = "peptideStd", xlab='pepmass', FUN=.cdsw_compute_breaks)
## Warning in plot.histogram(r, freq = freq1, col = col, border = border,
## angle = angle, : the AREAS in the plot are wrong -- rather use 'freq =
## FALSE'
## from to mid width counts
## 0% 301 363 332.0 31.0 332
## 2.857143% 363 390 376.5 13.5 343
## 5.714286% 390 410 400.0 10.0 328
## 8.571429% 410 429 419.5 9.5 331
## 11.42857% 429 445 437.0 8.0 339
## 14.28571% 445 462 453.5 8.5 347
## 17.14286% 462 476 469.0 7.0 339
## 20% 476 489 482.5 6.5 315
## 22.85714% 489 503 496.0 7.0 333
## 25.71429% 503 517 510.0 7.0 353
## 28.57143% 517 531 524.0 7.0 338
## 31.42857% 531 544 537.5 6.5 316
## 34.28571% 544 557 550.5 6.5 337
## 37.14286% 557 572 564.5 7.5 341
## 40% 572 586 579.0 7.0 337
## 42.85714% 586 601 593.5 7.5 324
## 45.71429% 601 617 609.0 8.0 350
## 48.57143% 617 632 624.5 7.5 332
## 51.42857% 632 646 639.0 7.0 321
## 54.28571% 646 663 654.5 8.5 342
## 57.14286% 663 682 672.5 9.5 340
## 60% 682 698 690.0 8.0 336
## 62.85714% 698 716 707.0 9.0 323
## 65.71429% 716 736 726.0 10.0 350
## 68.57143% 736 754 745.0 9.0 328
## 71.42857% 754 776 765.0 11.0 335
## 74.28571% 776 800 788.0 12.0 336
## 77.14286% 800 825 812.5 12.5 327
## 80% 825 855 840.0 15.0 344
## 82.85714% 855 891 873.0 18.0 330
## 85.71429% 891 928 909.5 18.5 334
## 88.57143% 928 969 948.5 20.5 340
## 91.42857% 969 1029 999.0 30.0 337
## 94.28571% 1029 1097 1063.0 34.0 332
## 97.14286% 1097 1250 1173.5 76.5 336
op <-par(mfrow=c(4,3))
res <- lapply(c(75,150,300, 800), function(mws){
cdsw(exampledata,
freq=TRUE,
plot=TRUE,
overlap = 0,
main = paste("max window size", mws), xlab='pepmass',
FUN=function(...){
.cdsw_compute_sampling_breaks(...,maxwindow = mws)
})
})
op <-par(mfrow=c(4,3))
res <- lapply(c(20,25,30, 40), function(nbins){
cdsw(exampledata,
freq=TRUE,
plot=TRUE,
n=nbins,
overlap = 0,
main = paste("nr bins", nbins), xlab='pepmass',
FUN=function(...){
.cdsw_compute_sampling_breaks(...,maxwindow = 100)
})
})
Here is the output of sessionInfo()
on the system on which this document was compiled:
## R version 3.3.1 (2016-06-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.1 LTS
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] specL_1.8.0 protViz_0.2.15 seqinr_3.3-3 RSQLite_1.0.0
## [5] DBI_0.5-1 BiocStyle_2.2.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.7 digest_0.6.10 assertthat_0.1 formatR_1.4
## [5] magrittr_1.5 evaluate_0.10 stringi_1.1.2 rmarkdown_1.1
## [9] tools_3.3.1 ade4_1.7-4 stringr_1.1.0 yaml_2.1.13
## [13] htmltools_0.3.5 knitr_1.14 tibble_1.2