!!!Caution work in progress!!!
Function optimizes Extraction windows so we have the same number of precursor per window. To do it uses spectral library or nonredundant blib.
specL contains a function specL::cdsw
.
## Loading required package: DBI
## Loading required package: RSQLite
## Loading required package: seqinr
## Loading required package: protViz
quantile
# moves the windows start and end to regions where no peaks are observed
.makenewfromto <- function( windfrom, empty , isfrom=TRUE){
newfrom <- NULL
for(from in windfrom){
idx <- which.min(abs(from - empty))
startmass <- 0
if(isfrom){
if(empty[idx] < from) {
startmass <- empty[idx]
} else {
startmass <- empty[idx-1]
}
}else{
if(empty[idx] > from) {
startmass <- empty[idx]
} else {
startmass <- empty[idx+1]
}
}
newfrom <- c(newfrom, round(startmass,digits=1))
}
return(newfrom)
}
.cdsw_compute_breaks <-
function(xx, nbins){
q <- quantile(xx, seq(0, 1, length = nbins + 1))
q[1] <- q[1] - 0.5
q[length(q)] <- q[length(q)] + 0.5
q <- round(q)
}
cdsw <-
function(x, massrange = c(300,1250), n = 20, overlap = 1.0, FUN, ...) {
if (class(x) == "psmSet") {
x <- unlist(lapply(x, function(x) {
x$pepmass
}))
} else if (class(x) == 'specLSet') {
x <- unlist(lapply(x@ionlibrary, function(xx) {
xx@q1
}))
}
# x should be numeric
if (class(x) != "numeric") {
warning("can not compute quantils. 'x' is not numeric.")
return (NULL)
}
x <- x[x > massrange[1] & x < massrange[2]]
q <- FUN(xx=x, nbins=n)
idx <- 1:n
from <- q[idx] - overlap * 0.5
to <- q[idx + 1] + overlap * 0.5
width <- 0.5 * (to - from)
mid <- from + width
h <- hist(x, breaks = q, ...)
data.frame(from, to, mid, width, counts = h$counts)
}
cdsw(exampledata,
freq=TRUE,
overlap = 0,
main = "peptideStd", xlab='pepmass', FUN=.cdsw_compute_breaks)
## from to mid width counts
## 0% 301 384 342.5 41.5 586
## 5% 384 420 402.0 18.0 591
## 10% 420 449 434.5 14.5 583
## 15% 449 476 462.5 13.5 599
## 20% 476 500 488.0 12.0 565
## 25% 500 524 512.0 12.0 604
## 30% 524 547 535.5 11.5 566
## 35% 547 572 559.5 12.5 598
## 40% 572 598 585.0 13.0 591
## 45% 598 625 611.5 13.5 577
## 50% 625 651 638.0 13.0 603
## 55% 651 682 666.5 15.5 575
## 60% 682 713 697.5 15.5 594
## 65% 713 745 729.0 16.0 574
## 70% 745 783 764.0 19.0 595
## 75% 783 825 804.0 21.0 572
## 80% 825 881 853.0 28.0 592
## 85% 881 948 914.5 33.5 583
## 90% 948 1045 996.5 48.5 592
## 95% 1045 1250 1147.5 102.5 586
.cdsw_objective <- function(splits, data){
counts <- hist(data, breaks=splits,plot=FALSE)$counts
nbins<-length(splits)-1
optimumN <- length(data)/(length(splits)-1)
optimumN<-rep(optimumN,nbins)
score2 <-sqrt(sum((counts - optimumN)^2))
score1 <- sum(abs(counts - round(optimumN)))
return(list(score1=score1,score2 = score2, counts=counts, optimumN=round(optimumN)))
}
.cdsw_hardconstrain <- function(splits, minwindow = 5, maxwindow=50){
difsp<-diff(splits)
return(sum(difsp >= minwindow) == length(difsp) & sum(difsp <= maxwindow) == length(difsp))
}
.cdsw_compute_sampling_breaks <- function(xx, nbins=35, maxwindow=150, minwindow = 5, plot=TRUE){
breaks <- nbins+1
#xx <- x
#xx<-xx[xx >=310 & xx<1250]
# TODO(wew): there is something insconsitent with the nbins parameter
qqs <- quantile(xx,probs = seq(0,1,by=1/(nbins)))
plot(1:breaks, qqs, type="b" ,
sub=".cdsw_compute_sampling_breaks")
legend("topleft", legend = c(paste("maxwindow = ", maxwindow),
paste("nbins = ", breaks) ))
# equidistant spaced bins
unif <- seq(min(xx),max(xx),length=(breaks))
lines(1:breaks,unif,col=2,type="b")
if(!.cdsw_hardconstrain(unif,minwindow = 5, maxwindow)){
warning("there is no way to generate bins given minwindow " , minwindow, "maxwindow", maxwindow, " breaks" , breaks, "\n")
}else{
.cdsw_hardconstrain(qqs,minwindow = 5, maxwindow)
}
mixeddata <- xx
it_count <- 0
error <- 0
while(!.cdsw_hardconstrain(qqs,minwindow = 5, maxwindow)){
it_count <- it_count + 1
uniformdata<-runif(round(length(xx)/20), min=min(xx), max=max(xx))
mixeddata<-c(mixeddata,uniformdata)
qqs <- quantile(mixeddata,probs = seq(0,1,by=1/(nbins)))
lines(1:breaks,qqs,type="l", col="#00DD00AA")
error[it_count] <-.cdsw_objective(qqs, xx)$score1
}
lines(1:breaks,qqs,type="b", col="#FF1111AA")
plot(error, xlab="number of iteration", sub=".cdsw_compute_sampling_breaks")
qqs <- as.numeric(sort(round(qqs)))
qqs[1] <- qqs[1] - 0.5
qqs[length(qqs)] <- qqs[length(qqs)] + 0.5
round(qqs, 1)
}
op <- par(mfrow=c(2,2))
par(mfrow=c(3,1))
wind <- cdsw(exampledata,
freq=TRUE,
plot=TRUE,
overlap = 0,
n=35,
massrange = c(350,1250),
sub='sampling based',
main = "peptideStd", xlab='pepmass', FUN=function(...){.cdsw_compute_sampling_breaks(...,maxwindow = 50)})
## Warning in plot.histogram(r, freq = freq1, col = col, border = border,
## angle = angle, : the AREAS in the plot are wrong -- rather use 'freq =
## FALSE'
readjustWindows <- function(wind ,ms1data, breaks=10000, maxbin = 5){
res <- hist(ms1data, breaks=breaks)
abline(v=wind$from,col=2,lty=2)
abline(v=wind$to,col=3,lty=2)
empty <- res$mids[which(res$counts < maxbin )]
newfrom <- .makenewfromto(wind$from , empty)
newto <- .makenewfromto(wind$to , empty , isfrom=FALSE )
plot(res,xlim=c(1060,1065))
abline(v = newfrom,lwd=0.5,col="red")
abline(v = newto , lwd=0.5,col="green")
plot(res,xlim=c(520,550))
abline(v = newfrom,lwd=0.5,col="red")
abline(v = newto , lwd=0.5,col="green")
width <- (newto - newfrom) * 0.5
mid <- (newfrom + newto)*0.5
newCounts <- NULL
for(i in 1:length(newfrom))
{
newCounts <- c(newCounts,sum(ms1data >= newfrom[i] & ms1data <= newto[i]))
}
data.frame(newfrom, newto, mid, width, counts =newCounts)
}
readjustWindows(wind,exampledata)
## newfrom newto mid width counts
## 1 349.5 379.1 364.30 14.80 305
## 2 379.0 403.1 391.05 12.05 355
## 3 403.0 424.1 413.55 10.55 366
## 4 424.0 444.1 434.05 10.05 418
## 5 444.0 463.1 453.55 9.55 391
## 6 463.0 481.1 472.05 9.05 433
## 7 481.0 500.1 490.55 9.55 448
## 8 500.0 517.0 508.50 8.50 436
## 9 517.0 535.0 526.00 9.00 435
## 10 535.0 551.0 543.00 8.00 403
## 11 551.0 569.0 560.00 9.00 424
## 12 569.0 586.0 577.50 8.50 407
## 13 586.0 605.0 595.50 9.50 418
## 14 605.0 625.0 615.00 10.00 413
## 15 625.0 643.2 634.10 9.10 430
## 16 643.0 662.0 652.50 9.50 396
## 17 662.0 684.0 673.00 11.00 399
## 18 684.0 704.0 694.00 10.00 401
## 19 704.0 725.0 714.50 10.50 382
## 20 725.0 747.0 736.00 11.00 390
## 21 747.0 769.0 758.00 11.00 361
## 22 769.0 794.0 781.50 12.50 356
## 23 793.9 819.0 806.45 12.55 314
## 24 819.0 846.0 832.50 13.50 346
## 25 846.0 875.0 860.50 14.50 284
## 26 875.0 904.0 889.50 14.50 262
## 27 904.0 935.1 919.55 15.55 271
## 28 935.0 966.1 950.55 15.55 263
## 29 966.0 1001.1 983.55 17.55 203
## 30 1001.0 1038.0 1019.50 18.50 188
## 31 1038.0 1073.0 1055.50 17.50 178
## 32 1073.0 1111.0 1092.00 19.00 166
## 33 1111.0 1155.0 1133.00 22.00 123
## 34 1155.0 1199.0 1177.00 22.00 92
## 35 1199.0 1249.5 1224.25 25.25 71
cdsw(exampledata,
freq=TRUE,
plot=TRUE,
n=35,
overlap = 0,
sub='quantile based',
main = "peptideStd", xlab='pepmass', FUN=.cdsw_compute_breaks)
## Warning in plot.histogram(r, freq = freq1, col = col, border = border,
## angle = angle, : the AREAS in the plot are wrong -- rather use 'freq =
## FALSE'
## from to mid width counts
## 0% 301 363 332.0 31.0 332
## 2.857143% 363 390 376.5 13.5 343
## 5.714286% 390 410 400.0 10.0 328
## 8.571429% 410 429 419.5 9.5 331
## 11.42857% 429 445 437.0 8.0 339
## 14.28571% 445 462 453.5 8.5 347
## 17.14286% 462 476 469.0 7.0 339
## 20% 476 489 482.5 6.5 315
## 22.85714% 489 503 496.0 7.0 333
## 25.71429% 503 517 510.0 7.0 353
## 28.57143% 517 531 524.0 7.0 338
## 31.42857% 531 544 537.5 6.5 316
## 34.28571% 544 557 550.5 6.5 337
## 37.14286% 557 572 564.5 7.5 341
## 40% 572 586 579.0 7.0 337
## 42.85714% 586 601 593.5 7.5 324
## 45.71429% 601 617 609.0 8.0 350
## 48.57143% 617 632 624.5 7.5 332
## 51.42857% 632 646 639.0 7.0 321
## 54.28571% 646 663 654.5 8.5 342
## 57.14286% 663 682 672.5 9.5 340
## 60% 682 698 690.0 8.0 336
## 62.85714% 698 716 707.0 9.0 323
## 65.71429% 716 736 726.0 10.0 350
## 68.57143% 736 754 745.0 9.0 328
## 71.42857% 754 776 765.0 11.0 335
## 74.28571% 776 800 788.0 12.0 336
## 77.14286% 800 825 812.5 12.5 327
## 80% 825 855 840.0 15.0 344
## 82.85714% 855 891 873.0 18.0 330
## 85.71429% 891 928 909.5 18.5 334
## 88.57143% 928 969 948.5 20.5 340
## 91.42857% 969 1029 999.0 30.0 337
## 94.28571% 1029 1097 1063.0 34.0 332
## 97.14286% 1097 1250 1173.5 76.5 336
op <-par(mfrow=c(4,3))
res <- lapply(c(75,150,300, 800), function(mws){
cdsw(exampledata,
freq=TRUE,
plot=TRUE,
overlap = 0,
main = paste("max window size", mws), xlab='pepmass',
FUN=function(...){
.cdsw_compute_sampling_breaks(...,maxwindow = mws)
})
})
op <-par(mfrow=c(4,3))
res <- lapply(c(20,25,30, 40), function(nbins){
cdsw(exampledata,
freq=TRUE,
plot=TRUE,
n=nbins,
overlap = 0,
main = paste("nr bins", nbins), xlab='pepmass',
FUN=function(...){
.cdsw_compute_sampling_breaks(...,maxwindow = 100)
})
})
Here is the output of sessionInfo()
on the system on which this document was compiled:
## R version 3.3.1 (2016-06-21)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.9.5 (Mavericks)
##
## locale:
## [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] specL_1.8.0 protViz_0.2.15 seqinr_3.3-3 RSQLite_1.0.0
## [5] DBI_0.5-1 BiocStyle_2.2.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.7 digest_0.6.10 assertthat_0.1 formatR_1.4
## [5] magrittr_1.5 evaluate_0.10 stringi_1.1.2 rmarkdown_1.1
## [9] tools_3.3.1 ade4_1.7-4 stringr_1.1.0 yaml_2.1.13
## [13] htmltools_0.3.5 knitr_1.14 tibble_1.2