## ----echo=FALSE------------------------------------------------------------ library(specL) exampledata <- ms1.p2069 ## -------------------------------------------------------------------------- # moves the windows start and end to regions where no peaks are observed .makenewfromto <- function( windfrom, empty , isfrom=TRUE){ newfrom <- NULL for(from in windfrom){ idx <- which.min(abs(from - empty)) startmass <- 0 if(isfrom){ if(empty[idx] < from) { startmass <- empty[idx] } else { startmass <- empty[idx-1] } }else{ if(empty[idx] > from) { startmass <- empty[idx] } else { startmass <- empty[idx+1] } } newfrom <- c(newfrom, round(startmass,digits=1)) } return(newfrom) } .cdsw_compute_breaks <- function(xx, nbins){ q <- quantile(xx, seq(0, 1, length = nbins + 1)) q[1] <- q[1] - 0.5 q[length(q)] <- q[length(q)] + 0.5 q <- round(q) } cdsw <- function(x, massrange = c(300,1250), n = 20, overlap = 1.0, FUN, ...) { if (class(x) == "psmSet") { x <- unlist(lapply(x, function(x) { x$pepmass })) } else if (class(x) == 'specLSet') { x <- unlist(lapply(x@ionlibrary, function(xx) { xx@q1 })) } # x should be numeric if (class(x) != "numeric") { warning("can not compute quantils. 'x' is not numeric.") return (NULL) } x <- x[x > massrange[1] & x < massrange[2]] q <- FUN(xx=x, nbins=n) idx <- 1:n from <- q[idx] - overlap * 0.5 to <- q[idx + 1] + overlap * 0.5 width <- 0.5 * (to - from) mid <- from + width h <- hist(x, breaks = q, ...) data.frame(from, to, mid, width, counts = h$counts) } ## ----fig.retina=3, warning=FALSE------------------------------------------- cdsw(exampledata, freq=TRUE, overlap = 0, main = "peptideStd", xlab='pepmass', FUN=.cdsw_compute_breaks) ## -------------------------------------------------------------------------- .cdsw_objective <- function(splits, data){ counts <- hist(data, breaks=splits,plot=FALSE)$counts nbins<-length(splits)-1 optimumN <- length(data)/(length(splits)-1) optimumN<-rep(optimumN,nbins) score2 <-sqrt(sum((counts - optimumN)^2)) score1 <- sum(abs(counts - round(optimumN))) return(list(score1=score1,score2 = score2, counts=counts, optimumN=round(optimumN))) } .cdsw_hardconstrain <- function(splits, minwindow = 5, maxwindow=50){ difsp<-diff(splits) return(sum(difsp >= minwindow) == length(difsp) & sum(difsp <= maxwindow) == length(difsp)) } ## -------------------------------------------------------------------------- .cdsw_compute_sampling_breaks <- function(xx, nbins=35, maxwindow=150, minwindow = 5, plot=TRUE){ breaks <- nbins+1 #xx <- x #xx<-xx[xx >=310 & xx<1250] # TODO(wew): there is something insconsitent with the nbins parameter qqs <- quantile(xx,probs = seq(0,1,by=1/(nbins))) plot(1:breaks, qqs, type="b" , sub=".cdsw_compute_sampling_breaks") legend("topleft", legend = c(paste("maxwindow = ", maxwindow), paste("nbins = ", breaks) )) # equidistant spaced bins unif <- seq(min(xx),max(xx),length=(breaks)) lines(1:breaks,unif,col=2,type="b") if(!.cdsw_hardconstrain(unif,minwindow = 5, maxwindow)){ warning("there is no way to generate bins given minwindow " , minwindow, "maxwindow", maxwindow, " breaks" , breaks, "\n") }else{ .cdsw_hardconstrain(qqs,minwindow = 5, maxwindow) } mixeddata <- xx it_count <- 0 error <- 0 while(!.cdsw_hardconstrain(qqs,minwindow = 5, maxwindow)){ it_count <- it_count + 1 uniformdata<-runif(round(length(xx)/20), min=min(xx), max=max(xx)) mixeddata<-c(mixeddata,uniformdata) qqs <- quantile(mixeddata,probs = seq(0,1,by=1/(nbins))) lines(1:breaks,qqs,type="l", col="#00DD00AA") error[it_count] <-.cdsw_objective(qqs, xx)$score1 } lines(1:breaks,qqs,type="b", col="#FF1111AA") plot(error, xlab="number of iteration", sub=".cdsw_compute_sampling_breaks") qqs <- as.numeric(sort(round(qqs))) qqs[1] <- qqs[1] - 0.5 qqs[length(qqs)] <- qqs[length(qqs)] + 0.5 round(qqs, 1) } ## ----fig.height=8, fig.width=8, fig.retina=3------------------------------- op <- par(mfrow=c(2,2)) par(mfrow=c(3,1)) wind <- cdsw(exampledata, freq=TRUE, plot=TRUE, overlap = 0, n=35, massrange = c(350,1250), sub='sampling based', main = "peptideStd", xlab='pepmass', FUN=function(...){.cdsw_compute_sampling_breaks(...,maxwindow = 50)}) readjustWindows <- function(wind ,ms1data, breaks=10000, maxbin = 5){ res <- hist(ms1data, breaks=breaks) abline(v=wind$from,col=2,lty=2) abline(v=wind$to,col=3,lty=2) empty <- res$mids[which(res$counts < maxbin )] newfrom <- .makenewfromto(wind$from , empty) newto <- .makenewfromto(wind$to , empty , isfrom=FALSE ) plot(res,xlim=c(1060,1065)) abline(v = newfrom,lwd=0.5,col="red") abline(v = newto , lwd=0.5,col="green") plot(res,xlim=c(520,550)) abline(v = newfrom,lwd=0.5,col="red") abline(v = newto , lwd=0.5,col="green") width <- (newto - newfrom) * 0.5 mid <- (newfrom + newto)*0.5 newCounts <- NULL for(i in 1:length(newfrom)) { newCounts <- c(newCounts,sum(ms1data >= newfrom[i] & ms1data <= newto[i])) } data.frame(newfrom, newto, mid, width, counts =newCounts) } readjustWindows(wind,exampledata) ## ----fig.height=8, fig.width=8, fig.retina=3------------------------------- cdsw(exampledata, freq=TRUE, plot=TRUE, n=35, overlap = 0, sub='quantile based', main = "peptideStd", xlab='pepmass', FUN=.cdsw_compute_breaks) ## ----fig.height=10, fig.width=8, fig.retina=3, warning=FALSE--------------- op <-par(mfrow=c(4,3)) res <- lapply(c(75,150,300, 800), function(mws){ cdsw(exampledata, freq=TRUE, plot=TRUE, overlap = 0, main = paste("max window size", mws), xlab='pepmass', FUN=function(...){ .cdsw_compute_sampling_breaks(...,maxwindow = mws) }) }) ## ----fig.height=10, fig.width=8, fig.retina=3, warning=FALSE--------------- op <-par(mfrow=c(4,3)) res <- lapply(c(20,25,30, 40), function(nbins){ cdsw(exampledata, freq=TRUE, plot=TRUE, n=nbins, overlap = 0, main = paste("nr bins", nbins), xlab='pepmass', FUN=function(...){ .cdsw_compute_sampling_breaks(...,maxwindow = 100) }) }) ## ----sessionInfo, echo=FALSE----------------------------------------------- sessionInfo()