## ----githubInstall, eval=FALSE------------------------------------------------
#  ## Install SPsimSeq
#  library(devtools)
#  install_github("CenterForStatistics-UGent/SPsimSeq")

## ----BiocInstall, eval=FALSE--------------------------------------------------
#  ## Install SPsimSeq
#  library(BiocManager)
#  BiocManager::install("SPsimSeq")

## ----loadSPsimSeq-------------------------------------------------------------
# load package
library(SPsimSeq) 

## ----loadData, eval=TRUE------------------------------------------------------
 # load the Zhang bulk RNA-seq data (availabl with the package) 
 data("zhang.data.sub") 
  # filter genes with sufficient expression (important step to avoid bugs) 
 zhang.counts <- zhang.data.sub$counts 
 MYCN.status  <- zhang.data.sub$MYCN.status #The grouping variable

## ----simulateData-------------------------------------------------------------
 set.seed(6452) #Set seed for reproducibility
 # simulate data
 sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts,
                          group = MYCN.status, n.genes = 3000, batch.config = 1,
                          group.config = c(0.5, 0.5), tot.samples = ncol(zhang.counts), 
                          pDE = 0.1, lfc.thrld = 0.5, result.format = "list", return.details = TRUE)

## ----dataExploration----------------------------------------------------------
 sim.data.bulk1 <- sim.data.bulk$sim.data.list[[1]]
 head(sim.data.bulk1$counts[, seq_len(5)])  # count data
 head(sim.data.bulk1$colData)        # sample info
 head(sim.data.bulk1$rowData)        # gene info

## ----evaluateDensities--------------------------------------------------------
geneDens = evaluateDensities(sim.data.bulk, newData = rownames(zhang.counts)[1])
#This returns for every sample, the midpoints (mids) and associated densities (gy)

## ----comparison, warning=FALSE, fig.width=8, fig.height=4---------------------
# compare the distributions of the mean expressions, variability, 
# and fraction of zero counts per gene
library(LSD) # for generating heatmap plots
# normalize counts for comparison  
Y0.log.cpm <- log2(edgeR::cpm(zhang.counts)+1)
Y1.log.cpm <- log2(edgeR::cpm(sim.data.bulk1$counts)+1)
Y0.log.cpm <- Y0.log.cpm[rowMeans(Y0.log.cpm>0)>=0.1, ]
Y1.log.cpm <- Y1.log.cpm[rowMeans(Y1.log.cpm>0)>=0.1, ]
rowVars <- function(X){apply(X, 1, var, na.rm=TRUE)}
rowCVs <- function(X){apply(X, 1, function(x) sd(x, na.rm=TRUE)/mean(x, na.rm=TRUE))}
par(mfrow=c(1, 3))
boxplot(list(real.data=log(colSums(zhang.counts)), 
             simulated.data=log(sim.data.bulk1$colData$sim.Lib.Size)), 
        main="library size") 
boxplot(list(real.data=rowMeans(Y0.log.cpm), 
             simulated.data=rowMeans(Y1.log.cpm)), 
        main="mean expression of genes") 
boxplot(list(real.data=rowVars(Y0.log.cpm), 
             simulated.data=rowVars(Y1.log.cpm)), 
        main="variance of gene expressions") 

## ----meanvariancetrend--------------------------------------------------------
# compare the relationship between the mean and variability
par(mfrow=c(1,3), mar=c(4,4,4,1))
heatscatter(rowMeans(Y0.log.cpm), rowCVs(Y0.log.cpm), ylim=c(0, 6), xlim=c(0, 16),
            colpal="bl2gr2rd", main="real data", xlab="mean log2-CPM", 
            ylab="coefficients of variation", cexplot=0.5, alpha = 60, cex.lab=1.25)
heatscatter(rowMeans(Y1.log.cpm), rowCVs(Y1.log.cpm), ylim=c(0, 6), xlim=c(0, 16),
     main="SPsimSeq", xlab="mean log2-CPM", ylab="coefficients of variation", 
     cexplot=0.5, alpha = 60, colpal="bl2gr2rd", cex.lab=1.25)

n.gride <- 1000
min.g   <- seq(0, 20, length.out = n.gride+1)[-n.gride]
max.g   <- seq(0, 20, length.out = n.gride+1)[-1] 
mid.g   <- (min.g+max.g)/2
f.real  <- vapply(seq_len(n.gride), FUN.VALUE = double(1), function(r){
  x <- Y0.log.cpm[rowMeans(Y0.log.cpm)<=max.g[r] & rowMeans(Y0.log.cpm)>min.g[r],]
  y <- ifelse(!is.null(dim(x)), mean(rowCVs(x)), mean(sd(x)/mean(x))) 
  y
})
f.SPsim <- vapply(seq_len(n.gride), FUN.VALUE = double(1), function(r){
  x <- Y1.log.cpm[rowMeans(Y1.log.cpm)<=max.g[r] & rowMeans(Y1.log.cpm)>min.g[r],]
  y <- ifelse(!is.null(dim(x)), mean(rowCVs(x)), mean(sd(x)/mean(x))) 
  y
})

sm1 <- loess(I(f.SPsim-f.real)~mid.g) 
plot(mid.g, f.SPsim-f.real, xlim=c(0, 14), col="lightskyblue", pch=20, cex.lab=1.25,
     cex.main=1.4, main="SPsimSeq - real data", ylab="difference", xlab="mean log2-CPM")
lines(mid.g,predict(sm1, newdata = mid.g), col="blue", lwd=3) 

## ----correlation--------------------------------------------------------------
# compare the correlation between genes and samples 
cor.mat.Y0 <- cor(t(Y0.log.cpm))
cor.mat.Y1 <- cor(t(Y1.log.cpm)) 
cor.vec.Y0 <- cor.mat.Y0[upper.tri(cor.mat.Y0)]
cor.vec.Y1 <- cor.mat.Y1[upper.tri(cor.mat.Y1)] 
par(mfrow=c(1,3), mar=c(4,4,3.5,1))
hist(cor.vec.Y0, nclass = 30, probability = TRUE, 
     border="gray", col="steelblue1", main="real data", xlab="Genewise correlations", 
     ylim=c(0, 3.5), xlim=c(-1, 1), cex.lab=1.25)
hist(cor.vec.Y1, nclass = 30, probability = TRUE, border="gray",
     col="steelblue1",  main="SPsimSeq", xlab="Genewise correlations",
     ylim=c(0, 3.5), xlim=c(-1, 1), cex.lab=1.25)
plot(seq(-1, 1, 0.1), seq(-1, 1, 0.1), type="n", xlab="quantile (real data)", 
     ylab="quantile (simulated data)",  main="correlation quantile-quantile plot")
abline(0, 1, col="gray")
points(quantile(cor.vec.Y0, seq(0, 1, 0.001)), quantile(cor.vec.Y1, seq(0, 1, 0.001)), 
       col="blue", pch=20, cex=1.5, cex.lab=1.25)  

## ----scRNA, eval=TRUE, warning=FALSE, fig.width=8, fig.height=4---------------
library(SingleCellExperiment)
# load the NGP nutlin data (availabl with the package, processed with SMARTer/C1 protocol, and contains read-counts)
  data("scNGP.data")
  set.seed(654321)
 # simulate data (we simulate here only a single data, n.sim = 1)
 sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data,
                         group = scNGP.data$characteristics..treatment, 
                         n.genes = 4000, batch.config = 1,
                         group.config = c(0.5, 0.5), tot.samples = 100, 
                         pDE = 0.1, lfc.thrld = 0.5, model.zero.prob = TRUE,
                         result.format = "SCE")

## ----scRNAhead----------------------------------------------------------------
 sim.data.sc1 <- sim.data.sc[[1]]
 class(sim.data.sc1)
 head(counts(sim.data.sc1)[, seq_len(5)])
 colData(sim.data.sc1)
 rowData(sim.data.sc1)

## ----basicSCrna---------------------------------------------------------------
# normalize counts for comparison  
Y0.log.cpm  <- log2(edgeR::cpm(counts(scNGP.data))+1)
Y1.log.cpm  <- log2(edgeR::cpm(counts(sim.data.sc1))+1)
Y0.log.cpm  <- Y0.log.cpm[rowMeans(Y0.log.cpm>0)>=0.1, ]
Y1.log.cpm  <- Y1.log.cpm[rowMeans(Y1.log.cpm>0)>=0.1, ]
rowVars <- function(X){apply(X, 1, var, na.rm=TRUE)}
rowCVs <- function(X){apply(X, 1, function(x) sd(x, na.rm=TRUE)/mean(x, na.rm=TRUE))}
rowZeroFrac <- function(X){apply(X, 1, function(x) mean(x==0, na.rm=TRUE))}
par(mfrow=c(1, 3))
boxplot(list(real.data=colSums(counts(scNGP.data)), 
             simulated.data=colData(sim.data.sc1)$sim.Lib.Size), 
        main="library size") 
boxplot(list(real.data=rowMeans(Y0.log.cpm), 
             simulated.data=rowMeans(Y1.log.cpm)), 
        main="mean expression of genes") 
boxplot(list(real.data=rowVars(Y0.log.cpm), 
             simulated.data=rowVars(Y1.log.cpm)), 
        main="variance of gene expressions")

## ----meanVarSCrna-------------------------------------------------------------
# compare the relationship between the mean and variability  
par(mfrow=c(1,3), mar=c(4,4,4,1))
heatscatter(rowMeans(Y0.log.cpm), rowCVs(Y0.log.cpm), ylim=c(0, 6), xlim=c(0, 16),
            colpal="bl2gr2rd", main="real data", xlab="mean log2-CPM", 
            ylab="coefficients of variation", cexplot=0.5, alpha = 60, cex.lab=1.25)
heatscatter(rowMeans(Y1.log.cpm), rowCVs(Y1.log.cpm), ylim=c(0, 6), xlim=c(0, 16),
     main="SPsimSeq", xlab="mean log2-CPM", ylab="coefficients of variation", 
     cexplot=0.5, alpha = 60, colpal="bl2gr2rd", cex.lab=1.25)
n.gride <- 1000
min.g   <- seq(0, 20, length.out = n.gride+1)[-n.gride]
max.g   <- seq(0, 20, length.out = n.gride+1)[-1] 
mid.g   <- (min.g+max.g)/2
f.real  <- vapply(seq_len(n.gride), FUN.VALUE = double(1), function(r){
  x <- Y0.log.cpm[rowMeans(Y0.log.cpm)<=max.g[r] & rowMeans(Y0.log.cpm)>min.g[r],]
  y <- ifelse(!is.null(dim(x)), mean(rowCVs(x)), mean(sd(x)/mean(x))) 
  y
})
f.SPsim <- vapply(seq_len(n.gride), FUN.VALUE = double(1), function(r){
  x <- Y1.log.cpm[rowMeans(Y1.log.cpm)<=max.g[r] & rowMeans(Y1.log.cpm)>min.g[r],]
  y <- ifelse(!is.null(dim(x)), mean(rowCVs(x)), mean(sd(x)/mean(x))) 
  y
})

sm1 <- loess(I(f.SPsim-f.real)~mid.g) 
plot(mid.g, f.SPsim-f.real, xlim=c(0, 14), col="lightskyblue", pch=20, cex.lab=1.25,
     cex.main=1.4, main="SPsimSeq - real data", ylab="difference", xlab="mean log2-CPM")
lines(mid.g,predict(sm1, newdata = mid.g), col="blue", lwd=3)

## ----scRNAmeanzeroes----------------------------------------------------------
# compare the relationship between the  mean and fraction of zeros
par(mfrow=c(1,3), mar=c(4,4,4,1))
heatscatter(rowMeans(Y0.log.cpm), rowZeroFrac(Y0.log.cpm), ylim=c(0, 1), 
            xlim=c(0, 16), colpal="bl2gr2rd", main="real data", xlab="mean log2-CPM", 
            ylab="fraction of zero counts", cexplot=0.5, alpha = 60, cex.lab=1.25)
heatscatter(rowMeans(Y1.log.cpm), rowZeroFrac(Y1.log.cpm), ylim=c(0, 1), 
            xlim=c(0, 16), main="SPsimSeq", xlab="mean log2-CPM", 
            ylab="fraction of zero counts",  cexplot=0.5, alpha = 60,
            colpal="bl2gr2rd", cex.lab=1.25)
n.gride <- 1000
min.g   <- seq(0, 20, length.out = n.gride+1)[-n.gride]
max.g   <- seq(0, 20, length.out = n.gride+1)[-1] 
mid.g   <- (min.g+max.g)/2
f.real  <- vapply(seq_len(n.gride), FUN.VALUE = double(1), function(r){
  x <- Y0.log.cpm[rowMeans(Y0.log.cpm)<=max.g[r] & rowMeans(Y0.log.cpm)>min.g[r],]
  y <- ifelse(!is.null(dim(x)), mean(rowZeroFrac(x)), mean(x==0)) 
  y
})
f.SPsim <- vapply(seq_len(n.gride), FUN.VALUE = double(1), function(r){
  x <- Y1.log.cpm[rowMeans(Y1.log.cpm)<=max.g[r] & rowMeans(Y1.log.cpm)>min.g[r],]
  y <- ifelse(!is.null(dim(x)), mean(rowZeroFrac(x)), mean(x==0)) 
  y
})
sm1 <- loess(I(f.SPsim-f.real)~mid.g) 
plot(mid.g, f.SPsim-f.real, xlim=c(0, 14), col="lightskyblue", pch=20, cex.lab=1.25,
     cex.main=1.4, main="SPsimSeq - real data", ylab="difference", xlab="mean log2-CPM")
lines(mid.g,predict(sm1, newdata = mid.g), col="blue", lwd=3)

## ----corSCrna-----------------------------------------------------------------
# compare the correlation between genes and samples
Y0.log.cpm2 <- Y0.log.cpm[rowMeans(Y0.log.cpm>0)>0.25, ]
Y1.log.cpm2 <- Y1.log.cpm[rowMeans(Y1.log.cpm>0)>0.25, ]

cor.mat.Y0 <- cor(t(Y0.log.cpm2))
cor.mat.Y1 <- cor(t(Y1.log.cpm2)) 

cor.vec.Y0 <- cor.mat.Y0[upper.tri(cor.mat.Y0)]
cor.vec.Y1 <- cor.mat.Y1[upper.tri(cor.mat.Y1)] 

par(mfrow=c(1,3), mar=c(4,4,3.5,1))
hist(cor.vec.Y0, nclass = 30, probability = TRUE, 
     border="gray", col="steelblue1", main="real data", xlab="pairwise correlation between genes", 
     ylim=c(0, 3.5), xlim=c(-1, 1), cex.lab=1.25)
hist(cor.vec.Y1, nclass = 30, probability = TRUE, border="gray",
     col="steelblue1",  main="SPsimSeq", xlab="pairwise correlation between genes",
     ylim=c(0, 3.5), xlim=c(-1, 1), cex.lab=1.25)

plot(seq(-1, 1, 0.1), seq(-1, 1, 0.1), type="n", xlab="quantile (real data)", 
     ylab="quantile (simulated data)",  main="correlation quantile-quantile plot")
abline(0, 1, col="gray")
points(quantile(cor.vec.Y0, seq(0, 1, 0.001)), quantile(cor.vec.Y1, seq(0, 1, 0.001)), 
       col="blue", pch=20, cex=1.5, cex.lab=1.25)  

## -----------------------------------------------------------------------------
sessionInfo()