## ----options, include=FALSE, cache=FALSE, results='hide', message=FALSE----

knitr::opts_chunk$set(fig.align="center", cache=FALSE,error=FALSE,
                      fig.width=6,fig.height=6,autodep=TRUE,
                      out.width="600px", out.height="600px",
                      results="markup", echo=TRUE, eval=TRUE)

options(getClass.msg=FALSE)

set.seed(6473) ## for reproducibility

library(scone)
library(RColorBrewer)


## ----datain, message=FALSE-------------------------------------------------

library(scRNAseq)

## ----- Load Example Data -----
fluidigm <- ReprocessedFluidigmData(assays = "rsem_counts")

## ----showqc----------------------------------------------------------------

## ----- List all QC fields -----

# List all qc fields (accessible via colData())
metadata(fluidigm)$which_qc


## ----biocoverage-----------------------------------------------------------

# Joint distribution of "biological condition"" and "coverage type""
table(colData(fluidigm)$Coverage_Type,
      colData(fluidigm)$Biological_Condition)


## ----prefilter-------------------------------------------------------------

# Preliminary Sample Filtering: High-Coverage Only
is_select = colData(fluidigm)$Coverage_Type == "High"
fluidigm = fluidigm[,is_select]

# Retain only detected transcripts
fluidigm = fluidigm[which(apply(assay(fluidigm) > 0,1,any)),]


## ----ralign----------------------------------------------------------------

# Define a color scheme
cc <- c(brewer.pal(9, "Set1"))

# One batch per Biological Condition
batch = factor(colData(fluidigm)$Biological_Condition)

# Alignment Quality Metrics
qc = colData(fluidigm)[,metadata(fluidigm)$which_qc]

# Barplot of read proportion mapping to human transcriptome
ralign = qc$RALIGN
o = order(ralign)[order(batch[order(ralign)])] # Order by batch, then value

barplot(ralign[o], col=cc[batch][o], 
        border=cc[batch][o], main="Percentage of reads mapped")
legend("bottomleft", legend=levels(batch), fill=cc,cex=0.4)


## ----nreads----------------------------------------------------------------

# Barplot of total read number
nreads = qc$NREADS
o = order(nreads)[order(batch[order(nreads)])] # Order by batch, then value

barplot(nreads[o], col=cc[batch][o], 
        border=cc[batch][o], main="Total number of reads")
legend("topright", legend=levels(batch), fill=cc, cex=0.4)


## ----qpc-------------------------------------------------------------------

## ----- PCA of QC matrix -----
qpc = prcomp(qc,center = TRUE,scale. = TRUE)
barplot((qpc$sdev^2)/sum(qpc$sdev^2), border="gray", 
        xlab="PC", ylab="Proportion of Variance", main="Quality PCA")


## ----qpc_view--------------------------------------------------------------

# Barplot of PC1 of the QC matrix
qc1 = as.vector(qpc$x[,1])
o = order(qc1)[order(batch[order(qc1)])]

barplot(qc1[o], col=cc[batch][o], 
        border=cc[batch][o], main="Quality PC1")
legend("bottomright", legend=levels(batch), 
       fill=cc, cex=0.8)


## ----fnr_fit---------------------------------------------------------------

# Extract Housekeeping Genes
data(housekeeping)
hk = intersect(housekeeping$V1,rownames(assay(fluidigm)))

# Mean log10(x+1) expression
mu_obs = rowMeans(log10(assay(fluidigm)[hk,]+1))

# Assumed False Negatives
drop_outs = assay(fluidigm)[hk,] == 0

# Logistic Regression Model of Failure
ref.glms = list()
for (si in 1:dim(drop_outs)[2]){
  fit = glm(cbind(drop_outs[,si],1 - drop_outs[,si]) ~ mu_obs,
            family=binomial(logit))
  ref.glms[[si]] = fit$coefficients
}


## ----fnr_vis,fig.width=8,fig.height=4,out.width="800px",out.height="400px"----

par(mfrow=c(1,2))

# Plot Failure Curves and Calculate AUC
plot(NULL, main = "False Negative Rate Curves",
     ylim = c(0,1),xlim = c(0,6), 
     ylab = "Failure Probability", xlab = "Mean log10 Expression")
x = (0:60)/10
AUC = NULL
for(si in 1:ncol(assay(fluidigm))){
  y = 1/(exp(-ref.glms[[si]][1] - ref.glms[[si]][2] * x) + 1)
  AUC[si] = sum(y)/10
  lines(x, 1/(exp(-ref.glms[[si]][1] - ref.glms[[si]][2] * x) + 1),
        type = 'l', lwd = 2, col = cc[batch][si])
}

# Barplot of FNR AUC
o = order(AUC)[order(batch[order(AUC)])]

barplot(AUC[o], col=cc[batch][o], border=cc[batch][o], main="FNR AUC")
legend("topright", legend=levels(batch), fill=cc, cex=0.4)


## ----metric_sample_filter, fig.height= 10,out.height="1000px"--------------

# Initial Gene Filtering: 
# Select "common" transcripts based on proportional criteria.
num_reads = quantile(assay(fluidigm)[assay(fluidigm) > 0])[4]
num_cells = 0.25*ncol(fluidigm)
is_common = rowSums(assay(fluidigm) >= num_reads ) >= num_cells

# Metric-based Filtering
mfilt = metric_sample_filter(assay(fluidigm),
                             nreads = colData(fluidigm)$NREADS,
                             ralign = colData(fluidigm)$RALIGN,
                             gene_filter = is_common,
                             pos_controls = rownames(fluidigm) %in% hk,

                             zcut = 3, mixture = FALSE,
                             plot = TRUE)

# Simplify to a single logical
mfilt = !apply(simplify2array(mfilt[!is.na(mfilt)]),1,any)


## ----thresh,fig.width= 6, fig.height= 4, out.width="600px",out.height="400px"----

hist(qc$RALIGN, breaks = 0:100)
# Hard threshold
abline(v = 15, col = "yellow", lwd = 2)
# 3 (zcut) standard deviations below the mean ralign value
abline(v = mean(qc$RALIGN) - 3*sd(qc$RALIGN), col = "green", lwd = 2)
# 3 (zcut) MADs below the median ralign value
abline(v = median(qc$RALIGN) - 3*mad(qc$RALIGN), col = "red", lwd = 2)
# Sufficient threshold
abline(v = NULL, col = "grey", lwd = 2)

# Final threshold is the minimum of 
# 1) the sufficient threshold and 
# 2) the max of all others
thresh = min(NULL,
             max(c(15,mean(qc$RALIGN) - 3*sd(qc$RALIGN),
                   median(qc$RALIGN) - 3*mad(qc$RALIGN))))
abline(v = thresh, col = "blue", lwd = 2, lty = 2)

legend("topleft",legend = c("Hard","SD","MAD","Sufficient","Final"),
       lwd = 2, col = c("yellow","green","red","grey","blue"),
       lty = c(1,1,1,1,2), cex = .5)


## ----filterCount-----------------------------------------------------------

goodDat = fluidigm[,mfilt]

# Final Gene Filtering: Highly expressed in at least 5 cells
num_reads = quantile(assay(fluidigm)[assay(fluidigm) > 0])[4]
num_cells = 5
is_quality = rowSums(assay(fluidigm) >= num_reads ) >= num_cells


## ----scone_init------------------------------------------------------------


# Expression Data (Required)
expr = assay(goodDat)[is_quality,]

# Biological Origin - Variation to be preserved (Optional)
bio = factor(colData(goodDat)$Biological_Condition)

# Processed Alignment Metrics - Variation to be removed (Optional)
qc = colData(goodDat)[,metadata(goodDat)$which_qc]
ppq = scale(qc[,apply(qc,2,sd) > 0],center = TRUE,scale = TRUE)

# Positive Control Genes - Prior knowledge of DE (Optional)
poscon = intersect(rownames(expr),strsplit(paste0("ALS2, CDK5R1, CYFIP1,",
                                                  " DPYSL5, FEZ1, FEZ2, ",
                                                  "MAPT, MDGA1, NRCAM, ",
                                                  "NRP1, NRXN1, OPHN1, ",
                                                  "OTX2, PARD6B, PPT1, ",
                                                  "ROBO1, ROBO2, RTN1, ",
                                                  "RTN4, SEMA4F, SIAH1, ",
                                                  "SLIT2, SMARCA1, THY1, ",
                                                  "TRAPPC4, UBB, YWHAG, ",
                                                  "YWHAH"),split = ", ")[[1]])

# Negative Control Genes - Uniformly expressed transcripts (Optional)
negcon = intersect(rownames(expr),hk)

# Creating a SconeExperiment Object
my_scone <- SconeExperiment(expr,
                qc=ppq, bio = bio,
                negcon_ruv = rownames(expr) %in% negcon,
                poscon = rownames(expr) %in% poscon
)


## ----scone_in2-------------------------------------------------------------

## ----- User-defined function: Dividing by number of detected genes -----

EFF_FN = function (ei)
{
  sums = colSums(ei > 0)
  eo = t(t(ei)*sums/mean(sums))
  return(eo)
}

## ----- Scaling Argument -----

scaling=list(none=identity, # Identity - do nothing

             eff = EFF_FN, # User-defined function

             sum = SUM_FN, # SCONE library wrappers...
             tmm = TMM_FN, 
             uq = UQ_FN,
             fq = FQT_FN,
             deseq = DESEQ_FN)


## ----scone_in3, eval=FALSE-------------------------------------------------
#  
#  # Simple FNR model estimation with SCONE::estimate_ziber
#  fnr_out = estimate_ziber(x = expr, bulk_model = TRUE,
#                           pos_controls = rownames(expr) %in% hk,
#                           maxiter = 10000)
#  
#  ## ----- Imputation List Argument -----
#  imputation=list(none=impute_null, # No imputation
#                  expect=impute_expectation) # Replace zeroes
#  
#  ## ----- Imputation Function Arguments -----
#  # accessible by functions in imputation list argument
#  impute_args = list(p_nodrop = fnr_out$p_nodrop, mu = exp(fnr_out$Alpha[1,]))
#  
#  my_scone <- scone(my_scone,
#                  imputation = imputation, impute_args = impute_args,
#                  scaling=scaling,
#                  k_qc=3, k_ruv = 3,
#                  adjust_bio="no",
#                  run=FALSE)

## ----scone_params----------------------------------------------------------

my_scone <- scone(my_scone,
                scaling=scaling,
                k_qc=3, k_ruv = 3,
                adjust_bio="no",
                run=FALSE)

head(get_params(my_scone))


## ----scone_params_view-----------------------------------------------------

apply(get_params(my_scone),2,unique)


## ----scone_params_filt, eval=FALSE-----------------------------------------
#  
#  is_screened = ((get_params(my_scone)$imputation_method == "expect") &
#                   (get_params(my_scone)$scaling_method %in% c("none",
#                                                               "eff")))
#  
#  my_scone = select_methods(my_scone,
#                            rownames(get_params(my_scone))[!is_screened ])
#  

## ----scone_run-------------------------------------------------------------

BiocParallel::register(
  BiocParallel::SerialParam()
) # Register BiocParallel Serial Execution

my_scone <- scone(my_scone,
                  scaling=scaling,
                  run=TRUE,
                  eval_kclust = 2:6,
                  stratified_pam = TRUE,
                  return_norm = "in_memory",
                  zero = "postadjust")


## ----scone_view1-----------------------------------------------------------

# View Metric Scores
head(get_scores(my_scone))

# View Mean Score Rank
head(get_score_ranks(my_scone))

# Extract normalized data from top method
out_norm = get_normalized(my_scone,
                          method = rownames(get_params(my_scone))[1])


## ----biplot_color----------------------------------------------------------

pc_obj = prcomp(apply(t(get_scores(my_scone)),1,rank),
                center = TRUE,scale = FALSE)
bp_obj = biplot_color(pc_obj,y = -get_score_ranks(my_scone),expand = .6)


## ----biplot_color4---------------------------------------------------------

bp_obj = biplot_color(pc_obj,y = -get_score_ranks(my_scone),expand = .6)

points(t(bp_obj[1,]), pch = 1, col = "red", cex = 1)
points(t(bp_obj[1,]), pch = 1, col = "red", cex = 1.5)

points(t(bp_obj[rownames(bp_obj) == "none,none,no_uv,no_bio,no_batch",]),
       pch = 1, col = "blue", cex = 1)
points(t(bp_obj[rownames(bp_obj) == "none,none,no_uv,no_bio,no_batch",]),
       pch = 1, col = "blue", cex = 1.5)

arrows(bp_obj[rownames(bp_obj) == "none,none,no_uv,no_bio,no_batch",][1],
       bp_obj[rownames(bp_obj) == "none,none,no_uv,no_bio,no_batch",][2],
       bp_obj[1,][1],
       bp_obj[1,][2],
       lty = 2, lwd = 2)


## ----sconeReport, eval=FALSE-----------------------------------------------
#  
#  # Methods to consider
#  scone_methods = c(rownames(get_params(my_scone))[1:12],
#                    "none,none,no_uv,no_bio,no_batch")
#  
#  # Shiny app
#  sconeReport(my_scone,methods = scone_methods,
#              qc = ppq,
#              bio = bio,
#              negcon = negcon, poscon = poscon)
#  

## ----session---------------------------------------------------------------
sessionInfo()