Contents

library(COTAN)
library(data.table)
library(Matrix)
library(ggrepel)
#> Loading required package: ggplot2
library(factoextra)
#> Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(Rtsne)
library(utils)
library(plotly)
#> 
#> Attaching package: 'plotly'
#> The following object is masked from 'package:ggplot2':
#> 
#>     last_plot
#> The following object is masked from 'package:stats':
#> 
#>     filter
#> The following object is masked from 'package:graphics':
#> 
#>     layout
library(tidyverse)
#> ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
#> ✔ tibble  3.1.6     ✔ dplyr   1.0.8
#> ✔ tidyr   1.2.0     ✔ stringr 1.4.0
#> ✔ readr   2.1.2     ✔ forcats 0.5.1
#> ✔ purrr   0.3.4
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::between()   masks data.table::between()
#> ✖ tidyr::expand()    masks Matrix::expand()
#> ✖ dplyr::filter()    masks plotly::filter(), stats::filter()
#> ✖ dplyr::first()     masks data.table::first()
#> ✖ dplyr::lag()       masks stats::lag()
#> ✖ dplyr::last()      masks data.table::last()
#> ✖ tidyr::pack()      masks Matrix::pack()
#> ✖ purrr::transpose() masks data.table::transpose()
#> ✖ tidyr::unpack()    masks Matrix::unpack()
library(htmlwidgets)
library(MASS)
#> 
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#> 
#>     select
#> The following object is masked from 'package:plotly':
#> 
#>     select
library(dendextend)
#> 
#> ---------------------
#> Welcome to dendextend version 1.15.2
#> Type citation('dendextend') for how to cite the package.
#> 
#> Type browseVignettes(package = 'dendextend') for the package vignette.
#> The github page is: https://github.com/talgalili/dendextend/
#> 
#> Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
#> You may ask questions at stackoverflow, use the r and dendextend tags: 
#>   https://stackoverflow.com/questions/tagged/dendextend
#> 
#>  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
#> ---------------------
#> 
#> Attaching package: 'dendextend'
#> The following object is masked from 'package:data.table':
#> 
#>     set
#> The following object is masked from 'package:stats':
#> 
#>     cutree
mycolours <- c("A" = "#8491B4B2","B"="#E64B35FF")
my_theme = theme(axis.text.x = element_text(size = 14, angle = 0, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF" ),
                 axis.text.y = element_text( size = 14, angle = 0, hjust = 0, vjust = .5, face = "plain", colour ="#3C5488FF"),
                 axis.title.x = element_text( size = 14, angle = 0, hjust = .5, vjust = 0, face = "plain", colour ="#3C5488FF"),
                 axis.title.y = element_text( size = 14, angle = 90, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF"))
data_dir = tempdir()

This is just an example on a toy dataset. If the user want to try on a real dataset it can be used the previous command to download it and the next commented line to use it as input data. There are also available online many other examples at link.

data("ERCCraw", package = "COTAN")

ERCCraw = as.data.frame(ERCCraw)
rownames(ERCCraw) = ERCCraw$V1
ERCCraw = ERCCraw[,2:ncol(ERCCraw)]
ERCCraw[1:5,1:5]
#>            AAACCGTGAAGCCT.1 AAACGCACTGCTGA.1 AAACTTGACCGAAT.1 AAAGACGAGGAAGC.1
#> ERCC-00002             1662             4036             3919             4124
#> ERCC-00003               95              283              298              290
#> ERCC-00004               25               75               70               75
#> ERCC-00009               41               57               78               98
#> ERCC-00012                0                0                0                0
#>            AAAGACGATCCGAA.1
#> ERCC-00002             4220
#> ERCC-00003              312
#> ERCC-00004               87
#> ERCC-00009               87
#> ERCC-00012                0

Define a directory where the output will be stored.

out_dir <- paste0(tempdir(),"/")

1 Analytical pipeline

Inizialise the COTAN object with the row count table and the metadata for the experiment.

obj = new("scCOTAN",raw = ERCCraw)
#obj = initRaw(obj,GEO="GSM2861514" ,sc.method="Drop_seq",cond = "mouse cortex E17.5")
obj = initRaw(obj,GEO="ERCC" ,sc.method="10X",cond = "negative ERCC dataset")
#> [1] "Initializing S4 object"

Now we can start the cleaning. Analysis requires and starts from a matrix of raw UMI counts after removing possible cell doublets or multiplets and low quality or dying cells (with too high mtRNA percentage, easily done with Seurat or other tools).

If we do not want to consider the mitochondrial genes we can remove them before starting the analysis.

genes_to_rem = get.genes(obj)[grep('^mt', get.genes(obj))] 
cells_to_rem = names(get.cell.size(obj)[which(get.cell.size(obj) == 0)])
obj = drop.genes.cells(obj,genes_to_rem,cells_to_rem )

We want also to define a prefix to identify the sample.

#t = "E17.5_cortex"
t = "ERCC"

print(paste("Condition ",t,sep = ""))
#> [1] "Condition ERCC"
#--------------------------------------
n_cells = length(get.cell.size(object = obj))
print(paste("n cells", n_cells, sep = " "))
#> [1] "n cells 1015"

n_it = 1

1.1 Data cleaning

First we create the directory to store all information regarding the data cleaning.

if(!file.exists(out_dir)){
  dir.create(file.path(out_dir))
}

if(!file.exists(paste(out_dir,"cleaning", sep = ""))){   
  dir.create(file.path(out_dir, "cleaning"))
}
ttm = clean(obj)
#> [1] "Start estimation mu with linear method"
#> [1]   88 1015
#>       rowname AACCTACTAGTGTC.1 AACGTTCTTGCCCT.1 AACTGTCTTGTCGA.1
#> 1  ERCC-00002       3868.72569       3666.17330       3583.62138
#> 50 ERCC-00096       2155.29007       2103.20280       2365.56635
#> 39 ERCC-00074       1193.60684       1170.86640       1098.13070
#> 65 ERCC-00130        738.75666        754.80039        639.59647
#> 59 ERCC-00113        565.81363        774.40559        834.76745
#> 88 ERCC-00171        452.85084        417.15519        493.80610
#> 25 ERCC-00046        397.86895        393.19327        399.74779
#> 2  ERCC-00003        256.91538        281.00794        249.25451
#> 73 ERCC-00145        156.94830        144.86068        101.11268
#> 23 ERCC-00043        126.95818        123.07712        122.27580
#> 57 ERCC-00111         74.97530         79.51000         61.13790
#> 4  ERCC-00009         55.98156         81.68835         84.65247
#> 3  ERCC-00004         56.98123         69.70739         54.08352
#> 22 ERCC-00042         70.97662         68.61822         56.43498
#> 40 ERCC-00076         58.98057         64.26150         47.02915
#>    AAGAGATGGACGAG.1 AAGTAACTGAATAG.1 AATGTAACGAGATA.1 AGCAAGCTCCCACT.1
#> 1        3843.20177       3596.41754       3658.35653       3794.25176
#> 50       2149.36255       2231.89998       2293.19618       2171.19647
#> 39        972.96247       1259.40899       1138.91426       1177.10994
#> 65        782.79254        725.34382        713.74237        712.58352
#> 59        773.94742        595.61523        676.17698        552.78643
#> 88        411.29777        465.88663        448.22338        428.29336
#> 25        327.26920        438.42583        414.07303        432.93862
#> 2         393.60755        241.46563        275.76410        290.79354
#> 73        172.47971        155.29554        149.40779        194.17204
#> 23        110.56392        145.82630        122.94127        146.79035
#> 57         79.60602         66.28468         68.30071         67.82086
#> 4          48.64812         73.86008         74.27702         47.38169
#> 3          75.18346         64.39084         60.61688         79.89854
#> 22         48.64812         88.06394         58.90936         65.96275
#> 40         53.07068         57.76237         45.24922         75.25328
#>    AGTATCCTTCTATC.1 AGTCTACTCATTCT.1 AGTTGTCTAGAACA.1 AGTTTGCTCATGCA.1
#> 1        3662.89484       3742.58314       3732.19890       3598.95146
#> 50       2126.18440       2230.92416       2160.79985       2151.40591
#> 39       1190.99693       1136.99996       1138.43168       1184.35545
#> 65        747.03776        730.28456        692.34409        780.91273
#> 59        601.52296        610.07311        626.74297        613.82165
#> 88        462.49608        433.76298        427.92113        444.99905
#> 25        425.42225        429.75593        446.08759        399.97969
#> 2         280.83430        254.44757        277.54318        304.74643
#> 73        160.34433        144.25374        157.44268        174.01714
#> 23        185.36917        125.22026        158.45193        170.55411
#> 57         76.92821         70.12335         73.67510         91.77023
#> 4          69.51344         75.13216         94.86931         83.11266
#> 3          54.68391         74.13039         62.57337         75.32085
#> 22         62.09867         77.13568         52.48089         58.87147
#> 40         69.51344         64.11277         68.62886         48.48239
#>    ATACTCTGCCAAGT.1 ATAGATTGGTGTAC.1 ATCAGGTGCGGAGA.1 ATCCCGTGCCAACA.1
#> 1        3861.52507       3712.95257       3684.84776       3823.53240
#> 50       2036.19303       2217.69942       2203.66257       2158.99362
#> 39       1060.69470       1164.70423       1146.58652       1190.84351
#> 65        613.41380        693.14552        739.52699        709.59930
#> 59        609.15398        630.88146        613.78609        681.29082
#> 88        472.83980        444.08927        511.48841        420.85278
#> 25        472.83980        444.08927        425.17474        368.01028
#> 2         268.36854        248.14060        269.59702        279.31036
#> 73        236.41990        137.34720        136.39691        149.09134
#> 23        155.48336        165.73228        157.70893        143.42965
#> 57         63.89727         59.51712         67.13285         67.94036
#> 4         112.88518         75.99878         77.78886         79.26375
#> 3          83.06645         70.50489         83.11687         54.72973
#> 22         61.76736         64.09536         59.67365         56.61697
#> 40         57.50754         83.32397         56.47685         54.72973
#>    CAAAGCACGCTTCC.1 CAAGGTTGGGAACG.1 CAGACTGAACACCA.1 CAGCGTCTTTCACT.1
#> 1        3655.08341       3657.20427       3715.69806       3711.63105
#> 50       2194.86149       2163.14214       2182.81283       2206.38856
#> 39       1251.90632       1259.28094       1188.74807       1247.32477
#> 65        697.40441        708.98352        672.54233        677.30494
#> 59        590.73072        649.59223        592.89916        593.86097
#> 88        474.99983        455.64255        448.36156        438.89360
#> 25        416.63120        405.53115        439.51231        434.55885
#> 2         258.63338        267.26080        272.35998        252.49928
#> 73        174.09951        152.19018        149.45385        146.29787
#> 23        139.88342        136.41437        163.21934        153.88368
#> 57         70.44489         79.80705         81.60967         82.36028
#> 4          71.45125         64.95922         76.69342         59.60283
#> 3          68.43218        101.15079         68.82743         92.11347
#> 22         61.38769         58.46330         65.87768         63.93759
#> 40         71.45125         63.10325         54.07870         62.85390
#>    CATTACACACCCAA.1 CATTTGACGGTCTA.1 CCACGGGATTCGGA.1 CCCAACACATCTTC.1
#> 1        3622.15916       3742.63053       3765.13428       3790.41464
#> 50       2096.84700       2194.21820       2187.73734       2108.92219
#> 39       1400.94618       1085.22019       1158.64321       1230.87667
#> 65        697.72972        718.09048        729.85399        696.58950
#> 59        641.94792        611.56580        626.76212        666.34683
#> 88        428.87974        466.04548        455.24643        420.37311
#> 25        393.21597        443.21876        388.64725        398.19515
#> 2         269.76444        282.48063        231.72864        246.98180
#> 73        162.77312        154.08034        142.32153        177.42366
#> 23        135.33945        142.66698        121.33823        150.20526
#> 57         74.98537         80.84462        123.16286         73.59050
#> 4          64.01190         86.55130         62.03759         70.56623
#> 3          76.81428         80.84462         69.33613         85.68756
#> 22         68.58418         57.06679         68.42381         66.53387
#> 40         57.61071         55.16457         59.30064         49.39636
#>    CCTATTGAAGCCAT.1 CGACTCTGAAGTGA.1 CGCACGGACTCTAT.1 CGCATAGAAGCTAC.1
#> 1        3671.13788       3677.86374       3841.61207       3681.38464
#> 50       2091.82163       2269.64334       2155.18209       2090.32164
#> 39       1344.87229       1150.01049       1215.52270       1321.56699
#> 65        696.97098        708.44986        724.14118        649.55449
#> 59        663.34917        699.77053        558.19216        665.10234
#> 88        452.53135        432.88131        455.82101        426.70202
#> 25        408.00517        403.58859        397.63110        428.42956
#> 2         264.43097        259.29482        280.17367        281.58878
#> 73        139.03072        141.03902        133.62129        138.20308
#> 23        143.57420        120.42563        168.10420        162.38862
#> 57         62.70013         90.04799         58.18992         84.64939
#> 4          73.60450         75.94409         60.34510         79.46677
#> 3          77.23929         67.26476         64.65546         67.37400
#> 22         59.06534         60.75527         62.50028         72.55662
#> 40         60.88273         68.34968         51.72437         60.46385
#>    CTATGTACATGTGC.1 GACATTCTACTCTT.1 GATAGCACACCATG.1 GATTCTACCCATGA.1
#> 1        3751.26648       3703.12656       3685.11091       3464.21213
#> 50       2260.16572       2120.92139       2308.64506       2205.08496
#> 39       1176.65094       1186.92761       1132.93095       1327.92409
#> 65        669.47381        706.24383        706.74487        735.98383
#> 59        575.41551        629.59721        546.30803        703.01864
#> 88        446.31587        526.67175        458.27345        431.41409
#> 25        414.96311        399.65736        462.38721        404.18197
#> 2         275.71993        277.02277        267.39474        277.33763
#> 73        136.47675        140.15381        146.45004        165.54262
#> 23        165.06310        173.00236        155.50033        161.24281
#> 57         67.31624         88.69109         69.93401         73.09674
#> 4          79.30406         67.88700         65.82024         73.81337
#> 3          52.56199         84.31128         80.62980         54.46424
#> 22         75.61550         59.12739         59.23822         79.54645
#> 40         60.86126         54.74758         60.88373         53.74760
#>    GCTTAACTTCTATC.1 GGACAACTACGTAC.1 GGCGCATGCACTAG.1 GGTTGAACGTCGAT.1
#> 1        3766.93854       3645.87095       3694.27792       3618.43371
#> 50       2399.38432       2164.73588       2267.94103       2201.27042
#> 39       1177.87958       1202.34333       1137.70683       1160.88586
#> 65        648.68730        743.15693        717.37179        762.04350
#> 59        572.81744        650.80177        670.66790        636.45058
#> 88        407.80050        455.73387        460.50038        431.08919
#> 25        367.96882        387.54641        391.37862        378.47595
#> 2         265.54451        295.19126        240.99209        260.52044
#> 73        170.70718        153.63755        139.17760        168.02295
#> 23        127.08202        171.76333        142.91391        185.84357
#> 57         75.86986         72.50312         83.13293         86.55728
#> 4         106.21780         79.40817         72.85807         84.01148
#> 3          70.17962         68.18745         75.66031         75.52547
#> 22         45.52192         69.05059         56.97875         78.91987
#> 40         49.31541         57.82987         59.78098         57.70485
#>    GTCATACTAAACAG.1 GTGGAGGATAGAAG.1 GTTATAGAAAACGA.1 TAAGCTCTAAGTAG.1
#> 1        3739.83042       3795.44448       3637.07900       3751.27681
#> 50       2130.02706       2162.74744       2221.33295       2180.26749
#> 39       1114.62746       1162.03356       1210.59109       1160.36395
#> 65        716.75266        705.55203        710.96803        725.56296
#> 59        620.41494        629.32405        590.70478        634.30843
#> 88        492.28577        424.57214        464.25151        430.32772
#> 25        440.26340        426.34488        481.05300        453.58868
#> 2         274.56251        261.47971        275.01375        249.60797
#> 73        165.70088        130.29667        139.71760        155.66949
#> 23        143.54321        123.20569        162.70910        135.09249
#> 57         76.10680         78.88710         83.12313         68.88822
#> 4          60.69277         75.34161         56.59447         70.67753
#> 3          85.74057         67.36426         70.74309         70.67753
#> 22         97.30110        103.70551         65.43736         69.78287
#> 40         49.13224         52.29594         61.90020         47.41657
#>    TACATCACGACTAC.1 TCCGGACTCTGCAA.1 TCGAATCTAAGGGC.1 TGATACCTCTTCCG.1
#> 1        3636.08433       3657.77388       3803.30743       3721.74044
#> 50       2249.19502       2097.72405       2210.40930       2216.40234
#> 39       1161.45551       1196.30307       1071.87308       1123.33019
#> 65        747.44591        716.89896        680.66572        742.26785
#> 59        646.30070        808.71853        607.86256        594.75984
#> 88        378.86593        466.16090        455.23906        423.61273
#> 25        420.00974        358.44948        454.36191        430.23168
#> 2         252.00584        298.41361        273.66972        239.22773
#> 73        204.86189        143.02664        145.60633        174.92937
#> 23        138.86036        190.70219        142.09774        148.45357
#> 57        108.00250         68.86468        102.62615         83.20964
#> 4          86.57344         67.09892         70.17172         62.40723
#> 3          64.28721         60.03587         66.66314         74.69957
#> 22         70.28734         67.09892         67.54028         70.91731
#> 40         58.28707         62.68452         52.62879         71.86287
#>    TGCCACTGTGCTTT.1 TGGAACTGTGCTAG.1
#> 1        3622.12519       3810.36193
#> 50       2269.69815       2065.01775
#> 39       1207.37506       1168.46200
#> 65        726.30341        701.81208
#> 59        586.46913        797.34671
#> 88        478.98458        485.02196
#> 25        445.59132        426.23142
#> 2         264.01546        260.88303
#> 73        160.70506        124.92990
#> 23        126.26827        150.65076
#> 57         63.65590         47.76731
#> 4          79.30899         29.39527
#> 3          65.74298         36.74409
#> 22         62.61236         47.76731
#> 40         58.43821         66.13936

obj = ttm$object

ttm$pca.cell.2

Run this when B cells need to be removed.

pdf(file.path(out_dir,"cleaning",paste(t,"_",n_it,"_plots_before_cells_exlusion.pdf", sep = "")))
ttm$pca.cell.2
ggplot(ttm$D, aes(x=n,y=means)) + geom_point() +
  geom_text_repel(data=subset(ttm$D, n > (max(ttm$D$n)- 15) ), aes(n,means,label=rownames(ttm$D[ttm$D$n > (max(ttm$D$n)- 15),])),
                  nudge_y      = 0.05,
                  nudge_x      = 0.05,
                  direction    = "x",
                  angle        = 90,
                  vjust        = 0,
                  segment.size = 0.2)+
  ggtitle("B cell group genes mean expression")+my_theme +
  theme(plot.title = element_text(color = "#3C5488FF", size = 20, face = "italic",vjust = - 5,hjust = 0.02 ))
dev.off()

if (length(ttm$cl1) < length(ttm$cl2)) {
  to_rem = ttm$cl1
}else{
  to_rem = ttm$cl2
}
n_it = n_it+1
obj = drop.genes.cells(object = obj,genes = c(),cells = to_rem)

gc()

ttm = clean(obj)
#ttm = clean.sqrt(obj, cells)
obj = ttm$object

ttm$pca.cell.2

Run this only in the last iteration, instead the previous code, when B cells group has not to be removed

pdf(file.path(out_dir,"cleaning",paste(t,"_",n_it,"_plots_before_cells_exlusion.pdf", sep = "")))
ttm$pca.cell.2
ggplot(ttm$D, aes(x=n,y=means)) + geom_point() +
  geom_text_repel(data=subset(ttm$D, n > (max(ttm$D$n)- 15) ), aes(n,means,label=rownames(ttm$D[ttm$D$n > (max(ttm$D$n)- 15),])),
                  nudge_y      = 0.05,
                  nudge_x      = 0.05,
                  direction    = "x",
                  angle        = 90,
                  vjust        = 0,
                  segment.size = 0.2)+
  ggtitle(label = "B cell group genes mean expression", subtitle = " - B group NOT removed -")+my_theme +
  theme(plot.title = element_text(color = "#3C5488FF", size = 20, face = "italic",vjust = - 10,hjust = 0.02 ),
        plot.subtitle = element_text(color = "darkred",vjust = - 15,hjust = 0.01 ))

dev.off()
#> png 
#>   2

To color the pca based on nu_j (so the cells’ efficiency)

nu_est = round(get.nu(object = obj), digits = 7)

plot.nu <-ggplot(ttm$pca_cells,aes(x=PC1,y=PC2, colour = log(nu_est)))

plot.nu = plot.nu + geom_point(size = 1,alpha= 0.8)+
  scale_color_gradient2(low = "#E64B35B2",mid =  "#4DBBD5B2", high =  "#3C5488B2" ,
                        midpoint = log(mean(nu_est)),name = "ln(nu)")+
  ggtitle("Cells PCA coloured by cells efficiency") +
  my_theme +  theme(plot.title = element_text(color = "#3C5488FF", size = 20),
                    legend.title=element_text(color = "#3C5488FF", size = 14,face = "italic"),
                    legend.text = element_text(color = "#3C5488FF", size = 11),
                    legend.key.width = unit(2, "mm"),
                    legend.position="right")

pdf(file.path(out_dir,"cleaning",paste(t,"_plots_PCA_efficiency_colored.pdf", sep = "")))
plot.nu
dev.off()
#> png 
#>   2

plot.nu

The next part is use to remove the cells with efficiency too low.

nu_df = data.frame("nu"= sort(get.nu(obj)), "n"=c(1:length(get.nu(obj))))

ggplot(nu_df, aes(x = n, y=nu)) + 
    geom_point(colour = "#8491B4B2", size=1)+
    my_theme #+ ylim(0,1) + xlim(0,70)

We can zoom on the smallest values and, if we detect a clear elbow, we can decide to remove the cells.

yset = 0.4#threshold to remove low UDE cells
plot.ude <- ggplot(nu_df, aes(x = n, y=nu)) + 
    geom_point(colour = "#8491B4B2", size=1) + 
    my_theme + ylim(0,1) + xlim(0,400) +
    geom_hline(yintercept=yset, linetype="dashed", color = "darkred") +
    annotate(geom="text", x=200, y=0.25, 
             label=paste("to remove cells with nu < ",yset,sep = " "), 
             color="darkred", size=4.5)


pdf(file.path(out_dir,"cleaning",paste(t,"_plots_efficiency.pdf", sep = "")))
plot.ude
#> Warning: Removed 668 rows containing missing values (geom_point).
dev.off()
#> png 
#>   2

plot.ude
#> Warning: Removed 668 rows containing missing values (geom_point).

We also save the defined threshold in the metadata and re run the estimation

obj = add.row.to.meta(obj,c("Threshold low UDE cells:",yset)) 

to_rem = rownames(nu_df[which(nu_df$nu < yset),])

obj = drop.genes.cells(object = obj, genes = c(),cells =  to_rem)

Repeat the estimation after the cells are removed

ttm = clean(obj)
#> [1] "Start estimation mu with linear method"
#> [1]   88 1004
#>       rowname ATACTCTGCCAAGT.1
#> 1  ERCC-00002       3891.24110
#> 50 ERCC-00096       2051.86238
#> 39 ERCC-00074       1068.85718
#> 65 ERCC-00130        618.13427
#> 59 ERCC-00113        613.84167
#> 25 ERCC-00046        476.47850
#> 88 ERCC-00171        476.47850
#> 2  ERCC-00003        270.43374
#> 73 ERCC-00145        238.23925
#> 23 ERCC-00043        156.67987
#> 4  ERCC-00009        113.75388
#> 3  ERCC-00004         83.70568
#> 11 ERCC-00022         70.82789
#> 57 ERCC-00111         64.38899
#> 22 ERCC-00042         62.24269
obj = ttm$object
ttm$pca.cell.2

In case we do not want to remove anything, we can run:

pdf(file.path(out_dir,"cleaning",paste(t,"_plots_efficiency.pdf", sep = "")))
ggplot(nu_df, aes(x = n, y=nu)) + geom_point(colour = "#8491B4B2", size=1) +my_theme + #xlim(0,100)+
  annotate(geom="text", x=50, y=0.25, label="nothing to remove ", color="darkred")
dev.off()
#> png 
#>   2

Just to check again, we plot the final efficiency colored PCA

nu_est = round(get.nu(object = obj), digits = 7)
plot.nu <-ggplot(ttm$pca_cells,aes(x=PC1,y=PC2, colour = log(nu_est)))
plot.nu = plot.nu + geom_point(size = 2,alpha= 0.8)+
  scale_color_gradient2(low = "#E64B35B2",mid =  "#4DBBD5B2", high =  "#3C5488B2" ,
                        midpoint = log(mean(nu_est)),name = "ln(nu)")+
  ggtitle("Cells PCA coloured by cells efficiency: last") +
  my_theme +  theme(plot.title = element_text(color = "#3C5488FF", size = 20),
                    legend.title=element_text(color = "#3C5488FF", size = 14,face = "italic"),
                    legend.text = element_text(color = "#3C5488FF", size = 11),
                    legend.key.width = unit(2, "mm"),
                    legend.position="right")

pdf(file.path(out_dir,"cleaning",paste(t,"_plots_PCA_efficiency_colored_FINAL.pdf", sep = "")))
plot.nu
dev.off()
#> png 
#>   2

plot.nu

1.2 COTAN analysis

In this part all the contingency tables are computed and used to get the statistics (S) To storage efficiency of all the observed tables only the yes_yes is stored. Note that if will be necessary re-computing the yes-yes table, the yes-yes table should be cancelled before running cotan_analysis.

obj = cotan_analysis(obj,cores = 2)
#> [1] "cotan analysis"
#> [1] "mu estimator creation"
#> [1] "save effective constitutive genes"
#> [1] "start a minimization"
#> [1] "Final trance!"
#> [1] "a min: -0.147216796875 | a max 13.5546875 | negative a %: 26.984126984127"

COEX evaluation and storing

obj = get.coex(obj)
#> [1] "coex dataframe creation"
#> [1] "creation of observed yes/yes contingency table"
#> [1] "mu estimator creation"
#> [1] "expected contingency tables creation"
#> [1] "The distance between estimated n of zeros and observed number of zero is 0.0041370202609155 over 63"
#> [1] "Done"
#> [1] "coex estimation"
#> [1] "Cleaning RAM"

# saving the structure 
saveRDS(obj,file = paste(out_dir,t,".cotan.RDS", sep = ""))

2 Analysis of the elaborated data

2.1 GDI

The next function can directly plot the GDI for the dataset with the 1.5 threshold (in red) and the two higher quantiles (in blue).

plot_GDI(obj, cond = "ERCC")
#> [1] "GDI plot "
#> [1] "function to generate GDI dataframe"
#> [1] "Using S"
#> [1] "function to generate S "

If a more complex plot is needed, or if we want to analyze more in detail the GDI data, we can get directly the GDI dataframe.

quant.p = get.GDI(obj)
#> [1] "function to generate GDI dataframe"
#> [1] "Using S"
#> [1] "function to generate S "

head(quant.p)
#>            sum.raw.norm       GDI exp.cells
#> ERCC-00012     3.175703 1.1614957  2.390438
#> ERCC-00013     5.995188 1.3512649 27.788845
#> ERCC-00014     6.077256 1.3473405 34.860558
#> ERCC-00016     3.569692 1.1518864  3.685259
#> ERCC-00017     1.609829 0.7957451  0.498008
#> ERCC-00019     7.157739 1.4853109 66.135458

In the third column of this dataframe is reported the percentage of cells expressing the gene.

Next we can define some gene sets (in this case three) that we want to specifically label in the GDI plot.

AA=c("ERCC-00012","ERCC-00013","ERCC-00014") 
BB=c("ERCC-00016","ERCC-00017","ERCC-00019")
CC=c("ERCC-00022","ERCC-00024","ERCC-00028")

text.size = 12

quant.p$highlight = with(quant.p, ifelse(rownames(quant.p) %in% AA, "AA",
                                                 ifelse(rownames(quant.p) %in% CC,"Constitutive" ,
                                                               ifelse(rownames(quant.p) %in% BB,"BB" , "normal"))))

textdf <- quant.p[rownames(quant.p) %in% c(AA,CC,BB), ]
mycolours <- c("Con" = "#00A087FF","AA"="#E64B35FF","BB"="#F39B7FFF","normal" = "#8491B4B2")
f1 = ggplot(subset(quant.p,highlight == "normal" ), aes(x=sum.raw.norm, y=GDI)) +  geom_point(alpha = 0.1, color = "#8491B4B2", size=2.5)
GDI_plot = f1 +  geom_point(data = subset(quant.p,highlight != "normal"  ), aes(x=sum.raw.norm, y=GDI, colour=highlight),size=2.5,alpha = 0.8) +
  geom_hline(yintercept=quantile(quant.p$GDI)[4], linetype="dashed", color = "darkblue") +
  geom_hline(yintercept=quantile(quant.p$GDI)[3], linetype="dashed", color = "darkblue") +
  geom_hline(yintercept=1.5, linetype="dotted", color = "red", size= 0.5) +
  scale_color_manual("Status", values = mycolours)  +
  scale_fill_manual("Status", values = mycolours)  +
  xlab("log normalized counts")+ylab("GDI")+
  geom_label_repel(data = textdf , aes(x=sum.raw.norm, y=GDI, label = rownames(textdf),fill=highlight),
                   label.size = NA, 
                   alpha = 0.5, 
                   direction ="both",
                   na.rm=TRUE,
                   seed = 1234) +
  geom_label_repel(data =textdf , aes(x=sum.raw.norm, y=GDI, label = rownames(textdf)),
                   label.size = NA, 
                   segment.color = 'black',
                   segment.size = 0.5,
                   direction = "both",
                   alpha = 0.8, 
                   na.rm=TRUE,
                   fill = NA,
                   seed = 1234) +
  theme(axis.text.x = element_text(size = text.size, angle = 0, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF" ),
        axis.text.y = element_text( size = text.size, angle = 0, hjust = 0, vjust = .5, face = "plain", colour ="#3C5488FF"),  
        axis.title.x = element_text( size = text.size, angle = 0, hjust = .5, vjust = 0, face = "plain", colour ="#3C5488FF"),
        axis.title.y = element_text( size = text.size, angle = 90, hjust = .5, vjust = .5, face = "plain", colour ="#3C5488FF"),
        legend.title = element_blank(),
        legend.text = element_text(color = "#3C5488FF",face ="italic" ),
        legend.position = "bottom") 
legend <- cowplot::get_legend(GDI_plot)
GDI_plot =GDI_plot + theme(
        legend.position = "none") 
GDI_plot

2.2 Heatmaps

For the Gene Pair Analysis, we can plot an heatmap with the coex values between two genes sets. To do so we need to define, in a list, the different gene sets (list.genes). Then in the function parameter sets we can decide which sets need to be considered (in the example from 1 to 3). In the condition parameter we should insert an array with each file name prefix to be considered (in the example, the file names is “E17.5_cortex”).

list.genes = list("Ref.col"= BB, "AA"=AA, "Const."=CC )

plot_heatmap(df_genes = list.genes,sets = c(1:3),conditions = "ERCC",dir = out_dir)
#> [1] "plot heatmap"
#> [1] "Loading condition ERCC"
#> [1] "ERCC-00016" "ERCC-00017" "ERCC-00019"
#> [1] "Get p-values on a set of genes on columns on a set of genes on rows"
#> [1] "Using function S"
#> [1] "function to generate S "
#> [1] "Ref.col"
#> [1] "AA"
#> [1] "Const."
#> [1] "min coex: 0 max coex 0.98246092315627"

We can also plot a general heatmap of coex values based on some markers like the following one.

plot_general.heatmap(prim.markers = c("ERCC-00014","ERCC-00019"), condition = "ERCC",dir = out_dir, p_value = 0.05)
#> [1] "ploting a general heatmap"
#> NULL
#> [1] "Get p-values genome wide on columns genome wide on rows"
#> [1] "Using function S"
#> [1] "function to generate S "

plot_general.heatmap(prim.markers = c("ERCC-00014","ERCC-00019"),markers.list =c("ERCC-00084" ,"ERCC-00085" ,"ERCC-00086") ,symmetric = FALSE, condition = "ERCC",dir = out_dir, p_value = 0.05)
#> [1] "ploting a general heatmap"
#> NULL
#> [1] "Get p-values genome wide on columns genome wide on rows"
#> [1] "Using function S"
#> [1] "function to generate S "

2.3 Get data tables

Sometimes we can also be interested in the numbers present directly in the contingency tables for two specific genes. To get them we can use two functions:

  1. get.observed.ct to produce the observed data
 get.observed.ct(object = obj, g1 = "ERCC-00014",g2 = "ERCC-00019")
#>                ERCC-00014.yes ERCC-00014.no
#> ERCC-00019.yes            241           423
#> ERCC-00019.no             109           231
  1. get.expected.ct to produce the expected values
get.expected.ct(object = obj, g1 = "ERCC-00014",g2 = "ERCC-00019")
#>                ERCC-00014.yes ERCC-00014.no
#> ERCC-00019.yes       235.7119      428.2888
#> ERCC-00019.no        114.2885      225.7108

Another useful function is extract.coex. This can be used to extract the whole or a partial coex matrix from a COTAN object.

# For the whole matrix
coex <- extract.coex(object = obj)
coex[1:5,1:5]
#>              ERCC-00012  ERCC-00013   ERCC-00014   ERCC-00016   ERCC-00017
#> ERCC-00012  0.769954876  0.03079731 -0.008469118  0.035409255 -0.004367265
#> ERCC-00013  0.030797310  0.98637914  0.020020333 -0.018534811 -0.013478967
#> ERCC-00014 -0.008469118  0.02002033  0.983490318 -0.003222142  0.094474696
#> ERCC-00016  0.035409255 -0.01853481 -0.003222142  0.982460923  0.028389361
#> ERCC-00017 -0.004367265 -0.01347897  0.094474696  0.028389361  0.185954347
# For a partial matrix
coex <- extract.coex(object = obj,genes = c("ERCC-00017", "ERCC-00019", "ERCC-00024", "ERCC-00025", "ERCC-00028", "ERCC-00031"))
head(coex)
#>              ERCC-00017   ERCC-00019   ERCC-00024   ERCC-00025    ERCC-00028
#> ERCC-00012 -0.004367265 5.272805e-02 -0.011980011  0.017574171 -0.0120737279
#> ERCC-00013 -0.013478967 4.360570e-02 -0.005497781  0.068917037  0.0310017944
#> ERCC-00014  0.094474696 2.345844e-02 -0.016243015 -0.006247842 -0.0226624019
#> ERCC-00016  0.028389361 7.232674e-05 -0.004746929  0.033126881  0.0005889499
#> ERCC-00017  0.185954347 1.880656e-02 -0.010473707  0.012328879  0.0162721054
#> ERCC-00019  0.018806561 9.695314e-01 -0.010600940 -0.032637961  0.0473851891
#>              ERCC-00031
#> ERCC-00012  0.013066001
#> ERCC-00013 -0.020289792
#> ERCC-00014 -0.054916592
#> ERCC-00016 -0.012314141
#> ERCC-00017 -0.017522836
#> ERCC-00019 -0.003396497

The next few lines are just to clean after compilation of the documentation.

if (file.exists(paste(out_dir,t,".cotan.RDS", sep = ""))) {
  #Delete file if it exists
  file.remove(paste(out_dir,t,".cotan.RDS", sep = ""))
}
#> [1] TRUE
unlink(paste(out_dir,"cleaning", sep = ""),recursive = TRUE)

print(sessionInfo())
#> R version 4.2.0 RC (2022-04-19 r82224)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /home/biocbuild/bbs-3.15-bioc/R/lib/libRblas.so
#> LAPACK: /home/biocbuild/bbs-3.15-bioc/R/lib/libRlapack.so
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_GB              LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] dendextend_1.15.2 MASS_7.3-57       htmlwidgets_1.5.4 forcats_0.5.1    
#>  [5] stringr_1.4.0     dplyr_1.0.8       purrr_0.3.4       readr_2.1.2      
#>  [9] tidyr_1.2.0       tibble_3.1.6      tidyverse_1.3.1   plotly_4.10.0    
#> [13] Rtsne_0.16        factoextra_1.0.7  ggrepel_0.9.1     ggplot2_3.3.5    
#> [17] Matrix_1.4-1      data.table_1.14.2 COTAN_1.0.0       BiocStyle_2.24.0 
#> 
#> loaded via a namespace (and not attached):
#>  [1] colorspace_2.0-3      rjson_0.2.21          ellipsis_0.3.2       
#>  [4] rprojroot_2.0.3       circlize_0.4.14       GlobalOptions_0.1.2  
#>  [7] fs_1.5.2              clue_0.3-60           rstudioapi_0.13      
#> [10] farver_2.1.0          fansi_1.0.3           lubridate_1.8.0      
#> [13] xml2_1.3.3            codetools_0.2-18      doParallel_1.0.17    
#> [16] knitr_1.38            jsonlite_1.8.0        Cairo_1.5-15         
#> [19] broom_0.8.0           cluster_2.1.3         dbplyr_2.1.1         
#> [22] png_0.1-7             BiocManager_1.30.17   compiler_4.2.0       
#> [25] httr_1.4.2            basilisk_1.8.0        backports_1.4.1      
#> [28] assertthat_0.2.1      RcppZiggurat_0.1.6    fastmap_1.1.0        
#> [31] lazyeval_0.2.2        cli_3.3.0             htmltools_0.5.2      
#> [34] tools_4.2.0           gtable_0.3.0          glue_1.6.2           
#> [37] Rcpp_1.0.8.3          cellranger_1.1.0      jquerylib_0.1.4      
#> [40] vctrs_0.4.1           iterators_1.0.14      xfun_0.30            
#> [43] rvest_1.0.2           lifecycle_1.0.1       scales_1.2.0         
#> [46] basilisk.utils_1.8.0  hms_1.1.1             parallel_4.2.0       
#> [49] RColorBrewer_1.1-3    ComplexHeatmap_2.12.0 yaml_2.3.5           
#> [52] reticulate_1.24       gridExtra_2.3         sass_0.4.1           
#> [55] stringi_1.7.6         highr_0.9             S4Vectors_0.34.0     
#> [58] foreach_1.5.2         BiocGenerics_0.42.0   filelock_1.0.2       
#> [61] shape_1.4.6           rlang_1.0.2           pkgconfig_2.0.3      
#> [64] matrixStats_0.62.0    evaluate_0.15         lattice_0.20-45      
#> [67] labeling_0.4.2        Rfast_2.0.6           cowplot_1.1.1        
#> [70] tidyselect_1.1.2      here_1.0.1            magrittr_2.0.3       
#> [73] bookdown_0.26         R6_2.5.1              IRanges_2.30.0       
#> [76] magick_2.7.3          generics_0.1.2        DBI_1.1.2            
#> [79] pillar_1.7.0          haven_2.5.0           withr_2.5.0          
#> [82] dir.expiry_1.4.0      modelr_0.1.8          crayon_1.5.1         
#> [85] utf8_1.2.2            tzdb_0.3.0            rmarkdown_2.14       
#> [88] viridis_0.6.2         GetoptLong_1.0.5      grid_4.2.0           
#> [91] readxl_1.4.0          reprex_2.0.1          digest_0.6.29        
#> [94] stats4_4.2.0          munsell_0.5.0         viridisLite_0.4.0    
#> [97] bslib_0.3.1