## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set(echo = TRUE) library('ProteoMM') set.seed(135) ## ----eval=FALSE--------------------------------------------------------------- # source("https://bioconductor.org/biocLite.R") # biocLite("ProteoMM") # library(ProteoMM) ## ----eval=FALSE--------------------------------------------------------------- # devtools::install_github("YuliyaLab/ProteoMM") # library(ProteoMM) ## ----fig.cap="Numbers of missing values in each of the Human samples."-------- # Load data for human, then mouse data("hs_peptides") # loads variable hs_peptides dim(hs_peptides) # 695 x 13 intsCols = 8:13 # column indices that contain intensities m_logInts = make_intencities(hs_peptides, intsCols) # replace 0's with NA's, NA's are more appropriate for analysis & log2 transform m_logInts = convert_log2(m_logInts) metaCols = 1:7 # column indices for metadata such as protein IDs and sequences m_prot.info = make_meta(hs_peptides, metaCols) # m_prot.info - 2+ column data frame with peptide IDs and protein IDs head(m_prot.info) dim(m_logInts) # 695 x 6 grps = as.factor(c('CG','CG','CG', 'mCG','mCG','mCG')) # 3 samples for CG & mCG # check the number of missing values m_nummiss = sum(is.na(m_logInts)) m_nummiss m_numtot = dim(m_logInts)[1] * dim(m_logInts)[2] # total # of observations m_percmiss = m_nummiss/m_numtot # % missing observations m_percmiss # 38.29% missing values, representative of the true larger dataset # plot number of missing values for each sample par(mfcol=c(1,1)) barplot(colSums(is.na(m_logInts)), main="Numbers of missing values in Human samples (group order)") ## ----results = FALSE, fig.cap="Eigentrends for raw and residual peptide intensities in Human samples. Dots at positions 1-6 correspond to the 6 samples. The top trend in the raw data (left panel) shows a pattern representative of the differences between the two groups. Top trend in the Residual Data (right panel) shows that sample 2 and 5 have higher similarity to each other, as well as, 1, 3, 4 and 6 whereas in reality samples 1-3 are from the same treatment group and 3-6 are from the other."---- hs_m_ints_eig1 = eig_norm1(m=m_logInts,treatment=grps,prot.info=m_prot.info) ## ----------------------------------------------------------------------------- names(hs_m_ints_eig1) ## ----------------------------------------------------------------------------- hs_m_ints_eig1$h.c ## ----results = FALSE, fig.cap="Eigetrends for raw and normalized peptide intensities in Human samples. Dots at positions 1-6 correspond to the 6 samples. Top trend in the normalized data (right panel) shows a pattern representative of the differences between the two groups (eigen trends can be rotated around the x-axis)."---- hs_m_ints_norm_1bt = eig_norm2(rv=hs_m_ints_eig1) ## ----------------------------------------------------------------------------- names(hs_m_ints_eig1) # how many peptides with no missing values (complete) are in the data? dim(hs_m_ints_eig1$complete)# bias trend identification is based on 196 peptides ## ----------------------------------------------------------------------------- hs_m_ints_eig1$h.c = 2 # visually there are more than 1 bias trend, set to 2 ## ----results = FALSE, fig.cap="Eigetrends for raw and normalized peptide intensities in Human samples with the effects of two bias trends removed. Dots at positions 1-6 correspond to the 6 samples. Top trend in the Normalized Data (Figure 4, right panel) shows a pattern representative of the differences between the two groups (eigen trends can be rotated around x-axis)."---- hs_m_ints_norm = eig_norm2(rv=hs_m_ints_eig1) ## ---- results='asis', echo=FALSE---------------------------------------------- cat("\\newpage") ## ---- fig.cap="Numbers of missing values in each of the Human samples. mCG treatment group has more missing values."---- data("mm_peptides") # loads variable mm_peptides dim(mm_peptides) dim(mm_peptides) # 1102 x 13 head(mm_peptides) intsCols = 8:13 # may differ for each dataset, users need to adjust m_logInts = make_intencities(mm_peptides, intsCols) # reuse the name m_logInts m_logInts = convert_log2(m_logInts) metaCols = 1:7 m_prot.info = make_meta(mm_peptides, metaCols) head(m_prot.info) dim(m_logInts)# 1102 x 6 # check numbers of missing values in Mouse samples m_nummiss = sum(is.na(m_logInts)) m_nummiss m_numtot = dim(m_logInts)[1] * dim(m_logInts)[2] # total observations m_percmiss = m_nummiss/m_numtot # % missing observations m_percmiss # 40.8% missing values, representative of the true larger dataset # plot number of missing values for each sample par(mfcol=c(1,1)) barplot(colSums(is.na(m_logInts)), main="Numbers of missing values in Mouse samples (group order)") ## ----results = FALSE, fig.cap="Eigentrends for raw and residual peptide intensities in mouse samples. Dots at positions 1-6 correspond to the 6 samples. The top trend in the normalized data (right panel) shows a pattern representative of the differences between the two groups (eigen trends can be rotated around x-axis)."---- mm_m_ints_eig1 = eig_norm1(m=m_logInts,treatment=grps,prot.info=m_prot.info) ## ----------------------------------------------------------------------------- mm_m_ints_eig1$h.c ## ----results = FALSE, fig.cap="Eigentrends for raw and normalized peptide intensities in mouse samples with the effects of one bias trends removed. Dots at positions 1-6 correspond to the 6 samples. The top trend in the normalized data (Figure 7, right panel) shows a pattern representative of the differences between the two groups."---- mm_m_ints_norm_1bt = eig_norm2(rv=mm_m_ints_eig1) ## ----------------------------------------------------------------------------- mm_m_ints_eig1$h.c = 2 ## ----results = FALSE, fig.cap="Eigentrends for raw and normalized peptide intensities in mouse samples with the effects of two bias trends removed. Dots at positions 1-6 correspond to the 6 samples. Top trend in the Normalized Data (right panel) shows a pattern representative of the differences between the two groups."---- mm_m_ints_norm = eig_norm2(rv=mm_m_ints_eig1) # 190 petides with no missing values were used to id bais trends ($complete) ## ----------------------------------------------------------------------------- length(mm_m_ints_eig1$prot.info$MatchedID) # 1102 - correct length(hs_m_ints_eig1$prot.info$MatchedID) # 695 - can normalize all length(unique(mm_m_ints_eig1$prot.info$MatchedID) ) # 69 length(unique(hs_m_ints_eig1$prot.info$MatchedID) ) # 69 # 787 peptides were normalized, rest eliminated due to low # of observations dim(mm_m_ints_norm$norm_m) dim(hs_m_ints_norm$norm_m) # 480 peptides were normalized ## ---- results='asis', echo=FALSE---------------------------------------------- cat("\\newpage") ## ----------------------------------------------------------------------------- hs_prot.info = hs_m_ints_norm$normalized[,metaCols] hs_norm_m = hs_m_ints_norm$normalized[,intsCols] head(hs_prot.info) head(hs_norm_m) dim(hs_norm_m) # 480 x 6, raw: 695, 215 peptides were eliminated due to lack of # observations length(unique(hs_prot.info$MatchedID)) # 59 length(unique(hs_prot.info$ProtID)) # 59 ## ----warning=FALSE, message=FALSE, results = FALSE---------------------------- imp_hs = MBimpute(hs_norm_m, grps, prot.info=hs_prot.info, pr_ppos=3, my.pi=0.05, compute_pi=FALSE) # use default pi # historically pi=.05 has been representative of the % missing # observations missing completely at random ## ---- fig.cap="All peptides within protein Prot32 in raw, normalized, and imputed form."---- # check some numbers after the imputation length(unique(imp_hs$imp_prot.info$MatchedID)) # 59 - MatchedID IDs length(unique(imp_hs$imp_prot.info$ProtID)) # 59 - Protein IDs length(unique(imp_hs$imp_prot.info$GeneID)) # 59 dim(imp_hs$imp_prot.info) # 480 x 7 imputed peptides dim(imp_hs$y_imputed) # 480 x 6 # plot one of the protiens to check normalization and imputation visually mylabs = c( 'CG','CG','CG', 'mCG','mCG','mCG') # same as grps this is a string prot_to_plot = 'Prot32' # 43 gene_to_plot = 'Gene32' plot_3_pep_trends_NOfile(as.matrix(hs_m_ints_eig1$m), hs_m_ints_eig1$prot.info, as.matrix(hs_norm_m), hs_prot.info, imp_hs$y_imputed, imp_hs$imp_prot.info, prot_to_plot, 3, gene_to_plot, 4, mylabs) ## ----------------------------------------------------------------------------- mm_prot.info = mm_m_ints_norm$normalized[,1:7] mm_norm_m = mm_m_ints_norm$normalized[,8:13] head(mm_prot.info) head(mm_norm_m) dim(mm_norm_m) # 787 x 6, raw had: 1102 peptides/rows length(unique(mm_prot.info$MatchedID)) # 56 length(unique(mm_prot.info$ProtID)) # 56 ## ----warning=FALSE, results = FALSE------------------------------------------- set.seed(12131) # set random number generator seed for reproducibility, # otherwise will get various imputed values for repeated attempts # as for Human, impute based on ProtID - position in the matrix for the Protein Identifier imp_mm = MBimpute(mm_norm_m, grps, prot.info=mm_prot.info, pr_ppos=3, my.pi=0.05, compute_pi=FALSE) # pi =.05 is usually a good estimate ## ----------------------------------------------------------------------------- dim(imp_mm$imp_prot.info) # 787 x 7 - imputed peptides & 787 were normalized dim(imp_mm$y_imputed) # 787 x 6 ## ---- results='asis', echo=FALSE---------------------------------------------- cat("\\newpage") ## ----------------------------------------------------------------------------- # make parallel lists to pass as parameters mms = list() treats = list() protinfos = list() mms[[1]] = imp_mm$y_imputed mms[[2]] = imp_hs$y_imputed treats[[1]] = grps treats[[2]] = grps protinfos[[1]] = imp_mm$imp_prot.info protinfos[[2]] = imp_hs$imp_prot.info subset_data = subset_proteins(mm_list=mms, prot.info=protinfos, 'MatchedID') names(subset_data) mm_dd_only = subset_data$sub_unique_prot.info[[1]] hs_dd_only = subset_data$sub_unique_prot.info[[2]] ugene_mm_dd = unique(mm_dd_only$MatchedID) ugene_hs_dd = unique(hs_dd_only$MatchedID) length(ugene_mm_dd) # 24 - in Mouse only length(ugene_hs_dd) # 27 - Human only nsets = length(mms) nperm = 50 # number of permutations should be 500+ for publication quality ## ----warning=FALSE, results = FALSE------------------------------------------- ptm = proc.time() set.seed=(12357) comb_MBDE = prot_level_multi_part(mm_list=mms, treat=treats,prot.info=protinfos, prot_col_name='ProtID', nperm=nperm, dataset_suffix=c('MM', 'HS')) proc.time() - ptm # shows how long it takes to run the test ## ---- fig.cap="P-value distributions for unadjusted and adjusted p-values. Adjusted p-values (top right) look as expected according to the theory with a peak near 0 and an approximately uniform distribution throughout the interval [0 1]. Benjamini-Hochberg adjusted p-values (bottom left) do not look according to the theoretical distribution, thus Benjamini-Hochberg adjusted may not be appropriate."---- mybreaks = seq(0,1, by=.05) # adjustment for permutation test is done by stretching out values on the # interval [0 1] as expected in a theoretical p-value distribution par(mfcol=c(1,3)) # always check out p-values # bunched up on interval [0 .5] hist(comb_MBDE$P_val, breaks=mybreaks, xlab='unadjusted p-values', main='') # adjusted p-values look good hist(comb_MBDE$BH_P_val, breaks=mybreaks, xlab='adjusted p-values', main='') # bunched up on interval [0 .5] hist(p.adjust(comb_MBDE$P_val, method='BH'), breaks=mybreaks, xlab='BH adjusted p-values', main='') ## ---- fig.cap="Distribution of p-values and fold changes for combined multi-matrix analysis of Mouse and Human."---- # horizontal streaks correspond to where a permutation test produces 0 or # very small value, these are reset to improve visualization par(mfcol=c(1,1)) # Volcano generally look better for larger dataset... plot_volcano_wLab(comb_MBDE$FC, comb_MBDE$BH_P_val, comb_MBDE$GeneID, FC_cutoff=1.2, PV_cutoff=.05, 'CG vs mCG') ## ----warning=FALSE, fig.cap="Distribution of p-values and fold changes for differential expression analysis in Mouse."---- # subset_data contains "sub_unique_mm_list" "sub_unique_prot.info" lists # for each dataset in the order provided to subset function mms_mm_dd = subset_data$sub_unique_mm_list[[1]] # Mouse dim(mms_mm_dd) # 258 x 6, protinfos_mm_dd = subset_data$sub_unique_prot.info[[1]] length(unique(protinfos_mm_dd$ProtID)) # 24 length(unique(protinfos_mm_dd$GeneID)) # 24 length(unique(protinfos_mm_dd$MatchedID)) # 24 DE_mCG_CG_mm_dd = peptideLevel_DE(mms_mm_dd, grps, prot.info=protinfos_mm_dd, pr_ppos=2) # volcano plot FCval = 1.2 # change this value for alternative fold change cutoff plot_volcano_wLab(DE_mCG_CG_mm_dd$FC, DE_mCG_CG_mm_dd$BH_P_val, DE_mCG_CG_mm_dd$GeneID, FC_cutoff=FCval, PV_cutoff=.05, 'Mouse specific - CG vs mCG') ## ---- results='asis', echo=FALSE---------------------------------------------- cat("\\newpage") ## ----------------------------------------------------------------------------- # make data structures suitable for get_presAbs_prots() function raw_list = list() norm_imp_prot.info_list = list() raw_list[[1]] = mm_m_ints_eig1$m raw_list[[2]] = hs_m_ints_eig1$m norm_imp_prot.info_list[[1]] = mm_m_ints_eig1$prot.info norm_imp_prot.info_list[[2]] = hs_m_ints_eig1$prot.info protnames_norm_list = list() protnames_norm_list[[1]] = unique(mm_m_ints_norm$normalized$MatchedID) #56/69 protnames_norm_list[[2]] = unique(hs_m_ints_norm$normalized$MatchedID) #59 presAbs_dd = get_presAbs_prots(mm_list=raw_list, prot.info=norm_imp_prot.info_list, protnames_norm=protnames_norm_list, prot_col_name=2) ints_presAbs = list() protmeta_presAbs = list() ints_presAbs[[1]] = presAbs_dd[[1]][[1]] # Mouse ints_presAbs[[2]] = presAbs_dd[[1]][[2]] # HS protmeta_presAbs[[1]] = presAbs_dd[[2]][[1]] protmeta_presAbs[[2]] = presAbs_dd[[2]][[2]] dim(protmeta_presAbs[[2]]) # 32 x 7 peptides length(unique(protmeta_presAbs[[2]]$MatchedID)) # 10 - proteins dim(protmeta_presAbs[[1]]) # 30 x 7 peptides length(unique(protmeta_presAbs[[1]]$MatchedID)) # 13 - proteins # grps are the same for all analyses subset_presAbs = subset_proteins(mm_list=ints_presAbs, prot.info=protmeta_presAbs,'MatchedID') names(subset_presAbs) dim(subset_presAbs$sub_unique_prot.info[[1]]) dim(subset_presAbs$sub_unique_prot.info[[2]]) dim(subset_presAbs$sub_prot.info[[1]]) dim(subset_presAbs$sub_prot.info[[2]]) ## ----results = FALSE---------------------------------------------------------- nperm = 50 # set to 500+ for publication ptm = proc.time() set.seed=(123372) presAbs_comb=prot_level_multiMat_PresAbs(mm_list=subset_presAbs$sub_mm_list, treat=treats, prot.info=subset_presAbs$sub_prot.info, prot_col_name='MatchedID', nperm=nperm, dataset_suffix=c('MM', 'HS') ) proc.time() - ptm ## ---- fig.cap="Distribution of p-values and fold changes for differential expression in the combined analysis of Human and Mouse data in CG context."---- plot_volcano_wLab(presAbs_comb$FC, presAbs_comb$BH_P_val, presAbs_comb$GeneID, FC_cutoff=.5, PV_cutoff=.05, 'Combined Pres/Abs CG vs mCG') ## ----------------------------------------------------------------------------- # just checking the numbers here dim(subset_presAbs$sub_unique_mm_list[[1]]) dim(subset_presAbs$sub_unique_mm_list[[2]]) unique(subset_presAbs$sub_unique_prot.info[[1]]$ProtID)# 8 unique(subset_presAbs$sub_unique_prot.info[[2]]$ProtID)# 5 ## ---- fig.cap="Distribution of p-values and fold changes for the presence/absence analysis in Mouse data in CG context."---- mm_presAbs = peptideLevel_PresAbsDE(subset_presAbs$sub_unique_mm_list[[1]], treats[[1]], subset_presAbs$sub_unique_prot.info[[1]], pr_ppos=3) plot_volcano_wLab(mm_presAbs$FC, mm_presAbs$BH_P_val, mm_presAbs$GeneID, FC_cutoff=.5, PV_cutoff=.05, 'MM Pres/Abs CG vs mCG') ## ---- Distribution of p-values and fold changes for the presence/absence analysis in Human data in CG context.---- hs_presAbs = peptideLevel_PresAbsDE(subset_presAbs$sub_unique_mm_list[[2]], treats[[2]], subset_presAbs$sub_unique_prot.info[[2]], pr_ppos=3) plot_volcano_wLab(hs_presAbs$FC, hs_presAbs$BH_P_val, hs_presAbs$GeneID, FC_cutoff=.5, PV_cutoff=.05, 'HS Pres/Abs CG vs mCG') ## ----------------------------------------------------------------------------- sessionInfo()