## ----eval=TRUE, echo=FALSE------------------------------------------------- ## Note, this workflow requires R version 3.3.1 or later, which can be installed from https://cran.r-project.org/bin/windows/base/ ## Load required packages from: `BiocManager::install()` if (!"BiocManager" %in% rownames(installed.packages())) install.packages("BiocManager") BiocManager::install() ## You may get messages letting you know which packages are old or masked, and asking you whether to update all, some, or none of the packages ## BiocGenerics and parallel will be installed, while various packages are masked, and you are welcomed to Bioconductor ## Load the Expression Normalization Workflow from Bioconductor library(ExpressionNormalizationWorkflow) library(Biobase) ## ----eval=FALSE, echo=TRUE------------------------------------------------- # ## Set your working directory (or use Control + Shift + H to choose it) # setwd("working directory") # ## read in the file containing the gene expression values # expData_file_name <- system.file("folder containing the data", "CAD_Expression.csv", package="ExpressionNormalizationWorkflow") # exprs <- read.table(expData_file_name, header=TRUE, sep=",", row.names=1, as.is=TRUE) # ## read in the file containing the covariates # expDesign_file_name <- system.file("folder containing the data", "CAD_ExptDsgn.csv", package="ExpressionNormalizationWorkflow") # covrts <- read.table(expDesign_file_name, header=TRUE, sep=",", row.names=1, as.is=TRUE) # ## ----eval=TRUE, echo=FALSE------------------------------------------------- ## read in the file containing the gene expression values exprs_path <- system.file("extdata", "CAD_Expression.csv", package="ExpressionNormalizationWorkflow") exprs <- read.table(exprs_path, header=TRUE, sep=",", row.names=1, as.is=TRUE) ## read in the file containing the covariates covrts_path <- system.file("extdata", "CAD_ExptDsgn.csv", package="ExpressionNormalizationWorkflow") covrts <- read.table(covrts_path, row.names=1, header=TRUE, sep=",") ## Confirm that the Expression data and the Covariates/Experimental design data is correctly uploaded ## You can also view the complete Covariate matrix with - View(covrts) exprs[1:5, 1:5] covrts[1:5, 1:10] #View(covrts) ## ----eval=TRUE------------------------------------------------------------- inpData <- expSetobj(exprs, covrts) ## ----eval=TRUE------------------------------------------------------------- ## Set the covariates whose effect size on the data needs to be calculated cvrts_eff_var <- c("CAD", "BMI3", "Rin3", "Gender", "Study") ## Set a PVCA Threshold Value between 0 & 1 ## PVCA Threshold Value is the percentile value of the minimum amount of the variabilities that the selected principal components need to explain, here requiring 75% of the expression variance to be captured by the PCs pct_thrsh <- 0.75 ## Perform the PVCA analysis pvcAnaly(inpData, pct_thrsh, cvrts_eff_var) ## ----eval=TRUE------------------------------------------------------------- ## Choose a biological variable that is to be used to calculate the surrogate variables biol_var_sva <- "CAD" ## Perform SVA sur_var_obj <- surVarAnaly(inpData, biol_var_sva) ## The newly generated surrogate variables sv1 through sv4 are appended to the ExpressionSet object inpData_sv <- sur_var_obj$expSetobject ## ----eval=TRUE------------------------------------------------------------- ## Fit a generalized linear model for sv1 glm.sv1 <- glm(pData(inpData_sv)[,"sv1"]~pData(inpData_sv)[,"BMI"]+pData(inpData_sv)[,"Rin"]+pData(inpData_sv)[,"CAD"] +pData(inpData_sv)[,"Study"]) summary(glm.sv1) ## ----eval = TRUE----------------------------------------------------------- ## Fit a generalized linear model for sv2 glm.sv2 <- glm(pData(inpData_sv)[,"sv2"]~pData(inpData_sv)[,"BMI"]+pData(inpData_sv)[,"Rin"]+pData(inpData_sv)[,"CAD"] +pData(inpData_sv)[,"Study"]) summary(glm.sv2) ## Output should be similar to that shown above for sv1 ## ----eval=TRUE------------------------------------------------------------- ## First discretize the continuous surrogate variables var_names <- c("sv1", "sv2", "sv3", "sv4") pData(inpData_sv)<-conTocat(pData(inpData_sv), var_names) ## View them appended to the covariate matrix as additional covariate columns #View(pData(inpData_sv)) ## Include the surrogate variables as covariates in addition to BMI3, Rin3, CAD and Study (be sure to use categorical measures of BMI and Rin rather than the continuous measure) cvrts_eff_var <- c("BMI3", "Rin3", "CAD", "Study", "sv1_cat", "sv2_cat", "sv3_cat", "sv4_cat") ## Again set the PVCA Threshold to explain 75% of the expression variation pct_thrsh <- 0.75 ## Perform PVCA pvcAnaly(inpData_sv, pct_thrsh, cvrts_eff_var) ## ----eval=TRUE------------------------------------------------------------- cvrts_eff_var <- c("BMI3", "Rin3", "CAD","sv1_cat", "sv2_cat", "sv3_cat", "sv4_cat") ## Again set the PVCA Threshold to explain 75% of the expression variation pct_thrsh <- 0.75 ## Perform PVCA pvcAnaly(inpData_sv, pct_thrsh, cvrts_eff_var) ## ----eval=TRUE------------------------------------------------------------- ## Choose the biological variableof interest bv <- c("CAD") ## Choose your adjustment variable of interest, starting with just 'Study' av <- c("Study") ## The intensity-dependent adjustment variables adjust for array effects iv <- c("Array") ## Run SNM sv_snmObj <- snmAnaly(exprs, pData(inpData_sv), bv, av, iv) ## ----eval=FALSE, echo=TRUE------------------------------------------------- # ## Now choose the adjustment variables to be all four SVs plus Rin # av <- c("Rin", "sv1", "sv2", "sv3", "sv4") # ## Run SNM # sv_snmObj <- snmAnaly(exprs, pData(inpData_sv), bv, av, iv) ## ----eval=TRUE, echo=TRUE-------------------------------------------------- av <- c("Rin", "sv2", "Study") sv_snmObj <- snmAnaly(exprs, pData(inpData_sv), bv, av, iv) ## ----eval=TRUE------------------------------------------------------------- ## Create an expressionSet object of the normalized dataset(s) sv_snmNorm_data <- sv_snmObj$norm.dat colnames(sv_snmNorm_data) <- colnames(exprs) sv_snm_data <- expSetobj(sv_snmNorm_data, pData(inpData_sv)) ## Write this dateset to a table with rows as genes and columns as samples (with names the same as that from the input file) write.table(sv_snmNorm_data, file="CAD_SNM.csv", sep=",") ## ----eval=TRUE------------------------------------------------------------- ## Specify the covariates whose effect size is to be calculated cvrts_eff_var <- c("BMI3", "Rin3", "CAD", "sv1_cat", "sv2_cat","sv3_cat", "sv4_cat") ## If needed, keep the same PC Threshold Value pct_thrsh <- 0.75 ## Perform PVCA pvcAnaly(sv_snm_data, pct_thrsh, cvrts_eff_var)