## ---- eval=TRUE, message=FALSE------------------------------------------------ # Load GEO and biobase libraries library(GEOquery) library(Biobase) library(multiClust) library(preprocessCore) library(ctc) library(gplots) library(dendextend) library(graphics) library(grDevices) library(amap) library(survival) ## ---- eval=FALSE, message=FALSE----------------------------------------------- # # Obtain GSE series matrix file from GEO website using getGEO function # gse <- getGEO(GEO="GSE2034") # # # Save the gene expression matrix as an object # data.gse <- exprs(gse[[1]]) # # # Save the patient clinical data to an object # pheno <- pData(phenoData(gse[[1]])) # # # Write the gene expression and clinical data to text files # WriteMatrixToFile(tmpMatrix=data.gse, tmpFileName="GSE2034.expression.txt", # blnRowNames=TRUE, blnColNames=TRUE) # # WriteMatrixToFile(tmpMatrix=pheno, tmpFileName="GSE2034.clinical.txt", # blnRowNames=TRUE, blnColNames=TRUE) # ## ---- eval=FALSE-------------------------------------------------------------- # # Obtain GSE series matrix file from GEO website using getGEO function # gse <- getGEO(filename="GSE2034_series_matrix.txt.gz") # # # Save the gene expression matrix as an object # data.gse <- exprs(gse[[1]]) # # # Save the patient clinical data to an object # pheno <- pData(phenoData(gse[[1]])) # # # Write the gene expression and clinical data to text files # WriteMatrixToFile(tmpMatrix=data.gse, tmpFileName="GSE2034.expression.txt", # blnRowNames=TRUE, blnColNames=TRUE) # # WriteMatrixToFile(tmpMatrix=pheno, tmpFileName="GSE2034.clinical.txt", # blnRowNames=TRUE, blnColNames=TRUE) # ## ---- eval=FALSE-------------------------------------------------------------- # # Obtain GSE series matrix file from GEO website using getGEo function # gse <- getGEO(GEO="GSE2034") # # # Save the gene expression matrix as an object # data.gse <- exprs(gse) # # # Quantile normalization of the dataset # data.norm <- normalize.quantiles(data.gse, copy=FALSE) # # # shift data before log scaling to prevent errors from log scaling # # negative numbers # if (min(data.norm)> 0) { # } # else { # mindata.norm=abs(min(data.norm)) + .001 # data.norm=data.norm + mindata.norm # } # # # Log2 scaling of the dataset # data.log <- t(apply(data.norm, 1, log2)) # # # Write the gene expression and clinical data to text files # WriteMatrixToFile(tmpMatrix=data.log, # tmpFileName="GSE2034.normalized.expression.txt", # blnRowNames=TRUE, blnColNames=TRUE) # ## ---- eval=TRUE--------------------------------------------------------------- # Obtain clinical outcome file clin_file <- system.file("extdata", "GSE2034-RFS-clinical-outcome.txt", package="multiClust") # Load in survival information file clinical <- read.delim2(file=clin_file, header=TRUE) # Display first few rows of the file clinical[1:5, 1:2] # Column one represents the survival time in months # Column two represents the relapse free survival (RFS) event ## ---- eval=TRUE--------------------------------------------------------------- # Obtain gene expression matrix exp_file <- system.file("extdata", "GSE2034.normalized.expression.txt", package= "multiClust") # Load the gene expression matrix data.exprs <- input_file(input=exp_file) # View the first few rows and columns of the matrix data.exprs[1:4,1:4] ## ---- eval=TRUE--------------------------------------------------------------- # Example 1: Choosing a fixed gene or probe number # Obtain gene expression matrix exp_file <- system.file("extdata", "GSE2034.normalized.expression.txt", package="multiClust") gene_num <- number_probes(input=exp_file, data.exp=data.exprs, Fixed=300, Percent=NULL, Poly=NULL, Adaptive=NULL) ## ---- eval=TRUE--------------------------------------------------------------- # Example 2: Choosing 50% of the total selected gene probes in a dataset # Obtain gene expression matrix exp_file <- system.file("extdata", "GSE2034.normalized.expression.txt", package="multiClust") gene_num <- number_probes(input=exp_file, data.exp=data.exprs, Fixed=NULL, Percent=50, Poly=NULL, Adaptive=NULL) ## ---- eval=TRUE--------------------------------------------------------------- # Example 3: Choosing the polynomial selected gene probes in a dataset # Obtain gene expression matrix exp_file <- system.file("extdata", "GSE2034.normalized.expression.txt", package="multiClust") gene_num <- number_probes(input=exp_file, data.exp=data.exprs, Fixed=NULL, Percent=NULL, Poly=TRUE, Adaptive=NULL) ## ---- eval=FALSE-------------------------------------------------------------- # # Example 4: Choosing the Adaptive Gaussian Mixture Modeling method # # gene_num <- number_probes(input=exp_file, # data.exp=data.exprs, Fixed=NULL, # Percent=NULL, Poly=NULL, Adaptive=TRUE) # ## ---- eval=TRUE--------------------------------------------------------------- # Obtain gene expression matrix exp_file <- system.file("extdata", "GSE2034.normalized.expression.txt", package="multiClust") # Load the gene expression matrix data.exprs <- input_file(input=exp_file) # Call probe_ranking function # Select for 500 probes # Choose genes using the SD_Rank method ranked.exprs <- probe_ranking(input=exp_file, probe_number=300, probe_num_selection="Fixed_Probe_Num", data.exp=data.exprs, method="SD_Rank") # Display the first few columns and rows of the matrix ranked.exprs[1:4,1:4] ## ---- eval=TRUE--------------------------------------------------------------- # Call the number_clusters function # data.exp is the original expression matrix object outputted from # the input_file function # User specifies that samples will be separated into 3 clusters # with the "Fixed" argument cluster_num <- number_clusters(data.exp=data.exprs, Fixed=3, gap_statistic=NULL) ## ---- eval=FALSE-------------------------------------------------------------- # # # Call the number_clusters function # # data.exp is the original expression matrix object ouputted from # # the input_file function # # User chooses the gap_statistic option by making gap_statistic equal TRUE # # The Fixed argument is also set to NULL # cluster_num <- number_clusters(data.exp=data.exprs, Fixed=NULL, # gap_statistic=TRUE) # ## ---- eval=TRUE, fig.show='hold', warning=FALSE------------------------------- # Call the cluster_analysis function hclust_analysis <- cluster_analysis(sel.exp=ranked.exprs, cluster_type="HClust", distance="euclidean", linkage_type="ward.D2", gene_distance="correlation", num_clusters=3, data_name="GSE2034 Breast", probe_rank="SD_Rank", probe_num_selection="Fixed_Probe_Num", cluster_num_selection="Fixed_Clust_Num") # Display the first few columns and rows of the object head(hclust_analysis) ## ---- eval=TRUE, echo=FALSE, fig.show='hold', fig.width=8, fig.height=8, fig.align='center', warning=FALSE---- # retrieve file path for image fpath <- system.file("extdata", "GSE2034_Breast.SD_Rank.ward.D2.Fixed_Probe_Num.Fixed_Clust_Num.jpg", package="multiClust") knitr::include_graphics(path = fpath) ## ---- eval=TRUE, echo=FALSE, fig.show='hold', fig.width=8, fig.height=8, fig.align='center', warning=FALSE---- # Produce a sample dendrogram using the selected gene/probe expression object # In the function, cluster_analysis the dendrogram is not displayed in # the console. # Make the selected expression object numeric colname <- colnames(ranked.exprs) ranked.exprs <- t(apply(ranked.exprs, 1,as.numeric)) colnames(ranked.exprs) <- colname # Normalization of the selected expression matrix before clustering norm.sel.exp <- t(apply(ranked.exprs, 1, nor.min.max)) hc <- hclust(dist(x=t(norm.sel.exp), method="euclidean"), method="ward.D2") dc <- as.dendrogram(hc) # Portray the samples in each cluster as a different color dc <- set(dc, "branches_k_color", value=1:3, k=3) # Make a vector filled with blank spaces to replace sample names with # blank space vec <- NULL len <- length(labels(dc)) for (i in 1:len) { i <- "" vec <- c(vec, i) } # Remove sample labels from dendrogram dc <- dendextend::set(dc, "labels", vec) # Plot the sample dendrogram plot(dc, main=paste("GSE2034 Breast", "Sample Dendrogram", sep=" ")) # Display orange lines on the plot to clearly separate each cluster rect.dendrogram(dc, k=3, border='orange') ## ---- eval=TRUE--------------------------------------------------------------- # Call the cluster_analysis function kmeans_analysis <- cluster_analysis(sel.exp=ranked.exprs, cluster_type="Kmeans", distance=NULL, linkage_type=NULL, gene_distance=NULL, num_clusters=3, data_name="GSE2034 Breast", probe_rank="SD_Rank", probe_num_selection="Fixed_Probe_Num", cluster_num_selection="Fixed_Clust_Num") # Display the first few rows and columns of the object head(kmeans_analysis) ## ---- eval=TRUE--------------------------------------------------------------- # Call the avg_probe_exp function avg_matrix <- avg_probe_exp(sel.exp=ranked.exprs, samp_cluster=kmeans_analysis, data_name="GSE2034 Breast", cluster_type="Kmeans", distance=NULL, linkage_type=NULL, probe_rank="SD_Rank", probe_num_selection="Fixed_Probe_Num", cluster_num_selection="Fixed_Clust_Num") # Display the first few rows and columns of the matrix head(avg_matrix) ## ---- eval=TRUE, fig.show='hold'---------------------------------------------- # Obtain clinical outcome file clin_file <- system.file("extdata", "GSE2034-RFS-clinical-outcome.txt", package="multiClust") # Call the avg_probe_exp function surv <- surv_analysis(samp_cluster=kmeans_analysis, clinical=clin_file, survival_type="RFS", data_name="GSE2034 Breast", cluster_type="Kmeans", distance=NULL, linkage_type=NULL, probe_rank="SD_Rank", probe_num_selection="Fixed_Probe_Num", cluster_num_selection="Fixed_Clust_Num") # Display the survival P value surv ## ---- eval=TRUE, echo=FALSE, fig.show='hold', fig.width=8, fig.height=8, fig.align='center', warning=FALSE---- # Obtain clinical outcome file clin_file <- system.file("extdata", "GSE2034-RFS-clinical-outcome.txt", package="multiClust") # Produce a Kaplan-Meier plot using the sample cluster information # from the cluster_analysis function # Read in survival data text file sample.anns <- read.delim2(file=clin_file, header=TRUE, stringsAsFactors=FALSE) time <- sample.anns[,1] event <- sample.anns[,2] time <- as.numeric(time) event <- as.numeric(event) event_type <- "RFS" time_type <- 'Months' # Determine max number of clusters in samp_cluster samp_cluster <- kmeans_analysis Number_Clusters <- max(samp_cluster) # Produce Kaplan Meier survival plots lev <- '1' for(i in 2:Number_Clusters) { lev <- c(lev, paste('',i, sep='')) } groupfac <- factor(samp_cluster, levels=lev) cox <- coxph(Surv(time, event==1) ~ groupfac) csumm <- summary(cox) cox.p.value <- c(csumm$logtest[3]) p <- survfit(Surv(time,event==1)~strata(samp_cluster)) plot (p,xlab= time_type ,ylab='Survival Probability',conf.int=FALSE, xlim=c(0,100), col=1:Number_Clusters, main=paste("GSE2034 Breast", "Kmeans", "SD_Rank", "RFS", sep =' ')) legend('bottomleft',legend=levels(strata(samp_cluster)),lty=1, col=1:Number_Clusters, text.col=1:Number_Clusters, title='clusters') legend('topright', paste('Pvalue =', signif(csumm$logtest[3],digits=2), sep =''))