1 Data Introduction

This package provides a dataset for those wishing to try out the TCGA Workflow: Analyze cancer genomics and epigenomics data using Bioconductor packages [@10.12688/f1000research.8923.2]. The data in this package are a subset of the TCGA data for LGG (Lower grade glioma) and GBM (Glioblastoma multiforme) samples.

2 Loading the data

library(TCGAWorkflowData)
data("elmerExample")
data("GBMnocnvhg19")
data("GBMIllumina_HiSeq")
data("LGGIllumina_HiSeq")
data("mafMutect2LGGGBM")
data("markersMatrix")
data("histoneMarks")
data("biogrid")
data("genes_GR")

3 Data creation

The following commands were used to create the data included with this package.

3.1 Genes information

Download gene information for hg19 using TCGAbiolinks, which uses biomart parckage.

library(GenomicRanges)
library(TCGAbiolinks)
##############################
## Recurrent CNV annotation ## 
##############################
# Get gene information from GENCODE using biomart
genes <- TCGAbiolinks:::get.GRCh.bioMart(genome = "hg19") 
genes <- genes[genes$external_gene_id != "" & genes$chromosome_name %in% c(1:22,"X","Y"),]
genes[genes$chromosome_name == "X", "chromosome_name"] <- 23
genes[genes$chromosome_name == "Y", "chromosome_name"] <- 24
genes$chromosome_name <- sapply(genes$chromosome_name,as.integer)
genes <- genes[order(genes$start_position),]
genes <- genes[order(genes$chromosome_name),]
genes <- genes[,c("external_gene_id", "chromosome_name", "start_position","end_position")]
colnames(genes) <- c("GeneSymbol","Chr","Start","End")
genes_GR <- makeGRangesFromDataFrame(genes,keep.extra.columns = TRUE)
save(genes_GR,genes,file = "genes_GR.rda", compress = "xz")

3.2 GISTIC results

Download and save a subset of GBM GISTIC results from GDAC firehose.

library(RTCGAToolbox)
# Download GISTIC results
lastAnalyseDate <- getFirehoseAnalyzeDates(1)
gistic <- getFirehoseData("GBM",gistic2_Date = lastAnalyseDate)

# get GISTIC results
gistic.allbygene <- getData(gistic,type = "GISTIC", CN = "All")
gistic.allbygene <- gistic.allbygene[1:10,]
gistic.thresholedbygene <- getData(gistic,type = "GISTIC", CN = "Thresholed")
gistic.thresholedbygene <- gistic.thresholedbygene[1:10,]
save(gistic.allbygene,gistic.thresholedbygene,file = "GBMGistic.rda", compress = "xz")

3.3 Copy number variations (CNVs)

The following code will download segmented CNV from SNP array (Affymetrix Genome-Wide Human SNP Array 6.0) for 20 Glioblastoma multiforme (GBM) samples.

library(TCGAbiolinks)
query.gbm.nocnv <- GDCquery(project = "TCGA-GBM",
                            data.category = "Copy number variation",
                            legacy = TRUE,
                            file.type = "nocnv_hg19.seg",
                            sample.type = c("Primary solid Tumor"))
# to reduce time we will select only 20 samples
query.gbm.nocnv$results[[1]] <- query.gbm.nocnv$results[[1]][1:20,]

GDCdownload(query.gbm.nocnv, chunks.per.download = 100)

gbm.nocnv <- GDCprepare(query.gbm.nocnv, save = TRUE, save.filename = "GBMnocnvhg19.rda")

3.4 Gene expression data

The following code will download 20 LGG (Lower grade glioma) and 20 GBM (Glioblastoma multiforme) samples that have gene expression data. The Gene expression data is the raw expression signal for expression of a gene.

query <- GDCquery(project = "TCGA-GBM",
                  data.category = "Gene expression",
                  data.type = "Gene expression quantification",
                  platform = "Illumina HiSeq", 
                  file.type  = "results", 
                  sample.type = c("Primary solid Tumor"),
                  legacy = TRUE)
# We will use only 20 samples to make the example faster
query$results[[1]] <-  query$results[[1]][1:20,]                  
GDCdownload(query)
gbm.exp <- GDCprepare(query, save = TRUE, summarizedExperiment = TRUE, save.filename = "GBMIllumina_HiSeq.rda")

query <- GDCquery(project = "TCGA-LGG",
                  data.category = "Gene expression",
                  data.type = "Gene expression quantification",
                  platform = "Illumina HiSeq", 
                  file.type  = "results", 
                  sample.type = c("Primary solid Tumor"),
                  legacy = TRUE)
# We will use only 20 samples to make the example faster
query$results[[1]] <-  query$results[[1]][1:20,]
GDCdownload(query)
lgg.exp <- GDCprepare(query, save = TRUE, summarizedExperiment = TRUE, save.filename = "LGGIllumina_HiSeq.rda")

3.5 DNA methylation and Gene expression data

The following code will select 10 LGG (Lower grade glioma) and 10 GBM (Glioblastoma multiforme) samples that have both DNA methylation and gene expression data. This objects will be then prepared to the format accept by the Biocondcutor package ELMER([link])(http://bioconductor.org/packages/release/bioc/html/ELMER.html). The DNA methylation will have only probes in chromossome 9 in order to make the example of the workflow faster. For a real analysis, all chromossomes should be used. The Gene expression data is the normalized results for expression of a gene.

#----------- 8.3 Identification of Regulatory Enhancers   -------
library(TCGAbiolinks)
# Samples: primary solid tumor w/ DNA methylation and gene expression
matched_met_exp <- function(project, n = NULL){
    # get primary solid tumor samples: DNA methylation
    message("Download DNA methylation information")
    met450k <- GDCquery(project = project,
                        data.category = "DNA methylation",
                        platform = "Illumina Human Methylation 450",
                        legacy = TRUE, 
                        sample.type = c("Primary solid Tumor"))
    met450k.tp <-  met450k$results[[1]]$cases
    
    # get primary solid tumor samples: RNAseq
    message("Download gene expression information")
    exp <- GDCquery(project = project,
                    data.category = "Gene expression",
                    data.type = "Gene expression quantification",
                    platform = "Illumina HiSeq", 
                    file.type  = "normalized_results", 
                    sample.type = c("Primary solid Tumor"),
                    legacy = TRUE)
    exp.tp <- exp$results[[1]]$cases
    # Get patients with samples in both platforms
    patients <- unique(substr(exp.tp,1,15)[substr(exp.tp,1,12) %in% substr(met450k.tp,1,12)] )
    if(!is.null(n)) patients <- patients[1:n] # get only n samples
    return(patients)
}
lgg.samples <- matched_met_exp("TCGA-LGG", n = 10)
gbm.samples <- matched_met_exp("TCGA-GBM", n = 10)
samples <- c(lgg.samples,gbm.samples)

#-----------------------------------
# 1 - Methylation
# ----------------------------------
query.met <- GDCquery(project = c("TCGA-LGG","TCGA-GBM"),
                      data.category = "DNA methylation",
                      platform = "Illumina Human Methylation 450",
                      legacy = TRUE, 
                      barcode = samples)
GDCdownload(query.met)
met <- GDCprepare(query.met, save = FALSE)
met <- subset(met,subset = as.character(GenomicRanges::seqnames(met)) %in% c("chr9"))

#-----------------------------------
# 2 - Expression
# ----------------------------------
query.exp <- GDCquery(project = c("TCGA-LGG","TCGA-GBM"),
                     data.category = "Gene expression",
                     data.type = "Gene expression quantification",
                     platform = "Illumina HiSeq", 
                     file.type  = "normalized_results", 
                     legacy = TRUE, 
                     barcode =  samples)
GDCdownload(query.exp)
exp <- GDCprepare(query.exp, save = FALSE)
save(exp, met, gbm.samples, lgg.samples, file = "elmerExample.rda", compress = "xz")

3.6 Mutation data

The following code will download Mutation annotation files (aligned against the genoem of reference hg38) for LGG and GBM samples and merge them into one single single file. The GDC Somatic Mutation Calling Workflow used is the mutect2. For more information please check GDC.

library(TCGAbiolinks)
LGGmut <- GDCquery_Maf(tumor = "LGG", pipelines = "mutect2")
GBMmut <- GDCquery_Maf(tumor = "GBM", pipelines = "mutect2")
mut <- plyr::rbind.fill(LGGmut, GBMmut)
save(mut, LGGmut, GBMmut,file = "mafMutect2LGGGBM.rda")

3.7 Probes meta file from broadinstitute website for Copy Number Variation Analysis (CNV) analysis

gdac.root <- "ftp://ftp.broadinstitute.org/pub/GISTIC2.0/hg19_support/"
file <- paste0(gdac.root, "genome.info.6.0_hg19.na31_minus_frequent_nan_probes_sorted_2.1.txt")
# Retrieve probes meta file from broadinstitute website
if(!file.exists(basename(file))) downloader::download(file, basename(file))
# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=-==--=--==---=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=-==--=--==--
# For hg38 analysis please take a look on:
# https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files
# File: SNP6 GRCh38 Liftover Probeset File for Copy Number Variation Analysis
# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=-==--=--==---=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=-==--=--==--
markersMatrix <-  readr::read_tsv(basename(file), col_names = FALSE, col_types = "ccn", progress = FALSE)
save(markersMatrix, file = "markersMatrix.rda", compress = "xz")

3.8 Biogrid data

Download biogrid information.

### read biogrid info
### Check last version in https://thebiogrid.org/download.php 
file <- "http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.4.146/BIOGRID-ALL-3.4.146.tab2.zip"
if(!file.exists(gsub("zip","txt",basename(file)))){
  downloader::download(file,basename(file))
  unzip(basename(file),junkpaths =TRUE)
}

tmp.biogrid <- read.csv(gsub("zip","txt",basename(file)), header=TRUE, sep="\t", stringsAsFactors=FALSE)
save(tmp.biogrid, file = "biogrid.rda", compress = "xz")

3.9 Histone marks

The code below was used to download histone marks specific for brain tissue using the AnnotationHub package that can access the Roadmap database.

4 Session info

R version 3.4.3 (2017-11-30)

**Platform:** x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_US.UTF-8, LC_NUMERIC=C, LC_TIME=en_US.UTF-8, LC_COLLATE=C, LC_MONETARY=en_US.UTF-8, LC_MESSAGES=en_US.UTF-8, LC_PAPER=en_US.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_US.UTF-8 and LC_IDENTIFICATION=C

attached base packages:

stats
graphics
grDevices
utils
datasets
methods
base

other attached packages:

TCGAWorkflowData(v.1.2.1)
BiocStyle(v.2.6.1)

loaded via a namespace (and not attached):

Rcpp(v.0.12.14)
knitr(v.1.17)
XVector(v.0.18.0)
magrittr(v.1.5)
GenomicRanges(v.1.30.0)
BiocGenerics(v.0.24.0)
zlibbioc(v.1.24.0)
IRanges(v.2.12.0)
lattice(v.0.20-35)
stringr(v.1.2.0)
GenomeInfoDb(v.1.14.0)
tools(v.3.4.3)
grid(v.3.4.3)
SummarizedExperiment(v.1.8.0)
parallel(v.3.4.3)
Biobase(v.2.38.0)
matrixStats(v.0.52.2)
htmltools(v.0.3.6)
yaml(v.2.1.15)
rprojroot(v.1.2)
digest(v.0.6.12)
bookdown(v.0.5)
Matrix(v.1.2-12)
GenomeInfoDbData(v.0.99.1)
S4Vectors(v.0.16.0)
bitops(v.1.0-6)
RCurl(v.1.95-4.8)
evaluate(v.0.10.1)
rmarkdown(v.1.8)
DelayedArray(v.0.4.1)
stringi(v.1.1.6)
pander(v.0.6.1)
compiler(v.3.4.3)
backports(v.1.1.1)
stats4(v.3.4.3)

Example data for TCGA Workflow: Analyze cancer genomics and epigenomics data using Bioconductor packages

2017-12-06

Contents