## ----init, results='hide', echo=FALSE, warning=FALSE, message=FALSE-------- library(knitr) opts_chunk$set(warning=FALSE, message=FALSE) BiocStyle::markdown() ## ----install_bioc, eval=FALSE---------------------------------------------- # if (!require("BiocManager")) # install.packages("BiocManager") # BiocManager::install('GenomicDataCommons') ## ----libraries, message=FALSE---------------------------------------------- library(GenomicDataCommons) ## ----statusQS-------------------------------------------------------------- GenomicDataCommons::status() ## ----statusCheck----------------------------------------------------------- stopifnot(GenomicDataCommons::status()$status=="OK") ## ----manifest-------------------------------------------------------------- ge_manifest = files() %>% filter( cases.project.project_id == 'TCGA-OV') %>% filter( type == 'gene_expression' ) %>% filter( analysis.workflow_type == 'HTSeq - Counts') %>% manifest() head(ge_manifest) fnames = lapply(ge_manifest$id[1:20],gdcdata) ## ----downloadQS------------------------------------------------------------ fnames = lapply(ge_manifest$id[1:20],gdcdata) ## ----gdc_clinical---------------------------------------------------------- case_ids = cases() %>% results(size=10) %>% ids() clindat = gdc_clinical(case_ids) names(clindat) ## ----clinData-------------------------------------------------------------- head(clindat[["main"]]) head(clindat[["diagnoses"]]) ## ----metadataQS------------------------------------------------------------ expands = c("diagnoses","annotations", "demographic","exposures") clinResults = cases() %>% GenomicDataCommons::select(NULL) %>% GenomicDataCommons::expand(expands) %>% results(size=50) str(clinResults[[1]],list.len=6) # or listviewer::jsonedit(clinResults) ## ----projectquery---------------------------------------------------------- pquery = projects() ## ----pquery---------------------------------------------------------------- str(pquery) ## ----pquerycount----------------------------------------------------------- pcount = count(pquery) # or pcount = pquery %>% count() pcount ## ----pqueryresults--------------------------------------------------------- presults = pquery %>% results() ## ----presultsstr----------------------------------------------------------- str(presults) ## ----presultsall----------------------------------------------------------- length(ids(presults)) presults = pquery %>% results_all() length(ids(presults)) # includes all records length(ids(presults)) == count(pquery) ## ----defaultfields--------------------------------------------------------- default_fields('files') # The number of fields available for files endpoint length(available_fields('files')) # The first few fields available for files endpoint head(available_fields('files')) ## ----selectexample--------------------------------------------------------- # Default fields here qcases = cases() qcases$fields # set up query to use ALL available fields # Note that checking of fields is done by select() qcases = cases() %>% GenomicDataCommons::select(available_fields('cases')) head(qcases$fields) ## ----aggexample------------------------------------------------------------ # total number of files of a specific type res = files() %>% facet(c('type','data_type')) %>% aggregations() res$type ## ----allfilesunfiltered---------------------------------------------------- qfiles = files() qfiles %>% count() # all files ## ----onlyGeneExpression---------------------------------------------------- qfiles = files() %>% filter( type == 'gene_expression') # here is what the filter looks like after translation str(get_filter(qfiles)) ## ----filtAvailFields------------------------------------------------------- grep('pro',available_fields('files'),value=TRUE) %>% head() ## ----filtProgramID--------------------------------------------------------- files() %>% facet('cases.project.project_id') %>% aggregations() %>% head() ## ----filtfinal------------------------------------------------------------- qfiles = files() %>% filter( cases.project.project_id == 'TCGA-OV' & type == 'gene_expression') str(get_filter(qfiles)) qfiles %>% count() ## ----filtChain------------------------------------------------------------- qfiles2 = files() %>% filter( cases.project.project_id == 'TCGA-OV') %>% filter( type == 'gene_expression') qfiles2 %>% count() (qfiles %>% count()) == (qfiles2 %>% count()) #TRUE ## ----filtAndManifest------------------------------------------------------- manifest_df = qfiles %>% manifest() head(manifest_df) ## ----filterForHTSeqCounts-------------------------------------------------- qfiles = files() %>% filter( ~ cases.project.project_id == 'TCGA-OV' & type == 'gene_expression' & analysis.workflow_type == 'HTSeq - Counts') manifest_df = qfiles %>% manifest() nrow(manifest_df) ## ----authenNoRun, eval=FALSE----------------------------------------------- # token = gdc_token() # transfer(...,token=token) # # or # transfer(...,token=get_token()) ## ----singlefileDL---------------------------------------------------------- fnames = gdcdata(manifest_df$id[1:2],progress=FALSE) ## ----bulkDL---------------------------------------------------------------- fnames = gdcdata(manifest_df$id[3:10], access_method = 'client') ## ----casesPerProject------------------------------------------------------- res = cases() %>% facet("project.project_id") %>% aggregations() head(res) library(ggplot2) ggplot(res$project.project_id,aes(x = key, y = doc_count)) + geom_bar(stat='identity') + theme(axis.text.x = element_text(angle = 45, hjust = 1)) ## ----casesInTCGA----------------------------------------------------------- cases() %>% filter(~ project.program.name=='TARGET') %>% count() ## ----casesInTARGET--------------------------------------------------------- cases() %>% filter(~ project.program.name=='TCGA') %>% count() ## ----casesTCGABRCASampleTypes---------------------------------------------- # The need to do the "&" here is a requirement of the # current version of the GDC API. I have filed a feature # request to remove this requirement. resp = cases() %>% filter(~ project.project_id=='TCGA-BRCA' & project.project_id=='TCGA-BRCA' ) %>% facet('samples.sample_type') %>% aggregations() resp$samples.sample_type ## ----casesTCGABRCASolidNormal---------------------------------------------- # The need to do the "&" here is a requirement of the # current version of the GDC API. I have filed a feature # request to remove this requirement. resp = cases() %>% filter(~ project.project_id=='TCGA-BRCA' & samples.sample_type=='Solid Tissue Normal') %>% GenomicDataCommons::select(c(default_fields(cases()),'samples.sample_type')) %>% response_all() count(resp) res = resp %>% results() str(res[1],list.len=6) head(ids(resp)) ## ----filesVCFCount--------------------------------------------------------- res = files() %>% facet('type') %>% aggregations() res$type ggplot(res$type,aes(x = key,y = doc_count)) + geom_bar(stat='identity') + theme(axis.text.x = element_text(angle = 45, hjust = 1)) ## ----filesRNAseqGeneGBM---------------------------------------------------- q = files() %>% GenomicDataCommons::select(available_fields('files')) %>% filter(~ cases.project.project_id=='TCGA-GBM' & data_type=='Gene Expression Quantification') q %>% facet('analysis.workflow_type') %>% aggregations() # so need to add another filter file_ids = q %>% filter(~ cases.project.project_id=='TCGA-GBM' & data_type=='Gene Expression Quantification' & analysis.workflow_type == 'HTSeq - Counts') %>% GenomicDataCommons::select('file_id') %>% response_all() %>% ids() ## ----filesRNAseqGeneGBMforBAM---------------------------------------------- q = files() %>% GenomicDataCommons::select(available_fields('files')) %>% filter(~ cases.project.project_id == 'TCGA-GBM' & data_type == 'Aligned Reads' & experimental_strategy == 'RNA-Seq' & data_format == 'BAM') file_ids = q %>% response_all() %>% ids() ## ----slicing10, eval=FALSE------------------------------------------------- # bamfile = slicing(file_ids[1],regions="chr12:6534405-6538375",token=gdc_token()) # library(GenomicAlignments) # aligns = readGAlignments(bamfile) ## ----sessionInfo----------------------------------------------------------- sessionInfo()