## ----init, results='hide', echo=FALSE, warning=FALSE, message=FALSE----------- library(knitr) opts_chunk$set(warning=FALSE, message=FALSE) BiocStyle::markdown() ## ----install_bioc, eval=FALSE------------------------------------------------- # if (!require("BiocManager")) # install.packages("BiocManager") # BiocManager::install('GenomicDataCommons') ## ----libraries, message=FALSE------------------------------------------------- library(GenomicDataCommons) ## ----statusQS----------------------------------------------------------------- GenomicDataCommons::status() ## ----statusCheck-------------------------------------------------------------- stopifnot(GenomicDataCommons::status()$status=="OK") ## ----manifest----------------------------------------------------------------- ge_manifest <- files() %>% filter( cases.project.project_id == 'TCGA-OV') %>% filter( type == 'gene_expression' ) %>% filter( analysis.workflow_type == 'STAR - Counts') %>% manifest() head(ge_manifest) ## ----downloadQS, eval=FALSE--------------------------------------------------- # fnames <- lapply(ge_manifest$id[1:20], gdcdata) ## ----gdc_clinical------------------------------------------------------------- case_ids = cases() %>% results(size=10) %>% ids() clindat = gdc_clinical(case_ids) names(clindat) ## ----clinData----------------------------------------------------------------- head(clindat[["main"]]) head(clindat[["diagnoses"]]) ## ----metadataQS--------------------------------------------------------------- expands = c("diagnoses","annotations", "demographic","exposures") clinResults = cases() %>% GenomicDataCommons::select(NULL) %>% GenomicDataCommons::expand(expands) %>% results(size=50) str(clinResults[[1]],list.len=6) # or listviewer::jsonedit(clinResults) ## ----projectquery------------------------------------------------------------- pquery = projects() ## ----pquery------------------------------------------------------------------- str(pquery) ## ----pquerycount-------------------------------------------------------------- pcount = count(pquery) # or pcount = pquery %>% count() pcount ## ----pqueryresults------------------------------------------------------------ presults = pquery %>% results() ## ----presultsstr-------------------------------------------------------------- str(presults) ## ----presultsall-------------------------------------------------------------- length(ids(presults)) presults = pquery %>% results_all() length(ids(presults)) # includes all records length(ids(presults)) == count(pquery) ## ----defaultfields------------------------------------------------------------ default_fields('files') # The number of fields available for files endpoint length(available_fields('files')) # The first few fields available for files endpoint head(available_fields('files')) ## ----selectexample------------------------------------------------------------ # Default fields here qcases = cases() qcases$fields # set up query to use ALL available fields # Note that checking of fields is done by select() qcases = cases() %>% GenomicDataCommons::select(available_fields('cases')) head(qcases$fields) ## ----aggexample--------------------------------------------------------------- # total number of files of a specific type res = files() %>% facet(c('type','data_type')) %>% aggregations() res$type ## ----allfilesunfiltered------------------------------------------------------- qfiles = files() qfiles %>% count() # all files ## ----onlyGeneExpression------------------------------------------------------- qfiles = files() %>% filter( type == 'gene_expression') # here is what the filter looks like after translation str(get_filter(qfiles)) ## ----filtAvailFields---------------------------------------------------------- grep('pro',available_fields('files'),value=TRUE) %>% head() ## ----filtProgramID------------------------------------------------------------ files() %>% facet('cases.project.project_id') %>% aggregations() %>% head() ## ----filtfinal---------------------------------------------------------------- qfiles = files() %>% filter( cases.project.project_id == 'TCGA-OV' & type == 'gene_expression') str(get_filter(qfiles)) qfiles %>% count() ## ----filtChain---------------------------------------------------------------- qfiles2 = files() %>% filter( cases.project.project_id == 'TCGA-OV') %>% filter( type == 'gene_expression') qfiles2 %>% count() (qfiles %>% count()) == (qfiles2 %>% count()) #TRUE ## ----filtAndManifest---------------------------------------------------------- manifest_df = qfiles %>% manifest() head(manifest_df) ## ----filterForSTARCounts------------------------------------------------------ qfiles = files() %>% filter( ~ cases.project.project_id == 'TCGA-OV' & type == 'gene_expression' & access == "open" & analysis.workflow_type == 'STAR - Counts') manifest_df = qfiles %>% manifest() nrow(manifest_df) ## ----authenNoRun, eval=FALSE-------------------------------------------------- # token = gdc_token() # transfer(...,token=token) # # or # transfer(...,token=get_token()) ## ----singlefileDL------------------------------------------------------------- fnames = gdcdata(manifest_df$id[1:2],progress=FALSE) ## ----bulkDL, eval=FALSE------------------------------------------------------- # # Requires gcd_client command-line utility to be isntalled # # separately. # fnames = gdcdata(manifest_df$id[3:10], access_method = 'client') ## ----casesPerProject---------------------------------------------------------- res = cases() %>% facet("project.project_id") %>% aggregations() head(res) library(ggplot2) ggplot(res$project.project_id,aes(x = key, y = doc_count)) + geom_bar(stat='identity') + theme(axis.text.x = element_text(angle = 45, hjust = 1)) ## ----casesInTCGA-------------------------------------------------------------- cases() %>% filter(~ project.program.name=='TARGET') %>% count() ## ----casesInTARGET------------------------------------------------------------ cases() %>% filter(~ project.program.name=='TCGA') %>% count() ## ----casesTCGABRCASampleTypes------------------------------------------------- # The need to do the "&" here is a requirement of the # current version of the GDC API. I have filed a feature # request to remove this requirement. resp = cases() %>% filter(~ project.project_id=='TCGA-BRCA' & project.project_id=='TCGA-BRCA' ) %>% facet('samples.sample_type') %>% aggregations() resp$samples.sample_type ## ----casesTCGABRCASolidNormal------------------------------------------------- # The need to do the "&" here is a requirement of the # current version of the GDC API. I have filed a feature # request to remove this requirement. resp = cases() %>% filter(~ project.project_id=='TCGA-BRCA' & samples.sample_type=='Solid Tissue Normal') %>% GenomicDataCommons::select(c(default_fields(cases()),'samples.sample_type')) %>% response_all() count(resp) res = resp %>% results() str(res[1],list.len=6) head(ids(resp)) ## ----casesFemaleTCGA---------------------------------------------------------- cases() %>% GenomicDataCommons::filter(~ project.program.name == 'TCGA' & "cases.demographic.gender" %in% "female") %>% GenomicDataCommons::results(size = 4) %>% ids() ## ----notFemaleTCGACOAD-------------------------------------------------------- cases() %>% GenomicDataCommons::filter(~ project.project_id == 'TCGA-COAD' & "cases.demographic.gender" %exclude% "female") %>% GenomicDataCommons::results(size = 4) %>% ids() ## ----missingGenderTCGA-------------------------------------------------------- cases() %>% GenomicDataCommons::filter(~ project.program.name == 'TCGA' & missing("cases.demographic.gender")) %>% GenomicDataCommons::results(size = 4) %>% ids() ## ----notMissingGenderTCGA----------------------------------------------------- cases() %>% GenomicDataCommons::filter(~ project.program.name == 'TCGA' & !missing("cases.demographic.gender")) %>% GenomicDataCommons::results(size = 4) %>% ids() ## ----filesVCFCount------------------------------------------------------------ res = files() %>% facet('type') %>% aggregations() res$type ggplot(res$type,aes(x = key,y = doc_count)) + geom_bar(stat='identity') + theme(axis.text.x = element_text(angle = 45, hjust = 1)) ## ----filesRNAseqGeneGBM------------------------------------------------------- q = files() %>% GenomicDataCommons::select(available_fields('files')) %>% filter(~ cases.project.project_id=='TCGA-GBM' & data_type=='Gene Expression Quantification') q %>% facet('analysis.workflow_type') %>% aggregations() # so need to add another filter file_ids = q %>% filter(~ cases.project.project_id=='TCGA-GBM' & data_type=='Gene Expression Quantification' & analysis.workflow_type == 'STAR - Counts') %>% GenomicDataCommons::select('file_id') %>% response_all() %>% ids() ## ----filesRNAseqGeneGBMforBAM------------------------------------------------- q = files() %>% GenomicDataCommons::select(available_fields('files')) %>% filter(~ cases.project.project_id == 'TCGA-GBM' & data_type == 'Aligned Reads' & experimental_strategy == 'RNA-Seq' & data_format == 'BAM') file_ids = q %>% response_all() %>% ids() ## ----slicing10, eval=FALSE---------------------------------------------------- # bamfile = slicing(file_ids[1],regions="chr12:6534405-6538375",token=gdc_token()) # library(GenomicAlignments) # aligns = readGAlignments(bamfile) ## ----sessionInfo-------------------------------------------------------------- sessionInfo()