## ----init, results='hide', echo=FALSE, warning=FALSE, message=FALSE--------
library(knitr)
opts_chunk$set(warning=FALSE, message=FALSE)
BiocStyle::markdown()

## ----install_bioc, eval=FALSE----------------------------------------------
#  if (!require("BiocManager"))
#      install.packages("BiocManager")
#  BiocManager::install('GenomicDataCommons')

## ----libraries, message=FALSE----------------------------------------------
library(GenomicDataCommons)

## ----statusQS--------------------------------------------------------------
GenomicDataCommons::status()

## ----statusCheck-----------------------------------------------------------
stopifnot(GenomicDataCommons::status()$status=="OK")

## ----manifest--------------------------------------------------------------
ge_manifest = files() %>%
    filter( cases.project.project_id == 'TCGA-OV') %>% 
    filter( type == 'gene_expression' ) %>%
    filter( analysis.workflow_type == 'HTSeq - Counts')  %>%
    manifest()
head(ge_manifest)
fnames = lapply(ge_manifest$id[1:20],gdcdata)

## ----downloadQS------------------------------------------------------------
fnames = lapply(ge_manifest$id[1:20],gdcdata)

## ----gdc_clinical----------------------------------------------------------
case_ids = cases() %>% results(size=10) %>% ids()
clindat = gdc_clinical(case_ids)
names(clindat)

## ----clinData--------------------------------------------------------------
head(clindat[["main"]])
head(clindat[["diagnoses"]])

## ----metadataQS------------------------------------------------------------
expands = c("diagnoses","annotations",
             "demographic","exposures")
clinResults = cases() %>%
    GenomicDataCommons::select(NULL) %>%
    GenomicDataCommons::expand(expands) %>%
    results(size=50)
str(clinResults[[1]],list.len=6)
# or listviewer::jsonedit(clinResults)

## ----projectquery----------------------------------------------------------
pquery = projects()

## ----pquery----------------------------------------------------------------
str(pquery)

## ----pquerycount-----------------------------------------------------------
pcount = count(pquery)
# or
pcount = pquery %>% count()
pcount

## ----pqueryresults---------------------------------------------------------
presults = pquery %>% results()

## ----presultsstr-----------------------------------------------------------
str(presults)

## ----presultsall-----------------------------------------------------------
length(ids(presults))
presults = pquery %>% results_all()
length(ids(presults))
# includes all records
length(ids(presults)) == count(pquery)

## ----defaultfields---------------------------------------------------------
default_fields('files')
# The number of fields available for files endpoint
length(available_fields('files'))
# The first few fields available for files endpoint
head(available_fields('files'))

## ----selectexample---------------------------------------------------------
# Default fields here
qcases = cases()
qcases$fields
# set up query to use ALL available fields
# Note that checking of fields is done by select()
qcases = cases() %>% GenomicDataCommons::select(available_fields('cases'))
head(qcases$fields)

## ----aggexample------------------------------------------------------------
# total number of files of a specific type
res = files() %>% facet(c('type','data_type')) %>% aggregations()
res$type

## ----allfilesunfiltered----------------------------------------------------
qfiles = files()
qfiles %>% count() # all files

## ----onlyGeneExpression----------------------------------------------------
qfiles = files() %>% filter( type == 'gene_expression')
# here is what the filter looks like after translation
str(get_filter(qfiles))

## ----filtAvailFields-------------------------------------------------------
grep('pro',available_fields('files'),value=TRUE) %>% 
    head()

## ----filtProgramID---------------------------------------------------------
files() %>% 
    facet('cases.project.project_id') %>% 
    aggregations() %>% 
    head()

## ----filtfinal-------------------------------------------------------------
qfiles = files() %>%
    filter( cases.project.project_id == 'TCGA-OV' & type == 'gene_expression')
str(get_filter(qfiles))
qfiles %>% count()

## ----filtChain-------------------------------------------------------------
qfiles2 = files() %>%
    filter( cases.project.project_id == 'TCGA-OV') %>% 
    filter( type == 'gene_expression') 
qfiles2 %>% count()
(qfiles %>% count()) == (qfiles2 %>% count()) #TRUE

## ----filtAndManifest-------------------------------------------------------
manifest_df = qfiles %>% manifest()
head(manifest_df)

## ----filterForHTSeqCounts--------------------------------------------------
qfiles = files() %>% filter( ~ cases.project.project_id == 'TCGA-OV' &
                            type == 'gene_expression' &
                            analysis.workflow_type == 'HTSeq - Counts')
manifest_df = qfiles %>% manifest()
nrow(manifest_df)

## ----authenNoRun, eval=FALSE-----------------------------------------------
#  token = gdc_token()
#  transfer(...,token=token)
#  # or
#  transfer(...,token=get_token())

## ----singlefileDL----------------------------------------------------------
fnames = gdcdata(manifest_df$id[1:2],progress=FALSE)


## ----bulkDL----------------------------------------------------------------
fnames = gdcdata(manifest_df$id[3:10], access_method = 'client')

## ----casesPerProject-------------------------------------------------------
res = cases() %>% facet("project.project_id") %>% aggregations()
head(res)
library(ggplot2)
ggplot(res$project.project_id,aes(x = key, y = doc_count)) +
    geom_bar(stat='identity') +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

## ----casesInTCGA-----------------------------------------------------------
cases() %>% filter(~ project.program.name=='TARGET') %>% count()

## ----casesInTARGET---------------------------------------------------------
cases() %>% filter(~ project.program.name=='TCGA') %>% count()

## ----casesTCGABRCASampleTypes----------------------------------------------
# The need to do the "&" here is a requirement of the
# current version of the GDC API. I have filed a feature
# request to remove this requirement.
resp = cases() %>% filter(~ project.project_id=='TCGA-BRCA' &
                              project.project_id=='TCGA-BRCA' ) %>%
    facet('samples.sample_type') %>% aggregations()
resp$samples.sample_type

## ----casesTCGABRCASolidNormal----------------------------------------------
# The need to do the "&" here is a requirement of the
# current version of the GDC API. I have filed a feature
# request to remove this requirement.
resp = cases() %>% filter(~ project.project_id=='TCGA-BRCA' &
                              samples.sample_type=='Solid Tissue Normal') %>%
    GenomicDataCommons::select(c(default_fields(cases()),'samples.sample_type')) %>%
    response_all()
count(resp)
res = resp %>% results()
str(res[1],list.len=6)
head(ids(resp))

## ----filesVCFCount---------------------------------------------------------
res = files() %>% facet('type') %>% aggregations()
res$type
ggplot(res$type,aes(x = key,y = doc_count)) + geom_bar(stat='identity') +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

## ----filesRNAseqGeneGBM----------------------------------------------------
q = files() %>%
    GenomicDataCommons::select(available_fields('files')) %>%
    filter(~ cases.project.project_id=='TCGA-GBM' &
               data_type=='Gene Expression Quantification')
q %>% facet('analysis.workflow_type') %>% aggregations()
# so need to add another filter
file_ids = q %>% filter(~ cases.project.project_id=='TCGA-GBM' &
                            data_type=='Gene Expression Quantification' &
                            analysis.workflow_type == 'HTSeq - Counts') %>%
    GenomicDataCommons::select('file_id') %>%
    response_all() %>%
    ids()

## ----filesRNAseqGeneGBMforBAM----------------------------------------------
q = files() %>%
    GenomicDataCommons::select(available_fields('files')) %>%
    filter(~ cases.project.project_id == 'TCGA-GBM' &
               data_type == 'Aligned Reads' &
               experimental_strategy == 'RNA-Seq' &
               data_format == 'BAM')
file_ids = q %>% response_all() %>% ids()

## ----slicing10, eval=FALSE-------------------------------------------------
#  bamfile = slicing(file_ids[1],regions="chr12:6534405-6538375",token=gdc_token())
#  library(GenomicAlignments)
#  aligns = readGAlignments(bamfile)

## ----sessionInfo-----------------------------------------------------------
sessionInfo()