readTCGA {RTCGA} | R Documentation |
readTCGA
function reads the following unzipped files:
clinical data - Merge_Clinical.Level_1
rnaseq data (genes' expressions) - rnaseqv2__illuminahiseq_rnaseqv2
genes' mutations data - Mutation_Packager_Calls.Level
Reverse phase protein array data (RPPA) - protein_normalization__data.Level_3
Merge transcriptome agilent data (mRNA) -
Merge_transcriptome__agilentg4502a_07_3__unc_edu__Level_3__unc_lowess_normalization_gene_level__data.Level_3
miRNASeq data -
Merge_mirnaseq__illuminaga_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3
or
"Merge_mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3"
methylation data -
Merge_methylation__humanmethylation27
isoforms data -
Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_isoforms_normalized__data.Level_3
from the TCGA project. Those files can be easily downloaded with the downloadTCGA function. See examples.
readTCGA(path, dataType, ...)
path |
See details and examples. |
dataType |
One of 'clinical', 'rnaseq', 'mutations', 'RPPA', 'mRNA', 'miRNASeq', 'methylation', 'isoforms'. |
... |
Further arguments passed to as.data.frame. |
All cohort names can be checked using: sub( x = names( infoTCGA() ), '-counts', '')
.
Parameter path
specification:
If dataType = 'clinical'
a path to a cancerType.clin.merged.txt
file.
If dataType = 'mutations'
a path to the unzipped folder Mutation_Packager_Calls.Level
containing .maf
files.
If dataType = 'rnaseq'
a path to the unzipped file rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level
.
If dataType = 'RPPA'
a path to the unzipped file in folder protein_normalization__data.Level_3
.
If dataType = 'mRNA'
a path to the unzipped file cancerType.transcriptome__agilentg4502a_07_3__unc_edu__Level_3__unc_lowess_normalization_gene_level__data.data.txt
.
If dataType = 'miRNASeq'
a path to unzipped files cancerType.mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.data.txt
or cancerType.mirnaseq__illuminaga_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.data.txt
If dataType = 'methylation'
a path to unzipped files cancerType.methylation__humanmethylation27__jhu_usc_edu__Level_3__within_bioassay_data_set_function__data.data.txt
.
If dataType = 'isoforms'
a path to unzipped files cancerType.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_isoforms_normalized__data.data.txt
.
An output:
If dataType = 'clinical'
a data.frame
with clinical data.
If dataType = 'rnaseq'
a data.frame
with rnaseq data.
If dataType = 'mutations'
a data.frame
with mutations data.
If dataType = 'RPPA'
a data.frame
with RPPA data.
If dataType = 'mRNA'
a data.frame
with mRNA data.
If dataType = 'miRNASeq'
a data.frame
with miRNASeq data.
If dataType = 'methylation'
a data.frame
with methylation data.
If dataType = 'isoforms'
a data.frame
with isoforms data.
If you have any problems or issues, or think that something is missing or unclear, please post an issue on https://github.com/RTCGA/RTCGA/issues.
Marcin Kosinski, m.p.kosinski@gmail.com
Witold Chodor, witoldchodor@gmail.com
RTCGA website http://rtcga.github.io/RTCGA/Download.html.
Other RTCGA:
RTCGA-package
,
boxplotTCGA()
,
checkTCGA()
,
convertTCGA()
,
datasetsTCGA
,
downloadTCGA()
,
expressionsTCGA()
,
heatmapTCGA()
,
infoTCGA()
,
installTCGA()
,
kmTCGA()
,
mutationsTCGA()
,
pcaTCGA()
,
survivalTCGA()
,
theme_RTCGA()
## Not run:
# Worked examples for readTCGA(): each section downloads one data set with
# downloadTCGA(), tidies the unpacked directory, and reads it with readTCGA().
# The `%>%` pipe is used throughout and must be available in the session.

##############
##### clinical
##############

dir.create("data")

# downloading clinical data
# dataset = "clinical" is default parameter so we may omit it
downloadTCGA(cancerTypes = c("BRCA", "OV"), destDir = "data")

# reading datasets
# Each cohort is stored in the global environment as <cohort>.clin.data.
for (element in c("BRCA", "OV")) {
  folder <- grep(
    paste0("(_", element, "\\.", "|", "_", element, "-FFPE)", ".*Clinical"),
    list.files("data/"),
    value = TRUE
  )
  path <- paste0("data/", folder, "/", element, ".clin.merged.txt")
  assign(
    x = paste0(element, ".clin.data"),
    value = readTCGA(path, "clinical"),
    envir = .GlobalEnv
  )
}

############
##### rnaseq
############

dir.create("data2")

# downloading rnaseq data
downloadTCGA(
  cancerTypes = "BRCA",
  dataSet = "rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level",
  destDir = "data2"
)

# shortening paths and directories
list.files("data2/") %>%
  file.path("data2", .) %>%
  file.rename(to = substr(., start = 1, stop = 50))

# reading data
folder <- list.files("data2/") %>%
  file.path("data2", .)

pathRNA <- folder %>%
  list.files() %>%
  file.path(folder, .) %>%
  grep(pattern = "illuminahiseq", x = ., value = TRUE)

my_data <- readTCGA(path = pathRNA, dataType = "rnaseq")

###############
##### mutations
###############

# Example directory in which untarred data will be stored
dir.create("data3")

downloadTCGA(
  cancerTypes = "OV",
  dataSet = "Mutation_Packager_Calls.Level",
  destDir = "data3"
)

# reading data
folder <- list.files("data3/") %>%
  file.path("data3", .)

mut_file <- readTCGA(folder, "mutations")

#################
##### methylation
#################

# Example directory in which untarred data will be stored
dir.create("data4")

# Download KIRP methylation data and store it in data4 folder
cancerType <- "KIRP"
downloadTCGA(
  cancerTypes = cancerType,
  dataSet = "Merge_methylation__humanmethylation27",
  destDir = "data4"
)

# Shorten path of subdirectory with KIRP methylation data
list.files(path = "data4", full.names = TRUE) %>%
  file.rename(
    from = .,
    to = file.path("data4", paste0(cancerType, ".methylation"))
  )

# Remove manifest.txt file
list.files(path = "data4", full.names = TRUE) %>%
  list.files(path = ., full.names = TRUE) %>%
  grep("MANIFEST.txt", x = ., value = TRUE) %>%
  file.remove()

# Read KIRP methylation data
path <- list.files(path = "data4", full.names = TRUE) %>%
  list.files(path = ., full.names = TRUE)
KIRP.methylation <- readTCGA(path, dataType = "methylation")

##########
##### RPPA
##########

# Directory in which untarred data will be stored
dir.create("data5")

# Download BRCA RPPA data and store it in data5 folder
cancerType <- "BRCA"
downloadTCGA(
  cancerTypes = cancerType,
  dataSet = "protein_normalization__data.Level_3",
  destDir = "data5"
)

# Shorten path of subdirectory with BRCA RPPA data
list.files(path = "data5", full.names = TRUE) %>%
  file.rename(
    from = .,
    to = file.path("data5", paste0(cancerType, ".RPPA"))
  )

# Remove manifest.txt file
list.files(path = "data5", full.names = TRUE) %>%
  list.files(path = ., full.names = TRUE) %>%
  grep("MANIFEST.txt", x = ., value = TRUE) %>%
  file.remove()

# Read BRCA RPPA data
path <- list.files(path = "data5", full.names = TRUE) %>%
  list.files(path = ., full.names = TRUE)
BRCA.RPPA <- readTCGA(path, dataType = "RPPA")

##########
##### mRNA
##########

# Directory in which untarred data will be stored
dir.create("data6")

# Download UCEC mRNA data and store it in data6 folder
cancerType <- "UCEC"
downloadTCGA(
  cancerTypes = cancerType,
  dataSet = "Merge_transcriptome__agilentg4502a_07_3__unc_edu__Level_3__unc_lowess_normalization_gene_level__data.Level_3",
  destDir = "data6"
)

# Shorten path of subdirectory with UCEC mRNA data
list.files(path = "data6", full.names = TRUE) %>%
  file.rename(
    from = .,
    to = file.path("data6", paste0(cancerType, ".mRNA"))
  )

# Remove manifest.txt file
list.files(path = "data6", full.names = TRUE) %>%
  list.files(path = ., full.names = TRUE) %>%
  grep("MANIFEST.txt", x = ., value = TRUE) %>%
  file.remove()

# Read UCEC mRNA data
path <- list.files(path = "data6", full.names = TRUE) %>%
  list.files(path = ., full.names = TRUE)
UCEC.mRNA <- readTCGA(path, dataType = "mRNA")

##############
##### miRNASeq
##############

# Directory in which untarred data will be stored
dir.create("data7")

# Download BRCA miRNASeq data and store it in data7 folder
# Remember that miRNASeq data are produced by two machines:
# Illumina Genome Analyzer and Illumina HiSeq 2000 machines
cancerType <- "BRCA"
downloadTCGA(
  cancerTypes = cancerType,
  dataSet = "Merge_mirnaseq__illuminaga_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3",
  destDir = "data7"
)
downloadTCGA(
  cancerTypes = cancerType,
  dataSet = "Merge_mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3",
  destDir = "data7"
)

# Shorten path of subdirectory with BRCA miRNASeq data
# One directory per sequencing machine; rename each according to its machine.
for (path in list.files(path = "data7", full.names = TRUE)) {
  if (grepl(pattern = "illuminaga", path)) {
    file.rename(
      from = path,
      to = file.path("data7", paste0(cancerType, ".miRNASeq.illuminaga"))
    )
  } else if (grepl(pattern = "illuminahiseq", path)) {
    file.rename(
      from = path,
      to = file.path("data7", paste0(cancerType, ".miRNASeq.illuminahiseq"))
    )
  }
}

# Remove manifest.txt file
list.files(path = "data7", full.names = TRUE) %>%
  list.files(path = ., full.names = TRUE) %>%
  grep("MANIFEST.txt", x = ., value = TRUE) %>%
  file.remove()

# Read BRCA miRNASeq data
path <- list.files(path = "data7", full.names = TRUE) %>%
  list.files(path = ., full.names = TRUE)
path_illuminaga <- grep("illuminaga", path, fixed = TRUE, value = TRUE)
path_illuminahiseq <- grep("illuminahiseq", path, fixed = TRUE, value = TRUE)

BRCA.miRNASeq.illuminaga <- readTCGA(path_illuminaga, dataType = "miRNASeq")
BRCA.miRNASeq.illuminahiseq <- readTCGA(path_illuminahiseq, dataType = "miRNASeq")

# Tag each data set with its machine before stacking them into one data frame.
BRCA.miRNASeq.illuminaga <- cbind(
  machine = "Illumina Genome Analyzer",
  BRCA.miRNASeq.illuminaga
)
BRCA.miRNASeq.illuminahiseq <- cbind(
  machine = "Illumina HiSeq 2000",
  BRCA.miRNASeq.illuminahiseq
)
BRCA.miRNASeq <- rbind(BRCA.miRNASeq.illuminaga, BRCA.miRNASeq.illuminahiseq)

##############
##### isoforms
##############

# Directory in which untarred data will be stored
dir.create("data8")

# Download ACC isoforms data and store it in data8 folder
cancerType <- "ACC"
downloadTCGA(
  cancerTypes = cancerType,
  dataSet = "Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_isoforms_normalized__data.Level_3",
  destDir = "data8"
)

# Shorten path of subdirectory with ACC isoforms data
list.files(path = "data8", full.names = TRUE) %>%
  file.rename(
    from = .,
    to = file.path("data8", paste0(cancerType, ".isoforms"))
  )

# Remove manifest.txt file
list.files(path = "data8", full.names = TRUE) %>%
  list.files(path = ., full.names = TRUE) %>%
  grep("MANIFEST.txt", x = ., value = TRUE) %>%
  file.remove()

# Read ACC isoforms data
path <- list.files(path = "data8", full.names = TRUE) %>%
  list.files(path = ., full.names = TRUE)
ACC.isoforms <- readTCGA(path, dataType = "isoforms")

## End(Not run)