## ----style-knitr, eval=TRUE, echo=FALSE, results="asis"-------------------- BiocStyle::latex(use.unsrturl=FALSE) ## ----setup_knitr, include=FALSE, cache=FALSE------------------------------- library(knitr) opts_chunk$set(cache = TRUE, tidy = FALSE, tidy.opts = list(blank = FALSE, width.cutoff=70), highlight = FALSE, out.width = "7cm", out.height = "7cm", fig.align = "center") ## ----eval = TRUE, message = FALSE------------------------------------------ library(GenomicRanges) library(rtracklayer) ## ----annotation, eval=FALSE------------------------------------------------ # gtf_dir <- "gencode.v12.annotation.gtf" # # gtf <- import(gtf_dir) # # ### Keep protein coding genes # keep_index <- mcols(gtf)$gene_type == "protein_coding" & # mcols(gtf)$type == "gene" # gtf <- gtf[keep_index] # ### Remove 'chr' from chromosome names # seqlevels(gtf) <- gsub(pattern = "chr", replacement = "", x = seqlevels(gtf)) # # genes_bed <- data.frame(chr = seqnames(gtf), start = start(gtf), # end = end(gtf), geneId = mcols(gtf)$gene_id, # stringsAsFactors = FALSE) # # for(i in as.character(1:22)){ # genes_bed_sub <- genes_bed[genes_bed$chr == i, ] # write.table(genes_bed_sub, "genes_chr", i ,".bed", quote = FALSE, # sep = "\t", row.names = FALSE, col.names = FALSE) # } # ## ----eval = TRUE, message = FALSE------------------------------------------ library(limma) ## ----eval=FALSE------------------------------------------------------------ # metadata_dir <- "E-GEUV-1.sdrf.txt" # # samples <- read.table(metadata_dir, header = TRUE, sep = "\t", as.is = TRUE) # # samples <- unique(samples[c("Assay.Name", "Characteristics.population.")]) # colnames(samples) <- c("sample_id", "population") # # samples$sample_id_short <- strsplit2(samples$sample_id, "\\.")[,1] # ## ----eval = FALSE---------------------------------------------------------- # expr_dir <- "GD660.TrQuantCount.txt" # # expr_all <- read.table(expr_dir, header = TRUE, sep="\t", as.is = TRUE) # # expr_all <- expr_all[, c("TargetID", "Gene_Symbol", "Chr", # samples$sample_id)] # colnames(expr_all) <- c("TargetID", "Gene_Symbol", "Chr", # samples$sample_id_short) # # for(j in "CEU"){ # for(i in 1:22){ # expr <- expr_all[expr_all$Chr == i, c("TargetID", "Gene_Symbol", # samples$sample_id_short[samples$population == j])] # write.table(expr, paste0("TrQuantCount_", j, "_chr", i, ".tsv"), # quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE) # } # } ## ----eval = TRUE, message = FALSE------------------------------------------ library(Rsamtools) ## ----eval = FALSE---------------------------------------------------------- # files <- list.files(path = ".", pattern = "genotypes.vcf.gz", # full.names = TRUE, include.dirs = FALSE) # # ### bgzip and index the vcf files # for(i in 1:length(files)){ # zipped <- bgzip(files[i]) # idx <- indexTabix(zipped, format = "vcf") # } ## ----eval = TRUE, message = FALSE------------------------------------------ library(VariantAnnotation) library(tools) ## ----eval = FALSE---------------------------------------------------------- # ### Extended gene ranges # window <- 5000 # gene_ranges <- resize(gtf, GenomicRanges::width(gtf) + 2 * window, # fix = "center") # # chr <- gsub("chr", "", strsplit2(files, split = "\\.")[, 2]) # # for(j in "CEU"){ # for(i in 1:length(files)){ # cat(j, chr[i], fill = TRUE) # # zipped <- paste0(file_path_sans_ext(files[i]), ".bgz") # idx <- paste0(file_path_sans_ext(files[i]), ".bgz.tbi") # tab <- TabixFile(zipped, idx) # # ### Explore the file header with scanVcfHeader # hdr <- scanVcfHeader(tab) # print(all(samples$sample_id_short %in% samples(hdr))) # # ### Read a subset of VCF file # gene_ranges_tmp <- gene_ranges[seqnames(gene_ranges) == chr[i]] # param <- ScanVcfParam(which = gene_ranges_tmp, samples = # samples$sample_id_short[samples$population == j]) # vcf <- readVcf(tab, "hg19", param) # # ### Keep only the bi-allelic SNPs # # width of ref seq # rw <- width(ref(vcf)) # # width of first alt seq # aw <- unlist(lapply(alt(vcf), function(x) {width(x[1])})) # # number of alternate genotypes # nalt <- elementLengths(alt(vcf)) # # select only bi-allelic SNPs (monomorphic OK, so aw can be 0 or 1) # snp <- rw == 1 & aw <= 1 & nalt == 1 # # subset vcf # vcfbi <- vcf[snp,] # # rowdata <- rowData(vcfbi) # # ### Convert genotype into number of alleles different from reference # geno <- geno(vcfbi)$GT # geno01 <- geno # geno01[,] <- -1 # geno01[geno %in% c("0/0", "0|0")] <- 0 # REF/REF # geno01[geno %in% c("0/1", "0|1", "1/0", "1|0")] <- 1 # REF/ALT # geno01[geno %in% c("1/1", "1|1")] <- 2 # ALT/ALT # mode(geno01) <- "integer" # # genotypes <- unique(data.frame(chr = seqnames(rowdata), # start = start(rowdata), end = end(rowdata), snpId = rownames(geno01), # geno01, stringsAsFactors = FALSE)) # # ### Sort SNPs by position # genotypes <- genotypes[order(genotypes[ ,2]), ] # # write.table(genotypes, file = paste0("genotypes_", j, "_chr", # chr[i], ".tsv"), quote = FALSE, sep = "\t", row.names = FALSE, # col.names = TRUE) # # } # } # ## ----sessionInfo----------------------------------------------------------- sessionInfo()