### R code from vignette source 'vignettes/SeqArray/inst/doc/SeqArrayTutorial.Rnw' ################################################### ### code chunk number 1: SeqArrayTutorial.Rnw:90-92 ################################################### # load the R package SeqArray library(SeqArray) ################################################### ### code chunk number 2: SeqArrayTutorial.Rnw:96-101 ################################################### gds.fn <- seqExampleFileName("gds") # or gds.fn <- "C:/YourFolder/Your_GDS_File.gds" gds.fn seqSummary(gds.fn) ################################################### ### code chunk number 3: SeqArrayTutorial.Rnw:105-110 ################################################### # open a GDS file genofile <- seqOpen(gds.fn) # display the contents of the GDS file in a hierarchical structure genofile ################################################### ### code chunk number 4: SeqArrayTutorial.Rnw:114-119 ################################################### # display all contents of the GDS file print(genofile, all=TRUE) # close the GDS file seqClose(genofile) ################################################### ### code chunk number 5: SeqArrayTutorial.Rnw:173-180 ################################################### # the VCF file, using the example in the SeqArray package vcf.fn <- seqExampleFileName("vcf") # or vcf.fn <- "C:/YourFolder/Your_VCF_File.vcf" vcf.fn # parse the header seqVCF.Header(vcf.fn) ################################################### ### code chunk number 6: SeqArrayTutorial.Rnw:184-187 ################################################### # convert, save in "tmp.gds" seqVCF2GDS(vcf.fn, "tmp.gds") seqSummary("tmp.gds") ################################################### ### code chunk number 7: SeqArrayTutorial.Rnw:190-191 ################################################### unlink("tmp.gds") ################################################### ### code chunk number 8: SeqArrayTutorial.Rnw:200-221 ################################################### # the file of GDS gds.fn <- seqExampleFileName("gds") # or gds.fn <- "C:/YourFolder/Your_GDS_File.gds" # open a GDS file genofile <- seqOpen(gds.fn) # convert seqGDS2VCF(genofile, "tmp.vcf.gz") # read z <- readLines("tmp.vcf.gz", n=20) for (i in z) cat(substr(i, 1, 76), " ...\n", sep="") # output BN,GP,AA,HM2 in INFO (the variables are in this order), no FORMAT seqGDS2VCF(genofile, "tmp2.vcf.gz", info.var=c("BN","GP","AA","HM2"), fmt.var=character(0)) # read z <- readLines("tmp2.vcf.gz", n=15) for (i in z) cat(substr(i, 1, 56), " ...\n", sep="") # close the GDS file seqClose(genofile) ################################################### ### code chunk number 9: SeqArrayTutorial.Rnw:243-245 ################################################### # delete temporary files unlink(c("tmp.vcf.gz", "tmp1.vcf.gz", "tmp2.vcf.gz")) ################################################### ### code chunk number 10: SeqArrayTutorial.Rnw:254-278 ################################################### # the file of VCF vcf.fn <- seqExampleFileName("vcf") # or vcf.fn <- "C:/YourFolder/Your_VCF_File.vcf" # convert seqVCF2GDS(vcf.fn, "tmp.gds", verbose=FALSE) # make sure that open with "readonly=FALSE" genofile <- seqOpen("tmp.gds", readonly=FALSE) # display the original structure genofile # delete "HM2", "HM3", "AA", "OR" and "DP" seqDelete(genofile, info.varname=c("HM2", "HM3", "AA", "OR"), format.varname="DP") # display genofile # close the GDS file seqClose(genofile) # clean up the fragments, reduce the file size cleanup.gds("tmp.gds") ################################################### ### code chunk number 11: SeqArrayTutorial.Rnw:281-282 ################################################### unlink("tmp.gds") ################################################### ### code chunk number 12: SeqArrayTutorial.Rnw:295-298 ################################################### # open a GDS file gds.fn <- seqExampleFileName("gds") genofile <- seqOpen(gds.fn) ################################################### ### code chunk number 13: SeqArrayTutorial.Rnw:302-316 ################################################### # take out sample id head(samp.id <- seqGetData(genofile, "sample.id")) # take out variant id head(variant.id <- seqGetData(genofile, "variant.id")) # get "chromosome" table(seqGetData(genofile, "chromosome")) # get "allele" head(seqGetData(genofile, "allele")) # get "annotation/info/GP" head(seqGetData(genofile, "annotation/info/GP")) ################################################### ### code chunk number 14: SeqArrayTutorial.Rnw:320-327 ################################################### # set sample and variant filters seqSetFilter(genofile, sample.id=samp.id[c(2,4,6)]) set.seed(100) seqSetFilter(genofile, variant.id=sample(variant.id, 4)) # get "allele" seqGetData(genofile, "allele") ################################################### ### code chunk number 15: SeqArrayTutorial.Rnw:331-333 ################################################### # get genotypic data seqGetData(genofile, "genotype") ################################################### ### code chunk number 16: SeqArrayTutorial.Rnw:337-339 ################################################### # get "annotation/info/AA", a variable-length dataset seqGetData(genofile, "annotation/info/AA") ################################################### ### code chunk number 17: SeqArrayTutorial.Rnw:342-344 ################################################### # get "annotation/format/DP", a variable-length dataset seqGetData(genofile, "annotation/format/DP") ################################################### ### code chunk number 18: SeqArrayTutorial.Rnw:352-355 ################################################### # set sample and variant filters set.seed(100) seqSetFilter(genofile, sample.id=samp.id[c(2,4,6)], variant.id=sample(variant.id, 4)) ################################################### ### code chunk number 19: SeqArrayTutorial.Rnw:357-360 ################################################### # read multiple variables variant by variant seqApply(genofile, c(geno="genotype", id="annotation/id"), FUN=function(x) print(x), margin="by.variant", as.is="none") ################################################### ### code chunk number 20: SeqArrayTutorial.Rnw:363-370 ################################################### # remove the sample and variant filters seqSetFilter(genofile) # get the numbers of alleles per variant z <- seqApply(genofile, "allele", FUN=function(x) length(unlist(strsplit(x,","))), as.is="integer") table(z) ################################################### ### code chunk number 21: SeqArrayTutorial.Rnw:374-375 ################################################### HM3 <- seqGetData(genofile, "annotation/info/HM3") ################################################### ### code chunk number 22: SeqArrayTutorial.Rnw:377-384 ################################################### # Now HM3 is a global variable # print out RS id if the frequency of reference allele is less than 0.5% and it is HM3 seqApply(genofile, c(geno="genotype", id="annotation/id"), FUN = function(index, x) { p <- mean(x$geno == 0, na.rm=TRUE) # the frequency of reference allele if ((p < 0.005) & HM3[index]) print(x$id) }, as.is="none", var.index="relative", margin="by.variant") ################################################### ### code chunk number 23: SeqArrayTutorial.Rnw:392-397 ################################################### # calculate the frequency of reference allele, afreq <- seqApply(genofile, "genotype", FUN=function(x) mean(x==0, na.rm=TRUE), as.is="double", margin="by.variant") length(afreq) summary(afreq) ################################################### ### code chunk number 24: SeqArrayTutorial.Rnw:401-406 ################################################### # load the "parallel" package library(parallel) # Use option cl.core to choose an appropriate cluster size (or # of cores) cl <- makeCluster(getOption("cl.cores", 2)) ################################################### ### code chunk number 25: SeqArrayTutorial.Rnw:408-416 ################################################### # run in parallel afreq <- seqParallel(cl, genofile, FUN = function(gdsfile) { seqApply(gdsfile, "genotype", as.is="double", FUN=function(x) mean(x==0, na.rm=TRUE)) }, split = "by.variant") length(afreq) summary(afreq) ################################################### ### code chunk number 26: SeqArrayTutorial.Rnw:419-421 (eval = FALSE) ################################################### ## # Since we created the cluster manually, we should stop it: ## stopCluster(cl) ################################################### ### code chunk number 27: SeqArrayTutorial.Rnw:424-425 ################################################### seqClose(genofile) ################################################### ### code chunk number 28: SeqArrayTutorial.Rnw:440-443 ################################################### # open a GDS file gds.fn <- seqExampleFileName("gds") genofile <- seqOpen(gds.fn) ################################################### ### code chunk number 29: SeqArrayTutorial.Rnw:526-544 ################################################### m.variant <- local({ # get ploidy, the number of samples and variants z <- seqSummary(genofile, "genotype") # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- z$seldim # ploidy ploidy <- z$dim[1] # apply the function marginally m <- seqApply(genofile, "genotype", function(x) { sum(is.na(x)) }, margin="by.variant", as.is="integer") # output m / (ploidy * dm[1]) }) length(m.variant) summary(m.variant) ################################################### ### code chunk number 30: SeqArrayTutorial.Rnw:548-570 ################################################### m.sample <- local({ # get ploidy, the number of samples and variants z <- seqSummary(genofile, "genotype") # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- z$seldim # ploidy ploidy <- z$dim[1] # need a global variable (only available in the bracket of "local") n <- integer(dm[1]) # apply the function marginally # use "<<-" operator to find "n" in the parent environment seqApply(genofile, "genotype", function(x) { n <<- n + colSums(is.na(x)) }, margin="by.variant", as.is="none") # output n / (ploidy * dm[2]) }) length(m.sample) summary(m.sample) ################################################### ### code chunk number 31: SeqArrayTutorial.Rnw:576-581 (eval = FALSE) ################################################### ## # load the "parallel" package ## library(parallel) ## ## # Use option cl.core to choose an appropriate cluster size (or # of cores) ## cl <- makeCluster(getOption("cl.cores", 2)) ################################################### ### code chunk number 32: SeqArrayTutorial.Rnw:585-606 ################################################### # run in parallel m.variant <- local({ # get ploidy, the number of samples and variants z <- seqSummary(genofile, "genotype") # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- z$seldim # ploidy ploidy <- z$dim[1] # run in parallel m <- seqParallel(cl, genofile, FUN = function(gdsfile) { # apply the function marginally seqApply(gdsfile, "genotype", function(x) { sum(is.na(x)) }, margin="by.variant", as.is="integer") }, split = "by.variant") # output m / (ploidy * dm[1]) }) summary(m.variant) ################################################### ### code chunk number 33: SeqArrayTutorial.Rnw:610-642 ################################################### m.sample <- local({ # run in parallel m <- seqParallel(cl, genofile, FUN = function(gdsfile) { # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- seqSummary(gdsfile, "genotype")$seldim # need a global variable (only available in the bracket of "local") n <- integer(dm[1]) # apply the function marginally # use "<<-" operator to find "n" in the parent environment seqApply(gdsfile, "genotype", function(x) { n <<- n + colSums(is.na(x)) }, margin="by.variant", as.is="none") # output n }, .combine = "+", # sum all variables "n" of different processes split = "by.variant") # get ploidy, the number of samples and variants z <- seqSummary(genofile, "genotype") # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- z$seldim # ploidy ploidy <- z$dim[1] # output m / (ploidy * dm[2]) }) summary(m.sample) ################################################### ### code chunk number 34: SeqArrayTutorial.Rnw:645-647 (eval = FALSE) ################################################### ## # Since we created the cluster manually, we should stop it: ## stopCluster(cl) ################################################### ### code chunk number 35: SeqArrayTutorial.Rnw:656-662 ################################################### # apply the function variant by variant afreq <- seqApply(genofile, "genotype", function(x) { mean(x==0, na.rm=TRUE) }, as.is="double", margin="by.variant") length(afreq) summary(afreq) ################################################### ### code chunk number 36: SeqArrayTutorial.Rnw:668-673 (eval = FALSE) ################################################### ## # load the "parallel" package ## library(parallel) ## ## # Use option cl.core to choose an appropriate cluster size (or # of cores) ## cl <- makeCluster(getOption("cl.cores", 2)) ################################################### ### code chunk number 37: SeqArrayTutorial.Rnw:675-683 ################################################### # run in parallel afreq <- seqParallel(cl, genofile, FUN = function(gdsfile) { seqApply(gdsfile, "genotype", as.is="double", FUN=function(x) mean(x==0, na.rm=TRUE)) }, split = "by.variant") length(afreq) summary(afreq) ################################################### ### code chunk number 38: SeqArrayTutorial.Rnw:686-688 (eval = FALSE) ################################################### ## # Since we created the cluster manually, we should stop it: ## stopCluster(cl) ################################################### ### code chunk number 39: SeqArrayTutorial.Rnw:701-730 ################################################### # genetic correlation matrix genmat <- local({ # get the number of samples and variants # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- seqSummary(genofile, "genotype")$seldim # need a global variable (only available in the bracket of "local") s <- matrix(0.0, nrow=dm[1], ncol=dm[1]) # apply the function variant by variant # use "<<-" operator to find "s" in the parent environment seqApply(genofile, "genotype", function(x) { g <- (x==0) # indicator of reference allele p <- mean(g, na.rm=TRUE) # allele frequency g2 <- colSums(g) - 2*p # genotypes adjusted by allele frequency m <- (g2 %o% g2) / (p*(1-p)) # genetic correlation matrix m[!is.finite(m)] <- 0 # correct missing values s <<- s + m # output to the global variable "s" }, margin="by.variant", as.is="none") # output, scaled by the trace of matrix "s" over the number of samples s / (sum(diag(s)) / dm[1]) }) # eigen-decomposition eig <- eigen(genmat) # eigenvalues head(eig$value) ################################################### ### code chunk number 40: SeqArrayTutorial.Rnw:732-734 ################################################### # eigenvectors plot(eig$vectors[,1], eig$vectors[,2], xlab="PC 1", ylab="PC 2") ################################################### ### code chunk number 41: SeqArrayTutorial.Rnw:741-746 (eval = FALSE) ################################################### ## # load the "parallel" package ## library(parallel) ## ## # Use option cl.core to choose an appropriate cluster size (or # of cores) ## cl <- makeCluster(getOption("cl.cores", 2)) ################################################### ### code chunk number 42: SeqArrayTutorial.Rnw:749-782 ################################################### genmat <- seqParallel(cl, genofile, FUN = function(gdsfile) { # get the number of samples and variants # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- seqSummary(gdsfile, "genotype")$seldim # need a global variable (only available in the bracket of "local") s <- matrix(0.0, nrow=dm[1], ncol=dm[1]) # apply the function variant by variant # use "<<-" operator to find "s" in the parent environment seqApply(gdsfile, "genotype", function(x) { g <- (x==0) # indicator of reference allele p <- mean(g, na.rm=TRUE) # allele frequency g2 <- colSums(g) - 2*p # genotypes adjusted by allele frequency m <- (g2 %o% g2) / (p*(1-p)) # genetic correlation matrix m[!is.finite(m)] <- 0 # correct missing values s <<- s + m # output to the global variable "s" }, margin="by.variant", as.is="none") # output s }, .combine = "+", # sum all variables "s" of different processes split = "by.variant") # finally, scaled by the trace of matrix "s" over the number of samples dm <- seqSummary(genofile, "genotype")$seldim genmat <- genmat / (sum(diag(genmat)) / dm[1]) # eigen-decomposition eig <- eigen(genmat) # eigenvalues head(eig$value) ################################################### ### code chunk number 43: SeqArrayTutorial.Rnw:785-787 (eval = FALSE) ################################################### ## # Since we created the cluster manually, we should stop it: ## stopCluster(cl) ################################################### ### code chunk number 44: SeqArrayTutorial.Rnw:801-827 ################################################### coeff <- local({ # get the number of samples and variants # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- seqSummary(genofile, "genotype")$seldim # need global variables (only available in the bracket of "local") n <- integer(dm[1]) s <- double(dm[1]) # apply the function variant by variant # use "<<-" operator to find "n" and "s" in the parent environment seqApply(genofile, "genotype", function(x) { p <- mean(x==0, na.rm=TRUE) # allele frequency g <- colSums(x==0) # genotypes: # of reference allele d <- (g*g - g*(1 + 2*p) + 2*p*p) / (2*p*(1-p)) n <<- n + is.finite(d) # output to the global variable "n" d[!is.finite(d)] <- 0 s <<- s + d # output to the global variable "s" }, margin="by.variant", as.is="none") # output s / n }) length(coeff) summary(coeff) ################################################### ### code chunk number 45: SeqArrayTutorial.Rnw:833-838 (eval = FALSE) ################################################### ## # load the "parallel" package ## library(parallel) ## ## # Use option cl.core to choose an appropriate cluster size (or # of cores) ## cl <- makeCluster(getOption("cl.cores", 2)) ################################################### ### code chunk number 46: SeqArrayTutorial.Rnw:841-872 ################################################### coeff <- seqParallel(cl, genofile, FUN = function(gdsfile) { # get the number of samples and variants # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- seqSummary(gdsfile, "genotype")$seldim # need global variables (only available in the bracket) n <- integer(dm[1]) s <- double(dm[1]) # apply the function variant by variant # use "<<-" operator to find "n" and "s" in the parent environment seqApply(gdsfile, "genotype", function(x) { p <- mean(x==0, na.rm=TRUE) # allele frequency g <- colSums(x==0) # genotypes: # of reference allele d <- (g*g - g*(1 + 2*p) + 2*p*p) / (2*p*(1-p)) n <<- n + is.finite(d) # output to the global variable "n" d[!is.finite(d)] <- 0 s <<- s + d # output to the global variable "s" }, margin="by.variant", as.is="none") # output list(s=s, n=n) }, # sum all variables "s" and "n" of different processes .combine = function(x1,x2) { list(s = x1$s + x2$s, n = x1$n + x2$n) }, split = "by.variant") # finally, average! coeff <- coeff$s / coeff$n summary(coeff) ################################################### ### code chunk number 47: SeqArrayTutorial.Rnw:875-877 ################################################### # Since we created the cluster manually, we should stop it: stopCluster(cl) ################################################### ### code chunk number 48: SeqArrayTutorial.Rnw:885-886 (eval = FALSE) ################################################### ## library(Rcpp) ################################################### ### code chunk number 49: SeqArrayTutorial.Rnw:890-907 (eval = FALSE) ################################################### ## cppFunction('double RefAlleleFreq(IntegerMatrix x) ## { ## int nrow = x.nrow(), ncol = x.ncol(); ## int cnt=0, zero_cnt=0, g; ## for (int i = 0; i < nrow; i++) ## { ## for (int j = 0; j < ncol; j++) ## { ## if ((g = x(i, j)) != NA_INTEGER) ## { ## cnt ++; ## if (g == 0) zero_cnt ++; ## } ## } ## } ## return double(zero_cnt) / cnt; ## }') ################################################### ### code chunk number 50: SeqArrayTutorial.Rnw:911-917 (eval = FALSE) ################################################### ## RefAlleleFreq ## ## afreq <- seqApply(genofile, "genotype", RefAlleleFreq, ## as.is="double", margin="by.variant") ## ## summary(afreq) ################################################### ### code chunk number 51: SeqArrayTutorial.Rnw:921-923 ################################################### # close the GDS file seqClose(genofile)