seqApply {SeqArray} | R Documentation |
Returns a vector or list of values obtained by applying a function to margins of genotypes and annotations.
seqApply(gdsfile, var.name, FUN, margin=c("by.variant", "by.sample"), as.is=c("none", "list", "integer", "double", "character", "logical", "raw"), var.index=c("none", "relative", "absolute"), parallel=FALSE, .useraw=FALSE, .progress=FALSE, .list_dup=TRUE, ...)
gdsfile |
a |
var.name |
the variable name(s), see details |
FUN |
the function to be applied |
margin |
giving the dimension which the function will be applied over;
|
as.is |
returned value: a list, an integer vector, etc; return nothing
by default |
var.index |
if |
parallel |
|
.useraw |
|
.progress |
if |
.list_dup |
internal use only |
... |
optional arguments to |
The variable name should be "sample.id"
, "variant.id"
,
"position"
, "chromosome"
, "allele"
, "genotype"
,
"annotation/id"
, "annotation/qual"
, "annotation/filter"
,
"annotation/info/VARIABLE_NAME"
, or
"annotation/format/VARIABLE_NAME"
.
"@genotype"
, "annotation/info/@VARIABLE_NAME"
or
"annotation/format/@VARIABLE_NAME"
are used to obtain the index
associated with these variables.
"$dosage"
is also allowed for the dosages of reference allele (integer:
0, 1, 2 and NA for diploid genotypes).
"$dosage_alt"
returns a RAW/INTEGER matrix for the dosages of alternative
allele without distinguishing different alternative alleles.
"$num_allele"
returns an integer vector with the numbers of distinct
alleles.
"$ref"
returns a character vector of reference alleles
"$alt"
returns a character vector of alternative alleles (delimited by
comma)
"$chrom_pos"
returns characters with the combination of chromosome and
position, e.g., "1:1272721". "$chrom_pos_allele"
returns characters with
the combination of chromosome, position and alleles, e.g., "1:1272721_A_G"
(i.e., chr:position_REF_ALT).
The algorithm is highly optimized by blocking the computations to exploit the high-speed memory instead of disk.
A vector, a list of values or none.
Xiuwen Zheng
seqBlockApply
, seqSetFilter
,
seqGetData
, seqParallel
,
seqGetParallel
# the GDS file (gds.fn <- seqExampleFileName("gds")) # display (f <- seqOpen(gds.fn)) # get 'sample.id (samp.id <- seqGetData(f, "sample.id")) # "NA06984" "NA06985" "NA06986" ... # get 'variant.id' head(variant.id <- seqGetData(f, "variant.id")) # set sample and variant filters set.seed(100) seqSetFilter(f, sample.id=samp.id[c(2,4,6,8,10)], variant.id=sample(variant.id, 10)) # read seqApply(f, "genotype", FUN=print, margin="by.variant") seqApply(f, "genotype", FUN=print, margin="by.variant", .useraw=TRUE) seqApply(f, "genotype", FUN=print, margin="by.sample") seqApply(f, "genotype", FUN=print, margin="by.sample", .useraw=TRUE) # read multiple variables variant by variant seqApply(f, c(geno="genotype", phase="phase", rsid="annotation/id", DP="annotation/format/DP"), FUN=print, as.is="none") # get the numbers of alleles per variant seqApply(f, "allele", FUN=function(x) length(unlist(strsplit(x,","))), as.is="integer") # output to a file fl <- file("tmp.txt", "wt") seqApply(f, "genotype", FUN=sum, na.rm=TRUE, as.is=fl) close(fl) readLines("tmp.txt") seqApply(f, "genotype", FUN=sum, na.rm=TRUE, as.is=stdout()) seqApply(f, "genotype", FUN=sum, na.rm=TRUE, as.is="integer") # should be identical ################################################################ # with an index of variant seqApply(f, c(geno="genotype", phase="phase", rsid="annotation/id"), FUN=function(index, x) { print(index); print(x); index }, as.is="integer", var.index="relative") # it is as the same as which(seqGetFilter(f)$variant.sel) ################################################################ # reset sample and variant filters seqResetFilter(f) # calculate the frequency of reference allele, # a faster version could be obtained by C coding af <- seqApply(f, "genotype", FUN=function(x) mean(x==0L, na.rm=TRUE), as.is="double") length(af) summary(af) ################################################################ # apply the user-defined function sample by sample # reset sample and variant filters seqResetFilter(f) summary(seqApply(f, "genotype", FUN=function(x) { mean(is.na(x)) }, margin="by.sample", as.is="double")) # set sample and variant filters set.seed(100) seqSetFilter(f, sample.id=samp.id[c(2,4,6,8,10)], variant.id=sample(variant.id, 10)) seqApply(f, "genotype", FUN=print, margin="by.variant", as.is="none") seqApply(f, "genotype", FUN=print, margin="by.sample", as.is="none") seqApply(f, c(sample.id="sample.id", genotype="genotype"), FUN=print, margin="by.sample", as.is="none") # close the GDS file seqClose(f) # delete the temporary file unlink("tmp.txt")