## ----eval=FALSE----------------------------------------------------------
#  if (!requireNamespace("BiocManager", quietly=TRUE))
#      install.packages("BiocManager")
#  BiocManager::install("gdsfmt")
#  BiocManager::install("SNPRelate")

## ----eval=FALSE----------------------------------------------------------
#  library("devtools")
#  install_github("zhengxwen/gdsfmt")
#  install_github("zhengxwen/SNPRelate")

## ------------------------------------------------------------------------
# Load the R packages: gdsfmt and SNPRelate
library(gdsfmt)
library(SNPRelate)

## ------------------------------------------------------------------------
snpgdsSummary(snpgdsExampleFileName())

## ------------------------------------------------------------------------
# Open a GDS file
(genofile <- snpgdsOpen(snpgdsExampleFileName()))

## ------------------------------------------------------------------------
# Get the attributes of chromosome coding
get.attr.gdsn(index.gdsn(genofile, "snp.chromosome"))

## ------------------------------------------------------------------------
# Take out genotype data for the first 3 samples and the first 5 SNPs
(g <- read.gdsn(index.gdsn(genofile, "genotype"), start=c(1,1), count=c(5,3)))

## ----eval=FALSE----------------------------------------------------------
#  g <- snpgdsGetGeno(genofile, sample.id=..., snp.id=...)

## ------------------------------------------------------------------------
# Get the attribute of genotype
get.attr.gdsn(index.gdsn(genofile, "genotype"))

## ------------------------------------------------------------------------
# Take out snp.id
head(read.gdsn(index.gdsn(genofile, "snp.id")))
# Take out snp.rs.id
head(read.gdsn(index.gdsn(genofile, "snp.rs.id")))

## ------------------------------------------------------------------------
# Read population information
pop <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group"))
table(pop)

# Close the GDS file
snpgdsClose(genofile)

## ------------------------------------------------------------------------
# Load data
data(hapmap_geno)

# Create a gds file
snpgdsCreateGeno("test.gds", genmat = hapmap_geno$genotype,
    sample.id = hapmap_geno$sample.id, snp.id = hapmap_geno$snp.id,
    snp.chromosome = hapmap_geno$snp.chromosome,
    snp.position = hapmap_geno$snp.position,
    snp.allele = hapmap_geno$snp.allele, snpfirstdim=TRUE)

# Open the GDS file
(genofile <- snpgdsOpen("test.gds"))

# Close the GDS file
snpgdsClose(genofile)

## ----eval=FALSE----------------------------------------------------------
#  # Create a new GDS file
#  newfile <- createfn.gds("your_gds_file.gds")
#  
#  # add a flag
#  put.attr.gdsn(newfile$root, "FileFormat", "SNP_ARRAY")
#  
#  # Add variables
#  add.gdsn(newfile, "sample.id", sample.id)
#  add.gdsn(newfile, "snp.id", snp.id)
#  add.gdsn(newfile, "snp.chromosome", snp.chromosome)
#  add.gdsn(newfile, "snp.position", snp.position)
#  add.gdsn(newfile, "snp.allele", c("A/G", "T/C", ...))
#  
#  #####################################################################
#  # Create a snp-by-sample genotype matrix
#  
#  # Add genotypes
#  var.geno <- add.gdsn(newfile, "genotype",
#      valdim=c(length(snp.id), length(sample.id)), storage="bit2")
#  
#  # Indicate the SNP matrix is snp-by-sample
#  put.attr.gdsn(var.geno, "snp.order")
#  
#  # Write SNPs into the file sample by sample
#  for (i in 1:length(sample.id))
#  {
#      g <- ...
#      write.gdsn(var.geno, g, start=c(1,i), count=c(-1,1))
#  }
#  
#  #####################################################################
#  # OR, create a sample-by-snp genotype matrix
#  
#  # Add genotypes
#  var.geno <- add.gdsn(newfile, "genotype",
#      valdim=c(length(sample.id), length(snp.id)), storage="bit2")
#  
#  # Indicate the SNP matrix is sample-by-snp
#  put.attr.gdsn(var.geno, "sample.order")
#  
#  # Write SNPs into the file sample by sample
#  for (i in 1:length(snp.id))
#  {
#      g <- ...
#      write.gdsn(var.geno, g, start=c(1,i), count=c(-1,1))
#  }
#  
#  # Get a description of chromosome codes
#  #   allowing to define a new chromosome code, e.g., snpgdsOption(Z=27)
#  option <- snpgdsOption()
#  var.chr <- index.gdsn(newfile, "snp.chromosome")
#  put.attr.gdsn(var.chr, "autosome.start", option$autosome.start)
#  put.attr.gdsn(var.chr, "autosome.end", option$autosome.end)
#  for (i in 1:length(option$chromosome.code))
#  {
#      put.attr.gdsn(var.chr, names(option$chromosome.code)[i],
#          option$chromosome.code[[i]])
#  }
#  
#  # Add your sample annotation
#  samp.annot <- data.frame(sex = c("male", "male", "female", ...),
#      pop.group = c("CEU", "CEU", "JPT", ...), ...)
#  add.gdsn(newfile, "sample.annot", samp.annot)
#  
#  # Add your SNP annotation
#  snp.annot <- data.frame(pass=c(TRUE, TRUE, FALSE, FALSE, TRUE, ...), ...)
#  add.gdsn(newfile, "snp.annot", snp.annot)
#  
#  # Close the GDS file
#  closefn.gds(newfile)

## ------------------------------------------------------------------------
# The PLINK BED file, using the example in the SNPRelate package
bed.fn <- system.file("extdata", "plinkhapmap.bed.gz", package="SNPRelate")
fam.fn <- system.file("extdata", "plinkhapmap.fam.gz", package="SNPRelate")
bim.fn <- system.file("extdata", "plinkhapmap.bim.gz", package="SNPRelate")

## ----eval=FALSE----------------------------------------------------------
#  bed.fn <- "C:/your_folder/your_plink_file.bed"
#  fam.fn <- "C:/your_folder/your_plink_file.fam"
#  bim.fn <- "C:/your_folder/your_plink_file.bim"

## ------------------------------------------------------------------------
# Convert
snpgdsBED2GDS(bed.fn, fam.fn, bim.fn, "test.gds")

# Summary
snpgdsSummary("test.gds")

## ------------------------------------------------------------------------
# The VCF file, using the example in the SNPRelate package
vcf.fn <- system.file("extdata", "sequence.vcf", package="SNPRelate")

## ----eval=FALSE----------------------------------------------------------
#  vcf.fn <- "C:/your_folder/your_vcf_file.vcf"

## ------------------------------------------------------------------------
# Reformat
snpgdsVCF2GDS(vcf.fn, "test.gds", method="biallelic.only")

# Summary
snpgdsSummary("test.gds")

## ------------------------------------------------------------------------
# Open the GDS file
genofile <- snpgdsOpen(snpgdsExampleFileName())

## ------------------------------------------------------------------------
# Get population information
#   or pop_code <- scan("pop.txt", what=character())
#   if it is stored in a text file "pop.txt"
pop_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group"))

table(pop_code)

# Display the first six values
head(pop_code)

## ------------------------------------------------------------------------
set.seed(1000)

# Try different LD thresholds for sensitivity analysis
snpset <- snpgdsLDpruning(genofile, ld.threshold=0.2)
names(snpset)
head(snpset$chr1)  # snp.id

# Get all selected snp id
snpset.id <- unlist(unname(snpset))
head(snpset.id)

## ------------------------------------------------------------------------
# Run PCA
pca <- snpgdsPCA(genofile, snp.id=snpset.id, num.thread=2)

## ------------------------------------------------------------------------
# variance proportion (%)
pc.percent <- pca$varprop*100
head(round(pc.percent, 2))

## ----fig.width=5, fig.height=5, fig.align='center'-----------------------
# make a data.frame
tab <- data.frame(sample.id = pca$sample.id,
    EV1 = pca$eigenvect[,1],    # the first eigenvector
    EV2 = pca$eigenvect[,2],    # the second eigenvector
    stringsAsFactors = FALSE)
head(tab)

# Draw
plot(tab$EV2, tab$EV1, xlab="eigenvector 2", ylab="eigenvector 1")

## ------------------------------------------------------------------------
# Get sample id
sample.id <- read.gdsn(index.gdsn(genofile, "sample.id"))

# Get population information
#   or pop_code <- scan("pop.txt", what=character())
#   if it is stored in a text file "pop.txt"
pop_code <- read.gdsn(index.gdsn(genofile, "sample.annot/pop.group"))

# assume the order of sample IDs is as the same as population codes
head(cbind(sample.id, pop_code))

## ----fig.width=5, fig.height=5, fig.align='center'-----------------------
# Make a data.frame
tab <- data.frame(sample.id = pca$sample.id,
    pop = factor(pop_code)[match(pca$sample.id, sample.id)],
    EV1 = pca$eigenvect[,1],    # the first eigenvector
    EV2 = pca$eigenvect[,2],    # the second eigenvector
    stringsAsFactors = FALSE)
head(tab)

# Draw
plot(tab$EV2, tab$EV1, col=as.integer(tab$pop), xlab="eigenvector 2", ylab="eigenvector 1")
legend("bottomright", legend=levels(tab$pop), pch="o", col=1:nlevels(tab$pop))

## ----fig.width=5, fig.height=5, fig.align='center'-----------------------
lbls <- paste("PC", 1:4, "\n", format(pc.percent[1:4], digits=2), "%", sep="")
pairs(pca$eigenvect[,1:4], col=tab$pop, labels=lbls)

## ----fig.width=8, fig.height=5, fig.align='center'-----------------------
library(MASS)

datpop <- factor(pop_code)[match(pca$sample.id, sample.id)]
parcoord(pca$eigenvect[,1:16], col=datpop)

## ----fig.width=8, fig.height=4, fig.align='center'-----------------------
# Get chromosome index
chr <- read.gdsn(index.gdsn(genofile, "snp.chromosome"))
CORR <- snpgdsPCACorr(pca, genofile, eig.which=1:4)

savepar <- par(mfrow=c(2,1), mai=c(0.45, 0.55, 0.1, 0.25))
for (i in 1:2)
{
    plot(abs(CORR$snpcorr[i,]), ylim=c(0,1), xlab="", ylab=paste("PC", i),
        col=chr, pch="+")
}
par(savepar)

## ------------------------------------------------------------------------
# Get sample id
sample.id <- read.gdsn(index.gdsn(genofile, "sample.id"))

# Get population information
#   or pop_code <- scan("pop.txt", what=character())
#   if it is stored in a text file "pop.txt"
pop_code <- read.gdsn(index.gdsn(genofile, "sample.annot/pop.group"))

# Two populations: HCB and JPT
flag <- pop_code %in% c("HCB", "JPT")
samp.sel <- sample.id[flag]
pop.sel <- pop_code[flag]
v <- snpgdsFst(genofile, sample.id=samp.sel, population=as.factor(pop.sel),
    method="W&C84")
v$Fst        # Weir and Cockerham weighted Fst estimate
v$MeanFst    # Weir and Cockerham mean Fst estimate
summary(v$FstSNP)

# Multiple populations: CEU HCB JPT YRI
#   we should remove offsprings
father <- read.gdsn(index.gdsn(genofile, "sample.annot/father.id"))
mother <- read.gdsn(index.gdsn(genofile, "sample.annot/mother.id"))
flag <- (father=="") & (mother=="")
samp.sel <- sample.id[flag]
pop.sel <- pop_code[flag]
v <- snpgdsFst(genofile, sample.id=samp.sel, population=as.factor(pop.sel),
    method="W&C84")
v$Fst        # Weir and Cockerham weighted Fst estimate
v$MeanFst    # Weir and Cockerham mean Fst estimate
summary(v$FstSNP)

## ------------------------------------------------------------------------
# YRI samples
sample.id <- read.gdsn(index.gdsn(genofile, "sample.id"))
YRI.id <- sample.id[pop_code == "YRI"]

## ----fig.width=5, fig.height=5, fig.align='center'-----------------------
# Estimate IBD coefficients
ibd <- snpgdsIBDMoM(genofile, sample.id=YRI.id, snp.id=snpset.id,
    maf=0.05, missing.rate=0.05, num.thread=2)

# Make a data.frame
ibd.coeff <- snpgdsIBDSelection(ibd)
head(ibd.coeff)

plot(ibd.coeff$k0, ibd.coeff$k1, xlim=c(0,1), ylim=c(0,1),
    xlab="k0", ylab="k1", main="YRI samples (MoM)")
lines(c(0,1), c(1,0), col="red", lty=2)

## ----fig.width=5, fig.height=5, fig.align='center'-----------------------
# Estimate IBD coefficients
set.seed(100)
snp.id <- sample(snpset.id, 1500)  # random 1500 SNPs
ibd <- snpgdsIBDMLE(genofile, sample.id=YRI.id, snp.id=snp.id,
    maf=0.05, missing.rate=0.05, num.thread=2)

# Make a data.frame
ibd.coeff <- snpgdsIBDSelection(ibd)

plot(ibd.coeff$k0, ibd.coeff$k1, xlim=c(0,1), ylim=c(0,1),
    xlab="k0", ylab="k1", main="YRI samples (MLE)")
lines(c(0,1), c(1,0), col="red", lty=2)

## ----fig.width=5, fig.height=5, fig.align='center'-----------------------
# Incorporate with pedigree information
family.id <- read.gdsn(index.gdsn(genofile, "sample.annot/family.id"))
family.id <- family.id[match(YRI.id, sample.id)]
table(family.id)

ibd.robust <- snpgdsIBDKING(genofile, sample.id=YRI.id,
    family.id=family.id, num.thread=2)
names(ibd.robust)

# Pairs of individuals
dat <- snpgdsIBDSelection(ibd.robust)
head(dat)

plot(dat$IBS0, dat$kinship, xlab="Proportion of Zero IBS",
    ylab="Estimated Kinship Coefficient (KING-robust)")

## ------------------------------------------------------------------------
ibs <- snpgdsIBS(genofile, num.thread=2)

## ----fig.width=5, fig.height=5, fig.align='center'-----------------------
# individulas in the same population are clustered together
pop.idx <- order(pop_code)

image(ibs$ibs[pop.idx, pop.idx], col=terrain.colors(16))

## ----fig.width=5, fig.height=5, fig.align='center'-----------------------
loc <- cmdscale(1 - ibs$ibs, k = 2)
x <- loc[, 1]; y <- loc[, 2]
race <- as.factor(pop_code)

plot(x, y, col=race, xlab = "", ylab = "",
    main = "Multidimensional Scaling Analysis (IBS)")
legend("topleft", legend=levels(race), pch="o", text.col=1:nlevels(race))

## ----fig.width=5, fig.height=5, fig.align='center'-----------------------
set.seed(100)
ibs.hc <- snpgdsHCluster(snpgdsIBS(genofile, num.thread=2))

# Determine groups of individuals automatically
rv <- snpgdsCutTree(ibs.hc)
plot(rv$dendrogram, leaflab="none", main="HapMap Phase II")
table(rv$samp.group)

## ----fig.width=5, fig.height=5, fig.align='center'-----------------------
# Determine groups of individuals by population information
rv2 <- snpgdsCutTree(ibs.hc, samp.group=as.factor(pop_code))

plot(rv2$dendrogram, leaflab="none", main="HapMap Phase II")
legend("topright", legend=levels(race), col=1:nlevels(race), pch=19, ncol=4)

# Close the GDS file
snpgdsClose(genofile)

## ------------------------------------------------------------------------
sessionInfo()

## ----echo=FALSE----------------------------------------------------------
unlink("test.gds", force=TRUE)