## ----setup, echo=FALSE-------------------------------------------------------- knitr::opts_chunk$set(collapse=TRUE, comment = "#>") suppressPackageStartupMessages(library(universalmotif)) suppressPackageStartupMessages(library(Biostrings)) data(ArabidopsisPromoters) data(ArabidopsisMotif) ## ----------------------------------------------------------------------------- library(universalmotif) library(Biostrings) ## Create some DNA sequences for use with an external program (default ## is DNA): sequences.dna <- create_sequences(seqnum = 500, freqs = c(A=0.3, C=0.2, G=0.2, T=0.3)) ## writeXStringSet(sequences.dna, "dna.fasta") sequences.dna ## Amino acid: create_sequences(alphabet = "AA") ## Any set of characters can be used create_sequences(alphabet = paste0(letters, collapse = "")) ## ----------------------------------------------------------------------------- library(universalmotif) ## Background of DNA sequences: dna <- create_sequences() get_bkg(dna, k = 1:2, list.out = FALSE) ## Background of non DNA/RNA sequences: qwerty <- create_sequences("QWERTY") get_bkg(qwerty, k = 1:2, list.out = FALSE) ## ----------------------------------------------------------------------------- library(universalmotif) library(Biostrings) data(ArabidopsisPromoters) ## Potentially starting off with some external sequences: # ArabidopsisPromoters <- readDNAStringSet("ArabidopsisPromoters.fasta") euler <- shuffle_sequences(ArabidopsisPromoters, k = 2, method = "euler") markov <- shuffle_sequences(ArabidopsisPromoters, k = 2, method = "markov") linear <- shuffle_sequences(ArabidopsisPromoters, k = 2, method = "linear") k1 <- shuffle_sequences(ArabidopsisPromoters, k = 1) ## ----------------------------------------------------------------------------- o.letter <- get_bkg(ArabidopsisPromoters, 1, as.prob = FALSE, list.out = FALSE) e.letter <- get_bkg(euler, 1, as.prob = FALSE, list.out = FALSE) m.letter <- get_bkg(markov, 1, as.prob = FALSE, list.out = FALSE) l.letter <- get_bkg(linear, 1, as.prob = FALSE, list.out = FALSE) data.frame(original=o.letter, euler=e.letter, markov=m.letter, linear=l.letter) o.counts <- get_bkg(ArabidopsisPromoters, 2, as.prob = FALSE, list.out = FALSE) e.counts <- get_bkg(euler, 2, as.prob = FALSE, list.out = FALSE) m.counts <- get_bkg(markov, 2, as.prob = FALSE, list.out = FALSE) l.counts <- get_bkg(linear, 2, as.prob = FALSE, list.out = FALSE) data.frame(original=o.counts, euler=e.counts, markov=m.counts, linear=l.counts) ## ----------------------------------------------------------------------------- library(universalmotif) string <- "DASDSDDSASDSSA" count_klets(string, 2) shuffle_string(string, 2) ## ----------------------------------------------------------------------------- library(universalmotif) get_klets(c("A", "S", "D"), 2) ## ----------------------------------------------------------------------------- library(universalmotif) m1 <- create_motif("TATATATATA", nsites = 50, type = "PWM", pseudocount = 1) m2 <- matrix(c(0.10,0.27,0.23,0.19,0.29,0.28,0.51,0.12,0.34,0.26, 0.36,0.29,0.51,0.38,0.23,0.16,0.17,0.21,0.23,0.36, 0.45,0.05,0.02,0.13,0.27,0.38,0.26,0.38,0.12,0.31, 0.09,0.40,0.24,0.30,0.21,0.19,0.05,0.30,0.31,0.08), byrow = TRUE, nrow = 4) m2 <- create_motif(m2, alphabet = "DNA", type = "PWM") m1["motif"] m2["motif"] ## ----------------------------------------------------------------------------- motif_pvalue(m2, pvalue = 0.001) ## ----------------------------------------------------------------------------- library(universalmotif) library(Biostrings) data(ArabidopsisPromoters) ## A 2-letter example: motif.k2 <- create_motif("CWWWWCC", nsites = 6) sequences.k2 <- DNAStringSet(rep(c("CAAAACC", "CTTTTCC"), 3)) motif.k2 <- add_multifreq(motif.k2, sequences.k2) ## ----------------------------------------------------------------------------- head(scan_sequences(motif.k2, ArabidopsisPromoters, RC = TRUE, threshold = 0.9, threshold.type = "logodds")) ## ----------------------------------------------------------------------------- head(scan_sequences(motif.k2, ArabidopsisPromoters, use.freq = 2, RC = TRUE, threshold = 0.9, threshold.type = "logodds")) ## ----------------------------------------------------------------------------- library(universalmotif) library(Biostrings) sequences <- DNAStringSet(rep(c("CAAAACC", "CTTTTCC"), 3)) motif <- create_motif(sequences, add.multifreq = 2:3) ## ----------------------------------------------------------------------------- library(universalmotif) data(ArabidopsisMotif) data(ArabidopsisPromoters) enrich_motifs(ArabidopsisMotif, ArabidopsisPromoters, shuffle.k = 3, threshold = 0.001, RC = TRUE) ## ----------------------------------------------------------------------------- library(universalmotif) data(ArabidopsisMotif) data(ArabidopsisPromoters) hits <- scan_sequences(ArabidopsisMotif, ArabidopsisPromoters, RC = FALSE) res <- motif_peaks(hits$start, seq.length = unique(width(ArabidopsisPromoters)), seq.count = length(ArabidopsisPromoters)) ## Significant peaks: res$Peaks ## ----------------------------------------------------------------------------- res$Plot ## ----eval=FALSE--------------------------------------------------------------- # library(universalmotif) # # ## 1. Once per session: via `options()` # # options(meme.bin = "/path/to/meme/bin/meme") # # run_meme(...) # # ## 2. Once per run: via `run_meme()` # # run_meme(..., bin = "/path/to/meme/bin/meme") ## ----eval=FALSE--------------------------------------------------------------- # library(universalmotif) # data(ArabidopsisPromoters) # # ## 1. Read sequences from disk (in fasta format): # # library(Biostrings) # # # The following `read*()` functions are available in Biostrings: # # DNA: readDNAStringSet # # DNA with quality scores: readQualityScaledDNAStringSet # # RNA: readRNAStringSet # # Amino acid: readAAStringSet # # Any: readBStringSet # # sequences <- readDNAStringSet("/path/to/sequences.fasta") # # run_meme(sequences, ...) # # ## 2. Extract from a `BSgenome` object: # # library(GenomicFeatures) # library(TxDb.Athaliana.BioMart.plantsmart28) # library(BSgenome.Athaliana.TAIR.TAIR9) # # # Let us retrieve the same promoter sequences from ArabidopsisPromoters: # gene.names <- names(ArabidopsisPromoters) # # # First get the transcript coordinates from the relevant `TxDb` object: # transcripts <- transcriptsBy(TxDb.Athaliana.BioMart.plantsmart28, # by = "gene")[gene.names] # # # There are multiple transcripts per gene, we only care for the first one # # in each: # # transcripts <- lapply(transcripts, function(x) x[1]) # transcripts <- unlist(GRangesList(transcripts)) # # # Then the actual sequences: # # # Unfortunately this is a case where the chromosome names do not match # # between the two databases # # seqlevels(TxDb.Athaliana.BioMart.plantsmart28) # #> [1] "1" "2" "3" "4" "5" "Mt" "Pt" # seqlevels(BSgenome.Athaliana.TAIR.TAIR9) # #> [1] "Chr1" "Chr2" "Chr3" "Chr4" "Chr5" "ChrM" "ChrC" # # # So we must first rename the chromosomes in `transcripts`: # seqlevels(transcripts) <- seqlevels(BSgenome.Athaliana.TAIR.TAIR9) # # # Finally we can extract the sequences # promoters <- getPromoterSeq(transcripts, # BSgenome.Athaliana.TAIR.TAIR9, # upstream = 0, downstream = 1000) # # run_meme(promoters, ...) ## ----eval=FALSE--------------------------------------------------------------- # run_meme(sequences, output = "/path/to/desired/output/folder") ## ----sessionInfo, echo=FALSE-------------------------------------------------- sessionInfo()