diff --git a/DESCRIPTION b/DESCRIPTION index a5d027cc..d2cc10b3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -78,6 +78,7 @@ Collate: utils.R misc.R SparseList-class.R MIndex-class.R + MIndexList-class.R lowlevel-matching.R match-utils.R matchPattern.R diff --git a/NAMESPACE b/NAMESPACE index ba1e3bfa..b100250c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -221,7 +221,7 @@ exportMethods( exportClasses( #SparseList, - MIndex, ByPos_MIndex, + MIndex, ByPos_MIndex, MIndexList, PreprocessedTB, Twobit, ACtree2, PDict3Parts, PDict, TB_PDict, MTB_PDict, Expanded_TB_PDict diff --git a/R/MIndexList-class.R b/R/MIndexList-class.R new file mode 100644 index 00000000..89b04438 --- /dev/null +++ b/R/MIndexList-class.R @@ -0,0 +1,76 @@ +### ========================================================================= +### MIndexList objects +### ------------------------------------------------------------------------- +### + +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### The "MIndexList" class. +### +### This class serves as a base class to hold collections of MIndex objects. +### +### As with MIndex, in normal operations, the user should never need to create +### MIndex objects directly or to modify existing ones. Those objects are +### typically returned by a sequence matching/alignment function like +### vmatchPDict(). For this reason, coercion methods are relatively barebones +### and will throw errors rather than try to correct +### + +setClass("MIndexList", + contains="CompressedList", + representation( +# "VIRTUAL", + unlistData="MIndex" + ), + prototype( + elementType="MIndex" + ) +) + + +.from_list_to_MIndexList <- function(from) +{ + ## keeping ans_class variable for potential future extensions + ans_class <- "MIndexList" + + ## ensure elements are all MIndex objects + if(!all(vapply(from, inherits, logical(1L), what="MIndex"))) + stop("Only lists of MIndex objects can be coerced to MIndexList") + IRanges:::new_CompressedList_from_list(ans_class, from) +} + +## barebones, but users shouldn't be calling this directly +## expecting to be used internally, list coercion provided for QoL +setAs("list", "MIndexList", function(from) .from_list_to_MIndexList(from)) +setAs("List", "MIndexList", function(from) .from_list_to_MIndexList(as.list(from))) + +.from_MIndexList_to_compact_char <- function(object){ + lapply(object, \(m_ind){ + paste(length(m_ind), "patterns,", + sum(lengths(m_ind)), "matches") + }) +} + +setMethod("show", "MIndexList", + function(object) { + lo <- length(object) + k <- min(5, length(object)) + diffK <- lo - 5 + cat(classNameForDisplay(object), " of length ", lo, + "\n", sep = "") + + repr <- .from_MIndexList_to_compact_char(head(object, k)) + IRanges:::.showAtomicList(CharacterList(repr), minLines=10L) + if (diffK > 0) + cat("...\n<", diffK, + ifelse(diffK == 1, + " more element>\n", " more elements>\n"), + sep="") + } +) + +setMethod("showAsCell", "MIndexList", + function(object){ + repr <- .from_MIndexList_to_compact_char(object) + showAsCell(CharacterList(repr)) + } +) diff --git a/R/matchPDict.R b/R/matchPDict.R index c7324abe..1f1a9381 100644 --- a/R/matchPDict.R +++ b/R/matchPDict.R @@ -15,7 +15,7 @@ ### > library(BSgenome.Hsapiens.UCSC.hg18) ### > chr1 <- Hsapiens$chr1 ### > system.time(end_index <- endIndex(matchPDict(pdict, chr1))) -### user system elapsed +### user system elapsed ### 50.663 0.000 50.763 ### > count_index <- sapply(end_index, length) ### > table(count_index) @@ -64,8 +64,8 @@ ### 1M 36 2.5 sec 717M 106 sec 103 sec 0 ### 10M 36 56 sec 6724M 351 sec 200 sec 0 ### 10M 12 7.5 sec 340M 227 sec 216 sec 100M -### 30M 12 27 sec 523M 491 sec ? -### +### 30M 12 27 sec 523M 491 sec ? +### ### III. Inexact matching ### --------------------- ### pdict <- PDict(c("acgt", "gt", "cgt", "ac"), tb.end=2) @@ -561,10 +561,8 @@ if (!identical(weight, 1L)) warning("'weight' is ignored when 'collapse=FALSE'") } - } else { - ## vmatchPDict() - stop("vmatchPDict() is not supported yet, sorry") } + ## We are doing our own dispatch here, based on the type of 'pdict'. ## TODO: Revisit this. Would probably be a better design to use a ## generic/methods approach and rely on the standard dispatch mechanism. @@ -584,7 +582,8 @@ max.mismatch, min.mismatch, with.indels, fixed, algorithm, collapse, weight, verbose, matches.as) - if (is.null(which_pp_excluded)) + if (is.null(which_pp_excluded) && matches.as %in% + c("MATCHES_AS_WHICH", "MATCHES_AS_COUNTS")) return(ans) if (matches.as == "MATCHES_AS_WHICH") { ## vwhichPDict() @@ -599,8 +598,51 @@ } return(ans) } - ## vmatchPDict() - stop("vmatchPDict() is not supported yet, sorry") + if(matches.as == "MATCHES_AS_ENDS") { + ## vmatchPDict() by ends + ## converting all the results to MIndex objects and then wrapping the result + for(i in seq_along(ans)) { + ans[[i]] <- new("ByPos_MIndex", + width0=width(pdict), + NAMES=names(pdict), + ends=ans[[i]]) + } + + names(ans) <- names(subject) + ans <- as(ans, "MIndexList") + return(ans) + } + if(matches.as == "MATCHES_AS_RANGES") { + ## vmatchPDict() by ranges + ## This is the same as MATCHES_AS_ENDS, except ans[[i]] holds two lists + ## - The first list has the start position for each range + ## - The second list has the width of the range (including the start) + + for(i in seq_along(ans)){ + ## First we have to move from starts to ends to align with the + ## Pos_MIndex constructor + ans_start <- ans[[i]][[1]] + ans_width <- ans[[i]][[2]] + for(j in seq_along(ans_start)){ + if(!is.null(ans_start[[j]])){ + ## Note that the width includes the start, so we subtract 1 + ans_start[[j]] <- ans_start[[j]] + ans_width[[j]] - 1L + } + } + ## Now ans_start holds the **end** position + ## width(pdict) will be equivalent to ifelse(is.null(w), 0, w[1]) for + ## w in ans_width + ans[[i]] <- new("ByPos_MIndex", + width0=width(pdict), + NAMES=names(pdict), + ends=ans_start) + } + + names(ans) <- names(subject) + ans <- as(ans, "MIndexList") + return(ans) + } + warning("Failed to post-process output of .vmatch(pdict, subject)") return(ans) } @@ -771,8 +813,8 @@ setMethod("whichPDict", "MaskedXString", ### 'lapply(subject, function(s) whichPDict(pdict, s, ...))'. ### The returned object is a list of length N. ### o vcountPDict(): returns an M x N matrix of integers. -### o vmatchPDict(): not supported yet! (first we need a container to -### store the results) +### o vmatchPDict(): returns an MIndexList (CompressedList of MIndex objects) +### This contains N MIndex objects, each with up to M entries ### setGeneric("vmatchPDict", signature="subject", @@ -787,7 +829,10 @@ setMethod("vmatchPDict", "ANY", function(pdict, subject, max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE, algorithm="auto", verbose=FALSE) - stop("vmatchPDict() is not ready yet, sorry") + .vmatchPDict(pdict, subject, + max.mismatch, min.mismatch, with.indels, fixed, + algorithm, collapse=FALSE, weight=1L, + verbose, matches.as="MATCHES_AS_ENDS") ) ### Dispatch on 'subject' (see signature of generic). diff --git a/man/matchPDict-exact.Rd b/man/matchPDict-exact.Rd index 983bcec5..035128d6 100644 --- a/man/matchPDict-exact.Rd +++ b/man/matchPDict-exact.Rd @@ -55,9 +55,10 @@ and \code{whichPDict} returns the "who" information i.e. which patterns in the input dictionary have at least one match. - \code{vcountPDict} and \code{vwhichPDict} are vectorized versions - of \code{countPDict} and \code{whichPDict}, respectively, that is, - they work on a set of reference sequences in a vectorized fashion. + \code{vcountPDict}, \code{vwhichPDict}, and \code{vmatchPDict} are vectorized + versions of \code{countPDict}, \code{whichPDict}, and \code{matchPDict} + respectively, that is, they work on a set of reference sequences in a + vectorized fashion. This man page shows how to use these functions (aka the \code{*PDict} functions) for exact matching of a constant width dictionary i.e. @@ -86,6 +87,9 @@ vcountPDict(pdict, subject, vwhichPDict(pdict, subject, max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE, algorithm="auto", verbose=FALSE) +vmatchPDict(pdict, subject, + max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE, + algorithm="auto", verbose=FALSE) } \arguments{ @@ -111,7 +115,7 @@ vwhichPDict(pdict, subject, \code{whichPDict}. An \link{XStringSet} object containing the subject sequences - for \code{vcountPDict} and \code{vwhichPDict}. + for \code{vmatchPDict}, \code{vcountPDict}, \and code{vwhichPDict}. If \code{pdict} is a \link{PDict} object (i.e. a preprocessed dictionary), then \code{subject} must be of base class \link{DNAString}. @@ -210,6 +214,10 @@ vwhichPDict(pdict, subject, (see above for the details). \code{vwhichPDict} returns a list of \code{N} integer vectors. + + \code{vmatchPDict} returns an \link{MIndexList}, which is essentially a list + of length \code{N} where each entry is an \link{MIndex} object (as would be + obtained from running \code{matchPDict} on each of the sequences). } \author{H. Pagès} @@ -513,7 +521,7 @@ if (interactive()) { pdict2 <- PDict(probes2) ## Get the mapping from probes1 to probes2 (based on exact matching): - map1to2 <- vwhichPDict(pdict2, probes1) + map1to2 <- vwhichPDict(pdict2, probes1) ## The following helper function uses the probe level mapping to induce ## the mapping at the probe set IDs level (from hgu95av2 to hgu133a). diff --git a/src/match_pdict.c b/src/match_pdict.c index 6244e0b1..cade8820 100644 --- a/src/match_pdict.c +++ b/src/match_pdict.c @@ -480,6 +480,28 @@ static SEXP vcount_XStringSet_XStringSet(SEXP pattern, return ans; } +SEXP vmatch_PDict3Parts_XStringSet_helper(SEXP pptb, HeadTail *headtail, MatchPDictBuf *mpdb, + SEXP xstringset, + SEXP max_mismatch, SEXP min_mismatch, + SEXP fixed){ + XStringSet_holder subject; + int subj_length; + Chars_holder subj_elt; + + subject = _hold_XStringSet(xstringset); + subj_length = _get_length_from_XStringSet_holder(&subject); + + SEXP RETURN_LIST = PROTECT(NEW_LIST(subj_length)); + for(int i=0; imatches), R_NilValue)); + _MatchPDictBuf_flush(mpdb); + } + UNPROTECT(1); + return RETURN_LIST; +} + /* --- .Call ENTRY POINT --- */ SEXP vmatch_PDict3Parts_XStringSet(SEXP pptb, SEXP pdict_head, SEXP pdict_tail, SEXP subject, @@ -495,10 +517,12 @@ SEXP vmatch_PDict3Parts_XStringSet(SEXP pptb, SEXP pdict_head, SEXP pdict_tail, matchpdict_buf = new_MatchPDictBuf_from_PDict3Parts(matches_as, pptb, pdict_head, pdict_tail); switch (matchpdict_buf.matches.ms_code) { - case MATCHES_AS_NULL: - error("vmatch_PDict3Parts_XStringSet() does not support " - "'matches_as=\"%s\"' yet, sorry", - CHAR(STRING_ELT(matches_as, 0))); + case MATCHES_AS_RANGES: + case MATCHES_AS_ENDS: + return vmatch_PDict3Parts_XStringSet_helper(pptb, &headtail, + &matchpdict_buf, subject, + max_mismatch, min_mismatch, + fixed); case MATCHES_AS_WHICH: return vwhich_PDict3Parts_XStringSet(pptb, &headtail, subject, @@ -510,8 +534,11 @@ SEXP vmatch_PDict3Parts_XStringSet(SEXP pptb, SEXP pdict_head, SEXP pdict_tail, max_mismatch, min_mismatch, fixed, collapse, weight, &matchpdict_buf); + default: + error("vmatch_PDict3Parts_XStringSet() does not support " + "'matches_as=\"%s\"' yet, sorry", + CHAR(STRING_ELT(matches_as, 0))); } - error("vmatchPDict() is not supported yet, sorry"); return R_NilValue; } @@ -545,7 +572,8 @@ SEXP vmatch_XStringSet_XStringSet(SEXP pattern, with_indels, fixed, algorithm, collapse, weight); } - error("vmatchPDict() is not supported yet, sorry"); + error("vmatchPDict() on two XStringSets is not supported yet. " + "Please use a PDict object instead (see `?PDict` for more info)."); return R_NilValue; } diff --git a/tests/testthat/test-MIndexListClass.R b/tests/testthat/test-MIndexListClass.R new file mode 100644 index 00000000..e69de29b diff --git a/tests/testthat/test-matchPDict.R b/tests/testthat/test-matchPDict.R index 722b7372..e6134467 100644 --- a/tests/testthat/test-matchPDict.R +++ b/tests/testthat/test-matchPDict.R @@ -75,7 +75,7 @@ test_that("inexact PDict matching works", { expect_equal(countPDict(pdict, d, max.mismatch=2), c(2,0,2,2)) }) -test_that("vcount/vwhich/vmatchPDict() work", { +test_that("vcount/vwhich/vmatchPDict work", { ## canary tests for future changes pdict <- PDict(c("acgt", "gt", "cgt", "ac"), tb.end=2) d <- DNAString("acggaccg") @@ -116,10 +116,6 @@ test_that("vcount/vwhich/vmatchPDict() work", { expect_equal(vcountPDict(pdict, dvv, max.mismatch=2), exp_out) expect_equal(vwhichPDict(pdict, dvv, max.mismatch=2), list(c(1,3,4),c(1,3,4))) - - ## this will fail when we implement it to remind me to add tests for it - expect_error2(vmatchPDict(pdict, dss), - "vmatchPDict\\(\\) is not ready yet") }) test_that("MIndex functionality works", { @@ -233,3 +229,56 @@ test_that("matchPDict() works for variable width dictionary", { } }) +test_that("vmatchPDict is equivalent to matchPDict output", { + pdict <- PDict(c("acgt", "gt", "cgt", "ac"), tb.end=2) + d <- DNAString("acggaccg") + d2 <- DNAString("gtcgtacgt") + dss <- DNAStringSet(list(d,d2)) + + mindex_obj_comp <- function(m1, m2){ + if(!is(m1, "MIndex") || !is(m2, "MIndex")){ + return(FALSE) + } + + if(length(m1) != length(m1)){ + return (FALSE) + } + + for(i in seq_along(m1)){ + l1 <- m1[[i]] + l2 <- m2[[i]] + for(attr_name in c("start", "width")){ + if(!identical(attr(l1, attr_name), attr(l2, attr_name))){ + return(FALSE) + } + } + } + return(TRUE) + } + + ## Basic matching + vmatch_output <- vmatchPDict(pdict, dss) + expect_true(mindex_obj_comp(vmatch_output[[1]], matchPDict(pdict, dss[[1]]))) + expect_true(mindex_obj_comp(vmatch_output[[2]], matchPDict(pdict, dss[[2]]))) + + ## max.mismatch + vmatch_output <- vmatchPDict(pdict, dss, max.mismatch = 2) + expect_true(mindex_obj_comp(vmatch_output[[1]], + matchPDict(pdict, dss[[1]], max.mismatch = 2))) + expect_true(mindex_obj_comp(vmatch_output[[2]], + matchPDict(pdict, dss[[2]], max.mismatch = 2))) + + ## min.mismatch + vmatch_output <- vmatchPDict(pdict, dss, min.mismatch = 1, max.mismatch = 2) + expect_true(mindex_obj_comp(vmatch_output[[1]], + matchPDict(pdict, dss[[1]], min.mismatch = 1, max.mismatch = 2))) + expect_true(mindex_obj_comp(vmatch_output[[2]], + matchPDict(pdict, dss[[2]], min.mismatch = 1, max.mismatch = 2))) + + ## fixed + vmatch_output <- vmatchPDict(pdict, dss, fixed = FALSE) + expect_true(mindex_obj_comp(vmatch_output[[1]], + matchPDict(pdict, dss[[1]], fixed = FALSE))) + expect_true(mindex_obj_comp(vmatch_output[[2]], + matchPDict(pdict, dss[[2]], fixed = FALSE))) +}) diff --git a/vignettes/SequenceMatching.Rmd b/vignettes/SequenceMatching.Rmd new file mode 100644 index 00000000..3d8d75ad --- /dev/null +++ b/vignettes/SequenceMatching.Rmd @@ -0,0 +1,929 @@ +--- +title: "Sequence Searching and Matching with Biostrings" +author: +- name: Aidan Lakshman + email: aidanlakshman@gmail.com +package: Biostrings +output: + BiocStyle::pdf_document +abstract: | + Biostrings encapsulates a wide variety of sequence matching functions, but + using them can be relatively confusing. This vignette walks through examples + of using the many functions available in Biostrings along with explanations + of their output types. +vignette: | + %\VignetteIndexEntry{Vignette Title} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup,include=FALSE} +library(BiocStyle) +library(Biostrings) +``` + +# Brief Recap + +Before diving into searching and matching in Biostrings, let's first briefly review the central data structures and concepts in the package. If you're already familiar with XString objects and the various containers to hold them, feel free to skip forward to the next section. + +## XString objects + +The focus of Biostrings is providing containers to efficiently work with biological data. Operations in Biostrings revolve around XString^[XString: a container that holds a single sequence of a particular type.] objects. XString isn't actually a type; it's the collective name for a set of objects of the form `*String`: + +- DNAString: holds DNA data (A/T/G/C) +- RNAString: holds RNA data (A/U/G/C) +- AAString: holds amino acid data +- BString: holds any arbitrary character + +Aside from BString, all the XString types also support standard IUPAC ambiguity codes. + +The simplest way to create XString objects is directly from a character object: + +```{r} +DNAString("ATGCATGC") + +RNAString("AUGCAUGC") + +AAString("ARNDC*") + +BString("I am a BString") +``` + +## XStringSets + +Single XString objects are nice, but many biological sequence analyses need to work with multiple sequences. Our primary container to achieve this functionality is the XStringSet^[XStringSet: a container that holds a collection of XString objects together.] object. This container holds a set of XString objects that are all the same type (all DNA, all RNA, etc.). There is one XStringSet type for each XString type (i.e., DNA, RNA, AA, B). + +It's also possible to create XStringSet objects from character vectors: + +```{r} +DNAStringSet(c("ATGC", "CGTA", "AAAAA", "TT")) +``` + +However, they are most often used to read in sequence sets from files, such as FASTA files: + +```{r} +filepath1 <- system.file("extdata", "someORF.fa", package="Biostrings") +readDNAStringSet(filepath1) +``` + +Once we have an XStringSet, we can use single brackets (`[]`) to get a subset of the XStringSet, and double brackets (`[[]]`) to get the component XStrings: + +```{r} +x <- DNAStringSet(c("ATGC", "CGTA", "AAAAA", "TT")) + +# Subsetting to a DNAStringSet +x[1] + +# Subsetting to a DNAString +x[[1]] +``` + +## XStringViews + +When we search for patterns in a sequence, we're looking for specific subsequences within a single sequence that match whatever we're looking for. To implement that functionality, we have the XStringViews^[XStringViews: Stores subsequences of a single XString object.] classes, which store "views" (subsequences) of a single XString object. These will be very important for searching and matching patterns later on. + +We can directly construct an XStringViews from an XString object with the `Views` method by by either specifying start/end indices, or using the width of the subsequences: + +```{r} +seq <- DNAString("AAATTTGGGCCC") + +## Splitting into codons +Views(seq, + start = seq(1, length(seq), by = 3), + end = seq(3, length(seq), by = 3)) + +## All overlapping subsequences of length 3 +Views(seq, + start = seq_len(length(seq) - 2), + width = 3) + +## Views can also be "out of range": +Views(seq, + end = seq_along(seq), + width = 3) +``` + +This class will be commonly returned from the pattern matching functions we'll discuss later, as well as some other internal functions (e.g., `codons`, which returns all the codons in a DNAString) + +It's worth noting that we can initialize an XStringSet from an XStringViews, in case we want to save our subsequences as a new XStringSet object. + +```{r} +seq <- DNAString("AAATTTGGGCCC") + +## `codons` returns an XStringViews +seq_codons <- codons(seq) + +seq_codons + +## We can also convert it to a DNAStringSet +DNAStringSet(seq_codons) +``` + +## Summary + +Knowledge of XStrings, XStringSets, and XStringViews are all that you'll need to be able to understand the rest of this vignette. However, there exist a number of additional containers and utility functions to work with these objects that I haven't mentioned here. For more information on these utility functions, look at `?XString`. Other containers that may be of interest include XStringSetList, MaskedXString, and QualityScaledXStringSet. + +# Low-Level Matching + +Now we can start to look at how to perform some basic matching operations in Biostrings. Matching in Biostrings is done in terms of "patterns" and "subjects". A *pattern* is what we're searching for, and a *subject* is where we're searching. For example, if we wanted to find all `ATG` codons in an *E. coli* genome, `ATG` would be the pattern, and the *E. coli* genome is the subject. A *match* is a subsequence of the pattern that is equal to the pattern (or similar enough, if using things like ambiguity codes). + +## Edit Distance + +Before we can get to matching, we first need a way to quantify the edit distance between two strings. The *edit distance* is the minimum number of operations it takes to transform one string into another. For instance, the edit distance between `AAA` and `AAB` is 1, since the only letter that differs between the two strings is the final character. + +```{r} +## the first string is the pattern, +## the second is the subject +neditAt(pattern = "AAAAA", subject = "AAAAB") + +neditAt("ABCDEFG", "AAADDDG") + +## Going from subject -> pattern +## AACGCA -> AACGTA -> AACCTA -> AAGCTA -> GAGCTA: 4 edits +neditAt("GAGCTA", "AACGCA") +``` + +However, this only works for strings of the same length. If we wanted to get the edit distance between two strings of unequal length, we'd also have to consider insertions and deletions (e.g., `AAA` and `AC` would have a distance of two via `AAA` -> `AA` -> `AC`). Considering insertions, deletions, and substitutions produces the Levenstein distance, which is also supported by `neditAt` if we set `with.indels=TRUE`: + +```{r} +## AACGCA -> GACGCA -> GAGCA -> GAGCTA: 3 edits +neditAt("GAGCTA", "AACGCA", with.indels = TRUE) +``` + +Note that this tries to match the pattern against the subject, so it only looks at the first `n` characters of the subject, where `n` is the length of the pattern. This can lead to some unintuitive results: + +```{r} +## Both of these will return 0, because nchar("ABC") = 3, +## so we only look at the first 3 characters of the pattern ("ABC") +neditAt("ABC", "ABCD") +neditAt("ABC", "ABCD", with.indels = TRUE) + +## However, if we change the subject slightly +## (note just showing the first 3 characters of the subject) +neditAt("ABC", "DBCA") # DBCA -> ABCA: 1 +neditAt("ABC", "DCBA") # DCBA -> DCCA -> DBCA -> ABCA : 3 +neditAt("ABC", "DCBA", with.indels = TRUE) # DCBA -> ADCBA -> ABCBA: 2 +``` + +## Simple Matching + +The previous result for `neditAt` probably seems unintuitive. Why is this function set up the way it is? + +It turns out that we can use this `neditAt` function to find matches. We can define a "match" as a pattern that has an edit distance of 0 with the subject. + +```{r} +neditAt("ABC", "ABCDE") == 0 + +isMatchingAt("ABC", "ABCDE") +``` + +These functions have an "at" suffix because we can specify where we should start searching in the subject. By default, we start searching at the first character (`1`), but we can start anywhere: + +```{r} +neditAt("ABC", "BABCABC", at = 1) + +neditAt("ABC", "BABCABC", at = 2) + +neditAt("ABC", "BABCABC", at = 5) +``` + +Both functions are vectorized to calculate for multiple starting positions at the same time: + +```{r} +subject <- "BABCABC" +edit_dists <- neditAt("ABC", subject, at = seq_len(nchar(subject))) +edit_dists + +## These are the positions of the matches! +which(edit_dists == 0) + +matches <- isMatchingAt("ABC", subject, at = seq_len(nchar(subject))) +which(matches) +``` + +What if we want to be a little more lenient than exact matching? Previously, we defined a match as any location where `neditAt` returns 0. We can relax these conditions with the `min.mismatch` and `max.mismatch` arguments. With these, the definition of a "match" becomes anywhere where `min.mismatch <= neditAt <= max.mismatch`. + +```{r} +pattern <- "AAB" +subject <- "AABBAAA" +s <- seq_len(nchar(subject)) +neditAt(pattern, subject, at = s) + +## Default is min.mismatch = max.mismatch = 0 +isMatchingAt(pattern, subject, at = s) + +## If max.mismatch = 1, we get more matches +isMatchingAt(pattern, subject, at = s, max.mismatch = 1) + +## We can also restrict the lower bound +## This returns all patterns with neditAt == 2 +isMatchingAt(pattern, subject, at = s, min.mismatch = 2, max.mismatch = 2) +``` + +Note that since we're using `neditAt` to define a match, the we can also use the Levenshtein distance as our matcher: + +```{r} +neditAt("GAGC", "AACG", with.indels = FALSE) +isMatchingAt("GAGCTA", "AACGCA", with.indels = FALSE, max.mismatch = 3) + +neditAt("GAGC", "AACG", with.indels = TRUE) +isMatchingAt("GAGCTA", "AACGCA", with.indels = TRUE, max.mismatch = 3) +``` + +## Summary + +With `isMatchingAt`, we have a basic way to get matches in pattern/subject syntax. All future sections will build off this foundational function. + +Note that, while we used character vectors for the examples in this section, all the functions shown also work on XString and XStringSet objects. + +# Fixed Pattern Matching + +In the previous section, we talked about how to check if a particular pattern has a match at a specific location in a sequence. While this is theoretically sufficient to do any arbitrary pattern matching, the API is pretty clunky. Ideally, we could just call a single function to automatically find a pattern throughout the subject, without having to worry about where to look. + +## `matchPattern` + +The `matchPattern` function does exactly what we're looking for. This function has the same input arguments as `isMatchingAt`, but searches in the entire subject for the pattern rather than at specific points. + +```{r} +subject <- "BABCABC" +edit_dists <- neditAt("ABC", subject, at = seq_len(nchar(subject))) + +which(edit_dists == 0) + +matchPattern("ABC", subject) +``` + +The output is almost the same as running `isMatchingAt` over the entire subject, except that the return type is an XStringViews object. + +XStringViews objects are perfect for pattern matching, because they can hold all the subsequences that match in the subject along with their start and end positions. Let's see what that looks like using DNAStrings: + +```{r} +pattern <- DNAString("AG") +subject <- DNAString("AGTAGCAGA") + +matchPattern(pattern, subject) +``` + +Here we can see that the pattern `AG` is found starting at indices 1, 4, and 7. In this case, all of our matches are identical to the pattern, but this may not always be the case. + +For instance, if we set `fixed = FALSE`, then we can use IUPAC ambiguity codes: + +```{r} +pattern <- DNAString("AGY") # Y = T or G +subject <- DNAString("AGTAGCAGAAGY") + +## By default, ambiguities only match ambiguities +## (i.e., Y = Y) +matchPattern(pattern, subject) + +## But with fixed = FALSE, it can match other letters +## (i.e., Y = Y, T, or G) +matchPattern(pattern, subject, fixed = FALSE) +``` + +Similarly, if we use the `max.mismatch` arguments, we can get variable length matches: + +```{r} +pattern <- DNAString("AGY") # Y = T or G +subject <- DNAString("AGTAGCAGAAGY") + +## But with fixed = FALSE, it can match other letters +## (i.e., Y = Y, T, or G) +matchPattern(pattern, subject, fixed = FALSE, max.mismatch = 2) +``` + +Sometimes you may get too many matches to deal with. However, we can also match a pattern against an XStringViews subject. In other words, we can find matches *in our matches* to filter them out: + +```{r} +pattern1 <- DNAString("ANN") # Any 3-mer beginning with A +pattern2 <- DNAString("NNT") # Any 3-mer ending with T +subject <- DNAString("ATGTTTAATTTATAATATTTATAAAT") + +first_matches <- matchPattern(pattern1, subject, fixed = FALSE) +first_matches + +second_matches <- matchPattern(pattern2, first_matches, fixed = FALSE) +second_matches +``` + +## `countPattern` + +Sometimes we don't care specifically about what the matches are, we just want to know how many matches exist. For this, we have `countPattern`. `countPattern` returns the number of matches that would be found by `matchPattern`. + +```{r} +pattern <- DNAString("ANN") +subject <- DNAString("ATGTTTAATTTATAATATTTATAAAT") + +length(matchPattern(pattern, subject, fixed = FALSE)) + +countPattern(pattern, subject, fixed = FALSE) +``` + +## One Pattern, Multiple Subjects + +What if we want to search for a single pattern in multiple subjects? A common example of this is searching for a motif across multiple chromosomes. + +We can accomplish this workflow using the `vcountPattern` and `vmatchPattern`. The "v" stands for "vectorized", since it operates across all the strings in the subject like a vectorized function. + +While we can only use a single pattern (for now), the subject of these functions can have multiple sequences, like a character vector or XStringSet: + +```{r} +pattern <- DNAString("ATNG") +subjects <- DNAStringSet(c("ATTGATTAATGGATCG", "AGTGAGACAGATTG")) + +vcountPattern(pattern, subjects, fixed = FALSE) +vcountPattern(pattern, subjects, fixed = FALSE, max.mismatch = 2) + +vmatchPattern(pattern, subjects, fixed = FALSE, max.mismatch = 1) +``` + +`vmatchPattern` returns an object we haven't seen before: an `MIndex`. This container stores matches for a set of a patterns against one or more subject sequences. Each entry in the `MIndex` is an `IRanges` object, which is basically an `XStringViews` without the substring of the match attached (i.e., just the start/end position of each match and its width). + +```{r} +pattern <- DNAString("ATNG") +subjects <- DNAStringSet(c("ATTGATTAATGGATCG", "AGTGAGACAGATTG")) + +matches <- vmatchPattern(pattern, subjects, fixed = FALSE) + +## 3 matches at indices 1, 9, and 13 +matches[[1]] + +## Note this is the same as in matchPattern +matchPattern(pattern, subjects[[1]], fixed = FALSE) + +## ...and same for the second match +matches[[2]] +matchPattern(pattern, subjects[[2]], fixed = FALSE) +``` + +## Summary + +`matchPattern` and `countPattern` are the first "real" matching functions we've seen. While it's possible to do complex matching using `isMatchingAt` and `neditAt`, the `*Pattern` functions provide true search functionality that is useful in real scenarios. + +One point of discussion we'll skip in this vignette is the `algorithm` parameter of `matchPattern`. The best practice is to just leave it set to its default value of `"auto"`, which will automatically pick the best pattern for your search. If a particular algorithm can be used for your search, it will perform identically to all other usable algorithms, so you don't need to worry too much about this parameter. + +We unfortunately won't be able to do "multiple pattern, multiple sequence" searching quite yet. For that, we'll need `PDict` objects, which we'll talk about later. + +# Matching with PDicts + +## What is a PDict? + +PDict is short for "Preprocessed Dictionary". A PDict object is essentially a container that holds a collection of DNA patterns to search with. The container preprocesses these patterns to allow fast matching against one or more subject sequences. + +In the standard configuration, PDicts have the following limitations: + +1. They can only contain DNA base letters (i.e., `ATGC`, no ambiguity codes allowed) +2. All sequences must be the same length +3. Only exact matching is supported + +We'll relax these limitations later. + +```{r} +patterns <- DNAStringSet(c("ATGC", "GGGG", "CCCC", "ATAT")) +pdict <- PDict(patterns) +pdict +``` + +The PDict object shows how many patterns it holds ("length 4") and how long the sequences are ("width 4"). We can also recover the sequences inside of it: + +```{r} +pdict[[1]] +``` + +We can also provide named sequences, and the PDict will keep track of them: +```{r} +patterns <- c(seq1 = "ATGC", seq2 = "CGTA") +pdict <- PDict(patterns) +names(pdict) +``` + +It also mentions the preprocessing algorithm, which must be one of the following: + +- `ACtree2`: The default processing algorithm. Supports matching against subjects that have ambiguity codes (i.e., `A` in the PDict could match `N` in a sequence). +- `Twobit`: Much faster than `ACtree2`, but can't match against ambiguity codes in subjects. + +Regardless of algorithm, we can't include ambiguity codes in the patterns stored in the PDict. However, we can get around this and unlock the full power of PDicts by using a "Trusted Band". + +## Exact Matching + +Once we have a PDict, we can start matching. These functions are very similar to the `*Pattern` functions: + +- `matchPDict` is equivalent to `matchPattern` +- `countPDict` is equivalent to `countPattern` +- `vmatchPDict` is similar to `vmatchPattern` +- `vcountPDict` is equivalent to `vcountPattern` + +These enable us to do "many pattern, single subject" matching: + +```{r} +patterns <- DNAStringSet(c(seq1 = "ATGC", seq2 = "AATT", seq3 = "CCTA")) +pdict <- PDict(patterns) +subject <- DNAString("ATGCAAAATTTAATTGC") + +## How many matches does each pattern have? +countPDict(pdict, subject) + +## Where are the matches? +matchPDict(pdict, subject) +``` + +We also have `whichPDict`, which returns a vector indicating which patterns have at least one match: + +```{r} +## only the first and second patterns have matches +whichPDict(pdict, subject) +``` + +We can also do "many pattern, many subject" matching with the `v*PDict` functions. However, the output types of these are different than previously. Let's break them down one by one. + + +First, `vwhichPDict`: + +```{r} +subjects <- DNAStringSet(c(subj1 = "ATGCAAAATTTAATTGC", subj2 = "CCAATCCTACTATGC")) + +vwhichPDict(pdict, subjects) +``` + +This returns a list with the output of calling `whichPDict` on each sequence in the subject. For this input, we find that patterns 1 and 2 have matches in the first sequence, and patterns 1 and 3 have matches in the second sequence. + +If we wanted to find out how many matches each pattern has in each sequence, we'd use `vcountPDict`: +```{r} +vcountPDict(pdict, subjects) +``` + +This function returns a matrix of counts. The rows correspond to patterns, and the columns correspond to subjects. Thus, since `vcountPDict(pdict, subjects)[2,1] = 2`, we know that pattern 2 has two matches in subject 1. + +If we wanted to find out where those matches were, we'd use `vmatchPDict`: + +```{r} +vmatchPDict(pdict, subjects) +``` + +The result of this is an MIndexList class. This is a similar concept to what we saw with the MIndex class when we discussed `matchPattern`. The MIndex class is esssentially a list storing offsets of matches for a set of patterns against a subject, and the MIndexList class is a list storing an MIndex object for each subject. + +```{r} +all_matches <- vmatchPDict(pdict, subjects) + +## first entry is the matches for all patterns against the first subject: +all_matches[[1]] +``` + +Note that the last entry of `all_matches[[1]]` is empty, because the third pattern had no matches against the first subject. + +**Note:** `vmatchPDict` is the *only* function that supports "many pattern, many subject" matching. `vmatchPattern` can only match a single pattern against one or more sequences. + +## Trusted Bands + +PDicts in their default configuration are very limited. However, this changes when we define a "Trusted Band". PDicts with a trusted band support ambiguity codes, variable length sequences, and fuzzy/inexact matching. + +The trusted band is a fixed size subsequence of the patterns that obeys the prior limitations (no ambiguities, fixed size). When we do matching, the PDict can first search for matches in the trusted band to limit the search space, and then expand to the regions of each pattern outside the trusted band. + +The trusted bands do not need to all be in the same position, but they all need to be the same length. For example, we could imagine the following sequences with trusted bands: + +``` +Sequences: +---------- +TTGCTAMKT +AGCTAGTTAAG + +Some Possible Trusted Bands: +---------------------------- +T | TGCT | AMKT +A | GCTA | GTTAAG + + T | TGCTA | MKT +AGC | TAGTT | AAG +``` + +Notice how the bands are all the same width and contain no ambiguity codes, but they don't have to all be in the same position in each sequence. In R, we could do this with the following: + +```{r} +patterns <- DNAStringSet(c("TTGCTAMKT", "AGCTAGTTAAG")) +pdict1 <- PDict(patterns, tb.start = 2, tb.width = 4) +pdict1 + +pdict2 <- PDict(patterns, tb.end = -4, tb.width = 5) +pdict2 +``` + +We can either define the trusted band relative to the start (`tb.start`), or relative to the end (`tb.end`). Negative numbers are indices relative to the end of the sequence, so `-4` means "the fourth base counting from the end of the sequence". + +The trusted band splits the sequences into three parts: the head (before the band), the band, and the tail (part after the band). We can extract these to look at them: + +```{r} +head(pdict2) +tb(pdict2) +tail(pdict2) +``` + +## Inexact Matching + +Given a PDict with a trusted band, we can start to do inexact matching. Inexact matching with a PDict that has a trusted band works in the following steps: + +1. Find all exact matches using the trusted band of each pattern against the subjects. +2. For each match, compare the head and tail of the pattern against the regions of the subject flanking the match. + +This means that an exact match has to be found to the trusted band before any inexact matching takes place. This is a key distinction, and can lead to some unintuitive behavior. For example, suppose we make two trusted bands from a given sequence: + +``` +Seq: AATTGGC + +Trusted Band 1: AA | TTGG | C +Trusted Band 2: A | ATTG | GC +``` + +The trusted band is the center section, and the left and right regions are the head / tail. Now suppose we want to match against a target sequence with at most one mismatch: + +``` +Subject: ACTTGGC + +Goal: Match with at most 1 mismatch + +Trusted Band 1 finds a match: +AA | TTGG | C +AC TTGG C + +Trusted Band 2 finds NO match. +A | ATTG | GC +A CTTG GC +(One mismatch, but no exact match in trusted band) +``` + +Because of this, the trusted band should be defined as the minimal match that must exist for a match to be correct. + +To do this in R: + +```{r} +seq <- DNAStringSet(c("AATTGGC")) + +# A | ATTG | GC +pdict <- PDict(seq, tb.start=2, tb.end=5) + +subjects <- DNAStringSet(c("AATTGGCC", "CATTGGCT", "ACTTGGCG")) + +# Pattern: A | ATTG | GC +# Seq 1: A ATTG GCC (matches) +# Seq 2: C ATTG GCT (matches, one mismatch) +# Seq 3: A CTTG GCG (no exact match in TB) +vcountPDict(pdict, subjects, max.mismatch = 1L) +``` + +The use of the trusted band allows us to use the other fuzzy matching techniques we used with the `*Pattern` functions. + +We can match ambiguity codes using `fixed`: + +```{r} +pattern <- DNAStringSet("ATNC") + +# | AT | NC +pdict <- PDict(pattern, tb.start = 1, tb.end = 2) + +subject <- DNAString("CCAATATACATNC") + +## Only one match, since ambiguities are treated literally +matchPDict(pdict, subject)[[1]] + +## Two matches; N in tail can match any nucleotide +matchPDict(pdict, subject, fixed=FALSE)[[1]] + +## Since N matches anything, this matches any AT** pattern +matchPDict(pdict, subject, fixed=FALSE, max.mismatch = 1)[[1]] +``` + +## PDict Matching Without a PDict + +It's possible to use the `*PDict` functions without a PDict, by just passing in a DNAStringSet. This is slightly slower than using a preprocessed dictionary, but will be slightly faster than using `matchPattern` in a loop, since `matchPDict` loops at the C level. Additionally, the preprocessing restrictions for PDicts *only apply to initializing the object*, so it's possible to call `matchPDict` for a variable width sequence set if the sequences aren't preprocessed: + +```{r} +## Patterns are all different lengths! +patterns <- DNAStringSet(c("AA", "ATGC", "TTGGCC", "T")) +subject <- DNAString("AATTGGCCAATGC") +countPDict(patterns, subject) +``` + +This is also the only way to use the `with.indels = TRUE` argument. + +## Performance Notes + +Inexact matching can become extremely slow for large numbers of subjects and patterns, especially if the trusted band is small. The most time consuming part is the inexact matching of the head/tail with the flanking regions of the subject's matches. Larger trusted bands mean fewer exact matches, which means better performance. The Biostrings documentation recommends aiming for $\frac{L}{4^w} < 10$, where $L$ is the number of letters in the subject, and $w$ is the width of the trusted band. Larger trusted bands will always result in fewer, faster matches, so prioritize making it as large as you confidently can to get the matches you're looking for. + +# Other Matching Utilities + +Biostrings also includes a couple general-purpose helper functions that operate on matches returned by matching functions (e.g., `matchPattern`, `matchPDict`). + +## Finding Mismatches + +Fuzzy / inexact matching can match with some mismatches. It's often useful to figure out which letters were mismatching in an inexact match call. For this, we have the `mismatch` function, which provides the position of mismatching letters of a given pattern relative to its matches in the subject: + +```{r} +subject <- DNAString("ATGCCGTA") +pattern <- DNAString("GCGGAA") + +matches <- matchPattern(pattern, subject, max.mismatch = 2) +matches + +## the pattern matched positions 3-8 of the subject +## pattern: GCGGAA +## subject: ATGCCGTA +## mismatches: | | (positions 3 and 5 of the match) +mismatch(pattern = pattern, x = matches) + +## How many mismatches were there overall? +nmismatch(pattern = pattern, x = matches) +``` + +If we're matching using ambiguity codes, we can also use the `fixed` argument to treat ambiguities in the subject as ambiguities (and not literal matches) + +```{r} +subject <- DNAString("ATNGTNCTN") +pattern <- DNAString("ATC") + +matches <- matchPattern(pattern, subject, fixed = FALSE, max.mismatch = 1L) +matches + +## mismatches if we ignored ambiguities in subject +mismatch(pattern, matches) + +## mismatches if we use ambiguities in subject +mismatch(pattern, matches, fixed = FALSE) +``` + +## Finding Coverage + +Given a set of matches on a subject, we may also be interested in determining how many matches we got at each position. In other words, how well does the pattern "cover" the sequence? + + +Let's use the same example from the previous section, including ambiguities: +```{r} +subject <- DNAString("ATNGTNCTN") +pattern <- DNAString("ATC") + +matches <- matchPattern(pattern, subject, fixed = FALSE, max.mismatch = 1L) +matches +``` + +We can imagine overlaying the regions that match the pattern: + +``` +Subject: A T N G T N C T N +Match 1: - - - +Match 2: - - - +Match 3: - - - +Match 4: - - - +``` + +If we then count how many matches we have for each position, we get the coverage: +``` +Subject: A T N G T N C T N +Match 1: - - - +Match 2: - - - +Match 3: - - - +Match 4: - - - +Coverage: 1 1 1 1 2 2 2 1 1 +``` + +In code, this looks like: +```{r} +subject <- DNAString("ATNGTNCTN") +pattern <- DNAString("ATC") + +matches <- matchPattern(pattern, subject, fixed = FALSE, max.mismatch = 1L) + +## This is a run-length encoding +coverage(matches) + +## converting to a vector, we get: +as.vector(coverage(matches)) +``` + +If we're matching multiple patterns against a subject, `coverage` will give us the number of matches from *any* pattern for each position: + +```{r} +patterns <- DNAStringSet(c("ATC", "ATG")) +pd <- PDict(patterns, tb.start = 1, tb.end = 2) +subject <- DNAString("ATATATATATGAC") + +countPDict(pd, subject, max.mismatch = 1L) + +matches <- matchPDict(pd, subject, max.mismatch = 1L) +as.vector(coverage(matches)) +``` + +# Biological Applications + +The previous sections have covered everything you need to know for matching sequences with Biostrings. In this last section, we'll cover some of the more application-specific functions you can use for biological analyses. + +## Finding Palindromes + +Finding palindromic sequences is a critical analysis for many analyses. Biostrings provides the `findPalindromes` function to identify palindromic regions in a sequence. + +By palindromic sequences, we mean two regions of a sequence that are reverse complements of each other, potentially separated by a non-matching region. These palindromic sequences can form hairpin loops, like so: + +``` +Palindromic Sequence: GAATTCAATCTAAGAATTC + +This forms a hairpin: + + A A + G A A T T C T + | | | | | | C + C T T A A G T + A A +``` + +This palindrome has three regions: a left arm, a right arm, and a loop. The arms are the two regions that match, with left / right corresponding to which comes first / last in the sequence (respectively). The loop is the middle region that doesn't bind to itself. + +In the previous example, we'd see the following regions: +``` +Palindromic Sequence: GAATTCAATCTAAGAATTC + +Separated into groups: GAATTC AATCTAA GAATTC + / | \ + left arm loop right arm +``` + +We can find this with the `findPalindrome` function in Biostrings: + +```{r} +subject <- DNAString("GAATTCAATCTAAGAATTC") +findPalindromes(subject, max.looplength = 7) +``` + +In this case, we get a single view on the whole string, since the entire sequence is a palindrome. Note that we set `max.looplength` to control the maximum size of loop we look for. We can also set `min.armlength` and `max.armlength` to control the size of the arms, and `min.looplength` to control the size of the loop. + +If we have a sequence with multiple palindromes, we can find them all at once: + +```{r} +subject <- DNAString("GAAGTTCTTCATGCATGTAACATG") + +# Default parameters: +# - min.armlength = 4 +# - max.looplength = 1 (strict palindromes only) +findPalindromes(subject, max.looplength = 3L) +``` + +Now we have three palindromes! However, it's a little difficult to tell what the palindromic regions are. We can identify the left/right arms with some helper functions: + +```{r} +subject <- DNAString("GAAGTTCTTCATGCATGTAACATG") +palindromes <- findPalindromes(subject, max.looplength = 3L) + +## lengths of all arms +palindromeArmLength(palindromes) + +## Get only the left arm +palindromeLeftArm(palindromes) + +## Get only the right arm +palindromeRightArm(palindromes) +``` + +We can also set `allow.wobble` to `TRUE` to treat `G-T` or `G-U` pairings as matches: + +```{r} +subject <- DNAString("ATGCGTAT") + +## No matches, because G-T pairings are ignored +findPalindromes(subject) + +## Allowing wobble base pairs, we find a match +findPalindromes(subject, allow.wobble = TRUE) + +## We can also just allow mismatches directly, which aren't limited to G-T pairs +findPalindromes(subject, max.mismatch = 1L) +``` + +`findPalindromes` will search for reverse complement sequences if the input is a DNAString or RNAString. If the input is a BString or AAString, it'll just search for regular palindromes: + +```{r} +subject <- BString("amanaplanacanalpanama racecar momomomom") +findPalindromes(subject) +``` + +## Finding Ampilicons for a Primer Pair + +Another common workflow is identifying amplicons that match to a given pair of primers. For this, we have the `matchProbePair` function, which takes as input a pair (forward and reverse) of primers and a target sequence, and returns all the amplicons mapped to it. + +Let's look at an example, using a primer pair derived directly from a yeast chromosome. + +```{r} +options(width = 70) # limiting the width of XString output +data("yeastSEQCHR1") # bundled with the Biostrings package +yeast_chr1 <- DNAString(yeastSEQCHR1) + +## We'll target 500 bases starting at position 50,000 +region_start <- 50000L +region_width <- 500L +probe_size <- 20L + +## Getting some subsequences to act as primers +forward_probe <- subseq(yeast_chr1, start = region_start, width = probe_size) +reverse_probe <- subseq( + yeast_chr1, + end = region_start + region_width - 1, + width = probe_size +) + +## The reverse probe should be the reverse complement +reverse_probe <- reverseComplement(reverse_probe) + +## Now we can identify the amplicons +matchProbePair(forward_probe, reverse_probe, subject = yeast_chr1) +``` + +Under the hood, `matchProbePair` uses `matchPattern`, which is why it returns an XStringViews object. That means it supports all the features of `matchPattern`: + +```{r} +options(width = 70) # limiting the width of XString output + +## Including mismatches +matchProbePair( + forward_probe, + reverse_probe, + yeast_chr1, + max.mismatch = 5 +) + +## Not sure why you'd want to do this, but you can! +matchProbePair( + forward_probe, + reverse_probe, + yeast_chr1, + min.mismatch = 6, + max.mismatch = 8 +) +``` + +## Position Weight Matrix (PWM) Matching + +We can also find matches in sequences using position weight matrices (PWMs). This functions exactly like all the other matching functions we've covered, except we'll use a PWM as the pattern rather than an XString directly. We can create a PWM with the `PWM` function: + +```{r} +## Some possible TATA box binding motifs +motifs <- DNAStringSet(c("TATAAA", "TATATA", "TATAAT", "TATATT")) + +## Converting into a PWM +pwm <- PWM(motifs) +pwm +``` + +Now we can use our `countPWM` and `matchPWM` just like we would with a pattern, we're just going to use the PWM as our pattern: + +```{r} +options(width = 70) # limiting the width of XString output +data("yeastSEQCHR1") # bundled with the Biostrings package +yeast_chr1 <- DNAString(yeastSEQCHR1) + +## Getting the number of matching motifs +countPWM(pwm, yeast_chr1) + +## Finding the matches +matchPWM(pwm, yeast_chr1) +``` + +As you can see, there are tons of matches. First, let's get an idea of how well each of these matches scored: + +```{r} +## finding matches with score included +matches <- matchPWM(pwm, yeast_chr1, with.score = TRUE) + +## `mcols` references the metadata columns +all_scores <- mcols(matches)$score +table(round(all_scores, 2)) +``` + +The high scores are because we have such a small motif, but we do have some subsequences that scored higher than others. Let's trim out the lower scoring sequences: + +```{r} +## Way too many matches +countPWM(pwm, yeast_chr1) + +## ...but we can limit the matches with min.score +countPWM(pwm, yeast_chr1, min.score = "86%") + +## Score can also be expressed as a decimal value +countPWM(pwm, yeast_chr1, min.score = 0.95) +``` + +There are a lot of features with the `*PWM` functions. You can use logged probability ratios (the default) or raw probabilities for scores, set priors for the estimated probability of bases at each position, and renormalize your PWMs to a unit scale. Check out `?PWM` for more details on these features. + +# Further Reading + +If you're interested in diving more into the details of matching in Biostrings, check out these man pages: + +- ?`lowlevel-matching`: low-level matching +- ?matchPattern +- ?PDict +- ?matchPDict +- ?`matchPDict-inexact` +- ?`match-utils` + +Matching in Biostrings is still a work in progress, and there are many features we're still working on. Some of them are even noted in the man pages. If you're interested in contributing to Biostrings, feel free to file a [pull request](https://github.com/Bioconductor/Biostrings/pulls)! + + +# Session info + +```{r sessionInfo, echo=FALSE} +sessionInfo() +```