Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ Collate: utils.R
misc.R
SparseList-class.R
MIndex-class.R
MIndexList-class.R
lowlevel-matching.R
match-utils.R
matchPattern.R
Expand Down
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ exportMethods(

exportClasses(
#SparseList,
MIndex, ByPos_MIndex,
MIndex, ByPos_MIndex, MIndexList,
PreprocessedTB, Twobit, ACtree2,
PDict3Parts,
PDict, TB_PDict, MTB_PDict, Expanded_TB_PDict
Expand Down
76 changes: 76 additions & 0 deletions R/MIndexList-class.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
### =========================================================================
### MIndexList objects
### -------------------------------------------------------------------------
###

### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### The "MIndexList" class.
###
### This class serves as a base class to hold collections of MIndex objects.
###
### As with MIndex, in normal operations, the user should never need to create
### MIndex objects directly or to modify existing ones. Those objects are
### typically returned by a sequence matching/alignment function like
### vmatchPDict(). For this reason, coercion methods are relatively barebones
### and will throw errors rather than try to correct
###

setClass("MIndexList",
contains="CompressedList",
representation(
# "VIRTUAL",
unlistData="MIndex"
),
prototype(
elementType="MIndex"
)
)


.from_list_to_MIndexList <- function(from)
{
## keeping ans_class variable for potential future extensions
ans_class <- "MIndexList"

## ensure elements are all MIndex objects
if(!all(vapply(from, inherits, logical(1L), what="MIndex")))
stop("Only lists of MIndex objects can be coerced to MIndexList")
IRanges:::new_CompressedList_from_list(ans_class, from)
}

## barebones, but users shouldn't be calling this directly
## expecting to be used internally, list coercion provided for QoL
setAs("list", "MIndexList", function(from) .from_list_to_MIndexList(from))
setAs("List", "MIndexList", function(from) .from_list_to_MIndexList(as.list(from)))

.from_MIndexList_to_compact_char <- function(object){
lapply(object, \(m_ind){
paste(length(m_ind), "patterns,",
sum(lengths(m_ind)), "matches")
})
}

setMethod("show", "MIndexList",
function(object) {
lo <- length(object)
k <- min(5, length(object))
diffK <- lo - 5
cat(classNameForDisplay(object), " of length ", lo,
"\n", sep = "")

repr <- .from_MIndexList_to_compact_char(head(object, k))
IRanges:::.showAtomicList(CharacterList(repr), minLines=10L)
if (diffK > 0)
cat("...\n<", diffK,
ifelse(diffK == 1,
" more element>\n", " more elements>\n"),
sep="")
}
)

setMethod("showAsCell", "MIndexList",
function(object){
repr <- .from_MIndexList_to_compact_char(object)
showAsCell(CharacterList(repr))
}
)
69 changes: 57 additions & 12 deletions R/matchPDict.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
### > library(BSgenome.Hsapiens.UCSC.hg18)
### > chr1 <- Hsapiens$chr1
### > system.time(end_index <- endIndex(matchPDict(pdict, chr1)))
### user system elapsed
### user system elapsed
### 50.663 0.000 50.763
### > count_index <- sapply(end_index, length)
### > table(count_index)
Expand Down Expand Up @@ -64,8 +64,8 @@
### 1M 36 2.5 sec 717M 106 sec 103 sec 0
### 10M 36 56 sec 6724M 351 sec 200 sec 0
### 10M 12 7.5 sec 340M 227 sec 216 sec 100M
### 30M 12 27 sec 523M 491 sec ?
###
### 30M 12 27 sec 523M 491 sec ?
###
### III. Inexact matching
### ---------------------
### pdict <- PDict(c("acgt", "gt", "cgt", "ac"), tb.end=2)
Expand Down Expand Up @@ -561,10 +561,8 @@
if (!identical(weight, 1L))
warning("'weight' is ignored when 'collapse=FALSE'")
}
} else {
## vmatchPDict()
stop("vmatchPDict() is not supported yet, sorry")
}

## We are doing our own dispatch here, based on the type of 'pdict'.
## TODO: Revisit this. Would probably be a better design to use a
## generic/methods approach and rely on the standard dispatch mechanism.
Expand All @@ -584,7 +582,8 @@
max.mismatch, min.mismatch, with.indels, fixed,
algorithm, collapse, weight,
verbose, matches.as)
if (is.null(which_pp_excluded))
if (is.null(which_pp_excluded) && matches.as %in%
c("MATCHES_AS_WHICH", "MATCHES_AS_COUNTS"))
return(ans)
if (matches.as == "MATCHES_AS_WHICH") {
## vwhichPDict()
Expand All @@ -599,8 +598,51 @@
}
return(ans)
}
## vmatchPDict()
stop("vmatchPDict() is not supported yet, sorry")
if(matches.as == "MATCHES_AS_ENDS") {
## vmatchPDict() by ends
## converting all the results to MIndex objects and then wrapping the result
for(i in seq_along(ans)) {
ans[[i]] <- new("ByPos_MIndex",
width0=width(pdict),
NAMES=names(pdict),
ends=ans[[i]])
}

names(ans) <- names(subject)
ans <- as(ans, "MIndexList")
return(ans)
}
if(matches.as == "MATCHES_AS_RANGES") {
## vmatchPDict() by ranges
## This is the same as MATCHES_AS_ENDS, except ans[[i]] holds two lists
## - The first list has the start position for each range
## - The second list has the width of the range (including the start)

for(i in seq_along(ans)){
## First we have to move from starts to ends to align with the
## Pos_MIndex constructor
ans_start <- ans[[i]][[1]]
ans_width <- ans[[i]][[2]]
for(j in seq_along(ans_start)){
if(!is.null(ans_start[[j]])){
## Note that the width includes the start, so we subtract 1
ans_start[[j]] <- ans_start[[j]] + ans_width[[j]] - 1L
}
}
## Now ans_start holds the **end** position
## width(pdict) will be equivalent to ifelse(is.null(w), 0, w[1]) for
## w in ans_width
ans[[i]] <- new("ByPos_MIndex",
width0=width(pdict),
NAMES=names(pdict),
ends=ans_start)
}

names(ans) <- names(subject)
ans <- as(ans, "MIndexList")
return(ans)
}
warning("Failed to post-process output of .vmatch(pdict, subject)")
return(ans)
}

Expand Down Expand Up @@ -771,8 +813,8 @@ setMethod("whichPDict", "MaskedXString",
### 'lapply(subject, function(s) whichPDict(pdict, s, ...))'.
### The returned object is a list of length N.
### o vcountPDict(): returns an M x N matrix of integers.
### o vmatchPDict(): not supported yet! (first we need a container to
### store the results)
### o vmatchPDict(): returns an MIndexList (CompressedList of MIndex objects)
### This contains N MIndex objects, each with up to M entries
###

setGeneric("vmatchPDict", signature="subject",
Expand All @@ -787,7 +829,10 @@ setMethod("vmatchPDict", "ANY",
function(pdict, subject,
max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE,
algorithm="auto", verbose=FALSE)
stop("vmatchPDict() is not ready yet, sorry")
.vmatchPDict(pdict, subject,
max.mismatch, min.mismatch, with.indels, fixed,
algorithm, collapse=FALSE, weight=1L,
verbose, matches.as="MATCHES_AS_ENDS")
)

### Dispatch on 'subject' (see signature of generic).
Expand Down
18 changes: 13 additions & 5 deletions man/matchPDict-exact.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,10 @@
and \code{whichPDict} returns the "who" information i.e. which patterns
in the input dictionary have at least one match.

\code{vcountPDict} and \code{vwhichPDict} are vectorized versions
of \code{countPDict} and \code{whichPDict}, respectively, that is,
they work on a set of reference sequences in a vectorized fashion.
\code{vcountPDict}, \code{vwhichPDict}, and \code{vmatchPDict} are vectorized
versions of \code{countPDict}, \code{whichPDict}, and \code{matchPDict}
respectively, that is, they work on a set of reference sequences in a
vectorized fashion.

This man page shows how to use these functions (aka the \code{*PDict}
functions) for exact matching of a constant width dictionary i.e.
Expand Down Expand Up @@ -86,6 +87,9 @@ vcountPDict(pdict, subject,
vwhichPDict(pdict, subject,
max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE,
algorithm="auto", verbose=FALSE)
vmatchPDict(pdict, subject,
max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE,
algorithm="auto", verbose=FALSE)
}

\arguments{
Expand All @@ -111,7 +115,7 @@ vwhichPDict(pdict, subject,
\code{whichPDict}.

An \link{XStringSet} object containing the subject sequences
for \code{vcountPDict} and \code{vwhichPDict}.
for \code{vmatchPDict}, \code{vcountPDict}, \and code{vwhichPDict}.

If \code{pdict} is a \link{PDict} object (i.e. a preprocessed
dictionary), then \code{subject} must be of base class \link{DNAString}.
Expand Down Expand Up @@ -210,6 +214,10 @@ vwhichPDict(pdict, subject,
(see above for the details).

\code{vwhichPDict} returns a list of \code{N} integer vectors.

\code{vmatchPDict} returns an \link{MIndexList}, which is essentially a list
of length \code{N} where each entry is an \link{MIndex} object (as would be
obtained from running \code{matchPDict} on each of the sequences).
}

\author{H. Pagès}
Expand Down Expand Up @@ -513,7 +521,7 @@ if (interactive()) {
pdict2 <- PDict(probes2)

## Get the mapping from probes1 to probes2 (based on exact matching):
map1to2 <- vwhichPDict(pdict2, probes1)
map1to2 <- vwhichPDict(pdict2, probes1)

## The following helper function uses the probe level mapping to induce
## the mapping at the probe set IDs level (from hgu95av2 to hgu133a).
Expand Down
40 changes: 34 additions & 6 deletions src/match_pdict.c
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,28 @@ static SEXP vcount_XStringSet_XStringSet(SEXP pattern,
return ans;
}

SEXP vmatch_PDict3Parts_XStringSet_helper(SEXP pptb, HeadTail *headtail, MatchPDictBuf *mpdb,
SEXP xstringset,
SEXP max_mismatch, SEXP min_mismatch,
SEXP fixed){
XStringSet_holder subject;
int subj_length;
Chars_holder subj_elt;

subject = _hold_XStringSet(xstringset);
subj_length = _get_length_from_XStringSet_holder(&subject);

SEXP RETURN_LIST = PROTECT(NEW_LIST(subj_length));
for(int i=0; i<subj_length; i++){
subj_elt = _get_elt_from_XStringSet_holder(&subject, i);
match_pdict(pptb, headtail, &subj_elt, max_mismatch, min_mismatch, fixed, mpdb);
SET_ELEMENT(RETURN_LIST, i, _MatchBuf_as_SEXP(&(mpdb->matches), R_NilValue));
_MatchPDictBuf_flush(mpdb);
}
UNPROTECT(1);
return RETURN_LIST;
}

/* --- .Call ENTRY POINT --- */
SEXP vmatch_PDict3Parts_XStringSet(SEXP pptb, SEXP pdict_head, SEXP pdict_tail,
SEXP subject,
Expand All @@ -495,10 +517,12 @@ SEXP vmatch_PDict3Parts_XStringSet(SEXP pptb, SEXP pdict_head, SEXP pdict_tail,
matchpdict_buf = new_MatchPDictBuf_from_PDict3Parts(matches_as,
pptb, pdict_head, pdict_tail);
switch (matchpdict_buf.matches.ms_code) {
case MATCHES_AS_NULL:
error("vmatch_PDict3Parts_XStringSet() does not support "
"'matches_as=\"%s\"' yet, sorry",
CHAR(STRING_ELT(matches_as, 0)));
case MATCHES_AS_RANGES:
case MATCHES_AS_ENDS:
return vmatch_PDict3Parts_XStringSet_helper(pptb, &headtail,
&matchpdict_buf, subject,
max_mismatch, min_mismatch,
fixed);
case MATCHES_AS_WHICH:
return vwhich_PDict3Parts_XStringSet(pptb, &headtail,
subject,
Expand All @@ -510,8 +534,11 @@ SEXP vmatch_PDict3Parts_XStringSet(SEXP pptb, SEXP pdict_head, SEXP pdict_tail,
max_mismatch, min_mismatch, fixed,
collapse, weight,
&matchpdict_buf);
default:
error("vmatch_PDict3Parts_XStringSet() does not support "
"'matches_as=\"%s\"' yet, sorry",
CHAR(STRING_ELT(matches_as, 0)));
}
error("vmatchPDict() is not supported yet, sorry");
return R_NilValue;
}

Expand Down Expand Up @@ -545,7 +572,8 @@ SEXP vmatch_XStringSet_XStringSet(SEXP pattern,
with_indels, fixed,
algorithm, collapse, weight);
}
error("vmatchPDict() is not supported yet, sorry");
error("vmatchPDict() on two XStringSets is not supported yet. "
"Please use a PDict object instead (see `?PDict` for more info).");
return R_NilValue;
}

Empty file.
Loading