blob: b971bf3418e5aed938394b16539ceecd64d97494 [file] [log] [blame]
use crate::ahocorasick::MatchKind;
use crate::prefilter::{self, Candidate, Prefilter, PrefilterState};
use crate::state_id::{dead_id, fail_id, StateID};
use crate::Match;
// NOTE: This trait essentially started as a copy of the same trait from from
// regex-automata, with some wording changed since we use this trait for
// NFAs in addition to DFAs in this crate. Additionally, we do not export
// this trait. It's only used internally to reduce code duplication. The
// regex-automata crate needs to expose it because its Regex type is generic
// over implementations of this trait. In this crate, we encapsulate everything
// behind the AhoCorasick type.
//
// This trait is a bit of a mess, but it's not quite clear how to fix it.
// Basically, there are several competing concerns:
//
// * We need performance, so everything effectively needs to get monomorphized.
// * There are several variations on searching Aho-Corasick automatons:
// overlapping, standard and leftmost. Overlapping and standard are somewhat
// combined together below, but there is no real way to combine standard with
// leftmost. Namely, leftmost requires continuing a search even after a match
// is found, in order to correctly disambiguate a match.
// * On top of that, *sometimes* callers want to know which state the automaton
// is in after searching. This is principally useful for overlapping and
// stream searches. However, when callers don't care about this, we really
// do not want to be forced to compute it, since it sometimes requires extra
// work. Thus, there are effectively two copies of leftmost searching: one
// for tracking the state ID and one that doesn't. We should ideally do the
// same for standard searching, but my sanity stopped me.
// SAFETY RATIONALE: Previously, the code below went to some length to remove
// all bounds checks. This generally produced tighter assembly and lead to
// 20-50% improvements in micro-benchmarks on corpora made up of random
// characters. This somewhat makes sense, since the branch predictor is going
// to be at its worse on random text.
//
// However, using the aho-corasick-debug tool and manually benchmarking
// different inputs, the code *with* bounds checks actually wound up being
// slightly faster:
//
// $ cat input
// Sherlock Holmes
// John Watson
// Professor Moriarty
// Irene Adler
// Mary Watson
//
// $ aho-corasick-debug-safe \
// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
// pattern read time: 32.824µs
// automaton build time: 444.687µs
// automaton heap usage: 72392 bytes
// match count: 639
// count time: 1.809961702s
//
// $ aho-corasick-debug-master \
// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
// pattern read time: 31.425µs
// automaton build time: 317.434µs
// automaton heap usage: 72392 bytes
// match count: 639
// count time: 2.059157705s
//
// I was able to reproduce this result on two different machines (an i5 and
// an i7). Therefore, we go the route of safe code for now.
/// A trait describing the interface of an Aho-Corasick finite state machine.
///
/// Every automaton has exactly one fail state, one dead state and exactly one
/// start state. Generally, these correspond to the first, second and third
/// states, respectively. The failure state is always treated as a sentinel.
/// That is, no correct Aho-Corasick automaton will ever transition into the
/// fail state. The dead state, however, can be transitioned into, but only
/// when leftmost-first or leftmost-longest match semantics are enabled and
/// only when at least one match has been observed.
///
/// Every automaton also has one or more match states, such that
/// `Automaton::is_match_state(id)` returns `true` if and only if `id`
/// corresponds to a match state.
pub trait Automaton {
/// The representation used for state identifiers in this automaton.
///
/// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
type ID: StateID;
/// The type of matching that should be done.
fn match_kind(&self) -> &MatchKind;
/// Returns true if and only if this automaton uses anchored searches.
fn anchored(&self) -> bool;
/// An optional prefilter for quickly skipping to the next candidate match.
/// A prefilter must report at least every match, although it may report
/// positions that do not correspond to a match. That is, it must not allow
/// false negatives, but can allow false positives.
///
/// Currently, a prefilter only runs when the automaton is in the start
/// state. That is, the position reported by a prefilter should always
/// correspond to the start of a potential match.
fn prefilter(&self) -> Option<&dyn Prefilter>;
/// Return the identifier of this automaton's start state.
fn start_state(&self) -> Self::ID;
/// Returns true if and only if the given state identifier refers to a
/// valid state.
fn is_valid(&self, id: Self::ID) -> bool;
/// Returns true if and only if the given identifier corresponds to a match
/// state.
///
/// The state ID given must be valid, or else implementors may panic.
fn is_match_state(&self, id: Self::ID) -> bool;
/// Returns true if and only if the given identifier corresponds to a state
/// that is either the dead state or a match state.
///
/// Depending on the implementation of the automaton, this routine can
/// be used to save a branch in the core matching loop. Nevertheless,
/// `is_match_state(id) || id == dead_id()` is always a valid
/// implementation. Indeed, this is the default implementation.
///
/// The state ID given must be valid, or else implementors may panic.
fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
id == dead_id() || self.is_match_state(id)
}
/// If the given state is a match state, return the match corresponding
/// to the given match index. `end` must be the ending position of the
/// detected match. If no match exists or if `match_index` exceeds the
/// number of matches in this state, then `None` is returned.
///
/// The state ID given must be valid, or else implementors may panic.
///
/// If the given state ID is correct and if the `match_index` is less than
/// the number of matches for that state, then this is guaranteed to return
/// a match.
fn get_match(
&self,
id: Self::ID,
match_index: usize,
end: usize,
) -> Option<Match>;
/// Returns the number of matches for the given state. If the given state
/// is not a match state, then this returns 0.
///
/// The state ID given must be valid, or else implementors must panic.
fn match_count(&self, id: Self::ID) -> usize;
/// Given the current state that this automaton is in and the next input
/// byte, this method returns the identifier of the next state. The
/// identifier returned must always be valid and may never correspond to
/// the fail state. The returned identifier may, however, point to the
/// dead state.
///
/// This is not safe so that implementors may look up the next state
/// without memory safety checks such as bounds checks. As such, callers
/// must ensure that the given identifier corresponds to a valid automaton
/// state. Implementors must, in turn, ensure that this routine is safe for
/// all valid state identifiers and for all possible `u8` values.
fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
/// Like next_state, but debug_asserts that the underlying
/// implementation never returns a `fail_id()` for the next state.
fn next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID {
let next = self.next_state(current, input);
// We should never see a transition to the failure state.
debug_assert!(
next != fail_id(),
"automaton should never return fail_id for next state"
);
next
}
/// Execute a search using standard match semantics.
///
/// This can be used even when the automaton was constructed with leftmost
/// match semantics when you want to find the earliest possible match. This
/// can also be used as part of an overlapping search implementation.
///
/// N.B. This does not report a match if `state_id` is given as a matching
/// state. As such, this should not be used directly.
#[inline(always)]
fn standard_find_at(
&self,
prestate: &mut PrefilterState,
haystack: &[u8],
at: usize,
state_id: &mut Self::ID,
) -> Option<Match> {
if let Some(pre) = self.prefilter() {
self.standard_find_at_imp(
prestate,
Some(pre),
haystack,
at,
state_id,
)
} else {
self.standard_find_at_imp(prestate, None, haystack, at, state_id)
}
}
// It's important for this to always be inlined. Namely, its only caller
// is standard_find_at, and the inlining should remove the case analysis
// for prefilter scanning when there is no prefilter available.
#[inline(always)]
fn standard_find_at_imp(
&self,
prestate: &mut PrefilterState,
prefilter: Option<&dyn Prefilter>,
haystack: &[u8],
mut at: usize,
state_id: &mut Self::ID,
) -> Option<Match> {
while at < haystack.len() {
if let Some(pre) = prefilter {
if prestate.is_effective(at) && *state_id == self.start_state()
{
let c = prefilter::next(prestate, pre, haystack, at)
.into_option();
match c {
None => return None,
Some(i) => {
at = i;
}
}
}
}
// CORRECTNESS: next_state is correct for all possible u8 values,
// so the only thing we're concerned about is the validity of
// `state_id`. `state_id` either comes from the caller (in which
// case, we assume it is correct), or it comes from the return
// value of next_state, which is guaranteed to be correct.
*state_id = self.next_state_no_fail(*state_id, haystack[at]);
at += 1;
// This routine always quits immediately after seeing a
// match, and since dead states can only come after seeing
// a match, seeing a dead state here is impossible. (Unless
// we have an anchored automaton, in which case, dead states
// are used to stop a search.)
debug_assert!(
*state_id != dead_id() || self.anchored(),
"standard find should never see a dead state"
);
if self.is_match_or_dead_state(*state_id) {
return if *state_id == dead_id() {
None
} else {
self.get_match(*state_id, 0, at)
};
}
}
None
}
/// Execute a search using leftmost (either first or longest) match
/// semantics.
///
/// The principle difference between searching with standard semantics and
/// searching with leftmost semantics is that leftmost searching will
/// continue searching even after a match has been found. Once a match
/// is found, the search does not stop until either the haystack has been
/// exhausted or a dead state is observed in the automaton. (Dead states
/// only exist in automatons constructed with leftmost semantics.) That is,
/// we rely on the construction of the automaton to tell us when to quit.
#[inline(never)]
fn leftmost_find_at(
&self,
prestate: &mut PrefilterState,
haystack: &[u8],
at: usize,
state_id: &mut Self::ID,
) -> Option<Match> {
if let Some(pre) = self.prefilter() {
self.leftmost_find_at_imp(
prestate,
Some(pre),
haystack,
at,
state_id,
)
} else {
self.leftmost_find_at_imp(prestate, None, haystack, at, state_id)
}
}
// It's important for this to always be inlined. Namely, its only caller
// is leftmost_find_at, and the inlining should remove the case analysis
// for prefilter scanning when there is no prefilter available.
#[inline(always)]
fn leftmost_find_at_imp(
&self,
prestate: &mut PrefilterState,
prefilter: Option<&dyn Prefilter>,
haystack: &[u8],
mut at: usize,
state_id: &mut Self::ID,
) -> Option<Match> {
debug_assert!(self.match_kind().is_leftmost());
if self.anchored() && at > 0 && *state_id == self.start_state() {
return None;
}
let mut last_match = self.get_match(*state_id, 0, at);
while at < haystack.len() {
if let Some(pre) = prefilter {
if prestate.is_effective(at) && *state_id == self.start_state()
{
let c = prefilter::next(prestate, pre, haystack, at)
.into_option();
match c {
None => return None,
Some(i) => {
at = i;
}
}
}
}
// CORRECTNESS: next_state is correct for all possible u8 values,
// so the only thing we're concerned about is the validity of
// `state_id`. `state_id` either comes from the caller (in which
// case, we assume it is correct), or it comes from the return
// value of next_state, which is guaranteed to be correct.
*state_id = self.next_state_no_fail(*state_id, haystack[at]);
at += 1;
if self.is_match_or_dead_state(*state_id) {
if *state_id == dead_id() {
// The only way to enter into a dead state is if a match
// has been found, so we assert as much. This is different
// from normal automata, where you might enter a dead state
// if you know a subsequent match will never be found
// (regardless of whether a match has already been found).
// For Aho-Corasick, it is built so that we can match at
// any position, so the possibility of a match always
// exists.
//
// (Unless we have an anchored automaton, in which case,
// dead states are used to stop a search.)
debug_assert!(
last_match.is_some() || self.anchored(),
"failure state should only be seen after match"
);
return last_match;
}
last_match = self.get_match(*state_id, 0, at);
}
}
last_match
}
/// This is like leftmost_find_at, but does not need to track a caller
/// provided state id. In other words, the only output of this routine is a
/// match, if one exists.
///
/// It is regrettable that we need to effectively copy a chunk of
/// implementation twice, but when we don't need to track the state ID, we
/// can allow the prefilter to report matches immediately without having
/// to re-confirm them with the automaton. The re-confirmation step is
/// necessary in leftmost_find_at because tracing through the automaton is
/// the only way to correctly set the state ID. (Perhaps an alternative
/// would be to keep a map from pattern ID to matching state ID, but that
/// complicates the code and still doesn't permit us to defer to the
/// prefilter entirely when possible.)
///
/// I did try a few things to avoid the code duplication here, but nothing
/// optimized as well as this approach. (In microbenchmarks, there was
/// about a 25% difference.)
#[inline(never)]
fn leftmost_find_at_no_state(
&self,
prestate: &mut PrefilterState,
haystack: &[u8],
at: usize,
) -> Option<Match> {
if let Some(pre) = self.prefilter() {
self.leftmost_find_at_no_state_imp(
prestate,
Some(pre),
haystack,
at,
)
} else {
self.leftmost_find_at_no_state_imp(prestate, None, haystack, at)
}
}
// It's important for this to always be inlined. Namely, its only caller
// is leftmost_find_at_no_state, and the inlining should remove the case
// analysis for prefilter scanning when there is no prefilter available.
#[inline(always)]
fn leftmost_find_at_no_state_imp(
&self,
prestate: &mut PrefilterState,
prefilter: Option<&dyn Prefilter>,
haystack: &[u8],
mut at: usize,
) -> Option<Match> {
debug_assert!(self.match_kind().is_leftmost());
if self.anchored() && at > 0 {
return None;
}
// If our prefilter handles confirmation of matches 100% of the
// time, and since we don't need to track state IDs, we can avoid
// Aho-Corasick completely.
if let Some(pre) = prefilter {
// We should never have a prefilter during an anchored search.
debug_assert!(!self.anchored());
if !pre.reports_false_positives() {
return match pre.next_candidate(prestate, haystack, at) {
Candidate::None => None,
Candidate::Match(m) => Some(m),
Candidate::PossibleStartOfMatch(_) => unreachable!(),
};
}
}
let mut state_id = self.start_state();
let mut last_match = self.get_match(state_id, 0, at);
while at < haystack.len() {
if let Some(pre) = prefilter {
if prestate.is_effective(at) && state_id == self.start_state()
{
match prefilter::next(prestate, pre, haystack, at) {
Candidate::None => return None,
// Since we aren't tracking a state ID, we can
// quit early once we know we have a match.
Candidate::Match(m) => return Some(m),
Candidate::PossibleStartOfMatch(i) => {
at = i;
}
}
}
}
// CORRECTNESS: next_state is correct for all possible u8 values,
// so the only thing we're concerned about is the validity of
// `state_id`. `state_id` either comes from the caller (in which
// case, we assume it is correct), or it comes from the return
// value of next_state, which is guaranteed to be correct.
state_id = self.next_state_no_fail(state_id, haystack[at]);
at += 1;
if self.is_match_or_dead_state(state_id) {
if state_id == dead_id() {
// The only way to enter into a dead state is if a
// match has been found, so we assert as much. This
// is different from normal automata, where you might
// enter a dead state if you know a subsequent match
// will never be found (regardless of whether a match
// has already been found). For Aho-Corasick, it is
// built so that we can match at any position, so the
// possibility of a match always exists.
//
// (Unless we have an anchored automaton, in which
// case, dead states are used to stop a search.)
debug_assert!(
last_match.is_some() || self.anchored(),
"failure state should only be seen after match"
);
return last_match;
}
last_match = self.get_match(state_id, 0, at);
}
}
last_match
}
/// Execute an overlapping search.
///
/// When executing an overlapping match, the previous state ID in addition
/// to the previous match index should be given. If there are more matches
/// at the given state, then the match is reported and the given index is
/// incremented.
#[inline(always)]
fn overlapping_find_at(
&self,
prestate: &mut PrefilterState,
haystack: &[u8],
at: usize,
state_id: &mut Self::ID,
match_index: &mut usize,
) -> Option<Match> {
if self.anchored() && at > 0 && *state_id == self.start_state() {
return None;
}
let match_count = self.match_count(*state_id);
if *match_index < match_count {
// This is guaranteed to return a match since
// match_index < match_count.
let result = self.get_match(*state_id, *match_index, at);
debug_assert!(result.is_some(), "must be a match");
*match_index += 1;
return result;
}
*match_index = 0;
match self.standard_find_at(prestate, haystack, at, state_id) {
None => None,
Some(m) => {
*match_index = 1;
Some(m)
}
}
}
/// Return the earliest match found. This returns as soon as we know that
/// we have a match. As such, this does not necessarily correspond to the
/// leftmost starting match, but rather, the leftmost position at which a
/// match ends.
#[inline(always)]
fn earliest_find_at(
&self,
prestate: &mut PrefilterState,
haystack: &[u8],
at: usize,
state_id: &mut Self::ID,
) -> Option<Match> {
if *state_id == self.start_state() {
if self.anchored() && at > 0 {
return None;
}
if let Some(m) = self.get_match(*state_id, 0, at) {
return Some(m);
}
}
self.standard_find_at(prestate, haystack, at, state_id)
}
/// A convenience function for finding the next match according to the
/// match semantics of this automaton. For standard match semantics, this
/// finds the earliest match. Otherwise, the leftmost match is found.
#[inline(always)]
fn find_at(
&self,
prestate: &mut PrefilterState,
haystack: &[u8],
at: usize,
state_id: &mut Self::ID,
) -> Option<Match> {
match *self.match_kind() {
MatchKind::Standard => {
self.earliest_find_at(prestate, haystack, at, state_id)
}
MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => {
self.leftmost_find_at(prestate, haystack, at, state_id)
}
MatchKind::__Nonexhaustive => unreachable!(),
}
}
/// Like find_at, but does not track state identifiers. This permits some
/// optimizations when a prefilter that confirms its own matches is
/// present.
#[inline(always)]
fn find_at_no_state(
&self,
prestate: &mut PrefilterState,
haystack: &[u8],
at: usize,
) -> Option<Match> {
match *self.match_kind() {
MatchKind::Standard => {
let mut state = self.start_state();
self.earliest_find_at(prestate, haystack, at, &mut state)
}
MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => {
self.leftmost_find_at_no_state(prestate, haystack, at)
}
MatchKind::__Nonexhaustive => unreachable!(),
}
}
}