blob: e29bb27f9674fc90243026739a69ae57208db826 [file] [log] [blame]
use std::cmp;
use std::collections::{BTreeSet, VecDeque};
use std::fmt;
use std::mem::size_of;
use std::ops::{Index, IndexMut};
use crate::ahocorasick::MatchKind;
use crate::automaton::Automaton;
use crate::classes::{ByteClassBuilder, ByteClasses};
use crate::error::Result;
use crate::prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj};
use crate::state_id::{dead_id, fail_id, usize_to_state_id, StateID};
use crate::Match;
/// The identifier for a pattern, which is simply the position of the pattern
/// in the sequence of patterns given by the caller.
pub type PatternID = usize;

/// The length of a pattern, in bytes.
pub type PatternLength = usize;
/// An Aho-Corasick automaton, represented as an NFA.
///
/// This is the classical formulation of Aho-Corasick, which involves building
/// up a prefix trie of a given set of patterns, and then wiring up failure
/// transitions between states in order to guarantee linear time matching. The
/// standard formulation is, technically, an NFA because of these failure
/// transitions. That is, one can see them as enabling the automaton to be in
/// multiple states at once. Indeed, during search, it is possible to check
/// the transitions on multiple states for a single input byte.
///
/// This particular implementation not only supports the standard style of
/// matching, but also provides a mode for choosing leftmost-first or
/// leftmost-longest match semantics. When a leftmost mode is chosen, some
/// failure transitions that would otherwise be added are elided. See
/// the documentation of `MatchKind` for more details and examples on how the
/// match semantics may differ.
///
/// If one wants a DFA, then it is necessary to first build an NFA and convert
/// it into a DFA. Note, however, that because we've constrained ourselves to
/// matching literal patterns, this does not need to use subset construction
/// for determinization. Instead, the DFA has at most a number of states
/// equivalent to the number of NFA states. The only real difference between
/// them is that all failure transitions are followed and pre-computed. This
/// uses much more memory, but also executes searches more quickly.
#[derive(Clone)]
pub struct NFA<S> {
    /// The match semantics built into this NFA.
    match_kind: MatchKind,
    /// The start state id as an index into `states`.
    start_id: S,
    /// The length, in bytes, of the longest pattern in this automaton. This
    /// information is useful for keeping correct buffer sizes when searching
    /// on streams.
    max_pattern_len: usize,
    /// The total number of patterns added to this automaton, including
    /// patterns that may never be matched.
    pattern_count: usize,
    /// The number of bytes of heap used by this NFA's transition table.
    heap_bytes: usize,
    /// A prefilter for quickly skipping to candidate matches, if pertinent.
    prefilter: Option<PrefilterObj>,
    /// Whether this automaton anchors all matches to the start of input.
    anchored: bool,
    /// A set of equivalence classes in terms of bytes. We compute this while
    /// building the NFA, but don't use it in the NFA's states. Instead, we
    /// use this for building the DFA. We store it on the NFA since it's easy
    /// to compute while visiting the patterns.
    byte_classes: ByteClasses,
    /// A set of states. Each state defines its own transitions, a fail
    /// transition and a set of indices corresponding to matches.
    ///
    /// The first state is always the fail state, which is used only as a
    /// sentinel. Namely, in the final NFA, no transition into the fail state
    /// exists. (Well, they do, but they aren't followed. Instead, the state's
    /// failure transition is followed.)
    ///
    /// The second state (index 1) is always the dead state. Dead states are
    /// in every automaton, but only used when leftmost-{first,longest} match
    /// semantics are enabled. Specifically, they instruct search to stop
    /// at specific points in order to report the correct match location. In
    /// the standard Aho-Corasick construction, there are no transitions to
    /// the dead state.
    ///
    /// The third state (index 2) is generally intended to be the starting or
    /// "root" state.
    states: Vec<State<S>>,
}
impl<S: StateID> NFA<S> {
/// Returns the equivalence classes of bytes found while constructing
/// this NFA.
///
/// Note that the NFA doesn't actually make use of these equivalence
/// classes. Instead, these are useful for building the DFA when desired.
pub fn byte_classes(&self) -> &ByteClasses {
&self.byte_classes
}
/// Returns a prefilter, if one exists.
pub fn prefilter_obj(&self) -> Option<&PrefilterObj> {
self.prefilter.as_ref()
}
/// Returns the total number of heap bytes used by this NFA's transition
/// table.
pub fn heap_bytes(&self) -> usize {
self.heap_bytes
+ self.prefilter.as_ref().map_or(0, |p| p.as_ref().heap_bytes())
}
/// Return the length of the longest pattern in this automaton.
pub fn max_pattern_len(&self) -> usize {
self.max_pattern_len
}
/// Return the total number of patterns added to this automaton.
pub fn pattern_count(&self) -> usize {
self.pattern_count
}
/// Returns the total number of states in this NFA.
pub fn state_len(&self) -> usize {
self.states.len()
}
/// Returns the matches for the given state.
pub fn matches(&self, id: S) -> &[(PatternID, PatternLength)] {
&self.states[id.to_usize()].matches
}
/// Returns an iterator over all transitions in the given state according
/// to the given equivalence classes, including transitions to `fail_id()`.
/// The number of transitions returned is always equivalent to the number
/// of equivalence classes.
pub fn iter_all_transitions<F: FnMut(u8, S)>(
&self,
byte_classes: &ByteClasses,
id: S,
f: F,
) {
self.states[id.to_usize()].trans.iter_all(byte_classes, f);
}
/// Returns the failure transition for the given state.
pub fn failure_transition(&self, id: S) -> S {
self.states[id.to_usize()].fail
}
/// Returns the next state for the given state and input byte.
///
/// Note that this does not follow failure transitions. As such, the id
/// returned may be `fail_id`.
pub fn next_state(&self, current: S, input: u8) -> S {
self.states[current.to_usize()].next_state(input)
}
fn state(&self, id: S) -> &State<S> {
&self.states[id.to_usize()]
}
fn state_mut(&mut self, id: S) -> &mut State<S> {
&mut self.states[id.to_usize()]
}
fn start(&self) -> &State<S> {
self.state(self.start_id)
}
fn start_mut(&mut self) -> &mut State<S> {
let id = self.start_id;
self.state_mut(id)
}
fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<'_, S> {
IterTransitionsMut::new(self, id)
}
fn copy_matches(&mut self, src: S, dst: S) {
let (src, dst) =
get_two_mut(&mut self.states, src.to_usize(), dst.to_usize());
dst.matches.extend_from_slice(&src.matches);
}
fn copy_empty_matches(&mut self, dst: S) {
let start_id = self.start_id;
self.copy_matches(start_id, dst);
}
fn add_dense_state(&mut self, depth: usize) -> Result<S> {
let trans = Transitions::Dense(Dense::new());
let id = usize_to_state_id(self.states.len())?;
self.states.push(State {
trans,
// Anchored automatons do not have any failure transitions.
fail: if self.anchored { dead_id() } else { self.start_id },
depth,
matches: vec![],
});
Ok(id)
}
fn add_sparse_state(&mut self, depth: usize) -> Result<S> {
let trans = Transitions::Sparse(vec![]);
let id = usize_to_state_id(self.states.len())?;
self.states.push(State {
trans,
// Anchored automatons do not have any failure transitions.
fail: if self.anchored { dead_id() } else { self.start_id },
depth,
matches: vec![],
});
Ok(id)
}
}
impl<S: StateID> Automaton for NFA<S> {
    type ID = S;

    fn match_kind(&self) -> &MatchKind {
        &self.match_kind
    }

    fn anchored(&self) -> bool {
        self.anchored
    }

    fn prefilter(&self) -> Option<&dyn Prefilter> {
        self.prefilter.as_ref().map(|p| p.as_ref())
    }

    fn start_state(&self) -> S {
        self.start_id
    }

    fn is_valid(&self, id: S) -> bool {
        id.to_usize() < self.states.len()
    }

    fn is_match_state(&self, id: S) -> bool {
        self.states[id.to_usize()].is_match()
    }

    fn get_match(
        &self,
        id: S,
        match_index: usize,
        end: usize,
    ) -> Option<Match> {
        // An invalid state id or an out-of-range match index both
        // produce `None`.
        let state = self.states.get(id.to_usize())?;
        let &(pattern, len) = state.matches.get(match_index)?;
        Some(Match { pattern, len, end })
    }

    fn match_count(&self, id: S) -> usize {
        self.states[id.to_usize()].matches.len()
    }

    fn next_state(&self, mut current: S, input: u8) -> S {
        // This terminates since:
        //
        // 1. `State.fail` never points to fail_id().
        // 2. All `State.fail` values point to a state closer to `start`.
        // 3. The start state has no transitions to fail_id().
        loop {
            let state = &self.states[current.to_usize()];
            match state.next_state(input) {
                // No transition defined for this byte, so follow the
                // failure transition and try again.
                next if next == fail_id() => current = state.fail,
                next => return next,
            }
        }
    }
}
/// A representation of an NFA state for an Aho-Corasick automaton.
///
/// It contains the transitions to the next state, a failure transition for
/// cases where there exists no other transition for the current input byte,
/// the matches implied by visiting this state (if any) and the depth of this
/// state. The depth of a state is simply the distance from it to the start
/// state in the automaton, where the depth of the start state is 0.
#[derive(Clone, Debug)]
pub struct State<S> {
    /// The transitions out of this state, in either a sparse or a dense
    /// representation.
    trans: Transitions<S>,
    /// The state to fall back to when no transition is defined for the
    /// current input byte.
    fail: S,
    /// The (pattern id, pattern length) pairs that match at this state.
    matches: Vec<(PatternID, PatternLength)>,
    // TODO: Strictly speaking, this isn't needed for searching. It's only
    // used when building an NFA that supports leftmost match semantics. We
    // could drop this from the state and dynamically build a map only when
    // computing failure transitions, but it's not clear which is better.
    // Benchmark this.
    /// The distance of this state from the start state.
    depth: usize,
}
impl<S: StateID> State<S> {
    /// Returns the number of heap bytes used by this state.
    fn heap_bytes(&self) -> usize {
        let match_bytes =
            self.matches.len() * size_of::<(PatternID, PatternLength)>();
        self.trans.heap_bytes() + match_bytes
    }

    /// Records a match for the pattern with the given id and length.
    fn add_match(&mut self, pattern_id: PatternID, len: PatternLength) {
        self.matches.push((pattern_id, len));
    }

    /// Returns true if and only if this is a match state.
    fn is_match(&self) -> bool {
        !self.matches.is_empty()
    }

    /// Returns the length of the longest pattern that matches at this
    /// state, if any.
    fn get_longest_match_len(&self) -> Option<usize> {
        // Why is this true? Because the first match in any matching state
        // will always correspond to the match added to it during trie
        // construction (since when we copy matches due to failure transitions,
        // we always append them). Therefore, it follows that the first match
        // must always be longest since any subsequent match must be from a
        // failure transition, and a failure transition by construction points
        // to a proper suffix. A proper suffix is, by definition, smaller.
        self.matches.first().map(|&(_, len)| len)
    }

    /// Returns the transition for the given byte, which may be `fail_id()`.
    fn next_state(&self, input: u8) -> S {
        self.trans.next_state(input)
    }

    /// Sets the transition for the given byte.
    fn set_next_state(&mut self, input: u8, next: S) {
        self.trans.set_next_state(input, next);
    }
}
/// Represents the transitions for a single dense state.
///
/// The primary purpose here is to encapsulate index access. Namely, since a
/// dense representation always contains 256 elements, all values of `u8` are
/// valid indices.
#[derive(Clone, Debug)]
struct Dense<S>(Vec<S>);

impl<S> Dense<S>
where
    S: StateID,
{
    /// Create a new dense transition table where every byte initially maps
    /// to the fail state.
    fn new() -> Self {
        Dense(vec![fail_id(); 256])
    }

    /// Returns the number of transitions in this table. This is always 256,
    /// since the table is created with 256 entries and never resized.
    #[inline]
    fn len(&self) -> usize {
        self.0.len()
    }
}
impl<S> Index<u8> for Dense<S> {
    type Output = S;

    #[inline]
    fn index(&self, i: u8) -> &S {
        // This indexing can never panic: a dense transition table always
        // has exactly 256 elements, so every u8 value is in bounds. (Note
        // that this is ordinary checked indexing, not an unsafe access.)
        &self.0[i as usize]
    }
}

impl<S> IndexMut<u8> for Dense<S> {
    #[inline]
    fn index_mut(&mut self, i: u8) -> &mut S {
        // This indexing can never panic: a dense transition table always
        // has exactly 256 elements, so every u8 value is in bounds. (Note
        // that this is ordinary checked indexing, not an unsafe access.)
        &mut self.0[i as usize]
    }
}
/// A representation of transitions in an NFA.
///
/// Transitions have either a sparse representation, which is slower for
/// lookups but uses less memory, or a dense representation, which is faster
/// for lookups but uses more memory. In the sparse representation, the absence
/// of a state implies a transition to `fail_id()`. Transitions to `dead_id()`
/// are still explicitly represented.
///
/// For the NFA, by default, we use a dense representation for transitions for
/// states close to the start state because it's likely these are the states
/// that will be most frequently visited.
#[derive(Clone, Debug)]
enum Transitions<S> {
    /// A list of (byte, next state id) pairs, kept sorted by byte. Bytes
    /// with no entry implicitly transition to `fail_id()`.
    Sparse(Vec<(u8, S)>),
    /// A table of 256 next state ids, one for every possible byte value.
    Dense(Dense<S>),
}
impl<S: StateID> Transitions<S> {
    /// Returns the number of heap bytes used by this transition
    /// representation.
    fn heap_bytes(&self) -> usize {
        match self {
            Transitions::Sparse(sparse) => {
                sparse.len() * size_of::<(u8, S)>()
            }
            Transitions::Dense(dense) => dense.len() * size_of::<S>(),
        }
    }

    /// Returns the transition for the given byte, or `fail_id()` if no
    /// transition for that byte exists.
    fn next_state(&self, input: u8) -> S {
        match self {
            Transitions::Sparse(sparse) => {
                // A sparse list has no entry at all for bytes that lead to
                // the fail state, so an exhaustive scan that finds nothing
                // means "fail".
                for &(byte, id) in sparse.iter() {
                    if byte == input {
                        return id;
                    }
                }
                fail_id()
            }
            Transitions::Dense(dense) => dense[input],
        }
    }

    /// Sets the transition for the given byte to the given state.
    fn set_next_state(&mut self, input: u8, next: S) {
        match self {
            Transitions::Sparse(sparse) => {
                // The sparse list is kept sorted by byte, so either
                // overwrite the existing entry or insert a new one at the
                // sorted position.
                match sparse.binary_search_by_key(&input, |&(byte, _)| byte) {
                    Ok(i) => sparse[i] = (input, next),
                    Err(i) => sparse.insert(i, (input, next)),
                }
            }
            Transitions::Dense(dense) => {
                dense[input] = next;
            }
        }
    }

    /// Iterate over transitions in this state while skipping over transitions
    /// to `fail_id()`.
    fn iter<F: FnMut(u8, S)>(&self, mut f: F) {
        match self {
            Transitions::Sparse(sparse) => {
                // Sparse lists never store fail transitions explicitly, so
                // every entry can be yielded as-is.
                for &(byte, id) in sparse.iter() {
                    f(byte, id);
                }
            }
            Transitions::Dense(dense) => {
                for byte in AllBytesIter::new() {
                    let id = dense[byte];
                    if id != fail_id() {
                        f(byte, id);
                    }
                }
            }
        }
    }

    /// Iterate over all transitions in this state according to the given
    /// equivalence classes, including transitions to `fail_id()`.
    fn iter_all<F: FnMut(u8, S)>(&self, classes: &ByteClasses, mut f: F) {
        if classes.is_singleton() {
            // Every byte is its own equivalence class, so visit all bytes.
            match self {
                Transitions::Sparse(sparse) => {
                    sparse_iter(sparse, f);
                }
                Transitions::Dense(dense) => {
                    for byte in AllBytesIter::new() {
                        f(byte, dense[byte]);
                    }
                }
            }
        } else {
            // In this case, we only want to yield a single byte for each
            // equivalence class.
            match self {
                Transitions::Sparse(sparse) => {
                    let mut prev_class = None;
                    sparse_iter(sparse, |byte, next| {
                        let class = classes.get(byte);
                        if prev_class != Some(class) {
                            prev_class = Some(class);
                            f(byte, next);
                        }
                    })
                }
                Transitions::Dense(dense) => {
                    for byte in classes.representatives() {
                        f(byte, dense[byte]);
                    }
                }
            }
        }
    }
}
/// Iterator over transitions in a state, skipping transitions to `fail_id()`.
///
/// This abstracts over the representation of NFA transitions, which may be
/// either in a sparse or dense representation.
///
/// This somewhat idiosyncratically borrows the NFA mutably, so that when one
/// is iterating over transitions, the caller can still mutate the NFA. This
/// is useful when creating failure transitions.
#[derive(Debug)]
struct IterTransitionsMut<'a, S: StateID> {
    /// The NFA being iterated over (and possibly mutated between steps).
    nfa: &'a mut NFA<S>,
    /// The id of the state whose transitions are being iterated.
    state_id: S,
    /// The current offset into the state's transition list.
    cur: usize,
}

impl<'a, S: StateID> IterTransitionsMut<'a, S> {
    /// Create a new iterator over the transitions of the given state.
    fn new(nfa: &'a mut NFA<S>, state_id: S) -> IterTransitionsMut<'a, S> {
        IterTransitionsMut { nfa, state_id, cur: 0 }
    }

    /// Return a mutable borrow of the underlying NFA, so callers can mutate
    /// it between iteration steps.
    fn nfa(&mut self) -> &mut NFA<S> {
        self.nfa
    }
}
impl<'a, S: StateID> Iterator for IterTransitionsMut<'a, S> {
    type Item = (u8, S);

    fn next(&mut self) -> Option<(u8, S)> {
        match self.nfa.states[self.state_id.to_usize()].trans {
            Transitions::Sparse(ref sparse) => {
                // Sparse transitions never store entries for `fail_id()`,
                // so every element can be yielded as-is.
                if self.cur >= sparse.len() {
                    return None;
                }
                let i = self.cur;
                self.cur += 1;
                Some(sparse[i])
            }
            Transitions::Dense(ref dense) => {
                while self.cur < dense.len() {
                    // There are always exactly 256 transitions in the dense
                    // representation, so `self.cur` always fits in a u8 here.
                    debug_assert!(self.cur < 256);
                    let b = self.cur as u8;
                    let id = dense[b];
                    self.cur += 1;
                    // Dense tables store fail transitions explicitly, so
                    // skip them to honor this iterator's contract.
                    if id != fail_id() {
                        return Some((b, id));
                    }
                }
                None
            }
        }
    }
}
/// A simple builder for configuring the NFA construction of Aho-Corasick.
#[derive(Clone, Debug)]
pub struct Builder {
    /// The depth threshold controlling which states use the dense (faster,
    /// bigger) transition representation; states deeper than this use the
    /// sparse representation.
    dense_depth: usize,
    /// The match semantics to build into the automaton.
    match_kind: MatchKind,
    /// Whether to build a prefilter for quickly skipping to candidate
    /// matches.
    prefilter: bool,
    /// Whether to anchor all matches to the start of input.
    anchored: bool,
    /// Whether patterns should also match their opposite-ASCII-case
    /// variants.
    ascii_case_insensitive: bool,
}
impl Default for Builder {
    fn default() -> Builder {
        Builder {
            // Only states very close to the start state get the dense
            // (faster, bigger) transition representation by default.
            dense_depth: 2,
            match_kind: MatchKind::default(),
            prefilter: true,
            anchored: false,
            ascii_case_insensitive: false,
        }
    }
}
impl Builder {
    /// Create a new builder with a default configuration.
    pub fn new() -> Builder {
        Builder::default()
    }

    /// Build an NFA from the given iterator of patterns, using this
    /// builder's configuration.
    pub fn build<I, P, S: StateID>(&self, patterns: I) -> Result<NFA<S>>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        Compiler::new(self)?.compile(patterns)
    }

    /// Set the match semantics (standard, leftmost-first or
    /// leftmost-longest) to build into the automaton.
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
        self.match_kind = kind;
        self
    }

    /// Set the depth threshold controlling which states use the dense
    /// transition representation (faster lookups, more memory) rather than
    /// the sparse one.
    pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
        self.dense_depth = depth;
        self
    }

    /// Enable or disable building a prefilter for quickly skipping to
    /// candidate matches.
    pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
        self.prefilter = yes;
        self
    }

    /// Enable or disable anchoring all matches to the start of input.
    pub fn anchored(&mut self, yes: bool) -> &mut Builder {
        self.anchored = yes;
        self
    }

    /// Enable or disable ASCII case insensitive matching.
    pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
        self.ascii_case_insensitive = yes;
        self
    }
}
/// A compiler uses a builder configuration and builds up the NFA formulation
/// of an Aho-Corasick automaton. This roughly corresponds to the standard
/// formulation described in textbooks.
#[derive(Debug)]
struct Compiler<'a, S: StateID> {
    /// The builder configuration to compile with.
    builder: &'a Builder,
    /// A builder for the prefilter attached to the final NFA, if any.
    prefilter: prefilter::Builder,
    /// The NFA under construction.
    nfa: NFA<S>,
    /// A builder for the byte equivalence classes, populated while visiting
    /// the patterns.
    byte_classes: ByteClassBuilder,
}
impl<'a, S: StateID> Compiler<'a, S> {
    /// Create a new compiler from the given builder configuration.
    ///
    /// This can fail when the state id type `S` is too small to represent
    /// even the initial states.
    fn new(builder: &'a Builder) -> Result<Compiler<'a, S>> {
        Ok(Compiler {
            builder,
            prefilter: prefilter::Builder::new(builder.match_kind)
                .ascii_case_insensitive(builder.ascii_case_insensitive),
            nfa: NFA {
                match_kind: builder.match_kind,
                // The start state is always index 2, following the fail
                // state (index 0) and the dead state (index 1).
                start_id: usize_to_state_id(2)?,
                max_pattern_len: 0,
                pattern_count: 0,
                heap_bytes: 0,
                prefilter: None,
                anchored: builder.anchored,
                byte_classes: ByteClasses::singletons(),
                states: vec![],
            },
            byte_classes: ByteClassBuilder::new(),
        })
    }
    /// Compile the given patterns into the final NFA.
    ///
    /// This proceeds in phases: create the special fail/dead/start states,
    /// build the prefix trie, wire up the start and dead state loops, fill
    /// in failure transitions (skipped entirely for anchored automatons) and
    /// finally compute auxiliary data: byte classes, the prefilter and the
    /// heap usage of the transition table.
    fn compile<I, P>(mut self, patterns: I) -> Result<NFA<S>>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        self.add_state(0)?; // the fail state, which is never entered
        self.add_state(0)?; // the dead state, only used for leftmost
        self.add_state(0)?; // the start state
        self.build_trie(patterns)?;
        self.add_start_state_loop();
        self.add_dead_state_loop();
        if !self.builder.anchored {
            if self.match_kind().is_leftmost() {
                self.fill_failure_transitions_leftmost();
            } else {
                self.fill_failure_transitions_standard();
            }
        }
        self.close_start_state_loop();
        self.nfa.byte_classes = self.byte_classes.build();
        if !self.builder.anchored {
            self.nfa.prefilter = self.prefilter.build();
        }
        self.calculate_size();
        Ok(self.nfa)
    }
/// This sets up the initial prefix trie that makes up the Aho-Corasick
/// automaton. Effectively, it creates the basic structure of the
/// automaton, where every pattern given has a path from the start state to
/// the end of the pattern.
    fn build_trie<I, P>(&mut self, patterns: I) -> Result<()>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        'PATTERNS: for (pati, pat) in patterns.into_iter().enumerate() {
            let pat = pat.as_ref();
            self.nfa.max_pattern_len =
                cmp::max(self.nfa.max_pattern_len, pat.len());
            self.nfa.pattern_count += 1;
            // Walk the trie from the start state, adding new states as
            // needed for each byte of this pattern.
            let mut prev = self.nfa.start_id;
            let mut saw_match = false;
            for (depth, &b) in pat.iter().enumerate() {
                // When leftmost-first match semantics are requested, we
                // specifically stop adding patterns when a previously added
                // pattern is a prefix of it. We avoid adding it because
                // leftmost-first semantics imply that the pattern can never
                // match. This is not just an optimization to save space! It
                // is necessary for correctness. In fact, this is the only
                // difference in the automaton between the implementations for
                // leftmost-first and leftmost-longest.
                saw_match = saw_match || self.nfa.state(prev).is_match();
                if self.builder.match_kind.is_leftmost_first() && saw_match {
                    // Skip to the next pattern immediately. This avoids
                    // incorrectly adding a match after this loop terminates.
                    continue 'PATTERNS;
                }
                // Add this byte to our equivalence classes. We don't use these
                // for NFA construction. These are instead used only if we're
                // building a DFA. They would technically be useful for the
                // NFA, but it would require a second pass over the patterns.
                self.byte_classes.set_range(b, b);
                if self.builder.ascii_case_insensitive {
                    let b = opposite_ascii_case(b);
                    self.byte_classes.set_range(b, b);
                }
                // If the transition from prev using the current byte already
                // exists, then just move through it. Otherwise, add a new
                // state. We track the depth here so that we can determine
                // how to represent transitions. States near the start state
                // use a dense representation that uses more memory but is
                // faster. Other states use a sparse representation that uses
                // less memory but is slower.
                let next = self.nfa.state(prev).next_state(b);
                if next != fail_id() {
                    prev = next;
                } else {
                    let next = self.add_state(depth + 1)?;
                    self.nfa.state_mut(prev).set_next_state(b, next);
                    if self.builder.ascii_case_insensitive {
                        // Mirror the transition for the opposite-case byte
                        // so both cases reach the same state.
                        let b = opposite_ascii_case(b);
                        self.nfa.state_mut(prev).set_next_state(b, next);
                    }
                    prev = next;
                }
            }
            // Once the pattern has been added, log the match in the final
            // state that it reached.
            self.nfa.state_mut(prev).add_match(pati, pat.len());
            // ... and hand it to the prefilter builder, if applicable.
            if self.builder.prefilter {
                self.prefilter.add(pat);
            }
        }
        Ok(())
    }
/// This routine creates failure transitions according to the standard
/// textbook formulation of the Aho-Corasick algorithm.
///
/// Building failure transitions is the most interesting part of building
/// the Aho-Corasick automaton, because they are what allow searches to
/// be performed in linear time. Specifically, a failure transition is
/// a single transition associated with each state that points back to
/// the longest proper suffix of the pattern being searched. The failure
/// transition is followed whenever there exists no transition on the
/// current state for the current input byte. If there is no other proper
/// suffix, then the failure transition points back to the starting state.
///
/// For example, let's say we built an Aho-Corasick automaton with the
/// following patterns: 'abcd' and 'cef'. The trie looks like this:
///
/// ```ignore
/// a - S1 - b - S2 - c - S3 - d - S4*
/// /
/// S0 - c - S5 - e - S6 - f - S7*
/// ```
///
/// At this point, it should be fairly straight-forward to see how this
/// trie can be used in a simplistic way. At any given position in the
/// text we're searching (called the "subject" string), all we need to do
/// is follow the transitions in the trie by consuming one transition for
/// each byte in the subject string. If we reach a match state, then we can
/// report that location as a match.
///
/// The trick comes when searching a subject string like 'abcef'. We'll
/// initially follow the transition from S0 to S1 and wind up in S3 after
/// observing the 'c' byte. At this point, the next byte is 'e' but state
/// S3 has no transition for 'e', so the search fails. We then would need
/// to restart the search at the next position in 'abcef', which
/// corresponds to 'b'. The match would fail, but the next search starting
/// at 'c' would finally succeed. The problem with this approach is that
/// we wind up searching the subject string potentially many times. In
/// effect, this makes the algorithm have worst case `O(n * m)` complexity,
/// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
/// like to achieve a `O(n + m)` worst case complexity.
///
/// This is where failure transitions come in. Instead of dying at S3 in
/// the first search, the automaton can instruct the search to move to
/// another part of the automaton that corresponds to a suffix of what
/// we've seen so far. Recall that we've seen 'abc' in the subject string,
/// and the automaton does indeed have a non-empty suffix, 'c', that could
/// potentially lead to another match. Thus, the actual Aho-Corasick
/// automaton for our patterns in this case looks like this:
///
/// ```ignore
/// a - S1 - b - S2 - c - S3 - d - S4*
/// / /
/// / ----------------
/// / /
/// S0 - c - S5 - e - S6 - f - S7*
/// ```
///
/// That is, we have a failure transition from S3 to S5, which is followed
/// exactly in cases when we are in state S3 but see any byte other than
/// 'd' (that is, we've "failed" to find a match in this portion of our
/// trie). We know we can transition back to S5 because we've already seen
/// a 'c' byte, so we don't need to re-scan it. We can then pick back up
/// with the search starting at S5 and complete our match.
///
/// Adding failure transitions to a trie is fairly simple, but subtle. The
/// key issue is that you might have multiple failure transition that you
/// need to follow. For example, look at the trie for the patterns
/// 'abcd', 'b', 'bcd' and 'cd':
///
/// ```ignore
/// - a - S1 - b - S2 - c - S3 - d - S4*
/// /
/// S0 - b - S5* - c - S6 - d - S7*
/// \
/// - c - S8 - d - S9*
/// ```
///
/// The failure transitions for this trie are defined from S2 to S5,
/// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
/// corresponds to a match, since its failure transition to S5 is itself
/// a match state.
///
/// Perhaps the simplest way to think about adding these failure transitions
/// is recursively. That is, if you know the failure transitions for every
/// possible previous state that could be visited (e.g., when computing the
/// failure transition for S3, you already know the failure transitions
/// for S0, S1 and S2), then you can simply follow the failure transition
/// of the previous state and check whether the incoming transition is
/// defined after following the failure transition.
///
/// For example, when determining the failure state for S3, by our
/// assumptions, we already know that there is a failure transition from
/// S2 (the previous state) to S5. So we follow that transition and check
/// whether the transition connecting S2 to S3 is defined. Indeed, it is,
/// as there is a transition from S5 to S6 for the byte 'c'. If no such
/// transition existed, we could keep following the failure transitions
/// until we reach the start state, which is the failure transition for
/// every state that has no corresponding proper suffix.
///
/// We don't actually use recursion to implement this, but instead, use a
/// breadth first search of the automaton. Our base case is the start
/// state, whose failure transition is just a transition to itself.
    fn fill_failure_transitions_standard(&mut self) {
        // Initialize the queue for breadth first search with all transitions
        // out of the start state. We handle the start state specially because
        // we only want to follow non-self transitions. If we followed self
        // transitions, then this would never terminate.
        let mut queue = VecDeque::new();
        let mut seen = self.queued_set();
        for b in AllBytesIter::new() {
            let next = self.nfa.start().next_state(b);
            if next != self.nfa.start_id {
                if !seen.contains(next) {
                    queue.push_back(next);
                    seen.insert(next);
                }
            }
        }
        while let Some(id) = queue.pop_front() {
            let mut it = self.nfa.iter_transitions_mut(id);
            while let Some((b, next)) = it.next() {
                if seen.contains(next) {
                    // The only way to visit a duplicate state in a transition
                    // list is when ASCII case insensitivity is enabled. In
                    // this case, we want to skip it since it's redundant work.
                    // But it would also end up duplicating matches, which
                    // results in reporting duplicate matches in some cases.
                    // See the 'acasei010' regression test.
                    continue;
                }
                queue.push_back(next);
                seen.insert(next);
                // Find the fail state for `next` by walking `id`'s chain of
                // failure transitions until we land on a state that defines
                // a transition for `b`.
                let mut fail = it.nfa().state(id).fail;
                while it.nfa().state(fail).next_state(b) == fail_id() {
                    fail = it.nfa().state(fail).fail;
                }
                fail = it.nfa().state(fail).next_state(b);
                it.nfa().state_mut(next).fail = fail;
                // `next` also matches everything its fail state matches,
                // since the fail state is a proper suffix of `next`.
                it.nfa().copy_matches(fail, next);
            }
            // If the start state is a match state, then this automaton can
            // match the empty string. This implies all states are match states
            // since every position matches the empty string, so copy the
            // matches from the start state to every state. Strictly speaking,
            // this is only necessary for overlapping matches since each
            // non-empty non-start match state needs to report empty matches
            // in addition to its own. For the non-overlapping case, such
            // states only report the first match, which is never empty since
            // it isn't a start state.
            it.nfa().copy_empty_matches(id);
        }
    }
/// This routine is just like fill_failure_transitions_standard, except
/// it adds failure transitions in a way that preserves leftmost match
/// semantics (for both leftmost-first and leftmost-longest).
///
/// The algorithms are so similar that it would be possible to write it
/// generically. But doing so without overhead would require a bit of
/// ceremony, so we just copy it and add in the extra leftmost logic.
/// Moreover, the standard algorithm above is so simple that it feels like
/// a crime to disturb it.
///
/// In effect, this proceeds just like the standard approach, but we
/// specifically add only a subset of all failure transitions. Namely, we
/// only add failure transitions that either do not occur after a match
/// or failure transitions that do occur after a match but preserve the
/// match. The comments in the implementation below should help.
///
/// N.B. The only differences in the automaton between leftmost-first and
/// leftmost-longest are in trie construction. Otherwise, both have exactly
/// the same set of failure transitions. leftmost-longest adds everything
/// to the trie, whereas leftmost-first skips any patterns for which there
/// exists a prefix of it that was added earlier.
///
/// N.B. I came up with this algorithm on my own, and after scouring all of
/// the other AC implementations I know of (Perl, Snort, many on GitHub).
/// I couldn't find any that implement leftmost semantics like this.
/// Perl of course needs leftmost-first semantics, but they implement it
/// with a seeming hack at *search* time instead of encoding it into the
/// automaton. There are also a couple Java libraries that support leftmost
/// longest semantics, but they do it by building a queue of matches at
/// search time, which is even worse than what Perl is doing. ---AG
fn fill_failure_transitions_leftmost(&mut self) {
/// Represents an item in our queue of states to process.
///
/// Fundamentally, this queue serves the same purpose as the queue
/// for filling failure transitions using the standard formulation.
/// In the leftmost case, though, we need to track a bit more
/// information. See comments below.
#[derive(Clone, Copy, Debug)]
struct QueuedState<S> {
/// The id of the state to visit.
id: S,
/// The depth at which the first match was observed in the path
/// to this state. Note that this corresponds to the depth at
/// which the beginning of the match was detected. If no match
/// has been seen, then this is None.
match_at_depth: Option<usize>,
}
impl<S: StateID> QueuedState<S> {
/// Create a queued state corresponding to the given NFA's start
/// state.
fn start(nfa: &NFA<S>) -> QueuedState<S> {
let match_at_depth =
if nfa.start().is_match() { Some(0) } else { None };
QueuedState { id: nfa.start_id, match_at_depth }
}
/// Return the next state to queue up. The given id must be a state
/// corresponding to a single transition from this queued state.
fn next_queued_state(
&self,
nfa: &NFA<S>,
id: S,
) -> QueuedState<S> {
let match_at_depth = self.next_match_at_depth(nfa, id);
QueuedState { id, match_at_depth }
}
/// Return the earliest depth at which a match has occurred for
/// the given state. The given state must correspond to a single
/// transition from this queued state.
fn next_match_at_depth(
&self,
nfa: &NFA<S>,
next: S,
) -> Option<usize> {
// This is a little tricky. If the previous state has already
// seen a match or if `next` isn't a match state, then nothing
// needs to change since a later state cannot find an earlier
// match.
match self.match_at_depth {
Some(x) => return Some(x),
None if nfa.state(next).is_match() => {}
None => return None,
}
// A match is seen at `next` for the first time on this path:
// the match begins where the longest pattern ending at `next`
// starts, i.e., `depth - longest_match_len + 1` bytes into the
// path.
let depth = nfa.state(next).depth
- nfa.state(next).get_longest_match_len().unwrap()
+ 1;
Some(depth)
}
}
// Initialize the queue for breadth first search with all transitions
// out of the start state. We handle the start state specially because
// we only want to follow non-self transitions. If we followed self
// transitions, then this would never terminate.
let mut queue: VecDeque<QueuedState<S>> = VecDeque::new();
let mut seen = self.queued_set();
let start = QueuedState::start(&self.nfa);
for b in AllBytesIter::new() {
let next_id = self.nfa.start().next_state(b);
if next_id != start.id {
let next = start.next_queued_state(&self.nfa, next_id);
if !seen.contains(next.id) {
queue.push_back(next);
seen.insert(next.id);
}
// If a state immediately following the start state is a match
// state, then we never want to follow its failure transition
// since the failure transition necessarily leads back to the
// start state, which we never want to do for leftmost matching
// after a match has been found.
//
// N.B. This is a special case of the more general handling
// found below.
if self.nfa.state(next_id).is_match() {
self.nfa.state_mut(next_id).fail = dead_id();
}
}
}
// Now continue breadth first search over the rest of the trie. Since
// the queue is FIFO and was seeded with the depth-1 states above,
// states are visited in order of non-decreasing depth, so `item.id`'s
// failure transition is already in place by the time its children are
// processed below.
while let Some(item) = queue.pop_front() {
let mut any_trans = false;
let mut it = self.nfa.iter_transitions_mut(item.id);
while let Some((b, next_id)) = it.next() {
any_trans = true;
// Queue up the next state.
let next = item.next_queued_state(it.nfa(), next_id);
if seen.contains(next.id) {
// The only way to visit a duplicate state in a transition
// list is when ASCII case insensitivity is enabled. In
// this case, we want to skip it since it's redundant work.
// But it would also end up duplicating matches, which
// results in reporting duplicate matches in some cases.
// See the 'acasei010' regression test.
continue;
}
queue.push_back(next);
seen.insert(next.id);
// Find the failure state for next. Same as standard.
// Namely, walk up the failure chain until reaching a state
// with a transition on `b`, then follow that transition.
let mut fail = it.nfa().state(item.id).fail;
while it.nfa().state(fail).next_state(b) == fail_id() {
fail = it.nfa().state(fail).fail;
}
fail = it.nfa().state(fail).next_state(b);
// This is the key difference from the standard formulation.
// Namely, if we've seen a match, then we only want a failure
// transition if the failure transition preserves the match
// we've seen. In general, this is not true of all failure
// transitions since they can point back to any suffix of what
// we've seen so far. Instead, we only want to point back to
// suffixes that contain any match we've seen.
//
// We achieve this by comparing the depth of the failure
// transition with the number of states between this state
// and the beginning of the earliest match detected. If the
// depth of the failure state is smaller than this difference,
// then it cannot contain the match. If it's bigger or equal
// to the difference, then it necessarily includes the match
// we've seen since all failure transitions correspond to a
// suffix.
//
// If we've determined that we don't want the failure
// transition, then we set this state's failure transition to
// the dead state. In other words, when a search hits this
// state, it will not continue and correctly stop. (N.B. A
// dead state is different than a fail state. A dead state
// MUST be preceded by a match and acts as a sentinel to search
// routines to terminate.)
//
// Understanding this is tricky, and it took me several days
// to think through this and get it right. If you want to grok
// it, then I'd recommend: 1) switch the implementation to
// always use the standard algorithm for filling in failure
// transitions, 2) run the test suite and 3) examine the test
// failures. Write out the automatons for them and try to work
// backwards by figuring out which failure transitions should
// be removed. You should arrive at the same rule used below.
if let Some(match_depth) = next.match_at_depth {
let fail_depth = it.nfa().state(fail).depth;
let next_depth = it.nfa().state(next.id).depth;
if next_depth - match_depth + 1 > fail_depth {
it.nfa().state_mut(next.id).fail = dead_id();
continue;
}
assert_ne!(
start.id,
it.nfa().state(next.id).fail,
"states that are match states or follow match \
states should never have a failure transition \
back to the start state in leftmost searching",
);
}
it.nfa().state_mut(next.id).fail = fail;
it.nfa().copy_matches(fail, next.id);
}
// If there are no transitions for this state and if it's a match
// state, then we must set its failure transition to the dead
// state since we never want it to restart the search.
if !any_trans && it.nfa().state(item.id).is_match() {
it.nfa().state_mut(item.id).fail = dead_id();
}
// We don't need to copy empty matches from the start state here
// because that's only necessary for overlapping matches and
// leftmost match kinds don't support overlapping matches.
}
}
/// Returns a set for tracking which states have already been queued.
///
/// Such a set is only required when ASCII case insensitivity is enabled,
/// since that is the only way for the same state to appear twice in a
/// transition list. In every other configuration, this returns an inert
/// set that never stores anything and always reports `false` for every
/// membership test.
fn queued_set(&self) -> QueuedSet<S> {
    if !self.builder.ascii_case_insensitive {
        return QueuedSet::inert();
    }
    QueuedSet::active()
}
/// Make the start state loop back to itself on every byte that otherwise
/// has no transition. This is what permits the automaton to begin a match
/// at any position, and it also guarantees that following a transition
/// never yields `fail_id`.
///
/// This must be done only after the trie has been fully built, since trie
/// construction relies on transitions to `fail_id` to determine whether a
/// state already exists or not.
fn add_start_state_loop(&mut self) {
    let start_id = self.nfa.start_id;
    let start = self.nfa.start_mut();
    for byte in AllBytesIter::new() {
        let missing = start.next_state(byte) == fail_id();
        if missing {
            start.set_next_state(byte, start_id);
        }
    }
}
/// Rewrite any transitions on the start state that loop back to the start
/// state so that they point to the dead state instead.
///
/// The loop is closed when either of two conditions holds: this is an
/// anchored automaton (always closed), or the match kind is a leftmost
/// variant and the start state is itself a match state.
///
/// Under leftmost semantics, a start state that is also a match state
/// implies the search process must never restart. Normal transitions out
/// of the start state remain intact, but any byte that would have looped
/// back now leads to the dead state, which signals search routines to
/// stop.
fn close_start_state_loop(&mut self) {
    let close = self.builder.anchored
        || (self.match_kind().is_leftmost() && self.nfa.start().is_match());
    if !close {
        return;
    }
    let start_id = self.nfa.start_id;
    let start = self.nfa.start_mut();
    for byte in AllBytesIter::new() {
        if start.next_state(byte) == start_id {
            start.set_next_state(byte, dead_id());
        }
    }
}
/// Point every transition on the dead state back at the dead state
/// itself. Missing transitions normally map to the failure sentinel, but
/// the whole point of the dead state is to act as a sink that can never
/// be escaped.
fn add_dead_state_loop(&mut self) {
    let dead = self.nfa.state_mut(dead_id());
    for byte in AllBytesIter::new() {
        dead.set_next_state(byte, dead_id());
    }
}
/// Computes the total amount of heap used by this NFA in bytes and caches
/// it on the NFA itself.
fn calculate_size(&mut self) {
    // Sum the heap usage reported by each individual state.
    self.nfa.heap_bytes =
        self.nfa.states.iter().map(|state| state.heap_bytes()).sum();
}
/// Add a new state to the underlying NFA at the given depth. The depth
/// determines the transition representation: states shallower than the
/// builder's `dense_depth` setting store transitions densely, while
/// deeper states use a sparse representation.
///
/// If adding the new state would overflow the chosen state ID
/// representation, then this returns an error.
fn add_state(&mut self, depth: usize) -> Result<S> {
    let use_dense = depth < self.builder.dense_depth;
    if use_dense {
        self.nfa.add_dense_state(depth)
    } else {
        self.nfa.add_sparse_state(depth)
    }
}
/// Returns the match kind configured on the underlying builder
/// (e.g., standard, leftmost-first or leftmost-longest).
fn match_kind(&self) -> MatchKind {
self.builder.match_kind
}
}
/// A set of state identifiers used to avoid revisiting the same state multiple
/// times when filling in failure transitions.
///
/// This set has an "inert" and an "active" mode. When inert, the set never
/// stores anything and always returns `false` for every member test. This is
/// useful to avoid the performance and memory overhead of maintaining this
/// set when it is not needed.
#[derive(Debug)]
struct QueuedSet<S> {
/// `Some` when the set is active and actually tracks members; `None`
/// when the set is inert.
set: Option<BTreeSet<S>>,
}
impl<S: StateID> QueuedSet<S> {
    /// Return an inert set that reports `false` for every state ID
    /// membership test.
    fn inert() -> QueuedSet<S> {
        QueuedSet { set: None }
    }

    /// Return an active set that actually tracks state ID membership.
    fn active() -> QueuedSet<S> {
        QueuedSet { set: Some(BTreeSet::new()) }
    }

    /// Add the given state ID to this set. For an inert set, this is a
    /// no-op.
    fn insert(&mut self, state_id: S) {
        if let Some(ref mut set) = self.set {
            set.insert(state_id);
        }
    }

    /// Report whether the given state ID is present in this set. An
    /// inert set always answers `false`.
    fn contains(&self, state_id: S) -> bool {
        self.set.as_ref().map_or(false, |set| set.contains(&state_id))
    }
}
/// An iterator over every byte value, 0 through 255 inclusive.
///
/// We use this instead of (0..256).map(|b| b as u8) because this optimizes
/// better in debug builds.
///
/// We also use this instead of 0..=255 because we're targeting Rust 1.24 and
/// inclusive range syntax was stabilized in Rust 1.26. We can get rid of this
/// once our MSRV is Rust 1.26 or newer.
#[derive(Debug)]
struct AllBytesIter(u16);

impl AllBytesIter {
    /// Create an iterator positioned at byte 0.
    fn new() -> AllBytesIter {
        AllBytesIter(0)
    }
}

impl Iterator for AllBytesIter {
    type Item = u8;

    fn next(&mut self) -> Option<Self::Item> {
        // The u16 counter runs past 255 exactly once, which is how the
        // end of iteration is detected.
        if self.0 < 256 {
            let byte = self.0 as u8;
            self.0 += 1;
            Some(byte)
        } else {
            None
        }
    }
}
// A multi-line, human readable representation of the NFA for debugging:
// one entry per state showing its transitions, the patterns that match at
// it, its failure transition and its depth.
impl<S: StateID> fmt::Debug for NFA<S> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
writeln!(f, "NFA(")?;
writeln!(f, "match_kind: {:?}", self.match_kind)?;
writeln!(f, "prefilter: {:?}", self.prefilter)?;
writeln!(f, "{}", "-".repeat(79))?;
for (id, s) in self.states.iter().enumerate() {
let mut trans = vec![];
s.trans.iter(|byte, next| {
// The start state has a bunch of uninteresting transitions
// back into itself. It's questionable to hide them since they
// are critical to understanding the automaton, but they are
// very noisy without better formatting for contiguous ranges
// to the same state.
if id == self.start_id.to_usize() && next == self.start_id {
return;
}
// Similarly, the dead state has a bunch of uninteresting
// transitions too.
if id == dead_id() {
return;
}
trans.push(format!("{} => {}", escape(byte), next.to_usize()));
});
writeln!(f, "{:04}: {}", id, trans.join(", "))?;
// Only pattern identifiers are shown; the pattern lengths stored
// alongside them are elided.
let matches: Vec<String> = s
.matches
.iter()
.map(|&(pattern_id, _)| pattern_id.to_string())
.collect();
writeln!(f, " matches: {}", matches.join(", "))?;
writeln!(f, " fail: {}", s.fail.to_usize())?;
writeln!(f, " depth: {}", s.depth)?;
}
writeln!(f, "{}", "-".repeat(79))?;
writeln!(f, ")")?;
Ok(())
}
}
/// Iterate over all possible byte transitions given a sparse set. Bytes
/// that are absent from `trans` are reported as transitions to the fail
/// state.
///
/// NOTE(review): this assumes `trans` is sorted in ascending byte order
/// with no duplicates — confirm against how sparse transition lists are
/// constructed.
fn sparse_iter<S: StateID, F: FnMut(u8, S)>(trans: &[(u8, S)], mut f: F) {
    let mut next_byte = 0u16;
    for &(b, id) in trans {
        // Report a fail transition for every byte below this entry that
        // has not been reported yet.
        for absent in next_byte..(b as u16) {
            f(absent as u8, fail_id());
        }
        f(b, id);
        next_byte = b as u16 + 1;
    }
    // Everything beyond the last explicit transition also fails.
    for absent in next_byte..256 {
        f(absent as u8, fail_id());
    }
}
/// Safely return two mutable borrows to two different locations in the given
/// slice: the first borrow refers to `xs[i]` and the second to `xs[j]`.
///
/// This panics if i == j.
fn get_two_mut<T>(xs: &mut [T], i: usize, j: usize) -> (&mut T, &mut T) {
    assert!(i != j, "{} must not be equal to {}", i, j);
    // Split at the larger index so both locations land in disjoint halves,
    // which satisfies the borrow checker.
    let (lo, hi) = if i < j { (i, j) } else { (j, i) };
    let (head, tail) = xs.split_at_mut(hi);
    if i < j {
        (&mut head[lo], &mut tail[0])
    } else {
        (&mut tail[0], &mut head[lo])
    }
}
/// Return the given byte as its escaped string form.
fn escape(b: u8) -> String {
    use std::ascii;
    // `escape_default` only ever yields ASCII bytes, so each byte maps
    // directly to the corresponding char.
    ascii::escape_default(b).map(|byte| byte as char).collect()
}
#[cfg(test)]
mod tests {
use super::*;
// NOTE(review): development scratch test. It only prints the automaton
// for manual inspection and asserts nothing; the commented-out lines
// below are alternate match kinds and pattern sets used while debugging
// the leftmost failure-transition logic.
#[test]
fn scratch() {
let nfa: NFA<usize> = Builder::new()
.dense_depth(0)
// .match_kind(MatchKind::LeftmostShortest)
// .match_kind(MatchKind::LeftmostLongest)
.match_kind(MatchKind::LeftmostFirst)
// .build(&["abcd", "ce", "b"])
// .build(&["ab", "bc"])
// .build(&["b", "bcd", "ce"])
// .build(&["abc", "bx"])
// .build(&["abc", "bd", "ab"])
// .build(&["abcdefghi", "hz", "abcdefgh"])
// .build(&["abcd", "bce", "b"])
.build(&["abcdefg", "bcde", "bcdef"])
.unwrap();
println!("{:?}", nfa);
}
}