| use std::fmt; |
| |
| use classes::ByteClasses; |
| pub use nfa::compiler::Builder; |
| |
| mod compiler; |
| mod map; |
| mod range_trie; |
| |
| /// The representation for an NFA state identifier. |
| pub type StateID = usize; |
| |
| /// A final compiled NFA. |
| /// |
| /// The states of the NFA are indexed by state IDs, which are how transitions |
| /// are expressed. |
| #[derive(Clone)] |
| pub struct NFA { |
| /// Whether this NFA can only match at the beginning of input or not. |
| /// |
| /// When true, a match should only be reported if it begins at the 0th |
| /// index of the haystack. |
| anchored: bool, |
| /// The starting state of this NFA. |
| start: StateID, |
| /// The state list. This list is guaranteed to be indexable by the starting |
| /// state ID, and it is also guaranteed to contain exactly one `Match` |
| /// state. |
| states: Vec<State>, |
| /// A mapping from any byte value to its corresponding equivalence class |
| /// identifier. Two bytes in the same equivalence class cannot discriminate |
| /// between a match or a non-match. This map can be used to shrink the |
| /// total size of a DFA's transition table with a small match-time cost. |
| /// |
| /// Note that the NFA's transitions are *not* defined in terms of these |
| /// equivalence classes. The NFA's transitions are defined on the original |
| /// byte values. For the most part, this is because they wouldn't really |
| /// help the NFA much since the NFA already uses a sparse representation |
| /// to represent transitions. Byte classes are most effective in a dense |
| /// representation. |
| byte_classes: ByteClasses, |
| } |
| |
| impl NFA { |
| /// Returns an NFA that always matches at every position. |
| pub fn always_match() -> NFA { |
| NFA { |
| anchored: false, |
| start: 0, |
| states: vec![State::Match], |
| byte_classes: ByteClasses::empty(), |
| } |
| } |
| |
| /// Returns an NFA that never matches at any position. |
| pub fn never_match() -> NFA { |
| NFA { |
| anchored: false, |
| start: 0, |
| states: vec![State::Fail], |
| byte_classes: ByteClasses::empty(), |
| } |
| } |
| |
| /// Returns true if and only if this NFA is anchored. |
| pub fn is_anchored(&self) -> bool { |
| self.anchored |
| } |
| |
| /// Return the number of states in this NFA. |
| pub fn len(&self) -> usize { |
| self.states.len() |
| } |
| |
| /// Return the ID of the initial state of this NFA. |
| pub fn start(&self) -> StateID { |
| self.start |
| } |
| |
| /// Return the NFA state corresponding to the given ID. |
| pub fn state(&self, id: StateID) -> &State { |
| &self.states[id] |
| } |
| |
| /// Return the set of equivalence classes for this NFA. The slice returned |
| /// always has length 256 and maps each possible byte value to its |
| /// corresponding equivalence class ID (which is never more than 255). |
| pub fn byte_classes(&self) -> &ByteClasses { |
| &self.byte_classes |
| } |
| } |
| |
| impl fmt::Debug for NFA { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| for (i, state) in self.states.iter().enumerate() { |
| let status = if i == self.start { '>' } else { ' ' }; |
| writeln!(f, "{}{:06}: {:?}", status, i, state)?; |
| } |
| Ok(()) |
| } |
| } |
| |
| /// A state in a final compiled NFA. |
| #[derive(Clone, Eq, PartialEq)] |
| pub enum State { |
| /// A state that transitions to `next` if and only if the current input |
| /// byte is in the range `[start, end]` (inclusive). |
| /// |
| /// This is a special case of Sparse in that it encodes only one transition |
| /// (and therefore avoids the allocation). |
| Range { range: Transition }, |
| /// A state with possibly many transitions, represented in a sparse |
| /// fashion. Transitions are ordered lexicographically by input range. |
| /// As such, this may only be used when every transition has equal |
| /// priority. (In practice, this is only used for encoding large UTF-8 |
| /// automata.) |
| Sparse { ranges: Box<[Transition]> }, |
| /// An alternation such that there exists an epsilon transition to all |
| /// states in `alternates`, where matches found via earlier transitions |
| /// are preferred over later transitions. |
| Union { alternates: Box<[StateID]> }, |
| /// A fail state. When encountered, the automaton is guaranteed to never |
| /// reach a match state. |
| Fail, |
| /// A match state. There is exactly one such occurrence of this state in |
| /// an NFA. |
| Match, |
| } |
| |
| /// A transition to another state, only if the given byte falls in the |
| /// inclusive range specified. |
| #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
| pub struct Transition { |
| pub start: u8, |
| pub end: u8, |
| pub next: StateID, |
| } |
| |
| impl State { |
| /// Returns true if and only if this state contains one or more epsilon |
| /// transitions. |
| pub fn is_epsilon(&self) -> bool { |
| match *self { |
| State::Range { .. } |
| | State::Sparse { .. } |
| | State::Fail |
| | State::Match => false, |
| State::Union { .. } => true, |
| } |
| } |
| |
| /// Remap the transitions in this state using the given map. Namely, the |
| /// given map should be indexed according to the transitions currently |
| /// in this state. |
| /// |
| /// This is used during the final phase of the NFA compiler, which turns |
| /// its intermediate NFA into the final NFA. |
| fn remap(&mut self, remap: &[StateID]) { |
| match *self { |
| State::Range { ref mut range } => range.next = remap[range.next], |
| State::Sparse { ref mut ranges } => { |
| for r in ranges.iter_mut() { |
| r.next = remap[r.next]; |
| } |
| } |
| State::Union { ref mut alternates } => { |
| for alt in alternates.iter_mut() { |
| *alt = remap[*alt]; |
| } |
| } |
| State::Fail => {} |
| State::Match => {} |
| } |
| } |
| } |
| |
| impl fmt::Debug for State { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| match *self { |
| State::Range { ref range } => range.fmt(f), |
| State::Sparse { ref ranges } => { |
| let rs = ranges |
| .iter() |
| .map(|t| format!("{:?}", t)) |
| .collect::<Vec<String>>() |
| .join(", "); |
| write!(f, "sparse({})", rs) |
| } |
| State::Union { ref alternates } => { |
| let alts = alternates |
| .iter() |
| .map(|id| format!("{}", id)) |
| .collect::<Vec<String>>() |
| .join(", "); |
| write!(f, "alt({})", alts) |
| } |
| State::Fail => write!(f, "FAIL"), |
| State::Match => write!(f, "MATCH"), |
| } |
| } |
| } |
| |
| impl fmt::Debug for Transition { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| let Transition { start, end, next } = *self; |
| if self.start == self.end { |
| write!(f, "{} => {}", escape(start), next) |
| } else { |
| write!(f, "{}-{} => {}", escape(start), escape(end), next) |
| } |
| } |
| } |
| |
| /// Return the given byte as its escaped string form. |
| fn escape(b: u8) -> String { |
| use std::ascii; |
| |
| String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap() |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use super::*; |
| use dense; |
| use dfa::DFA; |
| |
| #[test] |
| fn always_match() { |
| let nfa = NFA::always_match(); |
| let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap(); |
| |
| assert_eq!(Some(0), dfa.find_at(b"", 0)); |
| assert_eq!(Some(0), dfa.find_at(b"a", 0)); |
| assert_eq!(Some(1), dfa.find_at(b"a", 1)); |
| assert_eq!(Some(0), dfa.find_at(b"ab", 0)); |
| assert_eq!(Some(1), dfa.find_at(b"ab", 1)); |
| assert_eq!(Some(2), dfa.find_at(b"ab", 2)); |
| } |
| |
| #[test] |
| fn never_match() { |
| let nfa = NFA::never_match(); |
| let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap(); |
| |
| assert_eq!(None, dfa.find_at(b"", 0)); |
| assert_eq!(None, dfa.find_at(b"a", 0)); |
| assert_eq!(None, dfa.find_at(b"a", 1)); |
| assert_eq!(None, dfa.find_at(b"ab", 0)); |
| assert_eq!(None, dfa.find_at(b"ab", 1)); |
| assert_eq!(None, dfa.find_at(b"ab", 2)); |
| } |
| } |