|  | // This module implements the Pike VM. That is, it guarantees linear time | 
|  | // search of a regex on any text with memory use proportional to the size of | 
|  | // the regex. | 
|  | // | 
|  | // It is equal in power to the backtracking engine in this crate, except the | 
|  | // backtracking engine is typically faster on small regexes/texts at the | 
|  | // expense of a bigger memory footprint. | 
|  | // | 
|  | // It can do more than the DFA can (specifically, record capture locations | 
|  | // and execute Unicode word boundary assertions), but at a slower speed. | 
|  | // Specifically, the Pike VM exectues a DFA implicitly by repeatedly expanding | 
|  | // epsilon transitions. That is, the Pike VM engine can be in multiple states | 
|  | // at once where as the DFA is only ever in one state at a time. | 
|  | // | 
|  | // Therefore, the Pike VM is generally treated as the fallback when the other | 
|  | // matching engines either aren't feasible to run or are insufficient. | 
|  |  | 
|  | use std::mem; | 
|  |  | 
|  | use exec::ProgramCache; | 
|  | use input::{Input, InputAt}; | 
|  | use prog::{InstPtr, Program}; | 
|  | use re_trait::Slot; | 
|  | use sparse::SparseSet; | 
|  |  | 
|  | /// An NFA simulation matching engine. | 
|  | #[derive(Debug)] | 
|  | pub struct Fsm<'r, I> { | 
|  | /// The sequence of opcodes (among other things) that is actually executed. | 
|  | /// | 
|  | /// The program may be byte oriented or Unicode codepoint oriented. | 
|  | prog: &'r Program, | 
|  | /// An explicit stack used for following epsilon transitions. (This is | 
|  | /// borrowed from the cache.) | 
|  | stack: &'r mut Vec<FollowEpsilon>, | 
|  | /// The input to search. | 
|  | input: I, | 
|  | } | 
|  |  | 
|  | /// A cached allocation that can be reused on each execution. | 
|  | #[derive(Clone, Debug)] | 
|  | pub struct Cache { | 
|  | /// A pair of ordered sets for tracking NFA states. | 
|  | clist: Threads, | 
|  | nlist: Threads, | 
|  | /// An explicit stack used for following epsilon transitions. | 
|  | stack: Vec<FollowEpsilon>, | 
|  | } | 
|  |  | 
|  | /// An ordered set of NFA states and their captures. | 
|  | #[derive(Clone, Debug)] | 
|  | struct Threads { | 
|  | /// An ordered set of opcodes (each opcode is an NFA state). | 
|  | set: SparseSet, | 
|  | /// Captures for every NFA state. | 
|  | /// | 
|  | /// It is stored in row-major order, where the columns are the capture | 
|  | /// slots and the rows are the states. | 
|  | caps: Vec<Slot>, | 
|  | /// The number of capture slots stored per thread. (Every capture has | 
|  | /// two slots.) | 
|  | slots_per_thread: usize, | 
|  | } | 
|  |  | 
|  | /// A representation of an explicit stack frame when following epsilon | 
|  | /// transitions. This is used to avoid recursion. | 
|  | #[derive(Clone, Debug)] | 
|  | enum FollowEpsilon { | 
|  | /// Follow transitions at the given instruction pointer. | 
|  | IP(InstPtr), | 
|  | /// Restore the capture slot with the given position in the input. | 
|  | Capture { slot: usize, pos: Slot }, | 
|  | } | 
|  |  | 
|  | impl Cache { | 
|  | /// Create a new allocation used by the NFA machine to record execution | 
|  | /// and captures. | 
|  | pub fn new(_prog: &Program) -> Self { | 
|  | Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] } | 
|  | } | 
|  | } | 
|  |  | 
|  | impl<'r, I: Input> Fsm<'r, I> { | 
|  | /// Execute the NFA matching engine. | 
|  | /// | 
|  | /// If there's a match, `exec` returns `true` and populates the given | 
|  | /// captures accordingly. | 
|  | pub fn exec( | 
|  | prog: &'r Program, | 
|  | cache: &ProgramCache, | 
|  | matches: &mut [bool], | 
|  | slots: &mut [Slot], | 
|  | quit_after_match: bool, | 
|  | input: I, | 
|  | start: usize, | 
|  | end: usize, | 
|  | ) -> bool { | 
|  | let mut cache = cache.borrow_mut(); | 
|  | let cache = &mut cache.pikevm; | 
|  | cache.clist.resize(prog.len(), prog.captures.len()); | 
|  | cache.nlist.resize(prog.len(), prog.captures.len()); | 
|  | let at = input.at(start); | 
|  | Fsm { prog: prog, stack: &mut cache.stack, input: input }.exec_( | 
|  | &mut cache.clist, | 
|  | &mut cache.nlist, | 
|  | matches, | 
|  | slots, | 
|  | quit_after_match, | 
|  | at, | 
|  | end, | 
|  | ) | 
|  | } | 
|  |  | 
|  | fn exec_( | 
|  | &mut self, | 
|  | mut clist: &mut Threads, | 
|  | mut nlist: &mut Threads, | 
|  | matches: &mut [bool], | 
|  | slots: &mut [Slot], | 
|  | quit_after_match: bool, | 
|  | mut at: InputAt, | 
|  | end: usize, | 
|  | ) -> bool { | 
|  | let mut matched = false; | 
|  | let mut all_matched = false; | 
|  | clist.set.clear(); | 
|  | nlist.set.clear(); | 
|  | 'LOOP: loop { | 
|  | if clist.set.is_empty() { | 
|  | // Three ways to bail out when our current set of threads is | 
|  | // empty. | 
|  | // | 
|  | // 1. We have a match---so we're done exploring any possible | 
|  | //    alternatives. Time to quit. (We can't do this if we're | 
|  | //    looking for matches for multiple regexes, unless we know | 
|  | //    they all matched.) | 
|  | // | 
|  | // 2. If the expression starts with a '^' we can terminate as | 
|  | //    soon as the last thread dies. | 
|  | if (matched && matches.len() <= 1) | 
|  | || all_matched | 
|  | || (!at.is_start() && self.prog.is_anchored_start) | 
|  | { | 
|  | break; | 
|  | } | 
|  |  | 
|  | // 3. If there's a literal prefix for the program, try to | 
|  | //    jump ahead quickly. If it can't be found, then we can | 
|  | //    bail out early. | 
|  | if !self.prog.prefixes.is_empty() { | 
|  | at = match self.input.prefix_at(&self.prog.prefixes, at) { | 
|  | None => break, | 
|  | Some(at) => at, | 
|  | }; | 
|  | } | 
|  | } | 
|  |  | 
|  | // This simulates a preceding '.*?' for every regex by adding | 
|  | // a state starting at the current position in the input for the | 
|  | // beginning of the program only if we don't already have a match. | 
|  | if clist.set.is_empty() | 
|  | || (!self.prog.is_anchored_start && !all_matched) | 
|  | { | 
|  | self.add(&mut clist, slots, 0, at); | 
|  | } | 
|  | // The previous call to "add" actually inspects the position just | 
|  | // before the current character. For stepping through the machine, | 
|  | // we can to look at the current character, so we advance the | 
|  | // input. | 
|  | let at_next = self.input.at(at.next_pos()); | 
|  | for i in 0..clist.set.len() { | 
|  | let ip = clist.set[i]; | 
|  | if self.step( | 
|  | &mut nlist, | 
|  | matches, | 
|  | slots, | 
|  | clist.caps(ip), | 
|  | ip, | 
|  | at, | 
|  | at_next, | 
|  | ) { | 
|  | matched = true; | 
|  | all_matched = all_matched || matches.iter().all(|&b| b); | 
|  | if quit_after_match { | 
|  | // If we only care if a match occurs (not its | 
|  | // position), then we can quit right now. | 
|  | break 'LOOP; | 
|  | } | 
|  | if self.prog.matches.len() == 1 { | 
|  | // We don't need to check the rest of the threads | 
|  | // in this set because we've matched something | 
|  | // ("leftmost-first"). However, we still need to check | 
|  | // threads in the next set to support things like | 
|  | // greedy matching. | 
|  | // | 
|  | // This is only true on normal regexes. For regex sets, | 
|  | // we need to mush on to observe other matches. | 
|  | break; | 
|  | } | 
|  | } | 
|  | } | 
|  | if at.pos() >= end { | 
|  | break; | 
|  | } | 
|  | at = at_next; | 
|  | mem::swap(clist, nlist); | 
|  | nlist.set.clear(); | 
|  | } | 
|  | matched | 
|  | } | 
|  |  | 
|  | /// Step through the input, one token (byte or codepoint) at a time. | 
|  | /// | 
|  | /// nlist is the set of states that will be processed on the next token | 
|  | /// in the input. | 
|  | /// | 
|  | /// caps is the set of captures passed by the caller of the NFA. They are | 
|  | /// written to only when a match state is visited. | 
|  | /// | 
|  | /// thread_caps is the set of captures set for the current NFA state, ip. | 
|  | /// | 
|  | /// at and at_next are the current and next positions in the input. at or | 
|  | /// at_next may be EOF. | 
|  | fn step( | 
|  | &mut self, | 
|  | nlist: &mut Threads, | 
|  | matches: &mut [bool], | 
|  | slots: &mut [Slot], | 
|  | thread_caps: &mut [Option<usize>], | 
|  | ip: usize, | 
|  | at: InputAt, | 
|  | at_next: InputAt, | 
|  | ) -> bool { | 
|  | use prog::Inst::*; | 
|  | match self.prog[ip] { | 
|  | Match(match_slot) => { | 
|  | if match_slot < matches.len() { | 
|  | matches[match_slot] = true; | 
|  | } | 
|  | for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) { | 
|  | *slot = *val; | 
|  | } | 
|  | true | 
|  | } | 
|  | Char(ref inst) => { | 
|  | if inst.c == at.char() { | 
|  | self.add(nlist, thread_caps, inst.goto, at_next); | 
|  | } | 
|  | false | 
|  | } | 
|  | Ranges(ref inst) => { | 
|  | if inst.matches(at.char()) { | 
|  | self.add(nlist, thread_caps, inst.goto, at_next); | 
|  | } | 
|  | false | 
|  | } | 
|  | Bytes(ref inst) => { | 
|  | if let Some(b) = at.byte() { | 
|  | if inst.matches(b) { | 
|  | self.add(nlist, thread_caps, inst.goto, at_next); | 
|  | } | 
|  | } | 
|  | false | 
|  | } | 
|  | EmptyLook(_) | Save(_) | Split(_) => false, | 
|  | } | 
|  | } | 
|  |  | 
|  | /// Follows epsilon transitions and adds them for processing to nlist, | 
|  | /// starting at and including ip. | 
|  | fn add( | 
|  | &mut self, | 
|  | nlist: &mut Threads, | 
|  | thread_caps: &mut [Option<usize>], | 
|  | ip: usize, | 
|  | at: InputAt, | 
|  | ) { | 
|  | self.stack.push(FollowEpsilon::IP(ip)); | 
|  | while let Some(frame) = self.stack.pop() { | 
|  | match frame { | 
|  | FollowEpsilon::IP(ip) => { | 
|  | self.add_step(nlist, thread_caps, ip, at); | 
|  | } | 
|  | FollowEpsilon::Capture { slot, pos } => { | 
|  | thread_caps[slot] = pos; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /// A helper function for add that avoids excessive pushing to the stack. | 
|  | fn add_step( | 
|  | &mut self, | 
|  | nlist: &mut Threads, | 
|  | thread_caps: &mut [Option<usize>], | 
|  | mut ip: usize, | 
|  | at: InputAt, | 
|  | ) { | 
|  | // Instead of pushing and popping to the stack, we mutate ip as we | 
|  | // traverse the set of states. We only push to the stack when we | 
|  | // absolutely need recursion (restoring captures or following a | 
|  | // branch). | 
|  | use prog::Inst::*; | 
|  | loop { | 
|  | // Don't visit states we've already added. | 
|  | if nlist.set.contains(ip) { | 
|  | return; | 
|  | } | 
|  | nlist.set.insert(ip); | 
|  | match self.prog[ip] { | 
|  | EmptyLook(ref inst) => { | 
|  | if self.input.is_empty_match(at, inst) { | 
|  | ip = inst.goto; | 
|  | } | 
|  | } | 
|  | Save(ref inst) => { | 
|  | if inst.slot < thread_caps.len() { | 
|  | self.stack.push(FollowEpsilon::Capture { | 
|  | slot: inst.slot, | 
|  | pos: thread_caps[inst.slot], | 
|  | }); | 
|  | thread_caps[inst.slot] = Some(at.pos()); | 
|  | } | 
|  | ip = inst.goto; | 
|  | } | 
|  | Split(ref inst) => { | 
|  | self.stack.push(FollowEpsilon::IP(inst.goto2)); | 
|  | ip = inst.goto1; | 
|  | } | 
|  | Match(_) | Char(_) | Ranges(_) | Bytes(_) => { | 
|  | let t = &mut nlist.caps(ip); | 
|  | for (slot, val) in t.iter_mut().zip(thread_caps.iter()) { | 
|  | *slot = *val; | 
|  | } | 
|  | return; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | impl Threads { | 
|  | fn new() -> Self { | 
|  | Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 } | 
|  | } | 
|  |  | 
|  | fn resize(&mut self, num_insts: usize, ncaps: usize) { | 
|  | if num_insts == self.set.capacity() { | 
|  | return; | 
|  | } | 
|  | self.slots_per_thread = ncaps * 2; | 
|  | self.set = SparseSet::new(num_insts); | 
|  | self.caps = vec![None; self.slots_per_thread * num_insts]; | 
|  | } | 
|  |  | 
|  | fn caps(&mut self, pc: usize) -> &mut [Option<usize>] { | 
|  | let i = pc * self.slots_per_thread; | 
|  | &mut self.caps[i..i + self.slots_per_thread] | 
|  | } | 
|  | } |