|  | // This is the backtracking matching engine. It has the same exact capability | 
|  | // as the full NFA simulation, except it is artificially restricted to small | 
|  | // regexes on small inputs because of its memory requirements. | 
|  | // | 
|  | // In particular, this is a *bounded* backtracking engine. It retains worst | 
|  | // case linear time by keeping track of the states that it has visited (using a | 
|  | // bitmap). Namely, once a state is visited, it is never visited again. Since a | 
|  | // state is keyed by `(instruction index, input index)`, we have that its time | 
|  | // complexity is `O(mn)` (i.e., linear in the size of the search text). | 
|  | // | 
|  | // The backtracking engine can beat out the NFA simulation on small | 
|  | // regexes/inputs because it doesn't have to keep track of multiple copies of | 
|  | // the capture groups. In benchmarks, the backtracking engine is roughly twice | 
|  | // as fast as the full NFA simulation. Note though that its performance doesn't | 
|  | // scale, even if you're willing to live with the memory requirements. Namely, | 
|  | // the bitset has to be zeroed on each execution, which becomes quite expensive | 
|  | // on large bitsets. | 
|  |  | 
|  | use exec::ProgramCache; | 
|  | use input::{Input, InputAt}; | 
|  | use prog::{InstPtr, Program}; | 
|  | use re_trait::Slot; | 
|  |  | 
|  | type Bits = u32; | 
|  |  | 
|  | const BIT_SIZE: usize = 32; | 
|  | const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB | 
|  |  | 
|  | /// Returns true iff the given regex and input should be executed by this | 
|  | /// engine with reasonable memory usage. | 
|  | pub fn should_exec(num_insts: usize, text_len: usize) -> bool { | 
|  | // Total memory usage in bytes is determined by: | 
|  | // | 
|  | //   ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32)) | 
|  | // | 
|  | // The actual limit picked is pretty much a heuristic. | 
|  | // See: https://github.com/rust-lang/regex/issues/215 | 
|  | let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4; | 
|  | size <= MAX_SIZE_BYTES | 
|  | } | 
|  |  | 
|  | /// A backtracking matching engine. | 
|  | #[derive(Debug)] | 
|  | pub struct Bounded<'a, 'm, 'r, 's, I> { | 
|  | prog: &'r Program, | 
|  | input: I, | 
|  | matches: &'m mut [bool], | 
|  | slots: &'s mut [Slot], | 
|  | m: &'a mut Cache, | 
|  | } | 
|  |  | 
|  | /// Shared cached state between multiple invocations of a backtracking engine | 
|  | /// in the same thread. | 
|  | #[derive(Clone, Debug)] | 
|  | pub struct Cache { | 
|  | jobs: Vec<Job>, | 
|  | visited: Vec<Bits>, | 
|  | } | 
|  |  | 
|  | impl Cache { | 
|  | /// Create new empty cache for the backtracking engine. | 
|  | pub fn new(_prog: &Program) -> Self { | 
|  | Cache { jobs: vec![], visited: vec![] } | 
|  | } | 
|  | } | 
|  |  | 
|  | /// A job is an explicit unit of stack space in the backtracking engine. | 
|  | /// | 
|  | /// The "normal" representation is a single state transition, which corresponds | 
|  | /// to an NFA state and a character in the input. However, the backtracking | 
|  | /// engine must keep track of old capture group values. We use the explicit | 
|  | /// stack to do it. | 
|  | #[derive(Clone, Copy, Debug)] | 
|  | enum Job { | 
|  | Inst { ip: InstPtr, at: InputAt }, | 
|  | SaveRestore { slot: usize, old_pos: Option<usize> }, | 
|  | } | 
|  |  | 
|  | impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { | 
|  | /// Execute the backtracking matching engine. | 
|  | /// | 
|  | /// If there's a match, `exec` returns `true` and populates the given | 
|  | /// captures accordingly. | 
|  | pub fn exec( | 
|  | prog: &'r Program, | 
|  | cache: &ProgramCache, | 
|  | matches: &'m mut [bool], | 
|  | slots: &'s mut [Slot], | 
|  | input: I, | 
|  | start: usize, | 
|  | end: usize, | 
|  | ) -> bool { | 
|  | let mut cache = cache.borrow_mut(); | 
|  | let cache = &mut cache.backtrack; | 
|  | let start = input.at(start); | 
|  | let mut b = Bounded { | 
|  | prog: prog, | 
|  | input: input, | 
|  | matches: matches, | 
|  | slots: slots, | 
|  | m: cache, | 
|  | }; | 
|  | b.exec_(start, end) | 
|  | } | 
|  |  | 
|  | /// Clears the cache such that the backtracking engine can be executed | 
|  | /// on some input of fixed length. | 
|  | fn clear(&mut self) { | 
|  | // Reset the job memory so that we start fresh. | 
|  | self.m.jobs.clear(); | 
|  |  | 
|  | // Now we need to clear the bit state set. | 
|  | // We do this by figuring out how much space we need to keep track | 
|  | // of the states we've visited. | 
|  | // Then we reset all existing allocated space to 0. | 
|  | // Finally, we request more space if we need it. | 
|  | // | 
|  | // This is all a little circuitous, but doing this unsafely | 
|  | // doesn't seem to have a measurable impact on performance. | 
|  | // (Probably because backtracking is limited to such small | 
|  | // inputs/regexes in the first place.) | 
|  | let visited_len = | 
|  | (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1) | 
|  | / BIT_SIZE; | 
|  | self.m.visited.truncate(visited_len); | 
|  | for v in &mut self.m.visited { | 
|  | *v = 0; | 
|  | } | 
|  | if visited_len > self.m.visited.len() { | 
|  | let len = self.m.visited.len(); | 
|  | self.m.visited.reserve_exact(visited_len - len); | 
|  | for _ in 0..(visited_len - len) { | 
|  | self.m.visited.push(0); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /// Start backtracking at the given position in the input, but also look | 
|  | /// for literal prefixes. | 
|  | fn exec_(&mut self, mut at: InputAt, end: usize) -> bool { | 
|  | self.clear(); | 
|  | // If this is an anchored regex at the beginning of the input, then | 
|  | // we're either already done or we only need to try backtracking once. | 
|  | if self.prog.is_anchored_start { | 
|  | return if !at.is_start() { false } else { self.backtrack(at) }; | 
|  | } | 
|  | let mut matched = false; | 
|  | loop { | 
|  | if !self.prog.prefixes.is_empty() { | 
|  | at = match self.input.prefix_at(&self.prog.prefixes, at) { | 
|  | None => break, | 
|  | Some(at) => at, | 
|  | }; | 
|  | } | 
|  | matched = self.backtrack(at) || matched; | 
|  | if matched && self.prog.matches.len() == 1 { | 
|  | return true; | 
|  | } | 
|  | if at.pos() >= end { | 
|  | break; | 
|  | } | 
|  | at = self.input.at(at.next_pos()); | 
|  | } | 
|  | matched | 
|  | } | 
|  |  | 
|  | /// The main backtracking loop starting at the given input position. | 
|  | fn backtrack(&mut self, start: InputAt) -> bool { | 
|  | // N.B. We use an explicit stack to avoid recursion. | 
|  | // To avoid excessive pushing and popping, most transitions are handled | 
|  | // in the `step` helper function, which only pushes to the stack when | 
|  | // there's a capture or a branch. | 
|  | let mut matched = false; | 
|  | self.m.jobs.push(Job::Inst { ip: 0, at: start }); | 
|  | while let Some(job) = self.m.jobs.pop() { | 
|  | match job { | 
|  | Job::Inst { ip, at } => { | 
|  | if self.step(ip, at) { | 
|  | // Only quit if we're matching one regex. | 
|  | // If we're matching a regex set, then mush on and | 
|  | // try to find other matches (if we want them). | 
|  | if self.prog.matches.len() == 1 { | 
|  | return true; | 
|  | } | 
|  | matched = true; | 
|  | } | 
|  | } | 
|  | Job::SaveRestore { slot, old_pos } => { | 
|  | if slot < self.slots.len() { | 
|  | self.slots[slot] = old_pos; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  | matched | 
|  | } | 
|  |  | 
|  | fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool { | 
|  | use prog::Inst::*; | 
|  | loop { | 
|  | // This loop is an optimization to avoid constantly pushing/popping | 
|  | // from the stack. Namely, if we're pushing a job only to run it | 
|  | // next, avoid the push and just mutate `ip` (and possibly `at`) | 
|  | // in place. | 
|  | if self.has_visited(ip, at) { | 
|  | return false; | 
|  | } | 
|  | match self.prog[ip] { | 
|  | Match(slot) => { | 
|  | if slot < self.matches.len() { | 
|  | self.matches[slot] = true; | 
|  | } | 
|  | return true; | 
|  | } | 
|  | Save(ref inst) => { | 
|  | if let Some(&old_pos) = self.slots.get(inst.slot) { | 
|  | // If this path doesn't work out, then we save the old | 
|  | // capture index (if one exists) in an alternate | 
|  | // job. If the next path fails, then the alternate | 
|  | // job is popped and the old capture index is restored. | 
|  | self.m.jobs.push(Job::SaveRestore { | 
|  | slot: inst.slot, | 
|  | old_pos: old_pos, | 
|  | }); | 
|  | self.slots[inst.slot] = Some(at.pos()); | 
|  | } | 
|  | ip = inst.goto; | 
|  | } | 
|  | Split(ref inst) => { | 
|  | self.m.jobs.push(Job::Inst { ip: inst.goto2, at: at }); | 
|  | ip = inst.goto1; | 
|  | } | 
|  | EmptyLook(ref inst) => { | 
|  | if self.input.is_empty_match(at, inst) { | 
|  | ip = inst.goto; | 
|  | } else { | 
|  | return false; | 
|  | } | 
|  | } | 
|  | Char(ref inst) => { | 
|  | if inst.c == at.char() { | 
|  | ip = inst.goto; | 
|  | at = self.input.at(at.next_pos()); | 
|  | } else { | 
|  | return false; | 
|  | } | 
|  | } | 
|  | Ranges(ref inst) => { | 
|  | if inst.matches(at.char()) { | 
|  | ip = inst.goto; | 
|  | at = self.input.at(at.next_pos()); | 
|  | } else { | 
|  | return false; | 
|  | } | 
|  | } | 
|  | Bytes(ref inst) => { | 
|  | if let Some(b) = at.byte() { | 
|  | if inst.matches(b) { | 
|  | ip = inst.goto; | 
|  | at = self.input.at(at.next_pos()); | 
|  | continue; | 
|  | } | 
|  | } | 
|  | return false; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool { | 
|  | let k = ip * (self.input.len() + 1) + at.pos(); | 
|  | let k1 = k / BIT_SIZE; | 
|  | let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1))); | 
|  | if self.m.visited[k1] & k2 == 0 { | 
|  | self.m.visited[k1] |= k2; | 
|  | false | 
|  | } else { | 
|  | true | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | fn usize_to_u32(n: usize) -> u32 { | 
|  | if (n as u64) > (::std::u32::MAX as u64) { | 
|  | panic!("BUG: {} is too big to fit into u32", n) | 
|  | } | 
|  | n as u32 | 
|  | } |