blob: 198e1f45c4e83e5564eac0b4f21c7547e30a40b9 [file] [log] [blame]
use std::collections::HashMap;
use std::cmp::Ordering;
use std::fmt;
use std::ops::Deref;
use std::mem;
use std::slice;
use std::sync::Arc;
use input::Char;
use literal::LiteralSearcher;
/// `InstPtr` represents the index of an instruction in a regex program.
pub type InstPtr = usize;
/// Program is a sequence of instructions and various facts about thos
/// instructions.
pub struct Program {
/// A sequence of instructions that represents an NFA.
pub insts: Vec<Inst>,
/// Pointers to each Match instruction in the sequence.
/// This is always length 1 unless this program represents a regex set.
pub matches: Vec<InstPtr>,
/// The ordered sequence of all capture groups extracted from the AST.
/// Unnamed groups are `None`.
pub captures: Vec<Option<String>>,
/// Pointers to all named capture groups into `captures`.
pub capture_name_idx: Arc<HashMap<String, usize>>,
/// A pointer to the start instruction. This can vary depending on how
/// the program was compiled. For example, programs for use with the DFA
/// engine have a `.*?` inserted at the beginning of unanchored regular
/// expressions. The actual starting point of the program is after the
/// `.*?`.
pub start: InstPtr,
/// A set of equivalence classes for discriminating bytes in the compiled
/// program.
pub byte_classes: Vec<u8>,
/// When true, this program can only match valid UTF-8.
pub only_utf8: bool,
/// When true, this program uses byte range instructions instead of Unicode
/// range instructions.
pub is_bytes: bool,
/// When true, the program is compiled for DFA matching. For example, this
/// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
/// regexes.
pub is_dfa: bool,
/// When true, the program matches text in reverse (for use only in the
/// DFA).
pub is_reverse: bool,
/// Whether the regex must match from the start of the input.
pub is_anchored_start: bool,
/// Whether the regex must match at the end of the input.
pub is_anchored_end: bool,
/// Whether this program contains a Unicode word boundary instruction.
pub has_unicode_word_boundary: bool,
/// A possibly empty machine for very quickly matching prefix literals.
pub prefixes: LiteralSearcher,
/// A limit on the size of the cache that the DFA is allowed to use while
/// matching.
/// The cache limit specifies approximately how much space we're willing to
/// give to the state cache. Once the state cache exceeds the size, it is
/// wiped and all states must be re-computed.
/// Note that this value does not impact correctness. It can be set to 0
/// and the DFA will run just fine. (It will only ever store exactly one
/// state in the cache, and will likely run very slowly, but it will work.)
/// Also note that this limit is *per thread of execution*. That is,
/// if the same regex is used to search text across multiple threads
/// simultaneously, then the DFA cache is not shared. Instead, copies are
/// made.
pub dfa_size_limit: usize,
impl Program {
/// Creates an empty instruction sequence. Fields are given default
/// values.
pub fn new() -> Self {
Program {
insts: vec![],
matches: vec![],
captures: vec![],
capture_name_idx: Arc::new(HashMap::new()),
start: 0,
byte_classes: vec![0; 256],
only_utf8: true,
is_bytes: false,
is_dfa: false,
is_reverse: false,
is_anchored_start: false,
is_anchored_end: false,
has_unicode_word_boundary: false,
prefixes: LiteralSearcher::empty(),
dfa_size_limit: 2 * (1<<20),
/// If pc is an index to a no-op instruction (like Save), then return the
/// next pc that is not a no-op instruction.
pub fn skip(&self, mut pc: usize) -> usize {
loop {
match self[pc] {
Inst::Save(ref i) => pc = i.goto,
_ => return pc,
/// Return true if and only if an execution engine at instruction `pc` will
/// always lead to a match.
pub fn leads_to_match(&self, pc: usize) -> bool {
if self.matches.len() > 1 {
// If we have a regex set, then we have more than one ending
// state, so leading to one of those states is generally
// meaningless.
return false;
match self[self.skip(pc)] {
Inst::Match(_) => true,
_ => false,
/// Returns true if the current configuration demands that an implicit
/// `.*?` be prepended to the instruction sequence.
pub fn needs_dotstar(&self) -> bool {
self.is_dfa && !self.is_reverse && !self.is_anchored_start
/// Returns true if this program uses Byte instructions instead of
/// Char/Range instructions.
pub fn uses_bytes(&self) -> bool {
self.is_bytes || self.is_dfa
/// Returns true if this program exclusively matches valid UTF-8 bytes.
/// That is, if an invalid UTF-8 byte is seen, then no match is possible.
pub fn only_utf8(&self) -> bool {
/// Return the approximate heap usage of this instruction sequence in
/// bytes.
pub fn approximate_size(&self) -> usize {
// The only instruction that uses heap space is Ranges (for
// Unicode codepoint programs) to store non-overlapping codepoint
// ranges. To keep this operation constant time, we ignore them.
(self.len() * mem::size_of::<Inst>())
+ (self.matches.len() * mem::size_of::<InstPtr>())
+ (self.captures.len() * mem::size_of::<Option<String>>())
+ (self.capture_name_idx.len() *
(mem::size_of::<String>() + mem::size_of::<usize>()))
+ (self.byte_classes.len() * mem::size_of::<u8>())
+ self.prefixes.approximate_size()
impl Deref for Program {
type Target = [Inst];
fn deref(&self) -> &Self::Target {
impl fmt::Debug for Program {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use self::Inst::*;
fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
if goto == cur + 1 {
} else {
format!("{} (goto: {})", fmtd, goto)
fn visible_byte(b: u8) -> String {
use std::ascii::escape_default;
let escaped = escape_default(b).collect::<Vec<u8>>();
for (pc, inst) in self.iter().enumerate() {
match *inst {
Match(slot) => {
write!(f, "{:04} Match({:?})", pc, slot)?
Save(ref inst) => {
let s = format!("{:04} Save({})", pc, inst.slot);
write!(f, "{}", with_goto(pc, inst.goto, s))?;
Split(ref inst) => {
f, "{:04} Split({}, {})", pc, inst.goto1, inst.goto2)?;
EmptyLook(ref inst) => {
let s = format!("{:?}", inst.look);
write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
Char(ref inst) => {
let s = format!("{:?}", inst.c);
write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
Ranges(ref inst) => {
let ranges = inst.ranges
.map(|r| format!("{:?}-{:?}", r.0, r.1))
.join(", ");
f, "{:04} {}", pc, with_goto(pc, inst.goto, ranges))?;
Bytes(ref inst) => {
let s = format!(
"Bytes({}, {})",
write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
if pc == self.start {
write!(f, " (start)")?;
write!(f, "\n")?;
impl<'a> IntoIterator for &'a Program {
type Item = &'a Inst;
type IntoIter = slice::Iter<'a, Inst>;
fn into_iter(self) -> Self::IntoIter { self.iter() }
/// Inst is an instruction code in a Regex program.
/// Regrettably, a regex program either contains Unicode codepoint
/// instructions (Char and Ranges) or it contains byte instructions (Bytes).
/// A regex program can never contain both.
/// It would be worth investigating splitting this into two distinct types and
/// then figuring out how to make the matching engines polymorphic over those
/// types without sacrificing performance.
/// Other than the benefit of moving invariants into the type system, another
/// benefit is the decreased size. If we remove the `Char` and `Ranges`
/// instructions from the `Inst` enum, then its size shrinks from 40 bytes to
/// 24 bytes. (This is because of the removal of a `Vec` in the `Ranges`
/// variant.) Given that byte based machines are typically much bigger than
/// their Unicode analogues (because they can decode UTF-8 directly), this ends
/// up being a pretty significant savings.
#[derive(Clone, Debug)]
pub enum Inst {
/// Match indicates that the program has reached a match state.
/// The number in the match corresponds to the Nth logical regular
/// expression in this program. This index is always 0 for normal regex
/// programs. Values greater than 0 appear when compiling regex sets, and
/// each match instruction gets its own unique value. The value corresponds
/// to the Nth regex in the set.
/// Save causes the program to save the current location of the input in
/// the slot indicated by InstSave.
/// Split causes the program to diverge to one of two paths in the
/// program, preferring goto1 in InstSplit.
/// EmptyLook represents a zero-width assertion in a regex program. A
/// zero-width assertion does not consume any of the input text.
/// Char requires the regex program to match the character in InstChar at
/// the current position in the input.
/// Ranges requires the regex program to match the character at the current
/// position in the input with one of the ranges specified in InstRanges.
/// Bytes is like Ranges, except it expresses a single byte range. It is
/// used in conjunction with Split instructions to implement multi-byte
/// character classes.
impl Inst {
/// Returns true if and only if this is a match instruction.
pub fn is_match(&self) -> bool {
match *self {
Inst::Match(_) => true,
_ => false,
/// Representation of the Save instruction.
#[derive(Clone, Debug)]
pub struct InstSave {
/// The next location to execute in the program.
pub goto: InstPtr,
/// The capture slot (there are two slots for every capture in a regex,
/// including the zeroth capture for the entire match).
pub slot: usize,
/// Representation of the Split instruction.
#[derive(Clone, Debug)]
pub struct InstSplit {
/// The first instruction to try. A match resulting from following goto1
/// has precedence over a match resulting from following goto2.
pub goto1: InstPtr,
/// The second instruction to try. A match resulting from following goto1
/// has precedence over a match resulting from following goto2.
pub goto2: InstPtr,
/// Representation of the `EmptyLook` instruction.
#[derive(Clone, Debug)]
pub struct InstEmptyLook {
/// The next location to execute in the program if this instruction
/// succeeds.
pub goto: InstPtr,
/// The type of zero-width assertion to check.
pub look: EmptyLook,
/// The set of zero-width match instructions.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum EmptyLook {
/// Start of line or input.
/// End of line or input.
/// Start of input.
/// End of input.
/// Word character on one side and non-word character on other.
/// Word character on both sides or non-word character on both sides.
/// ASCII word boundary.
/// Not ASCII word boundary.
/// Representation of the Char instruction.
#[derive(Clone, Debug)]
pub struct InstChar {
/// The next location to execute in the program if this instruction
/// succeeds.
pub goto: InstPtr,
/// The character to test.
pub c: char,
/// Representation of the Ranges instruction.
#[derive(Clone, Debug)]
pub struct InstRanges {
/// The next location to execute in the program if this instruction
/// succeeds.
pub goto: InstPtr,
/// The set of Unicode scalar value ranges to test.
pub ranges: Vec<(char, char)>,
impl InstRanges {
/// Tests whether the given input character matches this instruction.
pub fn matches(&self, c: Char) -> bool {
// This speeds up the `match_class_unicode` benchmark by checking
// some common cases quickly without binary search. e.g., Matching
// a Unicode class on predominantly ASCII text.
for r in self.ranges.iter().take(4) {
if c < r.0 {
return false;
if c <= r.1 {
return true;
self.ranges.binary_search_by(|r| {
if r.1 < c {
} else if r.0 > c {
} else {
/// Return the number of distinct characters represented by all of the
/// ranges.
pub fn num_chars(&self) -> usize {
.map(|&(s, e)| 1 + (e as u32) - (s as u32))
.fold(0, |acc, len| acc + len)
as usize
/// Representation of the Bytes instruction.
#[derive(Clone, Debug)]
pub struct InstBytes {
/// The next location to execute in the program if this instruction
/// succeeds.
pub goto: InstPtr,
/// The start (inclusive) of this byte range.
pub start: u8,
/// The end (inclusive) of this byte range.
pub end: u8,
impl InstBytes {
/// Returns true if and only if the given byte is in this range.
pub fn matches(&self, byte: u8) -> bool {
self.start <= byte && byte <= self.end