// blob: 6f035268701895a67d6f497c0592fc597c22d8da
// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
/// The set of user configurable options for compiling zero or more regexes.
///
/// The builders in this file wrap one of these values and write to its
/// fields directly. A freshly created builder starts from
/// `RegexOptions::default()`.
#[derive(Clone, Debug)]
pub struct RegexOptions {
    /// The patterns to compile, in the order they were supplied.
    pub pats: Vec<String>,
    /// Approximate size limit of a single compiled program, roughly in
    /// bytes. Defaults to `10 * (1 << 20)`.
    pub size_limit: usize,
    /// Approximate size of the cache used by the DFA, roughly in bytes.
    /// Defaults to `2 * (1 << 20)`.
    pub dfa_size_limit: usize,
    /// Maximum nesting depth permitted by the parser. Defaults to `250`.
    pub nest_limit: u32,
    /// Value of the case insensitive (`i`) flag. Disabled by default.
    pub case_insensitive: bool,
    /// Value of the multi-line matching (`m`) flag. Disabled by default.
    pub multi_line: bool,
    /// Value of the any character (`s`) flag. Disabled by default.
    pub dot_matches_new_line: bool,
    /// Value of the greedy swap (`U`) flag. Disabled by default.
    pub swap_greed: bool,
    /// Value of the ignore whitespace (`x`) flag. Disabled by default.
    pub ignore_whitespace: bool,
    /// Value of the Unicode (`u`) flag. Enabled by default.
    pub unicode: bool,
    /// Whether octal syntax is supported. Disabled by default.
    pub octal: bool,
}

impl Default for RegexOptions {
    /// Returns the baseline configuration: no patterns, a program size limit
    /// of `10 * (1 << 20)`, a DFA cache limit of `2 * (1 << 20)`, a nest
    /// limit of `250`, the Unicode flag enabled and every other flag
    /// disabled.
    fn default() -> Self {
        RegexOptions {
            pats: vec![],
            size_limit: 10 * (1 << 20),
            dfa_size_limit: 2 * (1 << 20),
            nest_limit: 250,
            case_insensitive: false,
            multi_line: false,
            dot_matches_new_line: false,
            swap_greed: false,
            ignore_whitespace: false,
            unicode: true,
            octal: false,
        }
    }
}
// Generates a public module named `$name` that holds a `RegexBuilder` type.
//
// * `$name`      - name of the generated module.
// * `$regex_mod` - crate-level module whose `Regex` type `build` returns.
// * `$only_utf8` - value handed to `ExecBuilder::only_utf8` when building.
macro_rules! define_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use error::Error;
            use exec::ExecBuilder;
            use super::RegexOptions;
            use $regex_mod::Regex;

            /// A configurable builder for a regular expression.
            ///
            /// A builder can be used to configure how the regex is built, for
            /// example, by setting the default flags (which can be overridden
            /// in the expression itself) or setting various limits.
            #[derive(Clone, Debug)]
            pub struct RegexBuilder(RegexOptions);

            impl RegexBuilder {
                /// Create a new regular expression builder with the given
                /// pattern.
                ///
                /// If the pattern is invalid, then an error will be returned
                /// when `build` is called.
                pub fn new(pattern: &str) -> RegexBuilder {
                    let mut builder = RegexBuilder(RegexOptions::default());
                    builder.0.pats.push(pattern.to_owned());
                    builder
                }

                /// Compile the regular expression using the options set on
                /// this builder.
                ///
                /// The builder is only borrowed: its options are cloned for
                /// the compilation, so the same builder can be adjusted and
                /// built again afterwards.
                ///
                /// Note that calling `as_str` on the resulting `Regex` will
                /// produce the pattern given to `new` verbatim. Notably, it
                /// will not incorporate any of the flags set on this builder.
                ///
                /// # Errors
                ///
                /// Returns an error if compilation fails, for example when
                /// the pattern is invalid.
                pub fn build(&self) -> Result<Regex, Error> {
                    ExecBuilder::new_options(self.0.clone())
                        .only_utf8($only_utf8)
                        .build()
                        .map(Regex::from)
                }

                /// Set the value for the case insensitive (`i`) flag.
                ///
                /// When enabled, letters in the pattern will match both upper
                /// case and lower case variants.
                pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.case_insensitive = yes;
                    self
                }

                /// Set the value for the multi-line matching (`m`) flag.
                ///
                /// When enabled, `^` matches the beginning of lines and `$`
                /// matches the end of lines.
                ///
                /// By default, they match beginning/end of the input.
                pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.multi_line = yes;
                    self
                }

                /// Set the value for the any character (`s`) flag, where in
                /// `.` matches anything when `s` is set and matches anything
                /// except for new line when it is not set (the default).
                ///
                /// N.B. "matches anything" means "any byte" when Unicode is
                /// disabled and means "any valid UTF-8 encoding of any Unicode
                /// scalar value" when Unicode is enabled.
                pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.dot_matches_new_line = yes;
                    self
                }

                /// Set the value for the greedy swap (`U`) flag.
                ///
                /// When enabled, a pattern like `a*` is lazy (tries to find
                /// shortest match) and `a*?` is greedy (tries to find longest
                /// match).
                ///
                /// By default, `a*` is greedy and `a*?` is lazy.
                pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.swap_greed = yes;
                    self
                }

                /// Set the value for the ignore whitespace (`x`) flag.
                ///
                /// When enabled, whitespace such as new lines and spaces will
                /// be ignored between expressions of the pattern, and `#` can
                /// be used to start a comment until the next new line.
                pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.ignore_whitespace = yes;
                    self
                }

                /// Set the value for the Unicode (`u`) flag.
                ///
                /// Enabled by default. When disabled, character classes such
                /// as `\w` only match ASCII word characters instead of all
                /// Unicode word characters.
                pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.unicode = yes;
                    self
                }

                /// Whether to support octal syntax or not.
                ///
                /// Octal syntax is a little-known way of uttering Unicode
                /// codepoints in a regular expression. For example, `a`,
                /// `\x61`, `\u0061` and `\141` are all equivalent regular
                /// expressions, where the last example shows octal syntax.
                ///
                /// While supporting octal syntax isn't in and of itself a
                /// problem, it does make good error messages harder. That is,
                /// in PCRE based regex engines, syntax like `\0` invokes a
                /// backreference, which is explicitly unsupported in Rust's
                /// regex engine. However, many users expect it to be
                /// supported. Therefore, when octal support is disabled, the
                /// error message will explicitly mention that backreferences
                /// aren't supported.
                ///
                /// Octal syntax is disabled by default.
                pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.octal = yes;
                    self
                }

                /// Set the approximate size limit of the compiled regular
                /// expression.
                ///
                /// This roughly corresponds to the number of bytes occupied by
                /// a single compiled program. If the program exceeds this
                /// number, then a compilation error is returned.
                pub fn size_limit(&mut self, limit: usize) -> &mut RegexBuilder {
                    self.0.size_limit = limit;
                    self
                }

                /// Set the approximate size of the cache used by the DFA.
                ///
                /// This roughly corresponds to the number of bytes that the
                /// DFA will use while searching.
                ///
                /// Note that this is a *per thread* limit. There is no way to
                /// set a global limit. In particular, if a regex is used from
                /// multiple threads simultaneously, then each thread may use
                /// up to the number of bytes specified here.
                pub fn dfa_size_limit(&mut self, limit: usize) -> &mut RegexBuilder {
                    self.0.dfa_size_limit = limit;
                    self
                }

                /// Set the nesting limit for this parser.
                ///
                /// The nesting limit controls how deep the abstract syntax
                /// tree is allowed to be. If the AST exceeds the given limit
                /// (e.g., with too many nested groups), then an error is
                /// returned by the parser.
                ///
                /// The purpose of this limit is to act as a heuristic to
                /// prevent stack overflow for consumers that do structural
                /// induction on an `Ast` using explicit recursion. While this
                /// crate never does this (instead using constant stack space
                /// and moving the call stack to the heap), other crates may.
                ///
                /// This limit is not checked until the entire Ast is parsed.
                /// Therefore, if callers want to put a limit on the amount of
                /// heap space used, then they should impose a limit on the
                /// length, in bytes, of the concrete pattern string. In
                /// particular, this is viable since this parser implementation
                /// will limit itself to heap space proportional to the length
                /// of the pattern string.
                ///
                /// Note that a nest limit of `0` will return a nest limit
                /// error for most patterns but not all. For example, a nest
                /// limit of `0` permits `a` but not `ab`, since `ab` requires
                /// a concatenation, which results in a nest depth of `1`. In
                /// general, a nest limit is not something that manifests in an
                /// obvious way in the concrete syntax, therefore, it should
                /// not be used in a granular way.
                pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
                    self.0.nest_limit = limit;
                    self
                }
            }
        }
    }
}
// Expands to `pub mod bytes`, whose `RegexBuilder` yields `re_bytes::Regex`
// and passes `false` to `ExecBuilder::only_utf8`.
define_builder!(bytes, re_bytes, false);
// Expands to `pub mod unicode`, whose `RegexBuilder` yields
// `re_unicode::Regex` and passes `true` to `ExecBuilder::only_utf8`.
define_builder!(unicode, re_unicode, true);
// Generates a public module named `$name` that holds a `RegexSetBuilder`
// type.
//
// * `$name`      - name of the generated module.
// * `$regex_mod` - submodule of `re_set` whose `RegexSet` type `build`
//                  returns.
// * `$only_utf8` - value handed to `ExecBuilder::only_utf8` when building.
macro_rules! define_set_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use error::Error;
            use exec::ExecBuilder;
            use super::RegexOptions;
            use re_set::$regex_mod::RegexSet;

            /// A configurable builder for a set of regular expressions.
            ///
            /// A builder can be used to configure how the regexes are built,
            /// for example, by setting the default flags (which can be
            /// overridden in the expression itself) or setting various
            /// limits.
            #[derive(Clone, Debug)]
            pub struct RegexSetBuilder(RegexOptions);

            impl RegexSetBuilder {
                /// Create a new regular expression set builder with the given
                /// patterns.
                ///
                /// If a pattern is invalid, then an error will be returned
                /// when `build` is called.
                pub fn new<I, S>(patterns: I) -> RegexSetBuilder
                    where S: AsRef<str>, I: IntoIterator<Item=S>
                {
                    let mut builder = RegexSetBuilder(RegexOptions::default());
                    builder.0.pats.extend(
                        patterns.into_iter().map(|pat| pat.as_ref().to_owned()),
                    );
                    builder
                }

                /// Compile the regular expressions into a set using the
                /// options set on this builder.
                ///
                /// The builder is only borrowed: its options are cloned for
                /// the compilation, so the same builder can be adjusted and
                /// built again afterwards.
                ///
                /// # Errors
                ///
                /// Returns an error if compilation fails, for example when a
                /// pattern is invalid.
                pub fn build(&self) -> Result<RegexSet, Error> {
                    ExecBuilder::new_options(self.0.clone())
                        .only_utf8($only_utf8)
                        .build()
                        .map(RegexSet::from)
                }

                /// Set the value for the case insensitive (`i`) flag.
                ///
                /// When enabled, letters in the pattern will match both upper
                /// case and lower case variants.
                pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.case_insensitive = yes;
                    self
                }

                /// Set the value for the multi-line matching (`m`) flag.
                ///
                /// When enabled, `^` matches the beginning of lines and `$`
                /// matches the end of lines.
                ///
                /// By default, they match beginning/end of the input.
                pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.multi_line = yes;
                    self
                }

                /// Set the value for the any character (`s`) flag, where in
                /// `.` matches anything when `s` is set and matches anything
                /// except for new line when it is not set (the default).
                ///
                /// N.B. "matches anything" means "any byte" for
                /// `regex::bytes::RegexSet` expressions and means "any Unicode
                /// scalar value" for `regex::RegexSet` expressions.
                pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.dot_matches_new_line = yes;
                    self
                }

                /// Set the value for the greedy swap (`U`) flag.
                ///
                /// When enabled, a pattern like `a*` is lazy (tries to find
                /// shortest match) and `a*?` is greedy (tries to find longest
                /// match).
                ///
                /// By default, `a*` is greedy and `a*?` is lazy.
                pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.swap_greed = yes;
                    self
                }

                /// Set the value for the ignore whitespace (`x`) flag.
                ///
                /// When enabled, whitespace such as new lines and spaces will
                /// be ignored between expressions of the pattern, and `#` can
                /// be used to start a comment until the next new line.
                pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.ignore_whitespace = yes;
                    self
                }

                /// Set the value for the Unicode (`u`) flag.
                ///
                /// Enabled by default. When disabled, character classes such
                /// as `\w` only match ASCII word characters instead of all
                /// Unicode word characters.
                pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.unicode = yes;
                    self
                }

                /// Whether to support octal syntax or not.
                ///
                /// Octal syntax is a little-known way of uttering Unicode
                /// codepoints in a regular expression. For example, `a`,
                /// `\x61`, `\u0061` and `\141` are all equivalent regular
                /// expressions, where the last example shows octal syntax.
                ///
                /// While supporting octal syntax isn't in and of itself a
                /// problem, it does make good error messages harder. That is,
                /// in PCRE based regex engines, syntax like `\0` invokes a
                /// backreference, which is explicitly unsupported in Rust's
                /// regex engine. However, many users expect it to be
                /// supported. Therefore, when octal support is disabled, the
                /// error message will explicitly mention that backreferences
                /// aren't supported.
                ///
                /// Octal syntax is disabled by default.
                pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.octal = yes;
                    self
                }

                /// Set the approximate size limit of the compiled regular
                /// expression.
                ///
                /// This roughly corresponds to the number of bytes occupied by
                /// a single compiled program. If the program exceeds this
                /// number, then a compilation error is returned.
                pub fn size_limit(&mut self, limit: usize) -> &mut RegexSetBuilder {
                    self.0.size_limit = limit;
                    self
                }

                /// Set the approximate size of the cache used by the DFA.
                ///
                /// This roughly corresponds to the number of bytes that the
                /// DFA will use while searching.
                ///
                /// Note that this is a *per thread* limit. There is no way to
                /// set a global limit. In particular, if a regex is used from
                /// multiple threads simultaneously, then each thread may use
                /// up to the number of bytes specified here.
                pub fn dfa_size_limit(&mut self, limit: usize) -> &mut RegexSetBuilder {
                    self.0.dfa_size_limit = limit;
                    self
                }

                /// Set the nesting limit for this parser.
                ///
                /// The nesting limit controls how deep the abstract syntax
                /// tree is allowed to be. If the AST exceeds the given limit
                /// (e.g., with too many nested groups), then an error is
                /// returned by the parser.
                ///
                /// The purpose of this limit is to act as a heuristic to
                /// prevent stack overflow for consumers that do structural
                /// induction on an `Ast` using explicit recursion. While this
                /// crate never does this (instead using constant stack space
                /// and moving the call stack to the heap), other crates may.
                ///
                /// This limit is not checked until the entire Ast is parsed.
                /// Therefore, if callers want to put a limit on the amount of
                /// heap space used, then they should impose a limit on the
                /// length, in bytes, of the concrete pattern string. In
                /// particular, this is viable since this parser implementation
                /// will limit itself to heap space proportional to the length
                /// of the pattern string.
                ///
                /// Note that a nest limit of `0` will return a nest limit
                /// error for most patterns but not all. For example, a nest
                /// limit of `0` permits `a` but not `ab`, since `ab` requires
                /// a concatenation, which results in a nest depth of `1`. In
                /// general, a nest limit is not something that manifests in an
                /// obvious way in the concrete syntax, therefore, it should
                /// not be used in a granular way.
                pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
                    self.0.nest_limit = limit;
                    self
                }
            }
        }
    }
}
// Expands to `pub mod set_bytes`, whose `RegexSetBuilder` yields
// `re_set::bytes::RegexSet` and passes `false` to `ExecBuilder::only_utf8`.
define_set_builder!(set_bytes, bytes, false);
// Expands to `pub mod set_unicode`, whose `RegexSetBuilder` yields
// `re_set::unicode::RegexSet` and passes `true` to `ExecBuilder::only_utf8`.
define_set_builder!(set_unicode, unicode, true);