blob: 2469890684a8d3e201d60e91fa79acb96acca629 [file] [log] [blame]
/*!
Defines a translator that converts an `Ast` to an `Hir`.
*/
use std::cell::{Cell, RefCell};
use std::result;
use ast::{self, Ast, Span, Visitor};
use hir::{self, Error, ErrorKind, Hir};
use unicode::{self, ClassQuery};
type Result<T> = result::Result<T, Error>;
/// A builder for constructing an AST->HIR translator.
#[derive(Clone, Debug)]
pub struct TranslatorBuilder {
allow_invalid_utf8: bool,
flags: Flags,
}
impl Default for TranslatorBuilder {
fn default() -> TranslatorBuilder {
TranslatorBuilder::new()
}
}
impl TranslatorBuilder {
/// Create a new translator builder with a default c onfiguration.
pub fn new() -> TranslatorBuilder {
TranslatorBuilder {
allow_invalid_utf8: false,
flags: Flags::default(),
}
}
/// Build a translator using the current configuration.
pub fn build(&self) -> Translator {
Translator {
stack: RefCell::new(vec![]),
flags: Cell::new(self.flags),
allow_invalid_utf8: self.allow_invalid_utf8,
}
}
/// When enabled, translation will permit the construction of a regular
/// expression that may match invalid UTF-8.
///
/// When disabled (the default), the translator is guaranteed to produce
/// an expression that will only ever match valid UTF-8 (otherwise, the
/// translator will return an error).
///
/// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII
/// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause
/// the parser to return an error. Namely, a negated ASCII word boundary
/// can result in matching positions that aren't valid UTF-8 boundaries.
pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
self.allow_invalid_utf8 = yes;
self
}
/// Enable or disable the case insensitive flag (`i`) by default.
pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
self.flags.case_insensitive = if yes { Some(true) } else { None };
self
}
/// Enable or disable the multi-line matching flag (`m`) by default.
pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder {
self.flags.multi_line = if yes { Some(true) } else { None };
self
}
/// Enable or disable the "dot matches any character" flag (`s`) by
/// default.
pub fn dot_matches_new_line(
&mut self,
yes: bool,
) -> &mut TranslatorBuilder {
self.flags.dot_matches_new_line = if yes { Some(true) } else { None };
self
}
/// Enable or disable the "swap greed" flag (`U`) by default.
pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder {
self.flags.swap_greed = if yes { Some(true) } else { None };
self
}
/// Enable or disable the Unicode flag (`u`) by default.
pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder {
self.flags.unicode = if yes { None } else { Some(false) };
self
}
}
/// A translator maps abstract syntax to a high level intermediate
/// representation.
///
/// A translator may be benefit from reuse. That is, a translator can translate
/// many abstract syntax trees.
///
/// A `Translator` can be configured in more detail via a
/// [`TranslatorBuilder`](struct.TranslatorBuilder.html).
#[derive(Clone, Debug)]
pub struct Translator {
/// Our call stack, but on the heap.
stack: RefCell<Vec<HirFrame>>,
/// The current flag settings.
flags: Cell<Flags>,
/// Whether we're allowed to produce HIR that can match arbitrary bytes.
allow_invalid_utf8: bool,
}
impl Translator {
/// Create a new translator using the default configuration.
pub fn new() -> Translator {
TranslatorBuilder::new().build()
}
/// Translate the given abstract syntax tree (AST) into a high level
/// intermediate representation (HIR).
///
/// If there was a problem doing the translation, then an HIR-specific
/// error is returned.
///
/// The original pattern string used to produce the `Ast` *must* also be
/// provided. The translator does not use the pattern string during any
/// correct translation, but is used for error reporting.
pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> {
ast::visit(ast, TranslatorI::new(self, pattern))
}
}
/// An HirFrame is a single stack frame, represented explicitly, which is
/// created for each item in the Ast that we traverse.
///
/// Note that technically, this type doesn't represent our entire stack
/// frame. In particular, the Ast visitor represents any state associated with
/// traversing the Ast itself.
#[derive(Clone, Debug)]
enum HirFrame {
/// An arbitrary HIR expression. These get pushed whenever we hit a base
/// case in the Ast. They get popped after an inductive (i.e., recursive)
/// step is complete.
Expr(Hir),
/// A Unicode character class. This frame is mutated as we descend into
/// the Ast of a character class (which is itself its own mini recursive
/// structure).
ClassUnicode(hir::ClassUnicode),
/// A byte-oriented character class. This frame is mutated as we descend
/// into the Ast of a character class (which is itself its own mini
/// recursive structure).
///
/// Byte character classes are created when Unicode mode (`u`) is disabled.
/// If `allow_invalid_utf8` is disabled (the default), then a byte
/// character is only permitted to match ASCII text.
ClassBytes(hir::ClassBytes),
/// This is pushed on to the stack upon first seeing any kind of group,
/// indicated by parentheses (including non-capturing groups). It is popped
/// upon leaving a group.
Group {
/// The old active flags when this group was opened.
///
/// If this group sets flags, then the new active flags are set to the
/// result of merging the old flags with the flags introduced by this
/// group. If the group doesn't set any flags, then this is simply
/// equivalent to whatever flags were set when the group was opened.
///
/// When this group is popped, the active flags should be restored to
/// the flags set here.
///
/// The "active" flags correspond to whatever flags are set in the
/// Translator.
old_flags: Flags,
},
/// This is pushed whenever a concatenation is observed. After visiting
/// every sub-expression in the concatenation, the translator's stack is
/// popped until it sees a Concat frame.
Concat,
/// This is pushed whenever an alternation is observed. After visiting
/// every sub-expression in the alternation, the translator's stack is
/// popped until it sees an Alternation frame.
Alternation,
}
impl HirFrame {
/// Assert that the current stack frame is an Hir expression and return it.
fn unwrap_expr(self) -> Hir {
match self {
HirFrame::Expr(expr) => expr,
_ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self),
}
}
/// Assert that the current stack frame is a Unicode class expression and
/// return it.
fn unwrap_class_unicode(self) -> hir::ClassUnicode {
match self {
HirFrame::ClassUnicode(cls) => cls,
_ => panic!(
"tried to unwrap Unicode class \
from HirFrame, got: {:?}",
self
),
}
}
/// Assert that the current stack frame is a byte class expression and
/// return it.
fn unwrap_class_bytes(self) -> hir::ClassBytes {
match self {
HirFrame::ClassBytes(cls) => cls,
_ => panic!(
"tried to unwrap byte class \
from HirFrame, got: {:?}",
self
),
}
}
/// Assert that the current stack frame is a group indicator and return
/// its corresponding flags (the flags that were active at the time the
/// group was entered).
fn unwrap_group(self) -> Flags {
match self {
HirFrame::Group { old_flags } => old_flags,
_ => {
panic!("tried to unwrap group from HirFrame, got: {:?}", self)
}
}
}
}
impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
type Output = Hir;
type Err = Error;
fn finish(self) -> Result<Hir> {
// ... otherwise, we should have exactly one HIR on the stack.
assert_eq!(self.trans().stack.borrow().len(), 1);
Ok(self.pop().unwrap().unwrap_expr())
}
fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
match *ast {
Ast::Class(ast::Class::Bracketed(_)) => {
if self.flags().unicode() {
let cls = hir::ClassUnicode::empty();
self.push(HirFrame::ClassUnicode(cls));
} else {
let cls = hir::ClassBytes::empty();
self.push(HirFrame::ClassBytes(cls));
}
}
Ast::Group(ref x) => {
let old_flags = x
.flags()
.map(|ast| self.set_flags(ast))
.unwrap_or_else(|| self.flags());
self.push(HirFrame::Group { old_flags });
}
Ast::Concat(ref x) if x.asts.is_empty() => {}
Ast::Concat(_) => {
self.push(HirFrame::Concat);
}
Ast::Alternation(ref x) if x.asts.is_empty() => {}
Ast::Alternation(_) => {
self.push(HirFrame::Alternation);
}
_ => {}
}
Ok(())
}
fn visit_post(&mut self, ast: &Ast) -> Result<()> {
match *ast {
Ast::Empty(_) => {
self.push(HirFrame::Expr(Hir::empty()));
}
Ast::Flags(ref x) => {
self.set_flags(&x.flags);
// Flags in the AST are generally considered directives and
// not actual sub-expressions. However, they can be used in
// the concrete syntax like `((?i))`, and we need some kind of
// indication of an expression there, and Empty is the correct
// choice.
//
// There can also be things like `(?i)+`, but we rule those out
// in the parser. In the future, we might allow them for
// consistency sake.
self.push(HirFrame::Expr(Hir::empty()));
}
Ast::Literal(ref x) => {
self.push(HirFrame::Expr(self.hir_literal(x)?));
}
Ast::Dot(span) => {
self.push(HirFrame::Expr(self.hir_dot(span)?));
}
Ast::Assertion(ref x) => {
self.push(HirFrame::Expr(self.hir_assertion(x)?));
}
Ast::Class(ast::Class::Perl(ref x)) => {
if self.flags().unicode() {
let cls = self.hir_perl_unicode_class(x)?;
let hcls = hir::Class::Unicode(cls);
self.push(HirFrame::Expr(Hir::class(hcls)));
} else {
let cls = self.hir_perl_byte_class(x);
let hcls = hir::Class::Bytes(cls);
self.push(HirFrame::Expr(Hir::class(hcls)));
}
}
Ast::Class(ast::Class::Unicode(ref x)) => {
let cls = hir::Class::Unicode(self.hir_unicode_class(x)?);
self.push(HirFrame::Expr(Hir::class(cls)));
}
Ast::Class(ast::Class::Bracketed(ref ast)) => {
if self.flags().unicode() {
let mut cls = self.pop().unwrap().unwrap_class_unicode();
self.unicode_fold_and_negate(
&ast.span,
ast.negated,
&mut cls,
)?;
if cls.iter().next().is_none() {
return Err(self.error(
ast.span,
ErrorKind::EmptyClassNotAllowed,
));
}
let expr = Hir::class(hir::Class::Unicode(cls));
self.push(HirFrame::Expr(expr));
} else {
let mut cls = self.pop().unwrap().unwrap_class_bytes();
self.bytes_fold_and_negate(
&ast.span,
ast.negated,
&mut cls,
)?;
if cls.iter().next().is_none() {
return Err(self.error(
ast.span,
ErrorKind::EmptyClassNotAllowed,
));
}
let expr = Hir::class(hir::Class::Bytes(cls));
self.push(HirFrame::Expr(expr));
}
}
Ast::Repetition(ref x) => {
let expr = self.pop().unwrap().unwrap_expr();
self.push(HirFrame::Expr(self.hir_repetition(x, expr)));
}
Ast::Group(ref x) => {
let expr = self.pop().unwrap().unwrap_expr();
let old_flags = self.pop().unwrap().unwrap_group();
self.trans().flags.set(old_flags);
self.push(HirFrame::Expr(self.hir_group(x, expr)));
}
Ast::Concat(_) => {
let mut exprs = vec![];
while let Some(HirFrame::Expr(expr)) = self.pop() {
if !expr.kind().is_empty() {
exprs.push(expr);
}
}
exprs.reverse();
self.push(HirFrame::Expr(Hir::concat(exprs)));
}
Ast::Alternation(_) => {
let mut exprs = vec![];
while let Some(HirFrame::Expr(expr)) = self.pop() {
exprs.push(expr);
}
exprs.reverse();
self.push(HirFrame::Expr(Hir::alternation(exprs)));
}
}
Ok(())
}
fn visit_class_set_item_pre(
&mut self,
ast: &ast::ClassSetItem,
) -> Result<()> {
match *ast {
ast::ClassSetItem::Bracketed(_) => {
if self.flags().unicode() {
let cls = hir::ClassUnicode::empty();
self.push(HirFrame::ClassUnicode(cls));
} else {
let cls = hir::ClassBytes::empty();
self.push(HirFrame::ClassBytes(cls));
}
}
// We needn't handle the Union case here since the visitor will
// do it for us.
_ => {}
}
Ok(())
}
fn visit_class_set_item_post(
&mut self,
ast: &ast::ClassSetItem,
) -> Result<()> {
match *ast {
ast::ClassSetItem::Empty(_) => {}
ast::ClassSetItem::Literal(ref x) => {
if self.flags().unicode() {
let mut cls = self.pop().unwrap().unwrap_class_unicode();
cls.push(hir::ClassUnicodeRange::new(x.c, x.c));
self.push(HirFrame::ClassUnicode(cls));
} else {
let mut cls = self.pop().unwrap().unwrap_class_bytes();
let byte = self.class_literal_byte(x)?;
cls.push(hir::ClassBytesRange::new(byte, byte));
self.push(HirFrame::ClassBytes(cls));
}
}
ast::ClassSetItem::Range(ref x) => {
if self.flags().unicode() {
let mut cls = self.pop().unwrap().unwrap_class_unicode();
cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c));
self.push(HirFrame::ClassUnicode(cls));
} else {
let mut cls = self.pop().unwrap().unwrap_class_bytes();
let start = self.class_literal_byte(&x.start)?;
let end = self.class_literal_byte(&x.end)?;
cls.push(hir::ClassBytesRange::new(start, end));
self.push(HirFrame::ClassBytes(cls));
}
}
ast::ClassSetItem::Ascii(ref x) => {
if self.flags().unicode() {
let mut cls = self.pop().unwrap().unwrap_class_unicode();
for &(s, e) in ascii_class(&x.kind) {
cls.push(hir::ClassUnicodeRange::new(s, e));
}
self.unicode_fold_and_negate(
&x.span, x.negated, &mut cls,
)?;
self.push(HirFrame::ClassUnicode(cls));
} else {
let mut cls = self.pop().unwrap().unwrap_class_bytes();
for &(s, e) in ascii_class(&x.kind) {
cls.push(hir::ClassBytesRange::new(s as u8, e as u8));
}
self.bytes_fold_and_negate(&x.span, x.negated, &mut cls)?;
self.push(HirFrame::ClassBytes(cls));
}
}
ast::ClassSetItem::Unicode(ref x) => {
let xcls = self.hir_unicode_class(x)?;
let mut cls = self.pop().unwrap().unwrap_class_unicode();
cls.union(&xcls);
self.push(HirFrame::ClassUnicode(cls));
}
ast::ClassSetItem::Perl(ref x) => {
if self.flags().unicode() {
let xcls = self.hir_perl_unicode_class(x)?;
let mut cls = self.pop().unwrap().unwrap_class_unicode();
cls.union(&xcls);
self.push(HirFrame::ClassUnicode(cls));
} else {
let xcls = self.hir_perl_byte_class(x);
let mut cls = self.pop().unwrap().unwrap_class_bytes();
cls.union(&xcls);
self.push(HirFrame::ClassBytes(cls));
}
}
ast::ClassSetItem::Bracketed(ref ast) => {
if self.flags().unicode() {
let mut cls1 = self.pop().unwrap().unwrap_class_unicode();
self.unicode_fold_and_negate(
&ast.span,
ast.negated,
&mut cls1,
)?;
let mut cls2 = self.pop().unwrap().unwrap_class_unicode();
cls2.union(&cls1);
self.push(HirFrame::ClassUnicode(cls2));
} else {
let mut cls1 = self.pop().unwrap().unwrap_class_bytes();
self.bytes_fold_and_negate(
&ast.span,
ast.negated,
&mut cls1,
)?;
let mut cls2 = self.pop().unwrap().unwrap_class_bytes();
cls2.union(&cls1);
self.push(HirFrame::ClassBytes(cls2));
}
}
// This is handled automatically by the visitor.
ast::ClassSetItem::Union(_) => {}
}
Ok(())
}
fn visit_class_set_binary_op_pre(
&mut self,
_op: &ast::ClassSetBinaryOp,
) -> Result<()> {
if self.flags().unicode() {
let cls = hir::ClassUnicode::empty();
self.push(HirFrame::ClassUnicode(cls));
} else {
let cls = hir::ClassBytes::empty();
self.push(HirFrame::ClassBytes(cls));
}
Ok(())
}
fn visit_class_set_binary_op_in(
&mut self,
_op: &ast::ClassSetBinaryOp,
) -> Result<()> {
if self.flags().unicode() {
let cls = hir::ClassUnicode::empty();
self.push(HirFrame::ClassUnicode(cls));
} else {
let cls = hir::ClassBytes::empty();
self.push(HirFrame::ClassBytes(cls));
}
Ok(())
}
fn visit_class_set_binary_op_post(
&mut self,
op: &ast::ClassSetBinaryOp,
) -> Result<()> {
use ast::ClassSetBinaryOpKind::*;
if self.flags().unicode() {
let mut rhs = self.pop().unwrap().unwrap_class_unicode();
let mut lhs = self.pop().unwrap().unwrap_class_unicode();
let mut cls = self.pop().unwrap().unwrap_class_unicode();
if self.flags().case_insensitive() {
rhs.try_case_fold_simple().map_err(|_| {
self.error(
op.rhs.span().clone(),
ErrorKind::UnicodeCaseUnavailable,
)
})?;
lhs.try_case_fold_simple().map_err(|_| {
self.error(
op.lhs.span().clone(),
ErrorKind::UnicodeCaseUnavailable,
)
})?;
}
match op.kind {
Intersection => lhs.intersect(&rhs),
Difference => lhs.difference(&rhs),
SymmetricDifference => lhs.symmetric_difference(&rhs),
}
cls.union(&lhs);
self.push(HirFrame::ClassUnicode(cls));
} else {
let mut rhs = self.pop().unwrap().unwrap_class_bytes();
let mut lhs = self.pop().unwrap().unwrap_class_bytes();
let mut cls = self.pop().unwrap().unwrap_class_bytes();
if self.flags().case_insensitive() {
rhs.case_fold_simple();
lhs.case_fold_simple();
}
match op.kind {
Intersection => lhs.intersect(&rhs),
Difference => lhs.difference(&rhs),
SymmetricDifference => lhs.symmetric_difference(&rhs),
}
cls.union(&lhs);
self.push(HirFrame::ClassBytes(cls));
}
Ok(())
}
}
/// The internal implementation of a translator.
///
/// This type is responsible for carrying around the original pattern string,
/// which is not tied to the internal state of a translator.
///
/// A TranslatorI exists for the time it takes to translate a single Ast.
#[derive(Clone, Debug)]
struct TranslatorI<'t, 'p> {
trans: &'t Translator,
pattern: &'p str,
}
impl<'t, 'p> TranslatorI<'t, 'p> {
/// Build a new internal translator.
fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> {
TranslatorI { trans: trans, pattern: pattern }
}
/// Return a reference to the underlying translator.
fn trans(&self) -> &Translator {
&self.trans
}
/// Push the given frame on to the call stack.
fn push(&self, frame: HirFrame) {
self.trans().stack.borrow_mut().push(frame);
}
/// Pop the top of the call stack. If the call stack is empty, return None.
fn pop(&self) -> Option<HirFrame> {
self.trans().stack.borrow_mut().pop()
}
/// Create a new error with the given span and error type.
fn error(&self, span: Span, kind: ErrorKind) -> Error {
Error { kind: kind, pattern: self.pattern.to_string(), span: span }
}
/// Return a copy of the active flags.
fn flags(&self) -> Flags {
self.trans().flags.get()
}
/// Set the flags of this translator from the flags set in the given AST.
/// Then, return the old flags.
fn set_flags(&self, ast_flags: &ast::Flags) -> Flags {
let old_flags = self.flags();
let mut new_flags = Flags::from_ast(ast_flags);
new_flags.merge(&old_flags);
self.trans().flags.set(new_flags);
old_flags
}
fn hir_literal(&self, lit: &ast::Literal) -> Result<Hir> {
let ch = match self.literal_to_char(lit)? {
byte @ hir::Literal::Byte(_) => return Ok(Hir::literal(byte)),
hir::Literal::Unicode(ch) => ch,
};
if self.flags().case_insensitive() {
self.hir_from_char_case_insensitive(lit.span, ch)
} else {
self.hir_from_char(lit.span, ch)
}
}
/// Convert an Ast literal to its scalar representation.
///
/// When Unicode mode is enabled, then this always succeeds and returns a
/// `char` (Unicode scalar value).
///
/// When Unicode mode is disabled, then a raw byte is returned. If that
/// byte is not ASCII and invalid UTF-8 is not allowed, then this returns
/// an error.
fn literal_to_char(&self, lit: &ast::Literal) -> Result<hir::Literal> {
if self.flags().unicode() {
return Ok(hir::Literal::Unicode(lit.c));
}
let byte = match lit.byte() {
None => return Ok(hir::Literal::Unicode(lit.c)),
Some(byte) => byte,
};
if byte <= 0x7F {
return Ok(hir::Literal::Unicode(byte as char));
}
if !self.trans().allow_invalid_utf8 {
return Err(self.error(lit.span, ErrorKind::InvalidUtf8));
}
Ok(hir::Literal::Byte(byte))
}
fn hir_from_char(&self, span: Span, c: char) -> Result<Hir> {
if !self.flags().unicode() && c.len_utf8() > 1 {
return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
}
Ok(Hir::literal(hir::Literal::Unicode(c)))
}
fn hir_from_char_case_insensitive(
&self,
span: Span,
c: char,
) -> Result<Hir> {
if self.flags().unicode() {
// If case folding won't do anything, then don't bother trying.
let map =
unicode::contains_simple_case_mapping(c, c).map_err(|_| {
self.error(span, ErrorKind::UnicodeCaseUnavailable)
})?;
if !map {
return self.hir_from_char(span, c);
}
let mut cls =
hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new(
c, c,
)]);
cls.try_case_fold_simple().map_err(|_| {
self.error(span, ErrorKind::UnicodeCaseUnavailable)
})?;
Ok(Hir::class(hir::Class::Unicode(cls)))
} else {
if c.len_utf8() > 1 {
return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
}
// If case folding won't do anything, then don't bother trying.
match c {
'A'..='Z' | 'a'..='z' => {}
_ => return self.hir_from_char(span, c),
}
let mut cls =
hir::ClassBytes::new(vec![hir::ClassBytesRange::new(
c as u8, c as u8,
)]);
cls.case_fold_simple();
Ok(Hir::class(hir::Class::Bytes(cls)))
}
}
fn hir_dot(&self, span: Span) -> Result<Hir> {
let unicode = self.flags().unicode();
if !unicode && !self.trans().allow_invalid_utf8 {
return Err(self.error(span, ErrorKind::InvalidUtf8));
}
Ok(if self.flags().dot_matches_new_line() {
Hir::any(!unicode)
} else {
Hir::dot(!unicode)
})
}
fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
let unicode = self.flags().unicode();
let multi_line = self.flags().multi_line();
Ok(match asst.kind {
ast::AssertionKind::StartLine => Hir::anchor(if multi_line {
hir::Anchor::StartLine
} else {
hir::Anchor::StartText
}),
ast::AssertionKind::EndLine => Hir::anchor(if multi_line {
hir::Anchor::EndLine
} else {
hir::Anchor::EndText
}),
ast::AssertionKind::StartText => {
Hir::anchor(hir::Anchor::StartText)
}
ast::AssertionKind::EndText => Hir::anchor(hir::Anchor::EndText),
ast::AssertionKind::WordBoundary => {
Hir::word_boundary(if unicode {
hir::WordBoundary::Unicode
} else {
hir::WordBoundary::Ascii
})
}
ast::AssertionKind::NotWordBoundary => {
Hir::word_boundary(if unicode {
hir::WordBoundary::UnicodeNegate
} else {
// It is possible for negated ASCII word boundaries to
// match at invalid UTF-8 boundaries, even when searching
// valid UTF-8.
if !self.trans().allow_invalid_utf8 {
return Err(
self.error(asst.span, ErrorKind::InvalidUtf8)
);
}
hir::WordBoundary::AsciiNegate
})
}
})
}
fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir {
let kind = match group.kind {
ast::GroupKind::CaptureIndex(idx) => {
hir::GroupKind::CaptureIndex(idx)
}
ast::GroupKind::CaptureName(ref capname) => {
hir::GroupKind::CaptureName {
name: capname.name.clone(),
index: capname.index,
}
}
ast::GroupKind::NonCapturing(_) => hir::GroupKind::NonCapturing,
};
Hir::group(hir::Group { kind: kind, hir: Box::new(expr) })
}
fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir {
let kind = match rep.op.kind {
ast::RepetitionKind::ZeroOrOne => hir::RepetitionKind::ZeroOrOne,
ast::RepetitionKind::ZeroOrMore => hir::RepetitionKind::ZeroOrMore,
ast::RepetitionKind::OneOrMore => hir::RepetitionKind::OneOrMore,
ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => {
hir::RepetitionKind::Range(hir::RepetitionRange::Exactly(m))
}
ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => {
hir::RepetitionKind::Range(hir::RepetitionRange::AtLeast(m))
}
ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(
m,
n,
)) => {
hir::RepetitionKind::Range(hir::RepetitionRange::Bounded(m, n))
}
};
let greedy =
if self.flags().swap_greed() { !rep.greedy } else { rep.greedy };
Hir::repetition(hir::Repetition {
kind: kind,
greedy: greedy,
hir: Box::new(expr),
})
}
fn hir_unicode_class(
&self,
ast_class: &ast::ClassUnicode,
) -> Result<hir::ClassUnicode> {
use ast::ClassUnicodeKind::*;
if !self.flags().unicode() {
return Err(
self.error(ast_class.span, ErrorKind::UnicodeNotAllowed)
);
}
let query = match ast_class.kind {
OneLetter(name) => ClassQuery::OneLetter(name),
Named(ref name) => ClassQuery::Binary(name),
NamedValue { ref name, ref value, .. } => ClassQuery::ByValue {
property_name: name,
property_value: value,
},
};
let mut result = self.convert_unicode_class_error(
&ast_class.span,
unicode::class(query),
);
if let Ok(ref mut class) = result {
self.unicode_fold_and_negate(
&ast_class.span,
ast_class.negated,
class,
)?;
}
result
}
fn hir_perl_unicode_class(
&self,
ast_class: &ast::ClassPerl,
) -> Result<hir::ClassUnicode> {
use ast::ClassPerlKind::*;
assert!(self.flags().unicode());
let result = match ast_class.kind {
Digit => unicode::perl_digit(),
Space => unicode::perl_space(),
Word => unicode::perl_word(),
};
let mut class =
self.convert_unicode_class_error(&ast_class.span, result)?;
// We needn't apply case folding here because the Perl Unicode classes
// are already closed under Unicode simple case folding.
if ast_class.negated {
class.negate();
}
Ok(class)
}
fn hir_perl_byte_class(
&self,
ast_class: &ast::ClassPerl,
) -> hir::ClassBytes {
use ast::ClassPerlKind::*;
assert!(!self.flags().unicode());
let mut class = match ast_class.kind {
Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit),
Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space),
Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word),
};
// We needn't apply case folding here because the Perl ASCII classes
// are already closed (under ASCII case folding).
if ast_class.negated {
class.negate();
}
class
}
/// Converts the given Unicode specific error to an HIR translation error.
///
/// The span given should approximate the position at which an error would
/// occur.
fn convert_unicode_class_error(
&self,
span: &Span,
result: unicode::Result<hir::ClassUnicode>,
) -> Result<hir::ClassUnicode> {
result.map_err(|err| {
let sp = span.clone();
match err {
unicode::Error::PropertyNotFound => {
self.error(sp, ErrorKind::UnicodePropertyNotFound)
}
unicode::Error::PropertyValueNotFound => {
self.error(sp, ErrorKind::UnicodePropertyValueNotFound)
}
unicode::Error::PerlClassNotFound => {
self.error(sp, ErrorKind::UnicodePerlClassNotFound)
}
}
})
}
fn unicode_fold_and_negate(
&self,
span: &Span,
negated: bool,
class: &mut hir::ClassUnicode,
) -> Result<()> {
// Note that we must apply case folding before negation!
// Consider `(?i)[^x]`. If we applied negation field, then
// the result would be the character class that matched any
// Unicode scalar value.
if self.flags().case_insensitive() {
class.try_case_fold_simple().map_err(|_| {
self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable)
})?;
}
if negated {
class.negate();
}
Ok(())
}
fn bytes_fold_and_negate(
&self,
span: &Span,
negated: bool,
class: &mut hir::ClassBytes,
) -> Result<()> {
// Note that we must apply case folding before negation!
// Consider `(?i)[^x]`. If we applied negation field, then
// the result would be the character class that matched any
// Unicode scalar value.
if self.flags().case_insensitive() {
class.case_fold_simple();
}
if negated {
class.negate();
}
if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() {
return Err(self.error(span.clone(), ErrorKind::InvalidUtf8));
}
Ok(())
}
/// Return a scalar byte value suitable for use as a literal in a byte
/// character class.
fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> {
match self.literal_to_char(ast)? {
hir::Literal::Byte(byte) => Ok(byte),
hir::Literal::Unicode(ch) => {
if ch <= 0x7F as char {
Ok(ch as u8)
} else {
// We can't feasibly support Unicode in
// byte oriented classes. Byte classes don't
// do Unicode case folding.
Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed))
}
}
}
}
}
/// A translator's representation of a regular expression's flags at any given
/// moment in time.
///
/// Each flag can be in one of three states: absent, present but disabled or
/// present but enabled.
#[derive(Clone, Copy, Debug, Default)]
struct Flags {
case_insensitive: Option<bool>,
multi_line: Option<bool>,
dot_matches_new_line: Option<bool>,
swap_greed: Option<bool>,
unicode: Option<bool>,
// Note that `ignore_whitespace` is omitted here because it is handled
// entirely in the parser.
}
impl Flags {
fn from_ast(ast: &ast::Flags) -> Flags {
let mut flags = Flags::default();
let mut enable = true;
for item in &ast.items {
match item.kind {
ast::FlagsItemKind::Negation => {
enable = false;
}
ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => {
flags.case_insensitive = Some(enable);
}
ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => {
flags.multi_line = Some(enable);
}
ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => {
flags.dot_matches_new_line = Some(enable);
}
ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => {
flags.swap_greed = Some(enable);
}
ast::FlagsItemKind::Flag(ast::Flag::Unicode) => {
flags.unicode = Some(enable);
}
ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {}
}
}
flags
}
fn merge(&mut self, previous: &Flags) {
if self.case_insensitive.is_none() {
self.case_insensitive = previous.case_insensitive;
}
if self.multi_line.is_none() {
self.multi_line = previous.multi_line;
}
if self.dot_matches_new_line.is_none() {
self.dot_matches_new_line = previous.dot_matches_new_line;
}
if self.swap_greed.is_none() {
self.swap_greed = previous.swap_greed;
}
if self.unicode.is_none() {
self.unicode = previous.unicode;
}
}
fn case_insensitive(&self) -> bool {
self.case_insensitive.unwrap_or(false)
}
fn multi_line(&self) -> bool {
self.multi_line.unwrap_or(false)
}
fn dot_matches_new_line(&self) -> bool {
self.dot_matches_new_line.unwrap_or(false)
}
fn swap_greed(&self) -> bool {
self.swap_greed.unwrap_or(false)
}
fn unicode(&self) -> bool {
self.unicode.unwrap_or(true)
}
}
fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes {
let ranges: Vec<_> = ascii_class(kind)
.iter()
.cloned()
.map(|(s, e)| hir::ClassBytesRange::new(s as u8, e as u8))
.collect();
hir::ClassBytes::new(ranges)
}
fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] {
use ast::ClassAsciiKind::*;
match *kind {
Alnum => &[('0', '9'), ('A', 'Z'), ('a', 'z')],
Alpha => &[('A', 'Z'), ('a', 'z')],
Ascii => &[('\x00', '\x7F')],
Blank => &[('\t', '\t'), (' ', ' ')],
Cntrl => &[('\x00', '\x1F'), ('\x7F', '\x7F')],
Digit => &[('0', '9')],
Graph => &[('!', '~')],
Lower => &[('a', 'z')],
Print => &[(' ', '~')],
Punct => &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')],
Space => &[
('\t', '\t'),
('\n', '\n'),
('\x0B', '\x0B'),
('\x0C', '\x0C'),
('\r', '\r'),
(' ', ' '),
],
Upper => &[('A', 'Z')],
Word => &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')],
Xdigit => &[('0', '9'), ('A', 'F'), ('a', 'f')],
}
}
#[cfg(test)]
mod tests {
use ast::parse::ParserBuilder;
use ast::{self, Ast, Position, Span};
use hir::{self, Hir, HirKind};
use unicode::{self, ClassQuery};
use super::{ascii_class, TranslatorBuilder};
// We create these errors to compare with real hir::Errors in the tests.
// We define equality between TestError and hir::Error to disregard the
// pattern string in hir::Error, which is annoying to provide in tests.
#[derive(Clone, Debug)]
struct TestError {
span: Span,
kind: hir::ErrorKind,
}
impl PartialEq<hir::Error> for TestError {
fn eq(&self, other: &hir::Error) -> bool {
self.span == other.span && self.kind == other.kind
}
}
impl PartialEq<TestError> for hir::Error {
fn eq(&self, other: &TestError) -> bool {
self.span == other.span && self.kind == other.kind
}
}
fn parse(pattern: &str) -> Ast {
ParserBuilder::new().octal(true).build().parse(pattern).unwrap()
}
fn t(pattern: &str) -> Hir {
TranslatorBuilder::new()
.allow_invalid_utf8(false)
.build()
.translate(pattern, &parse(pattern))
.unwrap()
}
fn t_err(pattern: &str) -> hir::Error {
TranslatorBuilder::new()
.allow_invalid_utf8(false)
.build()
.translate(pattern, &parse(pattern))
.unwrap_err()
}
fn t_bytes(pattern: &str) -> Hir {
TranslatorBuilder::new()
.allow_invalid_utf8(true)
.build()
.translate(pattern, &parse(pattern))
.unwrap()
}
fn hir_lit(s: &str) -> Hir {
match s.len() {
0 => Hir::empty(),
_ => {
let lits = s
.chars()
.map(hir::Literal::Unicode)
.map(Hir::literal)
.collect();
Hir::concat(lits)
}
}
}
fn hir_blit(s: &[u8]) -> Hir {
match s.len() {
0 => Hir::empty(),
1 => Hir::literal(hir::Literal::Byte(s[0])),
_ => {
let lits = s
.iter()
.cloned()
.map(hir::Literal::Byte)
.map(Hir::literal)
.collect();
Hir::concat(lits)
}
}
}
fn hir_group(i: u32, expr: Hir) -> Hir {
Hir::group(hir::Group {
kind: hir::GroupKind::CaptureIndex(i),
hir: Box::new(expr),
})
}
fn hir_group_name(i: u32, name: &str, expr: Hir) -> Hir {
Hir::group(hir::Group {
kind: hir::GroupKind::CaptureName {
name: name.to_string(),
index: i,
},
hir: Box::new(expr),
})
}
fn hir_group_nocap(expr: Hir) -> Hir {
Hir::group(hir::Group {
kind: hir::GroupKind::NonCapturing,
hir: Box::new(expr),
})
}
fn hir_quest(greedy: bool, expr: Hir) -> Hir {
Hir::repetition(hir::Repetition {
kind: hir::RepetitionKind::ZeroOrOne,
greedy: greedy,
hir: Box::new(expr),
})
}
fn hir_star(greedy: bool, expr: Hir) -> Hir {
Hir::repetition(hir::Repetition {
kind: hir::RepetitionKind::ZeroOrMore,
greedy: greedy,
hir: Box::new(expr),
})
}
fn hir_plus(greedy: bool, expr: Hir) -> Hir {
Hir::repetition(hir::Repetition {
kind: hir::RepetitionKind::OneOrMore,
greedy: greedy,
hir: Box::new(expr),
})
}
fn hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir {
Hir::repetition(hir::Repetition {
kind: hir::RepetitionKind::Range(range),
greedy: greedy,
hir: Box::new(expr),
})
}
fn hir_alt(alts: Vec<Hir>) -> Hir {
Hir::alternation(alts)
}
fn hir_cat(exprs: Vec<Hir>) -> Hir {
Hir::concat(exprs)
}
#[allow(dead_code)]
fn hir_uclass_query(query: ClassQuery) -> Hir {
Hir::class(hir::Class::Unicode(unicode::class(query).unwrap()))
}
#[allow(dead_code)]
fn hir_uclass_perl_word() -> Hir {
Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap()))
}
fn hir_uclass(ranges: &[(char, char)]) -> Hir {
let ranges: Vec<hir::ClassUnicodeRange> = ranges
.iter()
.map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
.collect();
Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(ranges)))
}
fn hir_bclass(ranges: &[(u8, u8)]) -> Hir {
let ranges: Vec<hir::ClassBytesRange> = ranges
.iter()
.map(|&(s, e)| hir::ClassBytesRange::new(s, e))
.collect();
Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges)))
}
fn hir_bclass_from_char(ranges: &[(char, char)]) -> Hir {
let ranges: Vec<hir::ClassBytesRange> = ranges
.iter()
.map(|&(s, e)| {
assert!(s as u32 <= 0x7F);
assert!(e as u32 <= 0x7F);
hir::ClassBytesRange::new(s as u8, e as u8)
})
.collect();
Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges)))
}
fn hir_case_fold(expr: Hir) -> Hir {
match expr.into_kind() {
HirKind::Class(mut cls) => {
cls.case_fold_simple();
Hir::class(cls)
}
_ => panic!("cannot case fold non-class Hir expr"),
}
}
fn hir_negate(expr: Hir) -> Hir {
match expr.into_kind() {
HirKind::Class(mut cls) => {
cls.negate();
Hir::class(cls)
}
_ => panic!("cannot negate non-class Hir expr"),
}
}
#[allow(dead_code)]
fn hir_union(expr1: Hir, expr2: Hir) -> Hir {
use hir::Class::{Bytes, Unicode};
match (expr1.into_kind(), expr2.into_kind()) {
(HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
c1.union(&c2);
Hir::class(hir::Class::Unicode(c1))
}
(HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
c1.union(&c2);
Hir::class(hir::Class::Bytes(c1))
}
_ => panic!("cannot union non-class Hir exprs"),
}
}
#[allow(dead_code)]
fn hir_difference(expr1: Hir, expr2: Hir) -> Hir {
use hir::Class::{Bytes, Unicode};
match (expr1.into_kind(), expr2.into_kind()) {
(HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
c1.difference(&c2);
Hir::class(hir::Class::Unicode(c1))
}
(HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
c1.difference(&c2);
Hir::class(hir::Class::Bytes(c1))
}
_ => panic!("cannot difference non-class Hir exprs"),
}
}
fn hir_anchor(anchor: hir::Anchor) -> Hir {
Hir::anchor(anchor)
}
fn hir_word(wb: hir::WordBoundary) -> Hir {
Hir::word_boundary(wb)
}
#[test]
fn empty() {
assert_eq!(t(""), Hir::empty());
assert_eq!(t("(?i)"), Hir::empty());
assert_eq!(t("()"), hir_group(1, Hir::empty()));
assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty()));
assert_eq!(t("(?P<wat>)"), hir_group_name(1, "wat", Hir::empty()));
assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()]));
assert_eq!(
t("()|()"),
hir_alt(vec![
hir_group(1, Hir::empty()),
hir_group(2, Hir::empty()),
])
);
assert_eq!(
t("(|b)"),
hir_group(1, hir_alt(vec![Hir::empty(), hir_lit("b"),]))
);
assert_eq!(
t("(a|)"),
hir_group(1, hir_alt(vec![hir_lit("a"), Hir::empty(),]))
);
assert_eq!(
t("(a||c)"),
hir_group(
1,
hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c"),])
)
);
assert_eq!(
t("(||)"),
hir_group(
1,
hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty(),])
)
);
}
#[test]
fn literal() {
assert_eq!(t("a"), hir_lit("a"));
assert_eq!(t("(?-u)a"), hir_lit("a"));
assert_eq!(t("☃"), hir_lit("☃"));
assert_eq!(t("abcd"), hir_lit("abcd"));
assert_eq!(t_bytes("(?-u)a"), hir_lit("a"));
assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a"));
assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a"));
assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));
assert_eq!(
t_err("(?-u)☃"),
TestError {
kind: hir::ErrorKind::UnicodeNotAllowed,
span: Span::new(
Position::new(5, 1, 6),
Position::new(8, 1, 7)
),
}
);
assert_eq!(
t_err(r"(?-u)\xFF"),
TestError {
kind: hir::ErrorKind::InvalidUtf8,
span: Span::new(
Position::new(5, 1, 6),
Position::new(9, 1, 10)
),
}
);
}
#[test]
fn literal_case_insensitive() {
#[cfg(feature = "unicode-case")]
assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),]));
#[cfg(feature = "unicode-case")]
assert_eq!(
t("(?i:a)"),
hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')],))
);
#[cfg(feature = "unicode-case")]
assert_eq!(
t("a(?i)a(?-i)a"),
hir_cat(vec![
hir_lit("a"),
hir_uclass(&[('A', 'A'), ('a', 'a')]),
hir_lit("a"),
])
);
#[cfg(feature = "unicode-case")]
assert_eq!(
t("(?i)ab@c"),
hir_cat(vec![
hir_uclass(&[('A', 'A'), ('a', 'a')]),
hir_uclass(&[('B', 'B'), ('b', 'b')]),
hir_lit("@"),
hir_uclass(&[('C', 'C'), ('c', 'c')]),
])
);
#[cfg(feature = "unicode-case")]
assert_eq!(
t("(?i)β"),
hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
);
assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]));
#[cfg(feature = "unicode-case")]
assert_eq!(
t("(?-u)a(?i)a(?-i)a"),
hir_cat(vec![
hir_lit("a"),
hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
hir_lit("a"),
])
);
assert_eq!(
t("(?i-u)ab@c"),
hir_cat(vec![
hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
hir_bclass(&[(b'B', b'B'), (b'b', b'b')]),
hir_lit("@"),
hir_bclass(&[(b'C', b'C'), (b'c', b'c')]),
])
);
assert_eq!(
t_bytes("(?i-u)a"),
hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
);
assert_eq!(
t_bytes("(?i-u)\x61"),
hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
);
assert_eq!(
t_bytes(r"(?i-u)\x61"),
hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
);
assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));
assert_eq!(
t_err("(?i-u)β"),
TestError {
kind: hir::ErrorKind::UnicodeNotAllowed,
span: Span::new(
Position::new(6, 1, 7),
Position::new(8, 1, 8),
),
}
);
}
#[test]
fn dot() {
assert_eq!(
t("."),
hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}'),])
);
assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}'),]));
assert_eq!(
t_bytes("(?-u)."),
hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF'),])
);
assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),]));
// If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
assert_eq!(
t_err("(?-u)."),
TestError {
kind: hir::ErrorKind::InvalidUtf8,
span: Span::new(
Position::new(5, 1, 6),
Position::new(6, 1, 7)
),
}
);
assert_eq!(
t_err("(?s-u)."),
TestError {
kind: hir::ErrorKind::InvalidUtf8,
span: Span::new(
Position::new(6, 1, 7),
Position::new(7, 1, 8)
),
}
);
}
#[test]
fn assertions() {
assert_eq!(t("^"), hir_anchor(hir::Anchor::StartText));
assert_eq!(t("$"), hir_anchor(hir::Anchor::EndText));
assert_eq!(t(r"\A"), hir_anchor(hir::Anchor::StartText));
assert_eq!(t(r"\z"), hir_anchor(hir::Anchor::EndText));
assert_eq!(t("(?m)^"), hir_anchor(hir::Anchor::StartLine));
assert_eq!(t("(?m)$"), hir_anchor(hir::Anchor::EndLine));
assert_eq!(t(r"(?m)\A"), hir_anchor(hir::Anchor::StartText));
assert_eq!(t(r"(?m)\z"), hir_anchor(hir::Anchor::EndText));
assert_eq!(t(r"\b"), hir_word(hir::WordBoundary::Unicode));
assert_eq!(t(r"\B"), hir_word(hir::WordBoundary::UnicodeNegate));
assert_eq!(t(r"(?-u)\b"), hir_word(hir::WordBoundary::Ascii));
assert_eq!(
t_bytes(r"(?-u)\B"),
hir_word(hir::WordBoundary::AsciiNegate)
);
assert_eq!(
t_err(r"(?-u)\B"),
TestError {
kind: hir::ErrorKind::InvalidUtf8,
span: Span::new(
Position::new(5, 1, 6),
Position::new(7, 1, 8)
),
}
);
}
#[test]
fn group() {
assert_eq!(t("(a)"), hir_group(1, hir_lit("a")));
assert_eq!(
t("(a)(b)"),
hir_cat(vec![
hir_group(1, hir_lit("a")),
hir_group(2, hir_lit("b")),
])
);
assert_eq!(
t("(a)|(b)"),
hir_alt(vec![
hir_group(1, hir_lit("a")),
hir_group(2, hir_lit("b")),
])
);
assert_eq!(t("(?P<foo>)"), hir_group_name(1, "foo", Hir::empty()));
assert_eq!(t("(?P<foo>a)"), hir_group_name(1, "foo", hir_lit("a")));
assert_eq!(
t("(?P<foo>a)(?P<bar>b)"),
hir_cat(vec![
hir_group_name(1, "foo", hir_lit("a")),
hir_group_name(2, "bar", hir_lit("b")),
])
);
assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty()));
assert_eq!(t("(?:a)"), hir_group_nocap(hir_lit("a")));
assert_eq!(
t("(?:a)(b)"),
hir_cat(vec![
hir_group_nocap(hir_lit("a")),
hir_group(1, hir_lit("b")),
])
);
assert_eq!(
t("(a)(?:b)(c)"),
hir_cat(vec![
hir_group(1, hir_lit("a")),
hir_group_nocap(hir_lit("b")),
hir_group(2, hir_lit("c")),
])
);
assert_eq!(
t("(a)(?P<foo>b)(c)"),
hir_cat(vec![
hir_group(1, hir_lit("a")),
hir_group_name(2, "foo", hir_lit("b")),
hir_group(3, hir_lit("c")),
])
);
assert_eq!(t("()"), hir_group(1, Hir::empty()));
assert_eq!(t("((?i))"), hir_group(1, Hir::empty()));
assert_eq!(t("((?x))"), hir_group(1, Hir::empty()));
assert_eq!(t("(((?x)))"), hir_group(1, hir_group(2, Hir::empty())));
}
#[test]
fn flags() {
#[cfg(feature = "unicode-case")]
assert_eq!(
t("(?i:a)a"),
hir_cat(vec![
hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')])),
hir_lit("a"),
])
);
assert_eq!(
t("(?i-u:a)β"),
hir_cat(vec![
hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
hir_lit("β"),
])
);
assert_eq!(
t("(?:(?i-u)a)b"),
hir_cat(vec![
hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
hir_lit("b"),
])
);
assert_eq!(
t("((?i-u)a)b"),
hir_cat(vec![
hir_group(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
hir_lit("b"),
])
);
#[cfg(feature = "unicode-case")]
assert_eq!(
t("(?i)(?-i:a)a"),
hir_cat(vec![
hir_group_nocap(hir_lit("a")),
hir_uclass(&[('A', 'A'), ('a', 'a')]),
])
);
#[cfg(feature = "unicode-case")]
assert_eq!(
t("(?im)a^"),
hir_cat(vec![
hir_uclass(&[('A', 'A'), ('a', 'a')]),
hir_anchor(hir::Anchor::StartLine),
])
);
#[cfg(feature = "unicode-case")]
assert_eq!(
t("(?im)a^(?i-m)a^"),
hir_cat(vec![
hir_uclass(&[('A', 'A'), ('a', 'a')]),
hir_anchor(hir::Anchor::StartLine),
hir_uclass(&[('A', 'A'), ('a', 'a')]),
hir_anchor(hir::Anchor::StartText),
])
);
assert_eq!(
t("(?U)a*a*?(?-U)a*a*?"),
hir_cat(vec![
hir_star(false, hir_lit("a")),
hir_star(true, hir_lit("a")),
hir_star(true, hir_lit("a")),
hir_star(false, hir_lit("a")),
])
);
#[cfg(feature = "unicode-case")]
assert_eq!(
t("(?:a(?i)a)a"),
hir_cat(vec![
hir_group_nocap(hir_cat(vec![
hir_lit("a"),
hir_uclass(&[('A', 'A'), ('a', 'a')]),
])),
hir_lit("a"),
])
);
#[cfg(feature = "unicode-case")]
assert_eq!(
t("(?i)(?:a(?-i)a)a"),
hir_cat(vec![
hir_group_nocap(hir_cat(vec![
hir_uclass(&[('A', 'A'), ('a', 'a')]),
hir_lit("a"),
])),
hir_uclass(&[('A', 'A'), ('a', 'a')]),
])
);
}
#[test]
fn escape() {
assert_eq!(
t(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"),
hir_lit(r"\.+*?()|[]{}^$#")
);
}
#[test]
fn repetition() {
assert_eq!(t("a?"), hir_quest(true, hir_lit("a")));
assert_eq!(t("a*"), hir_star(true, hir_lit("a")));
assert_eq!(t("a+"), hir_plus(true, hir_lit("a")));
assert_eq!(t("a??"), hir_quest(false, hir_lit("a")));
assert_eq!(t("a*?"), hir_star(false, hir_lit("a")));
assert_eq!(t("a+?"), hir_plus(false, hir_lit("a")));
assert_eq!(
t("a{1}"),
hir_range(true, hir::RepetitionRange::Exactly(1), hir_lit("a"),)
);
assert_eq!(
t("a{1,}"),
hir_range(true, hir::RepetitionRange::AtLeast(1), hir_lit("a"),)
);
assert_eq!(
t("a{1,2}"),
hir_range(true, hir::RepetitionRange::Bounded(1, 2), hir_lit("a"),)
);
assert_eq!(
t("a{1}?"),
hir_range(false, hir::RepetitionRange::Exactly(1), hir_lit("a"),)
);
assert_eq!(
t("a{1,}?"),
hir_range(false, hir::RepetitionRange::AtLeast(1), hir_lit("a"),)
);
assert_eq!(
t("a{1,2}?"),
hir_range(
false,
hir::RepetitionRange::Bounded(1, 2),
hir_lit("a"),
)
);
assert_eq!(
t("ab?"),
hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
);
assert_eq!(
t("(ab)?"),
hir_quest(
true,
hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),]))
)
);
assert_eq!(
t("a|b?"),
hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
);
}
#[test]
fn cat_alt() {
assert_eq!(
t("(ab)"),
hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),]))
);
assert_eq!(t("a|b"), hir_alt(vec![hir_lit("a"), hir_lit("b"),]));
assert_eq!(
t("a|b|c"),
hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),])
);
assert_eq!(
t("ab|bc|cd"),
hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),])
);
assert_eq!(
t("(a|b)"),
hir_group(1, hir_alt(vec![hir_lit("a"), hir_lit("b"),]))
);
assert_eq!(
t("(a|b|c)"),
hir_group(
1,
hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),])
)
);
assert_eq!(
t("(ab|bc|cd)"),
hir_group(
1,
hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),])
)
);
assert_eq!(
t("(ab|(bc|(cd)))"),
hir_group(
1,
hir_alt(vec![
hir_lit("ab"),
hir_group(
2,
hir_alt(vec![
hir_lit("bc"),
hir_group(3, hir_lit("cd")),
])
),
])
)
);
}
#[test]
fn class_ascii() {
assert_eq!(
t("[[:alnum:]]"),
hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum))
);
assert_eq!(
t("[[:alpha:]]"),
hir_uclass(ascii_class(&ast::ClassAsciiKind::Alpha))
);
assert_eq!(
t("[[:ascii:]]"),
hir_uclass(ascii_class(&ast::ClassAsciiKind::Ascii))
);
assert_eq!(
t("[[:blank:]]"),
hir_uclass(ascii_class(&ast::ClassAsciiKind::Blank))
);
assert_eq!(
t("[[:cntrl:]]"),
hir_uclass(ascii_class(&ast::ClassAsciiKind::Cntrl))
);
assert_eq!(
t("[[:digit:]]"),
hir_uclass(ascii_class(&ast::ClassAsciiKind::Digit))
);
assert_eq!(
t("[[:graph:]]"),
hir_uclass(ascii_class(&ast::ClassAsciiKind::Graph))
);
assert_eq!(
t("[[:lower:]]"),
hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower))
);
assert_eq!(
t("[[:print:]]"),
hir_uclass(ascii_class(&ast::ClassAsciiKind::Print))
);
assert_eq!(
t("[[:punct:]]"),
hir_uclass(ascii_class(&ast::ClassAsciiKind::Punct))
);
assert_eq!(
t("[[:space:]]"),
hir_uclass(ascii_class(&ast::ClassAsciiKind::Space))
);
assert_eq!(
t("[[:upper:]]"),
hir_uclass(ascii_class(&ast::ClassAsciiKind::Upper))
);
assert_eq!(
t("[[:word:]]"),
hir_uclass(ascii_class(&ast::ClassAsciiKind::Word))
);
assert_eq!(
t("[[:xdigit:]]"),
hir_uclass(ascii_class(&ast::ClassAsciiKind::Xdigit))
);
assert_eq!(
t("[[:^lower:]]"),
hir_negate(hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower)))
);
#[cfg(feature = "unicode-case")]
assert_eq!(
t("(?i)[[:lower:]]"),
hir_uclass(&[
('A', 'Z'),
('a', 'z'),
('\u{17F}', '\u{17F}'),
('\u{212A}', '\u{212A}'),
])
);
assert_eq!(
t("(?-u)[[:lower:]]"),
hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Lower))
);
assert_eq!(
t("(?i-u)[[:lower:]]"),
hir_case_fold(hir_bclass_from_char(ascii_class(
&ast::ClassAsciiKind::Lower
)))
);
assert_eq!(
t_err("(?-u)[[:^lower:]]"),
TestError {
kind: hir::ErrorKind::InvalidUtf8,
span: Span::new(
Position::new(6, 1, 7),
Position::new(16, 1, 17)
),
}
);
assert_eq!(
t_err("(?i-u)[[:^lower:]]"),
TestError {
kind: hir::ErrorKind::InvalidUtf8,
span: Span::new(
Position::new(7, 1, 8),
Position::new(17, 1, 18)
),
}
);
}
#[test]
#[cfg(feature = "unicode-perl")]
fn class_perl() {
// Unicode
assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit")));
assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space")));
assert_eq!(t(r"\w"), hir_uclass_perl_word());
#[cfg(feature = "unicode-case")]
assert_eq!(
t(r"(?i)\d"),
hir_uclass_query(ClassQuery::Binary("digit"))
);
#[cfg(feature = "unicode-case")]
assert_eq!(
t(r"(?i)\s"),
hir_uclass_query(ClassQuery::Binary("space"))
);
#[cfg(feature = "unicode-case")]
assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word());
// Unicode, negated
assert_eq!(
t(r"\D"),
hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
);
assert_eq!(
t(r"\S"),
hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
);
assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word()));
#[cfg(feature = "unicode-case")]
assert_eq!(
t(r"(?i)\D"),
hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
);
#[cfg(feature = "unicode-case")]
assert_eq!(
t(r"(?i)\S"),
hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
);
#[cfg(feature = "unicode-case")]
assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
// ASCII only
assert_eq!(
t(r"(?-u)\d"),
hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))
);
assert_eq!(
t(r"(?-u)\s"),
hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space))
);
assert_eq!(
t(r"(?-u)\w"),
hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word))
);
assert_eq!(
t(r"(?i-u)\d"),
hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))
);
assert_eq!(
t(r"(?i-u)\s"),
hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space))
);
assert_eq!(
t(r"(?i-u)\w"),
hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word))
);
// ASCII only, negated
assert_eq!(
t(r"(?-u)\D"),
hir_negate(hir_bclass_from_char(ascii_class(
&ast::ClassAsciiKind::Digit
)))
);
assert_eq!(
t(r"(?-u)\S"),
hir_negate(hir_bclass_from_char(ascii_class(
&ast::ClassAsciiKind::Space
)))
);
assert_eq!(
t(r"(?-u)\W"),
hir_negate(hir_bclass_from_char(ascii_class(
&ast::ClassAsciiKind::Word
)))
);
assert_eq!(
t(r"(?i-u)\D"),
hir_negate(hir_bclass_from_char(ascii_class(
&ast::ClassAsciiKind::Digit
)))
);
assert_eq!(
t(r"(?i-u)\S"),
hir_negate(hir_bclass_from_char(ascii_class(
&ast::ClassAsciiKind::Space
)))
);
assert_eq!(
t(r"(?i-u)\W"),
hir_negate(hir_bclass_from_char(ascii_class(
&ast::ClassAsciiKind::Word
)))
);
}
#[test]
#[cfg(not(feature = "unicode-perl"))]
fn class_perl_word_disabled() {
assert_eq!(
t_err(r"\w"),
TestError {
kind: hir::ErrorKind::UnicodePerlClassNotFound,
span: Span::new(
Position::new(0, 1, 1),
Position::new(2, 1, 3)
),
}
);
}
#[test]
#[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))]
fn class_perl_space_disabled() {
assert_eq!(
t_err(r"\s"),
TestError {
kind: hir::ErrorKind::UnicodePerlClassNotFound,
span: Span::new(
Position::new(0, 1, 1),
Position::new(2, 1, 3)
),
}
);
}
#[test]
#[cfg(all(
not(feature = "unicode-perl"),
not(feature = "unicode-gencat")
))]
fn class_perl_digit_disabled() {
assert_eq!(
t_err(r"\d"),
TestError {
kind: hir::ErrorKind::UnicodePerlClassNotFound,
span: Span::new(
Position::new(0, 1, 1),
Position::new(2, 1, 3)
),
}
);
}
#[test]
#[cfg(feature = "unicode-gencat")]
fn class_unicode_gencat() {
assert_eq!(t(r"\pZ"), hir_uclass_query(ClassQuery::Binary("Z")));
assert_eq!(t(r"\pz"), hir_uclass_query(ClassQuery::Binary("Z")));
assert_eq!(
t(r"\p{Separator}"),
hir_uclass_query(ClassQuery::Binary("Z"))
);
assert_eq!(
t(r"\p{se PaRa ToR}"),
hir_uclass_query(ClassQuery::Binary("Z"))
);
assert_eq!(
t(r"\p{gc:Separator}"),
hir_uclass_query(ClassQuery::Binary("Z"))
);
assert_eq!(
t(r"\p{gc=Separator}"),
hir_uclass_query(ClassQuery::Binary("Z"))
);
assert_eq!(
t(r"\p{Other}"),
hir_uclass_query(ClassQuery::Binary("Other"))
);
assert_eq!(t(r"\pC"), hir_uclass_query(ClassQuery::Binary("Other")));
assert_eq!(
t(r"\PZ"),
hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
);
assert_eq!(
t(r"\P{separator}"),
hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
);
assert_eq!(
t(r"\P{gc!=separator}"),
hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
);
assert_eq!(t(r"\p{any}"), hir_uclass_query(ClassQuery::Binary("Any")));
assert_eq!(
t(r"\p{assigned}"),
hir_uclass_query(ClassQuery::Binary("Assigned"))
);
assert_eq!(
t(r"\p{ascii}"),
hir_uclass_query(ClassQuery::Binary("ASCII"))
);
assert_eq!(
t(r"\p{gc:any}"),
hir_uclass_query(ClassQuery::Binary("Any"))
);
assert_eq!(
t(r"\p{gc:assigned}"),
hir_uclass_query(ClassQuery::Binary("Assigned"))
);
assert_eq!(
t(r"\p{gc:ascii}"),
hir_uclass_query(ClassQuery::Binary("ASCII"))
);
assert_eq!(
t_err(r"(?-u)\pZ"),
TestError {
kind: hir::ErrorKind::UnicodeNotAllowed,
span: Span::new(
Position::new(5, 1, 6),
Position::new(8, 1, 9)
),
}
);
assert_eq!(
t_err(r"(?-u)\p{Separator}"),
TestError {
kind: hir::ErrorKind::UnicodeNotAllowed,
span: Span::new(
Position::new(5, 1, 6),
Position::new(18, 1, 19)
),
}
);
assert_eq!(
t_err(r"\pE"),
TestError {
kind: hir::ErrorKind::UnicodePropertyNotFound,
span: Span::new(
Position::new(0, 1, 1),
Position::new(3, 1, 4)
),
}
);
assert_eq!(
t_err(r"\p{Foo}"),
TestError {
kind: hir::ErrorKind::UnicodePropertyNotFound,
span: Span::new(
Position::new(0, 1, 1),
Position::new(7, 1, 8)
),
}
);
assert_eq!(
t_err(r"\p{gc:Foo}"),
TestError {
kind: hir::ErrorKind::UnicodePropertyValueNotFound,
span: Span::new(
Position::new(0, 1, 1),
Position::new(10, 1, 11)
),
}
);
}
#[test]
#[cfg(not(feature = "unicode-gencat"))]
fn class_unicode_gencat_disabled() {
assert_eq!(
t_err(r"\p{Separator}"),
TestError {
kind: hir::ErrorKind::UnicodePropertyNotFound,
span: Span::new(
Position::new(0, 1, 1),
Position::new(13, 1, 14)
),
}
);
assert_eq!(
t_err(r"\p{Any}"),
TestError {
kind: hir::ErrorKind::UnicodePropertyNotFound,
span: Span::new(
Position::new(0, 1, 1),
Position::new(7, 1, 8)
),
}
);
}
#[test]
#[cfg(feature = "unicode-script")]
fn class_unicode_script() {
assert_eq!(
t(r"\p{Greek}"),
hir_uclass_query(ClassQuery::Binary("Greek"))
);
#[cfg(feature = "unicode-case")]
assert_eq!(
t(r"(?i)\p{Greek}"),
hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek")))
);
#[cfg(feature = "unicode-case")]
assert_eq!(
t(r"(?i)\P{Greek}"),
hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
"Greek"
))))
);
assert_eq!(
t_err(r"\p{sc:Foo}"),
TestError {
kind: hir::ErrorKind::UnicodePropertyValueNotFound,
span: Span::new(
Position::new(0, 1, 1),
Position::new(10, 1, 11)
),
}
);
assert_eq!(
t_err(r"\p{scx:Foo}"),
TestError {
kind: hir::ErrorKind::UnicodePropertyValueNotFound,
span: Span::new(
Position::new(0, 1, 1),
Position::new(11, 1, 12)
),
}
);
}
#[test]
#[cfg(not(feature = "unicode-script"))]
fn class_unicode_script_disabled() {
assert_eq!(
t_err(r"\p{Greek}"),
TestError {
kind: hir::ErrorKind::UnicodePropertyNotFound,
span: Span::new(
Position::new(0, 1, 1),
Position::new(9, 1, 10)
),
}
);
assert_eq!(
t_err(r"\p{scx:Greek}"),
TestError {
kind: hir::ErrorKind::UnicodePropertyNotFound,
span: Span::new(
Position::new(0, 1, 1),
Position::new(13, 1, 14)
),
}
);
}
#[test]
#[cfg(feature = "unicode-age")]
fn class_unicode_age() {
assert_eq!(
t_err(r"\p{age:Foo}"),
TestError {
kind: hir::ErrorKind::UnicodePropertyValueNotFound,
span: Span::new(
Position::new(0, 1, 1),
Position::new(11, 1, 12)
),
}
);
}
#[test]
#[cfg(not(feature = "unicode-age"))]
fn class_unicode_age_disabled() {
assert_eq!(
t_err(r"\p{age:3.0}"),
TestError {
kind: hir::ErrorKind::UnicodePropertyNotFound,
span: Span::new(
Position::new(0, 1, 1),
Position::new(11, 1, 12)
),
}
);
}
#[test]
fn class_bracketed() {
assert_eq!(t("[a]"), hir_uclass(&[('a', 'a')]));
assert_eq!(t("[^[a]]"), hir_negate(hir_uclass(&[('a', 'a')])));
assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')]));
assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')]));
assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')]));
assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')]));
assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')]));
assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')]));
#[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit")));
#[cfg(feature = "unicode-gencat")]
assert_eq!(
t(r"[\pZ]"),
hir_uclass_query(ClassQuery::Binary("separator"))
);
#[cfg(feature = "unicode-gencat")]
assert_eq!(
t(r"[\p{separator}]"),
hir_uclass_query(ClassQuery::Binary("separator"))
);
#[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit")));
#[cfg(feature = "unicode-gencat")]
assert_eq!(
t(r"[^\PZ]"),
hir_uclass_query(ClassQuery::Binary("separator"))
);
#[cfg(feature = "unicode-gencat")]
assert_eq!(
t(r"[^\P{separator}]"),
hir_uclass_query(ClassQuery::Binary("separator"))
);
#[cfg(all(
feature = "unicode-case",
any(feature = "unicode-perl", feature = "unicode-gencat")
))]
assert_eq!(
t(r"(?i)[^\D]"),
hir_uclass_query(ClassQuery::Binary("digit"))
);
#[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
assert_eq!(
t(r"(?i)[^\P{greek}]"),
hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek")))
);
assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')]));
assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')]));
assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')]));
#[cfg(feature = "unicode-case")]
assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
#[cfg(feature = "unicode-case")]
assert_eq!(
t("(?i)[k]"),
hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),])
);
#[cfg(feature = "unicode-case")]
assert_eq!(
t("(?i)[β]"),
hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
);
assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),]));
assert_eq!(t("[^a]"), hir_negate(hir_uclass(&[('a', 'a')])));
assert_eq!(t(r"[^\x00]"), hir_negate(hir_uclass(&[('\0', '\0')])));
assert_eq!(
t_bytes("(?-u)[^a]"),
hir_negate(hir_bclass(&[(b'a', b'a')]))
);
#[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
assert_eq!(
t(r"[^\d]"),
hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
);
#[cfg(feature = "unicode-gencat")]
assert_eq!(
t(r"[^\pZ]"),
hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
);
#[cfg(feature = "unicode-gencat")]
assert_eq!(
t(r"[^\p{separator}]"),
hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
);
#[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
assert_eq!(
t(r"(?i)[^\p{greek}]"),
hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
"greek"
))))
);
#[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
assert_eq!(
t(r"(?i)[\P{greek}]"),
hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
"greek"
))))
);
// Test some weird cases.
assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')]));
assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')]));
assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')]));
assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')]));
assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')]));
assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')]));
assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')]));
assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')]));
assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')]));
assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')]));
assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')]));
assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')]));
assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')]));
assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')]));
assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')]));
assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')]));
assert_eq!(
t_err("(?-u)[^a]"),
TestError {
kind: hir::ErrorKind::InvalidUtf8,
span: Span::new(
Position::new(5, 1, 6),
Position::new(9, 1, 10)
),
}
);
#[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
assert_eq!(
t_err(r"[^\s\S]"),
TestError {
kind: hir::ErrorKind::EmptyClassNotAllowed,
span: Span::new(
Position::new(0, 1, 1),
Position::new(7, 1, 8)
),
}
);
#[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
assert_eq!(
t_err(r"(?-u)[^\s\S]"),
TestError {
kind: hir::ErrorKind::EmptyClassNotAllowed,
span: Span::new(
Position::new(5, 1, 6),