| // Copyright 2014 The Rust Project Developers. See the COPYRIGHT |
| // file at the top-level directory of this distribution and at |
| // http://rust-lang.org/COPYRIGHT. |
| // |
| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| // option. This file may not be copied, modified, or distributed |
| // except according to those terms. |
| |
| //! This crate provides the `regex!` macro. Its use is documented in the |
| //! `regex` crate. |
| |
| #![crate_id = "regex_macros#0.11.0-pre"] |
| #![crate_type = "dylib"] |
| #![experimental] |
| #![license = "MIT/ASL2"] |
| #![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png", |
| html_favicon_url = "http://www.rust-lang.org/favicon.ico", |
| html_root_url = "http://doc.rust-lang.org/")] |
| |
| #![feature(plugin_registrar, managed_boxes, quote)] |
| |
| extern crate regex; |
| extern crate syntax; |
| extern crate rustc; |
| |
| use std::rc::Rc; |
| use std::gc::{Gc, GC}; |
| |
| use syntax::ast; |
| use syntax::codemap; |
| use syntax::ext::build::AstBuilder; |
| use syntax::ext::base::{ExtCtxt, MacResult, MacExpr, DummyResult}; |
| use syntax::parse; |
| use syntax::parse::token; |
| use syntax::print::pprust; |
| |
| use rustc::plugin::Registry; |
| |
| use regex::Regex; |
| use regex::native::{ |
| OneChar, CharClass, Any, Save, Jump, Split, |
| Match, EmptyBegin, EmptyEnd, EmptyWordBoundary, |
| Program, Dynamic, Native, |
| FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED, |
| }; |
| |
| /// For the `regex!` syntax extension. Do not use. |
| #[plugin_registrar] |
| #[doc(hidden)] |
| pub fn plugin_registrar(reg: &mut Registry) { |
| reg.register_macro("regex", native); |
| } |
| |
| /// Generates specialized code for the Pike VM for a particular regular |
| /// expression. |
| /// |
| /// There are two primary differences between the code generated here and the |
| /// general code in vm.rs. |
| /// |
| /// 1. All heap allocation is removed. Sized vector types are used instead. |
| /// Care must be taken to make sure that these vectors are not copied |
| /// gratuitously. (If you're not sure, run the benchmarks. They will yell |
| /// at you if you do.) |
| /// 2. The main `match instruction { ... }` expressions are replaced with more |
| /// direct `match pc { ... }`. The generators can be found in |
| /// `step_insts` and `add_insts`. |
| /// |
| /// Other more minor changes include eliding code when possible (although this |
| /// isn't completely thorough at the moment), and translating character class |
| /// matching from using a binary search to a simple `match` expression (see |
| /// `match_class`). |
| /// |
| /// It is strongly recommended to read the dynamic implementation in vm.rs |
| /// first before trying to understand the code generator. The implementation |
| /// strategy is identical and vm.rs has comments and will be easier to follow. |
| #[allow(experimental)] |
| fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree]) |
| -> Box<MacResult> { |
| let regex = match parse(cx, tts) { |
| Some(r) => r, |
| // error is logged in 'parse' with cx.span_err |
| None => return DummyResult::any(sp), |
| }; |
| let re = match Regex::new(regex.as_slice()) { |
| Ok(re) => re, |
| Err(err) => { |
| cx.span_err(sp, err.to_str().as_slice()); |
| return DummyResult::any(sp) |
| } |
| }; |
| let prog = match re { |
| Dynamic(Dynamic { ref prog, .. }) => prog.clone(), |
| Native(_) => unreachable!(), |
| }; |
| |
| let mut gen = NfaGen { |
| cx: &*cx, sp: sp, prog: prog, |
| names: re.names_iter().collect(), original: re.as_str().to_string(), |
| }; |
| MacExpr::new(gen.code()) |
| } |
| |
| struct NfaGen<'a> { |
| cx: &'a ExtCtxt<'a>, |
| sp: codemap::Span, |
| prog: Program, |
| names: Vec<Option<String>>, |
| original: String, |
| } |
| |
| impl<'a> NfaGen<'a> { |
| fn code(&mut self) -> Gc<ast::Expr> { |
| // Most or all of the following things are used in the quasiquoted |
| // expression returned. |
| let num_cap_locs = 2 * self.prog.num_captures(); |
| let num_insts = self.prog.insts.len(); |
| let cap_names = self.vec_expr(self.names.as_slice().iter(), |
| |cx, name| match *name { |
| Some(ref name) => { |
| let name = name.as_slice(); |
| quote_expr!(cx, Some($name)) |
| } |
| None => cx.expr_none(self.sp), |
| } |
| ); |
| let prefix_anchor = |
| match self.prog.insts.as_slice()[1] { |
| EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true, |
| _ => false, |
| }; |
| let init_groups = self.vec_expr(range(0, num_cap_locs), |
| |cx, _| cx.expr_none(self.sp)); |
| |
| let prefix_lit = Rc::new(Vec::from_slice(self.prog.prefix.as_slice().as_bytes())); |
| let prefix_bytes = self.cx.expr_lit(self.sp, ast::LitBinary(prefix_lit)); |
| |
| let check_prefix = self.check_prefix(); |
| let step_insts = self.step_insts(); |
| let add_insts = self.add_insts(); |
| let regex = self.original.as_slice(); |
| |
| quote_expr!(self.cx, { |
| // When `regex!` is bound to a name that is not used, we have to make sure |
| // that dead_code warnings don't bubble up to the user from the generated |
| // code. Therefore, we suppress them by allowing dead_code. The effect is that |
| // the user is only warned about *their* unused variable/code, and not the |
| // unused code generated by regex!. See #14185 for an example. |
| #[allow(dead_code)] |
| static CAP_NAMES: &'static [Option<&'static str>] = &$cap_names; |
| |
| #[allow(dead_code)] |
| fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, |
| start: uint, end: uint) -> Vec<Option<uint>> { |
| #![allow(unused_imports)] |
| #![allow(unused_mut)] |
| |
| use regex::native::{ |
| MatchKind, Exists, Location, Submatches, |
| StepState, StepMatchEarlyReturn, StepMatch, StepContinue, |
| CharReader, find_prefix, |
| }; |
| |
| return Nfa { |
| which: which, |
| input: input, |
| ic: 0, |
| chars: CharReader::new(input), |
| }.run(start, end); |
| |
| type Captures = [Option<uint>, ..$num_cap_locs]; |
| |
| struct Nfa<'t> { |
| which: MatchKind, |
| input: &'t str, |
| ic: uint, |
| chars: CharReader<'t>, |
| } |
| |
| impl<'t> Nfa<'t> { |
| #[allow(unused_variable)] |
| fn run(&mut self, start: uint, end: uint) -> Vec<Option<uint>> { |
| let mut matched = false; |
| let prefix_bytes: &[u8] = $prefix_bytes; |
| let mut clist = &mut Threads::new(self.which); |
| let mut nlist = &mut Threads::new(self.which); |
| |
| let mut groups = $init_groups; |
| |
| self.ic = start; |
| let mut next_ic = self.chars.set(start); |
| while self.ic <= end { |
| if clist.size == 0 { |
| if matched { |
| break |
| } |
| $check_prefix |
| } |
| if clist.size == 0 || (!$prefix_anchor && !matched) { |
| self.add(clist, 0, &mut groups) |
| } |
| |
| self.ic = next_ic; |
| next_ic = self.chars.advance(); |
| |
| for i in range(0, clist.size) { |
| let pc = clist.pc(i); |
| let step_state = self.step(&mut groups, nlist, |
| clist.groups(i), pc); |
| match step_state { |
| StepMatchEarlyReturn => |
| return vec![Some(0u), Some(0u)], |
| StepMatch => { matched = true; break }, |
| StepContinue => {}, |
| } |
| } |
| ::std::mem::swap(&mut clist, &mut nlist); |
| nlist.empty(); |
| } |
| match self.which { |
| Exists if matched => vec![Some(0u), Some(0u)], |
| Exists => vec![None, None], |
| Location | Submatches => groups.iter().map(|x| *x).collect(), |
| } |
| } |
| |
| // Sometimes `nlist` is never used (for empty regexes). |
| #[allow(unused_variable)] |
| #[inline] |
| fn step(&self, groups: &mut Captures, nlist: &mut Threads, |
| caps: &mut Captures, pc: uint) -> StepState { |
| $step_insts |
| StepContinue |
| } |
| |
| fn add(&self, nlist: &mut Threads, pc: uint, |
| groups: &mut Captures) { |
| if nlist.contains(pc) { |
| return |
| } |
| $add_insts |
| } |
| } |
| |
| struct Thread { |
| pc: uint, |
| groups: Captures, |
| } |
| |
| struct Threads { |
| which: MatchKind, |
| queue: [Thread, ..$num_insts], |
| sparse: [uint, ..$num_insts], |
| size: uint, |
| } |
| |
| impl Threads { |
| fn new(which: MatchKind) -> Threads { |
| Threads { |
| which: which, |
| // These unsafe blocks are used for performance reasons, as it |
| // gives us a zero-cost initialization of a sparse set. The |
| // trick is described in more detail here: |
| // http://research.swtch.com/sparse |
| // The idea here is to avoid initializing threads that never |
| // need to be initialized, particularly for larger regexs with |
| // a lot of instructions. |
| queue: unsafe { ::std::mem::uninitialized() }, |
| sparse: unsafe { ::std::mem::uninitialized() }, |
| size: 0, |
| } |
| } |
| |
| #[inline] |
| fn add(&mut self, pc: uint, groups: &Captures) { |
| let t = &mut self.queue[self.size]; |
| t.pc = pc; |
| match self.which { |
| Exists => {}, |
| Location => { |
| t.groups[0] = groups[0]; |
| t.groups[1] = groups[1]; |
| } |
| Submatches => { |
| for (slot, val) in t.groups.mut_iter().zip(groups.iter()) { |
| *slot = *val; |
| } |
| } |
| } |
| self.sparse[pc] = self.size; |
| self.size += 1; |
| } |
| |
| #[inline] |
| fn add_empty(&mut self, pc: uint) { |
| self.queue[self.size].pc = pc; |
| self.sparse[pc] = self.size; |
| self.size += 1; |
| } |
| |
| #[inline] |
| fn contains(&self, pc: uint) -> bool { |
| let s = self.sparse[pc]; |
| s < self.size && self.queue[s].pc == pc |
| } |
| |
| #[inline] |
| fn empty(&mut self) { |
| self.size = 0; |
| } |
| |
| #[inline] |
| fn pc(&self, i: uint) -> uint { |
| self.queue[i].pc |
| } |
| |
| #[inline] |
| fn groups<'r>(&'r mut self, i: uint) -> &'r mut Captures { |
| &'r mut self.queue[i].groups |
| } |
| } |
| } |
| |
| ::regex::native::Native(::regex::native::Native { |
| original: $regex, |
| names: CAP_NAMES, |
| prog: exec, |
| }) |
| }) |
| } |
| |
| // Generates code for the `add` method, which is responsible for adding |
| // zero-width states to the next queue of states to visit. |
| fn add_insts(&self) -> Gc<ast::Expr> { |
| let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| { |
| let nextpc = pc + 1; |
| let body = match *inst { |
| EmptyBegin(flags) => { |
| let cond = |
| if flags & FLAG_MULTI > 0 { |
| quote_expr!(self.cx, |
| self.chars.is_begin() |
| || self.chars.prev == Some('\n') |
| ) |
| } else { |
| quote_expr!(self.cx, self.chars.is_begin()) |
| }; |
| quote_expr!(self.cx, { |
| nlist.add_empty($pc); |
| if $cond { self.add(nlist, $nextpc, &mut *groups) } |
| }) |
| } |
| EmptyEnd(flags) => { |
| let cond = |
| if flags & FLAG_MULTI > 0 { |
| quote_expr!(self.cx, |
| self.chars.is_end() |
| || self.chars.cur == Some('\n') |
| ) |
| } else { |
| quote_expr!(self.cx, self.chars.is_end()) |
| }; |
| quote_expr!(self.cx, { |
| nlist.add_empty($pc); |
| if $cond { self.add(nlist, $nextpc, &mut *groups) } |
| }) |
| } |
| EmptyWordBoundary(flags) => { |
| let cond = |
| if flags & FLAG_NEGATED > 0 { |
| quote_expr!(self.cx, !self.chars.is_word_boundary()) |
| } else { |
| quote_expr!(self.cx, self.chars.is_word_boundary()) |
| }; |
| quote_expr!(self.cx, { |
| nlist.add_empty($pc); |
| if $cond { self.add(nlist, $nextpc, &mut *groups) } |
| }) |
| } |
| Save(slot) => { |
| let save = quote_expr!(self.cx, { |
| let old = groups[$slot]; |
| groups[$slot] = Some(self.ic); |
| self.add(nlist, $nextpc, &mut *groups); |
| groups[$slot] = old; |
| }); |
| let add = quote_expr!(self.cx, { |
| self.add(nlist, $nextpc, &mut *groups); |
| }); |
| // If this is saving a submatch location but we request |
| // existence or only full match location, then we can skip |
| // right over it every time. |
| if slot > 1 { |
| quote_expr!(self.cx, { |
| nlist.add_empty($pc); |
| match self.which { |
| Submatches => $save, |
| Exists | Location => $add, |
| } |
| }) |
| } else { |
| quote_expr!(self.cx, { |
| nlist.add_empty($pc); |
| match self.which { |
| Submatches | Location => $save, |
| Exists => $add, |
| } |
| }) |
| } |
| } |
| Jump(to) => { |
| quote_expr!(self.cx, { |
| nlist.add_empty($pc); |
| self.add(nlist, $to, &mut *groups); |
| }) |
| } |
| Split(x, y) => { |
| quote_expr!(self.cx, { |
| nlist.add_empty($pc); |
| self.add(nlist, $x, &mut *groups); |
| self.add(nlist, $y, &mut *groups); |
| }) |
| } |
| // For Match, OneChar, CharClass, Any |
| _ => quote_expr!(self.cx, nlist.add($pc, &*groups)), |
| }; |
| self.arm_inst(pc, body) |
| }).collect::<Vec<ast::Arm>>(); |
| |
| self.match_insts(arms) |
| } |
| |
| // Generates the code for the `step` method, which processes all states |
| // in the current queue that consume a single character. |
| fn step_insts(&self) -> Gc<ast::Expr> { |
| let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| { |
| let nextpc = pc + 1; |
| let body = match *inst { |
| Match => { |
| quote_expr!(self.cx, { |
| match self.which { |
| Exists => { |
| return StepMatchEarlyReturn |
| } |
| Location => { |
| groups[0] = caps[0]; |
| groups[1] = caps[1]; |
| return StepMatch |
| } |
| Submatches => { |
| for (slot, val) in groups.mut_iter().zip(caps.iter()) { |
| *slot = *val; |
| } |
| return StepMatch |
| } |
| } |
| }) |
| } |
| OneChar(c, flags) => { |
| if flags & FLAG_NOCASE > 0 { |
| let upc = c.to_uppercase(); |
| quote_expr!(self.cx, { |
| let upc = self.chars.prev.map(|c| c.to_uppercase()); |
| if upc == Some($upc) { |
| self.add(nlist, $nextpc, caps); |
| } |
| }) |
| } else { |
| quote_expr!(self.cx, { |
| if self.chars.prev == Some($c) { |
| self.add(nlist, $nextpc, caps); |
| } |
| }) |
| } |
| } |
| CharClass(ref ranges, flags) => { |
| let negate = flags & FLAG_NEGATED > 0; |
| let casei = flags & FLAG_NOCASE > 0; |
| let get_char = |
| if casei { |
| quote_expr!(self.cx, self.chars.prev.unwrap().to_uppercase()) |
| } else { |
| quote_expr!(self.cx, self.chars.prev.unwrap()) |
| }; |
| let negcond = |
| if negate { |
| quote_expr!(self.cx, !found) |
| } else { |
| quote_expr!(self.cx, found) |
| }; |
| let mranges = self.match_class(casei, ranges.as_slice()); |
| quote_expr!(self.cx, { |
| if self.chars.prev.is_some() { |
| let c = $get_char; |
| let found = $mranges; |
| if $negcond { |
| self.add(nlist, $nextpc, caps); |
| } |
| } |
| }) |
| } |
| Any(flags) => { |
| if flags & FLAG_DOTNL > 0 { |
| quote_expr!(self.cx, self.add(nlist, $nextpc, caps)) |
| } else { |
| quote_expr!(self.cx, { |
| if self.chars.prev != Some('\n') { |
| self.add(nlist, $nextpc, caps) |
| } |
| () |
| }) |
| } |
| } |
| // EmptyBegin, EmptyEnd, EmptyWordBoundary, Save, Jump, Split |
| _ => self.empty_block(), |
| }; |
| self.arm_inst(pc, body) |
| }).collect::<Vec<ast::Arm>>(); |
| |
| self.match_insts(arms) |
| } |
| |
| // Translates a character class into a match expression. |
| // This avoids a binary search (and is hopefully replaced by a jump |
| // table). |
| fn match_class(&self, casei: bool, ranges: &[(char, char)]) -> Gc<ast::Expr> { |
| let expr_true = quote_expr!(self.cx, true); |
| |
| let mut arms = ranges.iter().map(|&(mut start, mut end)| { |
| if casei { |
| start = start.to_uppercase(); |
| end = end.to_uppercase(); |
| } |
| let pat = self.cx.pat(self.sp, ast::PatRange(quote_expr!(self.cx, $start), |
| quote_expr!(self.cx, $end))); |
| self.cx.arm(self.sp, vec!(pat), expr_true) |
| }).collect::<Vec<ast::Arm>>(); |
| |
| arms.push(self.wild_arm_expr(quote_expr!(self.cx, false))); |
| |
| let match_on = quote_expr!(self.cx, c); |
| self.cx.expr_match(self.sp, match_on, arms) |
| } |
| |
| // Generates code for checking a literal prefix of the search string. |
| // The code is only generated if the regex *has* a literal prefix. |
| // Otherwise, a no-op is returned. |
| fn check_prefix(&self) -> Gc<ast::Expr> { |
| if self.prog.prefix.len() == 0 { |
| self.empty_block() |
| } else { |
| quote_expr!(self.cx, |
| if clist.size == 0 { |
| let haystack = self.input.as_bytes().slice_from(self.ic); |
| match find_prefix(prefix_bytes, haystack) { |
| None => break, |
| Some(i) => { |
| self.ic += i; |
| next_ic = self.chars.set(self.ic); |
| } |
| } |
| } |
| ) |
| } |
| } |
| |
| // Builds a `match pc { ... }` expression from a list of arms, specifically |
| // for matching the current program counter with an instruction. |
| // A wild-card arm is automatically added that executes a no-op. It will |
| // never be used, but is added to satisfy the compiler complaining about |
| // non-exhaustive patterns. |
| fn match_insts(&self, mut arms: Vec<ast::Arm>) -> Gc<ast::Expr> { |
| arms.push(self.wild_arm_expr(self.empty_block())); |
| self.cx.expr_match(self.sp, quote_expr!(self.cx, pc), arms) |
| } |
| |
| fn empty_block(&self) -> Gc<ast::Expr> { |
| quote_expr!(self.cx, {}) |
| } |
| |
| // Creates a match arm for the instruction at `pc` with the expression |
| // `body`. |
| fn arm_inst(&self, pc: uint, body: Gc<ast::Expr>) -> ast::Arm { |
| let pc_pat = self.cx.pat_lit(self.sp, quote_expr!(self.cx, $pc)); |
| |
| self.cx.arm(self.sp, vec!(pc_pat), body) |
| } |
| |
| // Creates a wild-card match arm with the expression `body`. |
| fn wild_arm_expr(&self, body: Gc<ast::Expr>) -> ast::Arm { |
| ast::Arm { |
| attrs: vec!(), |
| pats: vec!(box(GC) ast::Pat{ |
| id: ast::DUMMY_NODE_ID, |
| span: self.sp, |
| node: ast::PatWild, |
| }), |
| guard: None, |
| body: body, |
| } |
| } |
| |
| |
| // Converts `xs` to a `[x1, x2, .., xN]` expression by calling `to_expr` |
| // on each element in `xs`. |
| fn vec_expr<T, It: Iterator<T>>(&self, xs: It, |
| to_expr: |&ExtCtxt, T| -> Gc<ast::Expr>) |
| -> Gc<ast::Expr> { |
| let exprs = xs.map(|x| to_expr(self.cx, x)).collect(); |
| self.cx.expr_vec(self.sp, exprs) |
| } |
| } |
| |
| /// Looks for a single string literal and returns it. |
| /// Otherwise, logs an error with cx.span_err and returns None. |
| fn parse(cx: &mut ExtCtxt, tts: &[ast::TokenTree]) -> Option<String> { |
| let mut parser = parse::new_parser_from_tts(cx.parse_sess(), cx.cfg(), |
| Vec::from_slice(tts)); |
| let entry = cx.expand_expr(parser.parse_expr()); |
| let regex = match entry.node { |
| ast::ExprLit(lit) => { |
| match lit.node { |
| ast::LitStr(ref s, _) => s.to_str(), |
| _ => { |
| cx.span_err(entry.span, format!( |
| "expected string literal but got `{}`", |
| pprust::lit_to_str(lit)).as_slice()); |
| return None |
| } |
| } |
| } |
| _ => { |
| cx.span_err(entry.span, format!( |
| "expected string literal but got `{}`", |
| pprust::expr_to_str(entry)).as_slice()); |
| return None |
| } |
| }; |
| if !parser.eat(&token::EOF) { |
| cx.span_err(parser.span, "only one string literal allowed"); |
| return None; |
| } |
| Some(regex) |
| } |