//! This is an NFA-based parser, which calls out to the main Rust parser for named non-terminals
//! (which it commits to fully when it hits one in a grammar). There's a set of current NFA threads
//! and a set of next ones. Instead of NTs, we have a special case for Kleene star. The big-O, in
//! pathological cases, is worse than traditional use of NFA or Earley parsing, but it's an easier
//! fit for Macro-by-Example-style rules.
//!
//! (In order to prevent the pathological case, we'd need to lazily construct the resulting
//! `NamedMatch`es at the very end. It'd be a pain, and require more memory to keep around old
//! matcher positions, but it would also save overhead)
//!
//! We don't say this parser uses the Earley algorithm, because it's unnecessarily inaccurate.
//! The macro parser restricts itself to the features of finite state automata. Earley parsers
//! can be described as an extension of NFAs with completion rules, prediction rules, and recursion.
//!
//! Quick intro to how the parser works:
//!
//! A "matcher position" (a.k.a. "position" or "mp") is a dot in the middle of a matcher, usually
//! written as a `·`. For example `· a $( a )* a b` is one, as is `a $( · a )* a b`.
//!
//! The parser walks through the input a token at a time, maintaining a list
//! of threads consistent with the current position in the input string: `cur_mps`.
//!
//! As it processes them, it fills up `eof_mps` with threads that would be valid if
//! the macro invocation is now over, `bb_mps` with threads that are waiting on
//! a Rust non-terminal like `$e:expr`, and `next_mps` with threads that are waiting
//! on a particular token. Most of the logic concerns moving the · through the
//! repetitions indicated by Kleene stars. The rules for moving the · without
//! consuming any input are called epsilon transitions. The parser only advances over the input
//! or calls out to the real Rust parser when no `cur_mps` threads remain.
//!
//! Example:
//!
//! ```text, ignore
//! Start parsing a a a a b against [· a $( a )* a b].
//!
//! Remaining input: a a a a b
//! next: [· a $( a )* a b]
//!
//! - - - Advance over an a. - - -
//!
//! Remaining input: a a a b
//! cur: [a · $( a )* a b]
//! Descend/Skip (first position).
//! next: [a $( · a )* a b] [a $( a )* · a b].
//!
//! - - - Advance over an a. - - -
//!
//! Remaining input: a a b
//! cur: [a $( a · )* a b] [a $( a )* a · b]
//! Follow epsilon transition: Finish/Repeat (first position)
//! next: [a $( a )* · a b] [a $( · a )* a b] [a $( a )* a · b]
//!
//! - - - Advance over an a. - - - (this looks exactly like the last step)
//!
//! Remaining input: a b
//! cur: [a $( a · )* a b] [a $( a )* a · b]
//! Follow epsilon transition: Finish/Repeat (first position)
//! next: [a $( a )* · a b] [a $( · a )* a b] [a $( a )* a · b]
//!
//! - - - Advance over an a. - - - (this looks exactly like the last step)
//!
//! Remaining input: b
//! cur: [a $( a · )* a b] [a $( a )* a · b]
//! Follow epsilon transition: Finish/Repeat (first position)
//! next: [a $( a )* · a b] [a $( · a )* a b] [a $( a )* a · b]
//!
//! - - - Advance over a b. - - -
//!
//! Remaining input: ''
//! eof: [a $( a )* a b ·]
//! ```
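//!
//! As a concrete (if artificial) illustration, the matcher in the walkthrough corresponds to a
//! declarative macro like the following (the name `aab` is made up for this sketch):
//!
//! ```rust
//! macro_rules! aab {
//!     (a $( a )* a b) => {};
//! }
//!
//! aab!(a a a a b); // the input from the walkthrough; `$( a )*` matches two `a`s
//! ```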
crate use NamedMatch::*;
crate use ParseResult::*;
use crate::mbe::{KleeneOp, TokenTree};
use rustc_ast::token::{self, DocComment, Nonterminal, NonterminalKind, Token};
use rustc_parse::parser::{NtOrTt, Parser};
use rustc_session::parse::ParseSess;
use rustc_span::symbol::MacroRulesNormalizedIdent;
use rustc_span::Span;
use smallvec::{smallvec, SmallVec};
use rustc_data_structures::fx::FxHashMap;
use rustc_data_structures::sync::Lrc;
use rustc_span::symbol::Ident;
use std::borrow::Cow;
use std::collections::hash_map::Entry::{Occupied, Vacant};
// One element is enough to cover 95-99% of vectors for most benchmarks. Also, vectors longer than
// one frequently have many elements, not just two or three.
type NamedMatchVec = SmallVec<[NamedMatch; 1]>;
// This type is used a lot. Make sure it doesn't unintentionally get bigger.
#[cfg(all(target_arch = "x86_64", target_pointer_width = "64"))]
rustc_data_structures::static_assert_size!(NamedMatchVec, 48);
/// A unit within a matcher that a `MatcherPos` can refer to. Similar to (and derived from)
/// `mbe::TokenTree`, but designed specifically for fast and easy traversal during matching.
/// Notable differences to `mbe::TokenTree`:
/// - It is non-recursive, i.e. there is no nesting.
/// - The end pieces of each sequence (the separator, if present, and the Kleene op) are
/// represented explicitly, as is the very end of the matcher.
///
/// This means a matcher can be represented by `&[MatcherLoc]`, and traversal mostly involves
/// simply incrementing the current matcher position index by one.
pub(super) enum MatcherLoc {
Token {
token: Token,
},
Delimited,
Sequence {
op: KleeneOp,
num_metavar_decls: usize,
idx_first_after: usize,
next_metavar: usize,
seq_depth: usize,
},
SequenceKleeneOpNoSep {
op: KleeneOp,
idx_first: usize,
},
SequenceSep {
separator: Token,
},
SequenceKleeneOpAfterSep {
idx_first: usize,
},
MetaVarDecl {
span: Span,
bind: Ident,
kind: Option<NonterminalKind>,
next_metavar: usize,
seq_depth: usize,
},
Eof,
}
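/// Converts a `&[TokenTree]` matcher into the flat `Vec<MatcherLoc>` form used during matching.
///
/// As a rough sketch (field values follow the algorithm below), a matcher like `$( a ),+ b`
/// flattens to something like:
///
/// ```text
/// 0: Sequence { op: OneOrMore, num_metavar_decls: 0, idx_first_after: 4, next_metavar: 0, seq_depth: 0 }
/// 1: Token { a }
/// 2: SequenceSep { separator: `,` }
/// 3: SequenceKleeneOpAfterSep { idx_first: 1 }
/// 4: Token { b }
/// 5: Eof
/// ```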
pub(super) fn compute_locs(sess: &ParseSess, matcher: &[TokenTree]) -> Vec<MatcherLoc> {
fn inner(
sess: &ParseSess,
tts: &[TokenTree],
locs: &mut Vec<MatcherLoc>,
next_metavar: &mut usize,
seq_depth: usize,
) {
for tt in tts {
match tt {
TokenTree::Token(token) => {
locs.push(MatcherLoc::Token { token: token.clone() });
}
TokenTree::Delimited(_, delimited) => {
locs.push(MatcherLoc::Delimited);
inner(sess, &delimited.all_tts, locs, next_metavar, seq_depth);
}
TokenTree::Sequence(_, seq) => {
// We can't determine `idx_first_after` and construct the final
// `MatcherLoc::Sequence` until after `inner()` is called and the sequence end
// pieces are processed. So we push a dummy value (`Eof` is cheapest to
// construct) now, and overwrite it with the proper value below.
let dummy = MatcherLoc::Eof;
locs.push(dummy);
let next_metavar_orig = *next_metavar;
let op = seq.kleene.op;
let idx_first = locs.len();
let idx_seq = idx_first - 1;
inner(sess, &seq.tts, locs, next_metavar, seq_depth + 1);
if let Some(separator) = &seq.separator {
locs.push(MatcherLoc::SequenceSep { separator: separator.clone() });
locs.push(MatcherLoc::SequenceKleeneOpAfterSep { idx_first });
} else {
locs.push(MatcherLoc::SequenceKleeneOpNoSep { op, idx_first });
}
// Overwrite the dummy value pushed above with the proper value.
locs[idx_seq] = MatcherLoc::Sequence {
op,
num_metavar_decls: seq.num_captures,
idx_first_after: locs.len(),
next_metavar: next_metavar_orig,
seq_depth,
};
}
&TokenTree::MetaVarDecl(span, bind, kind) => {
locs.push(MatcherLoc::MetaVarDecl {
span,
bind,
kind,
next_metavar: *next_metavar,
seq_depth,
});
*next_metavar += 1;
}
TokenTree::MetaVar(..) | TokenTree::MetaVarExpr(..) => unreachable!(),
}
}
}
let mut locs = vec![];
let mut next_metavar = 0;
inner(sess, matcher, &mut locs, &mut next_metavar, /* seq_depth */ 0);
// A final entry is needed for eof.
locs.push(MatcherLoc::Eof);
locs
}
/// A single matcher position, representing the state of matching.
struct MatcherPos {
/// The index into `TtParser::locs`, which represents the "dot".
idx: usize,
/// The matches made against metavar decls so far. On a successful match, this vector ends up
/// with one element per metavar decl in the matcher. Each element records token trees matched
/// against the relevant metavar by the black box parser. An element will be a `MatchedSeq` if
/// the corresponding metavar decl is within a sequence.
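///
/// For example (a sketch), matching the matcher `$a:ident $( $b:ident )*` against the input
/// `x y z` ends up with roughly
/// `[MatchedNonterminal(x), MatchedSeq([MatchedNonterminal(y), MatchedNonterminal(z)])]` here.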
matches: Lrc<NamedMatchVec>,
}
// This type is used a lot. Make sure it doesn't unintentionally get bigger.
#[cfg(all(target_arch = "x86_64", target_pointer_width = "64"))]
rustc_data_structures::static_assert_size!(MatcherPos, 16);
impl MatcherPos {
/// Adds `m` as a named match for the `metavar_idx`-th metavar. There are only two call sites,
/// and both are hot enough that inlining is always worthwhile.
#[inline(always)]
fn push_match(&mut self, metavar_idx: usize, seq_depth: usize, m: NamedMatch) {
let matches = Lrc::make_mut(&mut self.matches);
match seq_depth {
0 => {
// We are not within a sequence. Just append `m`.
assert_eq!(metavar_idx, matches.len());
matches.push(m);
}
_ => {
// We are within a sequence. Find the final `MatchedSeq` at the appropriate depth
// and append `m` to its vector.
let mut curr = &mut matches[metavar_idx];
for _ in 0..seq_depth - 1 {
match curr {
MatchedSeq(seq) => {
let seq = Lrc::make_mut(seq);
curr = seq.last_mut().unwrap();
}
_ => unreachable!(),
}
}
match curr {
MatchedSeq(seq) => {
let seq = Lrc::make_mut(seq);
seq.push(m);
}
_ => unreachable!(),
}
}
}
}
}
enum EofMatcherPositions {
None,
One(MatcherPos),
Multiple,
}
/// Represents the possible results of an attempted parse.
crate enum ParseResult<T> {
/// Parsed successfully.
Success(T),
/// Arm failed to match. If the token (the first parameter) is `token::Eof`, it indicates an
/// unexpected end of macro invocation. Otherwise, it indicates that no rules expected the
/// given token.
Failure(Token, &'static str),
/// Fatal error (malformed macro?). Abort compilation.
Error(rustc_span::Span, String),
ErrorReported,
}
/// A `ParseResult` where the `Success` variant contains a mapping of
/// `MacroRulesNormalizedIdent`s to `NamedMatch`es. This represents the mapping
/// of metavars to the token trees they bind to.
crate type NamedParseResult = ParseResult<FxHashMap<MacroRulesNormalizedIdent, NamedMatch>>;
/// Count how many metavar declarations are in `matcher`.
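///
/// For example (a sketch), `$( $i:ident ),* ; $e:expr` contains two metavar decls: `$i:ident`
/// inside the sequence (counted via the sequence's `num_captures`) and `$e:expr` outside it.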
pub(super) fn count_metavar_decls(matcher: &[TokenTree]) -> usize {
matcher
.iter()
.map(|tt| match tt {
TokenTree::MetaVarDecl(..) => 1,
TokenTree::Sequence(_, seq) => seq.num_captures,
TokenTree::Delimited(_, delim) => count_metavar_decls(delim.inner_tts()),
TokenTree::Token(..) => 0,
TokenTree::MetaVar(..) | TokenTree::MetaVarExpr(..) => unreachable!(),
})
.sum()
}
/// `NamedMatch` is a pattern-match result for a single metavar. All
/// `MatchedNonterminal`s in the `NamedMatch` have the same non-terminal type
/// (expr, item, etc).
///
/// The in-memory structure of a particular `NamedMatch` represents the match
/// that occurred when a particular subset of a matcher was applied to a
/// particular token tree.
///
/// The width of each `MatchedSeq` in the `NamedMatch`, and the identity of
/// the `MatchedNonterminal`s, will depend on the token tree it was applied
/// to: each `MatchedSeq` corresponds to a single repetition in the originating
/// token tree. The depth of the `NamedMatch` structure will therefore depend
/// only on the nesting depth of repetitions in the originating token tree it
/// was derived from.
///
/// In layperson's terms: `NamedMatch` will form a tree representing nested matches of a particular
/// meta variable. For example, if we are matching the following macro against the following
/// invocation...
///
/// ```rust
/// macro_rules! foo {
/// ($($($x:ident),+);+) => {}
/// }
///
/// foo!(a, b, c, d; a, b, c, d, e);
/// ```
///
/// Then, the tree will have the following shape:
///
/// ```text
/// MatchedSeq([
/// MatchedSeq([
/// MatchedNonterminal(a),
/// MatchedNonterminal(b),
/// MatchedNonterminal(c),
/// MatchedNonterminal(d),
/// ]),
/// MatchedSeq([
/// MatchedNonterminal(a),
/// MatchedNonterminal(b),
/// MatchedNonterminal(c),
/// MatchedNonterminal(d),
/// MatchedNonterminal(e),
/// ])
/// ])
/// ```
#[derive(Debug, Clone)]
crate enum NamedMatch {
MatchedSeq(Lrc<NamedMatchVec>),
// A metavar match of type `tt`.
MatchedTokenTree(rustc_ast::tokenstream::TokenTree),
// A metavar match of any type other than `tt`.
MatchedNonterminal(Lrc<Nonterminal>),
}
/// Performs a token equality check, ignoring syntax context (that is, an unhygienic comparison)
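///
/// For example, two `foo` identifier tokens created in different hygiene contexts compare equal
/// here, while `foo` vs. `bar` (or `+` vs. `-`) do not.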
fn token_name_eq(t1: &Token, t2: &Token) -> bool {
if let (Some((ident1, is_raw1)), Some((ident2, is_raw2))) = (t1.ident(), t2.ident()) {
ident1.name == ident2.name && is_raw1 == is_raw2
} else if let (Some(ident1), Some(ident2)) = (t1.lifetime(), t2.lifetime()) {
ident1.name == ident2.name
} else {
t1.kind == t2.kind
}
}
// Note: the vectors could be created and dropped within `parse_tt`, but to avoid excess
// allocations we have a single vector for each kind that is cleared and reused repeatedly.
pub struct TtParser {
macro_name: Ident,
/// The set of current mps to be processed. This should be empty by the end of a successful
/// execution of `parse_tt_inner`.
cur_mps: Vec<MatcherPos>,
/// The set of newly generated mps. These are used to replenish `cur_mps` in the function
/// `parse_tt`.
next_mps: Vec<MatcherPos>,
/// The set of mps that are waiting for the black-box parser.
bb_mps: Vec<MatcherPos>,
/// Pre-allocate an empty match array, so it can be cloned cheaply for macros with many rules
/// that have no metavars.
empty_matches: Lrc<NamedMatchVec>,
}
impl TtParser {
pub(super) fn new(macro_name: Ident) -> TtParser {
TtParser {
macro_name,
cur_mps: vec![],
next_mps: vec![],
bb_mps: vec![],
empty_matches: Lrc::new(smallvec![]),
}
}
/// Process the matcher positions of `cur_mps` until it is empty. In the process, this will
/// produce more mps in `next_mps` and `bb_mps`.
///
/// # Returns
///
/// `Some(result)` if everything is finished, `None` otherwise. Note that matches are tracked
/// in the mps that are generated.
fn parse_tt_inner(
&mut self,
sess: &ParseSess,
matcher: &[MatcherLoc],
token: &Token,
) -> Option<NamedParseResult> {
// Matcher positions that would be valid if the macro invocation was over now. Only
// modified if `token == Eof`.
let mut eof_mps = EofMatcherPositions::None;
while let Some(mut mp) = self.cur_mps.pop() {
match &matcher[mp.idx] {
MatcherLoc::Token { token: t } => {
// If it's a doc comment, we just ignore it and move on to the next tt in the
// matcher. This is a bug, but #95267 showed that existing programs rely on
// this behaviour, and changing it would require some care and a transition
// period.
//
// If the token matches, we can just advance the parser.
//
// Otherwise, this match has failed, there is nothing to do, and hopefully
// another mp in `cur_mps` will match.
if matches!(t, Token { kind: DocComment(..), .. }) {
mp.idx += 1;
self.cur_mps.push(mp);
} else if token_name_eq(&t, token) {
mp.idx += 1;
self.next_mps.push(mp);
}
}
MatcherLoc::Delimited => {
// Entering the delimiter is trivial.
mp.idx += 1;
self.cur_mps.push(mp);
}
&MatcherLoc::Sequence {
op,
num_metavar_decls,
idx_first_after,
next_metavar,
seq_depth,
} => {
// Install an empty vec for each metavar within the sequence.
for metavar_idx in next_metavar..next_metavar + num_metavar_decls {
mp.push_match(
metavar_idx,
seq_depth,
MatchedSeq(self.empty_matches.clone()),
);
}
if op == KleeneOp::ZeroOrMore || op == KleeneOp::ZeroOrOne {
// Try zero matches of this sequence, by skipping over it.
self.cur_mps.push(MatcherPos {
idx: idx_first_after,
matches: mp.matches.clone(), // a cheap clone
});
}
// Try one or more matches of this sequence, by entering it.
mp.idx += 1;
self.cur_mps.push(mp);
}
&MatcherLoc::SequenceKleeneOpNoSep { op, idx_first } => {
// We are past the end of a sequence with no separator. Try ending the
// sequence. If that's not possible, `ending_mp` will fail quietly when it is
// processed next time around the loop.
let ending_mp = MatcherPos {
idx: mp.idx + 1, // +1 skips the Kleene op
matches: mp.matches.clone(), // a cheap clone
};
self.cur_mps.push(ending_mp);
if op != KleeneOp::ZeroOrOne {
// Try another repetition.
mp.idx = idx_first;
self.cur_mps.push(mp);
}
}
MatcherLoc::SequenceSep { separator } => {
// We are past the end of a sequence with a separator but we haven't seen the
// separator yet. Try ending the sequence. If that's not possible, `ending_mp`
// will fail quietly when it is processed next time around the loop.
let ending_mp = MatcherPos {
idx: mp.idx + 2, // +2 skips the separator and the Kleene op
matches: mp.matches.clone(), // a cheap clone
};
self.cur_mps.push(ending_mp);
if token_name_eq(token, separator) {
// The separator matches the current token. Advance past it.
mp.idx += 1;
self.next_mps.push(mp);
}
}
&MatcherLoc::SequenceKleeneOpAfterSep { idx_first } => {
// We are past the sequence separator. This can't be a `?` Kleene op, because
// they don't permit separators. Try another repetition.
mp.idx = idx_first;
self.cur_mps.push(mp);
}
&MatcherLoc::MetaVarDecl { span, kind, .. } => {
// Built-in nonterminals never start with these tokens, so we can eliminate
// them from consideration. We use the span of the metavariable declaration
// to determine any edition-specific matching behavior for non-terminals.
if let Some(kind) = kind {
if Parser::nonterminal_may_begin_with(kind, token) {
self.bb_mps.push(mp);
}
} else {
// Both this check and the one in `nameize` are necessary, surprisingly.
if sess.missing_fragment_specifiers.borrow_mut().remove(&span).is_some() {
// E.g. `$e` instead of `$e:expr`.
return Some(Error(span, "missing fragment specifier".to_string()));
}
}
}
MatcherLoc::Eof => {
// We are past the matcher's end, and not in a sequence. Try to end things.
debug_assert_eq!(mp.idx, matcher.len() - 1);
if *token == token::Eof {
eof_mps = match eof_mps {
EofMatcherPositions::None => EofMatcherPositions::One(mp),
EofMatcherPositions::One(_) | EofMatcherPositions::Multiple => {
EofMatcherPositions::Multiple
}
}
}
}
}
}
// If we reached the end of input, check that there is EXACTLY ONE possible matcher.
// Otherwise, either the parse is ambiguous (which is an error) or there is a syntax error.
if *token == token::Eof {
Some(match eof_mps {
EofMatcherPositions::One(mut eof_mp) => {
// Need to take ownership of the matches from within the `Lrc`.
Lrc::make_mut(&mut eof_mp.matches);
let matches = Lrc::try_unwrap(eof_mp.matches).unwrap().into_iter();
self.nameize(sess, matcher, matches)
}
EofMatcherPositions::Multiple => {
Error(token.span, "ambiguity: multiple successful parses".to_string())
}
EofMatcherPositions::None => Failure(
Token::new(
token::Eof,
if token.span.is_dummy() { token.span } else { token.span.shrink_to_hi() },
),
"missing tokens in macro arguments",
),
})
} else {
None
}
}
/// Match the token stream from `parser` against `matcher`.
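///
/// A sketch of how a caller might drive this (the real call sites in `macro_rules` expansion
/// differ in detail; `sess`, `macro_name`, and `rule_matcher` are illustrative names):
///
/// ```ignore (illustrative)
/// let locs = compute_locs(&sess, rule_matcher);
/// let mut tt_parser = TtParser::new(macro_name);
/// match tt_parser.parse_tt(&mut Cow::Borrowed(&parser), &locs) {
///     Success(named_matches) => { /* use the metavar bindings for transcription */ }
///     Failure(token, msg) => { /* this rule didn't match; try the next rule */ }
///     Error(span, msg) => { /* malformed macro; report at `span` */ }
///     ErrorReported => { /* an error has already been emitted */ }
/// }
/// ```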
pub(super) fn parse_tt(
&mut self,
parser: &mut Cow<'_, Parser<'_>>,
matcher: &[MatcherLoc],
) -> NamedParseResult {
// A queue of possible matcher positions. We initialize it with the matcher position in
// which the "dot" is before the first token of the first token tree in `matcher`.
// `parse_tt_inner` then processes all of these possible matcher positions and produces
// possible next positions into `next_mps`. After some post-processing, the contents of
// `next_mps` replenish `cur_mps` and we start over again.
self.cur_mps.clear();
self.cur_mps.push(MatcherPos { idx: 0, matches: self.empty_matches.clone() });
loop {
self.next_mps.clear();
self.bb_mps.clear();
// Process `cur_mps` until either we have finished the input or we need to get some
// parsing from the black-box parser done.
if let Some(res) = self.parse_tt_inner(&parser.sess, matcher, &parser.token) {
return res;
}
// `parse_tt_inner` handled all of `cur_mps`, so it's empty.
assert!(self.cur_mps.is_empty());
// Error messages here could be improved with links to original rules.
match (self.next_mps.len(), self.bb_mps.len()) {
(0, 0) => {
// There are no possible next positions AND we aren't waiting for the black-box
// parser: syntax error.
return Failure(
parser.token.clone(),
"no rules expected this token in macro call",
);
}
(_, 0) => {
// Dump all possible `next_mps` into `cur_mps` for the next iteration. Then
// process the next token.
self.cur_mps.extend(self.next_mps.drain(..));
parser.to_mut().bump();
}
(0, 1) => {
// We need to call the black-box parser to get some nonterminal.
let mut mp = self.bb_mps.pop().unwrap();
let loc = &matcher[mp.idx];
if let &MatcherLoc::MetaVarDecl {
span,
kind: Some(kind),
next_metavar,
seq_depth,
..
} = loc
{
// We use the span of the metavariable declaration to determine any
// edition-specific matching behavior for non-terminals.
let nt = match parser.to_mut().parse_nonterminal(kind) {
Err(mut err) => {
err.span_label(
span,
format!(
"while parsing argument for this `{kind}` macro fragment"
),
)
.emit();
return ErrorReported;
}
Ok(nt) => nt,
};
let m = match nt {
NtOrTt::Nt(nt) => MatchedNonterminal(Lrc::new(nt)),
NtOrTt::Tt(tt) => MatchedTokenTree(tt),
};
mp.push_match(next_metavar, seq_depth, m);
mp.idx += 1;
} else {
unreachable!()
}
self.cur_mps.push(mp);
}
(_, _) => {
// Too many possibilities!
return self.ambiguity_error(matcher, parser.token.span);
}
}
assert!(!self.cur_mps.is_empty());
}
}
fn ambiguity_error(
&self,
matcher: &[MatcherLoc],
token_span: rustc_span::Span,
) -> NamedParseResult {
let nts = self
.bb_mps
.iter()
.map(|mp| match &matcher[mp.idx] {
MatcherLoc::MetaVarDecl { bind, kind: Some(kind), .. } => {
format!("{} ('{}')", kind, bind)
}
_ => unreachable!(),
})
.collect::<Vec<String>>()
.join(" or ");
Error(
token_span,
format!(
"local ambiguity when calling macro `{}`: multiple parsing options: {}",
self.macro_name,
match self.next_mps.len() {
0 => format!("built-in NTs {}.", nts),
1 => format!("built-in NTs {} or 1 other option.", nts),
n => format!("built-in NTs {} or {} other options.", nts, n),
}
),
)
}
fn nameize<I: Iterator<Item = NamedMatch>>(
&self,
sess: &ParseSess,
matcher: &[MatcherLoc],
mut res: I,
) -> NamedParseResult {
// Make sure that each metavar has _exactly one_ binding. If so, insert the binding into the
// `NamedParseResult`. Otherwise, it's an error.
let mut ret_val = FxHashMap::default();
for loc in matcher {
if let &MatcherLoc::MetaVarDecl { span, bind, kind, .. } = loc {
if kind.is_some() {
match ret_val.entry(MacroRulesNormalizedIdent::new(bind)) {
Vacant(spot) => spot.insert(res.next().unwrap()),
Occupied(..) => {
return Error(span, format!("duplicated bind name: {}", bind));
}
};
} else {
// Both this check and the one in `parse_tt_inner` are necessary, surprisingly.
if sess.missing_fragment_specifiers.borrow_mut().remove(&span).is_some() {
// E.g. `$e` instead of `$e:expr`.
return Error(span, "missing fragment specifier".to_string());
}
}
}
}
Success(ret_val)
}
}