Rollup merge of #72393 - Aaron1011:feature/rewrite-collect-tokens, r=petrochenkov

Rewrite `Parser::collect_tokens`

The previous implementation did not work when called on an opening
delimiter, or when called re-entrantly from the same `TokenCursor` stack
depth.

I'm not sure how to test this apart from https://github.com/rust-lang/rust/pull/72287
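
To illustrate the idea behind the fix, here is a minimal standalone sketch of the depth-based recording strategy (the `Tree` and `Cursor` types below are toys invented for illustration, not the real `Parser`/`TokenCursor` API): tokens are recorded only at the stack depth where collection started, so a nested delimited group is captured once as a whole rather than duplicated token-by-token.

```rust
// Toy model (not rustc code) of the new `collect_tokens` strategy.
#[derive(Clone, Debug)]
enum Tree {
    Token(char),
    Delimited(Vec<Tree>),
}

// A stack of frames, like `TokenCursor`. `collecting_depth` is the stack
// depth at which collection started (the `Collecting::depth` field in the
// patch); only trees yielded at exactly that depth are recorded.
struct Cursor {
    stack: Vec<std::vec::IntoIter<Tree>>,
    collecting_depth: usize,
    buf: Vec<Tree>,
}

impl Cursor {
    fn next(&mut self) -> Option<char> {
        loop {
            let tree = match self.stack.last_mut()?.next() {
                Some(tree) => tree,
                None => {
                    // Frame exhausted: pop it, like leaving a delimited group.
                    self.stack.pop();
                    continue;
                }
            };
            // Record only at the starting depth, so a nested group is
            // captured once, as a whole `Delimited`, never twice.
            if self.stack.len() - 1 == self.collecting_depth {
                self.buf.push(tree.clone());
            }
            match tree {
                Tree::Token(c) => return Some(c),
                Tree::Delimited(inner) => {
                    // Descend into a new frame; its inner tokens sit at a
                    // deeper depth and are therefore not recorded again.
                    self.stack.push(inner.into_iter());
                }
            }
        }
    }
}

fn main() {
    // Models `(f).b`: one delimited group, then two plain tokens.
    let input = vec![
        Tree::Delimited(vec![Tree::Token('f')]),
        Tree::Token('.'),
        Tree::Token('b'),
    ];
    let mut cur = Cursor { stack: vec![input.into_iter()], collecting_depth: 0, buf: Vec::new() };
    while cur.next().is_some() {}
    // Prints the whole `(f)` group, then '.', then 'b'; 'f' is not duplicated.
    println!("{:?}", cur.buf);
}
```

In the real patch, `Collecting::depth` plays the role of `collecting_depth`, and the special case at the top of `collect_tokens` adjusts that depth when collection starts on an opening delimiter, since the cursor has already descended into the new frame by that point.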
diff --git a/src/librustc_parse/parser/mod.rs b/src/librustc_parse/parser/mod.rs
index bdb4d7c..b21524c 100644
--- a/src/librustc_parse/parser/mod.rs
+++ b/src/librustc_parse/parser/mod.rs
@@ -118,6 +118,8 @@
 struct TokenCursor {
     frame: TokenCursorFrame,
     stack: Vec<TokenCursorFrame>,
+    cur_token: Option<TreeAndJoint>,
+    collecting: Option<Collecting>,
 }
 
 #[derive(Clone)]
@@ -127,30 +129,24 @@
     open_delim: bool,
     tree_cursor: tokenstream::Cursor,
     close_delim: bool,
-    last_token: LastToken,
 }
 
-/// This is used in `TokenCursorFrame` above to track tokens that are consumed
-/// by the parser, and then that's transitively used to record the tokens that
-/// each parse AST item is created with.
-///
-/// Right now this has two states, either collecting tokens or not collecting
-/// tokens. If we're collecting tokens we just save everything off into a local
-/// `Vec`. This should eventually though likely save tokens from the original
-/// token stream and just use slicing of token streams to avoid creation of a
-/// whole new vector.
-///
-/// The second state is where we're passively not recording tokens, but the last
-/// token is still tracked for when we want to start recording tokens. This
-/// "last token" means that when we start recording tokens we'll want to ensure
-/// that this, the first token, is included in the output.
-///
-/// You can find some more example usage of this in the `collect_tokens` method
-/// on the parser.
-#[derive(Clone)]
-enum LastToken {
-    Collecting(Vec<TreeAndJoint>),
-    Was(Option<TreeAndJoint>),
+/// Used to track additional state needed by `collect_tokens`.
+#[derive(Clone, Debug)]
+struct Collecting {
+    /// Holds the tokens captured during the most
+    /// recent call to `collect_tokens`.
+    buf: Vec<TreeAndJoint>,
+    /// The depth of the `TokenCursor` stack at the time
+    /// collection was started. When we encounter a `TokenTree::Delimited`,
+    /// we want to record the `TokenTree::Delimited` itself,
+    /// but *not* any of the inner tokens while we are inside
+    /// the new frame (this would cause us to record duplicate tokens).
+    ///
+    /// This `depth` field tracks the stack depth at which we are recording tokens.
+    /// Only tokens encountered at this depth will be recorded. See
+    /// `TokenCursor::next` for more details.
+    depth: usize,
 }
 
 impl TokenCursorFrame {
@@ -161,7 +157,6 @@
             open_delim: delim == token::NoDelim,
             tree_cursor: tts.clone().into_trees(),
             close_delim: delim == token::NoDelim,
-            last_token: LastToken::Was(None),
         }
     }
 }
@@ -171,12 +166,12 @@
         loop {
             let tree = if !self.frame.open_delim {
                 self.frame.open_delim = true;
-                TokenTree::open_tt(self.frame.span, self.frame.delim)
-            } else if let Some(tree) = self.frame.tree_cursor.next() {
+                TokenTree::open_tt(self.frame.span, self.frame.delim).into()
+            } else if let Some(tree) = self.frame.tree_cursor.next_with_joint() {
                 tree
             } else if !self.frame.close_delim {
                 self.frame.close_delim = true;
-                TokenTree::close_tt(self.frame.span, self.frame.delim)
+                TokenTree::close_tt(self.frame.span, self.frame.delim).into()
             } else if let Some(frame) = self.stack.pop() {
                 self.frame = frame;
                 continue;
@@ -184,12 +179,25 @@
                 return Token::new(token::Eof, DUMMY_SP);
             };
 
-            match self.frame.last_token {
-                LastToken::Collecting(ref mut v) => v.push(tree.clone().into()),
-                LastToken::Was(ref mut t) => *t = Some(tree.clone().into()),
+            // Don't set an open delimiter as our current token - we want
+            // to leave it as the full `TokenTree::Delimited` from the previous
+            // iteration of this loop
+            if !matches!(tree.0, TokenTree::Token(Token { kind: TokenKind::OpenDelim(_), .. })) {
+                self.cur_token = Some(tree.clone());
             }
 
-            match tree {
+            if let Some(collecting) = &mut self.collecting {
+                if collecting.depth == self.stack.len() {
+                    debug!(
+                        "TokenCursor::next():  collected {:?} at depth {:?}",
+                        tree,
+                        self.stack.len()
+                    );
+                    collecting.buf.push(tree.clone().into())
+                }
+            }
+
+            match tree.0 {
                 TokenTree::Token(token) => return token,
                 TokenTree::Delimited(sp, delim, tts) => {
                     let frame = TokenCursorFrame::new(sp, delim, &tts);
@@ -350,6 +358,8 @@
             token_cursor: TokenCursor {
                 frame: TokenCursorFrame::new(DelimSpan::dummy(), token::NoDelim, &tokens),
                 stack: Vec::new(),
+                cur_token: None,
+                collecting: None,
             },
             desugar_doc_comments,
             unmatched_angle_bracket_count: 0,
@@ -1105,65 +1115,95 @@
         }
     }
 
+    /// Records all tokens consumed by the provided callback,
+    /// including the current token. These tokens are collected
+    /// into a `TokenStream`, and returned along with the result
+    /// of the callback.
+    ///
+    /// Note: If your callback consumes an opening delimiter
+    /// (including the case where you call `collect_tokens`
+    /// when the current token is an opening delimiter),
+    /// you must also consume the corresponding closing delimiter.
+    ///
+    /// That is, you can consume
+    /// `something ([{ }])` or `([{}])`, but not `([{}]`
+    ///
+    /// This restriction shouldn't be an issue in practice,
+    /// since this function is used to record the tokens for
+    /// a parsed AST item, which always has matching delimiters.
     fn collect_tokens<R>(
         &mut self,
         f: impl FnOnce(&mut Self) -> PResult<'a, R>,
     ) -> PResult<'a, (R, TokenStream)> {
         // Record all tokens we parse when parsing this item.
-        let mut tokens = Vec::new();
-        let prev_collecting = match self.token_cursor.frame.last_token {
-            LastToken::Collecting(ref mut list) => Some(mem::take(list)),
-            LastToken::Was(ref mut last) => {
-                tokens.extend(last.take());
-                None
-            }
-        };
-        self.token_cursor.frame.last_token = LastToken::Collecting(tokens);
-        let prev = self.token_cursor.stack.len();
+        let tokens: Vec<TreeAndJoint> = self.token_cursor.cur_token.clone().into_iter().collect();
+        debug!("collect_tokens: starting with {:?}", tokens);
+
+        // We need special handling for the case where `collect_tokens` is called
+        // on an opening delimiter (e.g. '('). At this point, we have already pushed
+        // a new frame - however, we want to record the original `TokenTree::Delimited`,
+        // for consistency with the case where we start recording one token earlier.
+        // See `TokenCursor::next` to see how `cur_token` is set up.
+        let prev_depth =
+            if matches!(self.token_cursor.cur_token, Some((TokenTree::Delimited(..), _))) {
+                if self.token_cursor.stack.is_empty() {
+                    // There is nothing below us in the stack that
+                    // the function could consume, so the only thing it can legally
+                    // capture is the entire contents of the current frame.
+                    return Ok((f(self)?, TokenStream::new(tokens)));
+                }
+                // We have already recorded the full `TokenTree::Delimited` when we created
+                // our `tokens` vector at the start of this function. We are now inside
+                // a new frame corresponding to the `TokenTree::Delimited` we already recorded.
+                // We don't want to record any of the tokens inside this frame, since they
+                // will be duplicates of the tokens nested inside the `TokenTree::Delimited`.
+                // Therefore, we set our recording depth to the *previous* frame. This allows
+                // us to record a sequence like `(foo).bar()`: the `(foo)` will be recorded
+                // as our initial `cur_token`, while the `.bar()` will be recorded after we
+                // pop the `(foo)` frame.
+                self.token_cursor.stack.len() - 1
+            } else {
+                self.token_cursor.stack.len()
+            };
+        let prev_collecting =
+            self.token_cursor.collecting.replace(Collecting { buf: tokens, depth: prev_depth });
+
         let ret = f(self);
-        let last_token = if self.token_cursor.stack.len() == prev {
-            &mut self.token_cursor.frame.last_token
-        } else if self.token_cursor.stack.get(prev).is_none() {
-            // This can happen due to a bad interaction of two unrelated recovery mechanisms with
-            // mismatched delimiters *and* recovery lookahead on the likely typo `pub ident(`
-            // (#62881).
-            return Ok((ret?, TokenStream::default()));
+
+        let mut collected_tokens = if let Some(collecting) = self.token_cursor.collecting.take() {
+            collecting.buf
         } else {
-            &mut self.token_cursor.stack[prev].last_token
+            let msg = format!("our vector went away?");
+            debug!("collect_tokens: {}", msg);
+            self.sess.span_diagnostic.delay_span_bug(self.token.span, &msg);
+            // This can happen due to a bad interaction of two unrelated recovery mechanisms
+            // with mismatched delimiters *and* recovery lookahead on the likely typo
+            // `pub ident(` (#62895, different but similar to the case above).
+            return Ok((ret?, TokenStream::default()));
         };
 
-        // Pull out the tokens that we've collected from the call to `f` above.
-        let mut collected_tokens = match *last_token {
-            LastToken::Collecting(ref mut v) => mem::take(v),
-            LastToken::Was(ref was) => {
-                let msg = format!("our vector went away? - found Was({:?})", was);
-                debug!("collect_tokens: {}", msg);
-                self.sess.span_diagnostic.delay_span_bug(self.token.span, &msg);
-                // This can happen due to a bad interaction of two unrelated recovery mechanisms
-                // with mismatched delimiters *and* recovery lookahead on the likely typo
-                // `pub ident(` (#62895, different but similar to the case above).
-                return Ok((ret?, TokenStream::default()));
-            }
-        };
+        debug!("collect_tokens: got raw tokens {:?}", collected_tokens);
 
         // If we're not at EOF our current token wasn't actually consumed by
         // `f`, but it'll still be in our list that we pulled out. In that case
         // put it back.
         let extra_token = if self.token != token::Eof { collected_tokens.pop() } else { None };
 
-        // If we were previously collecting tokens, then this was a recursive
-        // call. In that case we need to record all the tokens we collected in
-        // our parent list as well. To do that we push a clone of our stream
-        // onto the previous list.
-        match prev_collecting {
-            Some(mut list) => {
-                list.extend(collected_tokens.iter().cloned());
-                list.extend(extra_token);
-                *last_token = LastToken::Collecting(list);
+        if let Some(mut collecting) = prev_collecting {
+            // If we were previously collecting at the same depth,
+            // then the previous call to `collect_tokens` needs to see
+            // the tokens we just recorded.
+            //
+            // If we were previously recording at a lower `depth`,
+            // then the previous `collect_tokens` call already recorded
+            // this entire frame in the form of a `TokenTree::Delimited`,
+            // so there is nothing else for us to do.
+            if collecting.depth == prev_depth {
+                collecting.buf.extend(collected_tokens.iter().cloned());
+                collecting.buf.extend(extra_token);
+                debug!("collect_tokens: updating previous buf to {:?}", collecting);
             }
-            None => {
-                *last_token = LastToken::Was(extra_token);
-            }
+            self.token_cursor.collecting = Some(collecting)
         }
 
         Ok((ret?, TokenStream::new(collected_tokens)))
diff --git a/src/test/ui/ast-json/ast-json-noexpand-output.stdout b/src/test/ui/ast-json/ast-json-noexpand-output.stdout
index 1a07968..f60b6a0 100644
--- a/src/test/ui/ast-json/ast-json-noexpand-output.stdout
+++ b/src/test/ui/ast-json/ast-json-noexpand-output.stdout
@@ -1 +1 @@
-{"module":{"inner":{"lo":0,"hi":0},"items":[{"attrs":[],"id":0,"span":{"lo":0,"hi":0},"vis":{"node":"Inherited","span":{"lo":0,"hi":0}},"ident":{"name":"core","span":{"lo":0,"hi":0}},"kind":{"variant":"ExternCrate","fields":[null]},"tokens":{"_field0":[[{"variant":"Token","fields":[{"kind":{"variant":"Ident","fields":["extern",false]},"span":{"lo":0,"hi":0}}]},"NonJoint"],[{"variant":"Token","fields":[{"kind":{"variant":"Ident","fields":["crate",false]},"span":{"lo":0,"hi":0}}]},"NonJoint"],[{"variant":"Token","fields":[{"kind":{"variant":"Ident","fields":["core",false]},"span":{"lo":0,"hi":0}}]},"NonJoint"],[{"variant":"Token","fields":[{"kind":"Semi","span":{"lo":0,"hi":0}}]},"NonJoint"]]}}],"inline":true},"attrs":[{"kind":{"variant":"Normal","fields":[{"path":{"span":{"lo":0,"hi":0},"segments":[{"ident":{"name":"crate_type","span":{"lo":0,"hi":0}},"id":0,"args":null}]},"args":{"variant":"Eq","fields":[{"lo":0,"hi":0},{"_field0":[[{"variant":"Token","fields":[{"kind":{"variant":"Literal","fields":[{"kind":"Str","symbol":"lib","suffix":null}]},"span":{"lo":0,"hi":0}}]},"NonJoint"]]}]}}]},"id":null,"style":"Inner","span":{"lo":0,"hi":0}}],"span":{"lo":0,"hi":0},"proc_macros":[]}
+{"module":{"inner":{"lo":0,"hi":0},"items":[{"attrs":[],"id":0,"span":{"lo":0,"hi":0},"vis":{"node":"Inherited","span":{"lo":0,"hi":0}},"ident":{"name":"core","span":{"lo":0,"hi":0}},"kind":{"variant":"ExternCrate","fields":[null]},"tokens":{"_field0":[[{"variant":"Token","fields":[{"kind":{"variant":"Ident","fields":["extern",false]},"span":{"lo":0,"hi":0}}]},"NonJoint"],[{"variant":"Token","fields":[{"kind":{"variant":"Ident","fields":["crate",false]},"span":{"lo":0,"hi":0}}]},"NonJoint"],[{"variant":"Token","fields":[{"kind":{"variant":"Ident","fields":["core",false]},"span":{"lo":0,"hi":0}}]},"Joint"],[{"variant":"Token","fields":[{"kind":"Semi","span":{"lo":0,"hi":0}}]},"NonJoint"]]}}],"inline":true},"attrs":[{"kind":{"variant":"Normal","fields":[{"path":{"span":{"lo":0,"hi":0},"segments":[{"ident":{"name":"crate_type","span":{"lo":0,"hi":0}},"id":0,"args":null}]},"args":{"variant":"Eq","fields":[{"lo":0,"hi":0},{"_field0":[[{"variant":"Token","fields":[{"kind":{"variant":"Literal","fields":[{"kind":"Str","symbol":"lib","suffix":null}]},"span":{"lo":0,"hi":0}}]},"NonJoint"]]}]}}]},"id":null,"style":"Inner","span":{"lo":0,"hi":0}}],"span":{"lo":0,"hi":0},"proc_macros":[]}
diff --git a/src/test/ui/ast-json/ast-json-output.stdout b/src/test/ui/ast-json/ast-json-output.stdout
index 0b3704e..42e7e78 100644
--- a/src/test/ui/ast-json/ast-json-output.stdout
+++ b/src/test/ui/ast-json/ast-json-output.stdout
@@ -1 +1 @@
-{"module":{"inner":{"lo":0,"hi":0},"items":[{"attrs":[{"kind":{"variant":"Normal","fields":[{"path":{"span":{"lo":0,"hi":0},"segments":[{"ident":{"name":"prelude_import","span":{"lo":0,"hi":0}},"id":0,"args":null}]},"args":"Empty"}]},"id":null,"style":"Outer","span":{"lo":0,"hi":0}}],"id":0,"span":{"lo":0,"hi":0},"vis":{"node":"Inherited","span":{"lo":0,"hi":0}},"ident":{"name":"","span":{"lo":0,"hi":0}},"kind":{"variant":"Use","fields":[{"prefix":{"span":{"lo":0,"hi":0},"segments":[{"ident":{"name":"{{root}}","span":{"lo":0,"hi":0}},"id":0,"args":null},{"ident":{"name":"std","span":{"lo":0,"hi":0}},"id":0,"args":null},{"ident":{"name":"prelude","span":{"lo":0,"hi":0}},"id":0,"args":null},{"ident":{"name":"v1","span":{"lo":0,"hi":0}},"id":0,"args":null}]},"kind":"Glob","span":{"lo":0,"hi":0}}]},"tokens":null},{"attrs":[{"kind":{"variant":"Normal","fields":[{"path":{"span":{"lo":0,"hi":0},"segments":[{"ident":{"name":"macro_use","span":{"lo":0,"hi":0}},"id":0,"args":null}]},"args":"Empty"}]},"id":null,"style":"Outer","span":{"lo":0,"hi":0}}],"id":0,"span":{"lo":0,"hi":0},"vis":{"node":"Inherited","span":{"lo":0,"hi":0}},"ident":{"name":"std","span":{"lo":0,"hi":0}},"kind":{"variant":"ExternCrate","fields":[null]},"tokens":null},{"attrs":[],"id":0,"span":{"lo":0,"hi":0},"vis":{"node":"Inherited","span":{"lo":0,"hi":0}},"ident":{"name":"core","span":{"lo":0,"hi":0}},"kind":{"variant":"ExternCrate","fields":[null]},"tokens":{"_field0":[[{"variant":"Token","fields":[{"kind":{"variant":"Ident","fields":["extern",false]},"span":{"lo":0,"hi":0}}]},"NonJoint"],[{"variant":"Token","fields":[{"kind":{"variant":"Ident","fields":["crate",false]},"span":{"lo":0,"hi":0}}]},"NonJoint"],[{"variant":"Token","fields":[{"kind":{"variant":"Ident","fields":["core",false]},"span":{"lo":0,"hi":0}}]},"NonJoint"],[{"variant":"Token","fields":[{"kind":"Semi","span":{"lo":0,"hi":0}}]},"NonJoint"]]}}],"inline":true},"attrs":[{"kind":{"variant":"Normal","fields":[{"path":{"span":{"lo":0,"hi":0},"segments":[{"ident":{"name":"crate_type","span":{"lo":0,"hi":0}},"id":0,"args":null}]},"args":{"variant":"Eq","fields":[{"lo":0,"hi":0},{"_field0":[[{"variant":"Token","fields":[{"kind":{"variant":"Literal","fields":[{"kind":"Str","symbol":"lib","suffix":null}]},"span":{"lo":0,"hi":0}}]},"NonJoint"]]}]}}]},"id":null,"style":"Inner","span":{"lo":0,"hi":0}}],"span":{"lo":0,"hi":0},"proc_macros":[]}
+{"module":{"inner":{"lo":0,"hi":0},"items":[{"attrs":[{"kind":{"variant":"Normal","fields":[{"path":{"span":{"lo":0,"hi":0},"segments":[{"ident":{"name":"prelude_import","span":{"lo":0,"hi":0}},"id":0,"args":null}]},"args":"Empty"}]},"id":null,"style":"Outer","span":{"lo":0,"hi":0}}],"id":0,"span":{"lo":0,"hi":0},"vis":{"node":"Inherited","span":{"lo":0,"hi":0}},"ident":{"name":"","span":{"lo":0,"hi":0}},"kind":{"variant":"Use","fields":[{"prefix":{"span":{"lo":0,"hi":0},"segments":[{"ident":{"name":"{{root}}","span":{"lo":0,"hi":0}},"id":0,"args":null},{"ident":{"name":"std","span":{"lo":0,"hi":0}},"id":0,"args":null},{"ident":{"name":"prelude","span":{"lo":0,"hi":0}},"id":0,"args":null},{"ident":{"name":"v1","span":{"lo":0,"hi":0}},"id":0,"args":null}]},"kind":"Glob","span":{"lo":0,"hi":0}}]},"tokens":null},{"attrs":[{"kind":{"variant":"Normal","fields":[{"path":{"span":{"lo":0,"hi":0},"segments":[{"ident":{"name":"macro_use","span":{"lo":0,"hi":0}},"id":0,"args":null}]},"args":"Empty"}]},"id":null,"style":"Outer","span":{"lo":0,"hi":0}}],"id":0,"span":{"lo":0,"hi":0},"vis":{"node":"Inherited","span":{"lo":0,"hi":0}},"ident":{"name":"std","span":{"lo":0,"hi":0}},"kind":{"variant":"ExternCrate","fields":[null]},"tokens":null},{"attrs":[],"id":0,"span":{"lo":0,"hi":0},"vis":{"node":"Inherited","span":{"lo":0,"hi":0}},"ident":{"name":"core","span":{"lo":0,"hi":0}},"kind":{"variant":"ExternCrate","fields":[null]},"tokens":{"_field0":[[{"variant":"Token","fields":[{"kind":{"variant":"Ident","fields":["extern",false]},"span":{"lo":0,"hi":0}}]},"NonJoint"],[{"variant":"Token","fields":[{"kind":{"variant":"Ident","fields":["crate",false]},"span":{"lo":0,"hi":0}}]},"NonJoint"],[{"variant":"Token","fields":[{"kind":{"variant":"Ident","fields":["core",false]},"span":{"lo":0,"hi":0}}]},"Joint"],[{"variant":"Token","fields":[{"kind":"Semi","span":{"lo":0,"hi":0}}]},"NonJoint"]]}}],"inline":true},"attrs":[{"kind":{"variant":"Normal","fields":[{"path":{"span":{"lo":0,"hi":0},"segments":[{"ident":{"name":"crate_type","span":{"lo":0,"hi":0}},"id":0,"args":null}]},"args":{"variant":"Eq","fields":[{"lo":0,"hi":0},{"_field0":[[{"variant":"Token","fields":[{"kind":{"variant":"Literal","fields":[{"kind":"Str","symbol":"lib","suffix":null}]},"span":{"lo":0,"hi":0}}]},"NonJoint"]]}]}}]},"id":null,"style":"Inner","span":{"lo":0,"hi":0}}],"span":{"lo":0,"hi":0},"proc_macros":[]}