Add ExpressionContext for expression parsing (#11055)

## Summary

This PR adds a new `ExpressionContext` struct which is used in expression parsing.

This solves the following problems:
1. Allowing starred expressions with different precedence
2. Allowing yield expressions in certain contexts
3. Removing the ambiguity with the `in` keyword when parsing a `for ... in` statement

For context, (1) was solved by adding `parse_star_expression_list` and `parse_star_expression_or_higher` in #10623, (2) was solved by adding `parse_yield_expression_or_else` in #10809, and (3) was fixed in #11009. All of the mentioned functions have been removed in favor of the context flags.

As mentioned in #11009, an ideal solution would be to implement an expression context, which is what this PR implements. This is passed around as a function parameter and the call stack is used to automatically reset the context.

### Recovery

How should the parser recover if the target expression is invalid when an expression can consume the `in` keyword?

1. Should the `in` keyword be part of the target expression?
2. Or, should the expression parsing stop as soon as the `in` keyword is encountered, no matter the expression?

For example:

```python
for yield x in y: ...

# Here, should this be parsed as
for (yield x) in (y): ...
# Or
for (yield x in y): ...
# where the `in iter` part is missing
```

Or, for binary expression parsing:

```python
for x or y in z: ...

# Should this be parsed as
for (x or y) in z: ...
# Or
for (x or y in z): ...
# where the `in iter` part is missing
```

This need not be solved now, but is very easy to change. For context, this PR does the following:
* For binary, comparison, and unary expressions, stop at `in`
* For lambda and yield expressions, consume the `in`

## Test Plan

1. Add test cases for the `for ... in` statement and verify the snapshots
2. Make sure the existing test suite passes
3. Run the fuzzer on around 3000 generated source code samples
4. Run the updated logic on a dozen or so open source repositories (codename "parser-checkouts")
astral-sh · Apr 23, 2024 · c30735d · c30735d
1 parent 62478c3
commit c30735d
Show file tree

Hide file tree

Showing 22 changed files with 1,093 additions and 811 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml
@@ -18,7 +18,6 @@ ruff_text_size = { path = "../ruff_text_size" }
 
 anyhow = { workspace = true }
 bitflags = { workspace = true }
-drop_bomb = { workspace = true }
 bstr = { workspace = true }
 is-macro = { workspace = true }
 itertools = { workspace = true }

diff --git a/crates/ruff_python_parser/resources/inline/err/for_in_target_postfix_expr.py b/crates/ruff_python_parser/resources/inline/err/for_in_target_postfix_expr.py
diff --git a/crates/ruff_python_parser/resources/inline/err/for_stmt_invalid_target.py b/crates/ruff_python_parser/resources/inline/err/for_stmt_invalid_target.py
@@ -3,4 +3,5 @@
 for *x and y in z: ...
 for *x | y in z: ...
 for await x in z: ...
+for yield x in y: ...
 for [x, 1, y, *["a"]] in z: ...
diff --git a/crates/ruff_python_parser/resources/inline/err/for_stmt_invalid_target_binary_expr.py b/crates/ruff_python_parser/resources/inline/err/for_stmt_invalid_target_binary_expr.py
@@ -0,0 +1,6 @@
+for x not in y in z: ...
+for x == y in z: ...
+for x or y in z: ...
+for -x in y: ...
+for not x in y: ...
+for x | y in z: ...
diff --git a/.../err/parenthesized_compare_expr_in_for.py → ...err/for_stmt_invalid_target_in_keyword.py b/.../err/parenthesized_compare_expr_in_for.py → ...err/for_stmt_invalid_target_in_keyword.py
@@ -1,3 +1,4 @@
+for d(x in y) in target: ...
 for (x in y)() in iter: ...
 for (x in y) in iter: ...
 for (x in y, z) in iter: ...

diff --git a/crates/ruff_python_parser/resources/inline/ok/for_in_target_postfix_expr.py b/crates/ruff_python_parser/resources/inline/ok/for_in_target_postfix_expr.py
diff --git a/...e/ok/parenthesized_compare_expr_in_for.py → ...ces/inline/ok/for_in_target_valid_expr.py b/...e/ok/parenthesized_compare_expr_in_for.py → ...ces/inline/ok/for_in_target_valid_expr.py
@@ -1,2 +1,3 @@
+for d[x in y] in target: ...
 for (x in y)[0] in iter: ...
 for (x in y).attr in iter: ...
diff --git a/crates/ruff_python_parser/src/parser/expression.rs b/crates/ruff_python_parser/src/parser/expression.rs
diff --git a/crates/ruff_python_parser/src/parser/mod.rs b/crates/ruff_python_parser/src/parser/mod.rs
@@ -1,7 +1,6 @@
 use std::cmp::Ordering;
 
 use bitflags::bitflags;
-use drop_bomb::DebugDropBomb;
 
 use ast::Mod;
 use ruff_python_ast as ast;
@@ -16,7 +15,7 @@ use crate::{
     Mode, ParseError, ParseErrorType, Tok, TokenKind,
 };
 
-use self::expression::AllowStarredExpression;
+use self::expression::ExpressionContext;
 
 mod expression;
 mod helpers;
@@ -77,13 +76,6 @@ pub(crate) struct Parser<'src> {
     /// Stores all the syntax errors found during the parsing.
     errors: Vec<ParseError>,
 
-    /// This tracks the current expression or statement being parsed.
-    ///
-    /// The `ctx` is also used to create custom error messages and forbid certain
-    /// expressions or statements of being parsed. The `ctx` should be empty after
-    /// an expression or statement is done parsing.
-    ctx: ParserCtxFlags,
-
     /// Specify the mode in which the code will be parsed.
     mode: Mode,
 
@@ -123,7 +115,6 @@ impl<'src> Parser<'src> {
             mode,
             source,
             errors: Vec::new(),
-            ctx: ParserCtxFlags::empty(),
             tokens,
             recovery_context: RecoveryContext::empty(),
             last_token_end: tokens_range.start(),
@@ -136,7 +127,7 @@ impl<'src> Parser<'src> {
     pub(crate) fn parse_program(mut self) -> Program {
         let ast = if self.mode == Mode::Expression {
             let start = self.node_start();
-            let parsed_expr = self.parse_expression_list(AllowStarredExpression::No);
+            let parsed_expr = self.parse_expression_list(ExpressionContext::default());
 
             // All of the remaining newlines are actually going to be non-logical newlines.
             self.eat(TokenKind::Newline);
@@ -185,9 +176,6 @@ impl<'src> Parser<'src> {
     }
 
     fn finish(self) -> Vec<ParseError> {
-        // After parsing, the `ctx` and `ctx_stack` should be empty.
-        // If it's not, you probably forgot to call `clear_ctx` somewhere.
-        assert_eq!(self.ctx, ParserCtxFlags::empty());
         assert_eq!(
             self.current_token_kind(),
             TokenKind::EndOfFile,
@@ -232,29 +220,6 @@ impl<'src> Parser<'src> {
         merged
     }
 
-    #[inline]
-    #[must_use]
-    fn set_ctx(&mut self, ctx: ParserCtxFlags) -> SavedParserContext {
-        SavedParserContext {
-            flags: std::mem::replace(&mut self.ctx, ctx),
-            bomb: DebugDropBomb::new(
-                "You must restore the old parser context explicit by calling `restore_ctx`",
-            ),
-        }
-    }
-
-    #[inline]
-    fn restore_ctx(&mut self, current: ParserCtxFlags, mut saved_context: SavedParserContext) {
-        assert_eq!(self.ctx, current);
-        saved_context.bomb.defuse();
-        self.ctx = saved_context.flags;
-    }
-
-    #[inline]
-    fn has_ctx(&self, ctx: ParserCtxFlags) -> bool {
-        self.ctx.intersects(ctx)
-    }
-
     /// Returns the start position for a node that starts at the current token.
     fn node_start(&self) -> TextSize {
         self.current_token_range().start()
@@ -675,13 +640,6 @@ impl SequenceMatchPatternParentheses {
     }
 }
 
-bitflags! {
-    #[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
-    struct ParserCtxFlags: u8 {
-        const FOR_TARGET = 1 << 2;
-    }
-}
-
 #[derive(Debug, PartialEq, Copy, Clone)]
 enum FunctionKind {
     /// A lambda expression, e.g., `lambda x: x`
@@ -1327,9 +1285,3 @@ impl RecoveryContext {
         })
     }
 }
-
-#[derive(Debug)]
-struct SavedParserContext {
-    flags: ParserCtxFlags,
-    bomb: DebugDropBomb,
-}
diff --git a/crates/ruff_python_parser/src/parser/pattern.rs b/crates/ruff_python_parser/src/parser/pattern.rs
@@ -6,6 +6,8 @@ use crate::parser::{recovery, Parser, RecoveryContextKind, SequenceMatchPatternP
 use crate::token_set::TokenSet;
 use crate::{ParseErrorType, Tok, TokenKind};
 
+use super::expression::ExpressionContext;
+
 /// The set of tokens that can start a literal pattern.
 const LITERAL_PATTERN_START_SET: TokenSet = TokenSet::new([
     TokenKind::None,
@@ -483,7 +485,7 @@ impl<'src> Parser<'src> {
                     TokenKind::Int | TokenKind::Float | TokenKind::Complex
                 ) =>
             {
-                let unary_expr = self.parse_unary_expression();
+                let unary_expr = self.parse_unary_expression(ExpressionContext::default());
 
                 if unary_expr.op.is_u_add() {
                     self.add_error(