Skip to content

Commit

Permalink
Merge string parsing to single entrypoint (#10339)
Browse files Browse the repository at this point in the history
This PR merges the different string parsing functions into a single
entry point function.

Previously there were two entry points, one for string or byte literal
and the other for f-strings. The reason for this separation is that our
old parser raised a hard syntax error if an f-string was used as a
pattern literal. But, it's actually a soft syntax error as evident
through the CPython parser which raises it during the runtime.

This function basically implements the following grammar:
```
strings: (string|fstring)+
```

And it delegates it to the list parsing for better error recovery.

- [x] All tests pass
- [x] No performance regression
  • Loading branch information
dhruvmanila committed Mar 12, 2024
1 parent fc8d542 commit 70731cb
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 117 deletions.
186 changes: 71 additions & 115 deletions crates/ruff_python_parser/src/parser/expression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -385,8 +385,7 @@ impl<'src> Parser<'src> {

Expr::IpyEscapeCommand(command)
}
TokenKind::String => self.parse_string_expression(),
TokenKind::FStringStart => self.parse_fstring_expression(),
TokenKind::String | TokenKind::FStringStart => self.parse_strings(),
TokenKind::Lpar => {
return self.parse_parenthesized_expression();
}
Expand Down Expand Up @@ -778,145 +777,98 @@ impl<'src> Parser<'src> {
}
}

pub(super) fn parse_string_expression(&mut self) -> Expr {
/// Parses all kinds of strings and implicitly concatenated strings.
///
/// # Panics
///
/// If the parser isn't positioned at a `String` or `FStringStart` token.
///
/// See: <https://docs.python.org/3/reference/grammar.html> (Search "strings:")
pub(super) fn parse_strings(&mut self) -> Expr {
let start = self.node_start();
let mut strings = vec![];
let mut progress = ParserProgress::default();

while self.at(TokenKind::String) {
progress.assert_progressing(self);
let (Tok::String { value, kind }, tok_range) = self.bump(TokenKind::String) else {
unreachable!()
self.parse_list(RecoveryContextKind::Strings, |parser| {
let string = match parser.current_token_kind() {
TokenKind::String => parser.parse_string(),
TokenKind::FStringStart => StringType::FString(parser.parse_fstring()),
_ => unreachable!(),
};

match parse_string_literal(value, kind, tok_range) {
Ok(string) => {
strings.push(string);
}
Err(error) => {
strings.push(StringType::Invalid(ast::StringLiteral {
value: self.src_text(tok_range).to_string().into_boxed_str(),
range: tok_range,
flags: kind.into(),
}));
let location = error.location();
self.add_error(ParseErrorType::Lexical(error.into_error()), location);
}
}
}

// This handles the case where the string is implicit concatenated with
// a fstring, e.g., `"hello " f"{x}"`.
if self.at(TokenKind::FStringStart) {
self.handle_implicit_concatenated_strings(&mut strings);
}
strings.push(string);
});

let range = self.node_range(start);

if strings.len() == 1 {
return match strings.pop().unwrap() {
match strings.len() {
// This is not possible as the function was called by matching against a
// `String` or `FStringStart` token.
0 => unreachable!("Expected to parse at least one string"),
// We need a owned value, hence the `pop` here.
1 => match strings.pop().unwrap() {
StringType::Str(string) => Expr::StringLiteral(ast::ExprStringLiteral {
value: ast::StringLiteralValue::single(string),
range,
}),
StringType::Bytes(bytes) => {
// TODO(micha): Is this valid? I thought string and byte literals can't be concatenated? Maybe not a syntax error?
Expr::BytesLiteral(ast::ExprBytesLiteral {
value: ast::BytesLiteralValue::single(bytes),
range,
})
}
StringType::Bytes(bytes) => Expr::BytesLiteral(ast::ExprBytesLiteral {
value: ast::BytesLiteralValue::single(bytes),
range,
}),
StringType::FString(fstring) => Expr::FString(ast::ExprFString {
value: ast::FStringValue::single(fstring),
range,
}),
#[allow(deprecated)]
StringType::Invalid(invalid) => Expr::Invalid(ast::ExprInvalid {
value: invalid.value.to_string(),
range,
}),
StringType::FString(_) => unreachable!(),
};
}

concatenated_strings(strings, range).unwrap_or_else(|error| {
let location = error.location();
self.add_error(ParseErrorType::Lexical(error.into_error()), location);
#[allow(deprecated)]
Expr::Invalid(ast::ExprInvalid {
value: self.src_text(location).into(),
range: location,
})
})
}

/// Handles implicit concatenated f-strings, e.g. `f"{x}" f"hello"`, and
/// implicit concatenated f-strings with strings, e.g. `f"{x}" "xyz" f"{x}"`.
fn handle_implicit_concatenated_strings(&mut self, strings: &mut Vec<StringType>) {
let mut progress = ParserProgress::default();

loop {
progress.assert_progressing(self);
let start = self.node_start();

if self.at(TokenKind::FStringStart) {
strings.push(StringType::FString(self.parse_fstring()));
} else if self.at(TokenKind::String) {
let (Tok::String { value, kind }, _) = self.next_token() else {
unreachable!()
};

let range = self.node_range(start);
},
_ => concatenated_strings(strings, range).unwrap_or_else(|error| {
let location = error.location();
self.add_error(ParseErrorType::Lexical(error.into_error()), location);

match parse_string_literal(value, kind, range) {
Ok(string) => {
strings.push(string);
}
Err(error) => {
strings.push(StringType::Invalid(ast::StringLiteral {
value: self.src_text(error.location()).to_string().into_boxed_str(),
range,
flags: kind.into(),
}));
let location = error.location();
self.add_error(ParseErrorType::Lexical(error.into_error()), location);
}
}
} else {
break;
}
#[allow(deprecated)]
Expr::Invalid(ast::ExprInvalid {
value: self.src_text(location).into(),
range: location,
})
}),
}
}

/// Parses a f-string expression.
fn parse_fstring_expression(&mut self) -> Expr {
const FSTRING_SET: TokenSet = TokenSet::new([TokenKind::FStringStart, TokenKind::String]);

let start = self.node_start();
let fstring = self.parse_fstring();
/// Parses a single string literal.
///
/// This does not handle implicitly concatenated strings.
///
/// # Panics
///
/// If the parser isn't positioned at a `String` token.
fn parse_string(&mut self) -> StringType {
let (Tok::String { value, kind }, tok_range) = self.bump(TokenKind::String) else {
unreachable!()
};

if !self.at_ts(FSTRING_SET) {
return Expr::FString(ast::ExprFString {
value: ast::FStringValue::single(fstring),
range: self.node_range(start),
});
match parse_string_literal(value, kind, tok_range) {
Ok(string) => string,
Err(error) => {
let string = StringType::Invalid(ast::StringLiteral {
// TODO(dhruvmanila): Do we need to get the source text? In the future, we
// can just set a flag and the linter can query the source code if needed.
value: self.src_text(tok_range).to_string().into_boxed_str(),
range: tok_range,
flags: kind.into(),
});
let location = error.location();
self.add_error(ParseErrorType::Lexical(error.into_error()), location);
string
}
}

let mut strings = vec![StringType::FString(fstring)];
self.handle_implicit_concatenated_strings(&mut strings);

let range = self.node_range(start);

concatenated_strings(strings, range).unwrap_or_else(|error| {
let location = error.location();
self.add_error(ParseErrorType::Lexical(error.into_error()), location);

#[allow(deprecated)]
Expr::Invalid(ast::ExprInvalid {
value: self.src_text(location).into(),
range: location,
})
})
}

/// Parses a f-string.
///
/// This does not handle implicitly concatenated strings.
///
/// # Panics
///
/// If the parser isn't positioned at a `FStringStart` token.
Expand All @@ -940,6 +892,10 @@ impl<'src> Parser<'src> {
}

/// Parses a list of f-string elements.
///
/// # Panics
///
/// If the parser isn't positioned at a `{` or `FStringMiddle` token.
fn parse_fstring_elements(&mut self) -> Vec<FStringElement> {
let mut elements = vec![];

Expand Down
24 changes: 24 additions & 0 deletions crates/ruff_python_parser/src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -790,6 +790,10 @@ enum RecoveryContextKind {
/// When parsing a list of f-string elements which are either literal elements
/// or expressions.
FStringElements,

/// When parsing a single string or a list of strings that are implicitly
/// concatenated e.g., `"foo"`, `b"foo"`, `f"foo"`, `"foo" "bar"`.
Strings,
}

impl RecoveryContextKind {
Expand Down Expand Up @@ -890,6 +894,14 @@ impl RecoveryContextKind {
TokenKind::Rbrace,
]))
}
RecoveryContextKind::Strings => {
// This is basically a negation of the FIRST set as there can be
// any token after a string depending on the context.
!matches!(
p.current_token_kind(),
TokenKind::String | TokenKind::FStringStart
)
}
}
}

Expand Down Expand Up @@ -928,6 +940,11 @@ impl RecoveryContextKind {
// Expression element
| TokenKind::Lbrace
),
RecoveryContextKind::Strings => matches!(
p.current_token_kind(),
// "foo" f"bar"
TokenKind::String | TokenKind::FStringStart
),
}
}

Expand Down Expand Up @@ -1014,6 +1031,10 @@ impl RecoveryContextKind {
RecoveryContextKind::FStringElements => ParseErrorType::OtherError(
"Expected an f-string element or the end of the f-string".to_string(),
),
RecoveryContextKind::Strings => ParseErrorType::OtherError(
"Expected a string or byte literal, f-string or the end of the string list"
.to_string(),
),
}
}
}
Expand Down Expand Up @@ -1051,6 +1072,7 @@ bitflags! {
const WITH_ITEMS_PARENTHESIZED_EXPRESSION = 1 << 25;
const WITH_ITEMS_UNPARENTHESIZED = 1 << 26;
const F_STRING_ELEMENTS = 1 << 27;
const STRINGS = 1 << 28;
}
}

Expand Down Expand Up @@ -1101,6 +1123,7 @@ impl RecoveryContext {
WithItemKind::Unparenthesized => RecoveryContext::WITH_ITEMS_UNPARENTHESIZED,
},
RecoveryContextKind::FStringElements => RecoveryContext::F_STRING_ELEMENTS,
RecoveryContextKind::Strings => RecoveryContext::STRINGS,
}
}

Expand Down Expand Up @@ -1163,6 +1186,7 @@ impl RecoveryContext {
RecoveryContextKind::WithItems(WithItemKind::Unparenthesized)
}
RecoveryContext::F_STRING_ELEMENTS => RecoveryContextKind::FStringElements,
RecoveryContext::STRINGS => RecoveryContextKind::Strings,
_ => return None,
})
}
Expand Down
4 changes: 2 additions & 2 deletions crates/ruff_python_parser/src/parser/pattern.rs
Original file line number Diff line number Diff line change
Expand Up @@ -407,8 +407,8 @@ impl<'src> Parser<'src> {
range: self.node_range(start),
})
}
TokenKind::String => {
let str = self.parse_string_expression();
TokenKind::String | TokenKind::FStringStart => {
let str = self.parse_strings();

Pattern::MatchValue(ast::PatternMatchValue {
value: Box::new(str),
Expand Down

0 comments on commit 70731cb

Please sign in to comment.