From 32f1f48b9e4b9a00463130c5ae58d10802bee1e6 Mon Sep 17 00:00:00 2001 From: Dhruv Manilawala Date: Fri, 22 Sep 2023 14:54:58 +0530 Subject: [PATCH] Use the new f-string tokens in string formatting (#7586) ## Summary This PR updates the string formatter to account for the new f-string tokens. The formatter uses the full lexer to handle comments around implicitly concatenated strings. The reason it uses the lexer is that the AST merges them into a single node so the boundaries aren't preserved. For f-strings, it creates some complexity now that it isn't represented as a `String` token. A single f-string will emit at least 3 tokens (`FStringStart`, `FStringMiddle`, `FStringEnd`) and if it contains expressions, then it'll emit the respective tokens for them. In our case, we're currently only interested in the outermost f-string range for which I've introduced a new `FStringRangeBuilder` which builds the outermost f-string range by considering the start and end tokens and the nesting level. Note that this doesn't support nested f-strings in any way, which is out of scope for this PR. This means that if there are nested f-strings, especially the ones using the same quote, the formatter will escape the inner quotes: ```python f"hello world { x + f\"nested {y}\" }" ``` ## Test plan ``` cargo test --package ruff_python_formatter ``` --- .../src/expression/string.rs | 91 ++++++++++++++++++- 1 file changed, 90 insertions(+), 1 deletion(-) diff --git a/crates/ruff_python_formatter/src/expression/string.rs b/crates/ruff_python_formatter/src/expression/string.rs index 3bd7bdb12982b..a76230fa2dede 100644 --- a/crates/ruff_python_formatter/src/expression/string.rs +++ b/crates/ruff_python_formatter/src/expression/string.rs @@ -138,6 +138,14 @@ impl<'a> FormatString<'a> { impl<'a> Format> for FormatString<'a> { fn fmt(&self, f: &mut PyFormatter) -> FormatResult<()> { + // TODO(dhruvmanila): With PEP 701, comments can be inside f-strings. 
+ // This is to mark all of those comments as formatted but we need to + // figure out how to handle them. + if matches!(self.string, AnyString::FString(_)) { + f.context() + .comments() + .mark_verbatim_node_comments_formatted(self.string.into()); + } let locator = f.context().locator(); match self.layout { StringLayout::Default => { @@ -166,6 +174,60 @@ impl<'a> Format> for FormatString<'a> { } } +/// A builder for the f-string range. +/// +/// For now, this is limited to the outermost f-string and doesn't support +/// nested f-strings. +#[derive(Debug, Default)] +struct FStringRangeBuilder { + start_location: TextSize, + end_location: TextSize, + nesting: u32, +} + +impl FStringRangeBuilder { + fn visit_token(&mut self, token: &Tok, range: TextRange) { + match token { + Tok::FStringStart => { + if self.nesting == 0 { + self.start_location = range.start(); + } + self.nesting += 1; + } + Tok::FStringEnd => { + // We can assume that this will never overflow because we know + // that the program once parsed to a valid AST which means that + // the start and end tokens for f-strings are balanced. + self.nesting -= 1; + if self.nesting == 0 { + self.end_location = range.end(); + } + } + _ => {} + } + } + + /// Returns `true` if the lexer is currently inside an f-string. + /// + /// It'll return `false` once the `FStringEnd` token for the outermost + /// f-string is visited. + const fn in_fstring(&self) -> bool { + self.nesting > 0 + } + + /// Returns the complete range of the previously visited f-string. + /// + /// This method should only be called once the lexer is outside of any + /// f-string, otherwise it might return an invalid range. + /// + /// It doesn't consume the builder because there can be multiple f-strings + /// throughout the source code. 
+ fn finish(&self) -> TextRange { + debug_assert!(!self.in_fstring()); + TextRange::new(self.start_location, self.end_location) + } +} + struct FormatStringContinuation<'a> { string: &'a AnyString<'a>, } @@ -195,6 +257,10 @@ impl Format> for FormatStringContinuation<'_> { // because this is a black preview style. let lexer = lex_starts_at(string_content, Mode::Expression, string_range.start()); + // The lexer emits multiple tokens for a single f-string literal. Each token + // will have its own range but we require the complete range of the f-string. + let mut fstring_range_builder = FStringRangeBuilder::default(); + let mut joiner = f.join_with(in_parentheses_only_soft_line_break_or_space()); for token in lexer { @@ -226,8 +292,31 @@ impl Format> for FormatStringContinuation<'_> { } }; + fstring_range_builder.visit_token(&token, token_range); + + // We need to ignore all the tokens within the f-string as there can + // be `String` tokens inside it as well. For example, + // + // ```python + // f"foo {'bar'} foo" + // # ^^^^^ + // # Ignore any logic for this `String` token + // ``` + // + // Here, we're interested in the complete f-string, not the individual + // tokens inside it. + if fstring_range_builder.in_fstring() { + continue; + } + match token { - Tok::String { .. } => { + Tok::String { .. } | Tok::FStringEnd => { + let token_range = if token.is_f_string_end() { + fstring_range_builder.finish() + } else { + token_range + }; + // ```python // ( // "a"