From 32f1f48b9e4b9a00463130c5ae58d10802bee1e6 Mon Sep 17 00:00:00 2001 From: Dhruv Manilawala Date: Fri, 22 Sep 2023 14:54:58 +0530 Subject: [PATCH] Use the new f-string tokens in string formatting (#7586) ## Summary This PR updates the string formatter to account for the new f-string tokens. The formatter uses the full lexer to handle comments around implicitly concatenated strings. The reason it uses the lexer is that the AST merges them into a single node so the boundaries aren't preserved. For f-strings, it creates some complexity now that it isn't represented as a `String` token. A single f-string will emit at least 3 tokens (`FStringStart`, `FStringMiddle`, `FStringEnd`) and if it contains expressions, then it'll emit the respective tokens for them. In our case, we're currently only interested in the outermost f-string range for which I've introduced a new `FStringRangeBuilder` which builds the outermost f-string range by considering the start and end tokens and the nesting level. Note that this doesn't support nested f-strings in any way, which is out of scope for this PR. This means that if there are nested f-strings, especially the ones using the same quote, the formatter will escape the inner quotes: ```python f"hello world { x + f\"nested {y}\" }" ``` ## Test plan ``` cargo test --package ruff_python_formatter ``` --- .../src/expression/string.rs | 91 ++++++++++++++++++- 1 file changed, 90 insertions(+), 1 deletion(-) diff --git a/crates/ruff_python_formatter/src/expression/string.rs b/crates/ruff_python_formatter/src/expression/string.rs index 3bd7bdb12982b..a76230fa2dede 100644 --- a/crates/ruff_python_formatter/src/expression/string.rs +++ b/crates/ruff_python_formatter/src/expression/string.rs @@ -138,6 +138,14 @@ impl<'a> FormatString<'a> { impl<'a> Format> for FormatString<'a> { fn fmt(&self, f: &mut PyFormatter) -> FormatResult<()> { + // TODO(dhruvmanila): With PEP 701, comments can be inside f-strings. 
+ // This is to mark all of those comments as formatted but we need to + // figure out how to handle them. + if matches!(self.string, AnyString::FString(_)) { + f.context() + .comments() + .mark_verbatim_node_comments_formatted(self.string.into()); + } let locator = f.context().locator(); match self.layout { StringLayout::Default => { @@ -166,6 +174,60 @@ impl<'a> Format> for FormatString<'a> { } } +/// A builder for the f-string range. +/// +/// For now, this is limited to the outermost f-string and doesn't support +/// nested f-strings. +#[derive(Debug, Default)] +struct FStringRangeBuilder { + start_location: TextSize, + end_location: TextSize, + nesting: u32, +} + +impl FStringRangeBuilder { + fn visit_token(&mut self, token: &Tok, range: TextRange) { + match token { + Tok::FStringStart => { + if self.nesting == 0 { + self.start_location = range.start(); + } + self.nesting += 1; + } + Tok::FStringEnd => { + // We can assume that this will never overflow because we know + // that the program once parsed to a valid AST which means that + // the start and end tokens for f-strings are balanced. + self.nesting -= 1; + if self.nesting == 0 { + self.end_location = range.end(); + } + } + _ => {} + } + } + + /// Returns `true` if the lexer is currently inside an f-string. + /// + /// It'll return `false` once the `FStringEnd` token for the outermost + /// f-string is visited. + const fn in_fstring(&self) -> bool { + self.nesting > 0 + } + + /// Returns the complete range of the previously visited f-string. + /// + /// This method should only be called once the lexer is outside of any + /// f-string, otherwise it might return an invalid range. + /// + /// It doesn't consume the builder because there can be multiple f-strings + /// throughout the source code. 
+ fn finish(&self) -> TextRange { + debug_assert!(!self.in_fstring()); + TextRange::new(self.start_location, self.end_location) + } +} + struct FormatStringContinuation<'a> { string: &'a AnyString<'a>, } @@ -195,6 +257,10 @@ impl Format> for FormatStringContinuation<'_> { // because this is a black preview style. let lexer = lex_starts_at(string_content, Mode::Expression, string_range.start()); + // The lexer emits multiple tokens for a single f-string literal. Each token + // will have its own range but we require the complete range of the f-string. + let mut fstring_range_builder = FStringRangeBuilder::default(); + let mut joiner = f.join_with(in_parentheses_only_soft_line_break_or_space()); for token in lexer { @@ -226,8 +292,31 @@ impl Format> for FormatStringContinuation<'_> { } }; + fstring_range_builder.visit_token(&token, token_range); + + // We need to ignore all the tokens within the f-string as there can + // be `String` tokens inside it as well. For example, + // + // ```python + // f"foo {'bar'} foo" + // # ^^^^^ + // # Ignore any logic for this `String` token + // ``` + // + // Here, we're interested in the complete f-string, not the individual + // tokens inside it. + if fstring_range_builder.in_fstring() { + continue; + } + match token { - Tok::String { .. } => { + Tok::String { .. } | Tok::FStringEnd => { + let token_range = if token.is_f_string_end() { + fstring_range_builder.finish() + } else { + token_range + }; + // ```python // ( // "a"