# Update W605 to check in f-strings (#7329)
## Summary

This PR updates the `W605` (invalid-escape-sequence) rule to check inside
f-strings. It also adds support for reporting violations on invalid escape
sequences involving the curly braces of f-strings, so the following cases
are now identified:

```python
f"\{1}"
f"\{{1}}"
f"{1:\}"
```

The new CPython parser also emits a syntax warning for such cases:

```
fstring.py:1: SyntaxWarning: invalid escape sequence '\{'
  f"\{1}"
fstring.py:2: SyntaxWarning: invalid escape sequence '\{'
  f"\{{1}}"
fstring.py:3: SyntaxWarning: invalid escape sequence '\}'
  f"{1:\}"
```
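
For reference, the warning can be reproduced directly (a minimal sketch, assuming CPython 3.12 or newer, where the new f-string tokenizer from PEP 701 landed):

```python
import warnings

# Compiling the snippet is enough to trigger the SyntaxWarning at parse time;
# the "always" filter makes sure it is displayed even if an identical warning
# was already emitted earlier in the session.
with warnings.catch_warnings():
    warnings.simplefilter("always")
    compile(r'f"\{1}"', "fstring.py", "exec")  # SyntaxWarning: invalid escape sequence '\{'
```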

Nested f-strings are supported as well: the generated fix is aware of the nesting
and creates its edit for the innermost f-string that contains the invalid escape
sequence.
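
For example (a sketch mirroring the new `W605_2.py` fixture; the nested same-quote f-string itself requires Python 3.12+):

```python
# Flagged: the invalid `\{` lives in the inner f-string.
value = f"{f"\{1}"}"

# Expected shape after the raw-string fix: only the inner f-string gets the `r` prefix.
value = f"{rf"\{1}"}"
```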

## Test Plan

Add new test cases for f-strings.
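
The new fixture is exercised through the `#[test_case]` entry added to `crates/ruff/src/rules/pycodestyle/mod.rs`, so the existing snapshot-test workflow should pick it up (e.g. `cargo test -p ruff pycodestyle`, assuming the usual Ruff development setup).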

fixes: #7295
dhruvmanila committed Sep 19, 2023
1 parent 17be76b commit 101f9be
Showing 5 changed files with 355 additions and 29 deletions.
54 changes: 54 additions & 0 deletions crates/ruff/resources/test/fixtures/pycodestyle/W605_2.py
@@ -0,0 +1,54 @@
# Same as `W605_0.py` but using f-strings instead.

#: W605:1:10
regex = f'\.png$'

#: W605:2:1
regex = f'''
\.png$
'''

#: W605:2:6
f(
    f'\_'
)

#: W605:4:6
f"""
multi-line
literal
with \_ somewhere
in the middle
"""

#: W605:1:38
value = f'new line\nand invalid escape \_ here'


#: Okay
regex = fr'\.png$'
regex = f'\\.png$'
regex = fr'''
\.png$
'''
regex = fr'''
\\.png$
'''
s = f'\\'
regex = f'\w' # noqa
regex = f'''
\w
''' # noqa

regex = f'\\\_'
value = f'\{{1}}'
value = f'\{1}'
value = f'{1:\}'
value = f"{f"\{1}"}"
value = rf"{f"\{1}"}"

# Okay
value = rf'\{{1}}'
value = rf'\{1}'
value = rf'{1:\}'
value = f"{rf"\{1}"}"
16 changes: 8 additions & 8 deletions crates/ruff/src/checkers/tokens.rs
@@ -77,14 +77,14 @@ pub(crate) fn check_tokens(

     if settings.rules.enabled(Rule::InvalidEscapeSequence) {
         for (tok, range) in tokens.iter().flatten() {
-            if tok.is_string() {
-                pycodestyle::rules::invalid_escape_sequence(
-                    &mut diagnostics,
-                    locator,
-                    *range,
-                    settings.rules.should_fix(Rule::InvalidEscapeSequence),
-                );
-            }
+            pycodestyle::rules::invalid_escape_sequence(
+                &mut diagnostics,
+                locator,
+                indexer,
+                tok,
+                *range,
+                settings.rules.should_fix(Rule::InvalidEscapeSequence),
+            );
         }
     }

1 change: 1 addition & 0 deletions crates/ruff/src/rules/pycodestyle/mod.rs
@@ -28,6 +28,7 @@ mod tests {
     #[test_case(Rule::BlankLineWithWhitespace, Path::new("W29.py"))]
     #[test_case(Rule::InvalidEscapeSequence, Path::new("W605_0.py"))]
     #[test_case(Rule::InvalidEscapeSequence, Path::new("W605_1.py"))]
+    #[test_case(Rule::InvalidEscapeSequence, Path::new("W605_2.py"))]
     #[test_case(Rule::LineTooLong, Path::new("E501.py"))]
     #[test_case(Rule::MixedSpacesAndTabs, Path::new("E101.py"))]
     #[test_case(Rule::ModuleImportNotAtTopOfFile, Path::new("E40.py"))]
86 changes: 65 additions & 21 deletions crates/ruff/src/rules/pycodestyle/rules/invalid_escape_sequence.rs
@@ -2,7 +2,8 @@ use memchr::memchr_iter;

 use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, Edit, Fix};
 use ruff_macros::{derive_message_formats, violation};
-use ruff_python_ast::str::{leading_quote, trailing_quote};
+use ruff_python_index::Indexer;
+use ruff_python_parser::Tok;
 use ruff_source_file::Locator;
 use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};

@@ -45,29 +46,32 @@ impl AlwaysAutofixableViolation for InvalidEscapeSequence {
 pub(crate) fn invalid_escape_sequence(
     diagnostics: &mut Vec<Diagnostic>,
     locator: &Locator,
-    range: TextRange,
+    indexer: &Indexer,
+    token: &Tok,
+    token_range: TextRange,
     autofix: bool,
 ) {
-    let text = locator.slice(range);
-
-    // Determine whether the string is single- or triple-quoted.
-    let Some(leading_quote) = leading_quote(text) else {
-        return;
-    };
-    let Some(trailing_quote) = trailing_quote(text) else {
-        return;
+    let token_source_code = match token {
+        Tok::FStringMiddle { value, is_raw } => {
+            if *is_raw {
+                return;
+            }
+            value.as_str()
+        }
+        Tok::String { kind, .. } => {
+            if kind.is_raw() {
+                return;
+            }
+            locator.slice(token_range)
+        }
+        _ => return,
     };
-    let body = &text[leading_quote.len()..text.len() - trailing_quote.len()];
-
-    if leading_quote.contains(['r', 'R']) {
-        return;
-    }

     let mut contains_valid_escape_sequence = false;
     let mut invalid_escape_sequence = Vec::new();

     let mut prev = None;
-    let bytes = body.as_bytes();
+    let bytes = token_source_code.as_bytes();
     for i in memchr_iter(b'\\', bytes) {
         // If the previous character was also a backslash, skip.
         if prev.is_some_and(|prev| prev == i - 1) {
@@ -77,9 +81,38 @@ pub(crate) fn invalid_escape_sequence(

         prev = Some(i);

-        let Some(next_char) = body[i + 1..].chars().next() else {
+        let next_char = match token_source_code[i + 1..].chars().next() {
+            Some(next_char) => next_char,
+            None if token.is_f_string_middle() => {
+                // If we're at the end of a f-string middle token, the next character
+                // is actually emitted as a different token. For example,
+                //
+                // ```python
+                // f"\{1}"
+                // ```
+                //
+                // is lexed as `FStringMiddle('\\')` and `LBrace` (ignoring irrelevant
+                // tokens), so we need to check the next character in the source code.
+                //
+                // Now, if we're at the end of the f-string itself, the lexer wouldn't
+                // have emitted the `FStringMiddle` token in the first place. For example,
+                //
+                // ```python
+                // f"foo\"
+                // ```
+                //
+                // Here, there won't be any `FStringMiddle` because it's an unterminated
+                // f-string. This means that if there's a `FStringMiddle` token and we
+                // encounter a `\` character, then the next character is always going to
+                // be part of the f-string.
+                if let Some(next_char) = locator.after(token_range.end()).chars().next() {
+                    next_char
+                } else {
+                    continue;
+                }
+            }
             // If we're at the end of the file, skip.
-            continue;
+            None => continue,
         };

         // If we're at the end of line, skip.
@@ -120,7 +153,7 @@ pub(crate) fn invalid_escape_sequence(
             continue;
         }

-        let location = range.start() + leading_quote.text_len() + TextSize::try_from(i).unwrap();
+        let location = token_range.start() + TextSize::try_from(i).unwrap();
         let range = TextRange::at(location, next_char.text_len() + TextSize::from(1));
         invalid_escape_sequence.push(Diagnostic::new(InvalidEscapeSequence(next_char), range));

@@ -135,14 +168,25 @@ pub(crate) fn invalid_escape_sequence(
             )));
         }
     } else {
+        let tok_start = if token.is_f_string_middle() {
+            // SAFETY: If this is a `FStringMiddle` token, then the indexer
+            // must have the f-string range.
+            indexer
+                .fstring_ranges()
+                .innermost(token_range.start())
+                .unwrap()
+                .start()
+        } else {
+            token_range.start()
+        };
         // Turn into raw string.
         for diagnostic in &mut invalid_escape_sequence {
             // If necessary, add a space between any leading keyword (`return`, `yield`,
             // `assert`, etc.) and the string. For example, `return"foo"` is valid, but
             // `returnr"foo"` is not.
             diagnostic.set_fix(Fix::automatic(Edit::insertion(
-                pad_start("r".to_string(), range.start(), locator),
-                range.start(),
+                pad_start("r".to_string(), tok_start, locator),
+                tok_start,
             )));
         }
     }
