Track quoting style in the tokenizer (astral-sh#10256)

nkxxll · Mar 10, 2024 · 4ac1b15 · 4ac1b15
1 parent bd973e8
commit 4ac1b15
Show file tree

Hide file tree

Showing 55 changed files with 4,063 additions and 3,268 deletions.
diff --git a/crates/ruff_linter/resources/test/fixtures/flake8_quotes/singles_escaped_unnecessary.py b/crates/ruff_linter/resources/test/fixtures/flake8_quotes/singles_escaped_unnecessary.py
@@ -40,4 +40,7 @@
 
 # Make sure we do not unescape quotes
 this_is_fine = "This is an \\'escaped\\' quote"
-this_should_raise_Q004 = "This is an \\\'escaped\\\' quote with an extra backslash"
+this_should_raise_Q004 = "This is an \\\'escaped\\\' quote with an extra backslash"  # Q004
+
+# Invalid escapes in bytestrings are also triggered:
+x = b"\xe7\xeb\x0c\xa1\x1b\x83tN\xce=x\xe9\xbe\x01\xb9\x13B_\xba\xe7\x0c2\xce\'rm\x0e\xcd\xe9.\xf8\xd2"  # Q004
diff --git a/crates/ruff_linter/src/directives.rs b/crates/ruff_linter/src/directives.rs
@@ -131,10 +131,7 @@ fn extract_noqa_line_for(lxr: &[LexResult], locator: &Locator, indexer: &Indexer
 
             // For multi-line strings, we expect `noqa` directives on the last line of the
             // string.
-            Tok::String {
-                triple_quoted: true,
-                ..
-            } => {
+            Tok::String { kind, .. } if kind.is_triple_quoted() => {
                 if locator.contains_line_break(*range) {
                     string_mappings.push(TextRange::new(
                         locator.line_start(range.start()),

diff --git a/crates/ruff_linter/src/rules/flake8_commas/rules/trailing_commas.rs b/crates/ruff_linter/src/rules/flake8_commas/rules/trailing_commas.rs
@@ -243,7 +243,7 @@ pub(crate) fn trailing_commas(
             // F-strings are handled as `String` token type with the complete range
             // of the outermost f-string. This means that the expression inside the
             // f-string is not checked for trailing commas.
-            Tok::FStringStart => {
+            Tok::FStringStart(_) => {
                 fstrings = fstrings.saturating_add(1);
                 None
             }

diff --git a/crates/ruff_linter/src/rules/flake8_implicit_str_concat/rules/implicit.rs b/crates/ruff_linter/src/rules/flake8_implicit_str_concat/rules/implicit.rs
@@ -110,7 +110,7 @@ pub(crate) fn implicit(
     {
         let (a_range, b_range) = match (a_tok, b_tok) {
             (Tok::String { .. }, Tok::String { .. }) => (*a_range, *b_range),
-            (Tok::String { .. }, Tok::FStringStart) => {
+            (Tok::String { .. }, Tok::FStringStart(_)) => {
                 match indexer.fstring_ranges().innermost(b_range.start()) {
                     Some(b_range) => (*a_range, b_range),
                     None => continue,
@@ -122,7 +122,7 @@ pub(crate) fn implicit(
                     None => continue,
                 }
             }
-            (Tok::FStringEnd, Tok::FStringStart) => {
+            (Tok::FStringEnd, Tok::FStringStart(_)) => {
                 match (
                     indexer.fstring_ranges().innermost(a_range.start()),
                     indexer.fstring_ranges().innermost(b_range.start()),

diff --git a/crates/ruff_linter/src/rules/flake8_quotes/rules/avoidable_escaped_quote.rs b/crates/ruff_linter/src/rules/flake8_quotes/rules/avoidable_escaped_quote.rs
@@ -1,6 +1,5 @@
 use ruff_diagnostics::{AlwaysFixableViolation, Diagnostic, Edit, Fix};
 use ruff_macros::{derive_message_formats, violation};
-use ruff_python_ast::str::{is_triple_quote, leading_quote};
 use ruff_python_parser::lexer::LexResult;
 use ruff_python_parser::Tok;
 use ruff_source_file::Locator;
@@ -158,7 +157,7 @@ pub(crate) fn avoidable_escaped_quote(
             // ```python
             // f'"foo" {'nested'}"
             // ```
-            if matches!(tok, Tok::String { .. } | Tok::FStringStart) {
+            if matches!(tok, Tok::String { .. } | Tok::FStringStart(_)) {
                 if let Some(fstring_context) = fstrings.last_mut() {
                     fstring_context.ignore_escaped_quotes();
                     continue;
@@ -170,16 +169,13 @@ pub(crate) fn avoidable_escaped_quote(
             Tok::String {
                 value: string_contents,
                 kind,
-                triple_quoted,
             } => {
-                if kind.is_raw() || *triple_quoted {
+                if kind.is_raw_string() || kind.is_triple_quoted() {
                     continue;
                 }
 
                 // Check if we're using the preferred quotation style.
-                if !leading_quote(locator.slice(tok_range)).is_some_and(|text| {
-                    contains_quote(text, quotes_settings.inline_quotes.as_char())
-                }) {
+                if Quote::from(kind.quote_style()) != quotes_settings.inline_quotes {
                     continue;
                 }
 
@@ -192,7 +188,7 @@ pub(crate) fn avoidable_escaped_quote(
                     let mut diagnostic = Diagnostic::new(AvoidableEscapedQuote, tok_range);
                     let fixed_contents = format!(
                         "{prefix}{quote}{value}{quote}",
-                        prefix = kind.as_str(),
+                        prefix = kind.prefix_str(),
                         quote = quotes_settings.inline_quotes.opposite().as_char(),
                         value = unescape_string(
                             string_contents,
@@ -206,12 +202,11 @@ pub(crate) fn avoidable_escaped_quote(
                     diagnostics.push(diagnostic);
                 }
             }
-            Tok::FStringStart => {
-                let text = locator.slice(tok_range);
+            Tok::FStringStart(kind) => {
                 // Check for escaped quote only if we're using the preferred quotation
                 // style and it isn't a triple-quoted f-string.
-                let check_for_escaped_quote = !is_triple_quote(text)
-                    && contains_quote(text, quotes_settings.inline_quotes.as_char());
+                let check_for_escaped_quote = !kind.is_triple_quoted()
+                    && Quote::from(kind.quote_style()) == quotes_settings.inline_quotes;
                 fstrings.push(FStringContext::new(
                     check_for_escaped_quote,
                     tok_range,
@@ -220,9 +215,8 @@ pub(crate) fn avoidable_escaped_quote(
             }
             Tok::FStringMiddle {
                 value: string_contents,
-                is_raw,
-                triple_quoted: _,
-            } if !is_raw => {
+                kind,
+            } if !kind.is_raw_string() => {
                 let Some(context) = fstrings.last_mut() else {
                     continue;
                 };
@@ -315,25 +309,20 @@ pub(crate) fn unnecessary_escaped_quote(
             Tok::String {
                 value: string_contents,
                 kind,
-                triple_quoted,
             } => {
-                if kind.is_raw() || *triple_quoted {
+                if kind.is_raw_string() || kind.is_triple_quoted() {
                     continue;
                 }
 
-                let leading = match leading_quote(locator.slice(tok_range)) {
-                    Some("\"") => Quote::Double,
-                    Some("'") => Quote::Single,
-                    _ => continue,
-                };
+                let leading = kind.quote_style();
                 if !contains_escaped_quote(string_contents, leading.opposite().as_char()) {
                     continue;
                 }
 
                 let mut diagnostic = Diagnostic::new(UnnecessaryEscapedQuote, tok_range);
                 let fixed_contents = format!(
                     "{prefix}{quote}{value}{quote}",
-                    prefix = kind.as_str(),
+                    prefix = kind.prefix_str(),
                     quote = leading.as_char(),
                     value = unescape_string(string_contents, leading.opposite().as_char())
                 );
@@ -343,16 +332,11 @@ pub(crate) fn unnecessary_escaped_quote(
                 )));
                 diagnostics.push(diagnostic);
             }
-            Tok::FStringStart => {
-                let text = locator.slice(tok_range);
+            Tok::FStringStart(kind) => {
                 // Check for escaped quote only if we're using the preferred quotation
                 // style and it isn't a triple-quoted f-string.
-                let check_for_escaped_quote = !is_triple_quote(text);
-                let quote_style = if contains_quote(text, Quote::Single.as_char()) {
-                    Quote::Single
-                } else {
-                    Quote::Double
-                };
+                let check_for_escaped_quote = !kind.is_triple_quoted();
+                let quote_style = Quote::from(kind.quote_style());
                 fstrings.push(FStringContext::new(
                     check_for_escaped_quote,
                     tok_range,
@@ -361,9 +345,8 @@ pub(crate) fn unnecessary_escaped_quote(
             }
             Tok::FStringMiddle {
                 value: string_contents,
-                is_raw,
-                triple_quoted: _,
-            } if !is_raw => {
+                kind,
+            } if !kind.is_raw_string() => {
                 let Some(context) = fstrings.last_mut() else {
                     continue;
                 };

diff --git a/crates/ruff_linter/src/rules/flake8_quotes/rules/check_string_quotes.rs b/crates/ruff_linter/src/rules/flake8_quotes/rules/check_string_quotes.rs
@@ -383,7 +383,7 @@ struct FStringRangeBuilder {
 impl FStringRangeBuilder {
     fn visit_token(&mut self, token: &Tok, range: TextRange) {
         match token {
-            Tok::FStringStart => {
+            Tok::FStringStart(_) => {
                 if self.nesting == 0 {
                     self.start_location = range.start();
                 }

diff --git a/crates/ruff_linter/src/rules/flake8_quotes/settings.rs b/crates/ruff_linter/src/rules/flake8_quotes/settings.rs
@@ -22,6 +22,15 @@ impl Default for Quote {
     }
 }
 
+impl From<ruff_python_parser::QuoteStyle> for Quote {
+    fn from(value: ruff_python_parser::QuoteStyle) -> Self {
+        match value {
+            ruff_python_parser::QuoteStyle::Double => Self::Double,
+            ruff_python_parser::QuoteStyle::Single => Self::Single,
+        }
+    }
+}
+
 #[derive(Debug, CacheKey)]
 pub struct Settings {
     pub inline_quotes: Quote,

diff --git a/...er__rules__flake8_quotes__tests__require_doubles_over_singles_escaped_unnecessary.py.snap b/...er__rules__flake8_quotes__tests__require_doubles_over_singles_escaped_unnecessary.py.snap
@@ -326,16 +326,34 @@ singles_escaped_unnecessary.py:43:26: Q004 [*] Unnecessary escape on inner quote
    |
 41 | # Make sure we do not unescape quotes
 42 | this_is_fine = "This is an \\'escaped\\' quote"
-43 | this_should_raise_Q004 = "This is an \\\'escaped\\\' quote with an extra backslash"
+43 | this_should_raise_Q004 = "This is an \\\'escaped\\\' quote with an extra backslash"  # Q004
    |                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Q004
+44 | 
+45 | # Invalid escapes in bytestrings are also triggered:
    |
    = help: Remove backslash
 
 ℹ Safe fix
 40 40 | 
 41 41 | # Make sure we do not unescape quotes
 42 42 | this_is_fine = "This is an \\'escaped\\' quote"
-43    |-this_should_raise_Q004 = "This is an \\\'escaped\\\' quote with an extra backslash"
-   43 |+this_should_raise_Q004 = "This is an \\'escaped\\' quote with an extra backslash"
+43    |-this_should_raise_Q004 = "This is an \\\'escaped\\\' quote with an extra backslash"  # Q004
+   43 |+this_should_raise_Q004 = "This is an \\'escaped\\' quote with an extra backslash"  # Q004
+44 44 | 
+45 45 | # Invalid escapes in bytestrings are also triggered:
+46 46 | x = b"\xe7\xeb\x0c\xa1\x1b\x83tN\xce=x\xe9\xbe\x01\xb9\x13B_\xba\xe7\x0c2\xce\'rm\x0e\xcd\xe9.\xf8\xd2"  # Q004
 
+singles_escaped_unnecessary.py:46:5: Q004 [*] Unnecessary escape on inner quote character
+   |
+45 | # Invalid escapes in bytestrings are also triggered:
+46 | x = b"\xe7\xeb\x0c\xa1\x1b\x83tN\xce=x\xe9\xbe\x01\xb9\x13B_\xba\xe7\x0c2\xce\'rm\x0e\xcd\xe9.\xf8\xd2"  # Q004
+   |     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Q004
+   |
+   = help: Remove backslash
 
+ℹ Safe fix
+43 43 | this_should_raise_Q004 = "This is an \\\'escaped\\\' quote with an extra backslash"  # Q004
+44 44 | 
+45 45 | # Invalid escapes in bytestrings are also triggered:
+46    |-x = b"\xe7\xeb\x0c\xa1\x1b\x83tN\xce=x\xe9\xbe\x01\xb9\x13B_\xba\xe7\x0c2\xce\'rm\x0e\xcd\xe9.\xf8\xd2"  # Q004
+   46 |+x = b"\xe7\xeb\x0c\xa1\x1b\x83tN\xce=x\xe9\xbe\x01\xb9\x13B_\xba\xe7\x0c2\xce'rm\x0e\xcd\xe9.\xf8\xd2"  # Q004
diff --git a/crates/ruff_linter/src/rules/pycodestyle/rules/invalid_escape_sequence.rs b/crates/ruff_linter/src/rules/pycodestyle/rules/invalid_escape_sequence.rs
@@ -3,7 +3,7 @@ use memchr::memchr_iter;
 use ruff_diagnostics::{AlwaysFixableViolation, Diagnostic, Edit, Fix};
 use ruff_macros::{derive_message_formats, violation};
 use ruff_python_index::Indexer;
-use ruff_python_parser::{StringKind, Tok};
+use ruff_python_parser::Tok;
 use ruff_source_file::Locator;
 use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
 
@@ -66,21 +66,21 @@ pub(crate) fn invalid_escape_sequence(
     token: &Tok,
     token_range: TextRange,
 ) {
-    let (token_source_code, string_start_location) = match token {
-        Tok::FStringMiddle { value, is_raw, .. } => {
-            if *is_raw {
+    let (token_source_code, string_start_location, kind) = match token {
+        Tok::FStringMiddle { value, kind } => {
+            if kind.is_raw_string() {
                 return;
             }
             let Some(range) = indexer.fstring_ranges().innermost(token_range.start()) else {
                 return;
             };
-            (&**value, range.start())
+            (&**value, range.start(), kind)
         }
         Tok::String { kind, .. } => {
-            if kind.is_raw() {
+            if kind.is_raw_string() {
                 return;
             }
-            (locator.slice(token_range), token_range.start())
+            (locator.slice(token_range), token_range.start(), kind)
         }
         _ => return,
     };
@@ -207,13 +207,7 @@ pub(crate) fn invalid_escape_sequence(
                 invalid_escape_char.range(),
             );
 
-            if matches!(
-                token,
-                Tok::String {
-                    kind: StringKind::Unicode,
-                    ..
-                }
-            ) {
+            if kind.is_u_string() {
                 // Replace the Unicode prefix with `r`.
                 diagnostic.set_fix(Fix::safe_edit(Edit::replacement(
                     "r".to_string(),

diff --git a/crates/ruff_linter/src/rules/pylint/rules/bad_string_format_character.rs b/crates/ruff_linter/src/rules/pylint/rules/bad_string_format_character.rs
@@ -2,15 +2,14 @@ use std::str::FromStr;
 
 use ruff_diagnostics::{Diagnostic, Violation};
 use ruff_macros::{derive_message_formats, violation};
-use ruff_python_ast::str::{leading_quote, trailing_quote};
 use ruff_python_ast::Expr;
 use ruff_python_literal::{
     cformat::{CFormatErrorType, CFormatString},
     format::FormatPart,
     format::FromTemplate,
     format::{FormatSpec, FormatSpecError, FormatString},
 };
-use ruff_python_parser::{lexer, Mode};
+use ruff_python_parser::{lexer, Mode, StringKind, Tok};
 use ruff_text_size::{Ranged, TextRange};
 
 use crate::checkers::ast::Checker;
@@ -93,15 +92,15 @@ pub(crate) fn call(checker: &mut Checker, string: &str, range: TextRange) {
 /// Ex) `"%z" % "1"`
 pub(crate) fn percent(checker: &mut Checker, expr: &Expr) {
     // Grab each string segment (in case there's an implicit concatenation).
-    let mut strings: Vec<TextRange> = vec![];
+    let mut strings: Vec<(TextRange, StringKind)> = vec![];
     for (tok, range) in
         lexer::lex_starts_at(checker.locator().slice(expr), Mode::Module, expr.start()).flatten()
     {
-        if tok.is_string() {
-            strings.push(range);
-        } else if tok.is_percent() {
+        match tok {
+            Tok::String { kind, .. } => strings.push((range, kind)),
             // Break as soon as we find the modulo symbol.
-            break;
+            Tok::Percent => break,
+            _ => {}
         }
     }
 
@@ -110,12 +109,10 @@ pub(crate) fn percent(checker: &mut Checker, expr: &Expr) {
         return;
     }
 
-    for range in &strings {
+    for (range, kind) in &strings {
         let string = checker.locator().slice(*range);
-        let (Some(leader), Some(trailer)) = (leading_quote(string), trailing_quote(string)) else {
-            return;
-        };
-        let string = &string[leader.len()..string.len() - trailer.len()];
+        let string = &string
+            [usize::from(kind.opener_len())..(string.len() - usize::from(kind.closer_len()))];
 
         // Parse the format string (e.g. `"%s"`) into a list of `PercentFormat`.
         if let Err(format_error) = CFormatString::from_str(string) {