diff --git a/Cargo.lock b/Cargo.lock
index b8345646d05ac..0a72f59ee275c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2368,6 +2368,7 @@ dependencies = [
  "static_assertions",
  "tiny-keccak",
  "unicode-ident",
+ "unicode-normalization",
  "unicode_names2",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index d1de94534a0e7..c07df6dbc2ef3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -108,6 +108,7 @@ unic-ucd-category = { version = "0.9" }
 unicode-ident = { version = "1.0.12" }
 unicode-width = { version = "0.1.11" }
 unicode_names2 = { version = "1.2.2" }
+unicode-normalization = { version = "0.1.23" }
 ureq = { version = "2.9.6" }
 url = { version = "2.5.0" }
 uuid = { version = "1.6.1", features = ["v4", "fast-rng", "macro-diagnostics", "js"] }
diff --git a/crates/ruff_linter/resources/test/fixtures/pyflakes/F821_28.py b/crates/ruff_linter/resources/test/fixtures/pyflakes/F821_28.py
new file mode 100644
index 0000000000000..2bdea407cbf0c
--- /dev/null
+++ b/crates/ruff_linter/resources/test/fixtures/pyflakes/F821_28.py
@@ -0,0 +1,9 @@
+"""Test that unicode identifiers are NFKC-normalised"""
+
+𝒞 = 500
+print(𝒞)
+print(C + 𝒞)  # 2 references to the same variable due to NFKC normalization
+print(C / 𝒞)
+print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮)
+
+print(𝒟)  # F821
diff --git a/crates/ruff_linter/src/rules/pyflakes/mod.rs b/crates/ruff_linter/src/rules/pyflakes/mod.rs
index aa08d9d32de65..563c48422c138 100644
--- a/crates/ruff_linter/src/rules/pyflakes/mod.rs
+++ b/crates/ruff_linter/src/rules/pyflakes/mod.rs
@@ -156,6 +156,7 @@ mod tests {
     #[test_case(Rule::UndefinedName, Path::new("F821_26.py"))]
     #[test_case(Rule::UndefinedName, Path::new("F821_26.pyi"))]
     #[test_case(Rule::UndefinedName, Path::new("F821_27.py"))]
+    #[test_case(Rule::UndefinedName, Path::new("F821_28.py"))]
     #[test_case(Rule::UndefinedExport, Path::new("F822_0.py"))]
     #[test_case(Rule::UndefinedExport, Path::new("F822_0.pyi"))]
     #[test_case(Rule::UndefinedExport, Path::new("F822_1.py"))]
diff --git a/crates/ruff_linter/src/rules/pyflakes/snapshots/ruff_linter__rules__pyflakes__tests__F821_F821_28.py.snap b/crates/ruff_linter/src/rules/pyflakes/snapshots/ruff_linter__rules__pyflakes__tests__F821_F821_28.py.snap
new file mode 100644
index 0000000000000..e8464267070eb
--- /dev/null
+++ b/crates/ruff_linter/src/rules/pyflakes/snapshots/ruff_linter__rules__pyflakes__tests__F821_F821_28.py.snap
@@ -0,0 +1,10 @@
+---
+source: crates/ruff_linter/src/rules/pyflakes/mod.rs
+---
+F821_28.py:9:7: F821 Undefined name `𝒟`
+  |
+7 | print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮)
+8 |
+9 | print(𝒟)  # F821
+  |       ^ F821
+  |
diff --git a/crates/ruff_python_formatter/src/expression/expr_name.rs b/crates/ruff_python_formatter/src/expression/expr_name.rs
index f2014f6771f76..68cf5af95798e 100644
--- a/crates/ruff_python_formatter/src/expression/expr_name.rs
+++ b/crates/ruff_python_formatter/src/expression/expr_name.rs
@@ -1,4 +1,4 @@
-use ruff_formatter::{write, FormatContext};
+use ruff_formatter::write;
 use ruff_python_ast::AnyNodeRef;
 use ruff_python_ast::ExprName;
 
@@ -11,16 +11,11 @@ pub struct FormatExprName;
 
 impl FormatNodeRule<ExprName> for FormatExprName {
     fn fmt_fields(&self, item: &ExprName, f: &mut PyFormatter) -> FormatResult<()> {
-        let ExprName { id, range, ctx: _ } = item;
-
-        debug_assert_eq!(
-            id.as_str(),
-            f.context()
-                .source_code()
-                .slice(*range)
-                .text(f.context().source_code())
-        );
-
+        let ExprName {
+            id: _,
+            range,
+            ctx: _,
+        } = item;
         write!(f, [source_text_slice(*range)])
     }
 
diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml
index 886bb07fec0b6..2ccf94a8b181f 100644
--- a/crates/ruff_python_parser/Cargo.toml
+++ b/crates/ruff_python_parser/Cargo.toml
@@ -28,6 +28,7 @@ rustc-hash = { workspace = true }
 static_assertions = { workspace = true }
 unicode-ident = { workspace = true }
 unicode_names2 = { workspace = true }
+unicode-normalization = { workspace = true }
 
 [dev-dependencies]
 insta = { workspace = true }
diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs
index bb6316eb641fa..e803c0263b108 100644
--- a/crates/ruff_python_parser/src/lexer.rs
+++ b/crates/ruff_python_parser/src/lexer.rs
@@ -32,6 +32,7 @@ use std::iter::FusedIterator;
 use std::{char, cmp::Ordering, str::FromStr};
 
 use unicode_ident::{is_xid_continue, is_xid_start};
+use unicode_normalization::UnicodeNormalization;
 
 use ruff_python_ast::{Int, IpyEscapeKind};
 use ruff_text_size::{TextLen, TextRange, TextSize};
@@ -197,10 +198,25 @@ impl<'source> Lexer<'source> {
             _ => {}
         }
 
-        self.cursor.eat_while(is_identifier_continuation);
+        // Keep track of whether the identifier is ASCII-only or not.
+        //
+        // This is important because Python applies NFKC normalization to
+        // identifiers: https://docs.python.org/3/reference/lexical_analysis.html#identifiers.
+        // We therefore need to do the same in our lexer, but applying NFKC normalization
+        // unconditionally is extremely expensive. If we know an identifier is ASCII-only
+        // (by far the most common case), we can skip NFKC normalization of the identifier.
+        let mut is_ascii = first.is_ascii();
+        self.cursor
+            .eat_while(|c| is_identifier_continuation(c, &mut is_ascii));
 
         let text = self.token_text();
 
+        if !is_ascii {
+            return Ok(Tok::Name {
+                name: text.nfkc().collect::<String>().into_boxed_str(),
+            });
+        }
+
         let keyword = match text {
             "False" => Tok::False,
             "None" => Tok::None,
@@ -1583,14 +1599,19 @@ fn is_unicode_identifier_start(c: char) -> bool {
     is_xid_start(c)
 }
 
-// Checks if the character c is a valid continuation character as described
-// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
-fn is_identifier_continuation(c: char) -> bool {
+/// Checks if the character `c` is a valid continuation character as described
+/// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
+///
+/// Additionally, this function keeps track of whether the identifier
+/// as a whole is ASCII-only, by mutating the boolean reference
+/// passed in by the caller.
+fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
     // Arrange things such that ASCII codepoints never
     // result in the slower `is_xid_continue` getting called.
     if c.is_ascii() {
         matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
     } else {
+        *identifier_is_ascii_only = false;
        is_xid_continue(c)
     }
 }
@@ -2042,6 +2063,17 @@ def f(arg=%timeit a = b):
         assert_debug_snapshot!(lex_source(source));
     }
 
+    fn get_tokens_only(source: &str) -> Vec<Tok> {
+        lex_source(source).into_iter().map(|(tok, _)| tok).collect()
+    }
+
+    #[test]
+    fn test_nfkc_normalization() {
+        let source1 = "𝒞 = 500";
+        let source2 = "C = 500";
+        assert_eq!(get_tokens_only(source1), get_tokens_only(source2));
+    }
+
     fn triple_quoted_eol(eol: &str) -> Vec<Spanned> {
         let source = format!("\"\"\"{eol} test string{eol} \"\"\"");
         lex_source(&source)
diff --git a/crates/ruff_python_parser/src/token.rs b/crates/ruff_python_parser/src/token.rs
index 84080c1b8cad0..892915718fa1d 100644
--- a/crates/ruff_python_parser/src/token.rs
+++ b/crates/ruff_python_parser/src/token.rs
@@ -16,6 +16,9 @@ pub enum Tok {
     /// Token value for a name, commonly known as an identifier.
     Name {
         /// The name value.
+        ///
+        /// Unicode names are NFKC-normalized by the lexer,
+        /// matching [the behaviour of Python's lexer](https://docs.python.org/3/reference/lexical_analysis.html#identifiers).
        name: Box<str>,
    },
    /// Token value for an integer.
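
In essence, the lexer change above boils down to the fast path sketched below. This is a minimal standalone sketch, not code from this patch: `normalize_identifier` is a hypothetical helper, and only the unicode-normalization crate is assumed.

    use unicode_normalization::UnicodeNormalization;

    // Hypothetical helper mirroring the lexer's fast path: ASCII identifiers
    // are already in NFKC form, so the normalization pass is skipped for them.
    fn normalize_identifier(text: &str) -> Box<str> {
        if text.is_ascii() {
            text.into()
        } else {
            text.nfkc().collect::<String>().into_boxed_str()
        }
    }

    fn main() {
        // U+1D49E MATHEMATICAL SCRIPT CAPITAL C normalizes to ASCII `C`,
        // which is why `𝒞` and `C` name the same variable in the fixture above.
        assert_eq!(&*normalize_identifier("𝒞"), "C");
        // ASCII input is returned unchanged, with no normalization pass.
        assert_eq!(&*normalize_identifier("snake_case"), "snake_case");
    }

The lexer goes one step further than this sketch: rather than re-scanning the token text with `is_ascii()`, it tracks ASCII-ness character by character inside `is_identifier_continuation` while the cursor consumes the identifier.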