Apply NFKC normalization to unicode identifiers in the lexer #10412

Merged (6 commits, Mar 18, 2024). Diff below shows changes from 4 of the commits.
1 change: 1 addition & 0 deletions Cargo.lock

(Generated lockfile; diff not rendered.)

1 change: 1 addition & 0 deletions Cargo.toml
@@ -108,6 +108,7 @@ unic-ucd-category = { version = "0.9" }
unicode-ident = { version = "1.0.12" }
unicode-width = { version = "0.1.11" }
unicode_names2 = { version = "1.2.2" }
unicode-normalization = { version = "0.1.23" }
ureq = { version = "2.9.6" }
url = { version = "2.5.0" }
uuid = { version = "1.6.1", features = ["v4", "fast-rng", "macro-diagnostics", "js"] }
9 changes: 9 additions & 0 deletions crates/ruff_linter/resources/test/fixtures/pyflakes/F821_28.py
@@ -0,0 +1,9 @@
"""Test that unicode identifiers are NFKC-normalised"""

𝒞 = 500
print(𝒞)
print(C + 𝒞) # 2 references to the same variable due to NFKC normalization
print(C / 𝒞)
print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮)

print(𝒟) # F821
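
For context on the fixture: Python's lexical analysis spec requires identifiers to be NFKC-normalized, so 𝑪, 𝒞, 𝓒, and 𝕮 all denote the same variable as plain C, while 𝒟 folds to the never-defined D. A minimal standalone sketch of that folding using the unicode-normalization crate added above (illustrative, not part of the PR):

```rust
use unicode_normalization::UnicodeNormalization;

fn main() {
    // Each code point is a "mathematical" variant of the Latin capital C;
    // NFKC compatibility decomposition folds them all to plain ASCII "C".
    for variant in ["𝑪", "𝒞", "𝓒", "𝕮"] {
        assert_eq!(variant.nfkc().collect::<String>(), "C");
    }
    // 𝒟 folds to "D", which the fixture never assigns -- hence the F821.
    assert_eq!("𝒟".nfkc().collect::<String>(), "D");
}
```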
1 change: 1 addition & 0 deletions crates/ruff_linter/src/rules/pyflakes/mod.rs
@@ -156,6 +156,7 @@ mod tests {
#[test_case(Rule::UndefinedName, Path::new("F821_26.py"))]
#[test_case(Rule::UndefinedName, Path::new("F821_26.pyi"))]
#[test_case(Rule::UndefinedName, Path::new("F821_27.py"))]
#[test_case(Rule::UndefinedName, Path::new("F821_28.py"))]
#[test_case(Rule::UndefinedExport, Path::new("F822_0.py"))]
#[test_case(Rule::UndefinedExport, Path::new("F822_0.pyi"))]
#[test_case(Rule::UndefinedExport, Path::new("F822_1.py"))]
10 changes: 10 additions & 0 deletions crates/ruff_linter/src/rules/pyflakes/snapshots/ruff_linter__rules__pyflakes__tests__F821_F821_28.py.snap
@@ -0,0 +1,10 @@
---
source: crates/ruff_linter/src/rules/pyflakes/mod.rs
---
F821_28.py:9:7: F821 Undefined name `𝒟`
|
7 | print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮)
8 |
9 | print(𝒟) # F821
| ^ F821
|
17 changes: 6 additions & 11 deletions crates/ruff_python_formatter/src/expression/expr_name.rs
@@ -1,4 +1,4 @@
use ruff_formatter::{write, FormatContext};
use ruff_formatter::write;
use ruff_python_ast::AnyNodeRef;
use ruff_python_ast::ExprName;

@@ -11,16 +11,11 @@ pub struct FormatExprName;

impl FormatNodeRule<ExprName> for FormatExprName {
fn fmt_fields(&self, item: &ExprName, f: &mut PyFormatter) -> FormatResult<()> {
let ExprName { id, range, ctx: _ } = item;

debug_assert_eq!(
id.as_str(),
f.context()
.source_code()
.slice(*range)
.text(f.context().source_code())
);

let ExprName {
id: _,
range,
ctx: _,
} = item;
write!(f, [source_text_slice(*range)])
}

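
The removed debug_assert_eq! is a casualty of the lexer change: once identifiers are NFKC-normalized at lex time, the stored id can legitimately differ from the raw source slice, so the formatter now re-slices the original source instead of comparing the two. A standalone sketch of the mismatch (illustrative only, not ruff's API):

```rust
use unicode_normalization::UnicodeNormalization;

fn main() {
    let source_slice = "𝒞";                         // what the user wrote
    let id: String = source_slice.nfkc().collect(); // what the lexer now stores
    assert_eq!(id, "C");
    assert_ne!(id, source_slice); // the old debug_assert_eq! would trip here
}
```

Formatting from the source slice also preserves the user's original spelling, which keeps the formatter non-destructive.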
1 change: 1 addition & 0 deletions crates/ruff_python_parser/Cargo.toml
@@ -28,6 +28,7 @@ rustc-hash = { workspace = true }
static_assertions = { workspace = true }
unicode-ident = { workspace = true }
unicode_names2 = { workspace = true }
unicode-normalization = { workspace = true }

[dev-dependencies]
insta = { workspace = true }
29 changes: 18 additions & 11 deletions crates/ruff_python_parser/src/lexer.rs
@@ -32,6 +32,7 @@ use std::iter::FusedIterator;
use std::{char, cmp::Ordering, str::FromStr};

use unicode_ident::{is_xid_continue, is_xid_start};
use unicode_normalization::UnicodeNormalization;

use ruff_python_ast::{Int, IpyEscapeKind};
use ruff_text_size::{TextLen, TextRange, TextSize};
@@ -169,7 +170,7 @@ impl<'source> Lexer<'source> {
}

/// Lex an identifier. Also used for keywords and string/bytes literals with a prefix.
fn lex_identifier(&mut self, first: char) -> Result<Tok, LexicalError> {
fn lex_identifier(&mut self, first: char, ascii_first_char: bool) -> Result<Tok, LexicalError> {
// Detect potential string like rb'' b'' f'' u'' r''
match (first, self.cursor.first()) {
('f' | 'F', quote @ ('\'' | '"')) => {
@@ -197,11 +198,11 @@
_ => {}
}

self.cursor.eat_while(is_identifier_continuation);
let mut is_ascii = ascii_first_char;
self.cursor
.eat_while(|c| is_identifier_continuation(c, &mut is_ascii));

let text = self.token_text();

let keyword = match text {
let keyword = match self.token_text() {
"False" => Tok::False,
"None" => Tok::None,
"True" => Tok::True,
@@ -240,10 +241,15 @@
"while" => Tok::While,
"with" => Tok::With,
"yield" => Tok::Yield,
_ => {
text => {
let name = if is_ascii {
text.to_string()
} else {
text.nfkc().collect()
};
return Ok(Tok::Name {
name: text.to_string().into_boxed_str(),
})
name: name.into_boxed_str(),
});
}
};

@@ -906,7 +912,7 @@
if c.is_ascii() {
self.consume_ascii_character(c)
} else if is_unicode_identifier_start(c) {
let identifier = self.lex_identifier(c)?;
let identifier = self.lex_identifier(c, false)?;
self.state = State::Other;

Ok((identifier, self.token_range()))
@@ -1066,7 +1072,7 @@
// Dispatch based on the given character.
fn consume_ascii_character(&mut self, c: char) -> Result<Spanned, LexicalError> {
let token = match c {
c if is_ascii_identifier_start(c) => self.lex_identifier(c)?,
c if is_ascii_identifier_start(c) => self.lex_identifier(c, true)?,
'0'..='9' => self.lex_number(c)?,
'#' => return Ok((self.lex_comment(), self.token_range())),
'\'' | '"' => self.lex_string(None, c)?,
@@ -1585,12 +1591,13 @@ fn is_unicode_identifier_start(c: char) -> bool {

// Checks if the character c is a valid continuation character as described
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
fn is_identifier_continuation(c: char) -> bool {
fn is_identifier_continuation(c: char, ascii_only_identifier: &mut bool) -> bool {
// Arrange things such that ASCII codepoints never
// result in the slower `is_xid_continue` getting called.
if c.is_ascii() {
matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
} else {
*ascii_only_identifier = false;
is_xid_continue(c)
}
}
Expand Down