Skip to content

Commit

Permalink
Fix NFKC normalization of unicode identifiers in the lexer
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexWaygood committed Mar 14, 2024
1 parent e944c16 commit 1bf62d1
Show file tree
Hide file tree
Showing 8 changed files with 54 additions and 26 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ unic-ucd-category = { version = "0.9" }
unicode-ident = { version = "1.0.12" }
unicode-width = { version = "0.1.11" }
unicode_names2 = { version = "1.2.2" }
unicode-normalization = { version = "0.1.23" }
ureq = { version = "2.9.6" }
url = { version = "2.5.0" }
uuid = { version = "1.6.1", features = ["v4", "fast-rng", "macro-diagnostics", "js"] }
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Test that unicode identifiers are NFKC-normalised"""

𝒞 = 500
print(𝒞)
print(C + 𝒞) # 2 references to the same variable due to NFKC normalization
print(C / 𝒞)
print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮)

print(𝒟) # F821
1 change: 1 addition & 0 deletions crates/ruff_linter/src/rules/pyflakes/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ mod tests {
#[test_case(Rule::UndefinedName, Path::new("F821_26.py"))]
#[test_case(Rule::UndefinedName, Path::new("F821_26.pyi"))]
#[test_case(Rule::UndefinedName, Path::new("F821_27.py"))]
#[test_case(Rule::UndefinedName, Path::new("F821_28.py"))]
#[test_case(Rule::UndefinedExport, Path::new("F822_0.py"))]
#[test_case(Rule::UndefinedExport, Path::new("F822_0.pyi"))]
#[test_case(Rule::UndefinedExport, Path::new("F822_1.py"))]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
source: crates/ruff_linter/src/rules/pyflakes/mod.rs
---
F821_28.py:9:7: F821 Undefined name `𝒟`
|
7 | print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮)
8 |
9 | print(𝒟) # F821
| ^ F821
|
1 change: 1 addition & 0 deletions crates/ruff_python_parser/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ rustc-hash = { workspace = true }
static_assertions = { workspace = true }
unicode-ident = { workspace = true }
unicode_names2 = { workspace = true }
unicode-normalization = { workspace = true }

[dev-dependencies]
insta = { workspace = true }
Expand Down
47 changes: 31 additions & 16 deletions crates/ruff_python_parser/src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,10 @@
//! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html

use std::iter::FusedIterator;
use std::{char, cmp::Ordering, str::FromStr};
use std::{borrow::Cow, char, cmp::Ordering, str::FromStr};

use unicode_ident::{is_xid_continue, is_xid_start};
use unicode_normalization::UnicodeNormalization;

use ruff_python_ast::{Int, IpyEscapeKind};
use ruff_text_size::{TextLen, TextRange, TextSize};
Expand Down Expand Up @@ -197,11 +198,37 @@ impl<'source> Lexer<'source> {
_ => {}
}

self.cursor.eat_while(is_identifier_continuation);
let mut is_ascii = first.is_ascii();

let text = self.token_text();
loop {
let c = self.cursor.first();
// Arrange things such that ASCII codepoints never
// result in the slower `is_xid_continue` getting called.
if c.is_ascii() {
if !matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9') {
break;
}
} else {
is_ascii = false;
if !is_xid_continue(c) {
break;
}
}
if self.cursor.is_eof() {
break;
}
self.cursor.bump();
}

let keyword = match text {
let text = {
if is_ascii {
Cow::Borrowed(self.token_text())
} else {
Cow::Owned(self.token_text().nfkc().collect())
}
};

let keyword = match &*text {
"False" => Tok::False,
"None" => Tok::None,
"True" => Tok::True,
Expand Down Expand Up @@ -1583,18 +1610,6 @@ fn is_unicode_identifier_start(c: char) -> bool {
is_xid_start(c)
}

/// Reports whether `c` is a valid identifier *continuation* character as
/// described in
/// https://docs.python.org/3/reference/lexical_analysis.html#identifiers
fn is_identifier_continuation(c: char) -> bool {
    // Fast path: ASCII codepoints are decided with a plain range match so
    // the slower `is_xid_continue` table lookup is only reached for
    // non-ASCII input.
    match c {
        'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => true,
        _ => !c.is_ascii() && is_xid_continue(c),
    }
}

/// Returns `true` for [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens)
/// characters.
///
Expand Down
10 changes: 0 additions & 10 deletions crates/ruff_python_parser/src/lexer/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,16 +119,6 @@ impl<'a> Cursor<'a> {
}
}

/// Eats symbols while predicate returns true or until the end of file is reached.
///
/// The predicate is always invoked on the current character first (it may be
/// a stateful `FnMut`), and the cursor only advances while not at EOF.
#[inline]
pub(super) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
    // A hand-specialised version (e.g. for line comments) was tried, but
    // LLVM inlines this and compiles it down to fast iteration over bytes.
    loop {
        if !predicate(self.first()) || self.is_eof() {
            break;
        }
        self.bump();
    }
}

/// Skips the next `count` bytes.
///
/// ## Panics
Expand Down

0 comments on commit 1bf62d1

Please sign in to comment.