From 0d02da182c92129c980ec7697192634d647635d1 Mon Sep 17 00:00:00 2001 From: Alex Waygood Date: Thu, 14 Mar 2024 17:26:04 +0000 Subject: [PATCH 1/6] fix the bug --- Cargo.lock | 1 + Cargo.toml | 1 + .../test/fixtures/pyflakes/F821_28.py | 9 ++++ crates/ruff_linter/src/rules/pyflakes/mod.rs | 1 + ...les__pyflakes__tests__F821_F821_28.py.snap | 10 ++++ crates/ruff_python_parser/Cargo.toml | 1 + crates/ruff_python_parser/src/lexer.rs | 47 ++++++++++++------- crates/ruff_python_parser/src/lexer/cursor.rs | 10 ---- 8 files changed, 54 insertions(+), 26 deletions(-) create mode 100644 crates/ruff_linter/resources/test/fixtures/pyflakes/F821_28.py create mode 100644 crates/ruff_linter/src/rules/pyflakes/snapshots/ruff_linter__rules__pyflakes__tests__F821_F821_28.py.snap diff --git a/Cargo.lock b/Cargo.lock index b8345646d05ac..0a72f59ee275c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2368,6 +2368,7 @@ dependencies = [ "static_assertions", "tiny-keccak", "unicode-ident", + "unicode-normalization", "unicode_names2", ] diff --git a/Cargo.toml b/Cargo.toml index d1de94534a0e7..c07df6dbc2ef3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -108,6 +108,7 @@ unic-ucd-category = { version = "0.9" } unicode-ident = { version = "1.0.12" } unicode-width = { version = "0.1.11" } unicode_names2 = { version = "1.2.2" } +unicode-normalization = { version = "0.1.23" } ureq = { version = "2.9.6" } url = { version = "2.5.0" } uuid = { version = "1.6.1", features = ["v4", "fast-rng", "macro-diagnostics", "js"] } diff --git a/crates/ruff_linter/resources/test/fixtures/pyflakes/F821_28.py b/crates/ruff_linter/resources/test/fixtures/pyflakes/F821_28.py new file mode 100644 index 0000000000000..2bdea407cbf0c --- /dev/null +++ b/crates/ruff_linter/resources/test/fixtures/pyflakes/F821_28.py @@ -0,0 +1,9 @@ +"""Test that unicode identifiers are NFKC-normalised""" + +𝒞 = 500 +print(𝒞) +print(C + 𝒞) # 2 references to the same variable due to NFKC normalization +print(C / 𝒞) +print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮) + +print(𝒟) # F821 diff --git a/crates/ruff_linter/src/rules/pyflakes/mod.rs b/crates/ruff_linter/src/rules/pyflakes/mod.rs index aa08d9d32de65..563c48422c138 100644 --- a/crates/ruff_linter/src/rules/pyflakes/mod.rs +++ b/crates/ruff_linter/src/rules/pyflakes/mod.rs @@ -156,6 +156,7 @@ mod tests { #[test_case(Rule::UndefinedName, Path::new("F821_26.py"))] #[test_case(Rule::UndefinedName, Path::new("F821_26.pyi"))] #[test_case(Rule::UndefinedName, Path::new("F821_27.py"))] + #[test_case(Rule::UndefinedName, Path::new("F821_28.py"))] #[test_case(Rule::UndefinedExport, Path::new("F822_0.py"))] #[test_case(Rule::UndefinedExport, Path::new("F822_0.pyi"))] #[test_case(Rule::UndefinedExport, Path::new("F822_1.py"))] diff --git a/crates/ruff_linter/src/rules/pyflakes/snapshots/ruff_linter__rules__pyflakes__tests__F821_F821_28.py.snap b/crates/ruff_linter/src/rules/pyflakes/snapshots/ruff_linter__rules__pyflakes__tests__F821_F821_28.py.snap new file mode 100644 index 0000000000000..e8464267070eb --- /dev/null +++ b/crates/ruff_linter/src/rules/pyflakes/snapshots/ruff_linter__rules__pyflakes__tests__F821_F821_28.py.snap @@ -0,0 +1,10 @@ +--- +source: crates/ruff_linter/src/rules/pyflakes/mod.rs +--- +F821_28.py:9:7: F821 Undefined name `𝒟` + | +7 | print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮) +8 | +9 | print(𝒟) # F821 + | ^ F821 + | diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index 886bb07fec0b6..2ccf94a8b181f 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ 
-28,6 +28,7 @@ rustc-hash = { workspace = true } static_assertions = { workspace = true } unicode-ident = { workspace = true } unicode_names2 = { workspace = true } +unicode-normalization = { workspace = true } [dev-dependencies] insta = { workspace = true } diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index bb6316eb641fa..a139855bfb0c0 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -29,9 +29,10 @@ //! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html use std::iter::FusedIterator; -use std::{char, cmp::Ordering, str::FromStr}; +use std::{borrow::Cow, char, cmp::Ordering, str::FromStr}; use unicode_ident::{is_xid_continue, is_xid_start}; +use unicode_normalization::UnicodeNormalization; use ruff_python_ast::{Int, IpyEscapeKind}; use ruff_text_size::{TextLen, TextRange, TextSize}; @@ -197,11 +198,37 @@ impl<'source> Lexer<'source> { _ => {} } - self.cursor.eat_while(is_identifier_continuation); + let mut is_ascii = first.is_ascii(); - let text = self.token_text(); + loop { + let c = self.cursor.first(); + // Arrange things such that ASCII codepoints never + // result in the slower `is_xid_continue` getting called. + if c.is_ascii() { + if !matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9') { + break; + } + } else { + if !is_xid_continue(c) { + break; + } + is_ascii = false; + } + if self.cursor.is_eof() { + break; + } + self.cursor.bump(); + } - let keyword = match text { + let text = { + if is_ascii { + Cow::Borrowed(self.token_text()) + } else { + Cow::Owned(self.token_text().nfkc().collect()) + } + }; + + let keyword = match &*text { "False" => Tok::False, "None" => Tok::None, "True" => Tok::True, @@ -1583,18 +1610,6 @@ fn is_unicode_identifier_start(c: char) -> bool { is_xid_start(c) } -// Checks if the character c is a valid continuation character as described -// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers -fn is_identifier_continuation(c: char) -> bool { - // Arrange things such that ASCII codepoints never - // result in the slower `is_xid_continue` getting called. - if c.is_ascii() { - matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9') - } else { - is_xid_continue(c) - } -} - /// Returns `true` for [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens) /// characters. /// diff --git a/crates/ruff_python_parser/src/lexer/cursor.rs b/crates/ruff_python_parser/src/lexer/cursor.rs index 6dd8e63d70ad8..eeac9b9228e70 100644 --- a/crates/ruff_python_parser/src/lexer/cursor.rs +++ b/crates/ruff_python_parser/src/lexer/cursor.rs @@ -119,16 +119,6 @@ impl<'a> Cursor<'a> { } } - /// Eats symbols while predicate returns true or until the end of file is reached. - #[inline] - pub(super) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { - // It was tried making optimized version of this for eg. line comments, but - // LLVM can inline all of this and compile it down to fast iteration over bytes. - while predicate(self.first()) && !self.is_eof() { - self.bump(); - } - } - /// Skips the next `count` bytes. 
/// /// ## Panics From a8a9863dfa6beba7156a2a95e393dbd32a945756 Mon Sep 17 00:00:00 2001 From: Alex Waygood Date: Thu, 14 Mar 2024 17:31:18 +0000 Subject: [PATCH 2/6] remove a debug assertion that no longer passes --- .../src/expression/expr_name.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/crates/ruff_python_formatter/src/expression/expr_name.rs b/crates/ruff_python_formatter/src/expression/expr_name.rs index f2014f6771f76..68cf5af95798e 100644 --- a/crates/ruff_python_formatter/src/expression/expr_name.rs +++ b/crates/ruff_python_formatter/src/expression/expr_name.rs @@ -1,4 +1,4 @@ -use ruff_formatter::{write, FormatContext}; +use ruff_formatter::write; use ruff_python_ast::AnyNodeRef; use ruff_python_ast::ExprName; @@ -11,16 +11,11 @@ pub struct FormatExprName; impl FormatNodeRule for FormatExprName { fn fmt_fields(&self, item: &ExprName, f: &mut PyFormatter) -> FormatResult<()> { - let ExprName { id, range, ctx: _ } = item; - - debug_assert_eq!( - id.as_str(), - f.context() - .source_code() - .slice(*range) - .text(f.context().source_code()) - ); - + let ExprName { + id: _, + range, + ctx: _, + } = item; write!(f, [source_text_slice(*range)]) } From 1646f0fb4a711614e5e1027472f56a1607a957d1 Mon Sep 17 00:00:00 2001 From: Alex Waygood Date: Fri, 15 Mar 2024 15:44:13 +0000 Subject: [PATCH 3/6] optimize a little bit --- crates/ruff_python_parser/src/lexer.rs | 31 ++++++++++++-------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index a139855bfb0c0..c3a42c0a1c05d 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -29,7 +29,7 @@ //! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html use std::iter::FusedIterator; -use std::{borrow::Cow, char, cmp::Ordering, str::FromStr}; +use std::{char, cmp::Ordering, str::FromStr}; use unicode_ident::{is_xid_continue, is_xid_start}; use unicode_normalization::UnicodeNormalization; @@ -170,7 +170,7 @@ impl<'source> Lexer<'source> { } /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix. 
- fn lex_identifier(&mut self, first: char) -> Result { + fn lex_identifier(&mut self, first: char, ascii_first_char: bool) -> Result { // Detect potential string like rb'' b'' f'' u'' r'' match (first, self.cursor.first()) { ('f' | 'F', quote @ ('\'' | '"')) => { @@ -198,7 +198,7 @@ impl<'source> Lexer<'source> { _ => {} } - let mut is_ascii = first.is_ascii(); + let mut is_ascii = ascii_first_char; loop { let c = self.cursor.first(); @@ -220,15 +220,7 @@ impl<'source> Lexer<'source> { self.cursor.bump(); } - let text = { - if is_ascii { - Cow::Borrowed(self.token_text()) - } else { - Cow::Owned(self.token_text().nfkc().collect()) - } - }; - - let keyword = match &*text { + let keyword = match self.token_text() { "False" => Tok::False, "None" => Tok::None, "True" => Tok::True, @@ -267,10 +259,15 @@ impl<'source> Lexer<'source> { "while" => Tok::While, "with" => Tok::With, "yield" => Tok::Yield, - _ => { + text => { + let name = if is_ascii { + text.to_string() + } else { + text.nfkc().collect() + }; return Ok(Tok::Name { - name: text.to_string().into_boxed_str(), - }) + name: name.into_boxed_str(), + }); } }; @@ -933,7 +930,7 @@ impl<'source> Lexer<'source> { if c.is_ascii() { self.consume_ascii_character(c) } else if is_unicode_identifier_start(c) { - let identifier = self.lex_identifier(c)?; + let identifier = self.lex_identifier(c, false)?; self.state = State::Other; Ok((identifier, self.token_range())) @@ -1093,7 +1090,7 @@ impl<'source> Lexer<'source> { // Dispatch based on the given character. fn consume_ascii_character(&mut self, c: char) -> Result { let token = match c { - c if is_ascii_identifier_start(c) => self.lex_identifier(c)?, + c if is_ascii_identifier_start(c) => self.lex_identifier(c, true)?, '0'..='9' => self.lex_number(c)?, '#' => return Ok((self.lex_comment(), self.token_range())), '\'' | '"' => self.lex_string(None, c)?, From d9a68bad8afcb43bc9e0a62f409de7965247a276 Mon Sep 17 00:00:00 2001 From: Alex Waygood Date: Fri, 15 Mar 2024 18:05:00 +0000 Subject: [PATCH 4/6] Optimize more --- crates/ruff_python_parser/src/lexer.rs | 35 ++++++++----------- crates/ruff_python_parser/src/lexer/cursor.rs | 10 ++++++ 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index c3a42c0a1c05d..4f42bd8c5670f 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -199,26 +199,8 @@ impl<'source> Lexer<'source> { } let mut is_ascii = ascii_first_char; - - loop { - let c = self.cursor.first(); - // Arrange things such that ASCII codepoints never - // result in the slower `is_xid_continue` getting called. - if c.is_ascii() { - if !matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9') { - break; - } - } else { - if !is_xid_continue(c) { - break; - } - is_ascii = false; - } - if self.cursor.is_eof() { - break; - } - self.cursor.bump(); - } + self.cursor + .eat_while(|c| is_identifier_continuation(c, &mut is_ascii)); let keyword = match self.token_text() { "False" => Tok::False, @@ -1607,6 +1589,19 @@ fn is_unicode_identifier_start(c: char) -> bool { is_xid_start(c) } +// Checks if the character c is a valid continuation character as described +// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers +fn is_identifier_continuation(c: char, ascii_only_identifier: &mut bool) -> bool { + // Arrange things such that ASCII codepoints never + // result in the slower `is_xid_continue` getting called. 
+ if c.is_ascii() { + matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9') + } else { + *ascii_only_identifier = false; + is_xid_continue(c) + } +} + /// Returns `true` for [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens) /// characters. /// diff --git a/crates/ruff_python_parser/src/lexer/cursor.rs b/crates/ruff_python_parser/src/lexer/cursor.rs index eeac9b9228e70..6dd8e63d70ad8 100644 --- a/crates/ruff_python_parser/src/lexer/cursor.rs +++ b/crates/ruff_python_parser/src/lexer/cursor.rs @@ -119,6 +119,16 @@ impl<'a> Cursor<'a> { } } + /// Eats symbols while predicate returns true or until the end of file is reached. + #[inline] + pub(super) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { + // It was tried making optimized version of this for eg. line comments, but + // LLVM can inline all of this and compile it down to fast iteration over bytes. + while predicate(self.first()) && !self.is_eof() { + self.bump(); + } + } + /// Skips the next `count` bytes. /// /// ## Panics From d785be50606df110d848606165e2f7a9aa406835 Mon Sep 17 00:00:00 2001 From: Alex Waygood Date: Fri, 15 Mar 2024 18:41:35 +0000 Subject: [PATCH 5/6] docs and better naming --- crates/ruff_python_parser/src/lexer.rs | 27 ++++++++++++++++++++------ crates/ruff_python_parser/src/token.rs | 3 +++ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index 4f42bd8c5670f..b8bd7df29b7dd 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -170,7 +170,11 @@ impl<'source> Lexer<'source> { } /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix. - fn lex_identifier(&mut self, first: char, ascii_first_char: bool) -> Result { + fn lex_identifier( + &mut self, + first: char, + first_char_is_ascii: bool, + ) -> Result { // Detect potential string like rb'' b'' f'' u'' r'' match (first, self.cursor.first()) { ('f' | 'F', quote @ ('\'' | '"')) => { @@ -198,7 +202,14 @@ impl<'source> Lexer<'source> { _ => {} } - let mut is_ascii = ascii_first_char; + // Keep track of whether the identifier is ASCII-only or not. + // + // This is important because Python applies NFKC normalization to + // identifiers: https://docs.python.org/3/reference/lexical_analysis.html#identifiers. + // We need to therefore do the same in our lexer, but applying NFKC normalization + // unconditionally is extremely expensive. If we know an identifier is ASCII-only, + // (by far the most common case), we can skip NFKC normalization of the identifier. + let mut is_ascii = first_char_is_ascii; self.cursor .eat_while(|c| is_identifier_continuation(c, &mut is_ascii)); @@ -1589,15 +1600,19 @@ fn is_unicode_identifier_start(c: char) -> bool { is_xid_start(c) } -// Checks if the character c is a valid continuation character as described -// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers -fn is_identifier_continuation(c: char, ascii_only_identifier: &mut bool) -> bool { +/// Checks if the character c is a valid continuation character as described +/// in . +/// +/// Additionally, this function also keeps track of whether or not the total +/// identifier is ASCII-only or not by mutably altering a reference to a +/// boolean value passed in. 
+fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool { // Arrange things such that ASCII codepoints never // result in the slower `is_xid_continue` getting called. if c.is_ascii() { matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9') } else { - *ascii_only_identifier = false; + *identifier_is_ascii_only = false; is_xid_continue(c) } } diff --git a/crates/ruff_python_parser/src/token.rs b/crates/ruff_python_parser/src/token.rs index 84080c1b8cad0..892915718fa1d 100644 --- a/crates/ruff_python_parser/src/token.rs +++ b/crates/ruff_python_parser/src/token.rs @@ -16,6 +16,9 @@ pub enum Tok { /// Token value for a name, commonly known as an identifier. Name { /// The name value. + /// + /// Unicode names are NFKC-normalized by the lexer, + /// matching [the behaviour of Python's lexer](https://docs.python.org/3/reference/lexical_analysis.html#identifiers) name: Box, }, /// Token value for an integer. From 35500ab44ada0d9cd8f1f9667ca78bf61dc85b10 Mon Sep 17 00:00:00 2001 From: Alex Waygood Date: Mon, 18 Mar 2024 11:36:46 +0000 Subject: [PATCH 6/6] address Micha's review --- crates/ruff_python_parser/src/lexer.rs | 44 ++++++++++++++++---------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index b8bd7df29b7dd..e803c0263b108 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -170,11 +170,7 @@ impl<'source> Lexer<'source> { } /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix. - fn lex_identifier( - &mut self, - first: char, - first_char_is_ascii: bool, - ) -> Result { + fn lex_identifier(&mut self, first: char) -> Result { // Detect potential string like rb'' b'' f'' u'' r'' match (first, self.cursor.first()) { ('f' | 'F', quote @ ('\'' | '"')) => { @@ -209,11 +205,19 @@ impl<'source> Lexer<'source> { // We need to therefore do the same in our lexer, but applying NFKC normalization // unconditionally is extremely expensive. If we know an identifier is ASCII-only, // (by far the most common case), we can skip NFKC normalization of the identifier. - let mut is_ascii = first_char_is_ascii; + let mut is_ascii = first.is_ascii(); self.cursor .eat_while(|c| is_identifier_continuation(c, &mut is_ascii)); - let keyword = match self.token_text() { + let text = self.token_text(); + + if !is_ascii { + return Ok(Tok::Name { + name: text.nfkc().collect::().into_boxed_str(), + }); + } + + let keyword = match text { "False" => Tok::False, "None" => Tok::None, "True" => Tok::True, @@ -252,15 +256,10 @@ impl<'source> Lexer<'source> { "while" => Tok::While, "with" => Tok::With, "yield" => Tok::Yield, - text => { - let name = if is_ascii { - text.to_string() - } else { - text.nfkc().collect() - }; + _ => { return Ok(Tok::Name { - name: name.into_boxed_str(), - }); + name: text.to_string().into_boxed_str(), + }) } }; @@ -923,7 +922,7 @@ impl<'source> Lexer<'source> { if c.is_ascii() { self.consume_ascii_character(c) } else if is_unicode_identifier_start(c) { - let identifier = self.lex_identifier(c, false)?; + let identifier = self.lex_identifier(c)?; self.state = State::Other; Ok((identifier, self.token_range())) @@ -1083,7 +1082,7 @@ impl<'source> Lexer<'source> { // Dispatch based on the given character. 
     fn consume_ascii_character(&mut self, c: char) -> Result {
         let token = match c {
-            c if is_ascii_identifier_start(c) => self.lex_identifier(c, true)?,
+            c if is_ascii_identifier_start(c) => self.lex_identifier(c)?,
             '0'..='9' => self.lex_number(c)?,
             '#' => return Ok((self.lex_comment(), self.token_range())),
             '\'' | '"' => self.lex_string(None, c)?,
@@ -2064,6 +2063,17 @@ def f(arg=%timeit a = b):
         assert_debug_snapshot!(lex_source(source));
     }
 
+    fn get_tokens_only(source: &str) -> Vec<Tok> {
+        lex_source(source).into_iter().map(|(tok, _)| tok).collect()
+    }
+
+    #[test]
+    fn test_nfkc_normalization() {
+        let source1 = "𝒞 = 500";
+        let source2 = "C = 500";
+        assert_eq!(get_tokens_only(source1), get_tokens_only(source2));
+    }
+
     fn triple_quoted_eol(eol: &str) -> Vec {
         let source = format!("\"\"\"{eol} test string{eol} \"\"\"");
         lex_source(&source
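
As a standalone illustration (not part of the diff above) of the property the lexer change relies on: NFKC normalization folds the mathematical-letter codepoints exercised by the F821_28.py fixture down to plain ASCII "C", while leaving ASCII identifiers untouched, which is what makes the ASCII-only fast path safe to skip normalization. A minimal sketch using the same unicode-normalization crate the patch adds as a dependency (the helper names here are illustrative, not from the patch):

use unicode_normalization::UnicodeNormalization;

// NFKC-normalize a string, as the lexer now does for non-ASCII identifiers.
fn nfkc(s: &str) -> String {
    s.nfkc().collect()
}

fn main() {
    // Each mathematical variant of "C" from the fixture normalizes to plain "C",
    // so `𝒞 = 500` and `print(C)` refer to the same name, matching CPython.
    for variant in ["𝒞", "𝑪", "𝓒", "𝕮"] {
        assert_eq!(nfkc(variant), "C");
    }

    // ASCII identifiers are fixed points of NFKC, so the lexer can skip
    // normalization entirely for the (by far most common) all-ASCII case.
    assert_eq!(nfkc("some_ascii_name"), "some_ascii_name");
}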