feat(es/fast-lexer): Enhance identifier handling with Unicode support (#10226)

GiveMe-A-Name · web-flow · commit 482b63a905dd · 2025-03-18T22:49:24.000+09:00
**Description:**

- Added support for Unicode characters in identifiers, allowing for a broader range of valid identifiers.
- Introduced methods to handle Unicode escape sequences and UTF-8 characters in identifiers.
- Updated the lexer to utilize these new methods, improving the parsing of identifiers that include non-ASCII characters.
- Added tests to verify the correct handling of identifiers with Unicode characters and escape sequences.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/swc_ecma_fast_parser/Cargo.toml b/crates/swc_ecma_fast_parser/Cargo.toml
@@ -18,10 +18,11 @@ swc_atoms    = { version = "5.0.0", path = "../swc_atoms" }
 swc_common   = { version = "8.0.0", path = "../swc_common" }
 swc_ecma_ast = { version = "8.0.1", path = "../swc_ecma_ast" }
 
-assume     = { workspace = true }
-memchr     = { workspace = true }
-num-bigint = { workspace = true }
-wide       = { workspace = true }
+assume           = { workspace = true }
+memchr           = { workspace = true }
+num-bigint       = { workspace = true }
+unicode-id-start = { workspace = true }
+wide             = { workspace = true }
 
 [dev-dependencies]
 criterion         = { workspace = true }
diff --git a/crates/swc_ecma_fast_parser/src/lexer/cursor.rs b/crates/swc_ecma_fast_parser/src/lexer/cursor.rs
@@ -55,6 +55,19 @@ impl<'a> Cursor<'a> {
         }
     }
 
+    /// Returns the character at the cursor without consuming it.
+    ///
+    /// An ASCII byte is converted directly. For any other byte the remaining
+    /// input is viewed as a string and its first character is decoded.
+    /// Yields `None` when `peek` has no byte to offer.
+    #[inline(always)]
+    pub fn peek_char(&self) -> Option<char> {
+        let lead = self.peek()?;
+        if lead.is_ascii() {
+            return Some(lead as char);
+        }
+        // SAFETY: assumes the remaining input is valid UTF-8 and the cursor
+        // sits on a character boundary -- TODO confirm against all callers.
+        let tail = unsafe { std::str::from_utf8_unchecked(self.rest()) };
+        tail.chars().next()
+    }
+
     /// Peek at a byte at a specific offset from the current position
     #[inline(always)]
     pub fn peek_at(&self, offset: u32) -> Option<u8> {
@@ -82,6 +95,20 @@ impl<'a> Cursor<'a> {
         self.pos += 1;
     }
 
+    /// Moves the cursor past exactly one character.
+    ///
+    /// An ASCII byte advances the cursor by one byte; otherwise the next
+    /// character is decoded from the remaining input and the cursor advances
+    /// by that character's UTF-8 length.
+    ///
+    /// The cursor must not be at end of input: that is stated through
+    /// `assume!` and relied upon by the `unwrap` calls below.
+    #[inline(always)]
+    pub fn advance_char(&mut self) {
+        assume!(unsafe: !self.is_eof());
+        if self.peek().unwrap().is_ascii() {
+            self.advance();
+            return;
+        }
+        // SAFETY: assumes the remaining input is valid UTF-8 and the cursor
+        // sits on a character boundary -- TODO confirm against all callers.
+        let tail = unsafe { std::str::from_utf8_unchecked(self.rest()) };
+        let width = tail.chars().next().unwrap().len_utf8();
+        self.advance_n(width as u32);
+    }
+
     /// Advance the cursor by n bytes
     #[inline(always)]
     pub fn advance_n(&mut self, n: u32) {
diff --git a/crates/swc_ecma_fast_parser/src/lexer/identifier.rs b/crates/swc_ecma_fast_parser/src/lexer/identifier.rs
@@ -3,11 +3,13 @@
 //! This module handles the parsing of ECMAScript/TypeScript identifiers.
 
 use swc_atoms::Atom;
+use unicode_id_start::{is_id_continue_unicode, is_id_start_unicode};
 
 use super::Lexer;
 use crate::{
     error::Result,
     token::{keyword_to_token_type, Token, TokenType, TokenValue},
+    util::likely,
 };
 
 /// Fast mapping from ASCII to check if a character is valid for identifier
@@ -65,15 +67,27 @@ impl Lexer<'_> {
         self.cursor.advance();
 
         // Read as many identifier continue chars as possible
-        self.cursor.advance_while(Self::is_identifier_continue);
+        self.cursor.advance_while(Self::is_ascii_id_continue);
 
         // Extract the identifier text
-        let span = self.span();
         let ident_start = start_pos.0;
         let ident_end = self.cursor.position();
         let ident_bytes = unsafe { self.cursor.slice_unchecked(ident_start, ident_end) };
-        let ident_str = unsafe { std::str::from_utf8_unchecked(ident_bytes) };
+        let non_unicode_ident_str = unsafe { std::str::from_utf8_unchecked(ident_bytes) };
+
+        let ident_str = if let Some(ch) = self.cursor.peek() {
+            if ch == b'\\' {
+                &self.read_identifier_with_unicode_escape(non_unicode_ident_str)?
+            } else if !ch.is_ascii() {
+                &self.read_identifier_with_utf8_charater(non_unicode_ident_str)?
+            } else {
+                non_unicode_ident_str
+            }
+        } else {
+            non_unicode_ident_str
+        };
         let had_line_break_bool: bool = self.had_line_break.into();
+        let span = self.span();
 
         // For non-keyword identifiers, we can directly return without checking keyword
         // maps
@@ -94,20 +108,32 @@ impl Lexer<'_> {
         self.cursor.advance();
 
         // Read as many identifier continue chars as possible
-        self.cursor.advance_while(Self::is_identifier_continue);
+        self.cursor.advance_while(Self::is_ascii_id_continue);
 
         // Extract the identifier text
-        let span = self.span();
         let ident_start = start_pos.0;
         let ident_end = self.cursor.position();
-        let ident_bytes = unsafe { self.cursor.slice_unchecked(ident_start, ident_end) };
-        // SAFETY: We've verified the bytes are valid UTF-8
-        let ident_str = unsafe { std::str::from_utf8_unchecked(ident_bytes) };
         let had_line_break_bool: bool = self.had_line_break.into();
-
+        let non_unicode_ident_str = unsafe {
+            std::str::from_utf8_unchecked(self.cursor.slice_unchecked(ident_start, ident_end))
+        };
+
+        let ident_str = if let Some(ch) = self.cursor.peek() {
+            if ch == b'\\' {
+                &self.read_identifier_with_unicode_escape(non_unicode_ident_str)?
+            } else if !ch.is_ascii() {
+                &self.read_identifier_with_utf8_charater(non_unicode_ident_str)?
+            } else {
+                non_unicode_ident_str
+            }
+        } else {
+            non_unicode_ident_str
+        };
         // Ultra-fast path for common 2-6 letter keywords using direct table lookup
-        let len = ident_bytes.len();
+        let ident_bytes = ident_str.as_bytes();
+        let len = ident_str.len();
 
+        let span = self.span();
         // Only process if first byte is an ASCII lowercase letter (all keywords start
         // with a-z)
         if len > 0 && ident_bytes[0] >= b'a' && ident_bytes[0] <= b'z' {
@@ -131,6 +157,46 @@ impl Lexer<'_> {
         ))
     }
 
+    /// Builds the full identifier text for an identifier whose ASCII prefix
+    /// (`non_unicode`) is followed by a backslash.
+    ///
+    /// The prefix is copied into an owned string and the rest of the
+    /// identifier is appended by `identifier_with_unicode_escape_part`; any
+    /// error from that helper is propagated.
+    fn read_identifier_with_unicode_escape(&mut self, non_unicode: &str) -> Result<String> {
+        let mut ident = String::from(non_unicode);
+        self.identifier_with_unicode_escape_part(&mut ident)?;
+        Ok(ident)
+    }
+
+    /// Appends the remaining identifier characters at the cursor to `buffer`.
+    ///
+    /// Two kinds of input are consumed, repeatedly:
+    /// - a backslash immediately followed by `u`, which is decoded through
+    ///   `read_unicode_escape` and pushed as a single character;
+    /// - any character accepted by `is_identifier_continue`, pushed verbatim.
+    ///
+    /// Scanning stops, without consuming anything further, at end of input or
+    /// at the first character that matches neither case. Errors from
+    /// `read_unicode_escape` are propagated.
+    fn identifier_with_unicode_escape_part(&mut self, buffer: &mut String) -> Result<()> {
+        loop {
+            match self.cursor.peek_char() {
+                Some('\\') if self.cursor.peek_at(1) == Some(b'u') => {
+                    // Step over the backslash and the `u` before decoding.
+                    self.cursor.advance_n(2);
+                    let decoded = self.read_unicode_escape()?;
+                    buffer.push(decoded);
+                }
+                Some(ch) if Self::is_identifier_continue(ch) => {
+                    buffer.push(ch);
+                    self.cursor.advance_char();
+                }
+                _ => break,
+            }
+        }
+        Ok(())
+    }
+
+    /// Reads the remainder of an identifier that contains non-ASCII
+    /// characters.
+    ///
+    /// `non_unicode` is the prefix that has already been consumed; it seeds
+    /// the returned string. Characters accepted by `is_identifier_continue`
+    /// are appended directly, and once a backslash is seen the rest is
+    /// delegated to `identifier_with_unicode_escape_part`.
+    ///
+    /// Errors from decoding a `\u` escape are propagated.
+    fn read_identifier_with_utf8_charater(&mut self, non_unicode: &str) -> Result<String> {
+        let mut buffer = String::from(non_unicode);
+        while let Some(ch) = self.cursor.peek_char() {
+            if likely(Self::is_identifier_continue(ch)) {
+                buffer.push(ch);
+                self.cursor.advance_char();
+                continue;
+            }
+            if ch == '\\' {
+                // The helper consumes every following escape and identifier
+                // character itself, so whatever it leaves behind cannot
+                // continue the identifier. Looping again would spin forever
+                // on a backslash that is not followed by `u`, because the
+                // helper returns without consuming it.
+                self.identifier_with_unicode_escape_part(&mut buffer)?;
+            }
+            break;
+        }
+
+        Ok(buffer)
+    }
+
     /// Super fast check for ASCII identifier start character
     #[inline(always)]
     pub(crate) fn is_ascii_id_start(ch: u8) -> bool {
@@ -142,4 +208,26 @@ impl Lexer<'_> {
     pub(crate) fn is_ascii_id_continue(ch: u8) -> bool {
         ch < 128 && unsafe { (IDENT_CHAR.get_unchecked(ch as usize) & 2) != 0 }
     }
+
+    /// Returns whether the character `ch` may begin an identifier.
+    ///
+    /// ASCII characters are answered by `is_ascii_id_start`; everything else
+    /// is delegated to `is_id_start_unicode`.
+    #[inline(always)]
+    pub(crate) fn is_identifier_start(ch: char) -> bool {
+        // ASCII is hinted as the expected case.
+        if likely(ch.is_ascii()) {
+            return Self::is_ascii_id_start(ch as u8);
+        }
+        is_id_start_unicode(ch)
+    }
+
+    /// Returns whether the character `ch` may appear after the first
+    /// character of an identifier.
+    ///
+    /// ASCII characters are answered by `is_ascii_id_continue`; everything
+    /// else is delegated to `is_id_continue_unicode`.
+    #[inline(always)]
+    pub(crate) fn is_identifier_continue(ch: char) -> bool {
+        // ASCII is hinted as the expected case.
+        if likely(ch.is_ascii()) {
+            return Self::is_ascii_id_continue(ch as u8);
+        }
+        is_id_continue_unicode(ch)
+    }
 }
diff --git a/crates/swc_ecma_fast_parser/src/lexer/mod.rs b/crates/swc_ecma_fast_parser/src/lexer/mod.rs
@@ -393,7 +393,7 @@ impl<'a> Lexer<'a> {
             }
         } else {
             // Non-ASCII character path (less common)
-            if Self::is_identifier_start(ch) {
+            if Self::is_identifier_start(ch as char) {
                 self.read_non_keyword_identifier()
             } else {
                 self.cursor.advance();
@@ -717,28 +717,4 @@ impl<'a> Lexer<'a> {
             self.had_line_break = LineBreak::Present;
         }
     }
-
-    /// Check if a byte is a valid identifier start character
-    #[inline(always)]
-    fn is_identifier_start(byte: u8) -> bool {
-        // ASCII fast path using optimized identifier functions
-        if likely(byte < 128) {
-            Self::is_ascii_id_start(byte)
-        } else {
-            // Non-ASCII, needs further checking in read_identifier
-            true
-        }
-    }
-
-    /// Check if a byte is a valid identifier continue character
-    #[inline(always)]
-    fn is_identifier_continue(byte: u8) -> bool {
-        // ASCII fast path using optimized identifier functions
-        if likely(byte < 128) {
-            Self::is_ascii_id_continue(byte)
-        } else {
-            // Non-ASCII, needs further checking in read_identifier
-            true
-        }
-    }
 }
diff --git a/crates/swc_ecma_fast_parser/src/lexer/regex.rs b/crates/swc_ecma_fast_parser/src/lexer/regex.rs
@@ -83,7 +83,7 @@ impl Lexer<'_> {
         // Read the flags
         let mut flags = String::new();
         while let Some(ch) = self.cursor.peek() {
-            if Self::is_identifier_continue(ch) {
+            if Self::is_ascii_id_continue(ch) {
                 flags.push(ch as char);
                 self.cursor.advance();
             } else {
diff --git a/crates/swc_ecma_fast_parser/src/lexer/tests.rs b/crates/swc_ecma_fast_parser/src/lexer/tests.rs
@@ -1295,6 +1295,27 @@ fn test_lexer_number_edge_cases() {
     );
 }
 
+#[test]
+fn test_lexer_identifier_with_unicode() {
+    // Raw non-ASCII characters after an ASCII start stay part of the
+    // identifier and are kept verbatim.
+    let plain_expected = Some(TokenValue::Word(Atom::from("a你好")));
+    verify_tokens("a你好", vec![(TokenType::Ident, plain_expected)]);
+
+    // Code-point escapes (`\u{...}`) are decoded and may be interleaved with
+    // raw non-ASCII characters inside a single identifier.
+    let escaped_expected = Some(TokenValue::Word(Atom::from("eaस्तेb")));
+    verify_tokens(
+        "e\\u{0061}स्ते\\u{0062}",
+        vec![(TokenType::Ident, escaped_expected)],
+    );
+}
+
 #[test]
 #[should_panic]
 fn test_lexer_invalid_binary_number() {