Skip to content

Commit 482b63a

Browse files
authored Mar 18, 2025
feat(es/fast-lexer): Enhance identifier handling with Unicode support (#10226)
**Description:**

- Added support for Unicode characters in identifiers, allowing for a broader range of valid identifiers.
- Introduced methods to handle Unicode escape sequences and UTF-8 characters in identifiers.
- Updated the lexer to utilize these new methods, improving the parsing of identifiers that include non-ASCII characters.
- Added tests to verify the correct handling of identifiers with Unicode characters and escape sequences.
1 parent 540bdf8 commit 482b63a

File tree

7 files changed

+154
-40
lines changed

7 files changed

+154
-40
lines changed
 

‎Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎crates/swc_ecma_fast_parser/Cargo.toml

+5-4
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,11 @@ swc_atoms = { version = "5.0.0", path = "../swc_atoms" }
1818
swc_common = { version = "8.0.0", path = "../swc_common" }
1919
swc_ecma_ast = { version = "8.0.1", path = "../swc_ecma_ast" }
2020

21-
assume = { workspace = true }
22-
memchr = { workspace = true }
23-
num-bigint = { workspace = true }
24-
wide = { workspace = true }
21+
assume = { workspace = true }
22+
memchr = { workspace = true }
23+
num-bigint = { workspace = true }
24+
unicode-id-start = { workspace = true }
25+
wide = { workspace = true }
2526

2627
[dev-dependencies]
2728
criterion = { workspace = true }

‎crates/swc_ecma_fast_parser/src/lexer/cursor.rs

+27
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,19 @@ impl<'a> Cursor<'a> {
5555
}
5656
}
5757

58+
/// Peek at the current character without advancing
59+
#[inline(always)]
60+
pub fn peek_char(&self) -> Option<char> {
61+
self.peek().and_then(|b| {
62+
if b.is_ascii() {
63+
Some(b as char)
64+
} else {
65+
let rest_str = unsafe { std::str::from_utf8_unchecked(self.rest()) };
66+
rest_str.chars().next()
67+
}
68+
})
69+
}
70+
5871
/// Peek at a byte at a specific offset from the current position
5972
#[inline(always)]
6073
pub fn peek_at(&self, offset: u32) -> Option<u8> {
@@ -82,6 +95,20 @@ impl<'a> Cursor<'a> {
8295
self.pos += 1;
8396
}
8497

98+
/// Advance the cursor by one character
99+
#[inline(always)]
100+
pub fn advance_char(&mut self) {
101+
assume!(unsafe: !self.is_eof());
102+
let byte = self.peek().unwrap();
103+
if byte.is_ascii() {
104+
self.advance();
105+
} else {
106+
let rest_str = unsafe { std::str::from_utf8_unchecked(self.rest()) };
107+
let ch = rest_str.chars().next().unwrap();
108+
self.advance_n(ch.len_utf8() as u32);
109+
}
110+
}
111+
85112
/// Advance the cursor by n bytes
86113
#[inline(always)]
87114
pub fn advance_n(&mut self, n: u32) {

‎crates/swc_ecma_fast_parser/src/lexer/identifier.rs

+98-10
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33
//! This module handles the parsing of ECMAScript/TypeScript identifiers.
44
55
use swc_atoms::Atom;
6+
use unicode_id_start::{is_id_continue_unicode, is_id_start_unicode};
67

78
use super::Lexer;
89
use crate::{
910
error::Result,
1011
token::{keyword_to_token_type, Token, TokenType, TokenValue},
12+
util::likely,
1113
};
1214

1315
/// Fast mapping from ASCII to check if a character is valid for identifier
@@ -65,15 +67,27 @@ impl Lexer<'_> {
6567
self.cursor.advance();
6668

6769
// Read as many identifier continue chars as possible
68-
self.cursor.advance_while(Self::is_identifier_continue);
70+
self.cursor.advance_while(Self::is_ascii_id_continue);
6971

7072
// Extract the identifier text
71-
let span = self.span();
7273
let ident_start = start_pos.0;
7374
let ident_end = self.cursor.position();
7475
let ident_bytes = unsafe { self.cursor.slice_unchecked(ident_start, ident_end) };
75-
let ident_str = unsafe { std::str::from_utf8_unchecked(ident_bytes) };
76+
let non_unicode_ident_str = unsafe { std::str::from_utf8_unchecked(ident_bytes) };
77+
78+
let ident_str = if let Some(ch) = self.cursor.peek() {
79+
if ch == b'\\' {
80+
&self.read_identifier_with_unicode_escape(non_unicode_ident_str)?
81+
} else if !ch.is_ascii() {
82+
&self.read_identifier_with_utf8_charater(non_unicode_ident_str)?
83+
} else {
84+
non_unicode_ident_str
85+
}
86+
} else {
87+
non_unicode_ident_str
88+
};
7689
let had_line_break_bool: bool = self.had_line_break.into();
90+
let span = self.span();
7791

7892
// For non-keyword identifiers, we can directly return without checking keyword
7993
// maps
@@ -94,20 +108,32 @@ impl Lexer<'_> {
94108
self.cursor.advance();
95109

96110
// Read as many identifier continue chars as possible
97-
self.cursor.advance_while(Self::is_identifier_continue);
111+
self.cursor.advance_while(Self::is_ascii_id_continue);
98112

99113
// Extract the identifier text
100-
let span = self.span();
101114
let ident_start = start_pos.0;
102115
let ident_end = self.cursor.position();
103-
let ident_bytes = unsafe { self.cursor.slice_unchecked(ident_start, ident_end) };
104-
// SAFETY: We've verified the bytes are valid UTF-8
105-
let ident_str = unsafe { std::str::from_utf8_unchecked(ident_bytes) };
106116
let had_line_break_bool: bool = self.had_line_break.into();
107-
117+
let non_unicode_ident_str = unsafe {
118+
std::str::from_utf8_unchecked(self.cursor.slice_unchecked(ident_start, ident_end))
119+
};
120+
121+
let ident_str = if let Some(ch) = self.cursor.peek() {
122+
if ch == b'\\' {
123+
&self.read_identifier_with_unicode_escape(non_unicode_ident_str)?
124+
} else if !ch.is_ascii() {
125+
&self.read_identifier_with_utf8_charater(non_unicode_ident_str)?
126+
} else {
127+
non_unicode_ident_str
128+
}
129+
} else {
130+
non_unicode_ident_str
131+
};
108132
// Ultra-fast path for common 2-6 letter keywords using direct table lookup
109-
let len = ident_bytes.len();
133+
let ident_bytes = ident_str.as_bytes();
134+
let len = ident_str.len();
110135

136+
let span = self.span();
111137
// Only process if first byte is an ASCII lowercase letter (all keywords start
112138
// with a-z)
113139
if len > 0 && ident_bytes[0] >= b'a' && ident_bytes[0] <= b'z' {
@@ -131,6 +157,46 @@ impl Lexer<'_> {
131157
))
132158
}
133159

160+
fn read_identifier_with_unicode_escape(&mut self, non_unicode: &str) -> Result<String> {
161+
let mut buffer = String::from(non_unicode);
162+
self.identifier_with_unicode_escape_part(&mut buffer)?;
163+
164+
Ok(buffer)
165+
}
166+
167+
fn identifier_with_unicode_escape_part(&mut self, buffer: &mut String) -> Result<()> {
168+
while let Some(ch) = self.cursor.peek_char() {
169+
if ch == '\\' && self.cursor.peek_at(1) == Some(b'u') {
170+
// Skip the "\\u"
171+
self.cursor.advance_n(2);
172+
let unicode_escape = self.read_unicode_escape()?;
173+
buffer.push(unicode_escape);
174+
} else if Self::is_identifier_continue(ch) {
175+
buffer.push(ch);
176+
self.cursor.advance_char();
177+
} else {
178+
break;
179+
}
180+
}
181+
Ok(())
182+
}
183+
184+
fn read_identifier_with_utf8_charater(&mut self, non_unicode: &str) -> Result<String> {
185+
let mut buffer = String::from(non_unicode);
186+
while let Some(ch) = self.cursor.peek_char() {
187+
if likely(Self::is_identifier_continue(ch)) {
188+
buffer.push(ch);
189+
self.cursor.advance_char();
190+
} else if ch == '\\' {
191+
self.identifier_with_unicode_escape_part(&mut buffer)?;
192+
} else {
193+
break;
194+
}
195+
}
196+
197+
Ok(buffer)
198+
}
199+
134200
/// Super fast check for ASCII identifier start character
135201
#[inline(always)]
136202
pub(crate) fn is_ascii_id_start(ch: u8) -> bool {
@@ -142,4 +208,26 @@ impl Lexer<'_> {
142208
pub(crate) fn is_ascii_id_continue(ch: u8) -> bool {
143209
ch < 128 && unsafe { (IDENT_CHAR.get_unchecked(ch as usize) & 2) != 0 }
144210
}
211+
212+
/// Check if a character is a valid identifier start character
213+
#[inline(always)]
214+
pub(crate) fn is_identifier_start(ch: char) -> bool {
215+
// ASCII fast path using optimized identifier functions
216+
if likely(ch.is_ascii()) {
217+
Self::is_ascii_id_start(ch as u8)
218+
} else {
219+
is_id_start_unicode(ch)
220+
}
221+
}
222+
223+
/// Check if a byte is a valid identifier continue character
224+
#[inline(always)]
225+
pub(crate) fn is_identifier_continue(ch: char) -> bool {
226+
// ASCII fast path using optimized identifier functions
227+
if likely(ch.is_ascii()) {
228+
Self::is_ascii_id_continue(ch as u8)
229+
} else {
230+
is_id_continue_unicode(ch)
231+
}
232+
}
145233
}

‎crates/swc_ecma_fast_parser/src/lexer/mod.rs

+1-25
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,7 @@ impl<'a> Lexer<'a> {
393393
}
394394
} else {
395395
// Non-ASCII character path (less common)
396-
if Self::is_identifier_start(ch) {
396+
if Self::is_identifier_start(ch as char) {
397397
self.read_non_keyword_identifier()
398398
} else {
399399
self.cursor.advance();
@@ -717,28 +717,4 @@ impl<'a> Lexer<'a> {
717717
self.had_line_break = LineBreak::Present;
718718
}
719719
}
720-
721-
/// Check if a byte is a valid identifier start character
722-
#[inline(always)]
723-
fn is_identifier_start(byte: u8) -> bool {
724-
// ASCII fast path using optimized identifier functions
725-
if likely(byte < 128) {
726-
Self::is_ascii_id_start(byte)
727-
} else {
728-
// Non-ASCII, needs further checking in read_identifier
729-
true
730-
}
731-
}
732-
733-
/// Check if a byte is a valid identifier continue character
734-
#[inline(always)]
735-
fn is_identifier_continue(byte: u8) -> bool {
736-
// ASCII fast path using optimized identifier functions
737-
if likely(byte < 128) {
738-
Self::is_ascii_id_continue(byte)
739-
} else {
740-
// Non-ASCII, needs further checking in read_identifier
741-
true
742-
}
743-
}
744720
}

‎crates/swc_ecma_fast_parser/src/lexer/regex.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ impl Lexer<'_> {
8383
// Read the flags
8484
let mut flags = String::new();
8585
while let Some(ch) = self.cursor.peek() {
86-
if Self::is_identifier_continue(ch) {
86+
if Self::is_ascii_id_continue(ch) {
8787
flags.push(ch as char);
8888
self.cursor.advance();
8989
} else {

‎crates/swc_ecma_fast_parser/src/lexer/tests.rs

+21
Original file line numberDiff line numberDiff line change
@@ -1295,6 +1295,27 @@ fn test_lexer_number_edge_cases() {
12951295
);
12961296
}
12971297

1298+
#[test]
1299+
fn test_lexer_identifier_with_unicode() {
1300+
// Test identifier with Unicode characters
1301+
verify_tokens(
1302+
"a你好",
1303+
vec![(
1304+
TokenType::Ident,
1305+
Some(TokenValue::Word(Atom::from("a你好"))),
1306+
)],
1307+
);
1308+
1309+
// Test identifier with Unicode escape sequence (code point)
1310+
verify_tokens(
1311+
"e\\u{0061}स्ते\\u{0062}",
1312+
vec![(
1313+
TokenType::Ident,
1314+
Some(TokenValue::Word(Atom::from("eaस्तेb"))),
1315+
)],
1316+
);
1317+
}
1318+
12981319
#[test]
12991320
#[should_panic]
13001321
fn test_lexer_invalid_binary_number() {

0 commit comments

Comments
 (0)
Please sign in to comment.