3
3
//! This module handles the parsing of ECMAScript/TypeScript identifiers.
4
4
5
5
use swc_atoms:: Atom ;
6
+ use unicode_id_start:: { is_id_continue_unicode, is_id_start_unicode} ;
6
7
7
8
use super :: Lexer ;
8
9
use crate :: {
9
10
error:: Result ,
10
11
token:: { keyword_to_token_type, Token , TokenType , TokenValue } ,
12
+ util:: likely,
11
13
} ;
12
14
13
15
/// Fast mapping from ASCII to check if a character is valid for identifier
@@ -65,15 +67,27 @@ impl Lexer<'_> {
65
67
self . cursor . advance ( ) ;
66
68
67
69
// Read as many identifier continue chars as possible
68
- self . cursor . advance_while ( Self :: is_identifier_continue ) ;
70
+ self . cursor . advance_while ( Self :: is_ascii_id_continue ) ;
69
71
70
72
// Extract the identifier text
71
- let span = self . span ( ) ;
72
73
let ident_start = start_pos. 0 ;
73
74
let ident_end = self . cursor . position ( ) ;
74
75
let ident_bytes = unsafe { self . cursor . slice_unchecked ( ident_start, ident_end) } ;
75
- let ident_str = unsafe { std:: str:: from_utf8_unchecked ( ident_bytes) } ;
76
+ let non_unicode_ident_str = unsafe { std:: str:: from_utf8_unchecked ( ident_bytes) } ;
77
+
78
+ let ident_str = if let Some ( ch) = self . cursor . peek ( ) {
79
+ if ch == b'\\' {
80
+ & self . read_identifier_with_unicode_escape ( non_unicode_ident_str) ?
81
+ } else if !ch. is_ascii ( ) {
82
+ & self . read_identifier_with_utf8_charater ( non_unicode_ident_str) ?
83
+ } else {
84
+ non_unicode_ident_str
85
+ }
86
+ } else {
87
+ non_unicode_ident_str
88
+ } ;
76
89
let had_line_break_bool: bool = self . had_line_break . into ( ) ;
90
+ let span = self . span ( ) ;
77
91
78
92
// For non-keyword identifiers, we can directly return without checking keyword
79
93
// maps
@@ -94,20 +108,32 @@ impl Lexer<'_> {
94
108
self . cursor . advance ( ) ;
95
109
96
110
// Read as many identifier continue chars as possible
97
- self . cursor . advance_while ( Self :: is_identifier_continue ) ;
111
+ self . cursor . advance_while ( Self :: is_ascii_id_continue ) ;
98
112
99
113
// Extract the identifier text
100
- let span = self . span ( ) ;
101
114
let ident_start = start_pos. 0 ;
102
115
let ident_end = self . cursor . position ( ) ;
103
- let ident_bytes = unsafe { self . cursor . slice_unchecked ( ident_start, ident_end) } ;
104
- // SAFETY: We've verified the bytes are valid UTF-8
105
- let ident_str = unsafe { std:: str:: from_utf8_unchecked ( ident_bytes) } ;
106
116
let had_line_break_bool: bool = self . had_line_break . into ( ) ;
107
-
117
+ let non_unicode_ident_str = unsafe {
118
+ std:: str:: from_utf8_unchecked ( self . cursor . slice_unchecked ( ident_start, ident_end) )
119
+ } ;
120
+
121
+ let ident_str = if let Some ( ch) = self . cursor . peek ( ) {
122
+ if ch == b'\\' {
123
+ & self . read_identifier_with_unicode_escape ( non_unicode_ident_str) ?
124
+ } else if !ch. is_ascii ( ) {
125
+ & self . read_identifier_with_utf8_charater ( non_unicode_ident_str) ?
126
+ } else {
127
+ non_unicode_ident_str
128
+ }
129
+ } else {
130
+ non_unicode_ident_str
131
+ } ;
108
132
// Ultra-fast path for common 2-6 letter keywords using direct table lookup
109
- let len = ident_bytes. len ( ) ;
133
+ let ident_bytes = ident_str. as_bytes ( ) ;
134
+ let len = ident_str. len ( ) ;
110
135
136
+ let span = self . span ( ) ;
111
137
// Only process if first byte is an ASCII lowercase letter (all keywords start
112
138
// with a-z)
113
139
if len > 0 && ident_bytes[ 0 ] >= b'a' && ident_bytes[ 0 ] <= b'z' {
@@ -131,6 +157,46 @@ impl Lexer<'_> {
131
157
) )
132
158
}
133
159
160
+ fn read_identifier_with_unicode_escape ( & mut self , non_unicode : & str ) -> Result < String > {
161
+ let mut buffer = String :: from ( non_unicode) ;
162
+ self . identifier_with_unicode_escape_part ( & mut buffer) ?;
163
+
164
+ Ok ( buffer)
165
+ }
166
+
167
+ fn identifier_with_unicode_escape_part ( & mut self , buffer : & mut String ) -> Result < ( ) > {
168
+ while let Some ( ch) = self . cursor . peek_char ( ) {
169
+ if ch == '\\' && self . cursor . peek_at ( 1 ) == Some ( b'u' ) {
170
+ // Skip the "\\u"
171
+ self . cursor . advance_n ( 2 ) ;
172
+ let unicode_escape = self . read_unicode_escape ( ) ?;
173
+ buffer. push ( unicode_escape) ;
174
+ } else if Self :: is_identifier_continue ( ch) {
175
+ buffer. push ( ch) ;
176
+ self . cursor . advance_char ( ) ;
177
+ } else {
178
+ break ;
179
+ }
180
+ }
181
+ Ok ( ( ) )
182
+ }
183
+
184
+ fn read_identifier_with_utf8_charater ( & mut self , non_unicode : & str ) -> Result < String > {
185
+ let mut buffer = String :: from ( non_unicode) ;
186
+ while let Some ( ch) = self . cursor . peek_char ( ) {
187
+ if likely ( Self :: is_identifier_continue ( ch) ) {
188
+ buffer. push ( ch) ;
189
+ self . cursor . advance_char ( ) ;
190
+ } else if ch == '\\' {
191
+ self . identifier_with_unicode_escape_part ( & mut buffer) ?;
192
+ } else {
193
+ break ;
194
+ }
195
+ }
196
+
197
+ Ok ( buffer)
198
+ }
199
+
134
200
/// Super fast check for ASCII identifier start character
135
201
#[ inline( always) ]
136
202
pub ( crate ) fn is_ascii_id_start ( ch : u8 ) -> bool {
@@ -142,4 +208,26 @@ impl Lexer<'_> {
142
208
pub ( crate ) fn is_ascii_id_continue ( ch : u8 ) -> bool {
143
209
ch < 128 && unsafe { ( IDENT_CHAR . get_unchecked ( ch as usize ) & 2 ) != 0 }
144
210
}
211
+
212
+ /// Check if a byte is a valid identifier start character
213
+ #[ inline( always) ]
214
+ pub ( crate ) fn is_identifier_start ( ch : char ) -> bool {
215
+ // ASCII fast path using optimized identifier functions
216
+ if likely ( ch. is_ascii ( ) ) {
217
+ Self :: is_ascii_id_start ( ch as u8 )
218
+ } else {
219
+ is_id_start_unicode ( ch)
220
+ }
221
+ }
222
+
223
+ /// Check if a byte is a valid identifier continue character
224
+ #[ inline( always) ]
225
+ pub ( crate ) fn is_identifier_continue ( ch : char ) -> bool {
226
+ // ASCII fast path using optimized identifier functions
227
+ if likely ( ch. is_ascii ( ) ) {
228
+ Self :: is_ascii_id_continue ( ch as u8 )
229
+ } else {
230
+ is_id_continue_unicode ( ch)
231
+ }
232
+ }
145
233
}
0 commit comments