Skip to content

Commit db751f0

Browse files
committedSep 28, 2024·
refactor(linter): use regexp AST visitor in no-control-regex (#6129)
- closes #5416 Rewrites the `no-control-regex` rule to use a regular expression AST visitor instead of the `regex` crate and parsing by hand. This change simplifies the code and makes it easier to maintain. One notable change in the snapshots is the printing of the control characters. Previously, we always printed from the source text. Now, we print a representation of the control character itself based on its numeric value. This resulted in the nonprintable chars being printed, which are invisible. The other reason for this change is that the spans output by the regex parser for unicode escapes do not match 1:1 when raw strings and escapes are involved. This resulted in goofy looking spans in the output: ``` ⚠ eslint(no-control-regex): Unexpected control character: '*\\x' ╭─[no_control_regex.tsx:1:22] 1 │ new RegExp('\\u{1111}*\\x1F', 'u') · ──── ╰──── ``` Not sure where the bug lies there yet.
1 parent 3aa7e42 commit db751f0

File tree

2 files changed

+100
-169
lines changed

2 files changed

+100
-169
lines changed
 

‎crates/oxc_linter/src/rules/eslint/no_control_regex.rs

+100-169
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
1-
use lazy_static::lazy_static;
1+
use oxc_allocator::Allocator;
22
use oxc_ast::{
3-
ast::{Argument, RegExpFlags, RegExpPattern},
3+
ast::{Argument, RegExpFlags},
44
AstKind,
55
};
66
use oxc_diagnostics::OxcDiagnostic;
77
use oxc_macros::declare_oxc_lint;
8+
use oxc_regular_expression::{
9+
ast::{Character, Pattern},
10+
visit::Visit,
11+
Parser, ParserOptions,
12+
};
813
use oxc_span::{GetSpan, Span};
9-
use regex::{Matches, Regex};
1014

1115
use crate::{ast_util::extract_regex_flags, context::LintContext, rule::Rule, AstNode};
1216

@@ -64,196 +68,123 @@ declare_oxc_lint!(
6468

6569
impl Rule for NoControlRegex {
6670
fn run<'a>(&self, node: &AstNode<'a>, context: &LintContext<'a>) {
67-
if let Some(RegexPatternData { pattern, flags, span }) = regex_pattern(node) {
68-
let mut violations: Vec<&str> = Vec::new();
69-
let pattern = pattern.as_ref();
70-
let pattern_text = pattern.source_text(context.source_text());
71-
for matched_ctl_pattern in control_patterns(pattern_text.as_ref()) {
72-
let ctl = matched_ctl_pattern.as_str();
71+
match node.kind() {
72+
// regex literal
73+
AstKind::RegExpLiteral(reg) => {
74+
let Some(pattern) = reg.regex.pattern.as_pattern() else {
75+
return;
76+
};
7377

74-
// check for an even number of backslashes, since these will
75-
// prevent the pattern from being a control sequence
76-
if ctl.starts_with('\\') && matched_ctl_pattern.start() > 0 {
77-
let pattern_chars: Vec<char> = pattern_text.chars().collect(); // ew
78+
check_pattern(context, pattern, reg.span);
79+
}
7880

79-
// Convert byte index to char index
80-
let byte_start = matched_ctl_pattern.start();
81-
let char_start = pattern_text[..byte_start].chars().count();
81+
// new RegExp()
82+
AstKind::NewExpression(expr) => {
83+
// constructor is RegExp,
84+
if expr.callee.is_specific_id("RegExp")
85+
// which is provided at least 1 parameter,
86+
&& expr.arguments.len() > 0
87+
{
88+
// where the first one is a string literal
89+
// note: improvements required for strings used via identifier
90+
// references
91+
if let Argument::StringLiteral(pattern) = &expr.arguments[0] {
92+
// get pattern from arguments. Missing or non-string arguments
93+
// will be runtime errors, but are not covered by this rule.
94+
let alloc = Allocator::default();
95+
let pattern_with_slashes = format!("/{}/", &pattern.value);
96+
let flags = extract_regex_flags(&expr.arguments);
97+
let parser = Parser::new(
98+
&alloc,
99+
pattern_with_slashes.as_str(),
100+
ParserOptions {
101+
span_offset: expr
102+
.arguments
103+
.first()
104+
.map_or(0, |arg| arg.span().start),
105+
unicode_mode: flags
106+
.is_some_and(|flags| flags.contains(RegExpFlags::U)),
107+
unicode_sets_mode: flags
108+
.is_some_and(|flags| flags.contains(RegExpFlags::V)),
109+
},
110+
);
82111

83-
let mut first_backslash = char_start;
84-
while first_backslash > 0 && pattern_chars[first_backslash] == '\\' {
85-
first_backslash -= 1;
86-
}
112+
let Some(pattern) = parser.parse().ok().map(|pattern| pattern.pattern)
113+
else {
114+
return;
115+
};
87116

88-
let mut num_slashes = char_start - first_backslash;
89-
if first_backslash == 0 && pattern_chars[first_backslash] == '\\' {
90-
num_slashes += 1;
91-
}
92-
// even # of slashes
93-
if num_slashes % 2 == 0 {
94-
continue;
117+
check_pattern(context, &pattern, expr.span);
95118
}
96119
}
120+
}
97121

98-
if ctl.starts_with(r"\x") || ctl.starts_with(r"\u") {
99-
let mut numeric_part = &ctl[2..];
122+
// RegExp()
123+
AstKind::CallExpression(expr) => {
124+
// constructor is RegExp,
125+
if expr.callee.is_specific_id("RegExp")
126+
// which is provided at least 1 parameter,
127+
&& expr.arguments.len() > 0
128+
{
129+
// where the first one is a string literal
130+
// note: improvements required for strings used via identifier
131+
// references
132+
if let Argument::StringLiteral(pattern) = &expr.arguments[0] {
133+
// get pattern from arguments. Missing or non-string arguments
134+
// will be runtime errors, but are not covered by this rule.
135+
let alloc = Allocator::default();
136+
let pattern_with_slashes = format!("/{}/", &pattern.value);
137+
let flags = extract_regex_flags(&expr.arguments);
138+
let parser = Parser::new(
139+
&alloc,
140+
pattern_with_slashes.as_str(),
141+
ParserOptions {
142+
span_offset: expr
143+
.arguments
144+
.first()
145+
.map_or(0, |arg| arg.span().start),
146+
unicode_mode: flags
147+
.is_some_and(|flags| flags.contains(RegExpFlags::U)),
148+
unicode_sets_mode: flags
149+
.is_some_and(|flags| flags.contains(RegExpFlags::V)),
150+
},
151+
);
100152

101-
// extract numeric part from \u{00}
102-
if numeric_part.starts_with('{') {
103-
let has_unicode_flag = match flags {
104-
Some(flags) if flags.contains(RegExpFlags::U) => true,
105-
_ => {
106-
continue;
107-
}
153+
let Some(pattern) = parser.parse().ok().map(|pattern| pattern.pattern)
154+
else {
155+
return;
108156
};
109157

110-
// 1. Unicode control pattern is missing a curly brace
111-
// and is therefore invalid. (note: we may want to
112-
// report this?)
113-
// 2. Unicode flag is missing, which is needed for
114-
// interpreting \u{`nn`} as a unicode character
115-
if !has_unicode_flag || !numeric_part.ends_with('}') {
116-
continue;
117-
}
118-
119-
numeric_part = &numeric_part[1..numeric_part.len() - 1];
120-
}
121-
122-
match u32::from_str_radix(numeric_part, 16) {
123-
Err(_) => continue,
124-
Ok(as_num) if as_num > 0x1f => continue,
125-
Ok(_) => { /* noop */ }
158+
check_pattern(context, &pattern, expr.span);
126159
}
127160
}
128-
129-
violations.push(ctl);
130161
}
131-
132-
if !violations.is_empty() {
133-
let violations = violations.join(", ");
134-
context.diagnostic(no_control_regex_diagnostic(&violations, span));
135-
}
136-
}
162+
_ => {}
163+
};
137164
}
138165
}
139166

140-
/// Since we don't implement `ToOwned` trait.
141-
enum PatRef<'a> {
142-
Owned(RegExpPattern<'a>),
143-
Borrowed(&'a RegExpPattern<'a>),
144-
}
167+
fn check_pattern(context: &LintContext, pattern: &Pattern, span: Span) {
168+
let mut finder = ControlCharacterFinder { control_chars: Vec::new() };
169+
finder.visit_pattern(pattern);
145170

146-
impl<'a> AsRef<RegExpPattern<'a>> for PatRef<'a> {
147-
fn as_ref(&self) -> &RegExpPattern<'a> {
148-
match self {
149-
Self::Owned(pat) => pat,
150-
Self::Borrowed(pat) => pat,
151-
}
171+
if !finder.control_chars.is_empty() {
172+
let violations = finder.control_chars.join(", ");
173+
context.diagnostic(no_control_regex_diagnostic(&violations, span));
152174
}
153175
}
154176

155-
struct RegexPatternData<'a> {
156-
/// A regex pattern, either from a literal (`/foo/`) a RegExp constructor
157-
/// (`new RegExp("foo")`), or a RegExp function call (`RegExp("foo"))
158-
pattern: PatRef<'a>,
159-
/// Regex flags, if found. It's possible for this to be `Some` but have
160-
/// no flags.
161-
///
162-
/// Note that flags are represented by a `u8` and therefore safely clonable
163-
/// with low performance overhead.
164-
flags: Option<RegExpFlags>,
165-
/// The pattern's span. For [`oxc_ast::ast::Expression::NewExpression`]s
166-
/// and [`oxc_ast::ast::Expression::CallExpression`]s,
167-
/// this will match the entire new/call expression.
168-
///
169-
/// Note that spans are 8 bytes and safely clonable with low performance overhead
170-
span: Span,
177+
struct ControlCharacterFinder {
178+
control_chars: Vec<String>,
171179
}
172180

173-
/// Returns the regex pattern inside a node, if it's applicable.
174-
///
175-
/// e.g.:
176-
/// * /foo/ -> "foo"
177-
/// * new RegExp("foo") -> foo
178-
///
179-
/// note: [`RegExpFlags`] and [`Span`]s are both tiny and cloneable.
180-
fn regex_pattern<'a>(node: &AstNode<'a>) -> Option<RegexPatternData<'a>> {
181-
let kind = node.kind();
182-
match kind {
183-
// regex literal
184-
AstKind::RegExpLiteral(reg) => Some(RegexPatternData {
185-
pattern: PatRef::Borrowed(&reg.regex.pattern),
186-
flags: Some(reg.regex.flags),
187-
span: reg.span,
188-
}),
189-
190-
// new RegExp()
191-
AstKind::NewExpression(expr) => {
192-
// constructor is RegExp,
193-
if expr.callee.is_specific_id("RegExp")
194-
// which is provided at least 1 parameter,
195-
&& expr.arguments.len() > 0
196-
{
197-
// where the first one is a string literal
198-
// note: improvements required for strings used via identifier
199-
// references
200-
if let Argument::StringLiteral(pattern) = &expr.arguments[0] {
201-
// get pattern from arguments. Missing or non-string arguments
202-
// will be runtime errors, but are not covered by this rule.
203-
// Note that we're intentionally reporting the entire "new
204-
// RegExp("pat") expression, not just "pat".
205-
Some(RegexPatternData {
206-
pattern: PatRef::Owned(RegExpPattern::Raw(pattern.value.as_ref())),
207-
flags: extract_regex_flags(&expr.arguments),
208-
span: kind.span(),
209-
})
210-
} else {
211-
None
212-
}
213-
} else {
214-
None
215-
}
216-
}
217-
218-
// RegExp()
219-
AstKind::CallExpression(expr) => {
220-
// constructor is RegExp,
221-
if expr.callee.is_specific_id("RegExp")
222-
// which is provided at least 1 parameter,
223-
&& expr.arguments.len() > 0
224-
{
225-
// where the first one is a string literal
226-
// note: improvements required for strings used via identifier
227-
// references
228-
if let Argument::StringLiteral(pattern) = &expr.arguments[0] {
229-
// get pattern from arguments. Missing or non-string arguments
230-
// will be runtime errors, but are not covered by this rule.
231-
// Note that we're intentionally reporting the entire "new
232-
// RegExp("pat") expression, not just "pat".
233-
Some(RegexPatternData {
234-
pattern: PatRef::Owned(RegExpPattern::Raw(pattern.value.as_ref())),
235-
flags: extract_regex_flags(&expr.arguments),
236-
span: kind.span(),
237-
})
238-
} else {
239-
None
240-
}
241-
} else {
242-
None
243-
}
181+
impl<'a> Visit<'a> for ControlCharacterFinder {
182+
fn visit_character(&mut self, ch: &Character) {
183+
// Control characters are in the range 0x00 to 0x1F
184+
if ch.value <= 0x1F {
185+
self.control_chars.push(ch.to_string());
244186
}
245-
_ => None,
246-
}
247-
}
248-
249-
fn control_patterns(pattern: &str) -> Matches<'static, '_> {
250-
lazy_static! {
251-
static ref CTL_PAT: Regex = Regex::new(
252-
r"([\x00-\x1f]|(?:\\x\w{2})|(?:\\u\w{4})|(?:\\u\{\w{1,4}\}))"
253-
// r"((?:\\x\w{2}))"
254-
).unwrap();
255187
}
256-
CTL_PAT.find_iter(pattern)
257188
}
258189

259190
#[cfg(test)]
Binary file not shown.

0 commit comments

Comments
 (0)
Please sign in to comment.