Skip to content

Commit

Permalink
Rewrite parser as part of new regex-syntax crate.
Browse files Browse the repository at this point in the history
This commit introduces a new `regex-syntax` crate that provides a
regular expression parser and an abstract syntax for regular
expressions. As part of this effort, the parser has been rewritten and
has grown a substantial number of tests.

The `regex` crate itself hasn't changed too much. I opted for the
smallest possible delta to get it working with the new regex AST.
In most cases, this simplified code because it no longer has to deal
with unwieldy flags. (Instead, flag information is baked into the AST.)

Here is a list of public-facing non-breaking changes:

* A new `regex-syntax` crate with a parser, regex AST and lots of tests.
  This closes #29 and fixes #84.
* A new flag, `x`, has been added. This allows one to write regexes with
  insignificant whitespace and comments.
* Repetition operators can now be directly applied to zero-width
  matches. For example, `\b+` was previously not allowed but now works.
  Note that one could always write `(\b)+` previously. This change
  is mostly about lifting an arbitrary restriction.

And a list of breaking changes:

* A new `Regex::with_size_limit` constructor function that allows one
  to tweak the limit on the size of a compiled regex. This fixes #67.
  The new method isn't a breaking change, but regexes that exceed the
  size limit (set to 10MB by default) will no longer compile. To fix,
  simply call `Regex::with_size_limit` with a bigger limit.
* Capture group names cannot start with a number. This is a breaking
  change because regexes that previously compiled (e.g., `(?P<1a>.)`)
  will now return an error. This fixes #69.
* The `regex::Error` type has been changed to reflect the better error
  reporting in the `regex-syntax` crate, and a new error for limiting
  regexes to a certain size. This is a breaking change. Most folks just
  call `unwrap()` on `Regex::new`, so I expect this to have minimal
  impact.

Closes #29, #67, #69, #79, #84.

[breaking-change]
  • Loading branch information
BurntSushi committed May 25, 2015
1 parent 3e26dc6 commit a66df89
Show file tree
Hide file tree
Showing 15 changed files with 4,268 additions and 1,503 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Expand Up @@ -2,4 +2,7 @@
/Cargo.lock
/regex_macros/target
/regex_macros/Cargo.lock
/regex_syntax/target
/regex_syntax/Cargo.lock
/bench-log
.*.swp
8 changes: 8 additions & 0 deletions Cargo.toml
Expand Up @@ -21,8 +21,16 @@ path = "regex_macros/benches/bench_dynamic.rs"
test = false
bench = true

[dependencies.regex-syntax]
path = "regex_syntax"
version = "*"

[dev-dependencies]
rand = "0.3"

[features]
pattern = []

[profile.bench]
opt-level = 3
lto = true
138 changes: 67 additions & 71 deletions regex_macros/src/lib.rs
Expand Up @@ -36,10 +36,7 @@ use rustc::plugin::Registry;

use regex::Regex;
use regex::native::{
OneChar, CharClass, Any, Save, Jump, Split,
Match, EmptyBegin, EmptyEnd, EmptyWordBoundary,
Program, Dynamic, ExDynamic, Native,
FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED,
Inst, Program, Dynamic, ExDynamic, Native,
simple_case_fold,
};

Expand Down Expand Up @@ -79,7 +76,9 @@ fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree])
// error is logged in 'parse' with cx.span_err
None => return DummyResult::any(sp),
};
let re = match Regex::new(&regex) {
// We use the largest possible size limit because this is happening at
// compile time. We trust the programmer.
let re = match Regex::with_size_limit(::std::usize::MAX, &regex) {
Ok(re) => re,
Err(err) => {
cx.span_err(sp, &err.to_string());
Expand Down Expand Up @@ -121,11 +120,10 @@ impl<'a> NfaGen<'a> {
None => cx.expr_none(self.sp),
}
);
let prefix_anchor =
match self.prog.insts[1] {
EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true,
_ => false,
};
let prefix_anchor = match self.prog.insts[1] {
Inst::StartText => true,
_ => false,
};
let init_groups = self.vec_expr(0..num_cap_locs,
&mut |cx, _| cx.expr_none(self.sp));

Expand Down Expand Up @@ -338,49 +336,55 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
let nextpc = pc + 1;
let body = match *inst {
EmptyBegin(flags) => {
let cond =
if flags & FLAG_MULTI > 0 {
quote_expr!(self.cx,
self.chars.is_begin()
|| self.chars.prev == Some('\n')
)
} else {
quote_expr!(self.cx, self.chars.is_begin())
};
Inst::StartLine => {
quote_expr!(self.cx, {
nlist.add_empty($pc);
if $cond { self.add(nlist, $nextpc, &mut *groups) }
if self.chars.is_begin() || self.chars.prev == Some('\n') {
self.add(nlist, $nextpc, &mut *groups)
}
})
}
EmptyEnd(flags) => {
let cond =
if flags & FLAG_MULTI > 0 {
quote_expr!(self.cx,
self.chars.is_end()
|| self.chars.cur == Some('\n')
)
} else {
quote_expr!(self.cx, self.chars.is_end())
};
Inst::StartText => {
quote_expr!(self.cx, {
nlist.add_empty($pc);
if $cond { self.add(nlist, $nextpc, &mut *groups) }
if self.chars.is_begin() {
self.add(nlist, $nextpc, &mut *groups)
}
})
}
EmptyWordBoundary(flags) => {
let cond =
if flags & FLAG_NEGATED > 0 {
quote_expr!(self.cx, !self.chars.is_word_boundary())
} else {
quote_expr!(self.cx, self.chars.is_word_boundary())
};
Inst::EndLine => {
quote_expr!(self.cx, {
nlist.add_empty($pc);
if self.chars.is_end() || self.chars.cur == Some('\n') {
self.add(nlist, $nextpc, &mut *groups)
}
})
}
Inst::EndText => {
quote_expr!(self.cx, {
nlist.add_empty($pc);
if self.chars.is_end() {
self.add(nlist, $nextpc, &mut *groups)
}
})
}
Inst::WordBoundary => {
quote_expr!(self.cx, {
nlist.add_empty($pc);
if $cond { self.add(nlist, $nextpc, &mut *groups) }
if self.chars.is_word_boundary() {
self.add(nlist, $nextpc, &mut *groups)
}
})
}
Inst::NotWordBoundary => {
quote_expr!(self.cx, {
nlist.add_empty($pc);
if !self.chars.is_word_boundary() {
self.add(nlist, $nextpc, &mut *groups)
}
})
}
Save(slot) => {
Inst::Save(slot) => {
let save = quote_expr!(self.cx, {
let old = groups[$slot];
groups[$slot] = Some(self.ic);
Expand Down Expand Up @@ -411,20 +415,20 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
})
}
}
Jump(to) => {
Inst::Jump(to) => {
quote_expr!(self.cx, {
nlist.add_empty($pc);
self.add(nlist, $to, &mut *groups);
})
}
Split(x, y) => {
Inst::Split(x, y) => {
quote_expr!(self.cx, {
nlist.add_empty($pc);
self.add(nlist, $x, &mut *groups);
self.add(nlist, $y, &mut *groups);
})
}
// For Match, OneChar, CharClass, Any
// For Match, OneChar, CharClass, Any, AnyNoNL
_ => quote_expr!(self.cx, nlist.add($pc, &*groups)),
};
self.arm_inst(pc, body)
Expand All @@ -439,7 +443,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
let nextpc = pc + 1;
let body = match *inst {
Match => {
Inst::Match => {
quote_expr!(self.cx, {
match self.which {
Exists => {
Expand All @@ -459,8 +463,8 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
}
})
}
OneChar(c, flags) => {
if flags & FLAG_NOCASE > 0 {
Inst::OneChar { c, casei } => {
if casei {
let upc = simple_case_fold(c);
quote_expr!(self.cx, {
let upc = self.chars.prev.map(simple_case_fold);
Expand All @@ -476,45 +480,37 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
})
}
}
CharClass(ref ranges, flags) => {
let negate = flags & FLAG_NEGATED > 0;
let casei = flags & FLAG_NOCASE > 0;
Inst::CharClass(ref cls) => {
let ranges: Vec<(char, char)> =
cls.iter().map(|r| (r.start, r.end)).collect();
let mranges = self.match_class(&ranges);
let get_char =
if casei {
if cls.is_case_insensitive() {
quote_expr!(
self.cx,
simple_case_fold(self.chars.prev.unwrap()))
} else {
quote_expr!(self.cx, self.chars.prev.unwrap())
};
let negcond =
if negate {
quote_expr!(self.cx, !found)
} else {
quote_expr!(self.cx, found)
};
let mranges = self.match_class(&ranges);
quote_expr!(self.cx, {
if self.chars.prev.is_some() {
let c = $get_char;
let found = $mranges;
if $negcond {
if $mranges {
self.add(nlist, $nextpc, caps);
}
}
})
}
Any(flags) => {
if flags & FLAG_DOTNL > 0 {
quote_expr!(self.cx, self.add(nlist, $nextpc, caps))
} else {
quote_expr!(self.cx, {
if self.chars.prev != Some('\n') {
self.add(nlist, $nextpc, caps)
}
()
})
}
Inst::Any => {
quote_expr!(self.cx, self.add(nlist, $nextpc, caps))
}
Inst::AnyNoNL => {
quote_expr!(self.cx, {
if self.chars.prev != Some('\n') {
self.add(nlist, $nextpc, caps);
}
()
})
}
// EmptyBegin, EmptyEnd, EmptyWordBoundary, Save, Jump, Split
_ => self.empty_block(),
Expand Down
6 changes: 3 additions & 3 deletions regex_macros/tests/tests.rs
Expand Up @@ -203,6 +203,8 @@ replace!(rep_named, replace_all,
"w1 w2 w3 w4", "$last $first$space", "w2 w1 w4 w3");
replace!(rep_trim, replace_all, "^[ \t]+|[ \t]+$", " \t trim me\t \t",
"", "trim me");
replace!(rep_number_hypen, replace, r"(.)(.)", "ab", "$1-$2", "a-b");
replace!(rep_number_underscore, replace, r"(.)(.)", "ab", "$1_$2", "a_b");

macro_rules! noparse(
($name:ident, $re:expr) => (
Expand All @@ -219,7 +221,6 @@ macro_rules! noparse(

noparse!(fail_double_repeat, "a**");
noparse!(fail_no_repeat_arg, "*");
noparse!(fail_no_repeat_arg_begin, "^*");
noparse!(fail_incomplete_escape, "\\");
noparse!(fail_class_incomplete, "[A-");
noparse!(fail_class_not_closed, "[A");
Expand All @@ -235,8 +236,7 @@ noparse!(fail_bad_capture_name, "(?P<na-me>)");
noparse!(fail_bad_flag, "(?a)a");
noparse!(fail_empty_alt_before, "|a");
noparse!(fail_empty_alt_after, "a|");
noparse!(fail_counted_big_exact, "a{1001}");
noparse!(fail_counted_big_min, "a{1001,}");
noparse!(fail_too_big, "a{10000000}");
noparse!(fail_counted_no_close, "a{1001");
noparse!(fail_unfinished_cap, "(?");
noparse!(fail_unfinished_escape, "\\");
Expand Down
13 changes: 13 additions & 0 deletions regex_syntax/Cargo.toml
@@ -0,0 +1,13 @@
[package]
name = "regex-syntax"
version = "0.1.0"
authors = ["The Rust Project Developers"]
license = "MIT/Apache-2.0"
repository = "https://github.com/rust-lang/regex"
documentation = "http://doc.rust-lang.org/regex"
homepage = "https://github.com/rust-lang/regex"
description = "A regular expression parser (RE2 only)."

[dev-dependencies]
quickcheck = "*"
rand = "*"

0 comments on commit a66df89

Please sign in to comment.