Rewrite parser as part of new regex-syntax crate.

This commit introduces a new `regex-syntax` crate that provides a regular expression parser and an abstract syntax for regular expressions. As part of this effort, the parser has been rewritten and has grown a substantial number of tests.

The `regex` crate itself hasn't changed too much. I opted for the smallest possible delta to get it working with the new regex AST. In most cases, this simplified code because it no longer has to deal with unwieldy flags. (Instead, flag information is baked into the AST.)

Here is a list of public-facing non-breaking changes:

* A new `regex-syntax` crate with a parser, regex AST and lots of tests. This closes #29 and fixes #84.
* A new flag, `x`, has been added. This allows one to write regexes with insignificant whitespace and comments.
* Repetition operators can now be directly applied to zero-width matches. e.g., `\b+` was previously not allowed but now works. Note that one could always write `(\b)+` previously. This change is mostly about lifting an arbitrary restriction.

And a list of breaking changes:

* A new `Regex::with_size_limit` constructor function, that allows one to tweak the limit on the size of a compiled regex. This fixes #67. The new method isn't a breaking change, but regexes that exceed the size limit (set to 10MB by default) will no longer compile. To fix, simply call `Regex::with_size_limit` with a bigger limit.
* Capture group names cannot start with a number. This is a breaking change because regexes that previously compiled (e.g., `(?P<1a>.)`) will now return an error. This fixes #69.
* The `regex::Error` type has been changed to reflect the better error reporting in the `regex-syntax` crate, and a new error for limiting regexes to a certain size. This is a breaking change. Most folks just call `unwrap()` on `Regex::new`, so I expect this to have minimal impact.

Closes #29, #67, #69, #79, #84.

[breaking-change]
rust-lang · May 25, 2015 · a66df89 · a66df89
1 parent 3e26dc6
commit a66df89
Show file tree

Hide file tree

Showing 15 changed files with 4,268 additions and 1,503 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,7 @@
 /Cargo.lock
 /regex_macros/target
 /regex_macros/Cargo.lock
+/regex_syntax/target
+/regex_syntax/Cargo.lock
+/bench-log
 .*.swp
diff --git a/Cargo.toml b/Cargo.toml
@@ -21,8 +21,16 @@ path = "regex_macros/benches/bench_dynamic.rs"
 test = false
 bench = true
 
+[dependencies.regex-syntax]
+path = "regex_syntax"
+version = "*"
+
 [dev-dependencies]
 rand = "0.3"
 
 [features]
 pattern = []
+
+[profile.bench]
+opt-level = 3
+lto = true
diff --git a/regex_macros/src/lib.rs b/regex_macros/src/lib.rs
@@ -36,10 +36,7 @@ use rustc::plugin::Registry;
 
 use regex::Regex;
 use regex::native::{
-    OneChar, CharClass, Any, Save, Jump, Split,
-    Match, EmptyBegin, EmptyEnd, EmptyWordBoundary,
-    Program, Dynamic, ExDynamic, Native,
-    FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED,
+    Inst, Program, Dynamic, ExDynamic, Native,
     simple_case_fold,
 };
 
@@ -79,7 +76,9 @@ fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree])
         // error is logged in 'parse' with cx.span_err
         None => return DummyResult::any(sp),
     };
-    let re = match Regex::new(&regex) {
+    // We use the largest possible size limit because this is happening at
+    // compile time. We trust the programmer.
+    let re = match Regex::with_size_limit(::std::usize::MAX, &regex) {
         Ok(re) => re,
         Err(err) => {
             cx.span_err(sp, &err.to_string());
@@ -121,11 +120,10 @@ impl<'a> NfaGen<'a> {
                 None => cx.expr_none(self.sp),
             }
         );
-        let prefix_anchor =
-            match self.prog.insts[1] {
-                EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true,
-                _ => false,
-            };
+        let prefix_anchor = match self.prog.insts[1] {
+            Inst::StartText => true,
+            _ => false,
+        };
         let init_groups = self.vec_expr(0..num_cap_locs,
                                         &mut |cx, _| cx.expr_none(self.sp));
 
@@ -338,49 +336,55 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
         let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
             let nextpc = pc + 1;
             let body = match *inst {
-                EmptyBegin(flags) => {
-                    let cond =
-                        if flags & FLAG_MULTI > 0 {
-                            quote_expr!(self.cx,
-                                self.chars.is_begin()
-                                || self.chars.prev == Some('\n')
-                            )
-                        } else {
-                            quote_expr!(self.cx, self.chars.is_begin())
-                        };
+                Inst::StartLine => {
                     quote_expr!(self.cx, {
                         nlist.add_empty($pc);
-                        if $cond { self.add(nlist, $nextpc, &mut *groups) }
+                        if self.chars.is_begin() || self.chars.prev == Some('\n') {
+                            self.add(nlist, $nextpc, &mut *groups)
+                        }
                     })
                 }
-                EmptyEnd(flags) => {
-                    let cond =
-                        if flags & FLAG_MULTI > 0 {
-                            quote_expr!(self.cx,
-                                self.chars.is_end()
-                                || self.chars.cur == Some('\n')
-                            )
-                        } else {
-                            quote_expr!(self.cx, self.chars.is_end())
-                        };
+                Inst::StartText => {
                     quote_expr!(self.cx, {
                         nlist.add_empty($pc);
-                        if $cond { self.add(nlist, $nextpc, &mut *groups) }
+                        if self.chars.is_begin() {
+                            self.add(nlist, $nextpc, &mut *groups)
+                        }
                     })
                 }
-                EmptyWordBoundary(flags) => {
-                    let cond =
-                        if flags & FLAG_NEGATED > 0 {
-                            quote_expr!(self.cx, !self.chars.is_word_boundary())
-                        } else {
-                            quote_expr!(self.cx, self.chars.is_word_boundary())
-                        };
+                Inst::EndLine => {
+                    quote_expr!(self.cx, {
+                        nlist.add_empty($pc);
+                        if self.chars.is_end() || self.chars.cur == Some('\n') {
+                            self.add(nlist, $nextpc, &mut *groups)
+                        }
+                    })
+                }
+                Inst::EndText => {
+                    quote_expr!(self.cx, {
+                        nlist.add_empty($pc);
+                        if self.chars.is_end() {
+                            self.add(nlist, $nextpc, &mut *groups)
+                        }
+                    })
+                }
+                Inst::WordBoundary => {
                     quote_expr!(self.cx, {
                         nlist.add_empty($pc);
-                        if $cond { self.add(nlist, $nextpc, &mut *groups) }
+                        if self.chars.is_word_boundary() {
+                            self.add(nlist, $nextpc, &mut *groups)
+                        }
+                    })
+                }
+                Inst::NotWordBoundary => {
+                    quote_expr!(self.cx, {
+                        nlist.add_empty($pc);
+                        if !self.chars.is_word_boundary() {
+                            self.add(nlist, $nextpc, &mut *groups)
+                        }
                     })
                 }
-                Save(slot) => {
+                Inst::Save(slot) => {
                     let save = quote_expr!(self.cx, {
                         let old = groups[$slot];
                         groups[$slot] = Some(self.ic);
@@ -411,20 +415,20 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
                         })
                     }
                 }
-                Jump(to) => {
+                Inst::Jump(to) => {
                     quote_expr!(self.cx, {
                         nlist.add_empty($pc);
                         self.add(nlist, $to, &mut *groups);
                     })
                 }
-                Split(x, y) => {
+                Inst::Split(x, y) => {
                     quote_expr!(self.cx, {
                         nlist.add_empty($pc);
                         self.add(nlist, $x, &mut *groups);
                         self.add(nlist, $y, &mut *groups);
                     })
                 }
-                // For Match, OneChar, CharClass, Any
+                // For Match, OneChar, CharClass, Any, AnyNoNL
                 _ => quote_expr!(self.cx, nlist.add($pc, &*groups)),
             };
             self.arm_inst(pc, body)
@@ -439,7 +443,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
         let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
             let nextpc = pc + 1;
             let body = match *inst {
-                Match => {
+                Inst::Match => {
                     quote_expr!(self.cx, {
                         match self.which {
                             Exists => {
@@ -459,8 +463,8 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
                         }
                     })
                 }
-                OneChar(c, flags) => {
-                    if flags & FLAG_NOCASE > 0 {
+                Inst::OneChar { c, casei } => {
+                    if casei {
                         let upc = simple_case_fold(c);
                         quote_expr!(self.cx, {
                             let upc = self.chars.prev.map(simple_case_fold);
@@ -476,45 +480,37 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
                         })
                     }
                 }
-                CharClass(ref ranges, flags) => {
-                    let negate = flags & FLAG_NEGATED > 0;
-                    let casei = flags & FLAG_NOCASE > 0;
+                Inst::CharClass(ref cls) => {
+                    let ranges: Vec<(char, char)> =
+                        cls.iter().map(|r| (r.start, r.end)).collect();
+                    let mranges = self.match_class(&ranges);
                     let get_char =
-                        if casei {
+                        if cls.is_case_insensitive() {
                             quote_expr!(
                                 self.cx,
                                 simple_case_fold(self.chars.prev.unwrap()))
                         } else {
                             quote_expr!(self.cx, self.chars.prev.unwrap())
                         };
-                    let negcond =
-                        if negate {
-                            quote_expr!(self.cx, !found)
-                        } else {
-                            quote_expr!(self.cx, found)
-                        };
-                    let mranges = self.match_class(&ranges);
                     quote_expr!(self.cx, {
                         if self.chars.prev.is_some() {
                             let c = $get_char;
-                            let found = $mranges;
-                            if $negcond {
+                            if $mranges {
                                 self.add(nlist, $nextpc, caps);
                             }
                         }
                     })
                 }
-                Any(flags) => {
-                    if flags & FLAG_DOTNL > 0 {
-                        quote_expr!(self.cx, self.add(nlist, $nextpc, caps))
-                    } else {
-                        quote_expr!(self.cx, {
-                            if self.chars.prev != Some('\n') {
-                                self.add(nlist, $nextpc, caps)
-                            }
-                            ()
-                        })
-                    }
+                Inst::Any => {
+                    quote_expr!(self.cx, self.add(nlist, $nextpc, caps))
+                }
+                Inst::AnyNoNL => {
+                    quote_expr!(self.cx, {
+                        if self.chars.prev != Some('\n') {
+                            self.add(nlist, $nextpc, caps);
+                        }
+                        ()
+                    })
                 }
                 // EmptyBegin, EmptyEnd, EmptyWordBoundary, Save, Jump, Split
                 _ => self.empty_block(),

diff --git a/regex_macros/tests/tests.rs b/regex_macros/tests/tests.rs
@@ -203,6 +203,8 @@ replace!(rep_named, replace_all,
          "w1 w2 w3 w4", "$last $first$space", "w2 w1 w4 w3");
 replace!(rep_trim, replace_all, "^[ \t]+|[ \t]+$", " \t  trim me\t   \t",
          "", "trim me");
+replace!(rep_number_hypen, replace, r"(.)(.)", "ab", "$1-$2", "a-b");
+replace!(rep_number_underscore, replace, r"(.)(.)", "ab", "$1_$2", "a_b");
 
 macro_rules! noparse(
     ($name:ident, $re:expr) => (
@@ -219,7 +221,6 @@ macro_rules! noparse(
 
 noparse!(fail_double_repeat, "a**");
 noparse!(fail_no_repeat_arg, "*");
-noparse!(fail_no_repeat_arg_begin, "^*");
 noparse!(fail_incomplete_escape, "\\");
 noparse!(fail_class_incomplete, "[A-");
 noparse!(fail_class_not_closed, "[A");
@@ -235,8 +236,7 @@ noparse!(fail_bad_capture_name, "(?P<na-me>)");
 noparse!(fail_bad_flag, "(?a)a");
 noparse!(fail_empty_alt_before, "|a");
 noparse!(fail_empty_alt_after, "a|");
-noparse!(fail_counted_big_exact, "a{1001}");
-noparse!(fail_counted_big_min, "a{1001,}");
+noparse!(fail_too_big, "a{10000000}");
 noparse!(fail_counted_no_close, "a{1001");
 noparse!(fail_unfinished_cap, "(?");
 noparse!(fail_unfinished_escape, "\\");

diff --git a/regex_syntax/Cargo.toml b/regex_syntax/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "regex-syntax"
+version = "0.1.0"
+authors = ["The Rust Project Developers"]
+license = "MIT/Apache-2.0"
+repository = "https://github.com/rust-lang/regex"
+documentation = "http://doc.rust-lang.org/regex"
+homepage = "https://github.com/rust-lang/regex"
+description = "A regular expression parser (RE2 only)."
+
+[dev-dependencies]
+quickcheck = "*"
+rand = "*"