From a66df890f26561e38992a982cef3c2944e8e2c28 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 25 May 2015 12:49:14 -0400 Subject: [PATCH] Rewrite parser as part of new regex-syntax crate. This commit introduces a new `regex-syntax` crate that provides a regular expression parser and an abstract syntax for regular expressions. As part of this effort, the parser has been rewritten and has grown a substantial number of tests. The `regex` crate itself hasn't changed too much. I opted for the smallest possible delta to get it working with the new regex AST. In most cases, this simplified code because it no longer has to deal with unwieldy flags. (Instead, flag information is baked into the AST.) Here is a list of public facing non-breaking changes: * A new `regex-syntax` crate with a parser, regex AST and lots of tests. This closes #29 and fixes #84. * A new flag, `x`, has been added. This allows one to write regexes with insignificant whitespace and comments. * Repetition operators can now be directly applied to zero-width matches. e.g., `\b+` was previously not allowed but now works. Note that one could always write `(\b)+` previously. This change is mostly about lifting an arbitrary restriction. And a list of breaking changes: * A new `Regex::with_size_limit` constructor function, that allows one to tweak the limit on the size of a compiled regex. This fixes #67. The new method isn't a breaking change, but regexes that exceed the size limit (set to 10MB by default) will no longer compile. To fix, simply call `Regex::with_size_limit` with a bigger limit. * Capture group names cannot start with a number. This is a breaking change because regexes that previously compiled (e.g., `(?P<1a>.)`) will now return an error. This fixes #69. * The `regex::Error` type has been changed to reflect the better error reporting in the `regex-syntax` crate, and a new error for limiting regexes to a certain size. This is a breaking change. 
Most folks just call `unwrap()` on `Regex::new`, so I expect this to have minimal impact. Closes #29, #67, #69, #79, #84. [breaking-change] --- .gitignore | 3 + Cargo.toml | 8 + regex_macros/src/lib.rs | 138 +- regex_macros/tests/tests.rs | 6 +- regex_syntax/Cargo.toml | 13 + regex_syntax/src/lib.rs | 1162 +++++++++++++ regex_syntax/src/parser.rs | 2298 ++++++++++++++++++++++++++ regex_syntax/src/properties.rs | 407 +++++ {src => regex_syntax/src}/unicode.rs | 0 scripts/unicode.py | 2 +- src/compile.rs | 185 ++- src/lib.rs | 132 +- src/parse.rs | 1160 ------------- src/re.rs | 99 +- src/vm.rs | 158 +- 15 files changed, 4268 insertions(+), 1503 deletions(-) create mode 100644 regex_syntax/Cargo.toml create mode 100644 regex_syntax/src/lib.rs create mode 100644 regex_syntax/src/parser.rs create mode 100644 regex_syntax/src/properties.rs rename {src => regex_syntax/src}/unicode.rs (100%) delete mode 100644 src/parse.rs diff --git a/.gitignore b/.gitignore index fd3afa8a97..ece777a348 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,7 @@ /Cargo.lock /regex_macros/target /regex_macros/Cargo.lock +/regex_syntax/target +/regex_syntax/Cargo.lock +/bench-log .*.swp diff --git a/Cargo.toml b/Cargo.toml index 33bf5ead6c..734c4da39b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,8 +21,16 @@ path = "regex_macros/benches/bench_dynamic.rs" test = false bench = true +[dependencies.regex-syntax] +path = "regex_syntax" +version = "*" + [dev-dependencies] rand = "0.3" [features] pattern = [] + +[profile.bench] +opt-level = 3 +lto = true diff --git a/regex_macros/src/lib.rs b/regex_macros/src/lib.rs index 73e9705801..cd6c8d232d 100644 --- a/regex_macros/src/lib.rs +++ b/regex_macros/src/lib.rs @@ -36,10 +36,7 @@ use rustc::plugin::Registry; use regex::Regex; use regex::native::{ - OneChar, CharClass, Any, Save, Jump, Split, - Match, EmptyBegin, EmptyEnd, EmptyWordBoundary, - Program, Dynamic, ExDynamic, Native, - FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED, + Inst, Program, 
Dynamic, ExDynamic, Native, simple_case_fold, }; @@ -79,7 +76,9 @@ fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree]) // error is logged in 'parse' with cx.span_err None => return DummyResult::any(sp), }; - let re = match Regex::new(®ex) { + // We use the largest possible size limit because this is happening at + // compile time. We trust the programmer. + let re = match Regex::with_size_limit(::std::usize::MAX, ®ex) { Ok(re) => re, Err(err) => { cx.span_err(sp, &err.to_string()); @@ -121,11 +120,10 @@ impl<'a> NfaGen<'a> { None => cx.expr_none(self.sp), } ); - let prefix_anchor = - match self.prog.insts[1] { - EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true, - _ => false, - }; + let prefix_anchor = match self.prog.insts[1] { + Inst::StartText => true, + _ => false, + }; let init_groups = self.vec_expr(0..num_cap_locs, &mut |cx, _| cx.expr_none(self.sp)); @@ -338,49 +336,55 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| { let nextpc = pc + 1; let body = match *inst { - EmptyBegin(flags) => { - let cond = - if flags & FLAG_MULTI > 0 { - quote_expr!(self.cx, - self.chars.is_begin() - || self.chars.prev == Some('\n') - ) - } else { - quote_expr!(self.cx, self.chars.is_begin()) - }; + Inst::StartLine => { quote_expr!(self.cx, { nlist.add_empty($pc); - if $cond { self.add(nlist, $nextpc, &mut *groups) } + if self.chars.is_begin() || self.chars.prev == Some('\n') { + self.add(nlist, $nextpc, &mut *groups) + } }) } - EmptyEnd(flags) => { - let cond = - if flags & FLAG_MULTI > 0 { - quote_expr!(self.cx, - self.chars.is_end() - || self.chars.cur == Some('\n') - ) - } else { - quote_expr!(self.cx, self.chars.is_end()) - }; + Inst::StartText => { quote_expr!(self.cx, { nlist.add_empty($pc); - if $cond { self.add(nlist, $nextpc, &mut *groups) } + if self.chars.is_begin() { + self.add(nlist, $nextpc, &mut *groups) + } }) } - EmptyWordBoundary(flags) => { - let cond = - if 
flags & FLAG_NEGATED > 0 { - quote_expr!(self.cx, !self.chars.is_word_boundary()) - } else { - quote_expr!(self.cx, self.chars.is_word_boundary()) - }; + Inst::EndLine => { + quote_expr!(self.cx, { + nlist.add_empty($pc); + if self.chars.is_end() || self.chars.cur == Some('\n') { + self.add(nlist, $nextpc, &mut *groups) + } + }) + } + Inst::EndText => { + quote_expr!(self.cx, { + nlist.add_empty($pc); + if self.chars.is_end() { + self.add(nlist, $nextpc, &mut *groups) + } + }) + } + Inst::WordBoundary => { quote_expr!(self.cx, { nlist.add_empty($pc); - if $cond { self.add(nlist, $nextpc, &mut *groups) } + if self.chars.is_word_boundary() { + self.add(nlist, $nextpc, &mut *groups) + } + }) + } + Inst::NotWordBoundary => { + quote_expr!(self.cx, { + nlist.add_empty($pc); + if !self.chars.is_word_boundary() { + self.add(nlist, $nextpc, &mut *groups) + } }) } - Save(slot) => { + Inst::Save(slot) => { let save = quote_expr!(self.cx, { let old = groups[$slot]; groups[$slot] = Some(self.ic); @@ -411,20 +415,20 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, }) } } - Jump(to) => { + Inst::Jump(to) => { quote_expr!(self.cx, { nlist.add_empty($pc); self.add(nlist, $to, &mut *groups); }) } - Split(x, y) => { + Inst::Split(x, y) => { quote_expr!(self.cx, { nlist.add_empty($pc); self.add(nlist, $x, &mut *groups); self.add(nlist, $y, &mut *groups); }) } - // For Match, OneChar, CharClass, Any + // For Match, OneChar, CharClass, Any, AnyNoNL _ => quote_expr!(self.cx, nlist.add($pc, &*groups)), }; self.arm_inst(pc, body) @@ -439,7 +443,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| { let nextpc = pc + 1; let body = match *inst { - Match => { + Inst::Match => { quote_expr!(self.cx, { match self.which { Exists => { @@ -459,8 +463,8 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, } }) } - OneChar(c, flags) => { - if flags & FLAG_NOCASE > 0 { + Inst::OneChar { c, 
casei } => { + if casei { let upc = simple_case_fold(c); quote_expr!(self.cx, { let upc = self.chars.prev.map(simple_case_fold); @@ -476,45 +480,37 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, }) } } - CharClass(ref ranges, flags) => { - let negate = flags & FLAG_NEGATED > 0; - let casei = flags & FLAG_NOCASE > 0; + Inst::CharClass(ref cls) => { + let ranges: Vec<(char, char)> = + cls.iter().map(|r| (r.start, r.end)).collect(); + let mranges = self.match_class(&ranges); let get_char = - if casei { + if cls.is_case_insensitive() { quote_expr!( self.cx, simple_case_fold(self.chars.prev.unwrap())) } else { quote_expr!(self.cx, self.chars.prev.unwrap()) }; - let negcond = - if negate { - quote_expr!(self.cx, !found) - } else { - quote_expr!(self.cx, found) - }; - let mranges = self.match_class(&ranges); quote_expr!(self.cx, { if self.chars.prev.is_some() { let c = $get_char; - let found = $mranges; - if $negcond { + if $mranges { self.add(nlist, $nextpc, caps); } } }) } - Any(flags) => { - if flags & FLAG_DOTNL > 0 { - quote_expr!(self.cx, self.add(nlist, $nextpc, caps)) - } else { - quote_expr!(self.cx, { - if self.chars.prev != Some('\n') { - self.add(nlist, $nextpc, caps) - } - () - }) - } + Inst::Any => { + quote_expr!(self.cx, self.add(nlist, $nextpc, caps)) + } + Inst::AnyNoNL => { + quote_expr!(self.cx, { + if self.chars.prev != Some('\n') { + self.add(nlist, $nextpc, caps); + } + () + }) } // EmptyBegin, EmptyEnd, EmptyWordBoundary, Save, Jump, Split _ => self.empty_block(), diff --git a/regex_macros/tests/tests.rs b/regex_macros/tests/tests.rs index 11670ac8f1..ab1db14a0b 100644 --- a/regex_macros/tests/tests.rs +++ b/regex_macros/tests/tests.rs @@ -203,6 +203,8 @@ replace!(rep_named, replace_all, "w1 w2 w3 w4", "$last $first$space", "w2 w1 w4 w3"); replace!(rep_trim, replace_all, "^[ \t]+|[ \t]+$", " \t trim me\t \t", "", "trim me"); +replace!(rep_number_hypen, replace, r"(.)(.)", "ab", "$1-$2", "a-b"); +replace!(rep_number_underscore, 
replace, r"(.)(.)", "ab", "$1_$2", "a_b"); macro_rules! noparse( ($name:ident, $re:expr) => ( @@ -219,7 +221,6 @@ macro_rules! noparse( noparse!(fail_double_repeat, "a**"); noparse!(fail_no_repeat_arg, "*"); -noparse!(fail_no_repeat_arg_begin, "^*"); noparse!(fail_incomplete_escape, "\\"); noparse!(fail_class_incomplete, "[A-"); noparse!(fail_class_not_closed, "[A"); @@ -235,8 +236,7 @@ noparse!(fail_bad_capture_name, "(?P)"); noparse!(fail_bad_flag, "(?a)a"); noparse!(fail_empty_alt_before, "|a"); noparse!(fail_empty_alt_after, "a|"); -noparse!(fail_counted_big_exact, "a{1001}"); -noparse!(fail_counted_big_min, "a{1001,}"); +noparse!(fail_too_big, "a{10000000}"); noparse!(fail_counted_no_close, "a{1001"); noparse!(fail_unfinished_cap, "(?"); noparse!(fail_unfinished_escape, "\\"); diff --git a/regex_syntax/Cargo.toml b/regex_syntax/Cargo.toml new file mode 100644 index 0000000000..48231a41ec --- /dev/null +++ b/regex_syntax/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "regex-syntax" +version = "0.1.0" +authors = ["The Rust Project Developers"] +license = "MIT/Apache-2.0" +repository = "https://github.com/rust-lang/regex" +documentation = "http://doc.rust-lang.org/regex" +homepage = "https://github.com/rust-lang/regex" +description = "A regular expression parser (RE2 only)." + +[dev-dependencies] +quickcheck = "*" +rand = "*" diff --git a/regex_syntax/src/lib.rs b/regex_syntax/src/lib.rs new file mode 100644 index 0000000000..95eed3f5c3 --- /dev/null +++ b/regex_syntax/src/lib.rs @@ -0,0 +1,1162 @@ +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/*! +This crate provides a regular expression parser and an abstract syntax for +regular expressions. 
The abstract syntax is defined by the `Expr` type. The +concrete syntax is enumerated in the +[`regex`](../regex/index.html#syntax) +crate documentation. + +Note that since this crate is first and foremost an implementation detail for +the `regex` crate, it may experience more frequent breaking changes. It is +exposed as a separate crate so that others may use it to do analysis on regular +expressions or even build their own matching engine. + +# Example: parsing an expression + +Parsing a regular expression can be done with the `Expr::parse` function. + +```rust +use regex_syntax::Expr; + +assert_eq!(Expr::parse(r"ab|yz").unwrap(), Expr::Alternate(vec![ + Expr::Literal { chars: vec!['a', 'b'], casei: false }, + Expr::Literal { chars: vec!['y', 'z'], casei: false }, +])); +``` + +# Example: inspecting an error + +The parser in this crate provides very detailed error values. For example, +if an invalid character class range is given: + +```rust +use regex_syntax::{Expr, ErrorKind}; + +let err = Expr::parse(r"[z-a]").unwrap_err(); +assert_eq!(err.position(), 4); +assert_eq!(err.kind(), &ErrorKind::InvalidClassRange { + start: 'z', + end: 'a', +}); +``` + +Or unbalanced parentheses: + +```rust +use regex_syntax::{Expr, ErrorKind}; + +let err = Expr::parse(r"ab(cd").unwrap_err(); +assert_eq!(err.position(), 2); +assert_eq!(err.kind(), &ErrorKind::UnclosedParen); +``` +*/ + +#![deny(missing_docs)] + +#[cfg(test)] extern crate quickcheck; +#[cfg(test)] extern crate rand; + +mod parser; +mod unicode; + +use std::char; +use std::cmp::{Ordering, max, min}; +use std::fmt; +use std::iter::IntoIterator; +use std::ops::Deref; +use std::slice; +use std::vec; + +use unicode::case_folding; + +use self::Expr::*; +use self::Repeater::*; + +pub use parser::is_punct; + +/// A regular expression abstract syntax tree. +/// +/// An `Expr` represents the abstract syntax of a regular expression. 
+#[derive(Clone, Debug, PartialEq, Eq)] +pub enum Expr { + /// An empty regex (which never matches any text). + Empty, + /// A sequence of one or more literal characters to be matched. + Literal { + /// The characters. + chars: Vec, + /// Whether to match case insensitively. + casei: bool, + }, + /// Match any character, including new line. + AnyChar, + /// Match any character, excluding new line. + AnyCharNoNL, + /// A character class. + Class(CharClass), + /// Match the start of a line or beginning of input. + StartLine, + /// Match the end of a line or end of input. + EndLine, + /// Match the beginning of input. + StartText, + /// Match the end of input. + EndText, + /// Match a word boundary (word character on one side and a non-word + /// character on the other). + WordBoundary, + /// Match a position that is not a word boundary (word or non-word + /// characters on both sides). + NotWordBoundary, + /// A group, possibly non-capturing. + Group { + /// The expression inside the group. + e: Box, + /// The capture index (starting at `1`) only for capturing groups. + i: Option, + /// The capture name, only for capturing named groups. + name: Option, + }, + /// A repeat operator (`?`, `*`, `+` or `{m,n}`). + Repeat { + /// The expression to be repeated. Limited to literals, `.`, classes, + /// zero-width assertions or grouped expressions. + e: Box, + /// The type of repeat operator used. + r: Repeater, + /// Whether the repeat is greedy (match the most) or not (match the + /// least). + greedy: bool, + }, + /// A concatenation of expressions. Must be matched one after the other. + /// + /// N.B. A concat expression can only appear at the top-level or + /// immediately inside a group expression. + Concat(Vec), + /// An alternation of expressions. Only one must match. + /// + /// N.B. An alternate expression can only appear at the top-level or + /// immediately inside a group expression. 
+ Alternate(Vec), +} + +type CaptureIndex = Option; + +type CaptureName = Option; + +/// The type of a repeat operator expression. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum Repeater { + /// Match zero or one (`?`). + ZeroOrOne, + /// Match zero or more (`*`). + ZeroOrMore, + /// Match one or more (`+`). + OneOrMore, + /// Match for at least `min` and at most `max` (`{m,n}`). + /// + /// When `max` is `None`, there is no upper bound on the number of matches. + Range { + /// Lower bound on the number of matches. + min: u32, + /// Optional upper bound on the number of matches. + max: Option, + }, +} + +/// A character class. +/// +/// A character class has a canonical format that the parser guarantees. Its +/// canonical format is defined by the following invariants: +/// +/// 1. Given any Unicode scalar value, it is matched by *at most* one character +/// range in a canonical character class. +/// 2. Every adjacent character range is separated by at least one Unicode +/// scalar value. +/// 3. Given any pair of character ranges `r1` and `r2`, if +/// `r1.end < r2.start`, then `r1` comes before `r2` in a canonical +/// character class. +/// +/// In sum, any `CharClass` produced by this crate's parser is a sorted +/// sequence of non-overlapping ranges. This makes it possible to test whether +/// a character is matched by a class with a binary search. +/// +/// Additionally, a character class may be marked *case insensitive*. If it's +/// case insensitive, then: +/// +/// 1. Simple case folding has been applied to all ranges. +/// 2. Simple case folding must be applied to a character before testing +/// whether it matches the character class. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct CharClass { + ranges: Vec, + casei: bool, +} + +/// A single inclusive range in a character class. +/// +/// Since range boundaries are defined by Unicode scalar values, the boundaries +/// can never be in the open interval `(0xD7FF, 0xE000)`. 
However, a range may +/// *cover* codepoints that are not scalar values. +/// +/// Note that this has a few convenient impls on `PartialEq` and `PartialOrd` +/// for testing whether a character is contained inside a given range. +#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)] +pub struct ClassRange { + /// The start character of the range. + /// + /// This must be less than or equal to `end`. + pub start: char, + + /// The end character of the range. + /// + /// This must be greater than or equal to `start`. + pub end: char, +} + +impl Expr { + /// Parses a string into a regular expression syntax tree. + pub fn parse(s: &str) -> Result { + parser::Parser::parse(s).map(|e| e.simplify()) + } + + /// Returns true iff the expression can be repeated by a quantifier. + fn can_repeat(&self) -> bool { + match *self { + Literal{..} + | AnyChar + | AnyCharNoNL + | Class(_) + | StartLine | EndLine | StartText | EndText + | WordBoundary | NotWordBoundary + | Group{..} + => true, + _ => false, + } + } + + fn simplify(self) -> Expr { + fn combine_literals(es: &mut Vec, e: Expr) { + match (es.pop(), e) { + (None, e) => es.push(e), + (Some(Literal { chars: mut chars1, casei: casei1 }), + Literal { chars: chars2, casei: casei2 }) => { + if casei1 == casei2 { + chars1.extend(chars2); + es.push(Literal { chars: chars1, casei: casei1 }); + } else { + es.push(Literal { chars: chars1, casei: casei1 }); + es.push(Literal { chars: chars2, casei: casei2 }); + } + } + (Some(e1), e2) => { + es.push(e1); + es.push(e2); + } + } + } + match self { + Repeat { e, r, greedy } => Repeat { + e: Box::new(e.simplify()), + r: r, + greedy: greedy, + }, + Group { e, i, name } => { + let e = e.simplify(); + if i.is_none() && name.is_none() && e.can_repeat() { + e + } else { + Group { e: Box::new(e), i: i, name: name } + } + } + Concat(es) => { + let mut new_es = Vec::with_capacity(es.len()); + for e in es { + combine_literals(&mut new_es, e.simplify()); + } + if new_es.len() == 1 { + 
new_es.pop().unwrap() + } else { + Concat(new_es) + } + } + Alternate(es) => Alternate(es.into_iter() + .map(|e| e.simplify()) + .collect()), + e => e, + } + } +} + +impl Deref for CharClass { + type Target = Vec; + fn deref(&self) -> &Vec { &self.ranges } +} + +impl IntoIterator for CharClass { + type Item = ClassRange; + type IntoIter = vec::IntoIter; + fn into_iter(self) -> vec::IntoIter { self.ranges.into_iter() } +} + +impl<'a> IntoIterator for &'a CharClass { + type Item = &'a ClassRange; + type IntoIter = slice::Iter<'a, ClassRange>; + fn into_iter(self) -> slice::Iter<'a, ClassRange> { self.iter() } +} + +impl CharClass { + /// Create a new class from an existing set of ranges. + fn new(ranges: Vec) -> CharClass { + CharClass { ranges: ranges, casei: false } + } + + /// Create an empty class. + fn empty() -> CharClass { + CharClass::new(Vec::new()) + } + + /// Returns true if `c` is matched by this character class. + /// + /// If this character class is case insensitive, then simple case folding + /// is applied to `c` before checking for a match. + pub fn matches(&self, mut c: char) -> bool { + if self.is_case_insensitive() { + c = simple_case_fold(c) + } + self.binary_search_by(|range| c.partial_cmp(range).unwrap()).is_ok() + } + + /// Returns true if this character class should be matched case + /// insensitively. + /// + /// When `true`, simple case folding has already been applied to the + /// class. + pub fn is_case_insensitive(&self) -> bool { + self.casei + } + + /// Create a new empty class from this one. + /// + /// Namely, its capacity and case insensitive setting will be the same. + fn to_empty(&self) -> CharClass { + CharClass { ranges: Vec::with_capacity(self.len()), casei: self.casei } + } + + /// Merge two classes and canonicalize them. + #[cfg(test)] + fn merge(mut self, other: CharClass) -> CharClass { + self.ranges.extend(other); + self.canonicalize() + } + + /// Canonicalize any sequence of ranges. 
+ /// + /// This is responsible for enforcing the canonical format invariants + /// as described on the docs for the `CharClass` type. + fn canonicalize(mut self) -> CharClass { + // TODO: Save some cycles here by checking if already canonicalized. + self.ranges.sort(); + let mut ordered = self.to_empty(); // TODO: Do this in place? + for candidate in self { + // If the candidate overlaps with an existing range, then it must + // be the most recent range added because we process the candidates + // in order. + if let Some(or) = ordered.ranges.last_mut() { + if or.overlapping(candidate) { + *or = or.merge(candidate); + continue; + } + } + ordered.ranges.push(candidate); + } + ordered + } + + /// Negates the character class. + /// + /// For all `c` where `c` is a Unicode scalar value, `c` matches `self` + /// if and only if `c` does not match `self.negate()`. + /// + /// Note that this cannot be called on a character class that has had + /// case folding applied to it. (Because case folding turns on a flag + /// and doesn't store every possible matching character. Therefore, + /// its negation is tricky to get right. Turns out, we don't need it + /// anyway!) + fn negate(mut self) -> CharClass { + fn range(s: char, e: char) -> ClassRange { ClassRange::new(s, e) } + + // Never allow negating of a class that has been case folded! + assert!(!self.casei); + + if self.is_empty() { return self; } + self = self.canonicalize(); + let mut inv = self.to_empty(); + if self[0].start > '\x00' { + inv.ranges.push(range('\x00', dec_char(self[0].start))); + } + for win in self.windows(2) { + inv.ranges.push(range(inc_char(win[0].end), + dec_char(win[1].start))); + } + if self[self.len() - 1].end < char::MAX { + inv.ranges.push(range(inc_char(self[self.len() - 1].end), + char::MAX)); + } + inv + } + + /// Apply case folding to this character class. + /// + /// Once a class has been case folded, it cannot be negated. 
+ fn case_fold(self) -> CharClass { + let mut folded = self.to_empty(); + folded.casei = true; + for r in self { + // Applying case folding to a range is expensive because *every* + // character needs to be examined. Thus, we avoid that drudgery + // if no character in the current range is in our case folding + // table. + if r.needs_case_folding() { + folded.ranges.extend(r.case_fold()); + } else { + folded.ranges.push(r); + } + } + folded.canonicalize() + } +} + +impl ClassRange { + /// Create a new class range. + /// + /// If `end < start`, then the two values are swapped so that + /// the invariant `start <= end` is preserved. + fn new(start: char, end: char) -> ClassRange { + if start <= end { + ClassRange { start: start, end: end } + } else { + ClassRange { start: end, end: start } + } + } + + /// Create a range of one character. + fn one(c: char) -> ClassRange { + ClassRange { start: c, end: c } + } + + /// Returns true if and only if the two ranges are overlapping. Note that + /// since ranges are inclusive, `a-c` and `d-f` are overlapping! + fn overlapping(self, other: ClassRange) -> bool { + max(self.start, other.start) <= inc_char(min(self.end, other.end)) + } + + /// Creates a new range representing the union of `self` and `other`. + fn merge(self, other: ClassRange) -> ClassRange { + ClassRange { + start: min(self.start, other.start), + end: max(self.end, other.end), + } + } + + /// Returns true if and only if this range contains a character that is + /// in the case folding table. + fn needs_case_folding(self) -> bool { + case_folding::C_plus_S_table + .binary_search_by(|&(c, _)| self.partial_cmp(&c).unwrap()).is_ok() + } + + /// Apply case folding to this range. + /// + /// Since case folding might add characters such that the range is no + /// longer contiguous, this returns multiple class ranges. They are in + /// canonical order. 
+ fn case_fold(self) -> Vec { + let (s, e) = (self.start as u32, self.end as u32 + 1); + let folded = (s..e).filter_map(char::from_u32).map(simple_case_fold); + ClassRange::ranges(folded) + } + + /// Turns a non-empty sequence of sorted characters into a sequence of + /// class ranges in canonical format/order. + fn ranges>(mut chars: I) -> Vec { + let mut ranges = Vec::with_capacity(100); + let mut start = chars.next().expect("non-empty char iterator"); + let mut end = start; + for c in chars { + if c != inc_char(end) { + ranges.push(ClassRange::new(start, end)); + start = c; + } + end = c; + } + ranges.push(ClassRange::new(start, end)); + ranges + } +} + +impl PartialEq for ClassRange { + #[inline] + fn eq(&self, other: &char) -> bool { + self.start <= *other && *other <= self.end + } +} + +impl PartialEq for char { + #[inline] + fn eq(&self, other: &ClassRange) -> bool { + other.eq(self) + } +} + +impl PartialOrd for ClassRange { + #[inline] + fn partial_cmp(&self, other: &char) -> Option { + Some(if self == other { + Ordering::Equal + } else if *other > self.end { + Ordering::Greater + } else { + Ordering::Less + }) + } +} + +impl PartialOrd for char { + #[inline] + fn partial_cmp(&self, other: &ClassRange) -> Option { + other.partial_cmp(self).map(|o| o.reverse()) + } +} + +/// This implementation of `Display` will write a regular expression from the +/// syntax tree. It does not write the original string parsed. +// TODO(burntsushi): Write tests for the regex writer. 
+impl fmt::Display for Expr { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Empty => write!(f, ""), + Literal { ref chars, casei } => { + if casei { try!(write!(f, "(?i:")); } + for &c in chars { + try!(write!(f, "{}", quote_char(c))); + } + if casei { try!(write!(f, ")")); } + Ok(()) + } + AnyChar => write!(f, "(?s:.)"), + AnyCharNoNL => write!(f, "."), + Class(ref cls) => write!(f, "{}", cls), + StartLine => write!(f, "(?m:^)"), + EndLine => write!(f, "(?m:$)"), + StartText => write!(f, r"^"), + EndText => write!(f, r"$"), + WordBoundary => write!(f, r"\b"), + NotWordBoundary => write!(f, r"\B"), + Group { ref e, i: None, name: None } => write!(f, "(?:{})", e), + Group { ref e, name: None, .. } => write!(f, "({})", e), + Group { ref e, name: Some(ref n), .. } => { + write!(f, "(?P<{}>{})", n, e) + } + Repeat { ref e, r, greedy } => { + match &**e { + &Literal { ref chars, .. } if chars.len() > 1 => { + try!(write!(f, "(?:{}){}", e, r)) + } + _ => try!(write!(f, "{}{}", e, r)), + } + if !greedy { try!(write!(f, "?")); } + Ok(()) + } + Concat(ref es) => { + for e in es { + try!(write!(f, "{}", e)); + } + Ok(()) + } + Alternate(ref es) => { + for (i, e) in es.iter().enumerate() { + if i > 0 { try!(write!(f, "|")); } + try!(write!(f, "{}", e)); + } + Ok(()) + } + } + } +} + +impl fmt::Display for Repeater { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + ZeroOrOne => write!(f, "?"), + ZeroOrMore => write!(f, "*"), + OneOrMore => write!(f, "+"), + Range { min: s, max: None } => write!(f, "{{{},}}", s), + Range { min: s, max: Some(e) } if s == e => write!(f, "{{{}}}", s), + Range { min: s, max: Some(e) } => write!(f, "{{{}, {}}}", s, e), + } + } +} + +impl fmt::Display for CharClass { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if self.casei { + try!(write!(f, "(?i:")); + } + try!(write!(f, "[")); + for range in self.iter() { + try!(write!(f, "{}", range)); + } + try!(write!(f, "]")); + if 
self.casei { + try!(write!(f, ")")); + } + Ok(()) + } +} + +impl fmt::Display for ClassRange { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}-{}", quote_char(self.start), quote_char(self.end)) + } +} + +/// An alias for computations that can return an `Error`. +pub type Result = ::std::result::Result; + +/// A parse error. +/// +/// This includes details about the specific type of error and a rough +/// approximation of where it occurred. +#[derive(Clone, Debug, PartialEq)] +pub struct Error { + pos: usize, + surround: String, + kind: ErrorKind, +} + +/// The specific type of parse error that can occur. +#[derive(Clone, Debug, PartialEq)] +pub enum ErrorKind { + /// A negation symbol is used twice in flag settings. + /// e.g., `(?-i-s)`. + DoubleFlagNegation, + /// The same capture name was used more than once. + /// e.g., `(?P<a>.)(?P<a>.)`. + DuplicateCaptureName(String), + /// An alternate is empty. e.g., `(|a)`. + EmptyAlternate, + /// A capture group name is empty. e.g., `(?P<>a)`. + EmptyCaptureName, + /// A negation symbol was not followed by any flags. e.g., `(?i-)`. + EmptyFlagNegation, + /// A group is empty. e.g., `()`. + EmptyGroup, + /// An invalid number was used in a counted repetition. e.g., `a{b}`. + InvalidBase10(String), + /// An invalid hexadecimal number was used in an escape sequence. + /// e.g., `\xAG`. + InvalidBase16(String), + /// An invalid capture name was used. e.g., `(?P<0a>b)`. + InvalidCaptureName(String), + /// An invalid class range was given. Specifically, when the start of the + /// range is greater than the end. e.g., `[z-a]`. + InvalidClassRange { + /// The first character specified in the range. + start: char, + /// The second character specified in the range. + end: char, + }, + /// An escape sequence was used in a character class where it is not + /// allowed. e.g., `[a-\pN]` or `[\A]`. + InvalidClassEscape(Expr), + /// An invalid counted repetition min/max was given. e.g., `a{2,1}`. 
+ InvalidRepeatRange { + /// The first number specified in the repetition. + min: u32, + /// The second number specified in the repetition. + max: u32, + }, + /// An invalid Unicode scalar value was used in a long hexadecimal + /// sequence. e.g., `\x{D800}`. + InvalidScalarValue(u32), + /// An empty counted repetition operator. e.g., `a{}`. + MissingBase10, + /// A repetition operator was not applied to an expression. e.g., `*`. + RepeaterExpectsExpr, + /// A repetition operator was applied to an expression that cannot be + /// repeated. e.g., `a+*` or `a|*`. + RepeaterUnexpectedExpr(Expr), + /// A capture group name that is never closed. e.g., `(?P usize { + self.pos + } + + /// Returns the type of the regex parse error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } +} + +impl ErrorKind { + fn description(&self) -> &str { + use ErrorKind::*; + match *self { + DoubleFlagNegation => "double flag negation", + DuplicateCaptureName(_) => "duplicate capture name", + EmptyAlternate => "empty alternate", + EmptyCaptureName => "empty capture name", + EmptyFlagNegation => "flag negation without any flags", + EmptyGroup => "empty group (e.g., '()')", + InvalidBase10(_) => "invalid base 10 number", + InvalidBase16(_) => "invalid base 16 number", + InvalidCaptureName(_) => "invalid capture name", + InvalidClassRange{..} => "invalid character class range", + InvalidClassEscape(_) => "invalid escape sequence in class", + InvalidRepeatRange{..} => "invalid counted repetition range", + InvalidScalarValue(_) => "invalid Unicode scalar value", + MissingBase10 => "missing count in repetition operator", + RepeaterExpectsExpr => "repetition operator missing expression", + RepeaterUnexpectedExpr(_) => "expression cannot be repeated", + UnclosedCaptureName(_) => "unclosed capture group name", + UnclosedHex => "unclosed hexadecimal literal", + UnclosedParen => "unclosed parenthesis", + UnclosedRepeat => "unclosed counted repetition operator", + UnclosedUnicodeName => "unclosed 
Unicode class literal", + UnexpectedClassEof => "unexpected EOF in character class", + UnexpectedEscapeEof => "unexpected EOF in escape sequence", + UnexpectedFlagEof => "unexpected EOF in flags", + UnexpectedTwoDigitHexEof => "unexpected EOF in hex literal", + UnopenedParen => "unopened parenthesis", + UnrecognizedEscape(_) => "unrecognized escape sequence", + UnrecognizedFlag(_) => "unrecognized flag", + UnrecognizedUnicodeClass(_) => "unrecognized Unicode class name", + } + } +} + +impl ::std::error::Error for Error { + fn description(&self) -> &str { + self.kind.description() + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Error parsing regex near '{}' at character offset {}: {}", + self.surround, self.pos, self.kind) + } +} + +impl fmt::Display for ErrorKind { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use ErrorKind::*; + match *self { + DoubleFlagNegation => + write!(f, "Only one negation symbol is allowed in flags."), + DuplicateCaptureName(ref s) => + write!(f, "Capture name '{}' is used more than once.", s), + EmptyAlternate => + write!(f, "Alternations cannot be empty."), + EmptyCaptureName => + write!(f, "Capture names cannot be empty."), + EmptyFlagNegation => + write!(f, "Flag negation requires setting at least one flag."), + EmptyGroup => + write!(f, "Empty regex groups (e.g., '()') are not allowed."), + InvalidBase10(ref s) => + write!(f, "Not a valid base 10 number: '{}'", s), + InvalidBase16(ref s) => + write!(f, "Not a valid base 16 number: '{}'", s), + InvalidCaptureName(ref s) => + write!(f, "Invalid capture name: '{}'. Capture names must \ + consist of [_a-zA-Z0-9] and are not allowed to \ + start with with a number.", s), + InvalidClassRange { start, end } => + write!(f, "Invalid character class range '{}-{}'. 
\ + Character class ranges must start with the smaller \ + character, but {} > {}", start, end, start, end), + InvalidClassEscape(ref e) => + write!(f, "Invalid escape sequence in character \ + class: '{}'.", e), + InvalidRepeatRange { min, max } => + write!(f, "Invalid counted repetition range: {{{}, {}}}. \ + Counted repetition ranges must start with the \ + minimum, but {} > {}", min, max, min, max), + InvalidScalarValue(c) => + write!(f, "Number does not correspond to a Unicode scalar \ + value: '{}'.", c), + MissingBase10 => + write!(f, "Missing maximum in counted reptition operator."), + RepeaterExpectsExpr => + write!(f, "Missing expression for reptition operator."), + RepeaterUnexpectedExpr(ref e) => + write!(f, "Invalid application of reptition operator to: \ + '{}'.", e), + UnclosedCaptureName(ref s) => + write!(f, "Capture name group for '{}' is not closed. \ + (Missing a '>'.)", s), + UnclosedHex => + write!(f, "Unclosed hexadecimal literal (missing a '}}')."), + UnclosedParen => + write!(f, "Unclosed parenthesis."), + UnclosedRepeat => + write!(f, "Unclosed counted repetition (missing a '}}')."), + UnclosedUnicodeName => + write!(f, "Unclosed Unicode literal (missing a '}}')."), + UnexpectedClassEof => + write!(f, "Character class was not closed before the end of \ + the regex (missing a ']')."), + UnexpectedEscapeEof => + write!(f, "Started an escape sequence that didn't finish \ + before the end of the regex."), + UnexpectedFlagEof => + write!(f, "Inline flag settings was not closed before the end \ + of the regex (missing a ')' or ':')."), + UnexpectedTwoDigitHexEof => + write!(f, "Unexpected end of two digit hexadecimal literal."), + UnopenedParen => + write!(f, "Unopened parenthesis."), + UnrecognizedEscape(c) => + write!(f, "Unrecognized escape sequence: '\\{}'.", c), + UnrecognizedFlag(c) => + write!(f, "Unrecognized flag: '{}'. 
\ + (Allowed flags: i, s, m, U, x.)", c), + UnrecognizedUnicodeClass(ref s) => + write!(f, "Unrecognized Unicode class name: '{}'.", s), + } + } +} + +/// Returns the Unicode *simple* case folding of `c`. +/// +/// N.B. This is hidden because it really isn't the responsibility of this +/// crate to do simple case folding. One hopes that either another crate or +/// the standard library will be able to do this for us. In any case, we still +/// expose it because it is used inside the various Regex engines. +#[doc(hidden)] +pub fn simple_case_fold(c: char) -> char { + match case_folding::C_plus_S_table.binary_search_by(|&(x, _)| x.cmp(&c)) { + Ok(i) => case_folding::C_plus_S_table[i].1, + Err(_) => c, + } +} + +/// Escapes all regular expression meta characters in `text`. +/// +/// The string returned may be safely used as a literal in a regular +/// expression. +pub fn quote(text: &str) -> String { + let mut quoted = String::with_capacity(text.len()); + for c in text.chars() { + if parser::is_punct(c) { + quoted.push('\\'); + } + quoted.push(c); + } + quoted +} + +fn quote_char(c: char) -> String { + let mut s = String::new(); + if parser::is_punct(c) { + s.push('\\'); + } + s.push(c); + s +} + +fn inc_char(c: char) -> char { + match c { + char::MAX => char::MAX, + '\u{D7FF}' => '\u{E000}', + c => char::from_u32(c as u32 + 1).unwrap(), + } +} + +fn dec_char(c: char) -> char { + match c { + '\x00' => '\x00', + '\u{E000}' => '\u{D7FF}', + c => char::from_u32(c as u32 - 1).unwrap(), + } +} + +/// Returns true if and only if `c` is a word character. +#[doc(hidden)] +pub fn is_word_char(c: char) -> bool { + match c { + '_' | '0' ... '9' | 'a' ... 'z' | 'A' ... 
'Z' => true, + _ => ::unicode::regex::PERLW.binary_search_by(|&(start, end)| { + if c >= start && c <= end { + Ordering::Equal + } else if start > c { + Ordering::Greater + } else { + Ordering::Less + } + }).is_ok(), + } +} + +#[cfg(test)] +mod properties; + +#[cfg(test)] +mod tests { + use {CharClass, ClassRange}; + + fn class(ranges: &[(char, char)]) -> CharClass { + let ranges = ranges.iter().cloned() + .map(|(c1, c2)| ClassRange::new(c1, c2)).collect(); + CharClass::new(ranges) + } + + fn classi(ranges: &[(char, char)]) -> CharClass { + let mut cls = class(ranges); + cls.casei = true; + cls + } + + #[test] + fn class_canon_no_change() { + let cls = class(&[('a', 'c'), ('x', 'z')]); + assert_eq!(cls.clone().canonicalize(), cls); + } + + #[test] + fn class_canon_unordered() { + let cls = class(&[('x', 'z'), ('a', 'c')]); + assert_eq!(cls.canonicalize(), class(&[ + ('a', 'c'), ('x', 'z'), + ])); + } + + #[test] + fn class_canon_overlap() { + let cls = class(&[('x', 'z'), ('w', 'y')]); + assert_eq!(cls.canonicalize(), class(&[ + ('w', 'z'), + ])); + } + + #[test] + fn class_canon_overlap_many() { + let cls = class(&[ + ('c', 'f'), ('a', 'g'), ('d', 'j'), ('a', 'c'), + ('m', 'p'), ('l', 's'), + ]); + assert_eq!(cls.clone().canonicalize(), class(&[ + ('a', 'j'), ('l', 's'), + ])); + } + + #[test] + fn class_canon_overlap_many_case_fold() { + let cls = class(&[ + ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'), + ('M', 'P'), ('L', 'S'), ('c', 'f'), + ]); + assert_eq!(cls.case_fold(), classi(&[ + ('a', 'j'), ('l', 's'), + ])); + } + + #[test] + fn class_canon_overlap_boundary() { + let cls = class(&[('x', 'z'), ('u', 'w')]); + assert_eq!(cls.canonicalize(), class(&[ + ('u', 'z'), + ])); + } + + #[test] + fn class_canon_extreme_edge_case() { + let cls = class(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]); + assert_eq!(cls.canonicalize(), class(&[ + ('\x00', '\u{10FFFF}'), + ])); + } + + #[test] + fn class_canon_singles() { + let cls = class(&[('a', 'a'), ('b', 
'b')]); + assert_eq!(cls.canonicalize(), class(&[('a', 'b')])); + } + + #[test] + fn class_negate_single() { + let cls = class(&[('a', 'a')]); + assert_eq!(cls.negate(), class(&[ + ('\x00', '\x60'), ('\x62', '\u{10FFFF}'), + ])); + } + + #[test] + fn class_negate_singles() { + let cls = class(&[('a', 'a'), ('b', 'b')]); + assert_eq!(cls.negate(), class(&[ + ('\x00', '\x60'), ('\x63', '\u{10FFFF}'), + ])); + } + + #[test] + fn class_negate_multiples() { + let cls = class(&[('a', 'c'), ('x', 'z')]); + assert_eq!(cls.negate(), class(&[ + ('\x00', '\x60'), ('\x64', '\x77'), ('\x7b', '\u{10FFFF}'), + ])); + } + + #[test] + fn class_negate_min_scalar() { + let cls = class(&[('\x00', 'a')]); + assert_eq!(cls.negate(), class(&[ + ('\x62', '\u{10FFFF}'), + ])); + } + + #[test] + fn class_negate_max_scalar() { + let cls = class(&[('a', '\u{10FFFF}')]); + assert_eq!(cls.negate(), class(&[ + ('\x00', '\x60'), + ])); + } + + #[test] + fn class_negate_everything() { + let cls = class(&[('\x00', '\u{10FFFF}')]); + assert_eq!(cls.negate(), class(&[])); + } + + #[test] + fn class_negate_everything_sans_one() { + let cls = class(&[ + ('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}') + ]); + assert_eq!(cls.negate(), class(&[ + ('\u{10FFFE}', '\u{10FFFE}'), + ])); + } + + #[test] + fn class_negate_surrogates_min() { + let cls = class(&[('\x00', '\u{D7FF}')]); + assert_eq!(cls.negate(), class(&[ + ('\u{E000}', '\u{10FFFF}'), + ])); + } + + #[test] + fn class_negate_surrogates_min_edge() { + let cls = class(&[('\x00', '\u{D7FE}')]); + assert_eq!(cls.negate(), class(&[ + ('\u{D7FF}', '\u{10FFFF}'), + ])); + } + + #[test] + fn class_negate_surrogates_max() { + let cls = class(&[('\u{E000}', '\u{10FFFF}')]); + assert_eq!(cls.negate(), class(&[ + ('\x00', '\u{D7FF}'), + ])); + } + + #[test] + fn class_negate_surrogates_max_edge() { + let cls = class(&[('\u{E001}', '\u{10FFFF}')]); + assert_eq!(cls.negate(), class(&[ + ('\x00', '\u{E000}'), + ])); + } + + #[test] + fn 
class_fold_retain_only_needed() { + let cls = class(&[('A', 'Z'), ('a', 'z')]); + assert_eq!(cls.case_fold(), classi(&[ + ('a', 'z'), + ])); + } + + #[test] + fn class_fold_az() { + let cls = class(&[('A', 'Z')]); + assert_eq!(cls.case_fold(), classi(&[ + ('a', 'z'), + ])); + } + + #[test] + fn class_fold_a_underscore() { + let cls = class(&[('A', 'A'), ('_', '_')]); + assert_eq!(cls.clone().canonicalize(), class(&[ + ('A', 'A'), ('_', '_'), + ])); + assert_eq!(cls.case_fold(), classi(&[ + ('_', '_'), ('a', 'a'), + ])); + } + + #[test] + fn class_fold_a_equals() { + let cls = class(&[('A', 'A'), ('=', '=')]); + assert_eq!(cls.clone().canonicalize(), class(&[ + ('=', '='), ('A', 'A'), + ])); + assert_eq!(cls.case_fold(), classi(&[ + ('=', '='), ('a', 'a'), + ])); + } + + #[test] + fn class_fold_no_folding_needed() { + let cls = class(&[('\x00', '\x10')]); + assert_eq!(cls.case_fold(), classi(&[ + ('\x00', '\x10'), + ])); + } +} diff --git a/regex_syntax/src/parser.rs b/regex_syntax/src/parser.rs new file mode 100644 index 0000000000..25020cc752 --- /dev/null +++ b/regex_syntax/src/parser.rs @@ -0,0 +1,2298 @@ +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::cmp::{max, min}; + +use unicode::regex::UNICODE_CLASSES; + +use { + Expr, Repeater, CharClass, ClassRange, CaptureIndex, CaptureName, + Error, ErrorKind, Result, +}; + +/// Parser state. +/// +/// Keeps the entire input in memory and maintains a cursor (char offset). +/// +/// It also keeps an expression stack, which is responsible for managing +/// grouped expressions and flag state. 
+#[derive(Debug)] +pub struct Parser { + chars: Vec, + chari: usize, + stack: Vec, + caps: usize, + names: Vec, // to check for duplicates + flags: Flags, +} + +/// An empheral type for representing the expression stack. +/// +/// Everything on the stack is either a regular expression or a marker +/// indicating the opening of a group (possibly non-capturing). The opening +/// of a group copies the current flag state, which is reset on the parser +/// state once the group closes. +#[derive(Debug)] +enum Build { + Expr(Expr), + LeftParen { + i: CaptureIndex, + name: CaptureName, + chari: usize, + old_flags: Flags, + }, +} + +/// Flag state. +#[derive(Clone, Copy, Debug)] +struct Flags { + casei: bool, + multi: bool, + dotnl: bool, + swap_greed: bool, + ignore_space: bool, +} + +// Primary expression parsing routines. +impl Parser { + pub fn parse(s: &str) -> Result { + Parser { + chars: s.chars().collect(), + chari: 0, + stack: vec![], + caps: 0, + names: vec![], + flags: Flags { + casei: false, + multi: false, + dotnl: false, + swap_greed: false, + ignore_space: false, + }, + }.parse_expr() + } + + // Top-level expression parser. + // + // Starts at the beginning of the input and consumes until either the end + // of input or an error. + fn parse_expr(mut self) -> Result { + while !self.eof() { + let build_expr = match self.cur() { + '\\' => try!(self.parse_escape()), + '|' => { let e = try!(self.alternate()); self.bump(); e } + '?' 
=> try!(self.parse_simple_repeat(Repeater::ZeroOrOne)), + '*' => try!(self.parse_simple_repeat(Repeater::ZeroOrMore)), + '+' => try!(self.parse_simple_repeat(Repeater::OneOrMore)), + '{' => try!(self.parse_counted_repeat()), + '[' => match self.maybe_parse_ascii() { + None => try!(self.parse_class()), + Some(cls) => Build::Expr(Expr::Class(cls)), + }, + '^' => { + if self.flags.multi { + self.parse_one(Expr::StartLine) + } else { + self.parse_one(Expr::StartText) + } + } + '$' => { + if self.flags.multi { + self.parse_one(Expr::EndLine) + } else { + self.parse_one(Expr::EndText) + } + } + '.' => { + if self.flags.dotnl { + self.parse_one(Expr::AnyChar) + } else { + self.parse_one(Expr::AnyCharNoNL) + } + } + '(' => try!(self.parse_group()), + ')' => { + let (old_flags, e) = try!(self.close_paren()); + self.bump(); + self.flags = old_flags; + e + } + _ => Build::Expr(Expr::Literal { + chars: vec![self.bump()], + casei: self.flags.casei, + }), + }; + if !build_expr.is_empty() { + let build_expr = self.maybe_class_case_fold(build_expr); + self.stack.push(build_expr); + } + } + self.finish_concat() + } + + // Parses an escape sequence, e.g., \Ax + // + // Start: `\` + // End: `x` + fn parse_escape(&mut self) -> Result { + self.bump(); + if self.eof() { + return Err(self.err(ErrorKind::UnexpectedEscapeEof)); + } + let c = self.cur(); + if is_punct(c) { + return Ok(Build::Expr(Expr::Literal { + chars: vec![self.bump()], + casei: self.flags.casei, + })); + } + + fn lit(c: char) -> Build { + Build::Expr(Expr::Literal { chars: vec![c], casei: false }) + } + match c { + 'a' => { self.bump(); Ok(lit('\x07')) } + 'f' => { self.bump(); Ok(lit('\x0C')) } + 't' => { self.bump(); Ok(lit('\t')) } + 'n' => { self.bump(); Ok(lit('\n')) } + 'r' => { self.bump(); Ok(lit('\r')) } + 'v' => { self.bump(); Ok(lit('\x0B')) } + 'A' => { self.bump(); Ok(Build::Expr(Expr::StartText)) } + 'z' => { self.bump(); Ok(Build::Expr(Expr::EndText)) } + 'b' => { self.bump(); 
Ok(Build::Expr(Expr::WordBoundary)) } + 'B' => { self.bump(); Ok(Build::Expr(Expr::NotWordBoundary)) } + '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7' => self.parse_octal(), + 'x' => { self.bump(); self.parse_hex() } + 'p'|'P' => { + self.bump(); + self.parse_unicode_class(c == 'P') + .map(|cls| Build::Expr(Expr::Class(cls))) + } + 'd'|'s'|'w'|'D'|'S'|'W' => { + self.bump(); + Ok(Build::Expr(Expr::Class(self.parse_perl_class(c)))) + } + c => Err(self.err(ErrorKind::UnrecognizedEscape(c))), + } + } + + // Parses a group, e.g., `(abc)`. + // + // Start: `(` + // End: `a` + // + // A more interesting example, `(?Pabc)`. + // + // Start: `(` + // End: `a` + fn parse_group(&mut self) -> Result { + let chari = self.chari; + let mut name: CaptureName = None; + self.bump(); + if self.bump_if("?P<") { + let n = try!(self.parse_group_name()); + if self.names.iter().any(|n2| n2 == &n) { + return Err(self.err(ErrorKind::DuplicateCaptureName(n))); + } + self.names.push(n.clone()); + name = Some(n); + } else if self.bump_if("?") { + // This can never be capturing. It's either setting flags for + // the current group, or it's opening a non-capturing group or + // it's opening a group with a specific set of flags (which is + // also non-capturing). + // Anything else is an error. + return self.parse_group_flags(chari); + } + self.caps = checkadd(self.caps, 1); + Ok(Build::LeftParen { + i: Some(self.caps), + name: name, + chari: chari, + old_flags: self.flags, // no flags changed if we're here + }) + } + + // Parses flags (inline or grouped), e.g., `(?s-i:abc)`. + // + // Start: `s` + // End: `a` + // + // Another example, `(?s-i)a`. 
+ // + // Start: `s` + // End: `a` + fn parse_group_flags(&mut self, opening_chari: usize) -> Result { + let old_flags = self.flags; + let mut sign = true; + let mut saw_flag = false; + loop { + if self.eof() { + // e.g., (?i + return Err(self.err(ErrorKind::UnexpectedFlagEof)); + } + match self.cur() { + 'i' => { self.flags.casei = sign; saw_flag = true } + 'm' => { self.flags.multi = sign; saw_flag = true } + 's' => { self.flags.dotnl = sign; saw_flag = true } + 'U' => { self.flags.swap_greed = sign; saw_flag = true } + 'x' => { self.flags.ignore_space = sign; saw_flag = true } + '-' => { + if !sign { + // e.g., (?-i-s) + return Err(self.err(ErrorKind::DoubleFlagNegation)); + } + sign = false; + saw_flag = false; + } + ')' => { + if !saw_flag { + // e.g., (?) + return Err(self.err(ErrorKind::EmptyFlagNegation)); + } + // At this point, we're just changing the flags inside + // the current group, which means the old flags have + // been saved elsewhere. Our modifications in place are + // okey dokey! + // + // This particular flag expression only has a stateful + // impact on a regex's AST, so nothing gets explicitly + // added. + self.bump(); + return Ok(Build::Expr(Expr::Empty)); + } + ':' => { + if !sign && !saw_flag { + // e.g., (?i-:a) + // Note that if there's no negation, it's OK not + // to see flag, because you end up with a regular + // non-capturing group: `(?:a)`. + return Err(self.err(ErrorKind::EmptyFlagNegation)); + } + self.bump(); + return Ok(Build::LeftParen { + i: None, + name: None, + chari: opening_chari, + old_flags: old_flags, + }); + } + // e.g., (?z:a) + c => return Err(self.err(ErrorKind::UnrecognizedFlag(c))), + } + self.bump(); + } + } + + // Parses a group name, e.g., `foo` in `(?Pabc)`. 
+ // + // Start: `f` + // End: `a` + fn parse_group_name(&mut self) -> Result { + let mut name = String::new(); + while !self.eof() && !self.peek_is('>') { + name.push(self.bump()); + } + if self.eof() { + // e.g., (?Pa) + None => Err(self.err(ErrorKind::EmptyCaptureName)), + Some(c) if (c >= '0' && c <= '9') || !all_valid => { + // e.g., (?Px) + // e.g., (?P<1a>x) + Err(self.err(ErrorKind::InvalidCaptureName(name))) + } + _ => { + self.bump(); // for `>` + Ok(name) + } + } + } + + // Parses a counted repeition operator, e.g., `a{2,4}?z`. + // + // Start: `{` + // End: `z` + fn parse_counted_repeat(&mut self) -> Result { + let e = try!(self.pop(ErrorKind::RepeaterExpectsExpr)); // e.g., ({5} + if !e.can_repeat() { + // e.g., a*{5} + return Err(self.err(ErrorKind::RepeaterUnexpectedExpr(e))); + } + self.bump(); + let min = try!(self.parse_decimal(|c| c != ',' && c != '}')); + let mut max_opt = Some(min); + if self.bump_if(',') { + if self.peek_is('}') { + max_opt = None; + } else { + let max = try!(self.parse_decimal(|c| c != '}')); + if min > max { + // e.g., a{2,1} + return Err(self.err(ErrorKind::InvalidRepeatRange { + min: min, + max: max, + })); + } + max_opt = Some(max); + } + } + if !self.bump_if('}') { + Err(self.err(ErrorKind::UnclosedRepeat)) + } else { + Ok(Build::Expr(Expr::Repeat { + e: Box::new(e), + r: Repeater::Range { min: min, max: max_opt }, + greedy: !self.bump_if('?') ^ self.flags.swap_greed, + })) + } + } + + // Parses a simple repetition operator, e.g., `a+?z`. + // + // Start: `+` + // End: `z` + // + // N.B. "simple" in this context means "not min/max repetition", + // e.g., `a{1,2}`. 
+ fn parse_simple_repeat(&mut self, rep: Repeater) -> Result { + let e = try!(self.pop(ErrorKind::RepeaterExpectsExpr)); // e.g., (* + if !e.can_repeat() { + // e.g., a** + return Err(self.err(ErrorKind::RepeaterUnexpectedExpr(e))); + } + self.bump(); + Ok(Build::Expr(Expr::Repeat { + e: Box::new(e), + r: rep, + greedy: !self.bump_if('?') ^ self.flags.swap_greed, + })) + } + + // Parses a decimal number until the given character, e.g., `a{123,456}`. + // + // Start: `1` + // End: `,` (where `until == ','`) + fn parse_decimal(&mut self, until: B) -> Result { + match self.bump_get(until) { + // e.g., a{} + None => Err(self.err(ErrorKind::MissingBase10)), + Some(n) => { + // e.g., a{xyz + // e.g., a{9999999999} + let n = n.trim(); + u32::from_str_radix(n, 10) + .map_err(|_| self.err(ErrorKind::InvalidBase10(n.into()))) + } + } + } + + // Parses an octal number, up to 3 digits, e.g., `a\123b` + // + // Start: `1` + // End: `b` + fn parse_octal(&mut self) -> Result { + use std::char; + let mut i = 0; // counter for limiting octal to 3 digits. + let n = self.bump_get(|c| { i += 1; i <= 3 && c >= '0' && c <= '7' }) + .expect("octal string"); // guaranteed at least 1 digit + // I think both of the following unwraps are impossible to fail. + // We limit it to a three digit octal number, which maxes out at + // `0777` or `511` in decimal. Since all digits are in `0...7`, we'll + // always have a valid `u32` number. Moreover, since all numbers in + // the range `0...511` are valid Unicode scalar values, it will always + // be a valid `char`. + // + // Hence, we `unwrap` with reckless abandon. + let n = u32::from_str_radix(&n, 8).ok().expect("valid octal number"); + Ok(Build::Expr(Expr::Literal { + chars: vec![char::from_u32(n).expect("Unicode scalar value")], + casei: self.flags.casei, + })) + } + + // Parses a hex number, e.g., `a\x5ab`. + // + // Start: `5` + // End: `b` + // + // And also, `a\x{2603}b`. 
+ // + // Start: `{` + // End: `b` + fn parse_hex(&mut self) -> Result { + if self.bump_if('{') { + self.parse_hex_many_digits() + } else { + self.parse_hex_two_digits() + } + } + + // Parses a many-digit hex number, e.g., `a\x{2603}b`. + // + // Start: `2` + // End: `b` + fn parse_hex_many_digits(&mut self) -> Result { + use std::char; + + let s = self.bump_get(|c| c != '}').unwrap_or("".into()); + let n = try!(u32::from_str_radix(&s, 16) + .map_err(|_| self.err(ErrorKind::InvalidBase16(s)))); + let c = try!(char::from_u32(n) + .ok_or(self.err(ErrorKind::InvalidScalarValue(n)))); + if !self.bump_if('}') { + // e.g., a\x{d + return Err(self.err(ErrorKind::UnclosedHex)); + } + Ok(Build::Expr(Expr::Literal { + chars: vec![c], + casei: self.flags.casei, + })) + } + + // Parses a two-digit hex number, e.g., `a\x5ab`. + // + // Start: `5` + // End: `b` + fn parse_hex_two_digits(&mut self) -> Result { + use std::char; + + let mut i = 0; + let s = self.bump_get(|_| { i += 1; i <= 2 }).unwrap_or("".into()); + if s.len() < 2 { + // e.g., a\x + // e.g., a\xf + return Err(self.err(ErrorKind::UnexpectedTwoDigitHexEof)); + } + let n = try!(u32::from_str_radix(&s, 16) + .map_err(|_| self.err(ErrorKind::InvalidBase16(s)))); + Ok(Build::Expr(Expr::Literal { + // Because 0...255 are all valid Unicode scalar values. + chars: vec![char::from_u32(n).expect("Unicode scalar value")], + casei: self.flags.casei, + })) + } + + // Parses a character class, e.g., `[^a-zA-Z0-9]+`. 
+ // + // Start: `[` + // End: `+` + fn parse_class(&mut self) -> Result { + self.bump(); + let negated = self.bump_if('^'); + let mut class = CharClass::empty(); + while self.bump_if('-') { + class.ranges.push(ClassRange::one('-')); + } + loop { + if self.eof() { + // e.g., [a + return Err(self.err(ErrorKind::UnexpectedClassEof)); + } + match self.cur() { + // If no ranges have been added, then `]` is the first + // character (sans, perhaps, the `^` symbol), so it should + // be interpreted as a `]` instead of a closing class bracket. + ']' if class.len() > 0 => { self.bump(); break } + '[' => match self.maybe_parse_ascii() { + Some(class2) => class.ranges.extend(class2), + None => { + self.bump(); + try!(self.parse_class_range(&mut class, '[')) + } + }, + '\\' => match try!(self.parse_escape()) { + Build::Expr(Expr::Class(class2)) => { + class.ranges.extend(class2); + } + Build::Expr(Expr::Literal { chars, .. }) => { + try!(self.parse_class_range(&mut class, chars[0])); + } + Build::Expr(e) => { + let err = ErrorKind::InvalidClassEscape(e); + return Err(self.err(err)); + } + // Because `parse_escape` can never return `LeftParen`. + _ => unreachable!(), + }, + start => { + self.bump(); + try!(self.parse_class_range(&mut class, start)); + } + } + } + if negated { + class = class.negate(); + } + Ok(Build::Expr(Expr::Class(class.canonicalize()))) + } + + // Parses a single range in a character class. + // + // Since this is a helper for `parse_class`, its signature sticks out. + // Namely, it requires the start character of the range and the char + // class to mutate. + // + // e.g., `[a-z]` + // + // Start: `-` (with start == `a`) + // End: `]` + fn parse_class_range(&mut self, class: &mut CharClass, start: char) + -> Result<()> { + if !self.bump_if('-') { + // Not a range, so just push a singleton range. 
+ class.ranges.push(ClassRange::one(start)); + return Ok(()); + } + if self.eof() { + // e.g., [a- + return Err(self.err(ErrorKind::UnexpectedClassEof)); + } + if self.peek_is(']') { + // This is the end of the class, so we permit use of `-` as a + // regular char (just like we do in the beginning). + class.ranges.push(ClassRange::one(start)); + class.ranges.push(ClassRange::one('-')); + return Ok(()); + } + + // We have a real range. Just need to check to parse literal and + // make sure it's a valid range. + let end = match self.cur() { + '\\' => match try!(self.parse_escape()) { + Build::Expr(Expr::Literal { chars, .. }) => chars[0], + Build::Expr(e) => { + return Err(self.err(ErrorKind::InvalidClassEscape(e))); + } + // Because `parse_escape` can never return `LeftParen`. + _ => unreachable!(), + }, + _ => self.bump(), + }; + if end < start { + // e.g., [z-a] + return Err(self.err(ErrorKind::InvalidClassRange { + start: start, + end: end, + })); + } + class.ranges.push(ClassRange::new(start, end)); + Ok(()) + } + + // Parses an ASCII class, e.g., `[:alnum:]+`. + // + // Start: `[` + // End: `+` + // + // Also supports negation, e.g., `[:^alnum:]`. + // + // This parsing routine is distinct from the others in that it doesn't + // actually report any errors. Namely, if it fails, then the parser should + // fall back to parsing a regular class. + // + // This method will only make progress in the parser if it succeeds. + // Otherwise, the input remains where it started. 
+ fn maybe_parse_ascii(&mut self) -> Option { + fn parse(p: &mut Parser) -> Option { + p.bump(); // the `[` + if !p.bump_if(':') { return None; } + let negate = p.bump_if('^'); + let name = match p.bump_get(|c| c != ':') { + None => return None, + Some(name) => name, + }; + if !p.bump_if(":]") { return None; } + ascii_class(&name).map(|c| if !negate { c } else { c.negate() }) + } + let start = self.chari; + match parse(self) { + None => { self.chari = start; None } + result => result, + } + } + + // Parses a Uncode class name, e.g., `a\pLb`. + // + // Start: `L` + // End: `b` + // + // And also, `a\p{Greek}b`. + // + // Start: `{` + // End: `b` + // + // `negate` is true when the class name is used with `\P`. + fn parse_unicode_class(&mut self, neg: bool) -> Result { + let name = + if self.bump_if('{') { + let n = self.bump_get(|c| c != '}').unwrap_or("".into()); + if n.is_empty() || !self.bump_if('}') { + // e.g., \p{Greek + return Err(self.err(ErrorKind::UnclosedUnicodeName)); + } + n + } else { + if self.eof() { + // e.g., \p + return Err(self.err(ErrorKind::UnexpectedEscapeEof)); + } + self.bump().to_string() + }; + match unicode_class(&name) { + None => Err(self.err(ErrorKind::UnrecognizedUnicodeClass(name))), + Some(cls) => if neg { Ok(cls.negate()) } else { Ok(cls) }, + } + } + + // Parses a perl character class with Unicode support. + // + // `name` must be one of d, s, w, D, S, W. If not, this function panics. + // + // No parser state is changed. + fn parse_perl_class(&mut self, name: char) -> CharClass { + use unicode::regex::{PERLD, PERLS, PERLW}; + match name { + 'd' => raw_class_to_expr(PERLD), + 'D' => raw_class_to_expr(PERLD).negate(), + 's' => raw_class_to_expr(PERLS), + 'S' => raw_class_to_expr(PERLS).negate(), + 'w' => raw_class_to_expr(PERLW), + 'W' => raw_class_to_expr(PERLW).negate(), + _ => unreachable!(), + } + } + + // Always bump to the next input and return the given expression as a + // `Build`. 
+ // + // This is mostly for convenience when the surrounding context implies + // that the next character corresponds to the given expression. + fn parse_one(&mut self, e: Expr) -> Build { + self.bump(); + Build::Expr(e) + } +} + +// Auxiliary helper methods. +impl Parser { + fn chars(&self) -> Chars { + Chars::new(&self.chars[self.chari..], self.flags.ignore_space) + } + + fn bump(&mut self) -> char { + let c = self.cur(); + self.chari = checkadd(self.chari, self.chars().next_count()); + c + } + + fn cur(&self) -> char { self.chars().next().unwrap() } + + fn eof(&self) -> bool { self.chars().next().is_none() } + + fn bump_get(&mut self, s: B) -> Option { + let n = s.match_end(self); + if n == 0 { + None + } else { + let end = checkadd(self.chari, n); + let s = self.chars[self.chari..end] + .iter().cloned().collect::(); + self.chari = end; + Some(s) + } + } + + fn bump_if(&mut self, s: B) -> bool { + let n = s.match_end(self); + if n == 0 { + false + } else { + self.chari = checkadd(self.chari, n); + true + } + } + + fn peek_is(&self, s: B) -> bool { + s.match_end(self) > 0 + } + + fn err(&self, kind: ErrorKind) -> Error { + self.errat(self.chari, kind) + } + + fn errat(&self, pos: usize, kind: ErrorKind) -> Error { + Error { pos: pos, surround: self.windowat(pos), kind: kind } + } + + fn windowat(&self, pos: usize) -> String { + let s = max(5, pos) - 5; + let e = min(self.chars.len(), checkadd(pos, 5)); + self.chars[s..e].iter().cloned().collect() + } + + fn pop(&mut self, expected: ErrorKind) -> Result { + match self.stack.pop() { + None | Some(Build::LeftParen{..}) => Err(self.err(expected)), + Some(Build::Expr(e)) => Ok(e), + } + } + + // If the current contexts calls for case insensitivity and if the expr + // given is a character class, do case folding on it and return the new + // class. + // + // Otherwise, return the expression unchanged. 
+ fn maybe_class_case_fold(&mut self, bexpr: Build) -> Build { + match bexpr { + Build::Expr(Expr::Class(cls)) => { + Build::Expr(Expr::Class( + if self.flags.casei && !cls.casei { + cls.case_fold() + } else { + cls + } + )) + } + bexpr => bexpr, + } + } +} + +struct Chars<'a> { + chars: &'a [char], + cur: usize, + ignore_space: bool, + in_comment: bool, +} + +impl<'a> Iterator for Chars<'a> { + type Item = char; + fn next(&mut self) -> Option { + self.skip(); + if self.cur < self.chars.len() { + let c = self.chars[self.cur]; + self.cur = checkadd(self.cur, 1); + Some(c) + } else { + None + } + } +} + +impl<'a> Chars<'a> { + fn new(chars: &[char], ignore_space: bool) -> Chars { + Chars { + chars: chars, + cur: 0, + ignore_space: ignore_space, + in_comment: false, + } + } + + fn skip(&mut self) { + if !self.ignore_space { return; } + while self.cur < self.chars.len() { + if !self.in_comment && self.c() == '#' { + self.in_comment = true; + } else if self.in_comment && self.c() == '\n' { + self.in_comment = false; + } + if self.in_comment || self.c().is_whitespace() { + self.cur = checkadd(self.cur, 1); + } else { + break; + } + } + } + + fn c(&self) -> char { + self.chars[self.cur] + } + + fn next_count(&mut self) -> usize { + self.next(); + self.cur + } +} + +// Auxiliary methods for manipulating the expression stack. +impl Parser { + // Called whenever an alternate (`|`) is found. + // + // This pops the expression stack until: + // + // 1. The stack is empty. Pushes an alternation with one arm. + // 2. An opening parenthesis is found. Leave the parenthesis + // on the stack and push an alternation with one arm. + // 3. An alternate (`|`) is found. Pop the existing alternation, + // add an arm and push the modified alternation. + // + // Each "arm" in the above corresponds to the concatenation of all + // popped expressions. + // + // In the first two cases, the stack is left in an invalid state + // because an alternation with one arm is not allowed. 
This + // particular state will be detected by `finish_concat` and an + // error will be reported. + // + // In none of the cases is an empty arm allowed. If an empty arm + // is found, an error is reported. + fn alternate(&mut self) -> Result { + let mut concat = vec![]; + let alts = |es| Ok(Build::Expr(Expr::Alternate(es))); + loop { + match self.stack.pop() { + None => { + if concat.is_empty() { + // e.g., |a + return Err(self.err(ErrorKind::EmptyAlternate)); + } + return alts(vec![rev_concat(concat)]); + } + Some(e @ Build::LeftParen{..}) => { + if concat.is_empty() { + // e.g., (|a) + return Err(self.err(ErrorKind::EmptyAlternate)); + } + self.stack.push(e); + return alts(vec![rev_concat(concat)]); + } + Some(Build::Expr(Expr::Alternate(mut es))) => { + if concat.is_empty() { + // e.g., a|| + return Err(self.err(ErrorKind::EmptyAlternate)); + } + es.push(rev_concat(concat)); + return alts(es); + } + Some(Build::Expr(e)) => { concat.push(e); } + } + } + } + + // Called whenever a closing parenthesis (`)`) is found. + // + // This pops the expression stack until: + // + // 1. The stack is empty. An error is reported because this + // indicates an unopened parenthesis. + // 2. An opening parenthesis is found. Pop the opening parenthesis + // and push a `Group` expression. + // 3. An alternate (`|`) is found. Pop the existing alternation + // and an arm to it in place. Pop one more item from the stack. + // If the stack was empty, then report an unopened parenthesis + // error, otherwise assume it is an opening parenthesis and + // push a `Group` expression with the popped alternation. + // (We can assume this is an opening parenthesis because an + // alternation either corresponds to the entire Regex or it + // corresponds to an entire group. This is guaranteed by the + // `alternate` method.) + // + // Each "arm" in the above corresponds to the concatenation of all + // popped expressions. + // + // Empty arms nor empty groups are allowed. 
+ fn close_paren(&mut self) -> Result<(Flags, Build)> { + let mut concat = vec![]; + loop { + match self.stack.pop() { + // e.g., ) + None => return Err(self.err(ErrorKind::UnopenedParen)), + Some(Build::LeftParen { i, name, old_flags, .. }) => { + if concat.is_empty() { + // e.g., () + return Err(self.err(ErrorKind::EmptyGroup)); + } + return Ok((old_flags, Build::Expr(Expr::Group { + e: Box::new(rev_concat(concat)), + i: i, + name: name, + }))); + } + Some(Build::Expr(Expr::Alternate(mut es))) => { + if concat.is_empty() { + // e.g., (a|) + return Err(self.err(ErrorKind::EmptyAlternate)); + } + es.push(rev_concat(concat)); + match self.stack.pop() { + // e.g., a|b) + None => return Err(self.err(ErrorKind::UnopenedParen)), + Some(Build::Expr(_)) => unreachable!(), + Some(Build::LeftParen { i, name, old_flags, .. }) => { + return Ok((old_flags, Build::Expr(Expr::Group { + e: Box::new(Expr::Alternate(es)), + i: i, + name: name, + }))); + } + } + } + Some(Build::Expr(e)) => { concat.push(e); } + } + } + } + + // Called only when the parser reaches the end of input. + // + // This pops the expression stack until: + // + // 1. The stack is empty. Return concatenation of popped + // expressions. This concatenation may be empty! + // 2. An alternation is found. Pop the alternation and push + // a new arm. Return the alternation as the entire Regex. + // + // If an opening parenthesis is popped, then an error is + // returned since it indicates an unclosed parenthesis. 
+ fn finish_concat(&mut self) -> Result { + let mut concat = vec![]; + loop { + match self.stack.pop() { + None => { return Ok(rev_concat(concat)); } + Some(Build::LeftParen{ chari, ..}) => { + // e.g., a(b + return Err(self.errat(chari, ErrorKind::UnclosedParen)); + } + Some(Build::Expr(Expr::Alternate(mut es))) => { + if concat.is_empty() { + // e.g., a| + return Err(self.err(ErrorKind::EmptyAlternate)); + } + es.push(rev_concat(concat)); + return Ok(Expr::Alternate(es)); + } + Some(Build::Expr(e)) => { concat.push(e); } + } + } + } +} + +impl Build { + fn is_empty(&self) -> bool { + match *self { + Build::Expr(Expr::Empty) => true, + _ => false, + } + } +} + +// Make it ergonomic to conditionally bump the parser. +// i.e., `bump_if('a')` or `bump_if("abc")`. +trait Bumpable { + fn match_end(self, p: &Parser) -> usize; +} + +impl Bumpable for char { + fn match_end(self, p: &Parser) -> usize { + let mut chars = p.chars(); + if chars.next().map(|c| c == self).unwrap_or(false) { + chars.cur + } else { + 0 + } + } +} + +impl<'a> Bumpable for &'a str { + fn match_end(self, p: &Parser) -> usize { + let mut search = self.chars(); + let mut rest = p.chars(); + let mut count = 0; + loop { + match (rest.next(), search.next()) { + (Some(c1), Some(c2)) if c1 == c2 => count = rest.cur, + (_, None) => return count, + _ => return 0, + } + } + } +} + +impl bool> Bumpable for F { + fn match_end(mut self, p: &Parser) -> usize { + let mut chars = p.chars(); + let mut count = 0; + while let Some(c) = chars.next() { + if !self(c) { + break + } + count = chars.cur; + } + count + } +} + +// Turn a sequence of expressions into a concatenation. +// This only uses `Concat` if there are 2 or more expressions. +fn rev_concat(mut exprs: Vec) -> Expr { + if exprs.len() == 0 { + Expr::Empty + } else if exprs.len() == 1 { + exprs.pop().unwrap() + } else { + exprs.reverse(); + Expr::Concat(exprs) + } +} + +// Returns ture iff the given character is allowed in a capture name. 
+// Note that the first char of a capture name must not be numeric. +fn is_valid_capture_char(c: char) -> bool { + c == '_' || (c >= '0' && c <= '9') + || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') +} + +/// Returns true iff the give character has significance in a regex. +#[doc(hidden)] +pub fn is_punct(c: char) -> bool { + match c { + '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | + '[' | ']' | '{' | '}' | '^' | '$' => true, + _ => false, + } +} + +fn checkadd(x: usize, y: usize) -> usize { + x.checked_add(y).expect("regex length overflow") +} + +fn unicode_class(name: &str) -> Option { + UNICODE_CLASSES.binary_search_by(|&(s, _)| s.cmp(name)).ok().map(|i| { + raw_class_to_expr(UNICODE_CLASSES[i].1) + }) +} + +fn ascii_class(name: &str) -> Option { + ASCII_CLASSES.binary_search_by(|&(s, _)| s.cmp(name)).ok().map(|i| { + raw_class_to_expr(ASCII_CLASSES[i].1) + }) +} + +fn raw_class_to_expr(raw: &[(char, char)]) -> CharClass { + let range = |&(s, e)| ClassRange { start: s, end: e }; + CharClass::new(raw.iter().map(range).collect()) +} + +type Class = &'static [(char, char)]; +type NamedClasses = &'static [(&'static str, Class)]; + +const ASCII_CLASSES: NamedClasses = &[ + // Classes must be in alphabetical order so that bsearch works. 
+ // [:alnum:] alphanumeric (== [0-9A-Za-z]) + // [:alpha:] alphabetic (== [A-Za-z]) + // [:ascii:] ASCII (== [\x00-\x7F]) + // [:blank:] blank (== [\t ]) + // [:cntrl:] control (== [\x00-\x1F\x7F]) + // [:digit:] digits (== [0-9]) + // [:graph:] graphical (== [!-~]) + // [:lower:] lower case (== [a-z]) + // [:print:] printable (== [ -~] == [ [:graph:]]) + // [:punct:] punctuation (== [!-/:-@[-`{-~]) + // [:space:] whitespace (== [\t\n\v\f\r ]) + // [:upper:] upper case (== [A-Z]) + // [:word:] word characters (== [0-9A-Za-z_]) + // [:xdigit:] hex digit (== [0-9A-Fa-f]) + // Taken from: http://golang.org/pkg/regex/syntax/ + ("alnum", &ALNUM), + ("alpha", &ALPHA), + ("ascii", &ASCII), + ("blank", &BLANK), + ("cntrl", &CNTRL), + ("digit", &DIGIT), + ("graph", &GRAPH), + ("lower", &LOWER), + ("print", &PRINT), + ("punct", &PUNCT), + ("space", &SPACE), + ("upper", &UPPER), + ("word", &WORD), + ("xdigit", &XDIGIT), +]; + +const ALNUM: Class = &[('0', '9'), ('A', 'Z'), ('a', 'z')]; +const ALPHA: Class = &[('A', 'Z'), ('a', 'z')]; +const ASCII: Class = &[('\x00', '\x7F')]; +const BLANK: Class = &[(' ', ' '), ('\t', '\t')]; +const CNTRL: Class = &[('\x00', '\x1F'), ('\x7F', '\x7F')]; +const DIGIT: Class = &[('0', '9')]; +const GRAPH: Class = &[('!', '~')]; +const LOWER: Class = &[('a', 'z')]; +const PRINT: Class = &[(' ', '~')]; +const PUNCT: Class = &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')]; +const SPACE: Class = &[('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'), + ('\x0C', '\x0C'), ('\r', '\r'), (' ', ' ')]; +const UPPER: Class = &[('A', 'Z')]; +const WORD: Class = &[('0', '9'), ('A', 'Z'), ('a', 'z'), ('_', '_')]; +const XDIGIT: Class = &[('0', '9'), ('A', 'F'), ('a', 'f')]; + +#[cfg(test)] +mod tests { + use { CharClass, ClassRange, Expr, Repeater, ErrorKind }; + use unicode::regex::{PERLD, PERLS, PERLW}; + use super::Parser; + use super::{LOWER, UPPER}; + + static YI: &'static [(char, char)] = &[ + ('\u{a000}', '\u{a48c}'), ('\u{a490}', '\u{a4c6}'), + ]; + + fn 
p(s: &str) -> Expr { Parser::parse(s).unwrap() } + fn lit(c: char) -> Expr { Expr::Literal { chars: vec![c], casei: false } } + fn liti(c: char) -> Expr { Expr::Literal { chars: vec![c], casei: true } } + fn b(v: T) -> Box { Box::new(v) } + fn c(es: &[Expr]) -> Expr { Expr::Concat(es.to_vec()) } + + fn class(ranges: &[(char, char)]) -> CharClass { + let ranges = ranges.iter().cloned() + .map(|(c1, c2)| ClassRange::new(c1, c2)).collect(); + CharClass::new(ranges) + } + + fn classes(classes: &[&[(char, char)]]) -> CharClass { + let mut cls = CharClass::empty(); + for &ranges in classes { + cls.ranges.extend(class(ranges)); + } + cls.canonicalize() + } + + #[test] + fn empty() { + assert_eq!(p(""), Expr::Empty); + } + + #[test] + fn literal() { + assert_eq!(p("a"), lit('a')); + } + + #[test] + fn literal_string() { + assert_eq!(p("ab"), Expr::Concat(vec![lit('a'), lit('b')])); + } + + #[test] + fn start_literal() { + assert_eq!(p("^a"), Expr::Concat(vec![ + Expr::StartText, + Expr::Literal { chars: vec!['a'], casei: false }, + ])); + } + + #[test] + fn repeat_zero_or_one_greedy() { + assert_eq!(p("a?"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::ZeroOrOne, + greedy: true, + }); + } + + #[test] + fn repeat_zero_or_one_greedy_concat() { + assert_eq!(p("ab?"), Expr::Concat(vec![ + lit('a'), + Expr::Repeat { + e: b(lit('b')), + r: Repeater::ZeroOrOne, + greedy: true, + }, + ])); + } + + #[test] + fn repeat_zero_or_one_nongreedy() { + assert_eq!(p("a??"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::ZeroOrOne, + greedy: false, + }); + } + + #[test] + fn repeat_one_or_more_greedy() { + assert_eq!(p("a+"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::OneOrMore, + greedy: true, + }); + } + + #[test] + fn repeat_one_or_more_nongreedy() { + assert_eq!(p("a+?"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::OneOrMore, + greedy: false, + }); + } + + #[test] + fn repeat_zero_or_more_greedy() { + assert_eq!(p("a*"), Expr::Repeat { + e: b(lit('a')), + r: 
Repeater::ZeroOrMore, + greedy: true, + }); + } + + #[test] + fn repeat_zero_or_more_nongreedy() { + assert_eq!(p("a*?"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::ZeroOrMore, + greedy: false, + }); + } + + #[test] + fn repeat_counted_exact() { + assert_eq!(p("a{5}"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::Range { min: 5, max: Some(5) }, + greedy: true, + }); + } + + #[test] + fn repeat_counted_min() { + assert_eq!(p("a{5,}"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::Range { min: 5, max: None }, + greedy: true, + }); + } + + #[test] + fn repeat_counted_min_max() { + assert_eq!(p("a{5,10}"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::Range { min: 5, max: Some(10) }, + greedy: true, + }); + } + + #[test] + fn repeat_counted_exact_nongreedy() { + assert_eq!(p("a{5}?"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::Range { min: 5, max: Some(5) }, + greedy: false, + }); + } + + #[test] + fn repeat_counted_min_nongreedy() { + assert_eq!(p("a{5,}?"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::Range { min: 5, max: None }, + greedy: false, + }); + } + + #[test] + fn repeat_counted_min_max_nongreedy() { + assert_eq!(p("a{5,10}?"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::Range { min: 5, max: Some(10) }, + greedy: false, + }); + } + + #[test] + fn repeat_counted_whitespace() { + assert_eq!(p("a{ 5 }"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::Range { min: 5, max: Some(5) }, + greedy: true, + }); + assert_eq!(p("a{ 5 , 10 }"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::Range { min: 5, max: Some(10) }, + greedy: true, + }); + } + + #[test] + fn group_literal() { + assert_eq!(p("(a)"), Expr::Group { + e: b(lit('a')), + i: Some(1), + name: None, + }); + } + + #[test] + fn group_literal_concat() { + assert_eq!(p("(ab)"), Expr::Group { + e: b(c(&[lit('a'), lit('b')])), + i: Some(1), + name: None, + }); + } + + #[test] + fn alt_two() { + assert_eq!(p("a|b"), Expr::Alternate(vec![lit('a'), lit('b')])); + } + + #[test] + fn 
alt_many() { + assert_eq!(p("a|b|c"), Expr::Alternate(vec![ + lit('a'), lit('b'), lit('c'), + ])); + } + + #[test] + fn alt_many_concat() { + assert_eq!(p("ab|bc|cd"), Expr::Alternate(vec![ + c(&[lit('a'), lit('b')]), + c(&[lit('b'), lit('c')]), + c(&[lit('c'), lit('d')]), + ])); + } + + #[test] + fn alt_group_two() { + assert_eq!(p("(a|b)"), Expr::Group { + e: b(Expr::Alternate(vec![lit('a'), lit('b')])), + i: Some(1), + name: None, + }); + } + + #[test] + fn alt_group_many() { + assert_eq!(p("(a|b|c)"), Expr::Group { + e: b(Expr::Alternate(vec![lit('a'), lit('b'), lit('c')])), + i: Some(1), + name: None, + }); + } + + #[test] + fn alt_group_many_concat() { + assert_eq!(p("(ab|bc|cd)"), Expr::Group { + e: b(Expr::Alternate(vec![ + c(&[lit('a'), lit('b')]), + c(&[lit('b'), lit('c')]), + c(&[lit('c'), lit('d')]), + ])), + i: Some(1), + name: None, + }); + } + + #[test] + fn alt_group_nested() { + assert_eq!(p("(ab|(bc|(cd)))"), Expr::Group { + e: b(Expr::Alternate(vec![ + c(&[lit('a'), lit('b')]), + Expr::Group { + e: b(Expr::Alternate(vec![ + c(&[lit('b'), lit('c')]), + Expr::Group { + e: b(c(&[lit('c'), lit('d')])), + i: Some(3), + name: None, + } + ])), + i: Some(2), + name: None, + }, + ])), + i: Some(1), + name: None, + }); + } + + #[test] + fn group_name() { + assert_eq!(p("(?Pa)"), Expr::Group { + e: b(lit('a')), + i: Some(1), + name: Some("foo".into()), + }); + } + + #[test] + fn group_no_capture() { + assert_eq!(p("(?:a)"), Expr::Group { + e: b(lit('a')), + i: None, + name: None, + }); + } + + #[test] + fn group_flags() { + assert_eq!(p("(?i:a)"), Expr::Group { + e: b(liti('a')), + i: None, + name: None, + }); + } + + #[test] + fn group_flags_returned() { + assert_eq!(p("(?i:a)a"), c(&[ + Expr::Group { + e: b(liti('a')), + i: None, + name: None, + }, + lit('a'), + ])); + } + + #[test] + fn group_flags_retained() { + assert_eq!(p("(?i)(?-i:a)a"), c(&[ + Expr::Group { + e: b(lit('a')), + i: None, + name: None, + }, + liti('a'), + ])); + } + + #[test] + fn 
flags_inline() { + assert_eq!(p("(?i)a"), liti('a')); + } + + #[test] + fn flags_inline_multiple() { + assert_eq!(p("(?is)a."), c(&[liti('a'), Expr::AnyChar])); + } + + #[test] + fn flags_inline_multiline() { + assert_eq!(p("(?m)^(?-m)$"), c(&[Expr::StartLine, Expr::EndText])); + } + + #[test] + fn flags_inline_swap_greed() { + assert_eq!(p("(?U)a*a*?(?i-U)a*a*?"), c(&[ + Expr::Repeat { + e: b(lit('a')), + r: Repeater::ZeroOrMore, + greedy: false, + }, + Expr::Repeat { + e: b(lit('a')), + r: Repeater::ZeroOrMore, + greedy: true, + }, + Expr::Repeat { + e: b(liti('a')), + r: Repeater::ZeroOrMore, + greedy: true, + }, + Expr::Repeat { + e: b(liti('a')), + r: Repeater::ZeroOrMore, + greedy: false, + }, + ])); + } + + #[test] + fn flags_inline_multiple_negate_one() { + assert_eq!(p("(?is)a.(?i-s)a."), c(&[ + liti('a'), Expr::AnyChar, liti('a'), Expr::AnyCharNoNL, + ])); + } + + #[test] + fn flags_inline_negate() { + assert_eq!(p("(?i)a(?-i)a"), c(&[liti('a'), lit('a')])); + } + + #[test] + fn flags_group_inline() { + assert_eq!(p("(a(?i)a)a"), c(&[ + Expr::Group { + e: b(c(&[lit('a'), liti('a')])), + i: Some(1), + name: None, + }, + lit('a'), + ])); + } + + #[test] + fn flags_group_inline_retain() { + assert_eq!(p("(?i)((?-i)a)a"), c(&[ + Expr::Group { + e: b(lit('a')), + i: Some(1), + name: None, + }, + liti('a'), + ])); + } + + #[test] + fn escape_simple() { + assert_eq!(p(r"\a\f\t\n\r\v"), c(&[ + lit('\x07'), lit('\x0C'), lit('\t'), + lit('\n'), lit('\r'), lit('\x0B'), + ])); + } + + #[test] + fn escape_boundaries() { + assert_eq!(p(r"\A\z\b\B"), c(&[ + Expr::StartText, Expr::EndText, + Expr::WordBoundary, Expr::NotWordBoundary, + ])); + } + + #[test] + fn escape_punctuation() { + assert_eq!(p(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$"), c(&[ + lit('\\'), lit('.'), lit('+'), lit('*'), lit('?'), + lit('('), lit(')'), lit('|'), lit('['), lit(']'), + lit('{'), lit('}'), lit('^'), lit('$'), + ])); + } + + #[test] + fn escape_octal() { + assert_eq!(p(r"\123"), lit('S')); + 
assert_eq!(p(r"\1234"), c(&[lit('S'), lit('4')])); + } + + #[test] + fn escape_hex2() { + assert_eq!(p(r"\x53"), lit('S')); + assert_eq!(p(r"\x534"), c(&[lit('S'), lit('4')])); + } + + #[test] + fn escape_hex() { + assert_eq!(p(r"\x{53}"), lit('S')); + assert_eq!(p(r"\x{53}4"), c(&[lit('S'), lit('4')])); + assert_eq!(p(r"\x{2603}"), lit('\u{2603}')); + } + + #[test] + fn escape_unicode_name() { + assert_eq!(p(r"\p{Yi}"), Expr::Class(class(YI))); + } + + #[test] + fn escape_unicode_letter() { + assert_eq!(p(r"\pZ"), Expr::Class(class(&[ + ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), + ]))); + } + + #[test] + fn escape_unicode_name_case_fold() { + assert_eq!(p(r"(?i)\p{Yi}"), Expr::Class(class(YI).case_fold())); + } + + #[test] + fn escape_unicode_letter_case_fold() { + assert_eq!(p(r"(?i)\pZ"), Expr::Class(class(&[ + ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), + ]).case_fold())); + } + + #[test] + fn escape_unicode_name_negate() { + assert_eq!(p(r"\P{Yi}"), Expr::Class(class(YI).negate())); + } + + #[test] + fn escape_unicode_letter_negate() { + assert_eq!(p(r"\PZ"), Expr::Class(class(&[ + ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), + ]).negate())); + } + + #[test] + fn escape_unicode_name_negate_case_fold() { + assert_eq!(p(r"(?i)\P{Yi}"), + Expr::Class(class(YI).negate().case_fold())); + } + + #[test] + fn escape_unicode_letter_negate_case_fold() { + assert_eq!(p(r"(?i)\PZ"), Expr::Class(class(&[ + ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), ('\u{2000}', 
'\u{200a}'), + ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), + ]).negate().case_fold())); + } + + #[test] + fn escape_perl_d() { + assert_eq!(p(r"\d"), Expr::Class(class(PERLD))); + } + + #[test] + fn escape_perl_s() { + assert_eq!(p(r"\s"), Expr::Class(class(PERLS))); + } + + #[test] + fn escape_perl_w() { + assert_eq!(p(r"\w"), Expr::Class(class(PERLW))); + } + + #[test] + fn escape_perl_d_negate() { + assert_eq!(p(r"\D"), Expr::Class(class(PERLD).negate())); + } + + #[test] + fn escape_perl_s_negate() { + assert_eq!(p(r"\S"), Expr::Class(class(PERLS).negate())); + } + + #[test] + fn escape_perl_w_negate() { + assert_eq!(p(r"\W"), Expr::Class(class(PERLW).negate())); + } + + #[test] + fn escape_perl_d_case_fold() { + assert_eq!(p(r"(?i)\d"), Expr::Class(class(PERLD).case_fold())); + } + + #[test] + fn escape_perl_s_case_fold() { + assert_eq!(p(r"(?i)\s"), Expr::Class(class(PERLS).case_fold())); + } + + #[test] + fn escape_perl_w_case_fold() { + assert_eq!(p(r"(?i)\w"), Expr::Class(class(PERLW).case_fold())); + } + + #[test] + fn escape_perl_d_case_fold_negate() { + assert_eq!(p(r"(?i)\D"), + Expr::Class(class(PERLD).negate().case_fold())); + } + + #[test] + fn escape_perl_s_case_fold_negate() { + assert_eq!(p(r"(?i)\S"), + Expr::Class(class(PERLS).negate().case_fold())); + } + + #[test] + fn escape_perl_w_case_fold_negate() { + assert_eq!(p(r"(?i)\W"), + Expr::Class(class(PERLW).negate().case_fold())); + } + + #[test] + fn class_singleton() { + assert_eq!(p(r"[a]"), Expr::Class(class(&[('a', 'a')]))); + assert_eq!(p(r"[\x00]"), Expr::Class(class(&[('\x00', '\x00')]))); + assert_eq!(p(r"[\n]"), Expr::Class(class(&[('\n', '\n')]))); + assert_eq!(p("[\n]"), Expr::Class(class(&[('\n', '\n')]))); + } + + #[test] + fn class_singleton_negate() { + assert_eq!(p(r"[^a]"), Expr::Class(class(&[ + ('\x00', '\x60'), ('\x62', '\u{10FFFF}'), + ]))); + assert_eq!(p(r"[^\x00]"), Expr::Class(class(&[ + ('\x01', 
'\u{10FFFF}'), + ]))); + assert_eq!(p(r"[^\n]"), Expr::Class(class(&[ + ('\x00', '\x09'), ('\x0b', '\u{10FFFF}'), + ]))); + assert_eq!(p("[^\n]"), Expr::Class(class(&[ + ('\x00', '\x09'), ('\x0b', '\u{10FFFF}'), + ]))); + } + + #[test] + fn class_singleton_class() { + assert_eq!(p(r"[\d]"), Expr::Class(class(PERLD))); + assert_eq!(p(r"[\p{Yi}]"), Expr::Class(class(YI))); + } + + #[test] + fn class_singleton_class_negate() { + assert_eq!(p(r"[^\d]"), Expr::Class(class(PERLD).negate())); + assert_eq!(p(r"[^\w]"), Expr::Class(class(PERLW).negate())); + assert_eq!(p(r"[^\s]"), Expr::Class(class(PERLS).negate())); + } + + #[test] + fn class_singleton_class_negate_negate() { + assert_eq!(p(r"[^\D]"), Expr::Class(class(PERLD))); + assert_eq!(p(r"[^\W]"), Expr::Class(class(PERLW))); + assert_eq!(p(r"[^\S]"), Expr::Class(class(PERLS))); + } + + #[test] + fn class_singleton_class_casei() { + assert_eq!(p(r"(?i)[\d]"), Expr::Class(class(PERLD).case_fold())); + assert_eq!(p(r"(?i)[\p{Yi}]"), Expr::Class(class(YI).case_fold())); + } + + #[test] + fn class_singleton_class_negate_casei() { + assert_eq!(p(r"(?i)[^\d]"), + Expr::Class(class(PERLD).negate().case_fold())); + assert_eq!(p(r"(?i)[^\w]"), + Expr::Class(class(PERLW).negate().case_fold())); + assert_eq!(p(r"(?i)[^\s]"), + Expr::Class(class(PERLS).negate().case_fold())); + } + + #[test] + fn class_singleton_class_negate_negate_casei() { + assert_eq!(p(r"(?i)[^\D]"), Expr::Class(class(PERLD).case_fold())); + assert_eq!(p(r"(?i)[^\W]"), Expr::Class(class(PERLW).case_fold())); + assert_eq!(p(r"(?i)[^\S]"), Expr::Class(class(PERLS).case_fold())); + } + + #[test] + fn class_multiple_class() { + assert_eq!(p(r"[\d\p{Yi}]"), Expr::Class(classes(&[ + PERLD, YI, + ]))); + } + + #[test] + fn class_multiple_class_negate() { + assert_eq!(p(r"[^\d\p{Yi}]"), Expr::Class(classes(&[ + PERLD, YI, + ]).negate())); + } + + #[test] + fn class_multiple_class_negate_negate() { + let nperld = class(PERLD).negate(); + let nyi = 
class(YI).negate(); + let cls = CharClass::empty().merge(nperld).merge(nyi); + assert_eq!(p(r"[^\D\P{Yi}]"), Expr::Class(cls.negate())); + } + + #[test] + fn class_multiple_class_casei() { + assert_eq!(p(r"(?i)[\d\p{Yi}]"), Expr::Class(classes(&[ + PERLD, YI, + ]).case_fold())); + } + + #[test] + fn class_multiple_class_negate_casei() { + assert_eq!(p(r"(?i)[^\d\p{Yi}]"), Expr::Class(classes(&[ + PERLD, YI, + ]).negate().case_fold())); + } + + #[test] + fn class_multiple_class_negate_negate_casei() { + let nperld = class(PERLD).negate(); + let nyi = class(YI).negate(); + let class = CharClass::empty().merge(nperld).merge(nyi); + assert_eq!(p(r"(?i)[^\D\P{Yi}]"), + Expr::Class(class.negate().case_fold())); + } + + #[test] + fn class_class_hypen() { + assert_eq!(p(r"[\p{Yi}-]"), Expr::Class(classes(&[ + &[('-', '-')], YI, + ]))); + assert_eq!(p(r"[\p{Yi}-a]"), Expr::Class(classes(&[ + &[('-', '-')], &[('a', 'a')], YI, + ]))); + } + + #[test] + fn class_brackets() { + assert_eq!(p("[]]"), Expr::Class(class(&[(']', ']')]))); + assert_eq!(p("[][]"), Expr::Class(class(&[('[', '['), (']', ']')]))); + assert_eq!(p("[[]]"), Expr::Concat(vec![ + Expr::Class(class(&[('[', '[')])), + lit(']'), + ])); + } + + #[test] + fn class_brackets_hypen() { + assert_eq!(p("[]-]"), Expr::Class(class(&[('-', '-'), (']', ']')]))); + assert_eq!(p("[-]]"), Expr::Concat(vec![ + Expr::Class(class(&[('-', '-')])), + lit(']'), + ])); + } + + #[test] + fn class_overlapping() { + assert_eq!(p("[a-fd-h]"), Expr::Class(class(&[('a', 'h')]))); + assert_eq!(p("[a-fg-m]"), Expr::Class(class(&[('a', 'm')]))); + } + + #[test] + fn ascii_class() { + assert_eq!(p("[:upper:]"), Expr::Class(class(UPPER))); + assert_eq!(p("[[:upper:]]"), Expr::Class(class(UPPER))); + } + + #[test] + fn ascii_class_not() { + assert_eq!(p("[:abc:]"), + Expr::Class(class(&[(':', ':'), ('a', 'c')]))); + } + + #[test] + fn ascii_class_multiple() { + assert_eq!(p("[[:lower:][:upper:]]"), + Expr::Class(classes(&[UPPER, LOWER]))); + } 
+ + #[test] + fn ascii_class_negate() { + assert_eq!(p("[[:^upper:]]"), Expr::Class(class(UPPER).negate())); + assert_eq!(p("[^[:^upper:]]"), Expr::Class(class(UPPER))); + } + + #[test] + fn ascii_class_negate_multiple() { + let (nlower, nupper) = (class(LOWER).negate(), class(UPPER).negate()); + let cls = CharClass::empty().merge(nlower).merge(nupper); + assert_eq!(p("[[:^lower:][:^upper:]]"), Expr::Class(cls.clone())); + assert_eq!(p("[^[:^lower:][:^upper:]]"), Expr::Class(cls.negate())); + } + + #[test] + fn ascii_class_case_fold() { + assert_eq!(p("(?i)[:upper:]"), Expr::Class(class(UPPER).case_fold())); + assert_eq!(p("(?i)[[:upper:]]"), + Expr::Class(class(UPPER).case_fold())); + } + + #[test] + fn ascii_class_negate_case_fold() { + assert_eq!(p("(?i)[[:^upper:]]"), + Expr::Class(class(UPPER).negate().case_fold())); + assert_eq!(p("(?i)[^[:^upper:]]"), + Expr::Class(class(UPPER).case_fold())); + } + + #[test] + fn ignore_space_literal() { + assert_eq!(p("(?x) a b c"), Expr::Concat(vec![ + lit('a'), lit('b'), lit('c'), + ])); + } + + #[test] + fn ignore_space_literal_off() { + assert_eq!(p("(?x) a b c(?-x) a"), Expr::Concat(vec![ + lit('a'), lit('b'), lit('c'), lit(' '), lit('a'), + ])); + } + + #[test] + fn ignore_space_class() { + assert_eq!(p("(?x)[a + - z +]"), Expr::Class(class(&[('a', 'z')]))); + assert_eq!(p("(?x)[ ^ a + - z +]"), Expr::Class(class(&[('a', 'z')]).negate())); + } + + #[test] + fn ignore_space_escape() { + assert_eq!(p(r"(?x)\ d"), Expr::Class(class(PERLD))); + assert_eq!(p(r"(?x)\ + D"), Expr::Class(class(PERLD).negate())); + } + + #[test] + fn ignore_space_comments() { + assert_eq!(p(r"(?x)(?P + a # comment 1 +)(?P + z # comment 2 +)"), Expr::Concat(vec![ + Expr::Group { + e: Box::new(lit('a')), + i: Some(1), + name: Some("foo".into()), + }, + Expr::Group { + e: Box::new(lit('z')), + i: Some(2), + name: Some("bar".into()), + }, + ])); + } + + #[test] + fn ignore_space_comments_re_enable() { + assert_eq!(p(r"(?x)a # hi +(?-x:#) # 
sweet"), Expr::Concat(vec![ + lit('a'), + Expr::Group { + e: Box::new(lit('#')), + i: None, + name: None, + }, + ])); + } + + // Test every single possible error case. + + macro_rules! test_err { + ($re:expr, $pos:expr, $kind:expr) => {{ + let err = Parser::parse($re).unwrap_err(); + assert_eq!($pos, err.pos); + assert_eq!($kind, err.kind); + assert!($re.contains(&err.surround)); + }} + } + + #[test] + fn error_repeat_no_expr_simple() { + test_err!("(*", 1, ErrorKind::RepeaterExpectsExpr); + } + + #[test] + fn error_repeat_no_expr_counted() { + test_err!("({5}", 1, ErrorKind::RepeaterExpectsExpr); + } + + #[test] + fn error_repeat_beginning_counted() { + test_err!("{5}", 0, ErrorKind::RepeaterExpectsExpr); + } + + #[test] + fn error_repeat_illegal_exprs_simple() { + test_err!("a**", 2, ErrorKind::RepeaterUnexpectedExpr(Expr::Repeat { + e: b(lit('a')), + r: Repeater::ZeroOrMore, + greedy: true, + })); + test_err!("a|*", 2, + ErrorKind::RepeaterUnexpectedExpr(Expr::Alternate(vec![lit('a')])) + ); + } + + #[test] + fn error_repeat_illegal_exprs_counted() { + test_err!("a*{5}", 2, ErrorKind::RepeaterUnexpectedExpr(Expr::Repeat { + e: b(lit('a')), + r: Repeater::ZeroOrMore, + greedy: true, + })); + test_err!("a|{5}", 2, + ErrorKind::RepeaterUnexpectedExpr(Expr::Alternate(vec![lit('a')])) + ); + } + + #[test] + fn error_repeat_empty_number() { + test_err!("a{}", 2, ErrorKind::MissingBase10); + } + + #[test] + fn error_repeat_eof() { + test_err!("a{5", 3, ErrorKind::UnclosedRepeat); + } + + #[test] + fn error_repeat_empty_number_eof() { + test_err!("a{xyz", 5, ErrorKind::InvalidBase10("xyz".into())); + test_err!("a{12,xyz", 8, ErrorKind::InvalidBase10("xyz".into())); + } + + #[test] + fn error_repeat_invalid_number() { + test_err!("a{9999999999}", 12, + ErrorKind::InvalidBase10("9999999999".into())); + test_err!("a{1,9999999999}", 14, + ErrorKind::InvalidBase10("9999999999".into())); + } + + #[test] + fn error_repeat_invalid_number_extra() { + test_err!("a{12x}", 5, 
ErrorKind::InvalidBase10("12x".into())); + test_err!("a{1,12x}", 7, ErrorKind::InvalidBase10("12x".into())); + } + + #[test] + fn error_repeat_invalid_range() { + test_err!("a{2,1}", 5, + ErrorKind::InvalidRepeatRange { min: 2, max: 1 }); + } + + #[test] + fn error_alternate_empty() { + test_err!("|a", 0, ErrorKind::EmptyAlternate); + } + + #[test] + fn error_alternate_empty_with_group() { + test_err!("(|a)", 1, ErrorKind::EmptyAlternate); + } + + #[test] + fn error_alternate_empty_with_alternate() { + test_err!("a||", 2, ErrorKind::EmptyAlternate); + } + + #[test] + fn error_close_paren_unopened_empty() { + test_err!(")", 0, ErrorKind::UnopenedParen); + } + + #[test] + fn error_close_paren_unopened() { + test_err!("ab)", 2, ErrorKind::UnopenedParen); + } + + #[test] + fn error_close_paren_unopened_with_alt() { + test_err!("a|b)", 3, ErrorKind::UnopenedParen); + } + + #[test] + fn error_close_paren_empty_alt() { + test_err!("(a|)", 3, ErrorKind::EmptyAlternate); + } + + #[test] + fn error_close_paren_empty_group() { + test_err!("()", 1, ErrorKind::EmptyGroup); + } + + #[test] + fn error_close_paren_empty_group_with_name() { + test_err!("(?P)", 8, ErrorKind::EmptyGroup); + } + + #[test] + fn error_finish_concat_unclosed() { + test_err!("ab(xy", 2, ErrorKind::UnclosedParen); + } + + #[test] + fn error_finish_concat_empty_alt() { + test_err!("a|", 2, ErrorKind::EmptyAlternate); + } + + #[test] + fn error_group_name_invalid() { + test_err!("(?Px)", 6, ErrorKind::InvalidCaptureName("a#".into())); + } + + #[test] + fn error_group_name_invalid_leading() { + test_err!("(?P<1a>a)", 6, ErrorKind::InvalidCaptureName("1a".into())); + } + + #[test] + fn error_group_name_unexpected_eof() { + test_err!("(?Pa)", 4, ErrorKind::EmptyCaptureName); + } + + #[test] + fn error_group_opts_unrecognized_flag() { + test_err!("(?z:a)", 2, ErrorKind::UnrecognizedFlag('z')); + } + + #[test] + fn error_group_opts_unexpected_eof() { + test_err!("(?i", 3, ErrorKind::UnexpectedFlagEof); + } + + 
#[test] + fn error_group_opts_double_negation() { + test_err!("(?-i-s:a)", 4, ErrorKind::DoubleFlagNegation); + } + + #[test] + fn error_group_opts_empty_negation() { + test_err!("(?i-:a)", 4, ErrorKind::EmptyFlagNegation); + } + + #[test] + fn error_group_opts_empty() { + test_err!("(?)", 2, ErrorKind::EmptyFlagNegation); + } + + #[test] + fn error_escape_unexpected_eof() { + test_err!(r"\", 1, ErrorKind::UnexpectedEscapeEof); + } + + #[test] + fn error_escape_unrecognized() { + test_err!(r"\m", 1, ErrorKind::UnrecognizedEscape('m')); + } + + #[test] + fn error_escape_hex2_eof0() { + test_err!(r"\x", 2, ErrorKind::UnexpectedTwoDigitHexEof); + } + + #[test] + fn error_escape_hex2_eof1() { + test_err!(r"\xA", 3, ErrorKind::UnexpectedTwoDigitHexEof); + } + + #[test] + fn error_escape_hex2_invalid() { + test_err!(r"\xAG", 4, ErrorKind::InvalidBase16("AG".into())); + } + + #[test] + fn error_escape_hex_eof0() { + test_err!(r"\x{", 3, ErrorKind::InvalidBase16("".into())); + } + + #[test] + fn error_escape_hex_eof1() { + test_err!(r"\x{A", 4, ErrorKind::UnclosedHex); + } + + #[test] + fn error_escape_hex_invalid() { + test_err!(r"\x{AG}", 5, ErrorKind::InvalidBase16("AG".into())); + } + + #[test] + fn error_escape_hex_invalid_scalar_value_surrogate() { + test_err!(r"\x{D800}", 7, ErrorKind::InvalidScalarValue(0xD800)); + } + + #[test] + fn error_escape_hex_invalid_scalar_value_high() { + test_err!(r"\x{110000}", 9, ErrorKind::InvalidScalarValue(0x110000)); + } + + #[test] + fn error_escape_hex_invalid_u32() { + test_err!(r"\x{9999999999}", 13, + ErrorKind::InvalidBase16("9999999999".into())); + } + + #[test] + fn error_unicode_unclosed() { + test_err!(r"\p{", 3, ErrorKind::UnclosedUnicodeName); + test_err!(r"\p{Greek", 8, ErrorKind::UnclosedUnicodeName); + } + + #[test] + fn error_unicode_no_letter() { + test_err!(r"\p", 2, ErrorKind::UnexpectedEscapeEof); + } + + #[test] + fn error_unicode_unknown_letter() { + test_err!(r"\pA", 3, 
ErrorKind::UnrecognizedUnicodeClass("A".into())); + } + + #[test] + fn error_unicode_unknown_name() { + test_err!(r"\p{Yii}", 7, + ErrorKind::UnrecognizedUnicodeClass("Yii".into())); + } + + #[test] + fn error_class_eof_empty() { + test_err!("[", 1, ErrorKind::UnexpectedClassEof); + test_err!("[^", 2, ErrorKind::UnexpectedClassEof); + } + + #[test] + fn error_class_eof_non_empty() { + test_err!("[a", 2, ErrorKind::UnexpectedClassEof); + test_err!("[^a", 3, ErrorKind::UnexpectedClassEof); + } + + #[test] + fn error_class_eof_range() { + test_err!("[a-", 3, ErrorKind::UnexpectedClassEof); + test_err!("[^a-", 4, ErrorKind::UnexpectedClassEof); + test_err!("[---", 4, ErrorKind::UnexpectedClassEof); + } + + #[test] + fn error_class_invalid_escape() { + test_err!(r"[\pA]", 4, + ErrorKind::UnrecognizedUnicodeClass("A".into())); + } + + #[test] + fn error_class_valid_escape_not_allowed() { + test_err!(r"[\A]", 3, ErrorKind::InvalidClassEscape(Expr::StartText)); + } + + #[test] + fn error_class_range_valid_escape_not_allowed() { + test_err!(r"[a-\d]", 5, + ErrorKind::InvalidClassEscape(Expr::Class(class(PERLD)))); + test_err!(r"[a-\A]", 5, + ErrorKind::InvalidClassEscape(Expr::StartText)); + test_err!(r"[\A-a]", 3, + ErrorKind::InvalidClassEscape(Expr::StartText)); + } + + #[test] + fn error_class_invalid_range() { + test_err!("[z-a]", 4, ErrorKind::InvalidClassRange { + start: 'z', + end: 'a', + }); + } + + #[test] + fn error_class_empty_range() { + test_err!("[]", 2, ErrorKind::UnexpectedClassEof); + test_err!("[^]", 3, ErrorKind::UnexpectedClassEof); + } + + #[test] + fn error_duplicate_capture_name() { + test_err!("(?P.)(?P.)", 14, + ErrorKind::DuplicateCaptureName("a".into())); + } +} diff --git a/regex_syntax/src/properties.rs b/regex_syntax/src/properties.rs new file mode 100644 index 0000000000..38cbb02e73 --- /dev/null +++ b/regex_syntax/src/properties.rs @@ -0,0 +1,407 @@ +// Copyright 2014-2015 The Rust Project Developers. 
See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use quickcheck::{Arbitrary, Gen, Testable, QuickCheck, StdGen}; +use rand::Rng; + +use {Expr, CharClass, ClassRange, Repeater, dec_char}; + +fn qc(t: T) { + QuickCheck::new() + .tests(10_000) + .max_tests(20_000) + .quickcheck(t); +} + +fn class(ranges: &[(char, char)]) -> CharClass { + let ranges = ranges.iter().cloned() + .map(|(c1, c2)| ClassRange::new(c1, c2)).collect(); + CharClass::new(ranges) +} + +// Test invariants for canonicalizing character classes. + +#[test] +fn negate() { + fn prop(ranges: Vec<(char, char)>) -> bool { + class(&ranges).canonicalize() == class(&ranges).negate().negate() + } + qc(prop as fn(Vec<(char, char)>) -> bool); +} + +#[test] +fn classes_are_sorted_and_nonoverlapping() { + fn prop(ranges: Vec<(char, char)>) -> bool { + class(&ranges) + .canonicalize() + .windows(2) + .all(|w| w[0].end < dec_char(w[1].start)) + } + qc(prop as fn(Vec<(char, char)>) -> bool); +} + +#[test] +fn valid_class_ranges() { + fn prop(ranges: Vec<(char, char)>) -> bool { + class(&ranges).canonicalize().into_iter().all(|r| r.start <= r.end) + } + qc(prop as fn(Vec<(char, char)>) -> bool); +} + +/// A wrapper type for generating "regex-like" Unicode strings. +/// +/// In particular, this type's `Arbitrary` impl specifically biases toward +/// special regex characters to make test cases more interesting. 
+#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] +struct RegexLikeString(String); + +impl Arbitrary for RegexLikeString { + fn arbitrary(g: &mut G) -> RegexLikeString { + const SPECIAL: &'static [char] = &[ + '\\', '.', '+', '*', '?', '(', ')', '|', '[', ']', '{', '}', + '^', '$', + ]; + // Generating random Unicode strings results in mostly uninteresting + // regexes. Namely, they'll mostly just be literals. + // To make properties using regex strings more interesting, we bias + // toward selecting characters of significance to a regex. + let size = { let s = g.size(); g.gen_range(0, s) }; + RegexLikeString((0..size).map(|_| { + if g.gen_weighted_bool(3) { + *g.choose(SPECIAL).unwrap() + } else { + g.gen() + } + }).collect()) + } + + fn shrink(&self) -> Box> { + // The regular `String` shrinker is good enough. + Box::new(self.0.shrink().map(RegexLikeString)) + } +} + +/// A special type for generating small non-zero sized ASCII strings. +#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] +struct SmallAscii(String); + +impl Arbitrary for SmallAscii { + fn arbitrary(g: &mut G) -> SmallAscii { + use std::char::from_u32; + let size = g.gen_range(1, 5); + SmallAscii((0..size) + .map(|_| from_u32(g.gen_range(97, 123)).unwrap()) + .collect()) + } + + fn shrink(&self) -> Box> { + Box::new(self.0.shrink().map(SmallAscii)) + } +} + +#[test] +fn parser_never_panics() { + fn prop(s: RegexLikeString) -> bool { + let _ = Expr::parse(&s.0); true + } + qc(prop as fn(RegexLikeString) -> bool); +} + +// Testing entire expressions. +// +// We only have one test at the moment, but the machinery could be useful +// for other things. +// +// In particular, Russ Cox writes about testing regexes by comparing the +// strings they match with other regex implementations. A fuzzer/shrinker +// (which is what's implemented below) would be a great way to drive that +// process. 
---AG + +impl Arbitrary for Expr { + fn arbitrary(g: &mut G) -> Expr { + fix_capture_indices(gen_expr(g, 0, ExprType::Anything)).simplify() + } + + fn shrink(&self) -> Box> { + use Expr::*; + + let nada = || Box::new(None.into_iter()); + let es: Box> = match *self { + Empty | AnyChar | AnyCharNoNL + | StartLine | EndLine | StartText | EndText + | WordBoundary | NotWordBoundary => nada(), + Literal { ref chars, .. } if chars.len() == 1 => nada(), + Literal { ref chars, casei } => { + Box::new((chars.clone(), casei) + .shrink() + .filter(|&(ref chars, _)| chars.len() > 0) + .map(|(chars, casei)| { + Literal { chars: chars, casei: casei } + })) + } + Class(ref cls) => Box::new(cls.shrink().map(Class)), + Group { ref e, ref i, ref name } => { + let (i, name) = (i.clone(), name.clone()); + Box::new(e.clone().shrink() + .chain(e.clone().shrink() + .map(move |e| Group { + e: Box::new(e), + i: i.clone(), + name: name.clone(), + }))) + } + Repeat { ref e, ref r, greedy } => { + Box::new((*e.clone(), r.clone()) + .shrink() + .filter(|&(ref e, _)| e.can_repeat()) + .map(move |(e, r)| Repeat { + e: Box::new(e), + r: r, + greedy: greedy, + })) + } + // Concat(ref es) if es.len() <= 2 => nada(), + Concat(ref es) => { + Box::new(es.clone() + .shrink() + .filter(|es| es.len() > 0) + .map(|mut es| if es.len() == 1 { + es.pop().unwrap() + } else { + Concat(es) + })) + } + // Alternate(ref es) if es.len() <= 2 => nada(), + Alternate(ref es) => { + Box::new(es.clone() + .shrink() + .filter(|es| es.len() > 0) + .map(|mut es| if es.len() == 1 { + es.pop().unwrap() + } else { + Alternate(es) + })) + } + }; + Box::new(es.map(|e| fix_capture_indices(e).simplify())) + } +} + +enum ExprType { + NoSequences, // disallow concat/alternate + Anything, +} + +fn gen_expr(g: &mut G, depth: u32, ty: ExprType) -> Expr { + use Expr::*; + let ub = match (depth as usize >= g.size(), ty) { + (true, _) => 11, + (false, ExprType::NoSequences) => 13, + (false, ExprType::Anything) => 15, + }; + match 
g.gen_range(1, ub) { + 0 => Empty, + 1 => Literal { + chars: SmallAscii::arbitrary(g).0.chars().collect(), + casei: g.gen(), + }, + 2 => AnyChar, + 3 => AnyCharNoNL, + 4 => Class(CharClass::arbitrary(g)), + 5 => StartLine, + 6 => EndLine, + 7 => StartText, + 8 => EndText, + 9 => WordBoundary, + 10 => NotWordBoundary, + 11 => gen_group_expr(g, depth + 1), + 12 => Repeat { + e: Box::new(gen_repeatable_expr(g, depth + 1)), + r: Repeater::arbitrary(g), + greedy: bool::arbitrary(g), + }, + 13 => { + let size = { let s = g.size(); g.gen_range(2, s) }; + Concat((0..size) + .map(|_| { + gen_expr(g, depth + 1, ExprType::NoSequences) + }) + .collect()) + } + 14 => { + let size = { let s = g.size(); g.gen_range(2, s) }; + Alternate((0..size) + .map(|_| { + gen_expr(g, depth + 1, ExprType::NoSequences) + }) + .collect()) + } + _ => unreachable!() + } +} + +fn gen_repeatable_expr(g: &mut G, depth: u32) -> Expr { + use Expr::*; + match g.gen_range(1, 6) { + 0 => Empty, + 1 => Literal { + chars: vec![Arbitrary::arbitrary(g)], + casei: g.gen(), + }, + 2 => AnyChar, + 3 => AnyCharNoNL, + 4 => Class(CharClass::arbitrary(g)), + 5 => gen_group_expr(g, depth + 1), + _ => unreachable!(), + } +} + +fn gen_group_expr(g: &mut G, depth: u32) -> Expr { + let (i, name) = if g.gen() { + (None, None) + } else { + (Some(0), if g.gen() { + Some(SmallAscii::arbitrary(g).0) + } else { + None + }) + }; + Expr::Group { + e: Box::new(gen_expr(g, depth + 1, ExprType::Anything)), + i: i, + name: name, + } +} + +fn fix_capture_indices(e: Expr) -> Expr { + fn bx(e: Expr) -> Box { Box::new(e) } + fn fix(e: Expr, capi: &mut usize, names: &mut Vec) -> Expr { + use Expr::*; + match e { + Group { e, i: Some(_), mut name } => { + *capi += 1; + let i = *capi; + let mut dupe_name = false; + if let Some(ref n1) = name { + if names.iter().any(|n2| n1 == n2) { + dupe_name = true; + } else { + names.push(n1.clone()); + } + } + if dupe_name { name = None; } + Group { e: bx(fix(*e, capi, names)), i: Some(i), name: name 
} + } + Group { e, i, name } => { + Group { e: bx(fix(*e, capi, names)), i: i, name: name } + } + Repeat { e, r, greedy } => { + Repeat { e: bx(fix(*e, capi, names)), r: r, greedy: greedy } + } + Concat(es) => + Concat(es.into_iter().map(|e| fix(e, capi, names)).collect()), + Alternate(es) => + Alternate(es.into_iter().map(|e| fix(e, capi, names)).collect()), + e => e, + } + } + fix(e, &mut 0, &mut vec![]) +} + +impl Arbitrary for Repeater { + fn arbitrary(g: &mut G) -> Repeater { + use Repeater::*; + match g.gen_range(0, 4) { + 0 => ZeroOrOne, + 1 => ZeroOrMore, + 2 => OneOrMore, + 3 => { + use std::cmp::{max, min}; + let n1 = Arbitrary::arbitrary(g); + let n2 = Arbitrary::arbitrary(g); + Range { + min: min(n1, n2), + max: if g.gen() { None } else { Some(max(n1, n2)) }, + } + }, + _ => unreachable!(), + } + } + + fn shrink(&self) -> Box> { + use Repeater::*; + match *self { + ZeroOrOne | ZeroOrMore | OneOrMore => Box::new(None.into_iter()), + Range { min, max } => { + Box::new((min, max) + .shrink() + .map(|(min, max)| Range { min: min, max: max })) + } + } + } +} + +impl Arbitrary for CharClass { + fn arbitrary(g: &mut G) -> CharClass { + let mut ranges: Vec = Arbitrary::arbitrary(g); + if ranges.is_empty() { + ranges.push(Arbitrary::arbitrary(g)); + } + let cls = CharClass { + ranges: ranges, + casei: false, + }.canonicalize(); + if g.gen() { cls.case_fold() } else { cls } + } + + fn shrink(&self) -> Box> { + Box::new((self.ranges.clone(), self.casei) + .shrink() + .filter(|&(ref ranges, _)| ranges.len() > 0) + .map(|(ranges, casei)| { + let cls = CharClass { + ranges: ranges, + casei: casei, + }.canonicalize(); + if casei { cls.case_fold() } else { cls } + })) + } +} + +impl Arbitrary for ClassRange { + fn arbitrary(g: &mut G) -> ClassRange { + use std::char::from_u32; + ClassRange::new( + from_u32(g.gen_range(97, 123)).unwrap(), + from_u32(g.gen_range(97, 123)).unwrap(), + ) + } + + fn shrink(&self) -> Box> { + Box::new((self.start, self.end) + 
.shrink().map(|(s, e)| ClassRange::new(s, e))) + } +} + +#[test] +fn display_regex_roundtrips() { + // Given an AST, if we print it as a regex and then re-parse it, do we + // get back the same AST? + // A lot of this relies crucially on regex simplification. So this is + // testing `Expr::simplify` as much as it is testing the `Display` impl. + fn prop(e: Expr) -> bool { + e == Expr::parse(&e.to_string()).unwrap() + } + QuickCheck::new() + .tests(10_000) + .max_tests(20_000) + .gen(StdGen::new(::rand::thread_rng(), 50)) + .quickcheck(prop as fn(Expr) -> bool); +} diff --git a/src/unicode.rs b/regex_syntax/src/unicode.rs similarity index 100% rename from src/unicode.rs rename to regex_syntax/src/unicode.rs diff --git a/scripts/unicode.py b/scripts/unicode.py index f734b78099..05bf78c9d5 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -20,7 +20,7 @@ # Since this should not require frequent updates, we just store this # out-of-line and check the unicode.rs file into git. -import fileinput, re, os, sys, operator +import fileinput, re, os, sys preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at diff --git a/src/compile.rs b/src/compile.rs index fd1cf27c62..413da3e9e8 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -14,61 +14,29 @@ use self::Inst::*; use std::cmp; -use std::iter::repeat; -use parse; -use parse::{Flags, FLAG_EMPTY}; -use parse::Ast::{ - Nothing, Literal, Dot, AstClass, Begin, End, WordBoundary, Capture, - Cat, Alt, Rep, -}; -use parse::Repeater::{ZeroOne, ZeroMore, OneMore}; +use syntax::{self, Expr, Repeater}; +use Error; pub type InstIdx = usize; /// An instruction, the underlying unit of a compiled regular expression +#[allow(missing_docs)] #[derive(Debug, Clone)] pub enum Inst { /// When a Match instruction is executed, the current thread is successful. Match, - - /// The OneChar instruction matches a literal character. 
- /// The flags indicate whether to do a case insensitive match. - OneChar(char, Flags), - - /// The CharClass instruction tries to match one input character against - /// the range of characters given. - /// The flags indicate whether to do a case insensitive match. - CharClass(Vec<(char, char)>, Flags), - - /// Matches any character except new lines. - /// The flags indicate whether to include the '\n' character. - Any(Flags), - - /// Matches the beginning of the string, consumes no characters. - /// The flags indicate whether it matches if the preceding character - /// is a new line. - EmptyBegin(Flags), - - /// Matches the end of the string, consumes no characters. - /// The flags indicate whether it matches if the proceeding character - /// is a new line. - EmptyEnd(Flags), - - /// Matches a word boundary (\w on one side and \W \A or \z on the other), - /// and consumes no character. - /// The flags indicate whether this matches a word boundary or something - /// that isn't a word boundary. - EmptyWordBoundary(Flags), - - /// Saves the current position in the input string to the Nth save slot. + OneChar { c: char, casei: bool }, + CharClass(syntax::CharClass), + Any, + AnyNoNL, + StartLine, + EndLine, + StartText, + EndText, + WordBoundary, + NotWordBoundary, Save(usize), - - /// Jumps to the instruction at the index given. Jump(InstIdx), - - /// Jumps to the instruction at the first index given. If that leads to - /// a panic state, then the instruction at the second index given is - /// tried. Split(InstIdx, InstIdx), } @@ -90,14 +58,15 @@ pub struct Program { impl Program { /// Compiles a Regex given its AST. 
- pub fn new(ast: parse::Ast) -> (Program, Vec>) { + pub fn new(ast: Expr, size: usize) -> Result<(Program, Vec>), Error> { let mut c = Compiler { insts: Vec::with_capacity(100), - names: Vec::with_capacity(10), + names: vec![None], + size_limit: size, }; c.insts.push(Save(0)); - c.compile(ast); + try!(c.compile(ast)); c.insts.push(Save(1)); c.insts.push(Match); @@ -107,17 +76,17 @@ impl Program { let mut pre = String::with_capacity(5); for inst in c.insts[1..].iter() { match *inst { - OneChar(c, FLAG_EMPTY) => pre.push(c), + OneChar { c, casei: false } => pre.push(c), _ => break } } - let Compiler { insts, names } = c; + let Compiler { insts, names, .. } = c; let prog = Program { insts: insts, prefix: pre, }; - (prog, names) + Ok((prog, names)) } /// Returns the total number of capture groups in the regular expression. @@ -138,6 +107,7 @@ impl Program { struct Compiler { insts: Vec, names: Vec>, + size_limit: usize, } // The compiler implemented here is extremely simple. Most of the complexity @@ -145,83 +115,132 @@ struct Compiler { // The only tricky thing here is patching jump/split instructions to point to // the right instruction. 
impl Compiler { - fn compile(&mut self, ast: parse::Ast) { + fn check_size(&self) -> Result<(), Error> { + if self.insts.len() * ::std::mem::size_of::() > self.size_limit { + Err(Error::CompiledTooBig(self.size_limit)) + } else { + Ok(()) + } + } + + fn compile(&mut self, ast: Expr) -> Result<(), Error> { match ast { - Nothing => {}, - Literal(c, flags) => self.push(OneChar(c, flags)), - Dot(nl) => self.push(Any(nl)), - AstClass(ranges, flags) => self.push(CharClass(ranges, flags)), - Begin(flags) => self.push(EmptyBegin(flags)), - End(flags) => self.push(EmptyEnd(flags)), - WordBoundary(flags) => self.push(EmptyWordBoundary(flags)), - Capture(cap, name, x) => { - let len = self.names.len(); - if cap >= len { - self.names.extend(repeat(None).take(10 + cap - len)) + Expr::Empty => {}, + Expr::Literal { chars, casei } => { + for c in chars { + self.push(OneChar { c: c, casei: casei }); } - self.names[cap] = name; - - self.push(Save(2 * cap)); - self.compile(*x); - self.push(Save(2 * cap + 1)); } - Cat(xs) => { - for x in xs.into_iter() { - self.compile(x) + Expr::AnyChar => self.push(Any), + Expr::AnyCharNoNL => self.push(AnyNoNL), + Expr::Class(cls) => self.push(CharClass(cls)), + Expr::StartLine => self.push(StartLine), + Expr::EndLine => self.push(EndLine), + Expr::StartText => self.push(StartText), + Expr::EndText => self.push(EndText), + Expr::WordBoundary => self.push(WordBoundary), + Expr::NotWordBoundary => self.push(NotWordBoundary), + Expr::Group { e, i: None, name: None } => try!(self.compile(*e)), + Expr::Group { e, i, name } => { + let i = i.expect("capture index"); + self.names.push(name); + self.push(Save(2 * i)); + try!(self.compile(*e)); + self.push(Save(2 * i + 1)); + } + Expr::Concat(es) => { + for e in es { + try!(self.compile(e)); } } - Alt(x, y) => { + Expr::Alternate(mut es) => { + // TODO: Don't use recursion here. 
---AG + if es.len() == 0 { + return Ok(()); + } + let e1 = es.remove(0); + if es.len() == 0 { + try!(self.compile(e1)); + return Ok(()); + } + let e2 = Expr::Alternate(es); // this causes recursion + let split = self.empty_split(); // push: split 0, 0 let j1 = self.insts.len(); - self.compile(*x); // push: insts for x + try!(self.compile(e1)); // push: insts for x let jmp = self.empty_jump(); // push: jmp 0 let j2 = self.insts.len(); - self.compile(*y); // push: insts for y + try!(self.compile(e2)); // push: insts for y let j3 = self.insts.len(); self.set_split(split, j1, j2); // split 0, 0 -> split j1, j2 self.set_jump(jmp, j3); // jmp 0 -> jmp j3 } - Rep(x, ZeroOne, g) => { + Expr::Repeat { e, r: Repeater::ZeroOrOne, greedy } => { let split = self.empty_split(); let j1 = self.insts.len(); - self.compile(*x); + try!(self.compile(*e)); let j2 = self.insts.len(); - if g.is_greedy() { + if greedy { self.set_split(split, j1, j2); } else { self.set_split(split, j2, j1); } } - Rep(x, ZeroMore, g) => { + Expr::Repeat { e, r: Repeater::ZeroOrMore, greedy } => { let j1 = self.insts.len(); let split = self.empty_split(); let j2 = self.insts.len(); - self.compile(*x); + try!(self.compile(*e)); let jmp = self.empty_jump(); let j3 = self.insts.len(); self.set_jump(jmp, j1); - if g.is_greedy() { + if greedy { self.set_split(split, j2, j3); } else { self.set_split(split, j3, j2); } } - Rep(x, OneMore, g) => { + Expr::Repeat { e, r: Repeater::OneOrMore, greedy } => { let j1 = self.insts.len(); - self.compile(*x); + try!(self.compile(*e)); let split = self.empty_split(); let j2 = self.insts.len(); - if g.is_greedy() { + if greedy { self.set_split(split, j1, j2); } else { self.set_split(split, j2, j1); } } + Expr::Repeat { e, r: Repeater::Range { min, max: None }, greedy } => { + let e = *e; + for _ in 0..min { + try!(self.compile(e.clone())); + } + try!(self.compile(Expr::Repeat { + e: Box::new(e), + r: Repeater::ZeroOrMore, + greedy: greedy, + })); + } + Expr::Repeat { e, r: 
Repeater::Range { min, max: Some(max) }, greedy } => { + let e = *e; + for _ in 0..min { + try!(self.compile(e.clone())); + } + for _ in min..max { + try!(self.compile(Expr::Repeat { + e: Box::new(e.clone()), + r: Repeater::ZeroOrOne, + greedy: greedy, + })); + } + } } + self.check_size() } /// Appends the given instruction to the program. diff --git a/src/lib.rs b/src/lib.rs index c5cbb9c126..d63c98dad4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,17 +20,17 @@ //! details on the API, please see the documentation for the `Regex` type. //! //! # Usage -//! +//! //! This crates is [on crates.io](https://crates.io/crates/regex) and can be //! used by adding `regex` to your dependencies in your project's `Cargo.toml`. -//! +//! //! ```toml //! [dependencies] //! regex = "0.1.8" //! ``` -//! +//! //! and this to your crate root: -//! +//! //! ```rust //! extern crate regex; //! ``` @@ -43,11 +43,8 @@ //! //! ```rust //! use regex::Regex; -//! let re = match Regex::new(r"^\d{4}-\d{2}-\d{2}$") { -//! Ok(re) => re, -//! Err(err) => panic!("{}", err), -//! }; -//! assert_eq!(re.is_match("2014-01-01"), true); +//! let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap(); +//! assert!(re.is_match("2014-01-01")); //! ``` //! //! Notice the use of the `^` and `$` anchors. In this crate, every expression @@ -55,8 +52,9 @@ //! it to match anywhere in the text. Anchors can be used to ensure that the //! full text matches an expression. //! -//! This example also demonstrates the utility of [raw -//! strings](../reference.html#character-and-string-literals) in Rust, which +//! This example also demonstrates the utility of +//! [raw strings](http://doc.rust-lang.org/stable/reference.html#raw-byte-string-literals) +//! in Rust, which //! are just like regular strings except they are prefixed with an `r` and do //! not process any escape sequences. For example, `"\\d"` is the same //! expression as `r"\d"`. @@ -81,7 +79,7 @@ //! //! fn main() { //! 
let re = regex!(r"^\d{4}-\d{2}-\d{2}$"); -//! assert_eq!(re.is_match("2014-01-01"), true); +//! assert!(re.is_match("2014-01-01")); //! } //! ``` //! @@ -96,20 +94,9 @@ //! expressions, but 100+ calls to `regex!` will probably result in a //! noticeably bigger binary. //! -//! **NOTE**: This is implemented using a compiler plugin, which will not be +//! **NOTE**: This is implemented using a compiler plugin, which is not //! available on the Rust 1.0 beta/stable channels. Therefore, you'll only -//! be able to use `regex!` on the nightlies. If you want to retain the -//! `regex!` macro, you can cheat and define this: -//! -//! ```rust -//! macro_rules! regex( -//! ($s:expr) => (regex::Regex::new($s).unwrap()); -//! ); -//! ``` -//! -//! But this just replaces native regexes with dynamic regexes under the hood. -//! Moreover, this will cause your program to panic *at runtime* if an invalid -//! regular expression is given. +//! be able to use `regex!` on the nightlies. //! //! # Example: iterating over capture groups //! @@ -159,6 +146,25 @@ //! provides more flexibility than is seen here. (See the documentation for //! `Regex::replace` for more details.) //! +//! Note that if your regex gets complicated, you can use the `x` flag to +//! enable insignificant whitespace mode, which also lets you write comments: +//! +//! ```rust +//! # extern crate regex; use regex::Regex; +//! # fn main() { +//! let re = Regex::new(r"(?x) +//! (?P<y>\d{4}) # the year +//! - +//! (?P<m>\d{2}) # the month +//! - +//! (?P<d>\d{2}) # the day +//! ").unwrap(); +//! let before = "2012-03-14, 2013-01-01 and 2014-07-05"; +//! let after = re.replace_all(before, "$m/$d/$y"); +//! assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014"); +//! # } +//! ``` +//! //! # Pay for what you use //! //! With respect to searching text with a regular expression, there are three @@ -180,15 +186,16 @@ //! # Unicode //! //! This implementation executes regular expressions **only** on sequences of -//!
Unicode code points while exposing match locations as byte indices into the -//! search string. +//! Unicode scalar values while exposing match locations as byte indices into +//! the search string. //! -//! Currently, only naive case folding is supported. Namely, when matching -//! case insensitively, the characters are first converted to their uppercase -//! forms and then compared. +//! Currently, only simple case folding is supported. Namely, when matching +//! case insensitively, the characters are first mapped using the +//! [simple case folding](ftp://ftp.unicode.org/Public/UNIDATA/CaseFolding.txt) +//! mapping. //! //! Regular expressions themselves are also **only** interpreted as a sequence -//! of Unicode code points. This means you can use Unicode characters +//! of Unicode scalar values. This means you can use Unicode characters //! directly in your expression: //! //! ```rust @@ -214,7 +221,11 @@ //! # Syntax //! //! The syntax supported in this crate is almost in an exact correspondence -//! with the syntax supported by RE2. +//! with the syntax supported by RE2. It is documented below. +//! +//! Note that the regular expression parser and abstract syntax are exposed in +//! a separate crate, +//! [`regex-syntax`](../regex_syntax/index.html). //! //! ## Matching one character //! @@ -294,6 +305,7 @@ //! m multi-line mode: ^ and $ match begin/end of line //! s allow . to match \n //! U swap the meaning of x* and x*? +//! x ignore whitespace and allow line comments (starting with `#`) //! //! //! Here's an example that matches case insensitively for only part of the @@ -361,22 +373,19 @@ //! //! # Untrusted input //! -//! There are two factors to consider here: untrusted regular expressions and -//! untrusted search text. -//! -//! Currently, there are no counter-measures in place to prevent a malicious -//! user from writing an expression that may use a lot of resources. One such -//! 
example is to repeat counted repetitions: `((a{100}){100}){100}` will try -//! to repeat the `a` instruction `100^3` times. Essentially, this means it's -//! very easy for an attacker to exhaust your system's memory if they are -//! allowed to execute arbitrary regular expressions. A possible solution to -//! this is to impose a hard limit on the size of a compiled expression, but it -//! does not yet exist. -//! -//! The story is a bit better with untrusted search text, since this crate's -//! implementation provides `O(nm)` search where `n` is the number of -//! characters in the search text and `m` is the number of instructions in a -//! compiled expression. +//! This crate can handle both untrusted regular expressions and untrusted +//! search text. +//! +//! Untrusted regular expressions are handled by capping the size of a compiled +//! regular expression. (See `Regex::with_size_limit`.) Without this, it would +//! be trivial for an attacker to exhaust your system's memory with expressions +//! like `a{100}{100}{100}`. +//! +//! Untrusted search text is allowed because the matching engine(s) in this +//! crate have time complexity `O(mn)` (with `m ~ regex` and `n ~ search +//! text`), which means there's no way to cause exponential blow-up like with +//! some other regular expression engines. (We pay for this by disallowing +//! features like arbitrary look-ahead and back-references.) 
#![deny(missing_docs)] #![cfg_attr(test, deny(warnings))] @@ -385,16 +394,17 @@ html_favicon_url = "http://www.rust-lang.org/favicon.ico", html_root_url = "http://doc.rust-lang.org/regex/")] -pub use parse::Error; -pub use re::{Regex, Captures, SubCaptures, SubCapturesPos, SubCapturesNamed}; -pub use re::{FindCaptures, FindMatches}; -pub use re::{Replacer, NoExpand, RegexSplits, RegexSplitsN}; -pub use re::{quote, is_match}; +extern crate regex_syntax as syntax; + +pub use re::{ + Regex, Error, Captures, SubCaptures, SubCapturesPos, SubCapturesNamed, + FindCaptures, FindMatches, + Replacer, NoExpand, RegexSplits, RegexSplitsN, + quote, is_match, +}; mod compile; -mod parse; mod re; -mod unicode; mod vm; /// The `native` module exists to support the `regex!` macro. Do not use. @@ -416,17 +426,11 @@ pub mod native { // On the bright side, `rustdoc` lets us hide this from the public API // documentation. pub use compile::Program; - pub use compile::Inst::{ - Match, OneChar, CharClass, Any, EmptyBegin, EmptyEnd, - EmptyWordBoundary, Save, Jump, Split, - }; - pub use parse::{ - FLAG_EMPTY, FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, - FLAG_SWAP_GREED, FLAG_NEGATED, - }; + pub use compile::Inst; + pub use syntax::simple_case_fold; pub use re::{ExDynamic, ExNative}; pub use re::Regex::{Dynamic, Native}; - pub use vm::{CharReader, find_prefix, simple_case_fold}; + pub use vm::{CharReader, find_prefix}; pub use vm::MatchKind::{self, Exists, Location, Submatches}; pub use vm::StepState::{ self, StepMatchEarlyReturn, StepMatch, StepContinue, diff --git a/src/parse.rs b/src/parse.rs deleted file mode 100644 index 6ec8362e49..0000000000 --- a/src/parse.rs +++ /dev/null @@ -1,1160 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. 
This file may not be copied, modified, or distributed -// except according to those terms. - -use std::char; -use std::cmp; -use std::fmt; - -/// Static data containing Unicode ranges for general categories and scripts. -use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW}; -use vm::simple_case_fold; - -use self::Ast::*; -use self::Repeater::*; -use self::Greed::*; -use self::BuildAst::*; - -/// The maximum number of repetitions allowed with the `{n,m}` syntax. -static MAX_REPEAT: usize = 1000; - -/// Error corresponds to something that can go wrong while parsing -/// a regular expression. -/// -/// (Once an expression is compiled, it is not possible to produce an error -/// via searching, splitting or replacing.) -#[derive(Debug)] -pub struct Error { - /// The *approximate* character index of where the error occurred. - pub pos: usize, - /// A message describing the error. - pub msg: String, -} - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Regex syntax error near position {}: {}", - self.pos, self.msg) - } -} - -/// Represents the abstract syntax of a regular expression. -/// It is showable so that error messages resulting from a bug can provide -/// useful information. -/// It is cloneable so that expressions can be repeated for the counted -/// repetition feature. (No other copying is done.) -/// -/// Note that this representation prevents one from reproducing the regex as -/// it was typed. (But it could be used to reproduce an equivalent regex.) -#[derive(Debug, Clone)] -pub enum Ast { - Nothing, - Literal(char, Flags), - Dot(Flags), - AstClass(Vec<(char, char)>, Flags), - Begin(Flags), - End(Flags), - WordBoundary(Flags), - Capture(usize, Option, Box), - // Represent concatenation as a flat vector to avoid blowing the - // stack in the compiler. 
- Cat(Vec), - Alt(Box, Box), - Rep(Box, Repeater, Greed), -} - -#[derive(Debug, PartialEq, Clone)] -pub enum Repeater { - ZeroOne, - ZeroMore, - OneMore, -} - -#[derive(Debug, Clone)] -pub enum Greed { - Greedy, - Ungreedy, -} - -impl Copy for Greed {} - -impl Greed { - pub fn is_greedy(&self) -> bool { - match *self { - Greedy => true, - _ => false, - } - } - - fn swap(self, swapped: bool) -> Greed { - if !swapped { return self } - match self { - Greedy => Ungreedy, - Ungreedy => Greedy, - } - } -} - -/// BuildAst is a regrettable type that represents intermediate state for -/// constructing an abstract syntax tree. Its central purpose is to facilitate -/// parsing groups and alternations while also maintaining a stack of flag -/// state. -#[derive(Debug)] -enum BuildAst { - Expr(Ast), - Paren(Flags, usize, String), // '(' - Bar, // '|' -} - -impl BuildAst { - fn paren(&self) -> bool { - match *self { - Paren(_, _, _) => true, - _ => false, - } - } - - fn flags(&self) -> Flags { - match *self { - Paren(flags, _, _) => flags, - _ => panic!("Cannot get flags from {:?}", self), - } - } - - fn capture(&self) -> Option { - match *self { - Paren(_, 0, _) => None, - Paren(_, c, _) => Some(c), - _ => panic!("Cannot get capture group from {:?}", self), - } - } - - fn capture_name(&self) -> Option { - match *self { - Paren(_, 0, _) => None, - Paren(_, _, ref name) => { - if name.len() == 0 { - None - } else { - Some(name.clone()) - } - } - _ => panic!("Cannot get capture name from {:?}", self), - } - } - - fn bar(&self) -> bool { - match *self { - Bar => true, - _ => false, - } - } - - fn unwrap(self) -> Result { - match self { - Expr(x) => Ok(x), - _ => panic!("Tried to unwrap non-AST item: {:?}", self), - } - } -} - -/// Flags represents all options that can be twiddled by a user in an -/// expression. 
-pub type Flags = u8; - -pub const FLAG_EMPTY: u8 = 0; -pub const FLAG_NOCASE: u8 = 1 << 0; // i -pub const FLAG_MULTI: u8 = 1 << 1; // m -pub const FLAG_DOTNL: u8 = 1 << 2; // s -pub const FLAG_SWAP_GREED: u8 = 1 << 3; // U -pub const FLAG_NEGATED: u8 = 1 << 4; // char class or not word boundary - -struct Parser { - // The input, parsed only as a sequence of UTF8 code points. - chars: Vec, - // The index of the current character in the input. - chari: usize, - // The intermediate state representing the AST. - stack: Vec, - // The current set of flags. - flags: Flags, - // The total number of capture groups. - // Incremented each time an opening left paren is seen (assuming it is - // opening a capture group). - caps: usize, - // A set of all capture group names used only to detect duplicates. - names: Vec, -} - -pub fn parse(s: &str) -> Result { - Parser { - chars: s.chars().collect(), - chari: 0, - stack: vec!(), - flags: FLAG_EMPTY, - caps: 0, - names: vec!(), - }.parse() -} - -impl Parser { - fn parse(&mut self) -> Result { - if self.chars.len() == 0 { - return Ok(Nothing); - } - loop { - let c = self.cur(); - match c { - '?' 
| '*' | '+' => try!(self.push_repeater(c)), - '\\' => { - let ast = try!(self.parse_escape()); - if let AstClass(mut ranges, flags) = ast { - if flags & FLAG_NOCASE > 0 { - ranges = case_fold_and_combine_ranges(ranges); - } - self.push(AstClass(ranges, flags)) - } else { - self.push(ast) - } - } - '{' => try!(self.parse_counted()), - '[' => match self.try_parse_ascii() { - None => try!(self.parse_class()), - Some(class) => self.push(class), - }, - '(' => { - if self.peek_is(1, '?') { - try!(self.expect('?')); - try!(self.parse_group_opts()); - } else { - self.caps += 1; - self.stack.push(Paren(self.flags, - self.caps, - "".to_string())) - } - } - ')' => { - let catfrom = try!( - self.pos_last(false, |x| x.paren() || x.bar())); - try!(self.concat(catfrom)); - - let altfrom = try!(self.pos_last(false, |x| x.paren())); - // Before we smush the alternates together and pop off the - // left paren, let's grab the old flags and see if we - // need a capture. - let (cap, cap_name, oldflags) = { - let paren = &self.stack[altfrom-1]; - (paren.capture(), paren.capture_name(), paren.flags()) - }; - try!(self.alternate(altfrom)); - self.flags = oldflags; - - // If this was a capture, pop what we just pushed in - // alternate and make it a capture. - if cap.is_some() { - let ast = try!(self.pop_ast()); - self.push(Capture(cap.unwrap(), cap_name, Box::new(ast))); - } - } - '|' => { - let catfrom = try!( - self.pos_last(true, |x| x.paren() || x.bar())); - try!(self.concat(catfrom)); - - self.stack.push(Bar); - } - _ => try!(self.push_literal(c)), - } - if !self.next_char() { - break - } - } - - // Try to improve error handling. At this point, there should be - // no remaining open parens. 
- if self.stack.iter().any(|x| x.paren()) { - return self.err("Unclosed parenthesis.") - } - let catfrom = try!(self.pos_last(true, |x| x.bar())); - try!(self.concat(catfrom)); - try!(self.alternate(0)); - - assert!(self.stack.len() == 1); - self.pop_ast() - } - - fn noteof(&mut self, expected: &str) -> Result<(), Error> { - match self.next_char() { - true => Ok(()), - false => { - self.err(&format!("Expected {:?} but got EOF.", expected)) - } - } - } - - fn expect(&mut self, expected: char) -> Result<(), Error> { - match self.next_char() { - true if self.cur() == expected => Ok(()), - true => self.err(&format!("Expected '{}' but got '{}'.", - expected, self.cur())), - false => { - self.err(&format!("Expected '{}' but got EOF.", - expected)) - } - } - } - - fn next_char(&mut self) -> bool { - self.chari += 1; - self.chari < self.chars.len() - } - - fn pop_ast(&mut self) -> Result { - match self.stack.pop().unwrap().unwrap() { - Err(e) => Err(e), - Ok(ast) => Ok(ast), - } - } - - fn push(&mut self, ast: Ast) { - self.stack.push(Expr(ast)) - } - - fn push_repeater(&mut self, c: char) -> Result<(), Error> { - if self.stack.len() == 0 { - return self.err( - "A repeat operator must be preceded by a valid expression.") - } - let rep: Repeater = match c { - '?' 
=> ZeroOne, '*' => ZeroMore, '+' => OneMore, - _ => panic!("Not a valid repeater operator."), - }; - - match self.peek(1) { - Some('*') | Some('+') => - return self.err( - "Double repeat operators are not supported."), - _ => {}, - } - let ast = match self.stack.pop().unwrap() { // checked empty stack ^^ - Paren(_, _, _) | Bar | Expr(Nothing) | Expr(Rep(_, _, _)) => - return self.err("A repreat operator must be preceded by a \ - valid expression."), - Expr(Begin(_)) | Expr(End(_)) | Expr(WordBoundary(_)) => - return self.err( - "Repeat arguments cannot be empty width assertions."), - Expr(ast) => ast, - }; - let greed = try!(self.get_next_greedy()); - self.push(Rep(Box::new(ast), rep, greed)); - Ok(()) - } - - fn push_literal(&mut self, c: char) -> Result<(), Error> { - let flags = self.flags; - match c { - '.' => { - self.push(Dot(flags)) - } - '^' => { - self.push(Begin(flags)) - } - '$' => { - self.push(End(flags)) - } - _ => { - self.push(Literal(c, flags)) - } - } - Ok(()) - } - - // Parses all forms of character classes. - // Assumes that '[' is the current character. - fn parse_class(&mut self) -> Result<(), Error> { - let negated = - if self.peek_is(1, '^') { - try!(self.expect('^')); - true - } else { - false - }; - let mut ranges: Vec<(char, char)> = vec!(); - - while self.peek_is(1, '-') { - try!(self.expect('-')); - ranges.push(('-', '-')) - } - loop { - try!(self.noteof("a closing ']' or a non-empty character class)")); - let mut c = self.cur(); - match c { - '[' => - match self.try_parse_ascii() { - Some(AstClass(mut more_ranges, flags)) => { - more_ranges = combine_ranges(more_ranges); - if flags & FLAG_NEGATED > 0 { - more_ranges = invert_ranges(more_ranges); - } - ranges.extend(more_ranges); - continue - } - Some(ast) => - panic!("Expected Class AST but got '{:?}'", ast), - // Just drop down and try to add as a regular character. 
- None => {}, - }, - '\\' => { - match try!(self.parse_escape()) { - AstClass(mut more_ranges, flags) => { - more_ranges = combine_ranges(more_ranges); - if flags & FLAG_NEGATED > 0 { - more_ranges = invert_ranges(more_ranges); - } - ranges.extend(more_ranges); - continue - } - Literal(c2, _) => c = c2, // process below - Begin(_) | End(_) | WordBoundary(_) => - return self.err( - "\\A, \\z, \\b and \\B are not valid escape \ - sequences inside a character class."), - ast => panic!("Unexpected AST item '{:?}'", ast), - } - } - ']' if ranges.len() > 0 => { - if self.flags & FLAG_NOCASE > 0 { - ranges = case_fold_and_combine_ranges(ranges) - } else { - ranges = combine_ranges(ranges); - } - if negated { - ranges = invert_ranges(ranges); - } - let flags = self.flags & FLAG_NOCASE; - self.push(AstClass(ranges, flags)); - return Ok(()) - } - _ => {} - } - - if self.peek_is(1, '-') && !self.peek_is(2, ']') { - try!(self.expect('-')); - // The regex can't end here. - try!(self.noteof("not a ']'")); - // End the range with a single character or character escape. - let mut c2 = self.cur(); - if c2 == '\\' { - match try!(self.parse_escape()) { - Literal(c3, _) => c2 = c3, // allow literal escapes below - ast => return self.err(&format!( - "Expected a literal, but got {:?}.", ast)), - } - } - if c2 < c { - return self.err(&format!( - "Invalid character class range '{}-{}'", c, c2)) - } - ranges.push((c, self.cur())) - } else { - ranges.push((c, c)) - } - } - } - - // Tries to parse an ASCII character class of the form [:name:]. - // If successful, returns an AST character class corresponding to name - // and moves the parser to the final ']' character. - // If unsuccessful, no state is changed and None is returned. - // Assumes that '[' is the current character. 
- fn try_parse_ascii(&mut self) -> Option { - if !self.peek_is(1, ':') { - return None - } - let closer = - match self.pos(']') { - Some(i) => i, - None => return None, - }; - if self.chars[closer-1] != ':' { - return None - } - if closer - self.chari <= 3 { - return None - } - let mut name_start = self.chari + 2; - let negated = - if self.peek_is(2, '^') { - name_start += 1; - FLAG_NEGATED - } else { - FLAG_EMPTY - }; - let name = self.slice(name_start, closer - 1); - match find_class(ASCII_CLASSES, &name) { - None => None, - Some(ranges) => { - self.chari = closer; - let flags = negated | (self.flags & FLAG_NOCASE); - Some(AstClass(combine_ranges(ranges), flags)) - } - } - } - - // Parses counted repetition. Supports: - // {n}, {n,}, {n,m}, {n}?, {n,}? and {n,m}? - // Assumes that '{' is the current character. - // Returns either an error or moves the parser to the final '}' character. - // (Or the '?' character if not greedy.) - fn parse_counted(&mut self) -> Result<(), Error> { - // Scan until the closing '}' and grab the stuff in {}. - let start = self.chari; - let closer = - match self.pos('}') { - Some(i) => i, - None => { - return self.err(&format!("No closing brace for counted \ - repetition starting at position \ - {}.", start)) - } - }; - self.chari = closer; - let greed = try!(self.get_next_greedy()); - let inner = self.chars[(start + 1)..closer].iter().cloned().collect::(); - - // Parse the min and max values from the regex. 
- let (mut min, mut max): (usize, Option); - if !inner.contains(",") { - min = try!(self.parse_usize(&inner)); - max = Some(min); - } else { - let pieces: Vec<&str> = inner.splitn(2, ',').collect(); - let (smin, smax) = (pieces[0], pieces[1]); - if smin.len() == 0 { - return self.err("Max repetitions cannot be specified \ - without min repetitions.") - } - min = try!(self.parse_usize(smin)); - max = - if smax.len() == 0 { - None - } else { - Some(try!(self.parse_usize(smax))) - }; - } - - // Do some bounds checking and make sure max >= min. - if min > MAX_REPEAT { - return self.err(&format!( - "{} exceeds maximum allowed repetitions ({})", - min, MAX_REPEAT)); - } - if max.is_some() { - let m = max.unwrap(); - if m > MAX_REPEAT { - return self.err(&format!( - "{} exceeds maximum allowed repetitions ({})", - m, MAX_REPEAT)); - } - if m < min { - return self.err(&format!( - "Max repetitions ({}) cannot be smaller than min \ - repetitions ({}).", m, min)); - } - } - - // Now manipulate the AST be repeating elements. - if max.is_none() { - // Require N copies of what's on the stack and then repeat it. - let ast = try!(self.pop_ast()); - for _ in 0..min { - self.push(ast.clone()) - } - self.push(Rep(Box::new(ast), ZeroMore, greed)); - } else { - // Require N copies of what's on the stack and then repeat it - // up to M times optionally. - let ast = try!(self.pop_ast()); - for _ in 0..min { - self.push(ast.clone()) - } - if let Some(max) = max { - for _ in min..max { - self.push(Rep(Box::new(ast.clone()), ZeroOne, greed)) - } - } - // It's possible that we popped something off the stack but - // never put anything back on it. To keep things simple, add - // a no-op expression. - if min == 0 && (max.is_none() || max == Some(0)) { - self.push(Nothing) - } - } - Ok(()) - } - - // Parses all escape sequences. - // Assumes that '\' is the current character. 
- fn parse_escape(&mut self) -> Result { - try!(self.noteof("an escape sequence following a '\\'")); - - let c = self.cur(); - if is_punct(c) { - return Ok(Literal(c, FLAG_EMPTY)) - } - match c { - 'a' => Ok(Literal('\x07', FLAG_EMPTY)), - 'f' => Ok(Literal('\x0C', FLAG_EMPTY)), - 't' => Ok(Literal('\t', FLAG_EMPTY)), - 'n' => Ok(Literal('\n', FLAG_EMPTY)), - 'r' => Ok(Literal('\r', FLAG_EMPTY)), - 'v' => Ok(Literal('\x0B', FLAG_EMPTY)), - 'A' => Ok(Begin(FLAG_EMPTY)), - 'z' => Ok(End(FLAG_EMPTY)), - 'b' => Ok(WordBoundary(FLAG_EMPTY)), - 'B' => Ok(WordBoundary(FLAG_NEGATED)), - '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7' => Ok(try!(self.parse_octal())), - 'x' => Ok(try!(self.parse_hex())), - 'p' | 'P' => Ok(try!(self.parse_unicode_name())), - 'd' | 'D' | 's' | 'S' | 'w' | 'W' => { - let ranges = perl_unicode_class(c); - let mut flags = self.flags & FLAG_NOCASE; - if c.is_uppercase() { flags |= FLAG_NEGATED } - Ok(AstClass(ranges, flags)) - } - _ => { - self.err(&format!("Invalid escape sequence '\\\\{}'", c)) - } - } - } - - // Parses a Unicode character class name, either of the form \pF where - // F is a one letter Unicode class name or of the form \p{name} where - // name is the Unicode class name. - // Assumes that \p or \P has been read (and 'p' or 'P' is the current - // character). 
- fn parse_unicode_name(&mut self) -> Result { - let negated = if self.cur() == 'P' { FLAG_NEGATED } else { FLAG_EMPTY }; - let mut name: String; - if self.peek_is(1, '{') { - try!(self.expect('{')); - let closer = - match self.pos('}') { - Some(i) => i, - None => return self.err(&format!( - "Missing '}}' for unclosed '{{' at position {}", - self.chari)), - }; - if closer - self.chari + 1 == 0 { - return self.err("No Unicode class name found.") - } - name = self.slice(self.chari + 1, closer); - self.chari = closer; - } else { - if self.chari + 1 >= self.chars.len() { - return self.err("No single letter Unicode class name found.") - } - name = self.slice(self.chari + 1, self.chari + 2); - self.chari += 1; - } - match find_class(UNICODE_CLASSES, &name) { - None => { - return self.err(&format!("Could not find Unicode class '{}'", - name)) - } - Some(ranges) => { - Ok(AstClass(ranges, negated | (self.flags & FLAG_NOCASE))) - } - } - } - - // Parses an octal number, up to 3 digits. - // Assumes that \n has been read, where n is the first digit. - fn parse_octal(&mut self) -> Result { - let start = self.chari; - let mut end = start + 1; - let (d2, d3) = (self.peek(1), self.peek(2)); - if d2 >= Some('0') && d2 <= Some('7') { - try!(self.noteof("expected octal character in [0-7]")); - end += 1; - if d3 >= Some('0') && d3 <= Some('7') { - try!(self.noteof("expected octal character in [0-7]")); - end += 1; - } - } - match from_str_radix_pos_integer(&self.slice(start, end), 8) { - Ok(n) => Ok(Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)), - Err(err) => self.err(&err), - } - } - - // Parse a hex number. Either exactly two digits or anything in {}. - // Assumes that \x has been read. 
- fn parse_hex(&mut self) -> Result { - if !self.peek_is(1, '{') { - try!(self.expect('{')); - return self.parse_hex_two() - } - let start = self.chari + 2; - let closer = - match self.pos('}') { - None => { - return self.err(&format!("Missing '}}' for unclosed \ - '{{' at position {}", start)) - } - Some(i) => i, - }; - self.chari = closer; - self.parse_hex_digits(&self.slice(start, closer)) - } - - // Parses a two-digit hex number. - // Assumes that \xn has been read, where n is the first digit and is the - // current character. - // After return, parser will point at the second digit. - fn parse_hex_two(&mut self) -> Result { - let (start, end) = (self.chari, self.chari + 2); - let bad = self.slice(start - 2, self.chars.len()); - try!(self.noteof(&format!("Invalid hex escape sequence '{}'", bad))); - self.parse_hex_digits(&self.slice(start, end)) - } - - // Parses `s` as a hexadecimal number. - fn parse_hex_digits(&self, s: &str) -> Result { - match from_str_radix_pos_integer(s, 16) { - Ok(n) => Ok(Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)), - Err(err) => self.err(&err), - } - } - - // Parses a named capture. - // Assumes that '(?P<' has been consumed and that the current character - // is '<'. - // When done, parser will be at the closing '>' character. 
- fn parse_named_capture(&mut self) -> Result<(), Error> { - try!(self.noteof("a capture name")); - let closer = - match self.pos('>') { - Some(i) => i, - None => return self.err("Capture name must end with '>'."), - }; - if closer - self.chari == 0 { - return self.err("Capture names must have at least 1 character.") - } - let name = self.slice(self.chari, closer); - if !name.chars().all(is_valid_cap) { - return self.err( - "Capture names can only have underscores, letters and digits.") - } - if self.names.contains(&name) { - return self.err(&format!("Duplicate capture group name '{}'.", name)) - } - self.names.push(name.clone()); - self.chari = closer; - self.caps += 1; - self.stack.push(Paren(self.flags, self.caps, name)); - Ok(()) - } - - // Parses non-capture groups and options. - // Assumes that '(?' has already been consumed and '?' is the current - // character. - fn parse_group_opts(&mut self) -> Result<(), Error> { - if self.peek_is(1, 'P') && self.peek_is(2, '<') { - try!(self.expect('P')); try!(self.expect('<')); - return self.parse_named_capture() - } - let start = self.chari; - let mut flags = self.flags; - let mut sign = 1; - let mut saw_flag = false; - loop { - try!(self.noteof("expected non-empty set of flags or closing ')'")); - match self.cur() { - 'i' => { flags = flags | FLAG_NOCASE; saw_flag = true}, - 'm' => { flags = flags | FLAG_MULTI; saw_flag = true}, - 's' => { flags = flags | FLAG_DOTNL; saw_flag = true}, - 'U' => { flags = flags | FLAG_SWAP_GREED; saw_flag = true}, - '-' => { - if sign < 0 { - return self.err(&format!( - "Cannot negate flags twice in '{}'.", - self.slice(start, self.chari + 1))) - } - sign = -1; - saw_flag = false; - flags = flags ^ flags; - } - ':' | ')' => { - if sign < 0 { - if !saw_flag { - return self.err(&format!( - "A valid flag does not follow negation in '{}'", - self.slice(start, self.chari + 1))) - } - flags = flags ^ flags; - } - if self.cur() == ':' { - // Save the old flags with the opening paren. 
- self.stack.push(Paren(self.flags, 0, "".to_string())); - } - self.flags = flags; - return Ok(()) - } - _ => return self.err(&format!( - "Unrecognized flag '{}'.", self.cur())), - } - } - } - - // Peeks at the next character and returns whether it's ungreedy or not. - // If it is, then the next character is consumed. - fn get_next_greedy(&mut self) -> Result { - Ok(if self.peek_is(1, '?') { - try!(self.expect('?')); - Ungreedy - } else { - Greedy - }.swap(self.flags & FLAG_SWAP_GREED > 0)) - } - - // Searches the stack (starting at the top) until it finds an expression - // for which `pred` returns true. The index of that expression in the - // stack is returned. - // If there's no match, then one of two things happens depending on the - // values of `allow_start`. When it's true, then `0` will be returned. - // Otherwise, an error will be returned. - // Generally, `allow_start` is only true when you're *not* expecting an - // opening parenthesis. - fn pos_last

(&self, allow_start: bool, pred: P) -> Result where - P: FnMut(&BuildAst) -> bool, - { - let from = match self.stack.iter().rev().position(pred) { - Some(i) => i, - None => { - if allow_start { - self.stack.len() - } else { - return self.err("No matching opening parenthesis.") - } - } - }; - // Adjust index since 'from' is for the reversed stack. - // Also, don't include the '(' or '|'. - Ok(self.stack.len() - from) - } - - // concat starts at `from` in the parser's stack and concatenates all - // expressions up to the top of the stack. The resulting concatenation is - // then pushed on to the stack. - // Usually `from` corresponds to the position of an opening parenthesis, - // a '|' (alternation) or the start of the entire expression. - fn concat(&mut self, from: usize) -> Result<(), Error> { - let ast = try!(self.build_from(from, concat_flatten)); - self.push(ast); - Ok(()) - } - - // concat starts at `from` in the parser's stack and alternates all - // expressions up to the top of the stack. The resulting alternation is - // then pushed on to the stack. - // Usually `from` corresponds to the position of an opening parenthesis - // or the start of the entire expression. - // This will also drop any opening parens or alternation bars found in - // the intermediate AST. - fn alternate(&mut self, mut from: usize) -> Result<(), Error> { - // Unlike in the concatenation case, we want 'build_from' to continue - // all the way to the opening left paren (so it will be popped off and - // thrown away). But be careful with overflow---we can't count on the - // open paren to be there. - if from > 0 { from = from - 1} - let ast = try!(self.build_from(from, |l,r| Alt(Box::new(l), Box::new(r)))); - self.push(ast); - Ok(()) - } - - // build_from combines all AST elements starting at 'from' in the - // parser's stack using 'mk' to combine them. If any such element is not an - // AST then it is popped off the stack and ignored. 
- fn build_from(&mut self, from: usize, mut mk: F) -> Result where - F: FnMut(Ast, Ast) -> Ast, - { - if from >= self.stack.len() { - return self.err("Empty group or alternate not allowed.") - } - - let mut combined = try!(self.pop_ast()); - let mut i = self.stack.len(); - while i > from { - i = i - 1; - match self.stack.pop().unwrap() { - Expr(x) => combined = mk(x, combined), - _ => {}, - } - } - Ok(combined) - } - - fn parse_usize(&self, s: &str) -> Result { - match s.parse::() { - Ok(i) => Ok(i), - Err(_) => { - self.err(&format!("Expected an unsigned integer but got '{}'.", - s)) - } - } - } - - fn char_from_u32(&self, n: u32) -> Result { - match char::from_u32(n) { - Some(c) => Ok(c), - None => { - self.err(&format!("Could not decode '{}' to unicode \ - character.", n)) - } - } - } - - fn pos(&self, c: char) -> Option { - self.chars.iter() - .skip(self.chari).position(|&c2| c2 == c).map(|i| self.chari + i) - } - - fn err(&self, msg: &str) -> Result { - Err(Error { - pos: self.chari, - msg: msg.to_string(), - }) - } - - fn peek(&self, offset: usize) -> Option { - if self.chari + offset >= self.chars.len() { - return None - } - Some(self.chars[self.chari + offset]) - } - - fn peek_is(&self, offset: usize, is: char) -> bool { - self.peek(offset) == Some(is) - } - - fn cur(&self) -> char { - self.chars[self.chari] - } - - fn slice(&self, start: usize, end: usize) -> String { - self.chars[start..end].iter().cloned().collect() - } -} - -// Given an unordered collection of character ranges, combine_ranges returns -// an ordered sequence of character ranges where no two ranges overlap. They -// are ordered from least to greatest (using start position). -fn combine_ranges(mut unordered: Vec<(char, char)>) -> Vec<(char, char)> { - // Returns true iff the two character classes overlap or share a boundary. - // e.g., ('a', 'g') and ('h', 'm') would return true. 
- fn should_merge((a, b): (char, char), (x, y): (char, char)) -> bool { - cmp::max(a, x) <= inc_char(cmp::min(b, y)) - } - - // This is currently O(n^2), but I think with sufficient cleverness, - // it can be reduced to O(n) **if necessary**. - unordered.sort(); - let mut ordered: Vec<(char, char)> = Vec::with_capacity(unordered.len()); - for (us, ue) in unordered.into_iter() { - let (mut us, mut ue) = (us, ue); - assert!(us <= ue); - let mut which: Option = None; - for (i, &(os, oe)) in ordered.iter().enumerate() { - if should_merge((us, ue), (os, oe)) { - us = cmp::min(us, os); - ue = cmp::max(ue, oe); - which = Some(i); - break - } - } - match which { - None => ordered.push((us, ue)), - Some(i) => ordered[i] = (us, ue), - } - } - ordered.sort(); - ordered -} - -// FIXME: Is there a clever way to do this by considering ranges rather than individual chars? -// E.g. binary search for overlap with entries in unicode::case_folding::C_plus_S_table -fn case_fold_and_combine_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)> { - if ranges.is_empty() { - return ranges - } - let mut chars: Vec = ranges - .into_iter() - .flat_map(|(start, end)| start as u32 .. 
end as u32 + 1) - .filter_map(char::from_u32) - .map(simple_case_fold) - .collect(); - chars.sort(); - chars.dedup(); - let mut chars = chars.into_iter(); - let mut start = chars.next().unwrap(); - let mut end = start; - let mut ranges = Vec::new(); - for c in chars { - if c != inc_char(end) { - ranges.push((start, end)); - start = c; - } - end = c; - } - ranges.push((start, end)); - ranges -} - -fn invert_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)> { - if ranges.is_empty() { return ranges; } - - let mut inv = Vec::with_capacity(ranges.len()); - if ranges[0].0 > '\x00' { - inv.push(('\x00', dec_char(ranges[0].0))); - } - for win in ranges.windows(2) { - let ((_, e1), (s2, _)) = (win[0], win[1]); - inv.push((inc_char(e1), dec_char(s2))); - } - if ranges[ranges.len() - 1].1 < char::MAX { - inv.push((inc_char(ranges[ranges.len() - 1].1), char::MAX)); - } - inv -} - -fn inc_char(c: char) -> char { - assert!(c < char::MAX); - match c { - '\u{D7FF}' => '\u{E000}', - c => char::from_u32(c as u32 + 1).unwrap(), - } -} - -fn dec_char(c: char) -> char { - assert!(c > '\x00'); - match c { - '\u{E000}' => '\u{D7FF}', - c => char::from_u32(c as u32 - 1).unwrap(), - } -} - -// Constructs a Unicode friendly Perl character class from \d, \s or \w -// (or any of their negated forms). Note that this does not handle negation. -fn perl_unicode_class(which: char) -> Vec<(char, char)> { - match which { - 'd' | 'D' => PERLD.to_vec(), - 's' | 'S' => PERLS.to_vec(), - 'w' | 'W' => PERLW.to_vec(), - _ => unreachable!(), - } -} - -// Returns a concatenation of two expressions. This also guarantees that a -// `Cat` expression will never be a direct child of another `Cat` expression. 
-fn concat_flatten(x: Ast, y: Ast) -> Ast { - match (x, y) { - (Cat(mut xs), Cat(ys)) => { xs.extend(ys.into_iter()); Cat(xs) } - (Cat(mut xs), ast) => { xs.push(ast); Cat(xs) } - (ast, Cat(mut xs)) => { xs.insert(0, ast); Cat(xs) } - (ast1, ast2) => Cat(vec!(ast1, ast2)), - } -} - -fn from_str_radix_pos_integer(s: &str, radix: u32) -> Result { - let mut num = 0; - for c in s.chars() { - match c.to_digit(radix) { - None => return Err( - format!("Could not parse '{}' as a hex number.", s)), - Some(n) => { - num *= radix; - num += n; - } - } - } - Ok(num) -} - -pub fn is_punct(c: char) -> bool { - match c { - '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | - '[' | ']' | '{' | '}' | '^' | '$' => true, - _ => false, - } -} - -fn is_valid_cap(c: char) -> bool { - c == '_' || (c >= '0' && c <= '9') - || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') -} - -fn find_class(classes: NamedClasses, name: &str) -> Option> { - match classes.binary_search_by(|&(s, _)| s.cmp(name)) { - Ok(i) => Some(classes[i].1.to_vec()), - Err(_) => None, - } -} - -type Class = &'static [(char, char)]; -type NamedClasses = &'static [(&'static str, Class)]; - -static ASCII_CLASSES: NamedClasses = &[ - // Classes must be in alphabetical order so that bsearch works. 
- // [:alnum:] alphanumeric (== [0-9A-Za-z]) - // [:alpha:] alphabetic (== [A-Za-z]) - // [:ascii:] ASCII (== [\x00-\x7F]) - // [:blank:] blank (== [\t ]) - // [:cntrl:] control (== [\x00-\x1F\x7F]) - // [:digit:] digits (== [0-9]) - // [:graph:] graphical (== [!-~]) - // [:lower:] lower case (== [a-z]) - // [:print:] printable (== [ -~] == [ [:graph:]]) - // [:punct:] punctuation (== [!-/:-@[-`{-~]) - // [:space:] whitespace (== [\t\n\v\f\r ]) - // [:upper:] upper case (== [A-Z]) - // [:word:] word characters (== [0-9A-Za-z_]) - // [:xdigit:] hex digit (== [0-9A-Fa-f]) - // Taken from: http://golang.org/pkg/regex/syntax/ - ("alnum", &ALNUM), - ("alpha", &ALPHA), - ("ascii", &ASCII), - ("blank", &BLANK), - ("cntrl", &CNTRL), - ("digit", &DIGIT), - ("graph", &GRAPH), - ("lower", &LOWER), - ("print", &PRINT), - ("punct", &PUNCT), - ("space", &SPACE), - ("upper", &UPPER), - ("word", &WORD), - ("xdigit", &XDIGIT), -]; - -const ALNUM: Class = &[('0', '9'), ('A', 'Z'), ('a', 'z')]; -const ALPHA: Class = &[('A', 'Z'), ('a', 'z')]; -const ASCII: Class = &[('\x00', '\x7F')]; -const BLANK: Class = &[(' ', ' '), ('\t', '\t')]; -const CNTRL: Class = &[('\x00', '\x1F'), ('\x7F', '\x7F')]; -const DIGIT: Class = &[('0', '9')]; -const GRAPH: Class = &[('!', '~')]; -const LOWER: Class = &[('a', 'z')]; -const PRINT: Class = &[(' ', '~')]; -const PUNCT: Class = &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')]; -const SPACE: Class = &[('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'), - ('\x0C', '\x0C'), ('\r', '\r'), (' ', ' ')]; -const UPPER: Class = &[('A', 'Z')]; -const WORD: Class = &[('0', '9'), ('A', 'Z'), ('a', 'z'), ('_', '_')]; -const XDIGIT: Class = &[('0', '9'), ('A', 'F'), ('a', 'f')]; diff --git a/src/re.rs b/src/re.rs index bfdcf5317a..5bf2c6b645 100644 --- a/src/re.rs +++ b/src/re.rs @@ -17,7 +17,7 @@ use std::str::pattern::{Pattern, Searcher, SearchStep}; use std::str::FromStr; use compile::Program; -use parse; +use syntax; use vm; use vm::CaptureLocs; use 
vm::MatchKind::{self, Exists, Location, Submatches}; @@ -32,7 +32,7 @@ use self::Regex::*; pub fn quote(text: &str) -> String { let mut quoted = String::with_capacity(text.len()); for c in text.chars() { - if parse::is_punct(c) { + if syntax::is_punct(c) { quoted.push('\\') } quoted.push(c); @@ -47,10 +47,54 @@ pub fn quote(text: &str) -> String { /// /// To find submatches, split or replace text, you'll need to compile an /// expression first. -pub fn is_match(regex: &str, text: &str) -> Result { +pub fn is_match(regex: &str, text: &str) -> Result { Regex::new(regex).map(|r| r.is_match(text)) } +/// An error that occurred during parsing or compiling a regular expression. +#[derive(Debug)] +pub enum Error { + /// A syntax error. + Syntax(syntax::Error), + /// The compiled program exceeded the set size limit. + /// The argument is the size limit imposed. + CompiledTooBig(usize), +} + +impl ::std::error::Error for Error { + fn description(&self) -> &str { + match *self { + Error::Syntax(ref err) => err.description(), + Error::CompiledTooBig(_) => "compiled program too big", + } + } + + fn cause(&self) -> Option<&::std::error::Error> { + match *self { + Error::Syntax(ref err) => Some(err), + _ => None, + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Error::Syntax(ref err) => err.fmt(f), + Error::CompiledTooBig(limit) => { + write!(f, "Compiled regex exceeds size limit of {} bytes.", + limit) + } + } + } +} + +impl From for Error { + fn from(err: syntax::Error) -> Error { + Error::Syntax(err) + } +} + /// A compiled regular expression /// /// It is represented as either a sequence of bytecode instructions (dynamic) @@ -159,9 +203,10 @@ impl fmt::Debug for Regex { } } -/// Equality comparison is based on the original string. It is possible that different regular -/// expressions have the same matching behavior, but are still compared unequal. 
For example, -/// `\d+` and `\d\d*` match the same set of strings, but are not considered equal. +/// Equality comparison is based on the original string. It is possible that +/// different regular expressions have the same matching behavior, but are +/// still compared unequal. For example, `\d+` and `\d\d*` match the same set +/// of strings, but are not considered equal. impl PartialEq for Regex { fn eq(&self, other: &Regex) -> bool { self.as_str() == other.as_str() @@ -171,10 +216,10 @@ impl PartialEq for Regex { impl Eq for Regex {} impl FromStr for Regex { - type Err = parse::Error; + type Err = Error; /// Attempts to parse a string into a regular expression - fn from_str(s: &str) -> Result { + fn from_str(s: &str) -> Result { Regex::new(s) } } @@ -184,9 +229,20 @@ impl Regex { /// used repeatedly to search, split or replace text in a string. /// /// If an invalid expression is given, then an error is returned. - pub fn new(re: &str) -> Result { - let ast = try!(parse::parse(re)); - let (prog, names) = Program::new(ast); + pub fn new(re: &str) -> Result { + Regex::with_size_limit(10 * (1 << 20), re) + } + + /// Compiles a dynamic regular expression with the given size limit. + /// + /// The size limit is applied to the size of the *compiled* data structure. + /// If the data structure exceeds the size given, then an error is + /// returned. + /// + /// The default size limit used in `new` is 10MB. + pub fn with_size_limit(size: usize, re: &str) -> Result { + let ast = try!(syntax::Expr::parse(re)); + let (prog, names) = try!(Program::new(ast, size)); Ok(Dynamic(ExDynamic { original: re.to_string(), names: names, @@ -194,6 +250,7 @@ impl Regex { })) } + /// Returns true if and only if the regex matches the string given. /// /// # Example @@ -790,13 +847,19 @@ impl<'t> Captures<'t> { /// To write a literal `$` use `$$`. pub fn expand(&self, text: &str) -> String { // How evil can you get? - // FIXME: Don't use regexes for this. It's completely unnecessary. 
- let re = Regex::new(r"(^|[^$]|\b)\$(\d+|\w+)").unwrap(); + let re = Regex::new(r"(?x) + (?P^|\b|[^$]) # Ignore `$$name`. + \$ + (?P # Match the actual capture name. Can be... + [0-9]+ # A sequence of digits (for indexed captures), or... + | + [_a-zA-Z][_0-9a-zA-Z]* # A name for named captures. + ) + ").unwrap(); let text = re.replace_all(text, |refs: &Captures| -> String { - let pre = refs.at(1).unwrap_or(""); - let name = refs.at(2).unwrap_or(""); - format!("{}{}", pre, - match name.parse::() { + let before = refs.name("before").unwrap_or(""); + let name = refs.name("name").unwrap_or(""); + format!("{}{}", before, match name.parse::() { Err(_) => self.name(name).unwrap_or("").to_string(), Ok(i) => self.at(i).unwrap_or("").to_string(), }) @@ -809,7 +872,7 @@ impl<'t> Captures<'t> { #[inline] pub fn len(&self) -> usize { self.locs.len() / 2 } - /// Returns if there are no captured groups. + /// Returns true if and only if there are no captured groups. #[inline] pub fn is_empty(&self) -> bool { self.len() == 0 } } diff --git a/src/vm.rs b/src/vm.rs index 84da8e5089..7fcd7fded8 100644 --- a/src/vm.rs +++ b/src/vm.rs @@ -36,18 +36,12 @@ use self::MatchKind::*; use self::StepState::*; -use std::cmp::{self, Ordering}; -use std::iter::repeat; +use std::cmp; use std::mem; use compile::Program; -use compile::Inst::{ - Match, OneChar, CharClass, Any, EmptyBegin, EmptyEnd, EmptyWordBoundary, - Save, Jump, Split, -}; -use parse::{FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED}; -use unicode::regex::PERLW; -use unicode::case_folding; +use compile::Inst::*; +use syntax; pub type CaptureLocs = Vec>; @@ -122,18 +116,16 @@ impl<'r, 't> Nfa<'r, 't> { let ninsts = self.prog.insts.len(); let mut clist = Threads::new(self.which, ninsts, ncaps); let mut nlist = Threads::new(self.which, ninsts, ncaps); - - let mut groups = repeat(None).take(ncaps * 2).collect::>(); + let mut groups = vec![None; ncaps * 2]; // Determine if the expression starts with a '^' so we can avoid // 
simulating .*? // Make sure multi-line mode isn't enabled for it, otherwise we can't // drop the initial .*? - let prefix_anchor = - match self.prog.insts[1] { - EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true, - _ => false, - }; + let prefix_anchor = match self.prog.insts[1] { + StartText => true, + _ => false, + }; self.ic = self.start; let mut next_ic = self.chars.set(self.start); @@ -224,30 +216,24 @@ impl<'r, 't> Nfa<'r, 't> { } } } - OneChar(c, flags) => { - if self.char_eq(flags & FLAG_NOCASE > 0, self.chars.prev, c) { + OneChar { c, casei } => { + if self.char_eq(casei, self.chars.prev, c) { self.add(nlist, pc+1, caps); } } - CharClass(ref ranges, flags) => { - if let Some(mut c) = self.chars.prev { - let negate = flags & FLAG_NEGATED > 0; - if flags & FLAG_NOCASE > 0 { - c = simple_case_fold(c); - } - let found = ranges.binary_search_by(|&rc| class_cmp(c, rc)).is_ok(); - if found ^ negate { - self.add(nlist, pc+1, caps); - } + CharClass(ref cls) => { + if self.chars.prev.map(|c| cls.matches(c)).unwrap_or(false) { + self.add(nlist, pc+1, caps); } } - Any(flags) => { - if flags & FLAG_DOTNL > 0 - || !self.char_eq(false, self.chars.prev, '\n') { + Any => self.add(nlist, pc+1, caps), + AnyNoNL => { + if !self.char_eq(false, self.chars.prev, '\n') { self.add(nlist, pc+1, caps) } } - EmptyBegin(_) | EmptyEnd(_) | EmptyWordBoundary(_) + StartLine | EndLine | StartText | EndText + | WordBoundary | NotWordBoundary | Save(_) | Jump(_) | Split(_, _) => {}, } StepContinue @@ -272,28 +258,42 @@ impl<'r, 't> Nfa<'r, 't> { // We make a minor optimization by indicating that the state is "empty" // so that its capture groups are not filled in. 
match self.prog.insts[pc] { - EmptyBegin(flags) => { - let multi = flags & FLAG_MULTI > 0; + StartLine => { nlist.add(pc, groups, true); - if self.chars.is_begin() - || (multi && self.char_is(self.chars.prev, '\n')) { - self.add(nlist, pc + 1, groups) + if self.chars.is_begin() || self.char_is(self.chars.prev, '\n') { + self.add(nlist, pc + 1, groups); + } + } + StartText => { + nlist.add(pc, groups, true); + if self.chars.is_begin() { + self.add(nlist, pc + 1, groups); } } - EmptyEnd(flags) => { - let multi = flags & FLAG_MULTI > 0; + EndLine => { nlist.add(pc, groups, true); - if self.chars.is_end() - || (multi && self.char_is(self.chars.cur, '\n')) { + if self.chars.is_end() || self.char_is(self.chars.cur, '\n') { self.add(nlist, pc + 1, groups) } } - EmptyWordBoundary(flags) => { + EndText => { nlist.add(pc, groups, true); - if self.chars.is_word_boundary() == !(flags & FLAG_NEGATED > 0) { + if self.chars.is_end() { self.add(nlist, pc + 1, groups) } } + WordBoundary => { + nlist.add(pc, groups, true); + if self.chars.is_word_boundary() { + self.add(nlist, pc + 1, groups); + } + } + NotWordBoundary => { + nlist.add(pc, groups, true); + if !self.chars.is_word_boundary() { + self.add(nlist, pc + 1, groups); + } + } Save(slot) => { nlist.add(pc, groups, true); match self.which { @@ -321,7 +321,7 @@ impl<'r, 't> Nfa<'r, 't> { self.add(nlist, x, groups); self.add(nlist, y, groups); } - Match | OneChar(_, _) | CharClass(_, _) | Any(_) => { + Match | OneChar{..} | CharClass(_) | Any | AnyNoNL => { nlist.add(pc, groups, false); } } @@ -334,7 +334,7 @@ impl<'r, 't> Nfa<'r, 't> { match textc { None => false, Some(textc) => { - regc == textc || (casei && simple_case_fold(regc) == simple_case_fold(textc)) + regc == textc || (casei && syntax::simple_case_fold(regc) == syntax::simple_case_fold(textc)) } } } @@ -425,17 +425,22 @@ impl<'t> CharReader<'t> { /// Returns true if and only if the current position is a word boundary. /// (Ignoring the range of the input to search.) 
pub fn is_word_boundary(&self) -> bool { + fn is_word(c: Option) -> bool { + c.map(syntax::is_word_char).unwrap_or(false) + } + if self.is_begin() { - return is_word(self.cur) + return is_word(self.cur); } if self.is_end() { - return is_word(self.prev) + return is_word(self.prev); } (is_word(self.cur) && !is_word(self.prev)) || (is_word(self.prev) && !is_word(self.cur)) } } +#[derive(Clone)] struct Thread { pc: usize, groups: Vec>, @@ -457,12 +462,11 @@ impl Threads { // // See http://research.swtch.com/sparse for the deets. fn new(which: MatchKind, num_insts: usize, ncaps: usize) -> Threads { + let t = Thread { pc: 0, groups: vec![None; ncaps * 2] }; Threads { which: which, - queue: (0..num_insts).map(|_| { - Thread {pc: 0, groups: repeat(None).take(ncaps * 2).collect() } - }).collect(), - sparse: repeat(0).take(num_insts).collect(), + queue: vec![t; num_insts], + sparse: vec![0; num_insts], size: 0, } } @@ -508,58 +512,6 @@ impl Threads { } } -/// Returns true if the character is a word character, according to the -/// (Unicode friendly) Perl character class '\w'. -/// Note that this is only use for testing word boundaries. The actual '\w' -/// is encoded as a CharClass instruction. -pub fn is_word(c: Option) -> bool { - let c = match c { - None => return false, - Some(c) => c, - }; - // Try the common ASCII case before invoking binary search. - match c { - '_' | '0' ... '9' | 'a' ... 'z' | 'A' ... 'Z' => true, - _ => PERLW.binary_search_by(|&(start, end)| { - if c >= start && c <= end { - Ordering::Equal - } else if start > c { - Ordering::Greater - } else { - Ordering::Less - } - }).ok().is_some() - } -} - - -/// Returns the Unicode *simple* case folding of `c`. -/// Uses the mappings with status C + S form Unicode’s `CaseFolding.txt`. -/// This is not as “correct” as full case folding, but preserves the number of code points. 
-pub fn simple_case_fold(c: char) -> char { - match case_folding::C_plus_S_table.binary_search_by(|&(x, _)| x.cmp(&c)) { - Ok(i) => case_folding::C_plus_S_table[i].1, - Err(_) => c - } -} - - -/// Given a character and a single character class range, return an ordering -/// indicating whether the character is less than the start of the range, -/// in the range (inclusive) or greater than the end of the range. -/// -/// This function is meant to be used with a binary search. -#[inline] -fn class_cmp(textc: char, (start, end): (char, char)) -> Ordering { - if textc >= start && textc <= end { - Ordering::Equal - } else if start > textc { - Ordering::Greater - } else { - Ordering::Less - } -} - /// Returns the starting location of `needle` in `haystack`. /// If `needle` is not in `haystack`, then `None` is returned. ///