diff --git a/.gitignore b/.gitignore index fd3afa8a97..ece777a348 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,7 @@ /Cargo.lock /regex_macros/target /regex_macros/Cargo.lock +/regex_syntax/target +/regex_syntax/Cargo.lock +/bench-log .*.swp diff --git a/Cargo.toml b/Cargo.toml index 33bf5ead6c..734c4da39b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,8 +21,16 @@ path = "regex_macros/benches/bench_dynamic.rs" test = false bench = true +[dependencies.regex-syntax] +path = "regex_syntax" +version = "*" + [dev-dependencies] rand = "0.3" [features] pattern = [] + +[profile.bench] +opt-level = 3 +lto = true diff --git a/regex_macros/src/lib.rs b/regex_macros/src/lib.rs index 73e9705801..cd6c8d232d 100644 --- a/regex_macros/src/lib.rs +++ b/regex_macros/src/lib.rs @@ -36,10 +36,7 @@ use rustc::plugin::Registry; use regex::Regex; use regex::native::{ - OneChar, CharClass, Any, Save, Jump, Split, - Match, EmptyBegin, EmptyEnd, EmptyWordBoundary, - Program, Dynamic, ExDynamic, Native, - FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED, + Inst, Program, Dynamic, ExDynamic, Native, simple_case_fold, }; @@ -79,7 +76,9 @@ fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree]) // error is logged in 'parse' with cx.span_err None => return DummyResult::any(sp), }; - let re = match Regex::new(®ex) { + // We use the largest possible size limit because this is happening at + // compile time. We trust the programmer. 
+ let re = match Regex::with_size_limit(::std::usize::MAX, ®ex) { Ok(re) => re, Err(err) => { cx.span_err(sp, &err.to_string()); @@ -121,11 +120,10 @@ impl<'a> NfaGen<'a> { None => cx.expr_none(self.sp), } ); - let prefix_anchor = - match self.prog.insts[1] { - EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true, - _ => false, - }; + let prefix_anchor = match self.prog.insts[1] { + Inst::StartText => true, + _ => false, + }; let init_groups = self.vec_expr(0..num_cap_locs, &mut |cx, _| cx.expr_none(self.sp)); @@ -338,49 +336,55 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| { let nextpc = pc + 1; let body = match *inst { - EmptyBegin(flags) => { - let cond = - if flags & FLAG_MULTI > 0 { - quote_expr!(self.cx, - self.chars.is_begin() - || self.chars.prev == Some('\n') - ) - } else { - quote_expr!(self.cx, self.chars.is_begin()) - }; + Inst::StartLine => { quote_expr!(self.cx, { nlist.add_empty($pc); - if $cond { self.add(nlist, $nextpc, &mut *groups) } + if self.chars.is_begin() || self.chars.prev == Some('\n') { + self.add(nlist, $nextpc, &mut *groups) + } }) } - EmptyEnd(flags) => { - let cond = - if flags & FLAG_MULTI > 0 { - quote_expr!(self.cx, - self.chars.is_end() - || self.chars.cur == Some('\n') - ) - } else { - quote_expr!(self.cx, self.chars.is_end()) - }; + Inst::StartText => { quote_expr!(self.cx, { nlist.add_empty($pc); - if $cond { self.add(nlist, $nextpc, &mut *groups) } + if self.chars.is_begin() { + self.add(nlist, $nextpc, &mut *groups) + } }) } - EmptyWordBoundary(flags) => { - let cond = - if flags & FLAG_NEGATED > 0 { - quote_expr!(self.cx, !self.chars.is_word_boundary()) - } else { - quote_expr!(self.cx, self.chars.is_word_boundary()) - }; + Inst::EndLine => { + quote_expr!(self.cx, { + nlist.add_empty($pc); + if self.chars.is_end() || self.chars.cur == Some('\n') { + self.add(nlist, $nextpc, &mut *groups) + } + }) + } + Inst::EndText => { + 
quote_expr!(self.cx, { + nlist.add_empty($pc); + if self.chars.is_end() { + self.add(nlist, $nextpc, &mut *groups) + } + }) + } + Inst::WordBoundary => { quote_expr!(self.cx, { nlist.add_empty($pc); - if $cond { self.add(nlist, $nextpc, &mut *groups) } + if self.chars.is_word_boundary() { + self.add(nlist, $nextpc, &mut *groups) + } + }) + } + Inst::NotWordBoundary => { + quote_expr!(self.cx, { + nlist.add_empty($pc); + if !self.chars.is_word_boundary() { + self.add(nlist, $nextpc, &mut *groups) + } }) } - Save(slot) => { + Inst::Save(slot) => { let save = quote_expr!(self.cx, { let old = groups[$slot]; groups[$slot] = Some(self.ic); @@ -411,20 +415,20 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, }) } } - Jump(to) => { + Inst::Jump(to) => { quote_expr!(self.cx, { nlist.add_empty($pc); self.add(nlist, $to, &mut *groups); }) } - Split(x, y) => { + Inst::Split(x, y) => { quote_expr!(self.cx, { nlist.add_empty($pc); self.add(nlist, $x, &mut *groups); self.add(nlist, $y, &mut *groups); }) } - // For Match, OneChar, CharClass, Any + // For Match, OneChar, CharClass, Any, AnyNoNL _ => quote_expr!(self.cx, nlist.add($pc, &*groups)), }; self.arm_inst(pc, body) @@ -439,7 +443,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| { let nextpc = pc + 1; let body = match *inst { - Match => { + Inst::Match => { quote_expr!(self.cx, { match self.which { Exists => { @@ -459,8 +463,8 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, } }) } - OneChar(c, flags) => { - if flags & FLAG_NOCASE > 0 { + Inst::OneChar { c, casei } => { + if casei { let upc = simple_case_fold(c); quote_expr!(self.cx, { let upc = self.chars.prev.map(simple_case_fold); @@ -476,45 +480,37 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, }) } } - CharClass(ref ranges, flags) => { - let negate = flags & FLAG_NEGATED > 0; - let casei = flags & FLAG_NOCASE > 0; + Inst::CharClass(ref 
cls) => { + let ranges: Vec<(char, char)> = + cls.iter().map(|r| (r.start, r.end)).collect(); + let mranges = self.match_class(&ranges); let get_char = - if casei { + if cls.is_case_insensitive() { quote_expr!( self.cx, simple_case_fold(self.chars.prev.unwrap())) } else { quote_expr!(self.cx, self.chars.prev.unwrap()) }; - let negcond = - if negate { - quote_expr!(self.cx, !found) - } else { - quote_expr!(self.cx, found) - }; - let mranges = self.match_class(&ranges); quote_expr!(self.cx, { if self.chars.prev.is_some() { let c = $get_char; - let found = $mranges; - if $negcond { + if $mranges { self.add(nlist, $nextpc, caps); } } }) } - Any(flags) => { - if flags & FLAG_DOTNL > 0 { - quote_expr!(self.cx, self.add(nlist, $nextpc, caps)) - } else { - quote_expr!(self.cx, { - if self.chars.prev != Some('\n') { - self.add(nlist, $nextpc, caps) - } - () - }) - } + Inst::Any => { + quote_expr!(self.cx, self.add(nlist, $nextpc, caps)) + } + Inst::AnyNoNL => { + quote_expr!(self.cx, { + if self.chars.prev != Some('\n') { + self.add(nlist, $nextpc, caps); + } + () + }) } // EmptyBegin, EmptyEnd, EmptyWordBoundary, Save, Jump, Split _ => self.empty_block(), diff --git a/regex_macros/tests/tests.rs b/regex_macros/tests/tests.rs index 11670ac8f1..ab1db14a0b 100644 --- a/regex_macros/tests/tests.rs +++ b/regex_macros/tests/tests.rs @@ -203,6 +203,8 @@ replace!(rep_named, replace_all, "w1 w2 w3 w4", "$last $first$space", "w2 w1 w4 w3"); replace!(rep_trim, replace_all, "^[ \t]+|[ \t]+$", " \t trim me\t \t", "", "trim me"); +replace!(rep_number_hypen, replace, r"(.)(.)", "ab", "$1-$2", "a-b"); +replace!(rep_number_underscore, replace, r"(.)(.)", "ab", "$1_$2", "a_b"); macro_rules! noparse( ($name:ident, $re:expr) => ( @@ -219,7 +221,6 @@ macro_rules! 
noparse( noparse!(fail_double_repeat, "a**"); noparse!(fail_no_repeat_arg, "*"); -noparse!(fail_no_repeat_arg_begin, "^*"); noparse!(fail_incomplete_escape, "\\"); noparse!(fail_class_incomplete, "[A-"); noparse!(fail_class_not_closed, "[A"); @@ -235,8 +236,7 @@ noparse!(fail_bad_capture_name, "(?P)"); noparse!(fail_bad_flag, "(?a)a"); noparse!(fail_empty_alt_before, "|a"); noparse!(fail_empty_alt_after, "a|"); -noparse!(fail_counted_big_exact, "a{1001}"); -noparse!(fail_counted_big_min, "a{1001,}"); +noparse!(fail_too_big, "a{10000000}"); noparse!(fail_counted_no_close, "a{1001"); noparse!(fail_unfinished_cap, "(?"); noparse!(fail_unfinished_escape, "\\"); diff --git a/regex_syntax/Cargo.toml b/regex_syntax/Cargo.toml new file mode 100644 index 0000000000..48231a41ec --- /dev/null +++ b/regex_syntax/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "regex-syntax" +version = "0.1.0" +authors = ["The Rust Project Developers"] +license = "MIT/Apache-2.0" +repository = "https://github.com/rust-lang/regex" +documentation = "http://doc.rust-lang.org/regex" +homepage = "https://github.com/rust-lang/regex" +description = "A regular expression parser (RE2 only)." + +[dev-dependencies] +quickcheck = "*" +rand = "*" diff --git a/regex_syntax/src/lib.rs b/regex_syntax/src/lib.rs new file mode 100644 index 0000000000..95eed3f5c3 --- /dev/null +++ b/regex_syntax/src/lib.rs @@ -0,0 +1,1162 @@ +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/*! +This crate provides a regular expression parser and an abstract syntax for +regular expressions. The abstract syntax is defined by the `Expr` type. 
The +concrete syntax is enumerated in the +[`regex`](../regex/index.html#syntax) +crate documentation. + +Note that since this crate is first and foremost an implementation detail for +the `regex` crate, it may experience more frequent breaking changes. It is +exposed as a separate crate so that others may use it to do analysis on regular +expressions or even build their own matching engine. + +# Example: parsing an expression + +Parsing a regular expression can be done with the `Expr::parse` function. + +```rust +use regex_syntax::Expr; + +assert_eq!(Expr::parse(r"ab|yz").unwrap(), Expr::Alternate(vec![ + Expr::Literal { chars: vec!['a', 'b'], casei: false }, + Expr::Literal { chars: vec!['y', 'z'], casei: false }, +])); +``` + +# Example: inspecting an error + +The parser in this crate provides very detailed error values. For example, +if an invalid character class range is given: + +```rust +use regex_syntax::{Expr, ErrorKind}; + +let err = Expr::parse(r"[z-a]").unwrap_err(); +assert_eq!(err.position(), 4); +assert_eq!(err.kind(), &ErrorKind::InvalidClassRange { + start: 'z', + end: 'a', +}); +``` + +Or unbalanced parentheses: + +```rust +use regex_syntax::{Expr, ErrorKind}; + +let err = Expr::parse(r"ab(cd").unwrap_err(); +assert_eq!(err.position(), 2); +assert_eq!(err.kind(), &ErrorKind::UnclosedParen); +``` +*/ + +#![deny(missing_docs)] + +#[cfg(test)] extern crate quickcheck; +#[cfg(test)] extern crate rand; + +mod parser; +mod unicode; + +use std::char; +use std::cmp::{Ordering, max, min}; +use std::fmt; +use std::iter::IntoIterator; +use std::ops::Deref; +use std::slice; +use std::vec; + +use unicode::case_folding; + +use self::Expr::*; +use self::Repeater::*; + +pub use parser::is_punct; + +/// A regular expression abstract syntax tree. +/// +/// An `Expr` represents the abstract syntax of a regular expression. +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum Expr { + /// An empty regex (which never matches any text). 
+ Empty, + /// A sequence of one or more literal characters to be matched. + Literal { + /// The characters. + chars: Vec, + /// Whether to match case insensitively. + casei: bool, + }, + /// Match any character, including new line. + AnyChar, + /// Match any character, excluding new line. + AnyCharNoNL, + /// A character class. + Class(CharClass), + /// Match the start of a line or beginning of input. + StartLine, + /// Match the end of a line or end of input. + EndLine, + /// Match the beginning of input. + StartText, + /// Match the end of input. + EndText, + /// Match a word boundary (word character on one side and a non-word + /// character on the other). + WordBoundary, + /// Match a position that is not a word boundary (word or non-word + /// characters on both sides). + NotWordBoundary, + /// A group, possibly non-capturing. + Group { + /// The expression inside the group. + e: Box, + /// The capture index (starting at `1`) only for capturing groups. + i: Option, + /// The capture name, only for capturing named groups. + name: Option, + }, + /// A repeat operator (`?`, `*`, `+` or `{m,n}`). + Repeat { + /// The expression to be repeated. Limited to literals, `.`, classes + /// or grouped expressions. + e: Box, + /// The type of repeat operator used. + r: Repeater, + /// Whether the repeat is greedy (match the most) or not (match the + /// least). + greedy: bool, + }, + /// A concatenation of expressions. Must be matched one after the other. + /// + /// N.B. A concat expression can only appear at the top-level or + /// immediately inside a group expression. + Concat(Vec), + /// An alternation of expressions. Only one must match. + /// + /// N.B. An alternate expression can only appear at the top-level or + /// immediately inside a group expression. + Alternate(Vec), +} + +type CaptureIndex = Option; + +type CaptureName = Option; + +/// The type of a repeat operator expression. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum Repeater { + /// Match zero or one (`?`).
+ ZeroOrOne, + /// Match zero or more (`*`). + ZeroOrMore, + /// Match one or more (`+`). + OneOrMore, + /// Match for at least `min` and at most `max` (`{m,n}`). + /// + /// When `max` is `None`, there is no upper bound on the number of matches. + Range { + /// Lower bound on the number of matches. + min: u32, + /// Optional upper bound on the number of matches. + max: Option, + }, +} + +/// A character class. +/// +/// A character class has a canonical format that the parser guarantees. Its +/// canonical format is defined by the following invariants: +/// +/// 1. Given any Unicode scalar value, it is matched by *at most* one character +/// range in a canonical character class. +/// 2. Every adjacent character range is separated by at least one Unicode +/// scalar value. +/// 3. Given any pair of character ranges `r1` and `r2`, if +/// `r1.end < r2.start`, then `r1` comes before `r2` in a canonical +/// character class. +/// +/// In sum, any `CharClass` produced by this crate's parser is a sorted +/// sequence of non-overlapping ranges. This makes it possible to test whether +/// a character is matched by a class with a binary search. +/// +/// Additionally, a character class may be marked *case insensitive*. If it's +/// case insensitive, then: +/// +/// 1. Simple case folding has been applied to all ranges. +/// 2. Simple case folding must be applied to a character before testing +/// whether it matches the character class. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct CharClass { + ranges: Vec, + casei: bool, +} + +/// A single inclusive range in a character class. +/// +/// Since range boundaries are defined by Unicode scalar values, the boundaries +/// can never be in the open interval `(0xD7FF, 0xE000)`. However, a range may +/// *cover* codepoints that are not scalar values. +/// +/// Note that this has a few convenient impls on `PartialEq` and `PartialOrd` +/// for testing whether a character is contained inside a given range. 
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)] +pub struct ClassRange { + /// The start character of the range. + /// + /// This must be less than or equal to `end`. + pub start: char, + + /// The end character of the range. + /// + /// This must be greater than or equal to `start`. + pub end: char, +} + +impl Expr { + /// Parses a string into a regular expression syntax tree. + pub fn parse(s: &str) -> Result { + parser::Parser::parse(s).map(|e| e.simplify()) + } + + /// Returns true iff the expression can be repeated by a quantifier. + fn can_repeat(&self) -> bool { + match *self { + Literal{..} + | AnyChar + | AnyCharNoNL + | Class(_) + | StartLine | EndLine | StartText | EndText + | WordBoundary | NotWordBoundary + | Group{..} + => true, + _ => false, + } + } + + fn simplify(self) -> Expr { + fn combine_literals(es: &mut Vec, e: Expr) { + match (es.pop(), e) { + (None, e) => es.push(e), + (Some(Literal { chars: mut chars1, casei: casei1 }), + Literal { chars: chars2, casei: casei2 }) => { + if casei1 == casei2 { + chars1.extend(chars2); + es.push(Literal { chars: chars1, casei: casei1 }); + } else { + es.push(Literal { chars: chars1, casei: casei1 }); + es.push(Literal { chars: chars2, casei: casei2 }); + } + } + (Some(e1), e2) => { + es.push(e1); + es.push(e2); + } + } + } + match self { + Repeat { e, r, greedy } => Repeat { + e: Box::new(e.simplify()), + r: r, + greedy: greedy, + }, + Group { e, i, name } => { + let e = e.simplify(); + if i.is_none() && name.is_none() && e.can_repeat() { + e + } else { + Group { e: Box::new(e), i: i, name: name } + } + } + Concat(es) => { + let mut new_es = Vec::with_capacity(es.len()); + for e in es { + combine_literals(&mut new_es, e.simplify()); + } + if new_es.len() == 1 { + new_es.pop().unwrap() + } else { + Concat(new_es) + } + } + Alternate(es) => Alternate(es.into_iter() + .map(|e| e.simplify()) + .collect()), + e => e, + } + } +} + +impl Deref for CharClass { + type Target = Vec; + fn deref(&self) -> &Vec
{ &self.ranges } +} + +impl IntoIterator for CharClass { + type Item = ClassRange; + type IntoIter = vec::IntoIter; + fn into_iter(self) -> vec::IntoIter { self.ranges.into_iter() } +} + +impl<'a> IntoIterator for &'a CharClass { + type Item = &'a ClassRange; + type IntoIter = slice::Iter<'a, ClassRange>; + fn into_iter(self) -> slice::Iter<'a, ClassRange> { self.iter() } +} + +impl CharClass { + /// Create a new class from an existing set of ranges. + fn new(ranges: Vec) -> CharClass { + CharClass { ranges: ranges, casei: false } + } + + /// Create an empty class. + fn empty() -> CharClass { + CharClass::new(Vec::new()) + } + + /// Returns true if `c` is matched by this character class. + /// + /// If this character class is case insensitive, then simple case folding + /// is applied to `c` before checking for a match. + pub fn matches(&self, mut c: char) -> bool { + if self.is_case_insensitive() { + c = simple_case_fold(c) + } + self.binary_search_by(|range| c.partial_cmp(range).unwrap()).is_ok() + } + + /// Returns true if this character class should be matched case + /// insensitively. + /// + /// When `true`, simple case folding has already been applied to the + /// class. + pub fn is_case_insensitive(&self) -> bool { + self.casei + } + + /// Create a new empty class from this one. + /// + /// Namely, its capacity and case insensitive setting will be the same. + fn to_empty(&self) -> CharClass { + CharClass { ranges: Vec::with_capacity(self.len()), casei: self.casei } + } + + /// Merge two classes and canonicalize them. + #[cfg(test)] + fn merge(mut self, other: CharClass) -> CharClass { + self.ranges.extend(other); + self.canonicalize() + } + + /// Canonicalze any sequence of ranges. + /// + /// This is responsible for enforcing the canonical format invariants + /// as described on the docs for the `CharClass` type. + fn canonicalize(mut self) -> CharClass { + // TODO: Save some cycles here by checking if already canonicalized. 
+ self.ranges.sort(); + let mut ordered = self.to_empty(); // TODO: Do this in place? + for candidate in self { + // If the candidate overlaps with an existing range, then it must + // be the most recent range added because we process the candidates + // in order. + if let Some(or) = ordered.ranges.last_mut() { + if or.overlapping(candidate) { + *or = or.merge(candidate); + continue; + } + } + ordered.ranges.push(candidate); + } + ordered + } + + /// Negates the character class. + /// + /// For all `c` where `c` is a Unicode scalar value, `c` matches `self` + /// if and only if `c` does not match `self.negate()`. + /// + /// Note that this cannot be called on a character class that has had + /// case folding applied to it. (Because case folding turns on a flag + /// and doesn't store every possible matching character. Therefore, + /// its negation is tricky to get right. Turns out, we don't need it + /// anyway!) + fn negate(mut self) -> CharClass { + fn range(s: char, e: char) -> ClassRange { ClassRange::new(s, e) } + + // Never allow negating of a class that has been case folded! + assert!(!self.casei); + + if self.is_empty() { return self; } + self = self.canonicalize(); + let mut inv = self.to_empty(); + if self[0].start > '\x00' { + inv.ranges.push(range('\x00', dec_char(self[0].start))); + } + for win in self.windows(2) { + inv.ranges.push(range(inc_char(win[0].end), + dec_char(win[1].start))); + } + if self[self.len() - 1].end < char::MAX { + inv.ranges.push(range(inc_char(self[self.len() - 1].end), + char::MAX)); + } + inv + } + + /// Apply case folding to this character class. + /// + /// Once a class has been case folded, it cannot be negated. + fn case_fold(self) -> CharClass { + let mut folded = self.to_empty(); + folded.casei = true; + for r in self { + // Applying case folding to a range is expensive because *every* + // character needed to be examined.
Thus, we avoid that drudgery + // if no character in the current range is in our case folding + // table. + if r.needs_case_folding() { + folded.ranges.extend(r.case_fold()); + } else { + folded.ranges.push(r); + } + } + folded.canonicalize() + } +} + +impl ClassRange { + /// Create a new class range. + /// + /// If `end < start`, then the two values are swapped so that + /// the invariant `start <= end` is preserved. + fn new(start: char, end: char) -> ClassRange { + if start <= end { + ClassRange { start: start, end: end } + } else { + ClassRange { start: end, end: start } + } + } + + /// Create a range of one character. + fn one(c: char) -> ClassRange { + ClassRange { start: c, end: c } + } + + /// Returns true if and only if the two ranges are overlapping. Note that + /// since ranges are inclusive, `a-c` and `d-f` are overlapping! + fn overlapping(self, other: ClassRange) -> bool { + max(self.start, other.start) <= inc_char(min(self.end, other.end)) + } + + /// Creates a new range representing the union of `self` and `other. + fn merge(self, other: ClassRange) -> ClassRange { + ClassRange { + start: min(self.start, other.start), + end: max(self.end, other.end), + } + } + + /// Returns true if and only if this range contains a character that is + /// in the case folding table. + fn needs_case_folding(self) -> bool { + case_folding::C_plus_S_table + .binary_search_by(|&(c, _)| self.partial_cmp(&c).unwrap()).is_ok() + } + + /// Apply case folding to this range. + /// + /// Since case folding might add characters such that the range is no + /// longer contiguous, this returns multiple class ranges. They are in + /// canonical order. + fn case_fold(self) -> Vec { + let (s, e) = (self.start as u32, self.end as u32 + 1); + let folded = (s..e).filter_map(char::from_u32).map(simple_case_fold); + ClassRange::ranges(folded) + } + + /// Turns a non-empty sequence of sorted characters into a sequence of + /// class ranges in canonical format/order. 
+ fn ranges>(mut chars: I) -> Vec { + let mut ranges = Vec::with_capacity(100); + let mut start = chars.next().expect("non-empty char iterator"); + let mut end = start; + for c in chars { + if c != inc_char(end) { + ranges.push(ClassRange::new(start, end)); + start = c; + } + end = c; + } + ranges.push(ClassRange::new(start, end)); + ranges + } +} + +impl PartialEq for ClassRange { + #[inline] + fn eq(&self, other: &char) -> bool { + self.start <= *other && *other <= self.end + } +} + +impl PartialEq for char { + #[inline] + fn eq(&self, other: &ClassRange) -> bool { + other.eq(self) + } +} + +impl PartialOrd for ClassRange { + #[inline] + fn partial_cmp(&self, other: &char) -> Option { + Some(if self == other { + Ordering::Equal + } else if *other > self.end { + Ordering::Greater + } else { + Ordering::Less + }) + } +} + +impl PartialOrd for char { + #[inline] + fn partial_cmp(&self, other: &ClassRange) -> Option { + other.partial_cmp(self).map(|o| o.reverse()) + } +} + +/// This implementation of `Display` will write a regular expression from the +/// syntax tree. It does not write the original string parsed. +// TODO(burntsushi): Write tests for the regex writer. +impl fmt::Display for Expr { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Empty => write!(f, ""), + Literal { ref chars, casei } => { + if casei { try!(write!(f, "(?i:")); } + for &c in chars { + try!(write!(f, "{}", quote_char(c))); + } + if casei { try!(write!(f, ")")); } + Ok(()) + } + AnyChar => write!(f, "(?s:.)"), + AnyCharNoNL => write!(f, "."), + Class(ref cls) => write!(f, "{}", cls), + StartLine => write!(f, "(?m:^)"), + EndLine => write!(f, "(?m:$)"), + StartText => write!(f, r"^"), + EndText => write!(f, r"$"), + WordBoundary => write!(f, r"\b"), + NotWordBoundary => write!(f, r"\B"), + Group { ref e, i: None, name: None } => write!(f, "(?:{})", e), + Group { ref e, name: None, .. } => write!(f, "({})", e), + Group { ref e, name: Some(ref n), .. 
} => { + write!(f, "(?P<{}>{})", n, e) + } + Repeat { ref e, r, greedy } => { + match &**e { + &Literal { ref chars, .. } if chars.len() > 1 => { + try!(write!(f, "(?:{}){}", e, r)) + } + _ => try!(write!(f, "{}{}", e, r)), + } + if !greedy { try!(write!(f, "?")); } + Ok(()) + } + Concat(ref es) => { + for e in es { + try!(write!(f, "{}", e)); + } + Ok(()) + } + Alternate(ref es) => { + for (i, e) in es.iter().enumerate() { + if i > 0 { try!(write!(f, "|")); } + try!(write!(f, "{}", e)); + } + Ok(()) + } + } + } +} + +impl fmt::Display for Repeater { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + ZeroOrOne => write!(f, "?"), + ZeroOrMore => write!(f, "*"), + OneOrMore => write!(f, "+"), + Range { min: s, max: None } => write!(f, "{{{},}}", s), + Range { min: s, max: Some(e) } if s == e => write!(f, "{{{}}}", s), + Range { min: s, max: Some(e) } => write!(f, "{{{}, {}}}", s, e), + } + } +} + +impl fmt::Display for CharClass { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if self.casei { + try!(write!(f, "(?i:")); + } + try!(write!(f, "[")); + for range in self.iter() { + try!(write!(f, "{}", range)); + } + try!(write!(f, "]")); + if self.casei { + try!(write!(f, ")")); + } + Ok(()) + } +} + +impl fmt::Display for ClassRange { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}-{}", quote_char(self.start), quote_char(self.end)) + } +} + +/// An alias for computations that can return a `Error`. +pub type Result = ::std::result::Result; + +/// A parse error. +/// +/// This includes details about the specific type of error and a rough +/// approximation of where it occurred. +#[derive(Clone, Debug, PartialEq)] +pub struct Error { + pos: usize, + surround: String, + kind: ErrorKind, +} + +/// The specific type of parse error that can occur. +#[derive(Clone, Debug, PartialEq)] +pub enum ErrorKind { + /// A negation symbol is used twice in flag settings. + /// e.g., `(?-i-s)`. 
+ DoubleFlagNegation, + /// The same capture name was used more than once. + /// e.g., `(?P<a>.)(?P<a>.)`. + DuplicateCaptureName(String), + /// An alternate is empty. e.g., `(|a)`. + EmptyAlternate, + /// A capture group name is empty. e.g., `(?P<>a)`. + EmptyCaptureName, + /// A negation symbol was not followed by any flags. e.g., `(?i-)`. + EmptyFlagNegation, + /// A group is empty. e.g., `()`. + EmptyGroup, + /// An invalid number was used in a counted repetition. e.g., `a{b}`. + InvalidBase10(String), + /// An invalid hexadecimal number was used in an escape sequence. + /// e.g., `\xAG`. + InvalidBase16(String), + /// An invalid capture name was used. e.g., `(?P<0a>b)`. + InvalidCaptureName(String), + /// An invalid class range was given. Specifically, when the start of the + /// range is greater than the end. e.g., `[z-a]`. + InvalidClassRange { + /// The first character specified in the range. + start: char, + /// The second character specified in the range. + end: char, + }, + /// An escape sequence was used in a character class where it is not + /// allowed. e.g., `[a-\pN]` or `[\A]`. + InvalidClassEscape(Expr), + /// An invalid counted repetition min/max was given. e.g., `a{2,1}`. + InvalidRepeatRange { + /// The first number specified in the repetition. + min: u32, + /// The second number specified in the repetition. + max: u32, + }, + /// An invalid Unicode scalar value was used in a long hexadecimal + /// sequence. e.g., `\x{D800}`. + InvalidScalarValue(u32), + /// An empty counted repetition operator. e.g., `a{}`. + MissingBase10, + /// A repetition operator was not applied to an expression. e.g., `*`. + RepeaterExpectsExpr, + /// A repetition operator was applied to an expression that cannot be + /// repeated. e.g., `a+*` or `a|*`. + RepeaterUnexpectedExpr(Expr), + /// A capture group name that is never closed. e.g., `(?P usize { + self.pos + } + + /// Returns the type of the regex parse error.
+ pub fn kind(&self) -> &ErrorKind { + &self.kind + } +} + +impl ErrorKind { + fn description(&self) -> &str { + use ErrorKind::*; + match *self { + DoubleFlagNegation => "double flag negation", + DuplicateCaptureName(_) => "duplicate capture name", + EmptyAlternate => "empty alternate", + EmptyCaptureName => "empty capture name", + EmptyFlagNegation => "flag negation without any flags", + EmptyGroup => "empty group (e.g., '()')", + InvalidBase10(_) => "invalid base 10 number", + InvalidBase16(_) => "invalid base 16 number", + InvalidCaptureName(_) => "invalid capture name", + InvalidClassRange{..} => "invalid character class range", + InvalidClassEscape(_) => "invalid escape sequence in class", + InvalidRepeatRange{..} => "invalid counted repetition range", + InvalidScalarValue(_) => "invalid Unicode scalar value", + MissingBase10 => "missing count in repetition operator", + RepeaterExpectsExpr => "repetition operator missing expression", + RepeaterUnexpectedExpr(_) => "expression cannot be repeated", + UnclosedCaptureName(_) => "unclosed capture group name", + UnclosedHex => "unclosed hexadecimal literal", + UnclosedParen => "unclosed parenthesis", + UnclosedRepeat => "unclosed counted repetition operator", + UnclosedUnicodeName => "unclosed Unicode class literal", + UnexpectedClassEof => "unexpected EOF in character class", + UnexpectedEscapeEof => "unexpected EOF in escape sequence", + UnexpectedFlagEof => "unexpected EOF in flags", + UnexpectedTwoDigitHexEof => "unexpected EOF in hex literal", + UnopenedParen => "unopened parenthesis", + UnrecognizedEscape(_) => "unrecognized escape sequence", + UnrecognizedFlag(_) => "unrecognized flag", + UnrecognizedUnicodeClass(_) => "unrecognized Unicode class name", + } + } +} + +impl ::std::error::Error for Error { + fn description(&self) -> &str { + self.kind.description() + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Error parsing regex near '{}' at 
character offset {}: {}", + self.surround, self.pos, self.kind) + } +} + +impl fmt::Display for ErrorKind { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use ErrorKind::*; + match *self { + DoubleFlagNegation => + write!(f, "Only one negation symbol is allowed in flags."), + DuplicateCaptureName(ref s) => + write!(f, "Capture name '{}' is used more than once.", s), + EmptyAlternate => + write!(f, "Alternations cannot be empty."), + EmptyCaptureName => + write!(f, "Capture names cannot be empty."), + EmptyFlagNegation => + write!(f, "Flag negation requires setting at least one flag."), + EmptyGroup => + write!(f, "Empty regex groups (e.g., '()') are not allowed."), + InvalidBase10(ref s) => + write!(f, "Not a valid base 10 number: '{}'", s), + InvalidBase16(ref s) => + write!(f, "Not a valid base 16 number: '{}'", s), + InvalidCaptureName(ref s) => + write!(f, "Invalid capture name: '{}'. Capture names must \ + consist of [_a-zA-Z0-9] and are not allowed to \ + start with with a number.", s), + InvalidClassRange { start, end } => + write!(f, "Invalid character class range '{}-{}'. \ + Character class ranges must start with the smaller \ + character, but {} > {}", start, end, start, end), + InvalidClassEscape(ref e) => + write!(f, "Invalid escape sequence in character \ + class: '{}'.", e), + InvalidRepeatRange { min, max } => + write!(f, "Invalid counted repetition range: {{{}, {}}}. \ + Counted repetition ranges must start with the \ + minimum, but {} > {}", min, max, min, max), + InvalidScalarValue(c) => + write!(f, "Number does not correspond to a Unicode scalar \ + value: '{}'.", c), + MissingBase10 => + write!(f, "Missing maximum in counted reptition operator."), + RepeaterExpectsExpr => + write!(f, "Missing expression for reptition operator."), + RepeaterUnexpectedExpr(ref e) => + write!(f, "Invalid application of reptition operator to: \ + '{}'.", e), + UnclosedCaptureName(ref s) => + write!(f, "Capture name group for '{}' is not closed. 
\ + (Missing a '>'.)", s), + UnclosedHex => + write!(f, "Unclosed hexadecimal literal (missing a '}}')."), + UnclosedParen => + write!(f, "Unclosed parenthesis."), + UnclosedRepeat => + write!(f, "Unclosed counted repetition (missing a '}}')."), + UnclosedUnicodeName => + write!(f, "Unclosed Unicode literal (missing a '}}')."), + UnexpectedClassEof => + write!(f, "Character class was not closed before the end of \ + the regex (missing a ']')."), + UnexpectedEscapeEof => + write!(f, "Started an escape sequence that didn't finish \ + before the end of the regex."), + UnexpectedFlagEof => + write!(f, "Inline flag settings was not closed before the end \ + of the regex (missing a ')' or ':')."), + UnexpectedTwoDigitHexEof => + write!(f, "Unexpected end of two digit hexadecimal literal."), + UnopenedParen => + write!(f, "Unopened parenthesis."), + UnrecognizedEscape(c) => + write!(f, "Unrecognized escape sequence: '\\{}'.", c), + UnrecognizedFlag(c) => + write!(f, "Unrecognized flag: '{}'. \ + (Allowed flags: i, s, m, U, x.)", c), + UnrecognizedUnicodeClass(ref s) => + write!(f, "Unrecognized Unicode class name: '{}'.", s), + } + } +} + +/// Returns the Unicode *simple* case folding of `c`. +/// +/// N.B. This is hidden because it really isn't the responsibility of this +/// crate to do simple case folding. One hopes that either another crate or +/// the standard library will be able to do this for us. In any case, we still +/// expose it because it is used inside the various Regex engines. +#[doc(hidden)] +pub fn simple_case_fold(c: char) -> char { + match case_folding::C_plus_S_table.binary_search_by(|&(x, _)| x.cmp(&c)) { + Ok(i) => case_folding::C_plus_S_table[i].1, + Err(_) => c, + } +} + +/// Escapes all regular expression meta characters in `text`. +/// +/// The string returned may be safely used as a literal in a regular +/// expression. 
+pub fn quote(text: &str) -> String { + let mut quoted = String::with_capacity(text.len()); + for c in text.chars() { + if parser::is_punct(c) { + quoted.push('\\'); + } + quoted.push(c); + } + quoted +} + +fn quote_char(c: char) -> String { + let mut s = String::new(); + if parser::is_punct(c) { + s.push('\\'); + } + s.push(c); + s +} + +fn inc_char(c: char) -> char { + match c { + char::MAX => char::MAX, + '\u{D7FF}' => '\u{E000}', + c => char::from_u32(c as u32 + 1).unwrap(), + } +} + +fn dec_char(c: char) -> char { + match c { + '\x00' => '\x00', + '\u{E000}' => '\u{D7FF}', + c => char::from_u32(c as u32 - 1).unwrap(), + } +} + +/// Returns true if and only if `c` is a word character. +#[doc(hidden)] +pub fn is_word_char(c: char) -> bool { + match c { + '_' | '0' ... '9' | 'a' ... 'z' | 'A' ... 'Z' => true, + _ => ::unicode::regex::PERLW.binary_search_by(|&(start, end)| { + if c >= start && c <= end { + Ordering::Equal + } else if start > c { + Ordering::Greater + } else { + Ordering::Less + } + }).is_ok(), + } +} + +#[cfg(test)] +mod properties; + +#[cfg(test)] +mod tests { + use {CharClass, ClassRange}; + + fn class(ranges: &[(char, char)]) -> CharClass { + let ranges = ranges.iter().cloned() + .map(|(c1, c2)| ClassRange::new(c1, c2)).collect(); + CharClass::new(ranges) + } + + fn classi(ranges: &[(char, char)]) -> CharClass { + let mut cls = class(ranges); + cls.casei = true; + cls + } + + #[test] + fn class_canon_no_change() { + let cls = class(&[('a', 'c'), ('x', 'z')]); + assert_eq!(cls.clone().canonicalize(), cls); + } + + #[test] + fn class_canon_unordered() { + let cls = class(&[('x', 'z'), ('a', 'c')]); + assert_eq!(cls.canonicalize(), class(&[ + ('a', 'c'), ('x', 'z'), + ])); + } + + #[test] + fn class_canon_overlap() { + let cls = class(&[('x', 'z'), ('w', 'y')]); + assert_eq!(cls.canonicalize(), class(&[ + ('w', 'z'), + ])); + } + + #[test] + fn class_canon_overlap_many() { + let cls = class(&[ + ('c', 'f'), ('a', 'g'), ('d', 'j'), ('a', 'c'), + 
('m', 'p'), ('l', 's'), + ]); + assert_eq!(cls.clone().canonicalize(), class(&[ + ('a', 'j'), ('l', 's'), + ])); + } + + #[test] + fn class_canon_overlap_many_case_fold() { + let cls = class(&[ + ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'), + ('M', 'P'), ('L', 'S'), ('c', 'f'), + ]); + assert_eq!(cls.case_fold(), classi(&[ + ('a', 'j'), ('l', 's'), + ])); + } + + #[test] + fn class_canon_overlap_boundary() { + let cls = class(&[('x', 'z'), ('u', 'w')]); + assert_eq!(cls.canonicalize(), class(&[ + ('u', 'z'), + ])); + } + + #[test] + fn class_canon_extreme_edge_case() { + let cls = class(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]); + assert_eq!(cls.canonicalize(), class(&[ + ('\x00', '\u{10FFFF}'), + ])); + } + + #[test] + fn class_canon_singles() { + let cls = class(&[('a', 'a'), ('b', 'b')]); + assert_eq!(cls.canonicalize(), class(&[('a', 'b')])); + } + + #[test] + fn class_negate_single() { + let cls = class(&[('a', 'a')]); + assert_eq!(cls.negate(), class(&[ + ('\x00', '\x60'), ('\x62', '\u{10FFFF}'), + ])); + } + + #[test] + fn class_negate_singles() { + let cls = class(&[('a', 'a'), ('b', 'b')]); + assert_eq!(cls.negate(), class(&[ + ('\x00', '\x60'), ('\x63', '\u{10FFFF}'), + ])); + } + + #[test] + fn class_negate_multiples() { + let cls = class(&[('a', 'c'), ('x', 'z')]); + assert_eq!(cls.negate(), class(&[ + ('\x00', '\x60'), ('\x64', '\x77'), ('\x7b', '\u{10FFFF}'), + ])); + } + + #[test] + fn class_negate_min_scalar() { + let cls = class(&[('\x00', 'a')]); + assert_eq!(cls.negate(), class(&[ + ('\x62', '\u{10FFFF}'), + ])); + } + + #[test] + fn class_negate_max_scalar() { + let cls = class(&[('a', '\u{10FFFF}')]); + assert_eq!(cls.negate(), class(&[ + ('\x00', '\x60'), + ])); + } + + #[test] + fn class_negate_everything() { + let cls = class(&[('\x00', '\u{10FFFF}')]); + assert_eq!(cls.negate(), class(&[])); + } + + #[test] + fn class_negate_everything_sans_one() { + let cls = class(&[ + ('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}') + ]); 
+ assert_eq!(cls.negate(), class(&[ + ('\u{10FFFE}', '\u{10FFFE}'), + ])); + } + + #[test] + fn class_negate_surrogates_min() { + let cls = class(&[('\x00', '\u{D7FF}')]); + assert_eq!(cls.negate(), class(&[ + ('\u{E000}', '\u{10FFFF}'), + ])); + } + + #[test] + fn class_negate_surrogates_min_edge() { + let cls = class(&[('\x00', '\u{D7FE}')]); + assert_eq!(cls.negate(), class(&[ + ('\u{D7FF}', '\u{10FFFF}'), + ])); + } + + #[test] + fn class_negate_surrogates_max() { + let cls = class(&[('\u{E000}', '\u{10FFFF}')]); + assert_eq!(cls.negate(), class(&[ + ('\x00', '\u{D7FF}'), + ])); + } + + #[test] + fn class_negate_surrogates_max_edge() { + let cls = class(&[('\u{E001}', '\u{10FFFF}')]); + assert_eq!(cls.negate(), class(&[ + ('\x00', '\u{E000}'), + ])); + } + + #[test] + fn class_fold_retain_only_needed() { + let cls = class(&[('A', 'Z'), ('a', 'z')]); + assert_eq!(cls.case_fold(), classi(&[ + ('a', 'z'), + ])); + } + + #[test] + fn class_fold_az() { + let cls = class(&[('A', 'Z')]); + assert_eq!(cls.case_fold(), classi(&[ + ('a', 'z'), + ])); + } + + #[test] + fn class_fold_a_underscore() { + let cls = class(&[('A', 'A'), ('_', '_')]); + assert_eq!(cls.clone().canonicalize(), class(&[ + ('A', 'A'), ('_', '_'), + ])); + assert_eq!(cls.case_fold(), classi(&[ + ('_', '_'), ('a', 'a'), + ])); + } + + #[test] + fn class_fold_a_equals() { + let cls = class(&[('A', 'A'), ('=', '=')]); + assert_eq!(cls.clone().canonicalize(), class(&[ + ('=', '='), ('A', 'A'), + ])); + assert_eq!(cls.case_fold(), classi(&[ + ('=', '='), ('a', 'a'), + ])); + } + + #[test] + fn class_fold_no_folding_needed() { + let cls = class(&[('\x00', '\x10')]); + assert_eq!(cls.case_fold(), classi(&[ + ('\x00', '\x10'), + ])); + } +} diff --git a/regex_syntax/src/parser.rs b/regex_syntax/src/parser.rs new file mode 100644 index 0000000000..25020cc752 --- /dev/null +++ b/regex_syntax/src/parser.rs @@ -0,0 +1,2298 @@ +// Copyright 2014-2015 The Rust Project Developers. 
See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::cmp::{max, min}; + +use unicode::regex::UNICODE_CLASSES; + +use { + Expr, Repeater, CharClass, ClassRange, CaptureIndex, CaptureName, + Error, ErrorKind, Result, +}; + +/// Parser state. +/// +/// Keeps the entire input in memory and maintains a cursor (char offset). +/// +/// It also keeps an expression stack, which is responsible for managing +/// grouped expressions and flag state. +#[derive(Debug)] +pub struct Parser { + chars: Vec<char>, + chari: usize, + stack: Vec<Build>, + caps: usize, + names: Vec<String>, // to check for duplicates + flags: Flags, +} + +/// An ephemeral type for representing the expression stack. +/// +/// Everything on the stack is either a regular expression or a marker +/// indicating the opening of a group (possibly non-capturing). The opening +/// of a group copies the current flag state, which is reset on the parser +/// state once the group closes. +#[derive(Debug)] +enum Build { + Expr(Expr), + LeftParen { + i: CaptureIndex, + name: CaptureName, + chari: usize, + old_flags: Flags, + }, +} + +/// Flag state. +#[derive(Clone, Copy, Debug)] +struct Flags { + casei: bool, + multi: bool, + dotnl: bool, + swap_greed: bool, + ignore_space: bool, +} + +// Primary expression parsing routines. +impl Parser { + pub fn parse(s: &str) -> Result<Expr> { + Parser { + chars: s.chars().collect(), + chari: 0, + stack: vec![], + caps: 0, + names: vec![], + flags: Flags { + casei: false, + multi: false, + dotnl: false, + swap_greed: false, + ignore_space: false, + }, + }.parse_expr() + } + + // Top-level expression parser. + // + // Starts at the beginning of the input and consumes until either the end + // of input or an error.
+ fn parse_expr(mut self) -> Result<Expr> { + while !self.eof() { + let build_expr = match self.cur() { + '\\' => try!(self.parse_escape()), + '|' => { let e = try!(self.alternate()); self.bump(); e } + '?' => try!(self.parse_simple_repeat(Repeater::ZeroOrOne)), + '*' => try!(self.parse_simple_repeat(Repeater::ZeroOrMore)), + '+' => try!(self.parse_simple_repeat(Repeater::OneOrMore)), + '{' => try!(self.parse_counted_repeat()), + '[' => match self.maybe_parse_ascii() { + None => try!(self.parse_class()), + Some(cls) => Build::Expr(Expr::Class(cls)), + }, + '^' => { + if self.flags.multi { + self.parse_one(Expr::StartLine) + } else { + self.parse_one(Expr::StartText) + } + } + '$' => { + if self.flags.multi { + self.parse_one(Expr::EndLine) + } else { + self.parse_one(Expr::EndText) + } + } + '.' => { + if self.flags.dotnl { + self.parse_one(Expr::AnyChar) + } else { + self.parse_one(Expr::AnyCharNoNL) + } + } + '(' => try!(self.parse_group()), + ')' => { + let (old_flags, e) = try!(self.close_paren()); + self.bump(); + self.flags = old_flags; + e + } + _ => Build::Expr(Expr::Literal { + chars: vec![self.bump()], + casei: self.flags.casei, + }), + }; + if !build_expr.is_empty() { + let build_expr = self.maybe_class_case_fold(build_expr); + self.stack.push(build_expr); + } + } + self.finish_concat() + } + + // Parses an escape sequence, e.g., \Ax + // + // Start: `\` + // End: `x` + fn parse_escape(&mut self) -> Result<Build> { + self.bump(); + if self.eof() { + return Err(self.err(ErrorKind::UnexpectedEscapeEof)); + } + let c = self.cur(); + if is_punct(c) { + return Ok(Build::Expr(Expr::Literal { + chars: vec![self.bump()], + casei: self.flags.casei, + })); + } + + fn lit(c: char) -> Build { + Build::Expr(Expr::Literal { chars: vec![c], casei: false }) + } + match c { + 'a' => { self.bump(); Ok(lit('\x07')) } + 'f' => { self.bump(); Ok(lit('\x0C')) } + 't' => { self.bump(); Ok(lit('\t')) } + 'n' => { self.bump(); Ok(lit('\n')) } + 'r' => { self.bump(); Ok(lit('\r')) } + 'v' =>
{ self.bump(); Ok(lit('\x0B')) } + 'A' => { self.bump(); Ok(Build::Expr(Expr::StartText)) } + 'z' => { self.bump(); Ok(Build::Expr(Expr::EndText)) } + 'b' => { self.bump(); Ok(Build::Expr(Expr::WordBoundary)) } + 'B' => { self.bump(); Ok(Build::Expr(Expr::NotWordBoundary)) } + '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7' => self.parse_octal(), + 'x' => { self.bump(); self.parse_hex() } + 'p'|'P' => { + self.bump(); + self.parse_unicode_class(c == 'P') + .map(|cls| Build::Expr(Expr::Class(cls))) + } + 'd'|'s'|'w'|'D'|'S'|'W' => { + self.bump(); + Ok(Build::Expr(Expr::Class(self.parse_perl_class(c)))) + } + c => Err(self.err(ErrorKind::UnrecognizedEscape(c))), + } + } + + // Parses a group, e.g., `(abc)`. + // + // Start: `(` + // End: `a` + // + // A more interesting example, `(?P<foo>abc)`. + // + // Start: `(` + // End: `a` + fn parse_group(&mut self) -> Result<Build> { + let chari = self.chari; + let mut name: CaptureName = None; + self.bump(); + if self.bump_if("?P<") { + let n = try!(self.parse_group_name()); + if self.names.iter().any(|n2| n2 == &n) { + return Err(self.err(ErrorKind::DuplicateCaptureName(n))); + } + self.names.push(n.clone()); + name = Some(n); + } else if self.bump_if("?") { + // This can never be capturing. It's either setting flags for + // the current group, or it's opening a non-capturing group or + // it's opening a group with a specific set of flags (which is + // also non-capturing). + // Anything else is an error. + return self.parse_group_flags(chari); + } + self.caps = checkadd(self.caps, 1); + Ok(Build::LeftParen { + i: Some(self.caps), + name: name, + chari: chari, + old_flags: self.flags, // no flags changed if we're here + }) + } + + // Parses flags (inline or grouped), e.g., `(?s-i:abc)`. + // + // Start: `s` + // End: `a` + // + // Another example, `(?s-i)a`.
+ // + // Start: `s` + // End: `a` + fn parse_group_flags(&mut self, opening_chari: usize) -> Result<Build> { + let old_flags = self.flags; + let mut sign = true; + let mut saw_flag = false; + loop { + if self.eof() { + // e.g., (?i + return Err(self.err(ErrorKind::UnexpectedFlagEof)); + } + match self.cur() { + 'i' => { self.flags.casei = sign; saw_flag = true } + 'm' => { self.flags.multi = sign; saw_flag = true } + 's' => { self.flags.dotnl = sign; saw_flag = true } + 'U' => { self.flags.swap_greed = sign; saw_flag = true } + 'x' => { self.flags.ignore_space = sign; saw_flag = true } + '-' => { + if !sign { + // e.g., (?-i-s) + return Err(self.err(ErrorKind::DoubleFlagNegation)); + } + sign = false; + saw_flag = false; + } + ')' => { + if !saw_flag { + // e.g., (?) + return Err(self.err(ErrorKind::EmptyFlagNegation)); + } + // At this point, we're just changing the flags inside + // the current group, which means the old flags have + // been saved elsewhere. Our modifications in place are + // okey dokey! + // + // This particular flag expression only has a stateful + // impact on a regex's AST, so nothing gets explicitly + // added. + self.bump(); + return Ok(Build::Expr(Expr::Empty)); + } + ':' => { + if !sign && !saw_flag { + // e.g., (?i-:a) + // Note that if there's no negation, it's OK not + // to see flag, because you end up with a regular + // non-capturing group: `(?:a)`. + return Err(self.err(ErrorKind::EmptyFlagNegation)); + } + self.bump(); + return Ok(Build::LeftParen { + i: None, + name: None, + chari: opening_chari, + old_flags: old_flags, + }); + } + // e.g., (?z:a) + c => return Err(self.err(ErrorKind::UnrecognizedFlag(c))), + } + self.bump(); + } + } + + // Parses a group name, e.g., `foo` in `(?P<foo>abc)`.
+ // + // Start: `f` + // End: `a` + fn parse_group_name(&mut self) -> Result<String> { + let mut name = String::new(); + while !self.eof() && !self.peek_is('>') { + name.push(self.bump()); + } + if self.eof() { + // e.g., (?P<a + return Err(self.err(ErrorKind::UnclosedCaptureName(name))); + } + let all_valid = name.chars().all(is_valid_capture_char); + match name.chars().next() { + // e.g., (?P<>a) + None => Err(self.err(ErrorKind::EmptyCaptureName)), + Some(c) if (c >= '0' && c <= '9') || !all_valid => { + // e.g., (?P<a#>x) + // e.g., (?P<1a>x) + Err(self.err(ErrorKind::InvalidCaptureName(name))) + } + _ => { + self.bump(); // for `>` + Ok(name) + } + } + } + + // Parses a counted repetition operator, e.g., `a{2,4}?z`. + // + // Start: `{` + // End: `z` + fn parse_counted_repeat(&mut self) -> Result<Build> { + let e = try!(self.pop(ErrorKind::RepeaterExpectsExpr)); // e.g., ({5} + if !e.can_repeat() { + // e.g., a*{5} + return Err(self.err(ErrorKind::RepeaterUnexpectedExpr(e))); + } + self.bump(); + let min = try!(self.parse_decimal(|c| c != ',' && c != '}')); + let mut max_opt = Some(min); + if self.bump_if(',') { + if self.peek_is('}') { + max_opt = None; + } else { + let max = try!(self.parse_decimal(|c| c != '}')); + if min > max { + // e.g., a{2,1} + return Err(self.err(ErrorKind::InvalidRepeatRange { + min: min, + max: max, + })); + } + max_opt = Some(max); + } + } + if !self.bump_if('}') { + Err(self.err(ErrorKind::UnclosedRepeat)) + } else { + Ok(Build::Expr(Expr::Repeat { + e: Box::new(e), + r: Repeater::Range { min: min, max: max_opt }, + greedy: !self.bump_if('?') ^ self.flags.swap_greed, + })) + } + } + + // Parses a simple repetition operator, e.g., `a+?z`. + // + // Start: `+` + // End: `z` + // + // N.B. "simple" in this context means "not min/max repetition", + // e.g., `a{1,2}`.
+ fn parse_simple_repeat(&mut self, rep: Repeater) -> Result<Build> { + let e = try!(self.pop(ErrorKind::RepeaterExpectsExpr)); // e.g., (* + if !e.can_repeat() { + // e.g., a** + return Err(self.err(ErrorKind::RepeaterUnexpectedExpr(e))); + } + self.bump(); + Ok(Build::Expr(Expr::Repeat { + e: Box::new(e), + r: rep, + greedy: !self.bump_if('?') ^ self.flags.swap_greed, + })) + } + + // Parses a decimal number until the given character, e.g., `a{123,456}`. + // + // Start: `1` + // End: `,` (where `until == ','`) + fn parse_decimal<B: Bumpable>(&mut self, until: B) -> Result<u32> { + match self.bump_get(until) { + // e.g., a{} + None => Err(self.err(ErrorKind::MissingBase10)), + Some(n) => { + // e.g., a{xyz + // e.g., a{9999999999} + let n = n.trim(); + u32::from_str_radix(n, 10) + .map_err(|_| self.err(ErrorKind::InvalidBase10(n.into()))) + } + } + } + + // Parses an octal number, up to 3 digits, e.g., `a\123b` + // + // Start: `1` + // End: `b` + fn parse_octal(&mut self) -> Result<Build> { + use std::char; + let mut i = 0; // counter for limiting octal to 3 digits. + let n = self.bump_get(|c| { i += 1; i <= 3 && c >= '0' && c <= '7' }) + .expect("octal string"); // guaranteed at least 1 digit + // I think both of the following unwraps are impossible to fail. + // We limit it to a three digit octal number, which maxes out at + // `0777` or `511` in decimal. Since all digits are in `0...7`, we'll + // always have a valid `u32` number. Moreover, since all numbers in + // the range `0...511` are valid Unicode scalar values, it will always + // be a valid `char`. + // + // Hence, we `unwrap` with reckless abandon. + let n = u32::from_str_radix(&n, 8).ok().expect("valid octal number"); + Ok(Build::Expr(Expr::Literal { + chars: vec![char::from_u32(n).expect("Unicode scalar value")], + casei: self.flags.casei, + })) + } + + // Parses a hex number, e.g., `a\x5ab`. + // + // Start: `5` + // End: `b` + // + // And also, `a\x{2603}b`.
+ // + // Start: `{` + // End: `b` + fn parse_hex(&mut self) -> Result<Build> { + if self.bump_if('{') { + self.parse_hex_many_digits() + } else { + self.parse_hex_two_digits() + } + } + + // Parses a many-digit hex number, e.g., `a\x{2603}b`. + // + // Start: `2` + // End: `b` + fn parse_hex_many_digits(&mut self) -> Result<Build> { + use std::char; + + let s = self.bump_get(|c| c != '}').unwrap_or("".into()); + let n = try!(u32::from_str_radix(&s, 16) + .map_err(|_| self.err(ErrorKind::InvalidBase16(s)))); + let c = try!(char::from_u32(n) + .ok_or(self.err(ErrorKind::InvalidScalarValue(n)))); + if !self.bump_if('}') { + // e.g., a\x{d + return Err(self.err(ErrorKind::UnclosedHex)); + } + Ok(Build::Expr(Expr::Literal { + chars: vec![c], + casei: self.flags.casei, + })) + } + + // Parses a two-digit hex number, e.g., `a\x5ab`. + // + // Start: `5` + // End: `b` + fn parse_hex_two_digits(&mut self) -> Result<Build> { + use std::char; + + let mut i = 0; + let s = self.bump_get(|_| { i += 1; i <= 2 }).unwrap_or("".into()); + if s.len() < 2 { + // e.g., a\x + // e.g., a\xf + return Err(self.err(ErrorKind::UnexpectedTwoDigitHexEof)); + } + let n = try!(u32::from_str_radix(&s, 16) + .map_err(|_| self.err(ErrorKind::InvalidBase16(s)))); + Ok(Build::Expr(Expr::Literal { + // Because 0...255 are all valid Unicode scalar values. + chars: vec![char::from_u32(n).expect("Unicode scalar value")], + casei: self.flags.casei, + })) + } + + // Parses a character class, e.g., `[^a-zA-Z0-9]+`.
+ // + // Start: `[` + // End: `+` + fn parse_class(&mut self) -> Result<Build> { + self.bump(); + let negated = self.bump_if('^'); + let mut class = CharClass::empty(); + while self.bump_if('-') { + class.ranges.push(ClassRange::one('-')); + } + loop { + if self.eof() { + // e.g., [a + return Err(self.err(ErrorKind::UnexpectedClassEof)); + } + match self.cur() { + // If no ranges have been added, then `]` is the first + // character (sans, perhaps, the `^` symbol), so it should + // be interpreted as a `]` instead of a closing class bracket. + ']' if class.len() > 0 => { self.bump(); break } + '[' => match self.maybe_parse_ascii() { + Some(class2) => class.ranges.extend(class2), + None => { + self.bump(); + try!(self.parse_class_range(&mut class, '[')) + } + }, + '\\' => match try!(self.parse_escape()) { + Build::Expr(Expr::Class(class2)) => { + class.ranges.extend(class2); + } + Build::Expr(Expr::Literal { chars, .. }) => { + try!(self.parse_class_range(&mut class, chars[0])); + } + Build::Expr(e) => { + let err = ErrorKind::InvalidClassEscape(e); + return Err(self.err(err)); + } + // Because `parse_escape` can never return `LeftParen`. + _ => unreachable!(), + }, + start => { + self.bump(); + try!(self.parse_class_range(&mut class, start)); + } + } + } + if negated { + class = class.negate(); + } + Ok(Build::Expr(Expr::Class(class.canonicalize()))) + } + + // Parses a single range in a character class. + // + // Since this is a helper for `parse_class`, its signature sticks out. + // Namely, it requires the start character of the range and the char + // class to mutate. + // + // e.g., `[a-z]` + // + // Start: `-` (with start == `a`) + // End: `]` + fn parse_class_range(&mut self, class: &mut CharClass, start: char) + -> Result<()> { + if !self.bump_if('-') { + // Not a range, so just push a singleton range.
+ class.ranges.push(ClassRange::one(start)); + return Ok(()); + } + if self.eof() { + // e.g., [a- + return Err(self.err(ErrorKind::UnexpectedClassEof)); + } + if self.peek_is(']') { + // This is the end of the class, so we permit use of `-` as a + // regular char (just like we do in the beginning). + class.ranges.push(ClassRange::one(start)); + class.ranges.push(ClassRange::one('-')); + return Ok(()); + } + + // We have a real range. Just need to check to parse literal and + // make sure it's a valid range. + let end = match self.cur() { + '\\' => match try!(self.parse_escape()) { + Build::Expr(Expr::Literal { chars, .. }) => chars[0], + Build::Expr(e) => { + return Err(self.err(ErrorKind::InvalidClassEscape(e))); + } + // Because `parse_escape` can never return `LeftParen`. + _ => unreachable!(), + }, + _ => self.bump(), + }; + if end < start { + // e.g., [z-a] + return Err(self.err(ErrorKind::InvalidClassRange { + start: start, + end: end, + })); + } + class.ranges.push(ClassRange::new(start, end)); + Ok(()) + } + + // Parses an ASCII class, e.g., `[:alnum:]+`. + // + // Start: `[` + // End: `+` + // + // Also supports negation, e.g., `[:^alnum:]`. + // + // This parsing routine is distinct from the others in that it doesn't + // actually report any errors. Namely, if it fails, then the parser should + // fall back to parsing a regular class. + // + // This method will only make progress in the parser if it succeeds. + // Otherwise, the input remains where it started. 
+ fn maybe_parse_ascii(&mut self) -> Option<CharClass> { + fn parse(p: &mut Parser) -> Option<CharClass> { + p.bump(); // the `[` + if !p.bump_if(':') { return None; } + let negate = p.bump_if('^'); + let name = match p.bump_get(|c| c != ':') { + None => return None, + Some(name) => name, + }; + if !p.bump_if(":]") { return None; } + ascii_class(&name).map(|c| if !negate { c } else { c.negate() }) + } + let start = self.chari; + match parse(self) { + None => { self.chari = start; None } + result => result, + } + } + + // Parses a Unicode class name, e.g., `a\pLb`. + // + // Start: `L` + // End: `b` + // + // And also, `a\p{Greek}b`. + // + // Start: `{` + // End: `b` + // + // `neg` is true when the class name is used with `\P`. + fn parse_unicode_class(&mut self, neg: bool) -> Result<CharClass> { + let name = + if self.bump_if('{') { + let n = self.bump_get(|c| c != '}').unwrap_or("".into()); + if n.is_empty() || !self.bump_if('}') { + // e.g., \p{Greek + return Err(self.err(ErrorKind::UnclosedUnicodeName)); + } + n + } else { + if self.eof() { + // e.g., \p + return Err(self.err(ErrorKind::UnexpectedEscapeEof)); + } + self.bump().to_string() + }; + match unicode_class(&name) { + None => Err(self.err(ErrorKind::UnrecognizedUnicodeClass(name))), + Some(cls) => if neg { Ok(cls.negate()) } else { Ok(cls) }, + } + } + + // Parses a perl character class with Unicode support. + // + // `name` must be one of d, s, w, D, S, W. If not, this function panics. + // + // No parser state is changed. + fn parse_perl_class(&mut self, name: char) -> CharClass { + use unicode::regex::{PERLD, PERLS, PERLW}; + match name { + 'd' => raw_class_to_expr(PERLD), + 'D' => raw_class_to_expr(PERLD).negate(), + 's' => raw_class_to_expr(PERLS), + 'S' => raw_class_to_expr(PERLS).negate(), + 'w' => raw_class_to_expr(PERLW), + 'W' => raw_class_to_expr(PERLW).negate(), + _ => unreachable!(), + } + } + + // Always bump to the next input and return the given expression as a + // `Build`.
+ // + // This is mostly for convenience when the surrounding context implies + // that the next character corresponds to the given expression. + fn parse_one(&mut self, e: Expr) -> Build { + self.bump(); + Build::Expr(e) + } +} + +// Auxiliary helper methods. +impl Parser { + fn chars(&self) -> Chars { + Chars::new(&self.chars[self.chari..], self.flags.ignore_space) + } + + fn bump(&mut self) -> char { + let c = self.cur(); + self.chari = checkadd(self.chari, self.chars().next_count()); + c + } + + fn cur(&self) -> char { self.chars().next().unwrap() } + + fn eof(&self) -> bool { self.chars().next().is_none() } + + fn bump_get<B: Bumpable>(&mut self, s: B) -> Option<String> { + let n = s.match_end(self); + if n == 0 { + None + } else { + let end = checkadd(self.chari, n); + let s = self.chars[self.chari..end] + .iter().cloned().collect::<String>(); + self.chari = end; + Some(s) + } + } + + fn bump_if<B: Bumpable>(&mut self, s: B) -> bool { + let n = s.match_end(self); + if n == 0 { + false + } else { + self.chari = checkadd(self.chari, n); + true + } + } + + fn peek_is<B: Bumpable>(&self, s: B) -> bool { + s.match_end(self) > 0 + } + + fn err(&self, kind: ErrorKind) -> Error { + self.errat(self.chari, kind) + } + + fn errat(&self, pos: usize, kind: ErrorKind) -> Error { + Error { pos: pos, surround: self.windowat(pos), kind: kind } + } + + fn windowat(&self, pos: usize) -> String { + let s = max(5, pos) - 5; + let e = min(self.chars.len(), checkadd(pos, 5)); + self.chars[s..e].iter().cloned().collect() + } + + fn pop(&mut self, expected: ErrorKind) -> Result<Expr> { + match self.stack.pop() { + None | Some(Build::LeftParen{..}) => Err(self.err(expected)), + Some(Build::Expr(e)) => Ok(e), + } + } + + // If the current context calls for case insensitivity and if the expr + // given is a character class, do case folding on it and return the new + // class. + // + // Otherwise, return the expression unchanged.
+ fn maybe_class_case_fold(&mut self, bexpr: Build) -> Build { + match bexpr { + Build::Expr(Expr::Class(cls)) => { + Build::Expr(Expr::Class( + if self.flags.casei && !cls.casei { + cls.case_fold() + } else { + cls + } + )) + } + bexpr => bexpr, + } + } +} + +struct Chars<'a> { + chars: &'a [char], + cur: usize, + ignore_space: bool, + in_comment: bool, +} + +impl<'a> Iterator for Chars<'a> { + type Item = char; + fn next(&mut self) -> Option<char> { + self.skip(); + if self.cur < self.chars.len() { + let c = self.chars[self.cur]; + self.cur = checkadd(self.cur, 1); + Some(c) + } else { + None + } + } +} + +impl<'a> Chars<'a> { + fn new(chars: &[char], ignore_space: bool) -> Chars { + Chars { + chars: chars, + cur: 0, + ignore_space: ignore_space, + in_comment: false, + } + } + + fn skip(&mut self) { + if !self.ignore_space { return; } + while self.cur < self.chars.len() { + if !self.in_comment && self.c() == '#' { + self.in_comment = true; + } else if self.in_comment && self.c() == '\n' { + self.in_comment = false; + } + if self.in_comment || self.c().is_whitespace() { + self.cur = checkadd(self.cur, 1); + } else { + break; + } + } + } + + fn c(&self) -> char { + self.chars[self.cur] + } + + fn next_count(&mut self) -> usize { + self.next(); + self.cur + } +} + +// Auxiliary methods for manipulating the expression stack. +impl Parser { + // Called whenever an alternate (`|`) is found. + // + // This pops the expression stack until: + // + // 1. The stack is empty. Pushes an alternation with one arm. + // 2. An opening parenthesis is found. Leave the parenthesis + // on the stack and push an alternation with one arm. + // 3. An alternate (`|`) is found. Pop the existing alternation, + // add an arm and push the modified alternation. + // + // Each "arm" in the above corresponds to the concatenation of all + // popped expressions. + // + // In the first two cases, the stack is left in an invalid state + // because an alternation with one arm is not allowed.
This + // particular state will be detected by `finish_concat` and an + // error will be reported. + // + // In none of the cases is an empty arm allowed. If an empty arm + // is found, an error is reported. + fn alternate(&mut self) -> Result<Build> { + let mut concat = vec![]; + let alts = |es| Ok(Build::Expr(Expr::Alternate(es))); + loop { + match self.stack.pop() { + None => { + if concat.is_empty() { + // e.g., |a + return Err(self.err(ErrorKind::EmptyAlternate)); + } + return alts(vec![rev_concat(concat)]); + } + Some(e @ Build::LeftParen{..}) => { + if concat.is_empty() { + // e.g., (|a) + return Err(self.err(ErrorKind::EmptyAlternate)); + } + self.stack.push(e); + return alts(vec![rev_concat(concat)]); + } + Some(Build::Expr(Expr::Alternate(mut es))) => { + if concat.is_empty() { + // e.g., a|| + return Err(self.err(ErrorKind::EmptyAlternate)); + } + es.push(rev_concat(concat)); + return alts(es); + } + Some(Build::Expr(e)) => { concat.push(e); } + } + } + } + + // Called whenever a closing parenthesis (`)`) is found. + // + // This pops the expression stack until: + // + // 1. The stack is empty. An error is reported because this + // indicates an unopened parenthesis. + // 2. An opening parenthesis is found. Pop the opening parenthesis + // and push a `Group` expression. + // 3. An alternate (`|`) is found. Pop the existing alternation + // and add an arm to it in place. Pop one more item from the stack. + // If the stack was empty, then report an unopened parenthesis + // error, otherwise assume it is an opening parenthesis and + // push a `Group` expression with the popped alternation. + // (We can assume this is an opening parenthesis because an + // alternation either corresponds to the entire Regex or it + // corresponds to an entire group. This is guaranteed by the + // `alternate` method.) + // + // Each "arm" in the above corresponds to the concatenation of all + // popped expressions. + // + // Neither empty arms nor empty groups are allowed.
+ fn close_paren(&mut self) -> Result<(Flags, Build)> { + let mut concat = vec![]; + loop { + match self.stack.pop() { + // e.g., ) + None => return Err(self.err(ErrorKind::UnopenedParen)), + Some(Build::LeftParen { i, name, old_flags, .. }) => { + if concat.is_empty() { + // e.g., () + return Err(self.err(ErrorKind::EmptyGroup)); + } + return Ok((old_flags, Build::Expr(Expr::Group { + e: Box::new(rev_concat(concat)), + i: i, + name: name, + }))); + } + Some(Build::Expr(Expr::Alternate(mut es))) => { + if concat.is_empty() { + // e.g., (a|) + return Err(self.err(ErrorKind::EmptyAlternate)); + } + es.push(rev_concat(concat)); + match self.stack.pop() { + // e.g., a|b) + None => return Err(self.err(ErrorKind::UnopenedParen)), + Some(Build::Expr(_)) => unreachable!(), + Some(Build::LeftParen { i, name, old_flags, .. }) => { + return Ok((old_flags, Build::Expr(Expr::Group { + e: Box::new(Expr::Alternate(es)), + i: i, + name: name, + }))); + } + } + } + Some(Build::Expr(e)) => { concat.push(e); } + } + } + } + + // Called only when the parser reaches the end of input. + // + // This pops the expression stack until: + // + // 1. The stack is empty. Return concatenation of popped + // expressions. This concatenation may be empty! + // 2. An alternation is found. Pop the alternation and push + // a new arm. Return the alternation as the entire Regex. + // + // If an opening parenthesis is popped, then an error is + // returned since it indicates an unclosed parenthesis. 
+ fn finish_concat(&mut self) -> Result { + let mut concat = vec![]; + loop { + match self.stack.pop() { + None => { return Ok(rev_concat(concat)); } + Some(Build::LeftParen{ chari, ..}) => { + // e.g., a(b + return Err(self.errat(chari, ErrorKind::UnclosedParen)); + } + Some(Build::Expr(Expr::Alternate(mut es))) => { + if concat.is_empty() { + // e.g., a| + return Err(self.err(ErrorKind::EmptyAlternate)); + } + es.push(rev_concat(concat)); + return Ok(Expr::Alternate(es)); + } + Some(Build::Expr(e)) => { concat.push(e); } + } + } + } +} + +impl Build { + fn is_empty(&self) -> bool { + match *self { + Build::Expr(Expr::Empty) => true, + _ => false, + } + } +} + +// Make it ergonomic to conditionally bump the parser. +// i.e., `bump_if('a')` or `bump_if("abc")`. +trait Bumpable { + fn match_end(self, p: &Parser) -> usize; +} + +impl Bumpable for char { + fn match_end(self, p: &Parser) -> usize { + let mut chars = p.chars(); + if chars.next().map(|c| c == self).unwrap_or(false) { + chars.cur + } else { + 0 + } + } +} + +impl<'a> Bumpable for &'a str { + fn match_end(self, p: &Parser) -> usize { + let mut search = self.chars(); + let mut rest = p.chars(); + let mut count = 0; + loop { + match (rest.next(), search.next()) { + (Some(c1), Some(c2)) if c1 == c2 => count = rest.cur, + (_, None) => return count, + _ => return 0, + } + } + } +} + +impl bool> Bumpable for F { + fn match_end(mut self, p: &Parser) -> usize { + let mut chars = p.chars(); + let mut count = 0; + while let Some(c) = chars.next() { + if !self(c) { + break + } + count = chars.cur; + } + count + } +} + +// Turn a sequence of expressions into a concatenation. +// This only uses `Concat` if there are 2 or more expressions. +fn rev_concat(mut exprs: Vec) -> Expr { + if exprs.len() == 0 { + Expr::Empty + } else if exprs.len() == 1 { + exprs.pop().unwrap() + } else { + exprs.reverse(); + Expr::Concat(exprs) + } +} + +// Returns true iff the given character is allowed in a capture name. 
+// Note that the first char of a capture name must not be numeric. +fn is_valid_capture_char(c: char) -> bool { + c == '_' || (c >= '0' && c <= '9') + || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') +} + +/// Returns true iff the given character has significance in a regex. +#[doc(hidden)] +pub fn is_punct(c: char) -> bool { + match c { + '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | + '[' | ']' | '{' | '}' | '^' | '$' => true, + _ => false, + } +} + +fn checkadd(x: usize, y: usize) -> usize { + x.checked_add(y).expect("regex length overflow") +} + +fn unicode_class(name: &str) -> Option { + UNICODE_CLASSES.binary_search_by(|&(s, _)| s.cmp(name)).ok().map(|i| { + raw_class_to_expr(UNICODE_CLASSES[i].1) + }) +} + +fn ascii_class(name: &str) -> Option { + ASCII_CLASSES.binary_search_by(|&(s, _)| s.cmp(name)).ok().map(|i| { + raw_class_to_expr(ASCII_CLASSES[i].1) + }) +} + +fn raw_class_to_expr(raw: &[(char, char)]) -> CharClass { + let range = |&(s, e)| ClassRange { start: s, end: e }; + CharClass::new(raw.iter().map(range).collect()) +} + +type Class = &'static [(char, char)]; +type NamedClasses = &'static [(&'static str, Class)]; + +const ASCII_CLASSES: NamedClasses = &[ + // Classes must be in alphabetical order so that bsearch works. 
+ // [:alnum:] alphanumeric (== [0-9A-Za-z]) + // [:alpha:] alphabetic (== [A-Za-z]) + // [:ascii:] ASCII (== [\x00-\x7F]) + // [:blank:] blank (== [\t ]) + // [:cntrl:] control (== [\x00-\x1F\x7F]) + // [:digit:] digits (== [0-9]) + // [:graph:] graphical (== [!-~]) + // [:lower:] lower case (== [a-z]) + // [:print:] printable (== [ -~] == [ [:graph:]]) + // [:punct:] punctuation (== [!-/:-@[-`{-~]) + // [:space:] whitespace (== [\t\n\v\f\r ]) + // [:upper:] upper case (== [A-Z]) + // [:word:] word characters (== [0-9A-Za-z_]) + // [:xdigit:] hex digit (== [0-9A-Fa-f]) + // Taken from: http://golang.org/pkg/regexp/syntax/ + ("alnum", &ALNUM), + ("alpha", &ALPHA), + ("ascii", &ASCII), + ("blank", &BLANK), + ("cntrl", &CNTRL), + ("digit", &DIGIT), + ("graph", &GRAPH), + ("lower", &LOWER), + ("print", &PRINT), + ("punct", &PUNCT), + ("space", &SPACE), + ("upper", &UPPER), + ("word", &WORD), + ("xdigit", &XDIGIT), +]; + +const ALNUM: Class = &[('0', '9'), ('A', 'Z'), ('a', 'z')]; +const ALPHA: Class = &[('A', 'Z'), ('a', 'z')]; +const ASCII: Class = &[('\x00', '\x7F')]; +const BLANK: Class = &[(' ', ' '), ('\t', '\t')]; +const CNTRL: Class = &[('\x00', '\x1F'), ('\x7F', '\x7F')]; +const DIGIT: Class = &[('0', '9')]; +const GRAPH: Class = &[('!', '~')]; +const LOWER: Class = &[('a', 'z')]; +const PRINT: Class = &[(' ', '~')]; +const PUNCT: Class = &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')]; +const SPACE: Class = &[('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'), + ('\x0C', '\x0C'), ('\r', '\r'), (' ', ' ')]; +const UPPER: Class = &[('A', 'Z')]; +const WORD: Class = &[('0', '9'), ('A', 'Z'), ('a', 'z'), ('_', '_')]; +const XDIGIT: Class = &[('0', '9'), ('A', 'F'), ('a', 'f')]; + +#[cfg(test)] +mod tests { + use { CharClass, ClassRange, Expr, Repeater, ErrorKind }; + use unicode::regex::{PERLD, PERLS, PERLW}; + use super::Parser; + use super::{LOWER, UPPER}; + + static YI: &'static [(char, char)] = &[ + ('\u{a000}', '\u{a48c}'), ('\u{a490}', '\u{a4c6}'), + ]; + + fn 
p(s: &str) -> Expr { Parser::parse(s).unwrap() } + fn lit(c: char) -> Expr { Expr::Literal { chars: vec![c], casei: false } } + fn liti(c: char) -> Expr { Expr::Literal { chars: vec![c], casei: true } } + fn b(v: T) -> Box { Box::new(v) } + fn c(es: &[Expr]) -> Expr { Expr::Concat(es.to_vec()) } + + fn class(ranges: &[(char, char)]) -> CharClass { + let ranges = ranges.iter().cloned() + .map(|(c1, c2)| ClassRange::new(c1, c2)).collect(); + CharClass::new(ranges) + } + + fn classes(classes: &[&[(char, char)]]) -> CharClass { + let mut cls = CharClass::empty(); + for &ranges in classes { + cls.ranges.extend(class(ranges)); + } + cls.canonicalize() + } + + #[test] + fn empty() { + assert_eq!(p(""), Expr::Empty); + } + + #[test] + fn literal() { + assert_eq!(p("a"), lit('a')); + } + + #[test] + fn literal_string() { + assert_eq!(p("ab"), Expr::Concat(vec![lit('a'), lit('b')])); + } + + #[test] + fn start_literal() { + assert_eq!(p("^a"), Expr::Concat(vec![ + Expr::StartText, + Expr::Literal { chars: vec!['a'], casei: false }, + ])); + } + + #[test] + fn repeat_zero_or_one_greedy() { + assert_eq!(p("a?"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::ZeroOrOne, + greedy: true, + }); + } + + #[test] + fn repeat_zero_or_one_greedy_concat() { + assert_eq!(p("ab?"), Expr::Concat(vec![ + lit('a'), + Expr::Repeat { + e: b(lit('b')), + r: Repeater::ZeroOrOne, + greedy: true, + }, + ])); + } + + #[test] + fn repeat_zero_or_one_nongreedy() { + assert_eq!(p("a??"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::ZeroOrOne, + greedy: false, + }); + } + + #[test] + fn repeat_one_or_more_greedy() { + assert_eq!(p("a+"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::OneOrMore, + greedy: true, + }); + } + + #[test] + fn repeat_one_or_more_nongreedy() { + assert_eq!(p("a+?"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::OneOrMore, + greedy: false, + }); + } + + #[test] + fn repeat_zero_or_more_greedy() { + assert_eq!(p("a*"), Expr::Repeat { + e: b(lit('a')), + r: 
Repeater::ZeroOrMore, + greedy: true, + }); + } + + #[test] + fn repeat_zero_or_more_nongreedy() { + assert_eq!(p("a*?"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::ZeroOrMore, + greedy: false, + }); + } + + #[test] + fn repeat_counted_exact() { + assert_eq!(p("a{5}"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::Range { min: 5, max: Some(5) }, + greedy: true, + }); + } + + #[test] + fn repeat_counted_min() { + assert_eq!(p("a{5,}"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::Range { min: 5, max: None }, + greedy: true, + }); + } + + #[test] + fn repeat_counted_min_max() { + assert_eq!(p("a{5,10}"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::Range { min: 5, max: Some(10) }, + greedy: true, + }); + } + + #[test] + fn repeat_counted_exact_nongreedy() { + assert_eq!(p("a{5}?"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::Range { min: 5, max: Some(5) }, + greedy: false, + }); + } + + #[test] + fn repeat_counted_min_nongreedy() { + assert_eq!(p("a{5,}?"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::Range { min: 5, max: None }, + greedy: false, + }); + } + + #[test] + fn repeat_counted_min_max_nongreedy() { + assert_eq!(p("a{5,10}?"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::Range { min: 5, max: Some(10) }, + greedy: false, + }); + } + + #[test] + fn repeat_counted_whitespace() { + assert_eq!(p("a{ 5 }"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::Range { min: 5, max: Some(5) }, + greedy: true, + }); + assert_eq!(p("a{ 5 , 10 }"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::Range { min: 5, max: Some(10) }, + greedy: true, + }); + } + + #[test] + fn group_literal() { + assert_eq!(p("(a)"), Expr::Group { + e: b(lit('a')), + i: Some(1), + name: None, + }); + } + + #[test] + fn group_literal_concat() { + assert_eq!(p("(ab)"), Expr::Group { + e: b(c(&[lit('a'), lit('b')])), + i: Some(1), + name: None, + }); + } + + #[test] + fn alt_two() { + assert_eq!(p("a|b"), Expr::Alternate(vec![lit('a'), lit('b')])); + } + + #[test] + fn 
alt_many() { + assert_eq!(p("a|b|c"), Expr::Alternate(vec![ + lit('a'), lit('b'), lit('c'), + ])); + } + + #[test] + fn alt_many_concat() { + assert_eq!(p("ab|bc|cd"), Expr::Alternate(vec![ + c(&[lit('a'), lit('b')]), + c(&[lit('b'), lit('c')]), + c(&[lit('c'), lit('d')]), + ])); + } + + #[test] + fn alt_group_two() { + assert_eq!(p("(a|b)"), Expr::Group { + e: b(Expr::Alternate(vec![lit('a'), lit('b')])), + i: Some(1), + name: None, + }); + } + + #[test] + fn alt_group_many() { + assert_eq!(p("(a|b|c)"), Expr::Group { + e: b(Expr::Alternate(vec![lit('a'), lit('b'), lit('c')])), + i: Some(1), + name: None, + }); + } + + #[test] + fn alt_group_many_concat() { + assert_eq!(p("(ab|bc|cd)"), Expr::Group { + e: b(Expr::Alternate(vec![ + c(&[lit('a'), lit('b')]), + c(&[lit('b'), lit('c')]), + c(&[lit('c'), lit('d')]), + ])), + i: Some(1), + name: None, + }); + } + + #[test] + fn alt_group_nested() { + assert_eq!(p("(ab|(bc|(cd)))"), Expr::Group { + e: b(Expr::Alternate(vec![ + c(&[lit('a'), lit('b')]), + Expr::Group { + e: b(Expr::Alternate(vec![ + c(&[lit('b'), lit('c')]), + Expr::Group { + e: b(c(&[lit('c'), lit('d')])), + i: Some(3), + name: None, + } + ])), + i: Some(2), + name: None, + }, + ])), + i: Some(1), + name: None, + }); + } + + #[test] + fn group_name() { + assert_eq!(p("(?Pa)"), Expr::Group { + e: b(lit('a')), + i: Some(1), + name: Some("foo".into()), + }); + } + + #[test] + fn group_no_capture() { + assert_eq!(p("(?:a)"), Expr::Group { + e: b(lit('a')), + i: None, + name: None, + }); + } + + #[test] + fn group_flags() { + assert_eq!(p("(?i:a)"), Expr::Group { + e: b(liti('a')), + i: None, + name: None, + }); + } + + #[test] + fn group_flags_returned() { + assert_eq!(p("(?i:a)a"), c(&[ + Expr::Group { + e: b(liti('a')), + i: None, + name: None, + }, + lit('a'), + ])); + } + + #[test] + fn group_flags_retained() { + assert_eq!(p("(?i)(?-i:a)a"), c(&[ + Expr::Group { + e: b(lit('a')), + i: None, + name: None, + }, + liti('a'), + ])); + } + + #[test] + fn 
flags_inline() { + assert_eq!(p("(?i)a"), liti('a')); + } + + #[test] + fn flags_inline_multiple() { + assert_eq!(p("(?is)a."), c(&[liti('a'), Expr::AnyChar])); + } + + #[test] + fn flags_inline_multiline() { + assert_eq!(p("(?m)^(?-m)$"), c(&[Expr::StartLine, Expr::EndText])); + } + + #[test] + fn flags_inline_swap_greed() { + assert_eq!(p("(?U)a*a*?(?i-U)a*a*?"), c(&[ + Expr::Repeat { + e: b(lit('a')), + r: Repeater::ZeroOrMore, + greedy: false, + }, + Expr::Repeat { + e: b(lit('a')), + r: Repeater::ZeroOrMore, + greedy: true, + }, + Expr::Repeat { + e: b(liti('a')), + r: Repeater::ZeroOrMore, + greedy: true, + }, + Expr::Repeat { + e: b(liti('a')), + r: Repeater::ZeroOrMore, + greedy: false, + }, + ])); + } + + #[test] + fn flags_inline_multiple_negate_one() { + assert_eq!(p("(?is)a.(?i-s)a."), c(&[ + liti('a'), Expr::AnyChar, liti('a'), Expr::AnyCharNoNL, + ])); + } + + #[test] + fn flags_inline_negate() { + assert_eq!(p("(?i)a(?-i)a"), c(&[liti('a'), lit('a')])); + } + + #[test] + fn flags_group_inline() { + assert_eq!(p("(a(?i)a)a"), c(&[ + Expr::Group { + e: b(c(&[lit('a'), liti('a')])), + i: Some(1), + name: None, + }, + lit('a'), + ])); + } + + #[test] + fn flags_group_inline_retain() { + assert_eq!(p("(?i)((?-i)a)a"), c(&[ + Expr::Group { + e: b(lit('a')), + i: Some(1), + name: None, + }, + liti('a'), + ])); + } + + #[test] + fn escape_simple() { + assert_eq!(p(r"\a\f\t\n\r\v"), c(&[ + lit('\x07'), lit('\x0C'), lit('\t'), + lit('\n'), lit('\r'), lit('\x0B'), + ])); + } + + #[test] + fn escape_boundaries() { + assert_eq!(p(r"\A\z\b\B"), c(&[ + Expr::StartText, Expr::EndText, + Expr::WordBoundary, Expr::NotWordBoundary, + ])); + } + + #[test] + fn escape_punctuation() { + assert_eq!(p(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$"), c(&[ + lit('\\'), lit('.'), lit('+'), lit('*'), lit('?'), + lit('('), lit(')'), lit('|'), lit('['), lit(']'), + lit('{'), lit('}'), lit('^'), lit('$'), + ])); + } + + #[test] + fn escape_octal() { + assert_eq!(p(r"\123"), lit('S')); + 
assert_eq!(p(r"\1234"), c(&[lit('S'), lit('4')])); + } + + #[test] + fn escape_hex2() { + assert_eq!(p(r"\x53"), lit('S')); + assert_eq!(p(r"\x534"), c(&[lit('S'), lit('4')])); + } + + #[test] + fn escape_hex() { + assert_eq!(p(r"\x{53}"), lit('S')); + assert_eq!(p(r"\x{53}4"), c(&[lit('S'), lit('4')])); + assert_eq!(p(r"\x{2603}"), lit('\u{2603}')); + } + + #[test] + fn escape_unicode_name() { + assert_eq!(p(r"\p{Yi}"), Expr::Class(class(YI))); + } + + #[test] + fn escape_unicode_letter() { + assert_eq!(p(r"\pZ"), Expr::Class(class(&[ + ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), + ]))); + } + + #[test] + fn escape_unicode_name_case_fold() { + assert_eq!(p(r"(?i)\p{Yi}"), Expr::Class(class(YI).case_fold())); + } + + #[test] + fn escape_unicode_letter_case_fold() { + assert_eq!(p(r"(?i)\pZ"), Expr::Class(class(&[ + ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), + ]).case_fold())); + } + + #[test] + fn escape_unicode_name_negate() { + assert_eq!(p(r"\P{Yi}"), Expr::Class(class(YI).negate())); + } + + #[test] + fn escape_unicode_letter_negate() { + assert_eq!(p(r"\PZ"), Expr::Class(class(&[ + ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), + ]).negate())); + } + + #[test] + fn escape_unicode_name_negate_case_fold() { + assert_eq!(p(r"(?i)\P{Yi}"), + Expr::Class(class(YI).negate().case_fold())); + } + + #[test] + fn escape_unicode_letter_negate_case_fold() { + assert_eq!(p(r"(?i)\PZ"), Expr::Class(class(&[ + ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), ('\u{2000}', 
'\u{200a}'), + ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), + ]).negate().case_fold())); + } + + #[test] + fn escape_perl_d() { + assert_eq!(p(r"\d"), Expr::Class(class(PERLD))); + } + + #[test] + fn escape_perl_s() { + assert_eq!(p(r"\s"), Expr::Class(class(PERLS))); + } + + #[test] + fn escape_perl_w() { + assert_eq!(p(r"\w"), Expr::Class(class(PERLW))); + } + + #[test] + fn escape_perl_d_negate() { + assert_eq!(p(r"\D"), Expr::Class(class(PERLD).negate())); + } + + #[test] + fn escape_perl_s_negate() { + assert_eq!(p(r"\S"), Expr::Class(class(PERLS).negate())); + } + + #[test] + fn escape_perl_w_negate() { + assert_eq!(p(r"\W"), Expr::Class(class(PERLW).negate())); + } + + #[test] + fn escape_perl_d_case_fold() { + assert_eq!(p(r"(?i)\d"), Expr::Class(class(PERLD).case_fold())); + } + + #[test] + fn escape_perl_s_case_fold() { + assert_eq!(p(r"(?i)\s"), Expr::Class(class(PERLS).case_fold())); + } + + #[test] + fn escape_perl_w_case_fold() { + assert_eq!(p(r"(?i)\w"), Expr::Class(class(PERLW).case_fold())); + } + + #[test] + fn escape_perl_d_case_fold_negate() { + assert_eq!(p(r"(?i)\D"), + Expr::Class(class(PERLD).negate().case_fold())); + } + + #[test] + fn escape_perl_s_case_fold_negate() { + assert_eq!(p(r"(?i)\S"), + Expr::Class(class(PERLS).negate().case_fold())); + } + + #[test] + fn escape_perl_w_case_fold_negate() { + assert_eq!(p(r"(?i)\W"), + Expr::Class(class(PERLW).negate().case_fold())); + } + + #[test] + fn class_singleton() { + assert_eq!(p(r"[a]"), Expr::Class(class(&[('a', 'a')]))); + assert_eq!(p(r"[\x00]"), Expr::Class(class(&[('\x00', '\x00')]))); + assert_eq!(p(r"[\n]"), Expr::Class(class(&[('\n', '\n')]))); + assert_eq!(p("[\n]"), Expr::Class(class(&[('\n', '\n')]))); + } + + #[test] + fn class_singleton_negate() { + assert_eq!(p(r"[^a]"), Expr::Class(class(&[ + ('\x00', '\x60'), ('\x62', '\u{10FFFF}'), + ]))); + assert_eq!(p(r"[^\x00]"), Expr::Class(class(&[ + ('\x01', 
'\u{10FFFF}'), + ]))); + assert_eq!(p(r"[^\n]"), Expr::Class(class(&[ + ('\x00', '\x09'), ('\x0b', '\u{10FFFF}'), + ]))); + assert_eq!(p("[^\n]"), Expr::Class(class(&[ + ('\x00', '\x09'), ('\x0b', '\u{10FFFF}'), + ]))); + } + + #[test] + fn class_singleton_class() { + assert_eq!(p(r"[\d]"), Expr::Class(class(PERLD))); + assert_eq!(p(r"[\p{Yi}]"), Expr::Class(class(YI))); + } + + #[test] + fn class_singleton_class_negate() { + assert_eq!(p(r"[^\d]"), Expr::Class(class(PERLD).negate())); + assert_eq!(p(r"[^\w]"), Expr::Class(class(PERLW).negate())); + assert_eq!(p(r"[^\s]"), Expr::Class(class(PERLS).negate())); + } + + #[test] + fn class_singleton_class_negate_negate() { + assert_eq!(p(r"[^\D]"), Expr::Class(class(PERLD))); + assert_eq!(p(r"[^\W]"), Expr::Class(class(PERLW))); + assert_eq!(p(r"[^\S]"), Expr::Class(class(PERLS))); + } + + #[test] + fn class_singleton_class_casei() { + assert_eq!(p(r"(?i)[\d]"), Expr::Class(class(PERLD).case_fold())); + assert_eq!(p(r"(?i)[\p{Yi}]"), Expr::Class(class(YI).case_fold())); + } + + #[test] + fn class_singleton_class_negate_casei() { + assert_eq!(p(r"(?i)[^\d]"), + Expr::Class(class(PERLD).negate().case_fold())); + assert_eq!(p(r"(?i)[^\w]"), + Expr::Class(class(PERLW).negate().case_fold())); + assert_eq!(p(r"(?i)[^\s]"), + Expr::Class(class(PERLS).negate().case_fold())); + } + + #[test] + fn class_singleton_class_negate_negate_casei() { + assert_eq!(p(r"(?i)[^\D]"), Expr::Class(class(PERLD).case_fold())); + assert_eq!(p(r"(?i)[^\W]"), Expr::Class(class(PERLW).case_fold())); + assert_eq!(p(r"(?i)[^\S]"), Expr::Class(class(PERLS).case_fold())); + } + + #[test] + fn class_multiple_class() { + assert_eq!(p(r"[\d\p{Yi}]"), Expr::Class(classes(&[ + PERLD, YI, + ]))); + } + + #[test] + fn class_multiple_class_negate() { + assert_eq!(p(r"[^\d\p{Yi}]"), Expr::Class(classes(&[ + PERLD, YI, + ]).negate())); + } + + #[test] + fn class_multiple_class_negate_negate() { + let nperld = class(PERLD).negate(); + let nyi = 
class(YI).negate(); + let cls = CharClass::empty().merge(nperld).merge(nyi); + assert_eq!(p(r"[^\D\P{Yi}]"), Expr::Class(cls.negate())); + } + + #[test] + fn class_multiple_class_casei() { + assert_eq!(p(r"(?i)[\d\p{Yi}]"), Expr::Class(classes(&[ + PERLD, YI, + ]).case_fold())); + } + + #[test] + fn class_multiple_class_negate_casei() { + assert_eq!(p(r"(?i)[^\d\p{Yi}]"), Expr::Class(classes(&[ + PERLD, YI, + ]).negate().case_fold())); + } + + #[test] + fn class_multiple_class_negate_negate_casei() { + let nperld = class(PERLD).negate(); + let nyi = class(YI).negate(); + let class = CharClass::empty().merge(nperld).merge(nyi); + assert_eq!(p(r"(?i)[^\D\P{Yi}]"), + Expr::Class(class.negate().case_fold())); + } + + #[test] + fn class_class_hypen() { + assert_eq!(p(r"[\p{Yi}-]"), Expr::Class(classes(&[ + &[('-', '-')], YI, + ]))); + assert_eq!(p(r"[\p{Yi}-a]"), Expr::Class(classes(&[ + &[('-', '-')], &[('a', 'a')], YI, + ]))); + } + + #[test] + fn class_brackets() { + assert_eq!(p("[]]"), Expr::Class(class(&[(']', ']')]))); + assert_eq!(p("[][]"), Expr::Class(class(&[('[', '['), (']', ']')]))); + assert_eq!(p("[[]]"), Expr::Concat(vec![ + Expr::Class(class(&[('[', '[')])), + lit(']'), + ])); + } + + #[test] + fn class_brackets_hypen() { + assert_eq!(p("[]-]"), Expr::Class(class(&[('-', '-'), (']', ']')]))); + assert_eq!(p("[-]]"), Expr::Concat(vec![ + Expr::Class(class(&[('-', '-')])), + lit(']'), + ])); + } + + #[test] + fn class_overlapping() { + assert_eq!(p("[a-fd-h]"), Expr::Class(class(&[('a', 'h')]))); + assert_eq!(p("[a-fg-m]"), Expr::Class(class(&[('a', 'm')]))); + } + + #[test] + fn ascii_class() { + assert_eq!(p("[:upper:]"), Expr::Class(class(UPPER))); + assert_eq!(p("[[:upper:]]"), Expr::Class(class(UPPER))); + } + + #[test] + fn ascii_class_not() { + assert_eq!(p("[:abc:]"), + Expr::Class(class(&[(':', ':'), ('a', 'c')]))); + } + + #[test] + fn ascii_class_multiple() { + assert_eq!(p("[[:lower:][:upper:]]"), + Expr::Class(classes(&[UPPER, LOWER]))); + } 
+ + #[test] + fn ascii_class_negate() { + assert_eq!(p("[[:^upper:]]"), Expr::Class(class(UPPER).negate())); + assert_eq!(p("[^[:^upper:]]"), Expr::Class(class(UPPER))); + } + + #[test] + fn ascii_class_negate_multiple() { + let (nlower, nupper) = (class(LOWER).negate(), class(UPPER).negate()); + let cls = CharClass::empty().merge(nlower).merge(nupper); + assert_eq!(p("[[:^lower:][:^upper:]]"), Expr::Class(cls.clone())); + assert_eq!(p("[^[:^lower:][:^upper:]]"), Expr::Class(cls.negate())); + } + + #[test] + fn ascii_class_case_fold() { + assert_eq!(p("(?i)[:upper:]"), Expr::Class(class(UPPER).case_fold())); + assert_eq!(p("(?i)[[:upper:]]"), + Expr::Class(class(UPPER).case_fold())); + } + + #[test] + fn ascii_class_negate_case_fold() { + assert_eq!(p("(?i)[[:^upper:]]"), + Expr::Class(class(UPPER).negate().case_fold())); + assert_eq!(p("(?i)[^[:^upper:]]"), + Expr::Class(class(UPPER).case_fold())); + } + + #[test] + fn ignore_space_literal() { + assert_eq!(p("(?x) a b c"), Expr::Concat(vec![ + lit('a'), lit('b'), lit('c'), + ])); + } + + #[test] + fn ignore_space_literal_off() { + assert_eq!(p("(?x) a b c(?-x) a"), Expr::Concat(vec![ + lit('a'), lit('b'), lit('c'), lit(' '), lit('a'), + ])); + } + + #[test] + fn ignore_space_class() { + assert_eq!(p("(?x)[a + - z +]"), Expr::Class(class(&[('a', 'z')]))); + assert_eq!(p("(?x)[ ^ a + - z +]"), Expr::Class(class(&[('a', 'z')]).negate())); + } + + #[test] + fn ignore_space_escape() { + assert_eq!(p(r"(?x)\ d"), Expr::Class(class(PERLD))); + assert_eq!(p(r"(?x)\ + D"), Expr::Class(class(PERLD).negate())); + } + + #[test] + fn ignore_space_comments() { + assert_eq!(p(r"(?x)(?P + a # comment 1 +)(?P + z # comment 2 +)"), Expr::Concat(vec![ + Expr::Group { + e: Box::new(lit('a')), + i: Some(1), + name: Some("foo".into()), + }, + Expr::Group { + e: Box::new(lit('z')), + i: Some(2), + name: Some("bar".into()), + }, + ])); + } + + #[test] + fn ignore_space_comments_re_enable() { + assert_eq!(p(r"(?x)a # hi +(?-x:#) # 
sweet"), Expr::Concat(vec![ + lit('a'), + Expr::Group { + e: Box::new(lit('#')), + i: None, + name: None, + }, + ])); + } + + // Test every single possible error case. + + macro_rules! test_err { + ($re:expr, $pos:expr, $kind:expr) => {{ + let err = Parser::parse($re).unwrap_err(); + assert_eq!($pos, err.pos); + assert_eq!($kind, err.kind); + assert!($re.contains(&err.surround)); + }} + } + + #[test] + fn error_repeat_no_expr_simple() { + test_err!("(*", 1, ErrorKind::RepeaterExpectsExpr); + } + + #[test] + fn error_repeat_no_expr_counted() { + test_err!("({5}", 1, ErrorKind::RepeaterExpectsExpr); + } + + #[test] + fn error_repeat_beginning_counted() { + test_err!("{5}", 0, ErrorKind::RepeaterExpectsExpr); + } + + #[test] + fn error_repeat_illegal_exprs_simple() { + test_err!("a**", 2, ErrorKind::RepeaterUnexpectedExpr(Expr::Repeat { + e: b(lit('a')), + r: Repeater::ZeroOrMore, + greedy: true, + })); + test_err!("a|*", 2, + ErrorKind::RepeaterUnexpectedExpr(Expr::Alternate(vec![lit('a')])) + ); + } + + #[test] + fn error_repeat_illegal_exprs_counted() { + test_err!("a*{5}", 2, ErrorKind::RepeaterUnexpectedExpr(Expr::Repeat { + e: b(lit('a')), + r: Repeater::ZeroOrMore, + greedy: true, + })); + test_err!("a|{5}", 2, + ErrorKind::RepeaterUnexpectedExpr(Expr::Alternate(vec![lit('a')])) + ); + } + + #[test] + fn error_repeat_empty_number() { + test_err!("a{}", 2, ErrorKind::MissingBase10); + } + + #[test] + fn error_repeat_eof() { + test_err!("a{5", 3, ErrorKind::UnclosedRepeat); + } + + #[test] + fn error_repeat_empty_number_eof() { + test_err!("a{xyz", 5, ErrorKind::InvalidBase10("xyz".into())); + test_err!("a{12,xyz", 8, ErrorKind::InvalidBase10("xyz".into())); + } + + #[test] + fn error_repeat_invalid_number() { + test_err!("a{9999999999}", 12, + ErrorKind::InvalidBase10("9999999999".into())); + test_err!("a{1,9999999999}", 14, + ErrorKind::InvalidBase10("9999999999".into())); + } + + #[test] + fn error_repeat_invalid_number_extra() { + test_err!("a{12x}", 5, 
ErrorKind::InvalidBase10("12x".into())); + test_err!("a{1,12x}", 7, ErrorKind::InvalidBase10("12x".into())); + } + + #[test] + fn error_repeat_invalid_range() { + test_err!("a{2,1}", 5, + ErrorKind::InvalidRepeatRange { min: 2, max: 1 }); + } + + #[test] + fn error_alternate_empty() { + test_err!("|a", 0, ErrorKind::EmptyAlternate); + } + + #[test] + fn error_alternate_empty_with_group() { + test_err!("(|a)", 1, ErrorKind::EmptyAlternate); + } + + #[test] + fn error_alternate_empty_with_alternate() { + test_err!("a||", 2, ErrorKind::EmptyAlternate); + } + + #[test] + fn error_close_paren_unopened_empty() { + test_err!(")", 0, ErrorKind::UnopenedParen); + } + + #[test] + fn error_close_paren_unopened() { + test_err!("ab)", 2, ErrorKind::UnopenedParen); + } + + #[test] + fn error_close_paren_unopened_with_alt() { + test_err!("a|b)", 3, ErrorKind::UnopenedParen); + } + + #[test] + fn error_close_paren_empty_alt() { + test_err!("(a|)", 3, ErrorKind::EmptyAlternate); + } + + #[test] + fn error_close_paren_empty_group() { + test_err!("()", 1, ErrorKind::EmptyGroup); + } + + #[test] + fn error_close_paren_empty_group_with_name() { + test_err!("(?P)", 8, ErrorKind::EmptyGroup); + } + + #[test] + fn error_finish_concat_unclosed() { + test_err!("ab(xy", 2, ErrorKind::UnclosedParen); + } + + #[test] + fn error_finish_concat_empty_alt() { + test_err!("a|", 2, ErrorKind::EmptyAlternate); + } + + #[test] + fn error_group_name_invalid() { + test_err!("(?Px)", 6, ErrorKind::InvalidCaptureName("a#".into())); + } + + #[test] + fn error_group_name_invalid_leading() { + test_err!("(?P<1a>a)", 6, ErrorKind::InvalidCaptureName("1a".into())); + } + + #[test] + fn error_group_name_unexpected_eof() { + test_err!("(?Pa)", 4, ErrorKind::EmptyCaptureName); + } + + #[test] + fn error_group_opts_unrecognized_flag() { + test_err!("(?z:a)", 2, ErrorKind::UnrecognizedFlag('z')); + } + + #[test] + fn error_group_opts_unexpected_eof() { + test_err!("(?i", 3, ErrorKind::UnexpectedFlagEof); + } + + 
#[test] + fn error_group_opts_double_negation() { + test_err!("(?-i-s:a)", 4, ErrorKind::DoubleFlagNegation); + } + + #[test] + fn error_group_opts_empty_negation() { + test_err!("(?i-:a)", 4, ErrorKind::EmptyFlagNegation); + } + + #[test] + fn error_group_opts_empty() { + test_err!("(?)", 2, ErrorKind::EmptyFlagNegation); + } + + #[test] + fn error_escape_unexpected_eof() { + test_err!(r"\", 1, ErrorKind::UnexpectedEscapeEof); + } + + #[test] + fn error_escape_unrecognized() { + test_err!(r"\m", 1, ErrorKind::UnrecognizedEscape('m')); + } + + #[test] + fn error_escape_hex2_eof0() { + test_err!(r"\x", 2, ErrorKind::UnexpectedTwoDigitHexEof); + } + + #[test] + fn error_escape_hex2_eof1() { + test_err!(r"\xA", 3, ErrorKind::UnexpectedTwoDigitHexEof); + } + + #[test] + fn error_escape_hex2_invalid() { + test_err!(r"\xAG", 4, ErrorKind::InvalidBase16("AG".into())); + } + + #[test] + fn error_escape_hex_eof0() { + test_err!(r"\x{", 3, ErrorKind::InvalidBase16("".into())); + } + + #[test] + fn error_escape_hex_eof1() { + test_err!(r"\x{A", 4, ErrorKind::UnclosedHex); + } + + #[test] + fn error_escape_hex_invalid() { + test_err!(r"\x{AG}", 5, ErrorKind::InvalidBase16("AG".into())); + } + + #[test] + fn error_escape_hex_invalid_scalar_value_surrogate() { + test_err!(r"\x{D800}", 7, ErrorKind::InvalidScalarValue(0xD800)); + } + + #[test] + fn error_escape_hex_invalid_scalar_value_high() { + test_err!(r"\x{110000}", 9, ErrorKind::InvalidScalarValue(0x110000)); + } + + #[test] + fn error_escape_hex_invalid_u32() { + test_err!(r"\x{9999999999}", 13, + ErrorKind::InvalidBase16("9999999999".into())); + } + + #[test] + fn error_unicode_unclosed() { + test_err!(r"\p{", 3, ErrorKind::UnclosedUnicodeName); + test_err!(r"\p{Greek", 8, ErrorKind::UnclosedUnicodeName); + } + + #[test] + fn error_unicode_no_letter() { + test_err!(r"\p", 2, ErrorKind::UnexpectedEscapeEof); + } + + #[test] + fn error_unicode_unknown_letter() { + test_err!(r"\pA", 3, 
ErrorKind::UnrecognizedUnicodeClass("A".into())); + } + + #[test] + fn error_unicode_unknown_name() { + test_err!(r"\p{Yii}", 7, + ErrorKind::UnrecognizedUnicodeClass("Yii".into())); + } + + #[test] + fn error_class_eof_empty() { + test_err!("[", 1, ErrorKind::UnexpectedClassEof); + test_err!("[^", 2, ErrorKind::UnexpectedClassEof); + } + + #[test] + fn error_class_eof_non_empty() { + test_err!("[a", 2, ErrorKind::UnexpectedClassEof); + test_err!("[^a", 3, ErrorKind::UnexpectedClassEof); + } + + #[test] + fn error_class_eof_range() { + test_err!("[a-", 3, ErrorKind::UnexpectedClassEof); + test_err!("[^a-", 4, ErrorKind::UnexpectedClassEof); + test_err!("[---", 4, ErrorKind::UnexpectedClassEof); + } + + #[test] + fn error_class_invalid_escape() { + test_err!(r"[\pA]", 4, + ErrorKind::UnrecognizedUnicodeClass("A".into())); + } + + #[test] + fn error_class_valid_escape_not_allowed() { + test_err!(r"[\A]", 3, ErrorKind::InvalidClassEscape(Expr::StartText)); + } + + #[test] + fn error_class_range_valid_escape_not_allowed() { + test_err!(r"[a-\d]", 5, + ErrorKind::InvalidClassEscape(Expr::Class(class(PERLD)))); + test_err!(r"[a-\A]", 5, + ErrorKind::InvalidClassEscape(Expr::StartText)); + test_err!(r"[\A-a]", 3, + ErrorKind::InvalidClassEscape(Expr::StartText)); + } + + #[test] + fn error_class_invalid_range() { + test_err!("[z-a]", 4, ErrorKind::InvalidClassRange { + start: 'z', + end: 'a', + }); + } + + #[test] + fn error_class_empty_range() { + test_err!("[]", 2, ErrorKind::UnexpectedClassEof); + test_err!("[^]", 3, ErrorKind::UnexpectedClassEof); + } + + #[test] + fn error_duplicate_capture_name() { + test_err!("(?P.)(?P.)", 14, + ErrorKind::DuplicateCaptureName("a".into())); + } +} diff --git a/regex_syntax/src/properties.rs b/regex_syntax/src/properties.rs new file mode 100644 index 0000000000..38cbb02e73 --- /dev/null +++ b/regex_syntax/src/properties.rs @@ -0,0 +1,407 @@ +// Copyright 2014-2015 The Rust Project Developers. 
See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use quickcheck::{Arbitrary, Gen, Testable, QuickCheck, StdGen}; +use rand::Rng; + +use {Expr, CharClass, ClassRange, Repeater, dec_char}; + +fn qc(t: T) { + QuickCheck::new() + .tests(10_000) + .max_tests(20_000) + .quickcheck(t); +} + +fn class(ranges: &[(char, char)]) -> CharClass { + let ranges = ranges.iter().cloned() + .map(|(c1, c2)| ClassRange::new(c1, c2)).collect(); + CharClass::new(ranges) +} + +// Test invariants for canonicalizing character classes. + +#[test] +fn negate() { + fn prop(ranges: Vec<(char, char)>) -> bool { + class(&ranges).canonicalize() == class(&ranges).negate().negate() + } + qc(prop as fn(Vec<(char, char)>) -> bool); +} + +#[test] +fn classes_are_sorted_and_nonoverlapping() { + fn prop(ranges: Vec<(char, char)>) -> bool { + class(&ranges) + .canonicalize() + .windows(2) + .all(|w| w[0].end < dec_char(w[1].start)) + } + qc(prop as fn(Vec<(char, char)>) -> bool); +} + +#[test] +fn valid_class_ranges() { + fn prop(ranges: Vec<(char, char)>) -> bool { + class(&ranges).canonicalize().into_iter().all(|r| r.start <= r.end) + } + qc(prop as fn(Vec<(char, char)>) -> bool); +} + +/// A wrapper type for generating "regex-like" Unicode strings. +/// +/// In particular, this type's `Arbitrary` impl specifically biases toward +/// special regex characters to make test cases more interesting. 
+#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] +struct RegexLikeString(String); + +impl Arbitrary for RegexLikeString { + fn arbitrary(g: &mut G) -> RegexLikeString { + const SPECIAL: &'static [char] = &[ + '\\', '.', '+', '*', '?', '(', ')', '|', '[', ']', '{', '}', + '^', '$', + ]; + // Generating random Unicode strings results in mostly uninteresting + // regexes. Namely, they'll mostly just be literals. + // To make properties using regex strings more interesting, we bias + // toward selecting characters of significance to a regex. + let size = { let s = g.size(); g.gen_range(0, s) }; + RegexLikeString((0..size).map(|_| { + if g.gen_weighted_bool(3) { + *g.choose(SPECIAL).unwrap() + } else { + g.gen() + } + }).collect()) + } + + fn shrink(&self) -> Box> { + // The regular `String` shrinker is good enough. + Box::new(self.0.shrink().map(RegexLikeString)) + } +} + +/// A special type for generating small non-zero sized ASCII strings. +#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] +struct SmallAscii(String); + +impl Arbitrary for SmallAscii { + fn arbitrary(g: &mut G) -> SmallAscii { + use std::char::from_u32; + let size = g.gen_range(1, 5); + SmallAscii((0..size) + .map(|_| from_u32(g.gen_range(97, 123)).unwrap()) + .collect()) + } + + fn shrink(&self) -> Box> { + Box::new(self.0.shrink().map(SmallAscii)) + } +} + +#[test] +fn parser_never_panics() { + fn prop(s: RegexLikeString) -> bool { + let _ = Expr::parse(&s.0); true + } + qc(prop as fn(RegexLikeString) -> bool); +} + +// Testing entire expressions. +// +// We only have one test at the moment, but the machinery could be useful +// for other things. +// +// In particular, Russ Cox writes about testing regexes by comparing the +// strings they match with other regex implementations. A fuzzer/shrinker +// (which is what's implemented below) would be a great way to drive that +// process. 
---AG + +impl Arbitrary for Expr { + fn arbitrary(g: &mut G) -> Expr { + fix_capture_indices(gen_expr(g, 0, ExprType::Anything)).simplify() + } + + fn shrink(&self) -> Box> { + use Expr::*; + + let nada = || Box::new(None.into_iter()); + let es: Box> = match *self { + Empty | AnyChar | AnyCharNoNL + | StartLine | EndLine | StartText | EndText + | WordBoundary | NotWordBoundary => nada(), + Literal { ref chars, .. } if chars.len() == 1 => nada(), + Literal { ref chars, casei } => { + Box::new((chars.clone(), casei) + .shrink() + .filter(|&(ref chars, _)| chars.len() > 0) + .map(|(chars, casei)| { + Literal { chars: chars, casei: casei } + })) + } + Class(ref cls) => Box::new(cls.shrink().map(Class)), + Group { ref e, ref i, ref name } => { + let (i, name) = (i.clone(), name.clone()); + Box::new(e.clone().shrink() + .chain(e.clone().shrink() + .map(move |e| Group { + e: Box::new(e), + i: i.clone(), + name: name.clone(), + }))) + } + Repeat { ref e, ref r, greedy } => { + Box::new((*e.clone(), r.clone()) + .shrink() + .filter(|&(ref e, _)| e.can_repeat()) + .map(move |(e, r)| Repeat { + e: Box::new(e), + r: r, + greedy: greedy, + })) + } + // Concat(ref es) if es.len() <= 2 => nada(), + Concat(ref es) => { + Box::new(es.clone() + .shrink() + .filter(|es| es.len() > 0) + .map(|mut es| if es.len() == 1 { + es.pop().unwrap() + } else { + Concat(es) + })) + } + // Alternate(ref es) if es.len() <= 2 => nada(), + Alternate(ref es) => { + Box::new(es.clone() + .shrink() + .filter(|es| es.len() > 0) + .map(|mut es| if es.len() == 1 { + es.pop().unwrap() + } else { + Alternate(es) + })) + } + }; + Box::new(es.map(|e| fix_capture_indices(e).simplify())) + } +} + +enum ExprType { + NoSequences, // disallow concat/alternate + Anything, +} + +fn gen_expr(g: &mut G, depth: u32, ty: ExprType) -> Expr { + use Expr::*; + let ub = match (depth as usize >= g.size(), ty) { + (true, _) => 11, + (false, ExprType::NoSequences) => 13, + (false, ExprType::Anything) => 15, + }; + match 
g.gen_range(1, ub) { + 0 => Empty, + 1 => Literal { + chars: SmallAscii::arbitrary(g).0.chars().collect(), + casei: g.gen(), + }, + 2 => AnyChar, + 3 => AnyCharNoNL, + 4 => Class(CharClass::arbitrary(g)), + 5 => StartLine, + 6 => EndLine, + 7 => StartText, + 8 => EndText, + 9 => WordBoundary, + 10 => NotWordBoundary, + 11 => gen_group_expr(g, depth + 1), + 12 => Repeat { + e: Box::new(gen_repeatable_expr(g, depth + 1)), + r: Repeater::arbitrary(g), + greedy: bool::arbitrary(g), + }, + 13 => { + let size = { let s = g.size(); g.gen_range(2, s) }; + Concat((0..size) + .map(|_| { + gen_expr(g, depth + 1, ExprType::NoSequences) + }) + .collect()) + } + 14 => { + let size = { let s = g.size(); g.gen_range(2, s) }; + Alternate((0..size) + .map(|_| { + gen_expr(g, depth + 1, ExprType::NoSequences) + }) + .collect()) + } + _ => unreachable!() + } +} + +fn gen_repeatable_expr(g: &mut G, depth: u32) -> Expr { + use Expr::*; + match g.gen_range(1, 6) { + 0 => Empty, + 1 => Literal { + chars: vec![Arbitrary::arbitrary(g)], + casei: g.gen(), + }, + 2 => AnyChar, + 3 => AnyCharNoNL, + 4 => Class(CharClass::arbitrary(g)), + 5 => gen_group_expr(g, depth + 1), + _ => unreachable!(), + } +} + +fn gen_group_expr(g: &mut G, depth: u32) -> Expr { + let (i, name) = if g.gen() { + (None, None) + } else { + (Some(0), if g.gen() { + Some(SmallAscii::arbitrary(g).0) + } else { + None + }) + }; + Expr::Group { + e: Box::new(gen_expr(g, depth + 1, ExprType::Anything)), + i: i, + name: name, + } +} + +fn fix_capture_indices(e: Expr) -> Expr { + fn bx(e: Expr) -> Box { Box::new(e) } + fn fix(e: Expr, capi: &mut usize, names: &mut Vec) -> Expr { + use Expr::*; + match e { + Group { e, i: Some(_), mut name } => { + *capi += 1; + let i = *capi; + let mut dupe_name = false; + if let Some(ref n1) = name { + if names.iter().any(|n2| n1 == n2) { + dupe_name = true; + } else { + names.push(n1.clone()); + } + } + if dupe_name { name = None; } + Group { e: bx(fix(*e, capi, names)), i: Some(i), name: name 
} + } + Group { e, i, name } => { + Group { e: bx(fix(*e, capi, names)), i: i, name: name } + } + Repeat { e, r, greedy } => { + Repeat { e: bx(fix(*e, capi, names)), r: r, greedy: greedy } + } + Concat(es) => + Concat(es.into_iter().map(|e| fix(e, capi, names)).collect()), + Alternate(es) => + Alternate(es.into_iter().map(|e| fix(e, capi, names)).collect()), + e => e, + } + } + fix(e, &mut 0, &mut vec![]) +} + +impl Arbitrary for Repeater { + fn arbitrary(g: &mut G) -> Repeater { + use Repeater::*; + match g.gen_range(0, 4) { + 0 => ZeroOrOne, + 1 => ZeroOrMore, + 2 => OneOrMore, + 3 => { + use std::cmp::{max, min}; + let n1 = Arbitrary::arbitrary(g); + let n2 = Arbitrary::arbitrary(g); + Range { + min: min(n1, n2), + max: if g.gen() { None } else { Some(max(n1, n2)) }, + } + }, + _ => unreachable!(), + } + } + + fn shrink(&self) -> Box> { + use Repeater::*; + match *self { + ZeroOrOne | ZeroOrMore | OneOrMore => Box::new(None.into_iter()), + Range { min, max } => { + Box::new((min, max) + .shrink() + .map(|(min, max)| Range { min: min, max: max })) + } + } + } +} + +impl Arbitrary for CharClass { + fn arbitrary(g: &mut G) -> CharClass { + let mut ranges: Vec = Arbitrary::arbitrary(g); + if ranges.is_empty() { + ranges.push(Arbitrary::arbitrary(g)); + } + let cls = CharClass { + ranges: ranges, + casei: false, + }.canonicalize(); + if g.gen() { cls.case_fold() } else { cls } + } + + fn shrink(&self) -> Box> { + Box::new((self.ranges.clone(), self.casei) + .shrink() + .filter(|&(ref ranges, _)| ranges.len() > 0) + .map(|(ranges, casei)| { + let cls = CharClass { + ranges: ranges, + casei: casei, + }.canonicalize(); + if casei { cls.case_fold() } else { cls } + })) + } +} + +impl Arbitrary for ClassRange { + fn arbitrary(g: &mut G) -> ClassRange { + use std::char::from_u32; + ClassRange::new( + from_u32(g.gen_range(97, 123)).unwrap(), + from_u32(g.gen_range(97, 123)).unwrap(), + ) + } + + fn shrink(&self) -> Box> { + Box::new((self.start, self.end) + 
.shrink().map(|(s, e)| ClassRange::new(s, e))) + } +} + +#[test] +fn display_regex_roundtrips() { + // Given an AST, if we print it as a regex and then re-parse it, do we + // get back the same AST? + // A lot of this relies crucially on regex simplification. So this is + // testing `Expr::simplify` as much as it is testing the `Display` impl. + fn prop(e: Expr) -> bool { + e == Expr::parse(&e.to_string()).unwrap() + } + QuickCheck::new() + .tests(10_000) + .max_tests(20_000) + .gen(StdGen::new(::rand::thread_rng(), 50)) + .quickcheck(prop as fn(Expr) -> bool); +} diff --git a/src/unicode.rs b/regex_syntax/src/unicode.rs similarity index 100% rename from src/unicode.rs rename to regex_syntax/src/unicode.rs diff --git a/scripts/unicode.py b/scripts/unicode.py index f734b78099..05bf78c9d5 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -20,7 +20,7 @@ # Since this should not require frequent updates, we just store this # out-of-line and check the unicode.rs file into git. -import fileinput, re, os, sys, operator +import fileinput, re, os, sys preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at diff --git a/src/compile.rs b/src/compile.rs index fd1cf27c62..413da3e9e8 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -14,61 +14,29 @@ use self::Inst::*; use std::cmp; -use std::iter::repeat; -use parse; -use parse::{Flags, FLAG_EMPTY}; -use parse::Ast::{ - Nothing, Literal, Dot, AstClass, Begin, End, WordBoundary, Capture, - Cat, Alt, Rep, -}; -use parse::Repeater::{ZeroOne, ZeroMore, OneMore}; +use syntax::{self, Expr, Repeater}; +use Error; pub type InstIdx = usize; /// An instruction, the underlying unit of a compiled regular expression +#[allow(missing_docs)] #[derive(Debug, Clone)] pub enum Inst { /// When a Match instruction is executed, the current thread is successful. Match, - - /// The OneChar instruction matches a literal character. 
- /// The flags indicate whether to do a case insensitive match. - OneChar(char, Flags), - - /// The CharClass instruction tries to match one input character against - /// the range of characters given. - /// The flags indicate whether to do a case insensitive match. - CharClass(Vec<(char, char)>, Flags), - - /// Matches any character except new lines. - /// The flags indicate whether to include the '\n' character. - Any(Flags), - - /// Matches the beginning of the string, consumes no characters. - /// The flags indicate whether it matches if the preceding character - /// is a new line. - EmptyBegin(Flags), - - /// Matches the end of the string, consumes no characters. - /// The flags indicate whether it matches if the proceeding character - /// is a new line. - EmptyEnd(Flags), - - /// Matches a word boundary (\w on one side and \W \A or \z on the other), - /// and consumes no character. - /// The flags indicate whether this matches a word boundary or something - /// that isn't a word boundary. - EmptyWordBoundary(Flags), - - /// Saves the current position in the input string to the Nth save slot. + OneChar { c: char, casei: bool }, + CharClass(syntax::CharClass), + Any, + AnyNoNL, + StartLine, + EndLine, + StartText, + EndText, + WordBoundary, + NotWordBoundary, Save(usize), - - /// Jumps to the instruction at the index given. Jump(InstIdx), - - /// Jumps to the instruction at the first index given. If that leads to - /// a panic state, then the instruction at the second index given is - /// tried. Split(InstIdx, InstIdx), } @@ -90,14 +58,15 @@ pub struct Program { impl Program { /// Compiles a Regex given its AST. 
- pub fn new(ast: parse::Ast) -> (Program, Vec>) { + pub fn new(ast: Expr, size: usize) -> Result<(Program, Vec>), Error> { let mut c = Compiler { insts: Vec::with_capacity(100), - names: Vec::with_capacity(10), + names: vec![None], + size_limit: size, }; c.insts.push(Save(0)); - c.compile(ast); + try!(c.compile(ast)); c.insts.push(Save(1)); c.insts.push(Match); @@ -107,17 +76,17 @@ impl Program { let mut pre = String::with_capacity(5); for inst in c.insts[1..].iter() { match *inst { - OneChar(c, FLAG_EMPTY) => pre.push(c), + OneChar { c, casei: false } => pre.push(c), _ => break } } - let Compiler { insts, names } = c; + let Compiler { insts, names, .. } = c; let prog = Program { insts: insts, prefix: pre, }; - (prog, names) + Ok((prog, names)) } /// Returns the total number of capture groups in the regular expression. @@ -138,6 +107,7 @@ impl Program { struct Compiler { insts: Vec, names: Vec>, + size_limit: usize, } // The compiler implemented here is extremely simple. Most of the complexity @@ -145,83 +115,132 @@ struct Compiler { // The only tricky thing here is patching jump/split instructions to point to // the right instruction. 
impl Compiler { - fn compile(&mut self, ast: parse::Ast) { + fn check_size(&self) -> Result<(), Error> { + if self.insts.len() * ::std::mem::size_of::() > self.size_limit { + Err(Error::CompiledTooBig(self.size_limit)) + } else { + Ok(()) + } + } + + fn compile(&mut self, ast: Expr) -> Result<(), Error> { match ast { - Nothing => {}, - Literal(c, flags) => self.push(OneChar(c, flags)), - Dot(nl) => self.push(Any(nl)), - AstClass(ranges, flags) => self.push(CharClass(ranges, flags)), - Begin(flags) => self.push(EmptyBegin(flags)), - End(flags) => self.push(EmptyEnd(flags)), - WordBoundary(flags) => self.push(EmptyWordBoundary(flags)), - Capture(cap, name, x) => { - let len = self.names.len(); - if cap >= len { - self.names.extend(repeat(None).take(10 + cap - len)) + Expr::Empty => {}, + Expr::Literal { chars, casei } => { + for c in chars { + self.push(OneChar { c: c, casei: casei }); } - self.names[cap] = name; - - self.push(Save(2 * cap)); - self.compile(*x); - self.push(Save(2 * cap + 1)); } - Cat(xs) => { - for x in xs.into_iter() { - self.compile(x) + Expr::AnyChar => self.push(Any), + Expr::AnyCharNoNL => self.push(AnyNoNL), + Expr::Class(cls) => self.push(CharClass(cls)), + Expr::StartLine => self.push(StartLine), + Expr::EndLine => self.push(EndLine), + Expr::StartText => self.push(StartText), + Expr::EndText => self.push(EndText), + Expr::WordBoundary => self.push(WordBoundary), + Expr::NotWordBoundary => self.push(NotWordBoundary), + Expr::Group { e, i: None, name: None } => try!(self.compile(*e)), + Expr::Group { e, i, name } => { + let i = i.expect("capture index"); + self.names.push(name); + self.push(Save(2 * i)); + try!(self.compile(*e)); + self.push(Save(2 * i + 1)); + } + Expr::Concat(es) => { + for e in es { + try!(self.compile(e)); } } - Alt(x, y) => { + Expr::Alternate(mut es) => { + // TODO: Don't use recursion here. 
---AG + if es.len() == 0 { + return Ok(()); + } + let e1 = es.remove(0); + if es.len() == 0 { + try!(self.compile(e1)); + return Ok(()); + } + let e2 = Expr::Alternate(es); // this causes recursion + let split = self.empty_split(); // push: split 0, 0 let j1 = self.insts.len(); - self.compile(*x); // push: insts for x + try!(self.compile(e1)); // push: insts for x let jmp = self.empty_jump(); // push: jmp 0 let j2 = self.insts.len(); - self.compile(*y); // push: insts for y + try!(self.compile(e2)); // push: insts for y let j3 = self.insts.len(); self.set_split(split, j1, j2); // split 0, 0 -> split j1, j2 self.set_jump(jmp, j3); // jmp 0 -> jmp j3 } - Rep(x, ZeroOne, g) => { + Expr::Repeat { e, r: Repeater::ZeroOrOne, greedy } => { let split = self.empty_split(); let j1 = self.insts.len(); - self.compile(*x); + try!(self.compile(*e)); let j2 = self.insts.len(); - if g.is_greedy() { + if greedy { self.set_split(split, j1, j2); } else { self.set_split(split, j2, j1); } } - Rep(x, ZeroMore, g) => { + Expr::Repeat { e, r: Repeater::ZeroOrMore, greedy } => { let j1 = self.insts.len(); let split = self.empty_split(); let j2 = self.insts.len(); - self.compile(*x); + try!(self.compile(*e)); let jmp = self.empty_jump(); let j3 = self.insts.len(); self.set_jump(jmp, j1); - if g.is_greedy() { + if greedy { self.set_split(split, j2, j3); } else { self.set_split(split, j3, j2); } } - Rep(x, OneMore, g) => { + Expr::Repeat { e, r: Repeater::OneOrMore, greedy } => { let j1 = self.insts.len(); - self.compile(*x); + try!(self.compile(*e)); let split = self.empty_split(); let j2 = self.insts.len(); - if g.is_greedy() { + if greedy { self.set_split(split, j1, j2); } else { self.set_split(split, j2, j1); } } + Expr::Repeat { e, r: Repeater::Range { min, max: None }, greedy } => { + let e = *e; + for _ in 0..min { + try!(self.compile(e.clone())); + } + try!(self.compile(Expr::Repeat { + e: Box::new(e), + r: Repeater::ZeroOrMore, + greedy: greedy, + })); + } + Expr::Repeat { e, r: 
Repeater::Range { min, max: Some(max) }, greedy } => { + let e = *e; + for _ in 0..min { + try!(self.compile(e.clone())); + } + for _ in min..max { + try!(self.compile(Expr::Repeat { + e: Box::new(e.clone()), + r: Repeater::ZeroOrOne, + greedy: greedy, + })); + } + } } + self.check_size() } /// Appends the given instruction to the program. diff --git a/src/lib.rs b/src/lib.rs index c5cbb9c126..d63c98dad4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,17 +20,17 @@ //! details on the API, please see the documentation for the `Regex` type. //! //! # Usage -//! +//! //! This crates is [on crates.io](https://crates.io/crates/regex) and can be //! used by adding `regex` to your dependencies in your project's `Cargo.toml`. -//! +//! //! ```toml //! [dependencies] //! regex = "0.1.8" //! ``` -//! +//! //! and this to your crate root: -//! +//! //! ```rust //! extern crate regex; //! ``` @@ -43,11 +43,8 @@ //! //! ```rust //! use regex::Regex; -//! let re = match Regex::new(r"^\d{4}-\d{2}-\d{2}$") { -//! Ok(re) => re, -//! Err(err) => panic!("{}", err), -//! }; -//! assert_eq!(re.is_match("2014-01-01"), true); +//! let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap(); +//! assert!(re.is_match("2014-01-01")); //! ``` //! //! Notice the use of the `^` and `$` anchors. In this crate, every expression @@ -55,8 +52,9 @@ //! it to match anywhere in the text. Anchors can be used to ensure that the //! full text matches an expression. //! -//! This example also demonstrates the utility of [raw -//! strings](../reference.html#character-and-string-literals) in Rust, which +//! This example also demonstrates the utility of +//! [raw strings](http://doc.rust-lang.org/stable/reference.html#raw-byte-string-literals) +//! in Rust, which //! are just like regular strings except they are prefixed with an `r` and do //! not process any escape sequences. For example, `"\\d"` is the same //! expression as `r"\d"`. @@ -81,7 +79,7 @@ //! //! fn main() { //! 
let re = regex!(r"^\d{4}-\d{2}-\d{2}$"); -//! assert_eq!(re.is_match("2014-01-01"), true); +//! assert!(re.is_match("2014-01-01")); //! } //! ``` //! @@ -96,20 +94,9 @@ //! expressions, but 100+ calls to `regex!` will probably result in a //! noticeably bigger binary. //! -//! **NOTE**: This is implemented using a compiler plugin, which will not be +//! **NOTE**: This is implemented using a compiler plugin, which is not //! available on the Rust 1.0 beta/stable channels. Therefore, you'll only -//! be able to use `regex!` on the nightlies. If you want to retain the -//! `regex!` macro, you can cheat and define this: -//! -//! ```rust -//! macro_rules! regex( -//! ($s:expr) => (regex::Regex::new($s).unwrap()); -//! ); -//! ``` -//! -//! But this just replaces native regexes with dynamic regexes under the hood. -//! Moreover, this will cause your program to panic *at runtime* if an invalid -//! regular expression is given. +//! be able to use `regex!` on the nightlies. //! //! # Example: iterating over capture groups //! @@ -159,6 +146,25 @@ //! provides more flexibility than is seen here. (See the documentation for //! `Regex::replace` for more details.) //! +//! Note that if your regex gets complicated, you can use the `x` flag to +//! enable insignificant whitespace mode, which also lets you write comments: +//! +//! ```rust +//! # extern crate regex; use regex::Regex; +//! # fn main() { +//! let re = Regex::new(r"(?x) +//! (?P<y>\d{4}) # the year +//! - +//! (?P<m>\d{2}) # the month +//! - +//! (?P<d>\d{2}) # the day +//! ").unwrap(); +//! let before = "2012-03-14, 2013-01-01 and 2014-07-05"; +//! let after = re.replace_all(before, "$m/$d/$y"); +//! assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014"); +//! # } +//! ``` +//! //! # Pay for what you use //! //! With respect to searching text with a regular expression, there are three @@ -180,15 +186,16 @@ //! # Unicode //! //! This implementation executes regular expressions **only** on sequences of -//!
Unicode code points while exposing match locations as byte indices into the -//! search string. +//! Unicode scalar values while exposing match locations as byte indices into +//! the search string. //! -//! Currently, only naive case folding is supported. Namely, when matching -//! case insensitively, the characters are first converted to their uppercase -//! forms and then compared. +//! Currently, only simple case folding is supported. Namely, when matching +//! case insensitively, the characters are first mapped using the +//! [simple case folding](ftp://ftp.unicode.org/Public/UNIDATA/CaseFolding.txt) +//! mapping. //! //! Regular expressions themselves are also **only** interpreted as a sequence -//! of Unicode code points. This means you can use Unicode characters +//! of Unicode scalar values. This means you can use Unicode characters //! directly in your expression: //! //! ```rust @@ -214,7 +221,11 @@ //! # Syntax //! //! The syntax supported in this crate is almost in an exact correspondence -//! with the syntax supported by RE2. +//! with the syntax supported by RE2. It is documented below. +//! +//! Note that the regular expression parser and abstract syntax are exposed in +//! a separate crate, +//! [`regex-syntax`](../regex_syntax/index.html). //! //! ## Matching one character //! @@ -294,6 +305,7 @@ //! m multi-line mode: ^ and $ match begin/end of line //! s allow . to match \n //! U swap the meaning of x* and x*? +//! x ignore whitespace and allow line comments (starting with `#`) //! //! //! Here's an example that matches case insensitively for only part of the @@ -361,22 +373,19 @@ //! //! # Untrusted input //! -//! There are two factors to consider here: untrusted regular expressions and -//! untrusted search text. -//! -//! Currently, there are no counter-measures in place to prevent a malicious -//! user from writing an expression that may use a lot of resources. One such -//! 
example is to repeat counted repetitions: `((a{100}){100}){100}` will try -//! to repeat the `a` instruction `100^3` times. Essentially, this means it's -//! very easy for an attacker to exhaust your system's memory if they are -//! allowed to execute arbitrary regular expressions. A possible solution to -//! this is to impose a hard limit on the size of a compiled expression, but it -//! does not yet exist. -//! -//! The story is a bit better with untrusted search text, since this crate's -//! implementation provides `O(nm)` search where `n` is the number of -//! characters in the search text and `m` is the number of instructions in a -//! compiled expression. +//! This crate can handle both untrusted regular expressions and untrusted +//! search text. +//! +//! Untrusted regular expressions are handled by capping the size of a compiled +//! regular expression. (See `Regex::with_size_limit`.) Without this, it would +//! be trivial for an attacker to exhaust your system's memory with expressions +//! like `a{100}{100}{100}`. +//! +//! Untrusted search text is allowed because the matching engine(s) in this +//! crate have time complexity `O(mn)` (with `m ~ regex` and `n ~ search +//! text`), which means there's no way to cause exponential blow-up like with +//! some other regular expression engines. (We pay for this by disallowing +//! features like arbitrary look-ahead and back-references.) 
#![deny(missing_docs)] #![cfg_attr(test, deny(warnings))] @@ -385,16 +394,17 @@ html_favicon_url = "http://www.rust-lang.org/favicon.ico", html_root_url = "http://doc.rust-lang.org/regex/")] -pub use parse::Error; -pub use re::{Regex, Captures, SubCaptures, SubCapturesPos, SubCapturesNamed}; -pub use re::{FindCaptures, FindMatches}; -pub use re::{Replacer, NoExpand, RegexSplits, RegexSplitsN}; -pub use re::{quote, is_match}; +extern crate regex_syntax as syntax; + +pub use re::{ + Regex, Error, Captures, SubCaptures, SubCapturesPos, SubCapturesNamed, + FindCaptures, FindMatches, + Replacer, NoExpand, RegexSplits, RegexSplitsN, + quote, is_match, +}; mod compile; -mod parse; mod re; -mod unicode; mod vm; /// The `native` module exists to support the `regex!` macro. Do not use. @@ -416,17 +426,11 @@ pub mod native { // On the bright side, `rustdoc` lets us hide this from the public API // documentation. pub use compile::Program; - pub use compile::Inst::{ - Match, OneChar, CharClass, Any, EmptyBegin, EmptyEnd, - EmptyWordBoundary, Save, Jump, Split, - }; - pub use parse::{ - FLAG_EMPTY, FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, - FLAG_SWAP_GREED, FLAG_NEGATED, - }; + pub use compile::Inst; + pub use syntax::simple_case_fold; pub use re::{ExDynamic, ExNative}; pub use re::Regex::{Dynamic, Native}; - pub use vm::{CharReader, find_prefix, simple_case_fold}; + pub use vm::{CharReader, find_prefix}; pub use vm::MatchKind::{self, Exists, Location, Submatches}; pub use vm::StepState::{ self, StepMatchEarlyReturn, StepMatch, StepContinue, diff --git a/src/parse.rs b/src/parse.rs deleted file mode 100644 index 6ec8362e49..0000000000 --- a/src/parse.rs +++ /dev/null @@ -1,1160 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. 
This file may not be copied, modified, or distributed -// except according to those terms. - -use std::char; -use std::cmp; -use std::fmt; - -/// Static data containing Unicode ranges for general categories and scripts. -use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW}; -use vm::simple_case_fold; - -use self::Ast::*; -use self::Repeater::*; -use self::Greed::*; -use self::BuildAst::*; - -/// The maximum number of repetitions allowed with the `{n,m}` syntax. -static MAX_REPEAT: usize = 1000; - -/// Error corresponds to something that can go wrong while parsing -/// a regular expression. -/// -/// (Once an expression is compiled, it is not possible to produce an error -/// via searching, splitting or replacing.) -#[derive(Debug)] -pub struct Error { - /// The *approximate* character index of where the error occurred. - pub pos: usize, - /// A message describing the error. - pub msg: String, -} - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Regex syntax error near position {}: {}", - self.pos, self.msg) - } -} - -/// Represents the abstract syntax of a regular expression. -/// It is showable so that error messages resulting from a bug can provide -/// useful information. -/// It is cloneable so that expressions can be repeated for the counted -/// repetition feature. (No other copying is done.) -/// -/// Note that this representation prevents one from reproducing the regex as -/// it was typed. (But it could be used to reproduce an equivalent regex.) -#[derive(Debug, Clone)] -pub enum Ast { - Nothing, - Literal(char, Flags), - Dot(Flags), - AstClass(Vec<(char, char)>, Flags), - Begin(Flags), - End(Flags), - WordBoundary(Flags), - Capture(usize, Option, Box), - // Represent concatenation as a flat vector to avoid blowing the - // stack in the compiler. 
- Cat(Vec), - Alt(Box, Box), - Rep(Box, Repeater, Greed), -} - -#[derive(Debug, PartialEq, Clone)] -pub enum Repeater { - ZeroOne, - ZeroMore, - OneMore, -} - -#[derive(Debug, Clone)] -pub enum Greed { - Greedy, - Ungreedy, -} - -impl Copy for Greed {} - -impl Greed { - pub fn is_greedy(&self) -> bool { - match *self { - Greedy => true, - _ => false, - } - } - - fn swap(self, swapped: bool) -> Greed { - if !swapped { return self } - match self { - Greedy => Ungreedy, - Ungreedy => Greedy, - } - } -} - -/// BuildAst is a regrettable type that represents intermediate state for -/// constructing an abstract syntax tree. Its central purpose is to facilitate -/// parsing groups and alternations while also maintaining a stack of flag -/// state. -#[derive(Debug)] -enum BuildAst { - Expr(Ast), - Paren(Flags, usize, String), // '(' - Bar, // '|' -} - -impl BuildAst { - fn paren(&self) -> bool { - match *self { - Paren(_, _, _) => true, - _ => false, - } - } - - fn flags(&self) -> Flags { - match *self { - Paren(flags, _, _) => flags, - _ => panic!("Cannot get flags from {:?}", self), - } - } - - fn capture(&self) -> Option { - match *self { - Paren(_, 0, _) => None, - Paren(_, c, _) => Some(c), - _ => panic!("Cannot get capture group from {:?}", self), - } - } - - fn capture_name(&self) -> Option { - match *self { - Paren(_, 0, _) => None, - Paren(_, _, ref name) => { - if name.len() == 0 { - None - } else { - Some(name.clone()) - } - } - _ => panic!("Cannot get capture name from {:?}", self), - } - } - - fn bar(&self) -> bool { - match *self { - Bar => true, - _ => false, - } - } - - fn unwrap(self) -> Result { - match self { - Expr(x) => Ok(x), - _ => panic!("Tried to unwrap non-AST item: {:?}", self), - } - } -} - -/// Flags represents all options that can be twiddled by a user in an -/// expression. 
-pub type Flags = u8; - -pub const FLAG_EMPTY: u8 = 0; -pub const FLAG_NOCASE: u8 = 1 << 0; // i -pub const FLAG_MULTI: u8 = 1 << 1; // m -pub const FLAG_DOTNL: u8 = 1 << 2; // s -pub const FLAG_SWAP_GREED: u8 = 1 << 3; // U -pub const FLAG_NEGATED: u8 = 1 << 4; // char class or not word boundary - -struct Parser { - // The input, parsed only as a sequence of UTF8 code points. - chars: Vec, - // The index of the current character in the input. - chari: usize, - // The intermediate state representing the AST. - stack: Vec, - // The current set of flags. - flags: Flags, - // The total number of capture groups. - // Incremented each time an opening left paren is seen (assuming it is - // opening a capture group). - caps: usize, - // A set of all capture group names used only to detect duplicates. - names: Vec, -} - -pub fn parse(s: &str) -> Result { - Parser { - chars: s.chars().collect(), - chari: 0, - stack: vec!(), - flags: FLAG_EMPTY, - caps: 0, - names: vec!(), - }.parse() -} - -impl Parser { - fn parse(&mut self) -> Result { - if self.chars.len() == 0 { - return Ok(Nothing); - } - loop { - let c = self.cur(); - match c { - '?' 
| '*' | '+' => try!(self.push_repeater(c)), - '\\' => { - let ast = try!(self.parse_escape()); - if let AstClass(mut ranges, flags) = ast { - if flags & FLAG_NOCASE > 0 { - ranges = case_fold_and_combine_ranges(ranges); - } - self.push(AstClass(ranges, flags)) - } else { - self.push(ast) - } - } - '{' => try!(self.parse_counted()), - '[' => match self.try_parse_ascii() { - None => try!(self.parse_class()), - Some(class) => self.push(class), - }, - '(' => { - if self.peek_is(1, '?') { - try!(self.expect('?')); - try!(self.parse_group_opts()); - } else { - self.caps += 1; - self.stack.push(Paren(self.flags, - self.caps, - "".to_string())) - } - } - ')' => { - let catfrom = try!( - self.pos_last(false, |x| x.paren() || x.bar())); - try!(self.concat(catfrom)); - - let altfrom = try!(self.pos_last(false, |x| x.paren())); - // Before we smush the alternates together and pop off the - // left paren, let's grab the old flags and see if we - // need a capture. - let (cap, cap_name, oldflags) = { - let paren = &self.stack[altfrom-1]; - (paren.capture(), paren.capture_name(), paren.flags()) - }; - try!(self.alternate(altfrom)); - self.flags = oldflags; - - // If this was a capture, pop what we just pushed in - // alternate and make it a capture. - if cap.is_some() { - let ast = try!(self.pop_ast()); - self.push(Capture(cap.unwrap(), cap_name, Box::new(ast))); - } - } - '|' => { - let catfrom = try!( - self.pos_last(true, |x| x.paren() || x.bar())); - try!(self.concat(catfrom)); - - self.stack.push(Bar); - } - _ => try!(self.push_literal(c)), - } - if !self.next_char() { - break - } - } - - // Try to improve error handling. At this point, there should be - // no remaining open parens. 
- if self.stack.iter().any(|x| x.paren()) { - return self.err("Unclosed parenthesis.") - } - let catfrom = try!(self.pos_last(true, |x| x.bar())); - try!(self.concat(catfrom)); - try!(self.alternate(0)); - - assert!(self.stack.len() == 1); - self.pop_ast() - } - - fn noteof(&mut self, expected: &str) -> Result<(), Error> { - match self.next_char() { - true => Ok(()), - false => { - self.err(&format!("Expected {:?} but got EOF.", expected)) - } - } - } - - fn expect(&mut self, expected: char) -> Result<(), Error> { - match self.next_char() { - true if self.cur() == expected => Ok(()), - true => self.err(&format!("Expected '{}' but got '{}'.", - expected, self.cur())), - false => { - self.err(&format!("Expected '{}' but got EOF.", - expected)) - } - } - } - - fn next_char(&mut self) -> bool { - self.chari += 1; - self.chari < self.chars.len() - } - - fn pop_ast(&mut self) -> Result { - match self.stack.pop().unwrap().unwrap() { - Err(e) => Err(e), - Ok(ast) => Ok(ast), - } - } - - fn push(&mut self, ast: Ast) { - self.stack.push(Expr(ast)) - } - - fn push_repeater(&mut self, c: char) -> Result<(), Error> { - if self.stack.len() == 0 { - return self.err( - "A repeat operator must be preceded by a valid expression.") - } - let rep: Repeater = match c { - '?' 
=> ZeroOne, '*' => ZeroMore, '+' => OneMore, - _ => panic!("Not a valid repeater operator."), - }; - - match self.peek(1) { - Some('*') | Some('+') => - return self.err( - "Double repeat operators are not supported."), - _ => {}, - } - let ast = match self.stack.pop().unwrap() { // checked empty stack ^^ - Paren(_, _, _) | Bar | Expr(Nothing) | Expr(Rep(_, _, _)) => - return self.err("A repreat operator must be preceded by a \ - valid expression."), - Expr(Begin(_)) | Expr(End(_)) | Expr(WordBoundary(_)) => - return self.err( - "Repeat arguments cannot be empty width assertions."), - Expr(ast) => ast, - }; - let greed = try!(self.get_next_greedy()); - self.push(Rep(Box::new(ast), rep, greed)); - Ok(()) - } - - fn push_literal(&mut self, c: char) -> Result<(), Error> { - let flags = self.flags; - match c { - '.' => { - self.push(Dot(flags)) - } - '^' => { - self.push(Begin(flags)) - } - '$' => { - self.push(End(flags)) - } - _ => { - self.push(Literal(c, flags)) - } - } - Ok(()) - } - - // Parses all forms of character classes. - // Assumes that '[' is the current character. - fn parse_class(&mut self) -> Result<(), Error> { - let negated = - if self.peek_is(1, '^') { - try!(self.expect('^')); - true - } else { - false - }; - let mut ranges: Vec<(char, char)> = vec!(); - - while self.peek_is(1, '-') { - try!(self.expect('-')); - ranges.push(('-', '-')) - } - loop { - try!(self.noteof("a closing ']' or a non-empty character class)")); - let mut c = self.cur(); - match c { - '[' => - match self.try_parse_ascii() { - Some(AstClass(mut more_ranges, flags)) => { - more_ranges = combine_ranges(more_ranges); - if flags & FLAG_NEGATED > 0 { - more_ranges = invert_ranges(more_ranges); - } - ranges.extend(more_ranges); - continue - } - Some(ast) => - panic!("Expected Class AST but got '{:?}'", ast), - // Just drop down and try to add as a regular character. 
- None => {}, - }, - '\\' => { - match try!(self.parse_escape()) { - AstClass(mut more_ranges, flags) => { - more_ranges = combine_ranges(more_ranges); - if flags & FLAG_NEGATED > 0 { - more_ranges = invert_ranges(more_ranges); - } - ranges.extend(more_ranges); - continue - } - Literal(c2, _) => c = c2, // process below - Begin(_) | End(_) | WordBoundary(_) => - return self.err( - "\\A, \\z, \\b and \\B are not valid escape \ - sequences inside a character class."), - ast => panic!("Unexpected AST item '{:?}'", ast), - } - } - ']' if ranges.len() > 0 => { - if self.flags & FLAG_NOCASE > 0 { - ranges = case_fold_and_combine_ranges(ranges) - } else { - ranges = combine_ranges(ranges); - } - if negated { - ranges = invert_ranges(ranges); - } - let flags = self.flags & FLAG_NOCASE; - self.push(AstClass(ranges, flags)); - return Ok(()) - } - _ => {} - } - - if self.peek_is(1, '-') && !self.peek_is(2, ']') { - try!(self.expect('-')); - // The regex can't end here. - try!(self.noteof("not a ']'")); - // End the range with a single character or character escape. - let mut c2 = self.cur(); - if c2 == '\\' { - match try!(self.parse_escape()) { - Literal(c3, _) => c2 = c3, // allow literal escapes below - ast => return self.err(&format!( - "Expected a literal, but got {:?}.", ast)), - } - } - if c2 < c { - return self.err(&format!( - "Invalid character class range '{}-{}'", c, c2)) - } - ranges.push((c, self.cur())) - } else { - ranges.push((c, c)) - } - } - } - - // Tries to parse an ASCII character class of the form [:name:]. - // If successful, returns an AST character class corresponding to name - // and moves the parser to the final ']' character. - // If unsuccessful, no state is changed and None is returned. - // Assumes that '[' is the current character. 
- fn try_parse_ascii(&mut self) -> Option { - if !self.peek_is(1, ':') { - return None - } - let closer = - match self.pos(']') { - Some(i) => i, - None => return None, - }; - if self.chars[closer-1] != ':' { - return None - } - if closer - self.chari <= 3 { - return None - } - let mut name_start = self.chari + 2; - let negated = - if self.peek_is(2, '^') { - name_start += 1; - FLAG_NEGATED - } else { - FLAG_EMPTY - }; - let name = self.slice(name_start, closer - 1); - match find_class(ASCII_CLASSES, &name) { - None => None, - Some(ranges) => { - self.chari = closer; - let flags = negated | (self.flags & FLAG_NOCASE); - Some(AstClass(combine_ranges(ranges), flags)) - } - } - } - - // Parses counted repetition. Supports: - // {n}, {n,}, {n,m}, {n}?, {n,}? and {n,m}? - // Assumes that '{' is the current character. - // Returns either an error or moves the parser to the final '}' character. - // (Or the '?' character if not greedy.) - fn parse_counted(&mut self) -> Result<(), Error> { - // Scan until the closing '}' and grab the stuff in {}. - let start = self.chari; - let closer = - match self.pos('}') { - Some(i) => i, - None => { - return self.err(&format!("No closing brace for counted \ - repetition starting at position \ - {}.", start)) - } - }; - self.chari = closer; - let greed = try!(self.get_next_greedy()); - let inner = self.chars[(start + 1)..closer].iter().cloned().collect::(); - - // Parse the min and max values from the regex. 
- let (mut min, mut max): (usize, Option); - if !inner.contains(",") { - min = try!(self.parse_usize(&inner)); - max = Some(min); - } else { - let pieces: Vec<&str> = inner.splitn(2, ',').collect(); - let (smin, smax) = (pieces[0], pieces[1]); - if smin.len() == 0 { - return self.err("Max repetitions cannot be specified \ - without min repetitions.") - } - min = try!(self.parse_usize(smin)); - max = - if smax.len() == 0 { - None - } else { - Some(try!(self.parse_usize(smax))) - }; - } - - // Do some bounds checking and make sure max >= min. - if min > MAX_REPEAT { - return self.err(&format!( - "{} exceeds maximum allowed repetitions ({})", - min, MAX_REPEAT)); - } - if max.is_some() { - let m = max.unwrap(); - if m > MAX_REPEAT { - return self.err(&format!( - "{} exceeds maximum allowed repetitions ({})", - m, MAX_REPEAT)); - } - if m < min { - return self.err(&format!( - "Max repetitions ({}) cannot be smaller than min \ - repetitions ({}).", m, min)); - } - } - - // Now manipulate the AST be repeating elements. - if max.is_none() { - // Require N copies of what's on the stack and then repeat it. - let ast = try!(self.pop_ast()); - for _ in 0..min { - self.push(ast.clone()) - } - self.push(Rep(Box::new(ast), ZeroMore, greed)); - } else { - // Require N copies of what's on the stack and then repeat it - // up to M times optionally. - let ast = try!(self.pop_ast()); - for _ in 0..min { - self.push(ast.clone()) - } - if let Some(max) = max { - for _ in min..max { - self.push(Rep(Box::new(ast.clone()), ZeroOne, greed)) - } - } - // It's possible that we popped something off the stack but - // never put anything back on it. To keep things simple, add - // a no-op expression. - if min == 0 && (max.is_none() || max == Some(0)) { - self.push(Nothing) - } - } - Ok(()) - } - - // Parses all escape sequences. - // Assumes that '\' is the current character. 
- fn parse_escape(&mut self) -> Result { - try!(self.noteof("an escape sequence following a '\\'")); - - let c = self.cur(); - if is_punct(c) { - return Ok(Literal(c, FLAG_EMPTY)) - } - match c { - 'a' => Ok(Literal('\x07', FLAG_EMPTY)), - 'f' => Ok(Literal('\x0C', FLAG_EMPTY)), - 't' => Ok(Literal('\t', FLAG_EMPTY)), - 'n' => Ok(Literal('\n', FLAG_EMPTY)), - 'r' => Ok(Literal('\r', FLAG_EMPTY)), - 'v' => Ok(Literal('\x0B', FLAG_EMPTY)), - 'A' => Ok(Begin(FLAG_EMPTY)), - 'z' => Ok(End(FLAG_EMPTY)), - 'b' => Ok(WordBoundary(FLAG_EMPTY)), - 'B' => Ok(WordBoundary(FLAG_NEGATED)), - '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7' => Ok(try!(self.parse_octal())), - 'x' => Ok(try!(self.parse_hex())), - 'p' | 'P' => Ok(try!(self.parse_unicode_name())), - 'd' | 'D' | 's' | 'S' | 'w' | 'W' => { - let ranges = perl_unicode_class(c); - let mut flags = self.flags & FLAG_NOCASE; - if c.is_uppercase() { flags |= FLAG_NEGATED } - Ok(AstClass(ranges, flags)) - } - _ => { - self.err(&format!("Invalid escape sequence '\\\\{}'", c)) - } - } - } - - // Parses a Unicode character class name, either of the form \pF where - // F is a one letter Unicode class name or of the form \p{name} where - // name is the Unicode class name. - // Assumes that \p or \P has been read (and 'p' or 'P' is the current - // character). 
- fn parse_unicode_name(&mut self) -> Result { - let negated = if self.cur() == 'P' { FLAG_NEGATED } else { FLAG_EMPTY }; - let mut name: String; - if self.peek_is(1, '{') { - try!(self.expect('{')); - let closer = - match self.pos('}') { - Some(i) => i, - None => return self.err(&format!( - "Missing '}}' for unclosed '{{' at position {}", - self.chari)), - }; - if closer - self.chari + 1 == 0 { - return self.err("No Unicode class name found.") - } - name = self.slice(self.chari + 1, closer); - self.chari = closer; - } else { - if self.chari + 1 >= self.chars.len() { - return self.err("No single letter Unicode class name found.") - } - name = self.slice(self.chari + 1, self.chari + 2); - self.chari += 1; - } - match find_class(UNICODE_CLASSES, &name) { - None => { - return self.err(&format!("Could not find Unicode class '{}'", - name)) - } - Some(ranges) => { - Ok(AstClass(ranges, negated | (self.flags & FLAG_NOCASE))) - } - } - } - - // Parses an octal number, up to 3 digits. - // Assumes that \n has been read, where n is the first digit. - fn parse_octal(&mut self) -> Result { - let start = self.chari; - let mut end = start + 1; - let (d2, d3) = (self.peek(1), self.peek(2)); - if d2 >= Some('0') && d2 <= Some('7') { - try!(self.noteof("expected octal character in [0-7]")); - end += 1; - if d3 >= Some('0') && d3 <= Some('7') { - try!(self.noteof("expected octal character in [0-7]")); - end += 1; - } - } - match from_str_radix_pos_integer(&self.slice(start, end), 8) { - Ok(n) => Ok(Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)), - Err(err) => self.err(&err), - } - } - - // Parse a hex number. Either exactly two digits or anything in {}. - // Assumes that \x has been read. 
- fn parse_hex(&mut self) -> Result { - if !self.peek_is(1, '{') { - try!(self.expect('{')); - return self.parse_hex_two() - } - let start = self.chari + 2; - let closer = - match self.pos('}') { - None => { - return self.err(&format!("Missing '}}' for unclosed \ - '{{' at position {}", start)) - } - Some(i) => i, - }; - self.chari = closer; - self.parse_hex_digits(&self.slice(start, closer)) - } - - // Parses a two-digit hex number. - // Assumes that \xn has been read, where n is the first digit and is the - // current character. - // After return, parser will point at the second digit. - fn parse_hex_two(&mut self) -> Result { - let (start, end) = (self.chari, self.chari + 2); - let bad = self.slice(start - 2, self.chars.len()); - try!(self.noteof(&format!("Invalid hex escape sequence '{}'", bad))); - self.parse_hex_digits(&self.slice(start, end)) - } - - // Parses `s` as a hexadecimal number. - fn parse_hex_digits(&self, s: &str) -> Result { - match from_str_radix_pos_integer(s, 16) { - Ok(n) => Ok(Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)), - Err(err) => self.err(&err), - } - } - - // Parses a named capture. - // Assumes that '(?P<' has been consumed and that the current character - // is '<'. - // When done, parser will be at the closing '>' character. 
- fn parse_named_capture(&mut self) -> Result<(), Error> { - try!(self.noteof("a capture name")); - let closer = - match self.pos('>') { - Some(i) => i, - None => return self.err("Capture name must end with '>'."), - }; - if closer - self.chari == 0 { - return self.err("Capture names must have at least 1 character.") - } - let name = self.slice(self.chari, closer); - if !name.chars().all(is_valid_cap) { - return self.err( - "Capture names can only have underscores, letters and digits.") - } - if self.names.contains(&name) { - return self.err(&format!("Duplicate capture group name '{}'.", name)) - } - self.names.push(name.clone()); - self.chari = closer; - self.caps += 1; - self.stack.push(Paren(self.flags, self.caps, name)); - Ok(()) - } - - // Parses non-capture groups and options. - // Assumes that '(?' has already been consumed and '?' is the current - // character. - fn parse_group_opts(&mut self) -> Result<(), Error> { - if self.peek_is(1, 'P') && self.peek_is(2, '<') { - try!(self.expect('P')); try!(self.expect('<')); - return self.parse_named_capture() - } - let start = self.chari; - let mut flags = self.flags; - let mut sign = 1; - let mut saw_flag = false; - loop { - try!(self.noteof("expected non-empty set of flags or closing ')'")); - match self.cur() { - 'i' => { flags = flags | FLAG_NOCASE; saw_flag = true}, - 'm' => { flags = flags | FLAG_MULTI; saw_flag = true}, - 's' => { flags = flags | FLAG_DOTNL; saw_flag = true}, - 'U' => { flags = flags | FLAG_SWAP_GREED; saw_flag = true}, - '-' => { - if sign < 0 { - return self.err(&format!( - "Cannot negate flags twice in '{}'.", - self.slice(start, self.chari + 1))) - } - sign = -1; - saw_flag = false; - flags = flags ^ flags; - } - ':' | ')' => { - if sign < 0 { - if !saw_flag { - return self.err(&format!( - "A valid flag does not follow negation in '{}'", - self.slice(start, self.chari + 1))) - } - flags = flags ^ flags; - } - if self.cur() == ':' { - // Save the old flags with the opening paren. 
- self.stack.push(Paren(self.flags, 0, "".to_string())); - } - self.flags = flags; - return Ok(()) - } - _ => return self.err(&format!( - "Unrecognized flag '{}'.", self.cur())), - } - } - } - - // Peeks at the next character and returns whether it's ungreedy or not. - // If it is, then the next character is consumed. - fn get_next_greedy(&mut self) -> Result { - Ok(if self.peek_is(1, '?') { - try!(self.expect('?')); - Ungreedy - } else { - Greedy - }.swap(self.flags & FLAG_SWAP_GREED > 0)) - } - - // Searches the stack (starting at the top) until it finds an expression - // for which `pred` returns true. The index of that expression in the - // stack is returned. - // If there's no match, then one of two things happens depending on the - // values of `allow_start`. When it's true, then `0` will be returned. - // Otherwise, an error will be returned. - // Generally, `allow_start` is only true when you're *not* expecting an - // opening parenthesis. - fn pos_last

(&self, allow_start: bool, pred: P) -> Result where - P: FnMut(&BuildAst) -> bool, - { - let from = match self.stack.iter().rev().position(pred) { - Some(i) => i, - None => { - if allow_start { - self.stack.len() - } else { - return self.err("No matching opening parenthesis.") - } - } - }; - // Adjust index since 'from' is for the reversed stack. - // Also, don't include the '(' or '|'. - Ok(self.stack.len() - from) - } - - // concat starts at `from` in the parser's stack and concatenates all - // expressions up to the top of the stack. The resulting concatenation is - // then pushed on to the stack. - // Usually `from` corresponds to the position of an opening parenthesis, - // a '|' (alternation) or the start of the entire expression. - fn concat(&mut self, from: usize) -> Result<(), Error> { - let ast = try!(self.build_from(from, concat_flatten)); - self.push(ast); - Ok(()) - } - - // concat starts at `from` in the parser's stack and alternates all - // expressions up to the top of the stack. The resulting alternation is - // then pushed on to the stack. - // Usually `from` corresponds to the position of an opening parenthesis - // or the start of the entire expression. - // This will also drop any opening parens or alternation bars found in - // the intermediate AST. - fn alternate(&mut self, mut from: usize) -> Result<(), Error> { - // Unlike in the concatenation case, we want 'build_from' to continue - // all the way to the opening left paren (so it will be popped off and - // thrown away). But be careful with overflow---we can't count on the - // open paren to be there. - if from > 0 { from = from - 1} - let ast = try!(self.build_from(from, |l,r| Alt(Box::new(l), Box::new(r)))); - self.push(ast); - Ok(()) - } - - // build_from combines all AST elements starting at 'from' in the - // parser's stack using 'mk' to combine them. If any such element is not an - // AST then it is popped off the stack and ignored. 
- fn build_from(&mut self, from: usize, mut mk: F) -> Result where - F: FnMut(Ast, Ast) -> Ast, - { - if from >= self.stack.len() { - return self.err("Empty group or alternate not allowed.") - } - - let mut combined = try!(self.pop_ast()); - let mut i = self.stack.len(); - while i > from { - i = i - 1; - match self.stack.pop().unwrap() { - Expr(x) => combined = mk(x, combined), - _ => {}, - } - } - Ok(combined) - } - - fn parse_usize(&self, s: &str) -> Result { - match s.parse::() { - Ok(i) => Ok(i), - Err(_) => { - self.err(&format!("Expected an unsigned integer but got '{}'.", - s)) - } - } - } - - fn char_from_u32(&self, n: u32) -> Result { - match char::from_u32(n) { - Some(c) => Ok(c), - None => { - self.err(&format!("Could not decode '{}' to unicode \ - character.", n)) - } - } - } - - fn pos(&self, c: char) -> Option { - self.chars.iter() - .skip(self.chari).position(|&c2| c2 == c).map(|i| self.chari + i) - } - - fn err(&self, msg: &str) -> Result { - Err(Error { - pos: self.chari, - msg: msg.to_string(), - }) - } - - fn peek(&self, offset: usize) -> Option { - if self.chari + offset >= self.chars.len() { - return None - } - Some(self.chars[self.chari + offset]) - } - - fn peek_is(&self, offset: usize, is: char) -> bool { - self.peek(offset) == Some(is) - } - - fn cur(&self) -> char { - self.chars[self.chari] - } - - fn slice(&self, start: usize, end: usize) -> String { - self.chars[start..end].iter().cloned().collect() - } -} - -// Given an unordered collection of character ranges, combine_ranges returns -// an ordered sequence of character ranges where no two ranges overlap. They -// are ordered from least to greatest (using start position). -fn combine_ranges(mut unordered: Vec<(char, char)>) -> Vec<(char, char)> { - // Returns true iff the two character classes overlap or share a boundary. - // e.g., ('a', 'g') and ('h', 'm') would return true. 
- fn should_merge((a, b): (char, char), (x, y): (char, char)) -> bool { - cmp::max(a, x) <= inc_char(cmp::min(b, y)) - } - - // This is currently O(n^2), but I think with sufficient cleverness, - // it can be reduced to O(n) **if necessary**. - unordered.sort(); - let mut ordered: Vec<(char, char)> = Vec::with_capacity(unordered.len()); - for (us, ue) in unordered.into_iter() { - let (mut us, mut ue) = (us, ue); - assert!(us <= ue); - let mut which: Option = None; - for (i, &(os, oe)) in ordered.iter().enumerate() { - if should_merge((us, ue), (os, oe)) { - us = cmp::min(us, os); - ue = cmp::max(ue, oe); - which = Some(i); - break - } - } - match which { - None => ordered.push((us, ue)), - Some(i) => ordered[i] = (us, ue), - } - } - ordered.sort(); - ordered -} - -// FIXME: Is there a clever way to do this by considering ranges rather than individual chars? -// E.g. binary search for overlap with entries in unicode::case_folding::C_plus_S_table -fn case_fold_and_combine_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)> { - if ranges.is_empty() { - return ranges - } - let mut chars: Vec = ranges - .into_iter() - .flat_map(|(start, end)| start as u32 .. 
end as u32 + 1) - .filter_map(char::from_u32) - .map(simple_case_fold) - .collect(); - chars.sort(); - chars.dedup(); - let mut chars = chars.into_iter(); - let mut start = chars.next().unwrap(); - let mut end = start; - let mut ranges = Vec::new(); - for c in chars { - if c != inc_char(end) { - ranges.push((start, end)); - start = c; - } - end = c; - } - ranges.push((start, end)); - ranges -} - -fn invert_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)> { - if ranges.is_empty() { return ranges; } - - let mut inv = Vec::with_capacity(ranges.len()); - if ranges[0].0 > '\x00' { - inv.push(('\x00', dec_char(ranges[0].0))); - } - for win in ranges.windows(2) { - let ((_, e1), (s2, _)) = (win[0], win[1]); - inv.push((inc_char(e1), dec_char(s2))); - } - if ranges[ranges.len() - 1].1 < char::MAX { - inv.push((inc_char(ranges[ranges.len() - 1].1), char::MAX)); - } - inv -} - -fn inc_char(c: char) -> char { - assert!(c < char::MAX); - match c { - '\u{D7FF}' => '\u{E000}', - c => char::from_u32(c as u32 + 1).unwrap(), - } -} - -fn dec_char(c: char) -> char { - assert!(c > '\x00'); - match c { - '\u{E000}' => '\u{D7FF}', - c => char::from_u32(c as u32 - 1).unwrap(), - } -} - -// Constructs a Unicode friendly Perl character class from \d, \s or \w -// (or any of their negated forms). Note that this does not handle negation. -fn perl_unicode_class(which: char) -> Vec<(char, char)> { - match which { - 'd' | 'D' => PERLD.to_vec(), - 's' | 'S' => PERLS.to_vec(), - 'w' | 'W' => PERLW.to_vec(), - _ => unreachable!(), - } -} - -// Returns a concatenation of two expressions. This also guarantees that a -// `Cat` expression will never be a direct child of another `Cat` expression. 
-fn concat_flatten(x: Ast, y: Ast) -> Ast { - match (x, y) { - (Cat(mut xs), Cat(ys)) => { xs.extend(ys.into_iter()); Cat(xs) } - (Cat(mut xs), ast) => { xs.push(ast); Cat(xs) } - (ast, Cat(mut xs)) => { xs.insert(0, ast); Cat(xs) } - (ast1, ast2) => Cat(vec!(ast1, ast2)), - } -} - -fn from_str_radix_pos_integer(s: &str, radix: u32) -> Result { - let mut num = 0; - for c in s.chars() { - match c.to_digit(radix) { - None => return Err( - format!("Could not parse '{}' as a hex number.", s)), - Some(n) => { - num *= radix; - num += n; - } - } - } - Ok(num) -} - -pub fn is_punct(c: char) -> bool { - match c { - '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | - '[' | ']' | '{' | '}' | '^' | '$' => true, - _ => false, - } -} - -fn is_valid_cap(c: char) -> bool { - c == '_' || (c >= '0' && c <= '9') - || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') -} - -fn find_class(classes: NamedClasses, name: &str) -> Option> { - match classes.binary_search_by(|&(s, _)| s.cmp(name)) { - Ok(i) => Some(classes[i].1.to_vec()), - Err(_) => None, - } -} - -type Class = &'static [(char, char)]; -type NamedClasses = &'static [(&'static str, Class)]; - -static ASCII_CLASSES: NamedClasses = &[ - // Classes must be in alphabetical order so that bsearch works. 
- // [:alnum:] alphanumeric (== [0-9A-Za-z]) - // [:alpha:] alphabetic (== [A-Za-z]) - // [:ascii:] ASCII (== [\x00-\x7F]) - // [:blank:] blank (== [\t ]) - // [:cntrl:] control (== [\x00-\x1F\x7F]) - // [:digit:] digits (== [0-9]) - // [:graph:] graphical (== [!-~]) - // [:lower:] lower case (== [a-z]) - // [:print:] printable (== [ -~] == [ [:graph:]]) - // [:punct:] punctuation (== [!-/:-@[-`{-~]) - // [:space:] whitespace (== [\t\n\v\f\r ]) - // [:upper:] upper case (== [A-Z]) - // [:word:] word characters (== [0-9A-Za-z_]) - // [:xdigit:] hex digit (== [0-9A-Fa-f]) - // Taken from: http://golang.org/pkg/regex/syntax/ - ("alnum", &ALNUM), - ("alpha", &ALPHA), - ("ascii", &ASCII), - ("blank", &BLANK), - ("cntrl", &CNTRL), - ("digit", &DIGIT), - ("graph", &GRAPH), - ("lower", &LOWER), - ("print", &PRINT), - ("punct", &PUNCT), - ("space", &SPACE), - ("upper", &UPPER), - ("word", &WORD), - ("xdigit", &XDIGIT), -]; - -const ALNUM: Class = &[('0', '9'), ('A', 'Z'), ('a', 'z')]; -const ALPHA: Class = &[('A', 'Z'), ('a', 'z')]; -const ASCII: Class = &[('\x00', '\x7F')]; -const BLANK: Class = &[(' ', ' '), ('\t', '\t')]; -const CNTRL: Class = &[('\x00', '\x1F'), ('\x7F', '\x7F')]; -const DIGIT: Class = &[('0', '9')]; -const GRAPH: Class = &[('!', '~')]; -const LOWER: Class = &[('a', 'z')]; -const PRINT: Class = &[(' ', '~')]; -const PUNCT: Class = &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')]; -const SPACE: Class = &[('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'), - ('\x0C', '\x0C'), ('\r', '\r'), (' ', ' ')]; -const UPPER: Class = &[('A', 'Z')]; -const WORD: Class = &[('0', '9'), ('A', 'Z'), ('a', 'z'), ('_', '_')]; -const XDIGIT: Class = &[('0', '9'), ('A', 'F'), ('a', 'f')]; diff --git a/src/re.rs b/src/re.rs index bfdcf5317a..5bf2c6b645 100644 --- a/src/re.rs +++ b/src/re.rs @@ -17,7 +17,7 @@ use std::str::pattern::{Pattern, Searcher, SearchStep}; use std::str::FromStr; use compile::Program; -use parse; +use syntax; use vm; use vm::CaptureLocs; use 
vm::MatchKind::{self, Exists, Location, Submatches}; @@ -32,7 +32,7 @@ use self::Regex::*; pub fn quote(text: &str) -> String { let mut quoted = String::with_capacity(text.len()); for c in text.chars() { - if parse::is_punct(c) { + if syntax::is_punct(c) { quoted.push('\\') } quoted.push(c); @@ -47,10 +47,54 @@ pub fn quote(text: &str) -> String { /// /// To find submatches, split or replace text, you'll need to compile an /// expression first. -pub fn is_match(regex: &str, text: &str) -> Result { +pub fn is_match(regex: &str, text: &str) -> Result { Regex::new(regex).map(|r| r.is_match(text)) } +/// An error that occurred during parsing or compiling a regular expression. +#[derive(Debug)] +pub enum Error { + /// A syntax error. + Syntax(syntax::Error), + /// The compiled program exceeded the set size limit. + /// The argument is the size limit imposed. + CompiledTooBig(usize), +} + +impl ::std::error::Error for Error { + fn description(&self) -> &str { + match *self { + Error::Syntax(ref err) => err.description(), + Error::CompiledTooBig(_) => "compiled program too big", + } + } + + fn cause(&self) -> Option<&::std::error::Error> { + match *self { + Error::Syntax(ref err) => Some(err), + _ => None, + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Error::Syntax(ref err) => err.fmt(f), + Error::CompiledTooBig(limit) => { + write!(f, "Compiled regex exceeds size limit of {} bytes.", + limit) + } + } + } +} + +impl From for Error { + fn from(err: syntax::Error) -> Error { + Error::Syntax(err) + } +} + /// A compiled regular expression /// /// It is represented as either a sequence of bytecode instructions (dynamic) @@ -159,9 +203,10 @@ impl fmt::Debug for Regex { } } -/// Equality comparison is based on the original string. It is possible that different regular -/// expressions have the same matching behavior, but are still compared unequal. 
For example, -/// `\d+` and `\d\d*` match the same set of strings, but are not considered equal. +/// Equality comparison is based on the original string. It is possible that +/// different regular expressions have the same matching behavior, but are +/// still compared unequal. For example, `\d+` and `\d\d*` match the same set +/// of strings, but are not considered equal. impl PartialEq for Regex { fn eq(&self, other: &Regex) -> bool { self.as_str() == other.as_str() @@ -171,10 +216,10 @@ impl PartialEq for Regex { impl Eq for Regex {} impl FromStr for Regex { - type Err = parse::Error; + type Err = Error; /// Attempts to parse a string into a regular expression - fn from_str(s: &str) -> Result { + fn from_str(s: &str) -> Result { Regex::new(s) } } @@ -184,9 +229,20 @@ impl Regex { /// used repeatedly to search, split or replace text in a string. /// /// If an invalid expression is given, then an error is returned. - pub fn new(re: &str) -> Result { - let ast = try!(parse::parse(re)); - let (prog, names) = Program::new(ast); + pub fn new(re: &str) -> Result { + Regex::with_size_limit(10 * (1 << 20), re) + } + + /// Compiles a dynamic regular expression with the given size limit. + /// + /// The size limit is applied to the size of the *compiled* data structure. + /// If the data structure exceeds the size given, then an error is + /// returned. + /// + /// The default size limit used in `new` is 10MB. + pub fn with_size_limit(size: usize, re: &str) -> Result { + let ast = try!(syntax::Expr::parse(re)); + let (prog, names) = try!(Program::new(ast, size)); Ok(Dynamic(ExDynamic { original: re.to_string(), names: names, @@ -194,6 +250,7 @@ impl Regex { })) } + /// Returns true if and only if the regex matches the string given. /// /// # Example @@ -790,13 +847,19 @@ impl<'t> Captures<'t> { /// To write a literal `$` use `$$`. pub fn expand(&self, text: &str) -> String { // How evil can you get? - // FIXME: Don't use regexes for this. It's completely unnecessary. 
- let re = Regex::new(r"(^|[^$]|\b)\$(\d+|\w+)").unwrap(); + let re = Regex::new(r"(?x) + (?P^|\b|[^$]) # Ignore `$$name`. + \$ + (?P # Match the actual capture name. Can be... + [0-9]+ # A sequence of digits (for indexed captures), or... + | + [_a-zA-Z][_0-9a-zA-Z]* # A name for named captures. + ) + ").unwrap(); let text = re.replace_all(text, |refs: &Captures| -> String { - let pre = refs.at(1).unwrap_or(""); - let name = refs.at(2).unwrap_or(""); - format!("{}{}", pre, - match name.parse::() { + let before = refs.name("before").unwrap_or(""); + let name = refs.name("name").unwrap_or(""); + format!("{}{}", before, match name.parse::() { Err(_) => self.name(name).unwrap_or("").to_string(), Ok(i) => self.at(i).unwrap_or("").to_string(), }) @@ -809,7 +872,7 @@ impl<'t> Captures<'t> { #[inline] pub fn len(&self) -> usize { self.locs.len() / 2 } - /// Returns if there are no captured groups. + /// Returns true if and only if there are no captured groups. #[inline] pub fn is_empty(&self) -> bool { self.len() == 0 } } diff --git a/src/vm.rs b/src/vm.rs index 84da8e5089..7fcd7fded8 100644 --- a/src/vm.rs +++ b/src/vm.rs @@ -36,18 +36,12 @@ use self::MatchKind::*; use self::StepState::*; -use std::cmp::{self, Ordering}; -use std::iter::repeat; +use std::cmp; use std::mem; use compile::Program; -use compile::Inst::{ - Match, OneChar, CharClass, Any, EmptyBegin, EmptyEnd, EmptyWordBoundary, - Save, Jump, Split, -}; -use parse::{FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED}; -use unicode::regex::PERLW; -use unicode::case_folding; +use compile::Inst::*; +use syntax; pub type CaptureLocs = Vec>; @@ -122,18 +116,16 @@ impl<'r, 't> Nfa<'r, 't> { let ninsts = self.prog.insts.len(); let mut clist = Threads::new(self.which, ninsts, ncaps); let mut nlist = Threads::new(self.which, ninsts, ncaps); - - let mut groups = repeat(None).take(ncaps * 2).collect::>(); + let mut groups = vec![None; ncaps * 2]; // Determine if the expression starts with a '^' so we can avoid // 
simulating .*? // Make sure multi-line mode isn't enabled for it, otherwise we can't // drop the initial .*? - let prefix_anchor = - match self.prog.insts[1] { - EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true, - _ => false, - }; + let prefix_anchor = match self.prog.insts[1] { + StartText => true, + _ => false, + }; self.ic = self.start; let mut next_ic = self.chars.set(self.start); @@ -224,30 +216,24 @@ impl<'r, 't> Nfa<'r, 't> { } } } - OneChar(c, flags) => { - if self.char_eq(flags & FLAG_NOCASE > 0, self.chars.prev, c) { + OneChar { c, casei } => { + if self.char_eq(casei, self.chars.prev, c) { self.add(nlist, pc+1, caps); } } - CharClass(ref ranges, flags) => { - if let Some(mut c) = self.chars.prev { - let negate = flags & FLAG_NEGATED > 0; - if flags & FLAG_NOCASE > 0 { - c = simple_case_fold(c); - } - let found = ranges.binary_search_by(|&rc| class_cmp(c, rc)).is_ok(); - if found ^ negate { - self.add(nlist, pc+1, caps); - } + CharClass(ref cls) => { + if self.chars.prev.map(|c| cls.matches(c)).unwrap_or(false) { + self.add(nlist, pc+1, caps); } } - Any(flags) => { - if flags & FLAG_DOTNL > 0 - || !self.char_eq(false, self.chars.prev, '\n') { + Any => self.add(nlist, pc+1, caps), + AnyNoNL => { + if !self.char_eq(false, self.chars.prev, '\n') { self.add(nlist, pc+1, caps) } } - EmptyBegin(_) | EmptyEnd(_) | EmptyWordBoundary(_) + StartLine | EndLine | StartText | EndText + | WordBoundary | NotWordBoundary | Save(_) | Jump(_) | Split(_, _) => {}, } StepContinue @@ -272,28 +258,42 @@ impl<'r, 't> Nfa<'r, 't> { // We make a minor optimization by indicating that the state is "empty" // so that its capture groups are not filled in. 
match self.prog.insts[pc] { - EmptyBegin(flags) => { - let multi = flags & FLAG_MULTI > 0; + StartLine => { nlist.add(pc, groups, true); - if self.chars.is_begin() - || (multi && self.char_is(self.chars.prev, '\n')) { - self.add(nlist, pc + 1, groups) + if self.chars.is_begin() || self.char_is(self.chars.prev, '\n') { + self.add(nlist, pc + 1, groups); + } + } + StartText => { + nlist.add(pc, groups, true); + if self.chars.is_begin() { + self.add(nlist, pc + 1, groups); } } - EmptyEnd(flags) => { - let multi = flags & FLAG_MULTI > 0; + EndLine => { nlist.add(pc, groups, true); - if self.chars.is_end() - || (multi && self.char_is(self.chars.cur, '\n')) { + if self.chars.is_end() || self.char_is(self.chars.cur, '\n') { self.add(nlist, pc + 1, groups) } } - EmptyWordBoundary(flags) => { + EndText => { nlist.add(pc, groups, true); - if self.chars.is_word_boundary() == !(flags & FLAG_NEGATED > 0) { + if self.chars.is_end() { self.add(nlist, pc + 1, groups) } } + WordBoundary => { + nlist.add(pc, groups, true); + if self.chars.is_word_boundary() { + self.add(nlist, pc + 1, groups); + } + } + NotWordBoundary => { + nlist.add(pc, groups, true); + if !self.chars.is_word_boundary() { + self.add(nlist, pc + 1, groups); + } + } Save(slot) => { nlist.add(pc, groups, true); match self.which { @@ -321,7 +321,7 @@ impl<'r, 't> Nfa<'r, 't> { self.add(nlist, x, groups); self.add(nlist, y, groups); } - Match | OneChar(_, _) | CharClass(_, _) | Any(_) => { + Match | OneChar{..} | CharClass(_) | Any | AnyNoNL => { nlist.add(pc, groups, false); } } @@ -334,7 +334,7 @@ impl<'r, 't> Nfa<'r, 't> { match textc { None => false, Some(textc) => { - regc == textc || (casei && simple_case_fold(regc) == simple_case_fold(textc)) + regc == textc || (casei && syntax::simple_case_fold(regc) == syntax::simple_case_fold(textc)) } } } @@ -425,17 +425,22 @@ impl<'t> CharReader<'t> { /// Returns true if and only if the current position is a word boundary. /// (Ignoring the range of the input to search.) 
pub fn is_word_boundary(&self) -> bool { + fn is_word(c: Option) -> bool { + c.map(syntax::is_word_char).unwrap_or(false) + } + if self.is_begin() { - return is_word(self.cur) + return is_word(self.cur); } if self.is_end() { - return is_word(self.prev) + return is_word(self.prev); } (is_word(self.cur) && !is_word(self.prev)) || (is_word(self.prev) && !is_word(self.cur)) } } +#[derive(Clone)] struct Thread { pc: usize, groups: Vec>, @@ -457,12 +462,11 @@ impl Threads { // // See http://research.swtch.com/sparse for the deets. fn new(which: MatchKind, num_insts: usize, ncaps: usize) -> Threads { + let t = Thread { pc: 0, groups: vec![None; ncaps * 2] }; Threads { which: which, - queue: (0..num_insts).map(|_| { - Thread {pc: 0, groups: repeat(None).take(ncaps * 2).collect() } - }).collect(), - sparse: repeat(0).take(num_insts).collect(), + queue: vec![t; num_insts], + sparse: vec![0; num_insts], size: 0, } } @@ -508,58 +512,6 @@ impl Threads { } } -/// Returns true if the character is a word character, according to the -/// (Unicode friendly) Perl character class '\w'. -/// Note that this is only use for testing word boundaries. The actual '\w' -/// is encoded as a CharClass instruction. -pub fn is_word(c: Option) -> bool { - let c = match c { - None => return false, - Some(c) => c, - }; - // Try the common ASCII case before invoking binary search. - match c { - '_' | '0' ... '9' | 'a' ... 'z' | 'A' ... 'Z' => true, - _ => PERLW.binary_search_by(|&(start, end)| { - if c >= start && c <= end { - Ordering::Equal - } else if start > c { - Ordering::Greater - } else { - Ordering::Less - } - }).ok().is_some() - } -} - - -/// Returns the Unicode *simple* case folding of `c`. -/// Uses the mappings with status C + S form Unicode’s `CaseFolding.txt`. -/// This is not as “correct” as full case folding, but preserves the number of code points. 
-pub fn simple_case_fold(c: char) -> char { - match case_folding::C_plus_S_table.binary_search_by(|&(x, _)| x.cmp(&c)) { - Ok(i) => case_folding::C_plus_S_table[i].1, - Err(_) => c - } -} - - -/// Given a character and a single character class range, return an ordering -/// indicating whether the character is less than the start of the range, -/// in the range (inclusive) or greater than the end of the range. -/// -/// This function is meant to be used with a binary search. -#[inline] -fn class_cmp(textc: char, (start, end): (char, char)) -> Ordering { - if textc >= start && textc <= end { - Ordering::Equal - } else if start > textc { - Ordering::Greater - } else { - Ordering::Less - } -} - /// Returns the starting location of `needle` in `haystack`. /// If `needle` is not in `haystack`, then `None` is returned. ///