use bit_set::BitSet;
use regex::escape;
use std::collections::HashMap;
use std::str::FromStr;
use std::usize;
use crate::codepoint_len;
use crate::CompileError;
use crate::Error;
use crate::Expr;
use crate::LookAround::*;
use crate::ParseError;
use crate::Result;
use crate::MAX_RECURSION;
const FLAG_CASEI: u32 = 1;
const FLAG_MULTI: u32 = 1 << 1;
const FLAG_DOTNL: u32 = 1 << 2;
const FLAG_SWAP_GREED: u32 = 1 << 3;
const FLAG_IGNORE_SPACE: u32 = 1 << 4;
const FLAG_UNICODE: u32 = 1 << 5;
pub(crate) type NamedGroups = HashMap<String, usize>;
#[derive(Debug)]
pub struct ExprTree {
pub expr: Expr,
pub backrefs: BitSet,
pub named_groups: NamedGroups,
}
#[derive(Debug)]
pub(crate) struct Parser<'a> {
re: &'a str, backrefs: BitSet,
flags: u32,
named_groups: NamedGroups,
numeric_backrefs: bool,
curr_group: usize, }
impl<'a> Parser<'a> {
pub(crate) fn parse(re: &str) -> Result<ExprTree> {
let mut p = Parser::new(re);
let (ix, expr) = p.parse_re(0, 0)?;
if ix < re.len() {
return Err(Error::ParseError(
ix,
ParseError::GeneralParseError("end of string not reached".to_string()),
));
}
Ok(ExprTree {
expr,
backrefs: Default::default(),
named_groups: p.named_groups,
})
}
fn new(re: &str) -> Parser<'_> {
Parser {
re,
backrefs: Default::default(),
named_groups: Default::default(),
numeric_backrefs: false,
flags: FLAG_UNICODE,
curr_group: 0,
}
}
fn parse_re(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
let (ix, child) = self.parse_branch(ix, depth)?;
let mut ix = self.optional_whitespace(ix)?;
if self.re[ix..].starts_with('|') {
let mut children = vec![child];
while self.re[ix..].starts_with('|') {
ix += 1;
let (next, child) = self.parse_branch(ix, depth)?;
children.push(child);
ix = self.optional_whitespace(next)?;
}
return Ok((ix, Expr::Alt(children)));
}
if self.numeric_backrefs && !self.named_groups.is_empty() {
return Err(Error::CompileError(CompileError::NamedBackrefOnly));
}
Ok((ix, child))
}
fn parse_branch(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
let mut children = Vec::new();
let mut ix = ix;
while ix < self.re.len() {
let (next, child) = self.parse_piece(ix, depth)?;
if next == ix {
break;
}
if child != Expr::Empty {
children.push(child);
}
ix = next;
}
match children.len() {
0 => Ok((ix, Expr::Empty)),
1 => Ok((ix, children.pop().unwrap())),
_ => Ok((ix, Expr::Concat(children))),
}
}
fn parse_piece(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
let (ix, child) = self.parse_atom(ix, depth)?;
let mut ix = self.optional_whitespace(ix)?;
if ix < self.re.len() {
let (lo, hi) = match self.re.as_bytes()[ix] {
b'?' => (0, 1),
b'*' => (0, usize::MAX),
b'+' => (1, usize::MAX),
b'{' => {
match self.parse_repeat(ix) {
Ok((next, lo, hi)) => {
ix = next - 1;
(lo, hi)
}
Err(_) => {
return Ok((ix, child));
}
}
}
_ => return Ok((ix, child)),
};
if !self.is_repeatable(&child) {
return Err(Error::ParseError(ix, ParseError::TargetNotRepeatable));
}
ix += 1;
ix = self.optional_whitespace(ix)?;
let mut greedy = true;
if ix < self.re.len() && self.re.as_bytes()[ix] == b'?' {
greedy = false;
ix += 1;
}
greedy ^= self.flag(FLAG_SWAP_GREED);
let mut node = Expr::Repeat {
child: Box::new(child),
lo,
hi,
greedy,
};
if ix < self.re.len() && self.re.as_bytes()[ix] == b'+' {
ix += 1;
node = Expr::AtomicGroup(Box::new(node));
}
return Ok((ix, node));
}
Ok((ix, child))
}
fn is_repeatable(&self, child: &Expr) -> bool {
match child {
Expr::LookAround(_, _) => false,
Expr::Empty => false,
Expr::StartText => false,
Expr::EndText => false,
Expr::StartLine => false,
Expr::EndLine => false,
_ => true,
}
}
fn parse_repeat(&self, ix: usize) -> Result<(usize, usize, usize)> {
let ix = self.optional_whitespace(ix + 1)?; let bytes = self.re.as_bytes();
if ix == self.re.len() {
return Err(Error::ParseError(ix, ParseError::InvalidRepeat));
}
let mut end = ix;
let lo = if bytes[ix] == b',' {
0
} else if let Some((next, lo)) = parse_decimal(self.re, ix) {
end = next;
lo
} else {
return Err(Error::ParseError(ix, ParseError::InvalidRepeat));
};
let ix = self.optional_whitespace(end)?; if ix == self.re.len() {
return Err(Error::ParseError(ix, ParseError::InvalidRepeat));
}
end = ix;
let hi = match bytes[ix] {
b'}' => lo,
b',' => {
end = self.optional_whitespace(ix + 1)?; if let Some((next, hi)) = parse_decimal(self.re, end) {
end = next;
hi
} else {
usize::MAX
}
}
_ => return Err(Error::ParseError(ix, ParseError::InvalidRepeat)),
};
let ix = self.optional_whitespace(end)?; if ix == self.re.len() || bytes[ix] != b'}' {
return Err(Error::ParseError(ix, ParseError::InvalidRepeat));
}
Ok((ix + 1, lo, hi))
}
fn parse_atom(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
let ix = self.optional_whitespace(ix)?;
if ix == self.re.len() {
return Ok((ix, Expr::Empty));
}
match self.re.as_bytes()[ix] {
b'.' => Ok((
ix + 1,
Expr::Any {
newline: self.flag(FLAG_DOTNL),
},
)),
b'^' => Ok((
ix + 1,
if self.flag(FLAG_MULTI) {
Expr::StartLine
} else {
Expr::StartText
},
)),
b'$' => Ok((
ix + 1,
if self.flag(FLAG_MULTI) {
Expr::EndLine
} else {
Expr::EndText
},
)),
b'(' => self.parse_group(ix, depth),
b'\\' => {
let (next, expr) = self.parse_escape(ix)?;
if let Expr::Backref(group) = expr {
self.backrefs.insert(group);
}
Ok((next, expr))
}
b'+' | b'*' | b'?' | b'|' | b')' => Ok((ix, Expr::Empty)),
b'[' => self.parse_class(ix),
b => {
let next = ix + codepoint_len(b);
Ok((
next,
Expr::Literal {
val: String::from(&self.re[ix..next]),
casei: self.flag(FLAG_CASEI),
},
))
}
}
}
fn parse_named_backref(&self, ix: usize, open: &str, close: &str) -> Result<(usize, Expr)> {
if let Some((id, skip)) = parse_id(&self.re[ix..], open, close) {
let group = if let Some(group) = self.named_groups.get(id) {
Some(*group)
} else if let Ok(group) = id.parse() {
Some(group)
} else {
None
};
if let Some(group) = group {
return Ok((ix + skip, Expr::Backref(group)));
}
Err(Error::ParseError(
ix,
ParseError::InvalidGroupNameBackref(id.to_string()),
))
} else {
Err(Error::ParseError(ix, ParseError::InvalidGroupName))
}
}
fn parse_numbered_backref(&mut self, ix: usize) -> Result<(usize, Expr)> {
if let Some((end, group)) = parse_decimal(self.re, ix) {
if group < self.re.len() / 2 {
self.numeric_backrefs = true;
return Ok((end, Expr::Backref(group)));
}
}
return Err(Error::ParseError(ix, ParseError::InvalidBackref));
}
fn parse_escape(&mut self, ix: usize) -> Result<(usize, Expr)> {
if ix + 1 == self.re.len() {
return Err(Error::ParseError(ix, ParseError::TrailingBackslash));
}
let bytes = self.re.as_bytes();
let b = bytes[ix + 1];
let mut end = ix + 1 + codepoint_len(b);
let mut size = 1;
if is_digit(b) {
return self.parse_numbered_backref(ix + 1);
} else if b == b'k' {
return self.parse_named_backref(ix + 2, "<", ">");
} else if b == b'A' || b == b'z' || b == b'b' || b == b'B' {
size = 0;
} else if (b | 32) == b'd'
|| (b | 32) == b's'
|| (b | 32) == b'w'
|| b == b'a'
|| b == b'f'
|| b == b'n'
|| b == b'r'
|| b == b't'
|| b == b'v'
{
} else if b == b'e' {
let inner = String::from(r"\x1B");
return Ok((
end,
Expr::Delegate {
inner,
size,
casei: false,
},
));
} else if (b | 32) == b'h' {
let s = if b == b'h' {
"[0-9A-Fa-f]"
} else {
"[^0-9A-Fa-f]"
};
let inner = String::from(s);
return Ok((
end,
Expr::Delegate {
inner,
size,
casei: false,
},
));
} else if b == b'x' {
return self.parse_hex(end, 2);
} else if b == b'u' {
return self.parse_hex(end, 4);
} else if b == b'U' {
return self.parse_hex(end, 8);
} else if (b | 32) == b'p' {
if end == self.re.len() {
return Err(Error::ParseError(
ix,
ParseError::InvalidEscape("\\p must be followed by a unicode name".to_string()),
));
}
let b = bytes[end];
end += codepoint_len(b);
if b == b'{' {
loop {
if end == self.re.len() {
return Err(Error::ParseError(ix, ParseError::UnclosedUnicodeName));
}
let b = bytes[end];
if b == b'}' {
end += 1;
break;
}
end += codepoint_len(b);
}
}
} else if b == b'K' {
return Ok((end, Expr::KeepOut));
} else if b == b'G' {
return Ok((end, Expr::ContinueFromPreviousMatchEnd));
} else if b'a' <= (b | 32) && (b | 32) <= b'z' {
return Err(Error::ParseError(
ix,
ParseError::InvalidEscape(format!("\\{}", &self.re[ix + 1..end])),
));
} else if 0x20 <= b && b <= 0x7f {
return Ok((end, make_literal(&self.re[ix + 1..end])));
}
let inner = String::from(&self.re[ix..end]);
Ok((
end,
Expr::Delegate {
inner,
size,
casei: self.flag(FLAG_CASEI),
},
))
}
fn parse_hex(&self, ix: usize, digits: usize) -> Result<(usize, Expr)> {
if ix >= self.re.len() {
return Err(Error::ParseError(ix, ParseError::InvalidHex));
}
let bytes = self.re.as_bytes();
let b = bytes[ix];
let (end, s) = if ix + digits <= self.re.len()
&& bytes[ix..ix + digits].iter().all(|&b| is_hex_digit(b))
{
let end = ix + digits;
(end, &self.re[ix..end])
} else if b == b'{' {
let starthex = ix + 1;
let mut endhex = starthex;
loop {
if endhex == self.re.len() {
return Err(Error::ParseError(ix, ParseError::InvalidHex));
}
let b = bytes[endhex];
if endhex > starthex && b == b'}' {
break;
}
if is_hex_digit(b) && endhex < starthex + 8 {
endhex += 1;
} else {
return Err(Error::ParseError(ix, ParseError::InvalidHex));
}
}
(endhex + 1, &self.re[starthex..endhex])
} else {
return Err(Error::ParseError(ix, ParseError::InvalidHex));
};
let codepoint = u32::from_str_radix(s, 16).unwrap();
if let Some(c) = ::std::char::from_u32(codepoint) {
let mut inner = String::with_capacity(4);
inner.push(c);
Ok((
end,
Expr::Literal {
val: inner,
casei: self.flag(FLAG_CASEI),
},
))
} else {
Err(Error::ParseError(ix, ParseError::InvalidCodepointValue))
}
}
fn parse_class(&mut self, ix: usize) -> Result<(usize, Expr)> {
let bytes = self.re.as_bytes();
let mut ix = ix + 1; let mut class = String::new();
let mut nest = 1;
class.push('[');
if ix < self.re.len() && bytes[ix] == b'^' {
class.push('^');
ix += 1;
}
if ix < self.re.len() && bytes[ix] == b']' {
class.push(']');
ix += 1;
}
loop {
if ix == self.re.len() {
return Err(Error::ParseError(ix, ParseError::InvalidClass));
}
let end = match bytes[ix] {
b'\\' => {
if ix + 1 == self.re.len() {
return Err(Error::ParseError(ix, ParseError::InvalidClass));
}
let (end, expr) = self.parse_escape(ix)?;
match expr {
Expr::Literal { val, .. } => {
class.push_str(&escape(&val));
}
Expr::Delegate { inner, .. } => {
class.push_str(&inner);
}
_ => {
return Err(Error::ParseError(ix, ParseError::InvalidClass));
}
}
end
}
b'[' => {
nest += 1;
class.push('[');
ix + 1
}
b']' => {
nest -= 1;
if nest == 0 {
break;
}
class.push(']');
ix + 1
}
b => {
let end = ix + codepoint_len(b);
class.push_str(&self.re[ix..end]);
end
}
};
ix = end;
}
class.push(']');
let ix = ix + 1; Ok((
ix,
Expr::Delegate {
inner: class,
size: 1,
casei: self.flag(FLAG_CASEI),
},
))
}
fn parse_group(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
let depth = depth + 1;
if depth >= MAX_RECURSION {
return Err(Error::ParseError(ix, ParseError::RecursionExceeded));
}
let ix = self.optional_whitespace(ix + 1)?;
let (la, skip) = if self.re[ix..].starts_with("?=") {
(Some(LookAhead), 2)
} else if self.re[ix..].starts_with("?!") {
(Some(LookAheadNeg), 2)
} else if self.re[ix..].starts_with("?<=") {
(Some(LookBehind), 3)
} else if self.re[ix..].starts_with("?<!") {
(Some(LookBehindNeg), 3)
} else if self.re[ix..].starts_with("?<") {
self.curr_group += 1;
if let Some((id, skip)) = parse_id(&self.re[ix + 1..], "<", ">") {
self.named_groups.insert(id.to_string(), self.curr_group);
(None, skip + 1)
} else {
return Err(Error::ParseError(ix, ParseError::InvalidGroupName));
}
} else if self.re[ix..].starts_with("?P<") {
self.curr_group += 1; if let Some((id, skip)) = parse_id(&self.re[ix + 2..], "<", ">") {
self.named_groups.insert(id.to_string(), self.curr_group);
(None, skip + 2)
} else {
return Err(Error::ParseError(ix, ParseError::InvalidGroupName));
}
} else if self.re[ix..].starts_with("?P=") {
return self.parse_named_backref(ix + 3, "", ")");
} else if self.re[ix..].starts_with("?>") {
(None, 2)
} else if self.re[ix..].starts_with("?(") {
return self.parse_conditional(ix + 2, depth);
} else if self.re[ix..].starts_with('?') {
return self.parse_flags(ix, depth);
} else {
self.curr_group += 1; (None, 0)
};
let ix = ix + skip;
let (ix, child) = self.parse_re(ix, depth)?;
let ix = self.check_for_close_paren(ix)?;
let result = match (la, skip) {
(Some(la), _) => Expr::LookAround(Box::new(child), la),
(None, 2) => Expr::AtomicGroup(Box::new(child)),
_ => Expr::Group(Box::new(child)),
};
Ok((ix, result))
}
fn check_for_close_paren(&self, ix: usize) -> Result<usize> {
let ix = self.optional_whitespace(ix)?;
if ix == self.re.len() {
return Err(Error::ParseError(ix, ParseError::UnclosedOpenParen));
} else if self.re.as_bytes()[ix] != b')' {
return Err(Error::ParseError(
ix,
ParseError::GeneralParseError("expected close paren".to_string()),
));
}
Ok(ix + 1)
}
fn parse_flags(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
let start = ix + 1;
fn unknown_flag(re: &str, start: usize, end: usize) -> Error {
let after_end = end + codepoint_len(re.as_bytes()[end]);
let s = format!("(?{}", &re[start..after_end]);
Error::ParseError(start, ParseError::UnknownFlag(s))
}
let mut ix = start;
let mut neg = false;
let oldflags = self.flags;
loop {
ix = self.optional_whitespace(ix)?;
if ix == self.re.len() {
return Err(Error::ParseError(ix, ParseError::UnclosedOpenParen));
}
let b = self.re.as_bytes()[ix];
match b {
b'i' => self.update_flag(FLAG_CASEI, neg),
b'm' => self.update_flag(FLAG_MULTI, neg),
b's' => self.update_flag(FLAG_DOTNL, neg),
b'U' => self.update_flag(FLAG_SWAP_GREED, neg),
b'x' => self.update_flag(FLAG_IGNORE_SPACE, neg),
b'u' => {
if neg {
return Err(Error::ParseError(ix, ParseError::NonUnicodeUnsupported));
}
}
b'-' => {
if neg {
return Err(unknown_flag(self.re, start, ix));
}
neg = true;
}
b')' => {
if ix == start || neg && ix == start + 1 {
return Err(unknown_flag(self.re, start, ix));
}
return Ok((ix + 1, Expr::Empty));
}
b':' => {
if neg && ix == start + 1 {
return Err(unknown_flag(self.re, start, ix));
}
ix += 1;
let (ix, child) = self.parse_re(ix, depth)?;
if ix == self.re.len() {
return Err(Error::ParseError(ix, ParseError::UnclosedOpenParen));
} else if self.re.as_bytes()[ix] != b')' {
return Err(Error::ParseError(
ix,
ParseError::GeneralParseError("expected close paren".to_string()),
));
};
self.flags = oldflags;
return Ok((ix + 1, child));
}
_ => return Err(unknown_flag(self.re, start, ix)),
}
ix += 1;
}
}
fn parse_conditional(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
if ix >= self.re.len() {
return Err(Error::ParseError(ix, ParseError::UnclosedOpenParen));
}
let bytes = self.re.as_bytes();
let b = bytes[ix];
let (mut next, condition) = if is_digit(b) {
self.parse_numbered_backref(ix)?
} else if b == b'\'' {
self.parse_named_backref(ix, "'", "'")?
} else if b == b'<' {
self.parse_named_backref(ix, "<", ">")?
} else {
self.parse_re(ix, depth)?
};
next = self.check_for_close_paren(next)?;
let (end, child) = self.parse_re(next, depth)?;
if end == next {
if let Expr::Backref(group) = condition {
return Ok((end + 1, Expr::BackrefExistsCondition(group)));
} else {
return Err(Error::ParseError(
end,
ParseError::GeneralParseError(
"expected conditional to be a backreference or at least an expression for when the condition is true".to_string()
)
));
}
}
let if_true: Expr;
let mut if_false: Expr = Expr::Empty;
if let Expr::Alt(mut alternatives) = child {
if_true = alternatives.remove(0);
if alternatives.len() == 1 {
if_false = alternatives.pop().expect("expected 2 alternatives");
} else {
if_false = Expr::Alt(alternatives);
}
} else {
if_true = child;
}
let inner_condition = if let Expr::Backref(group) = condition {
Expr::BackrefExistsCondition(group)
} else {
condition
};
Ok((
end + 1,
if if_true == Expr::Empty && if_false == Expr::Empty {
inner_condition
} else {
Expr::Conditional {
condition: Box::new(inner_condition),
true_branch: Box::new(if_true),
false_branch: Box::new(if_false),
}
},
))
}
fn flag(&self, flag: u32) -> bool {
(self.flags & flag) != 0
}
fn update_flag(&mut self, flag: u32, neg: bool) {
if neg {
self.flags &= !flag;
} else {
self.flags |= flag;
}
}
fn optional_whitespace(&self, mut ix: usize) -> Result<usize> {
let bytes = self.re.as_bytes();
loop {
if ix == self.re.len() {
return Ok(ix);
}
match bytes[ix] {
b'#' if self.flag(FLAG_IGNORE_SPACE) => {
match bytes[ix..].iter().position(|&c| c == b'\n') {
Some(x) => ix += x + 1,
None => return Ok(self.re.len()),
}
}
b' ' | b'\r' | b'\n' | b'\t' if self.flag(FLAG_IGNORE_SPACE) => ix += 1,
b'(' if bytes[ix..].starts_with(b"(?#") => {
ix += 3;
loop {
if ix >= self.re.len() {
return Err(Error::ParseError(ix, ParseError::UnclosedOpenParen));
}
match bytes[ix] {
b')' => {
ix += 1;
break;
}
b'\\' => ix += 2,
_ => ix += 1,
}
}
}
_ => return Ok(ix),
}
}
}
}
pub(crate) fn parse_decimal(s: &str, ix: usize) -> Option<(usize, usize)> {
let mut end = ix;
while end < s.len() && is_digit(s.as_bytes()[end]) {
end += 1;
}
usize::from_str(&s[ix..end]).ok().map(|val| (end, val))
}
pub(crate) fn parse_id<'a>(s: &'a str, open: &'_ str, close: &'_ str) -> Option<(&'a str, usize)> {
debug_assert!(!close.starts_with(is_id_char));
if !s.starts_with(open) {
return None;
}
let id_start = open.len();
let id_len = match s[id_start..].find(|c: char| !is_id_char(c)) {
Some(id_len) if s[id_start + id_len..].starts_with(close) => Some(id_len),
None if close.is_empty() => Some(s.len()),
_ => None,
};
match id_len {
Some(0) => None,
Some(id_len) => {
let id_end = id_start + id_len;
Some((&s[id_start..id_end], id_end + close.len()))
}
_ => None,
}
}
fn is_id_char(c: char) -> bool {
c.is_alphanumeric() || c == '_'
}
fn is_digit(b: u8) -> bool {
b'0' <= b && b <= b'9'
}
fn is_hex_digit(b: u8) -> bool {
is_digit(b) || (b'a' <= (b | 32) && (b | 32) <= b'f')
}
pub(crate) fn make_literal(s: &str) -> Expr {
Expr::Literal {
val: String::from(s),
casei: false,
}
}
#[cfg(test)]
mod tests {
use crate::parse::{make_literal, parse_id};
use crate::Expr;
use crate::LookAround::*;
use std::usize;
fn p(s: &str) -> Expr {
Expr::parse_tree(s).unwrap().expr
}
#[cfg_attr(feature = "track_caller", track_caller)]
fn fail(s: &str) {
assert!(Expr::parse_tree(s).is_err());
}
#[cfg_attr(feature = "track_caller", track_caller)]
fn assert_error(re: &str, expected_error: &str) {
let result = Expr::parse_tree(re);
assert!(result.is_err());
assert_eq!(&format!("{}", result.err().unwrap()), expected_error);
}
#[test]
fn empty() {
assert_eq!(p(""), Expr::Empty);
}
#[test]
fn any() {
assert_eq!(p("."), Expr::Any { newline: false });
assert_eq!(p("(?s:.)"), Expr::Any { newline: true });
}
#[test]
fn start_text() {
assert_eq!(p("^"), Expr::StartText);
}
#[test]
fn end_text() {
assert_eq!(p("$"), Expr::EndText);
}
#[test]
fn literal() {
assert_eq!(p("a"), make_literal("a"));
}
#[test]
fn literal_special() {
assert_eq!(p("}"), make_literal("}"));
assert_eq!(p("]"), make_literal("]"));
}
#[test]
fn parse_id_test() {
assert_eq!(parse_id("foo.", "", ""), Some(("foo", 3)));
assert_eq!(parse_id("{foo}", "{", "}"), Some(("foo", 5)));
assert_eq!(parse_id("{foo.", "{", "}"), None);
assert_eq!(parse_id("{foo", "{", "}"), None);
assert_eq!(parse_id("{}", "{", "}"), None);
assert_eq!(parse_id("", "", ""), None);
}
#[test]
fn literal_unescaped_opening_curly() {
assert_eq!(p("{"), make_literal("{"));
assert_eq!(p("({)"), Expr::Group(Box::new(make_literal("{"),)));
assert_eq!(
p("a|{"),
Expr::Alt(vec![make_literal("a"), make_literal("{"),])
);
assert_eq!(
p("{{2}"),
Expr::Repeat {
child: Box::new(make_literal("{")),
lo: 2,
hi: 2,
greedy: true
}
);
}
#[test]
fn literal_escape() {
assert_eq!(p("\\'"), make_literal("'"));
assert_eq!(p("\\\""), make_literal("\""));
assert_eq!(p("\\ "), make_literal(" "));
assert_eq!(p("\\xA0"), make_literal("\u{A0}"));
assert_eq!(p("\\x{1F4A9}"), make_literal("\u{1F4A9}"));
assert_eq!(p("\\x{000000B7}"), make_literal("\u{B7}"));
assert_eq!(p("\\u21D2"), make_literal("\u{21D2}"));
assert_eq!(p("\\u{21D2}"), make_literal("\u{21D2}"));
assert_eq!(p("\\u21D2x"), p("\u{21D2}x"));
assert_eq!(p("\\U0001F60A"), make_literal("\u{1F60A}"));
assert_eq!(p("\\U{0001F60A}"), make_literal("\u{1F60A}"));
}
#[test]
fn hex_escape() {
assert_eq!(
p("\\h"),
Expr::Delegate {
inner: String::from("[0-9A-Fa-f]"),
size: 1,
casei: false
}
);
assert_eq!(
p("\\H"),
Expr::Delegate {
inner: String::from("[^0-9A-Fa-f]"),
size: 1,
casei: false
}
);
}
#[test]
fn invalid_escape() {
assert_error(
"\\",
"Parsing error at position 0: Backslash without following character",
);
assert_error("\\q", "Parsing error at position 0: Invalid escape: \\q");
assert_error("\\xAG", "Parsing error at position 2: Invalid hex escape");
assert_error("\\xA", "Parsing error at position 2: Invalid hex escape");
assert_error("\\x{}", "Parsing error at position 2: Invalid hex escape");
assert_error("\\x{AG}", "Parsing error at position 2: Invalid hex escape");
assert_error("\\x{42", "Parsing error at position 2: Invalid hex escape");
assert_error(
"\\x{D800}",
"Parsing error at position 2: Invalid codepoint for hex or unicode escape",
);
assert_error(
"\\x{110000}",
"Parsing error at position 2: Invalid codepoint for hex or unicode escape",
);
assert_error("\\u123", "Parsing error at position 2: Invalid hex escape");
assert_error("\\u123x", "Parsing error at position 2: Invalid hex escape");
assert_error("\\u{}", "Parsing error at position 2: Invalid hex escape");
assert_error(
"\\U1234567",
"Parsing error at position 2: Invalid hex escape",
);
assert_error("\\U{}", "Parsing error at position 2: Invalid hex escape");
}
#[test]
fn concat() {
assert_eq!(
p("ab"),
Expr::Concat(vec![make_literal("a"), make_literal("b"),])
);
}
#[test]
fn alt() {
assert_eq!(
p("a|b"),
Expr::Alt(vec![make_literal("a"), make_literal("b"),])
);
}
#[test]
fn group() {
assert_eq!(p("(a)"), Expr::Group(Box::new(make_literal("a"),)));
}
#[test]
fn group_repeat() {
assert_eq!(
p("(a){2}"),
Expr::Repeat {
child: Box::new(Expr::Group(Box::new(make_literal("a")))),
lo: 2,
hi: 2,
greedy: true
}
);
}
#[test]
fn repeat() {
assert_eq!(
p("a{2,42}"),
Expr::Repeat {
child: Box::new(make_literal("a")),
lo: 2,
hi: 42,
greedy: true
}
);
assert_eq!(
p("a{2,}"),
Expr::Repeat {
child: Box::new(make_literal("a")),
lo: 2,
hi: usize::MAX,
greedy: true
}
);
assert_eq!(
p("a{2}"),
Expr::Repeat {
child: Box::new(make_literal("a")),
lo: 2,
hi: 2,
greedy: true
}
);
assert_eq!(
p("a{,2}"),
Expr::Repeat {
child: Box::new(make_literal("a")),
lo: 0,
hi: 2,
greedy: true
}
);
assert_eq!(
p("a{2,42}?"),
Expr::Repeat {
child: Box::new(make_literal("a")),
lo: 2,
hi: 42,
greedy: false
}
);
assert_eq!(
p("a{2,}?"),
Expr::Repeat {
child: Box::new(make_literal("a")),
lo: 2,
hi: usize::MAX,
greedy: false
}
);
assert_eq!(
p("a{2}?"),
Expr::Repeat {
child: Box::new(make_literal("a")),
lo: 2,
hi: 2,
greedy: false
}
);
assert_eq!(
p("a{,2}?"),
Expr::Repeat {
child: Box::new(make_literal("a")),
lo: 0,
hi: 2,
greedy: false
}
);
}
#[test]
fn invalid_repeat() {
assert_eq!(
p("a{"),
Expr::Concat(vec![make_literal("a"), make_literal("{"),])
);
assert_eq!(
p("a{6"),
Expr::Concat(vec![
make_literal("a"),
make_literal("{"),
make_literal("6"),
])
);
assert_eq!(
p("a{6,"),
Expr::Concat(vec![
make_literal("a"),
make_literal("{"),
make_literal("6"),
make_literal(","),
])
);
}
#[test]
fn delegate_zero() {
assert_eq!(
p("\\b"),
Expr::Delegate {
inner: String::from("\\b"),
size: 0,
casei: false
}
);
assert_eq!(
p("\\B"),
Expr::Delegate {
inner: String::from("\\B"),
size: 0,
casei: false
}
);
}
#[test]
fn delegate_named_group() {
assert_eq!(
p("\\p{Greek}"),
Expr::Delegate {
inner: String::from("\\p{Greek}"),
size: 1,
casei: false
}
);
assert_eq!(
p("\\pL"),
Expr::Delegate {
inner: String::from("\\pL"),
size: 1,
casei: false
}
);
assert_eq!(
p("\\P{Greek}"),
Expr::Delegate {
inner: String::from("\\P{Greek}"),
size: 1,
casei: false
}
);
assert_eq!(
p("\\PL"),
Expr::Delegate {
inner: String::from("\\PL"),
size: 1,
casei: false
}
);
assert_eq!(
p("(?i)\\p{Ll}"),
Expr::Delegate {
inner: String::from("\\p{Ll}"),
size: 1,
casei: true
}
);
}
#[test]
fn backref() {
assert_eq!(
p("(.)\\1"),
Expr::Concat(vec![
Expr::Group(Box::new(Expr::Any { newline: false })),
Expr::Backref(1),
])
);
}
#[test]
fn named_backref() {
assert_eq!(
p("(?<i>.)\\k<i>"),
Expr::Concat(vec![
Expr::Group(Box::new(Expr::Any { newline: false })),
Expr::Backref(1),
])
);
}
#[test]
fn lookaround() {
assert_eq!(
p("(?=a)"),
Expr::LookAround(Box::new(make_literal("a")), LookAhead)
);
assert_eq!(
p("(?!a)"),
Expr::LookAround(Box::new(make_literal("a")), LookAheadNeg)
);
assert_eq!(
p("(?<=a)"),
Expr::LookAround(Box::new(make_literal("a")), LookBehind)
);
assert_eq!(
p("(?<!a)"),
Expr::LookAround(Box::new(make_literal("a")), LookBehindNeg)
);
}
#[test]
fn shy_group() {
assert_eq!(
p("(?:ab)c"),
Expr::Concat(vec![
Expr::Concat(vec![make_literal("a"), make_literal("b"),]),
make_literal("c"),
])
);
}
#[test]
fn flag_state() {
assert_eq!(p("(?s)."), Expr::Any { newline: true });
assert_eq!(p("(?s:(?-s:.))"), Expr::Any { newline: false });
assert_eq!(
p("(?s:.)."),
Expr::Concat(vec![
Expr::Any { newline: true },
Expr::Any { newline: false },
])
);
assert_eq!(
p("(?:(?s).)."),
Expr::Concat(vec![
Expr::Any { newline: true },
Expr::Any { newline: false },
])
);
}
#[test]
fn flag_multiline() {
assert_eq!(p("^"), Expr::StartText);
assert_eq!(p("(?m:^)"), Expr::StartLine);
assert_eq!(p("$"), Expr::EndText);
assert_eq!(p("(?m:$)"), Expr::EndLine);
}
#[test]
fn flag_swap_greed() {
assert_eq!(p("a*"), p("(?U:a*?)"));
assert_eq!(p("a*?"), p("(?U:a*)"));
}
#[test]
fn invalid_flags() {
assert!(Expr::parse_tree("(?").is_err());
assert!(Expr::parse_tree("(?)").is_err());
assert!(Expr::parse_tree("(?-)").is_err());
assert!(Expr::parse_tree("(?-:a)").is_err());
assert!(Expr::parse_tree("(?q:a)").is_err());
}
#[test]
fn lifetime() {
assert_eq!(
p("\\'[a-zA-Z_][a-zA-Z0-9_]*(?!\\')\\b"),
Expr::Concat(vec![
make_literal("'"),
Expr::Delegate {
inner: String::from("[a-zA-Z_]"),
size: 1,
casei: false
},
Expr::Repeat {
child: Box::new(Expr::Delegate {
inner: String::from("[a-zA-Z0-9_]"),
size: 1,
casei: false
}),
lo: 0,
hi: usize::MAX,
greedy: true
},
Expr::LookAround(Box::new(make_literal("'")), LookAheadNeg),
Expr::Delegate {
inner: String::from("\\b"),
size: 0,
casei: false
}
])
);
}
#[test]
fn ignore_whitespace() {
assert_eq!(p("(?x: )"), p(""));
assert_eq!(p("(?x) | "), p("|"));
assert_eq!(p("(?x: a )"), p("a"));
assert_eq!(p("(?x: a # ) bobby tables\n b )"), p("ab"));
assert_eq!(p("(?x: a | b )"), p("a|b"));
assert_eq!(p("(?x: ( a b ) )"), p("(ab)"));
assert_eq!(p("(?x: a + )"), p("a+"));
assert_eq!(p("(?x: a {2} )"), p("a{2}"));
assert_eq!(p("(?x: a { 2 } )"), p("a{2}"));
assert_eq!(p("(?x: a { 2 , } )"), p("a{2,}"));
assert_eq!(p("(?x: a { , 2 } )"), p("a{,2}"));
assert_eq!(p("(?x: a { 2 , 3 } )"), p("a{2,3}"));
assert_eq!(p("(?x: a { 2 , 3 } ? )"), p("a{2,3}?"));
assert_eq!(p("(?x: ( ? i : . ) )"), p("(?i:.)"));
assert_eq!(p("(?x: ( ?= a ) )"), p("(?=a)"));
assert_eq!(p("(?x: [ ] )"), p("[ ]"));
assert_eq!(p("(?x: [ ^] )"), p("[ ^]"));
assert_eq!(p("(?x: [a - z] )"), p("[a - z]"));
assert_eq!(p("(?x: [ \\] \\\\] )"), p("[ \\] \\\\]"));
assert_eq!(p("(?x: a\\ b )"), p("a b"));
assert_eq!(p("(?x: a (?-x:#) b )"), p("a#b"));
}
#[test]
fn comments() {
assert_eq!(p(r"ab(?# comment)"), p("ab"));
assert_eq!(p(r"ab(?#)"), p("ab"));
assert_eq!(p(r"(?# comment 1)(?# comment 2)ab"), p("ab"));
assert_eq!(p(r"ab(?# comment \))c"), p("abc"));
assert_eq!(p(r"ab(?# comment \\)c"), p("abc"));
assert_eq!(p(r"ab(?# comment ()c"), p("abc"));
assert_eq!(p(r"ab(?# comment)*"), p("ab*"));
fail(r"ab(?# comment");
fail(r"ab(?# comment\");
}
#[test]
fn atomic_group() {
assert_eq!(p("(?>a)"), Expr::AtomicGroup(Box::new(make_literal("a"))));
}
#[test]
fn possessive() {
assert_eq!(
p("a++"),
Expr::AtomicGroup(Box::new(Expr::Repeat {
child: Box::new(make_literal("a")),
lo: 1,
hi: usize::MAX,
greedy: true
}))
);
assert_eq!(
p("a*+"),
Expr::AtomicGroup(Box::new(Expr::Repeat {
child: Box::new(make_literal("a")),
lo: 0,
hi: usize::MAX,
greedy: true
}))
);
assert_eq!(
p("a?+"),
Expr::AtomicGroup(Box::new(Expr::Repeat {
child: Box::new(make_literal("a")),
lo: 0,
hi: 1,
greedy: true
}))
);
}
#[test]
fn invalid_backref() {
fail(".\\12345678"); fail(".\\c"); }
#[test]
fn invalid_group_name_backref() {
assert_error(
"\\k<id>(?<id>.)",
"Parsing error at position 2: Invalid group name in back reference: id",
);
}
#[test]
fn named_backref_only() {
assert_error("(?<id>.)\\1", "Error compiling regex: Numbered backref/call not allowed because named group was used, use a named backref instead");
assert_error("(a)\\1(?<name>b)", "Error compiling regex: Numbered backref/call not allowed because named group was used, use a named backref instead");
}
#[test]
fn invalid_group_name() {
assert_error(
"(?<id)",
"Parsing error at position 1: Could not parse group name",
);
assert_error(
"(?<>)",
"Parsing error at position 1: Could not parse group name",
);
assert_error(
"(?<#>)",
"Parsing error at position 1: Could not parse group name",
);
assert_error(
"\\kxxx<id>",
"Parsing error at position 2: Could not parse group name",
);
}
#[test]
fn unknown_flag() {
assert_error(
"(?-:a)",
"Parsing error at position 2: Unknown group flag: (?-:",
);
assert_error(
"(?)",
"Parsing error at position 2: Unknown group flag: (?)",
);
assert_error(
"(?--)",
"Parsing error at position 2: Unknown group flag: (?--",
);
assert_error(
"(?\u{1F60A})",
"Parsing error at position 2: Unknown group flag: (?\u{1F60A}",
);
}
#[test]
fn no_quantifiers_on_lookarounds() {
assert_error(
"(?=hello)+",
"Parsing error at position 9: Target of repeat operator is invalid",
);
assert_error(
"(?<!hello)*",
"Parsing error at position 10: Target of repeat operator is invalid",
);
assert_error(
"(?<=hello){2,3}",
"Parsing error at position 14: Target of repeat operator is invalid",
);
assert_error(
"(?!hello)?",
"Parsing error at position 9: Target of repeat operator is invalid",
);
assert_error(
"^?",
"Parsing error at position 1: Target of repeat operator is invalid",
);
assert_error(
"${2}",
"Parsing error at position 3: Target of repeat operator is invalid",
);
assert_error(
"(?m)^?",
"Parsing error at position 5: Target of repeat operator is invalid",
);
assert_error(
"(?m)${2}",
"Parsing error at position 7: Target of repeat operator is invalid",
);
assert_error(
"(a|b|?)",
"Parsing error at position 5: Target of repeat operator is invalid",
);
}
#[test]
fn keepout() {
assert_eq!(
p("a\\Kb"),
Expr::Concat(vec![make_literal("a"), Expr::KeepOut, make_literal("b"),])
);
}
#[test]
fn backref_exists_condition() {
assert_eq!(
p("(h)?(?(1))"),
Expr::Concat(vec![
Expr::Repeat {
child: Box::new(Expr::Group(Box::new(make_literal("h")))),
lo: 0,
hi: 1,
greedy: true
},
Expr::BackrefExistsCondition(1)
])
);
assert_eq!(
p("(?<h>h)?(?('h'))"),
Expr::Concat(vec![
Expr::Repeat {
child: Box::new(Expr::Group(Box::new(make_literal("h")))),
lo: 0,
hi: 1,
greedy: true
},
Expr::BackrefExistsCondition(1)
])
);
}
#[test]
fn conditional_non_backref_validity_check_without_branches() {
assert_error(
"(?(foo))",
"Parsing error at position 7: General parsing error: expected conditional to be a backreference or at least an expression for when the condition is true",
);
}
#[test]
fn conditional_invalid_target_of_repeat_operator() {
assert_error(
r"(?(?=\d)\w|!)",
"Parsing error at position 3: Target of repeat operator is invalid",
);
}
#[test]
fn backref_condition_with_one_two_or_three_branches() {
assert_eq!(
p("(h)?(?(1)i|x)"),
Expr::Concat(vec![
Expr::Repeat {
child: Box::new(Expr::Group(Box::new(make_literal("h")))),
lo: 0,
hi: 1,
greedy: true
},
Expr::Conditional {
condition: Box::new(Expr::BackrefExistsCondition(1)),
true_branch: Box::new(make_literal("i")),
false_branch: Box::new(make_literal("x")),
},
])
);
assert_eq!(
p("(h)?(?(1)i)"),
Expr::Concat(vec![
Expr::Repeat {
child: Box::new(Expr::Group(Box::new(make_literal("h")))),
lo: 0,
hi: 1,
greedy: true
},
Expr::Conditional {
condition: Box::new(Expr::BackrefExistsCondition(1)),
true_branch: Box::new(make_literal("i")),
false_branch: Box::new(Expr::Empty),
},
])
);
assert_eq!(
p("(h)?(?(1)ii|xy|z)"),
Expr::Concat(vec![
Expr::Repeat {
child: Box::new(Expr::Group(Box::new(make_literal("h")))),
lo: 0,
hi: 1,
greedy: true
},
Expr::Conditional {
condition: Box::new(Expr::BackrefExistsCondition(1)),
true_branch: Box::new(Expr::Concat(
vec![make_literal("i"), make_literal("i"),]
)),
false_branch: Box::new(Expr::Alt(vec![
Expr::Concat(vec![make_literal("x"), make_literal("y"),]),
make_literal("z"),
])),
},
])
);
assert_eq!(
p("(?<cap>h)?(?(<cap>)ii|xy|z)"),
Expr::Concat(vec![
Expr::Repeat {
child: Box::new(Expr::Group(Box::new(make_literal("h")))),
lo: 0,
hi: 1,
greedy: true
},
Expr::Conditional {
condition: Box::new(Expr::BackrefExistsCondition(1)),
true_branch: Box::new(Expr::Concat(
vec![make_literal("i"), make_literal("i"),]
)),
false_branch: Box::new(Expr::Alt(vec![
Expr::Concat(vec![make_literal("x"), make_literal("y"),]),
make_literal("z"),
])),
},
])
);
}
#[test]
fn conditional() {
assert_eq!(
p("((?(a)b|c))(\\1)"),
Expr::Concat(vec![
Expr::Group(Box::new(Expr::Conditional {
condition: Box::new(make_literal("a")),
true_branch: Box::new(make_literal("b")),
false_branch: Box::new(make_literal("c"))
})),
Expr::Group(Box::new(Expr::Backref(1)))
])
);
assert_eq!(
p(r"^(?(\d)abc|\d!)$"),
Expr::Concat(vec![
Expr::StartText,
Expr::Conditional {
condition: Box::new(Expr::Delegate {
inner: "\\d".to_string(),
size: 1,
casei: false,
}),
true_branch: Box::new(Expr::Concat(vec![
make_literal("a"),
make_literal("b"),
make_literal("c"),
])),
false_branch: Box::new(Expr::Concat(vec![
Expr::Delegate {
inner: "\\d".to_string(),
size: 1,
casei: false,
},
make_literal("!"),
])),
},
Expr::EndText,
])
);
assert_eq!(
p(r"(?((?=\d))\w|!)"),
Expr::Conditional {
condition: Box::new(Expr::LookAround(
Box::new(Expr::Delegate {
inner: "\\d".to_string(),
size: 1,
casei: false
}),
LookAhead
)),
true_branch: Box::new(Expr::Delegate {
inner: "\\w".to_string(),
size: 1,
casei: false,
}),
false_branch: Box::new(make_literal("!")),
},
);
assert_eq!(
p(r"(?((ab))c|d)"),
Expr::Conditional {
condition: Box::new(Expr::Group(Box::new(Expr::Concat(vec![
make_literal("a"),
make_literal("b"),
]),))),
true_branch: Box::new(make_literal("c")),
false_branch: Box::new(make_literal("d")),
},
);
}
#[test]
fn fuzz_1() {
p(r"\ä");
}
#[test]
fn fuzz_2() {
p(r"\pä");
}
}