1use alloc::boxed::Box;
24use alloc::string::{String, ToString};
25use alloc::vec::Vec;
26use alloc::{format, vec};
27
28use bit_set::BitSet;
29use core::convert::TryInto;
30use core::usize;
31use regex_syntax::escape_into;
32
33use crate::{codepoint_len, CompileError, Error, Expr, ParseError, Result, MAX_RECURSION};
34use crate::{Assertion, LookAround::*};
35
36const FLAG_CASEI: u32 = 1;
37const FLAG_MULTI: u32 = 1 << 1;
38const FLAG_DOTNL: u32 = 1 << 2;
39const FLAG_SWAP_GREED: u32 = 1 << 3;
40const FLAG_IGNORE_SPACE: u32 = 1 << 4;
41const FLAG_UNICODE: u32 = 1 << 5;
42
43#[cfg(not(feature = "std"))]
44pub(crate) type NamedGroups = alloc::collections::BTreeMap<String, usize>;
45#[cfg(feature = "std")]
46pub(crate) type NamedGroups = std::collections::HashMap<String, usize>;
47
48#[derive(Debug)]
49pub struct ExprTree {
50 pub expr: Expr,
51 pub backrefs: BitSet,
52 pub named_groups: NamedGroups,
53}
54
55#[derive(Debug)]
56pub(crate) struct Parser<'a> {
57 re: &'a str, backrefs: BitSet,
59 flags: u32,
60 named_groups: NamedGroups,
61 numeric_backrefs: bool,
62 curr_group: usize, }
64
65impl<'a> Parser<'a> {
66 pub(crate) fn parse(re: &str) -> Result<ExprTree> {
69 let mut p = Parser::new(re);
70 let (ix, expr) = p.parse_re(0, 0)?;
71 if ix < re.len() {
72 return Err(Error::ParseError(
73 ix,
74 ParseError::GeneralParseError("end of string not reached".to_string()),
75 ));
76 }
77 Ok(ExprTree {
78 expr,
79 backrefs: Default::default(),
80 named_groups: p.named_groups,
81 })
82 }
83
84 fn new(re: &str) -> Parser<'_> {
85 Parser {
86 re,
87 backrefs: Default::default(),
88 named_groups: Default::default(),
89 numeric_backrefs: false,
90 flags: FLAG_UNICODE,
91 curr_group: 0,
92 }
93 }
94
95 fn parse_re(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
96 let (ix, child) = self.parse_branch(ix, depth)?;
97 let mut ix = self.optional_whitespace(ix)?;
98 if self.re[ix..].starts_with('|') {
99 let mut children = vec![child];
100 while self.re[ix..].starts_with('|') {
101 ix += 1;
102 let (next, child) = self.parse_branch(ix, depth)?;
103 children.push(child);
104 ix = self.optional_whitespace(next)?;
105 }
106 return Ok((ix, Expr::Alt(children)));
107 }
108 if self.numeric_backrefs && !self.named_groups.is_empty() {
110 return Err(Error::CompileError(CompileError::NamedBackrefOnly));
111 }
112 Ok((ix, child))
113 }
114
115 fn parse_branch(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
116 let mut children = Vec::new();
117 let mut ix = ix;
118 while ix < self.re.len() {
119 let (next, child) = self.parse_piece(ix, depth)?;
120 if next == ix {
121 break;
122 }
123 if child != Expr::Empty {
124 children.push(child);
125 }
126 ix = next;
127 }
128 match children.len() {
129 0 => Ok((ix, Expr::Empty)),
130 1 => Ok((ix, children.pop().unwrap())),
131 _ => Ok((ix, Expr::Concat(children))),
132 }
133 }
134
135 fn parse_piece(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
136 let (ix, child) = self.parse_atom(ix, depth)?;
137 let mut ix = self.optional_whitespace(ix)?;
138 if ix < self.re.len() {
139 let (lo, hi) = match self.re.as_bytes()[ix] {
141 b'?' => (0, 1),
142 b'*' => (0, usize::MAX),
143 b'+' => (1, usize::MAX),
144 b'{' => {
145 match self.parse_repeat(ix) {
146 Ok((next, lo, hi)) => {
147 ix = next - 1;
148 (lo, hi)
149 }
150 Err(_) => {
151 return Ok((ix, child));
153 }
154 }
155 }
156 _ => return Ok((ix, child)),
157 };
158 if !self.is_repeatable(&child) {
159 return Err(Error::ParseError(ix, ParseError::TargetNotRepeatable));
160 }
161 ix += 1;
162 ix = self.optional_whitespace(ix)?;
163 let mut greedy = true;
164 if ix < self.re.len() && self.re.as_bytes()[ix] == b'?' {
165 greedy = false;
166 ix += 1;
167 }
168 greedy ^= self.flag(FLAG_SWAP_GREED);
169 let mut node = Expr::Repeat {
170 child: Box::new(child),
171 lo,
172 hi,
173 greedy,
174 };
175 if ix < self.re.len() && self.re.as_bytes()[ix] == b'+' {
176 ix += 1;
177 node = Expr::AtomicGroup(Box::new(node));
178 }
179 return Ok((ix, node));
180 }
181 Ok((ix, child))
182 }
183
184 fn is_repeatable(&self, child: &Expr) -> bool {
185 match child {
186 Expr::LookAround(_, _) => false,
187 Expr::Empty => false,
188 Expr::Assertion(_) => false,
189 _ => true,
190 }
191 }
192
193 fn parse_repeat(&self, ix: usize) -> Result<(usize, usize, usize)> {
195 let ix = self.optional_whitespace(ix + 1)?; let bytes = self.re.as_bytes();
197 if ix == self.re.len() {
198 return Err(Error::ParseError(ix, ParseError::InvalidRepeat));
199 }
200 let mut end = ix;
201 let lo = if bytes[ix] == b',' {
202 0
203 } else if let Some((next, lo)) = parse_decimal(self.re, ix) {
204 end = next;
205 lo
206 } else {
207 return Err(Error::ParseError(ix, ParseError::InvalidRepeat));
208 };
209 let ix = self.optional_whitespace(end)?; if ix == self.re.len() {
211 return Err(Error::ParseError(ix, ParseError::InvalidRepeat));
212 }
213 end = ix;
214 let hi = match bytes[ix] {
215 b'}' => lo,
216 b',' => {
217 end = self.optional_whitespace(ix + 1)?; if let Some((next, hi)) = parse_decimal(self.re, end) {
219 end = next;
220 hi
221 } else {
222 usize::MAX
223 }
224 }
225 _ => return Err(Error::ParseError(ix, ParseError::InvalidRepeat)),
226 };
227 let ix = self.optional_whitespace(end)?; if ix == self.re.len() || bytes[ix] != b'}' {
229 return Err(Error::ParseError(ix, ParseError::InvalidRepeat));
230 }
231 Ok((ix + 1, lo, hi))
232 }
233
234 fn parse_atom(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
235 let ix = self.optional_whitespace(ix)?;
236 if ix == self.re.len() {
237 return Ok((ix, Expr::Empty));
238 }
239 match self.re.as_bytes()[ix] {
240 b'.' => Ok((
241 ix + 1,
242 Expr::Any {
243 newline: self.flag(FLAG_DOTNL),
244 },
245 )),
246 b'^' => Ok((
247 ix + 1,
248 if self.flag(FLAG_MULTI) {
249 Expr::Assertion(Assertion::StartLine { crlf: false })
251 } else {
252 Expr::Assertion(Assertion::StartText)
253 },
254 )),
255 b'$' => Ok((
256 ix + 1,
257 if self.flag(FLAG_MULTI) {
258 Expr::Assertion(Assertion::EndLine { crlf: false })
260 } else {
261 Expr::Assertion(Assertion::EndText)
262 },
263 )),
264 b'(' => self.parse_group(ix, depth),
265 b'\\' => {
266 let (next, expr) = self.parse_escape(ix, false)?;
267 if let Expr::Backref(group) = expr {
268 self.backrefs.insert(group);
269 }
270 Ok((next, expr))
271 }
272 b'+' | b'*' | b'?' | b'|' | b')' => Ok((ix, Expr::Empty)),
273 b'[' => self.parse_class(ix),
274 b => {
275 let next = ix + codepoint_len(b);
277 Ok((
278 next,
279 Expr::Literal {
280 val: String::from(&self.re[ix..next]),
281 casei: self.flag(FLAG_CASEI),
282 },
283 ))
284 }
285 }
286 }
287
288 fn parse_named_backref(
289 &self,
290 ix: usize,
291 open: &str,
292 close: &str,
293 allow_relative: bool,
294 ) -> Result<(usize, Expr)> {
295 if let Some((id, skip)) = parse_id(&self.re[ix..], open, close, allow_relative) {
296 let group = if let Some(group) = self.named_groups.get(id) {
297 Some(*group)
298 } else if let Ok(group) = id.parse::<isize>() {
299 group.try_into().map_or_else(
300 |_| {
301 self.curr_group.checked_add_signed(group + 1)
303 },
304 |group| Some(group),
305 )
306 } else {
307 None
308 };
309 if let Some(group) = group {
310 return Ok((ix + skip, Expr::Backref(group)));
311 }
312 Err(Error::ParseError(
314 ix,
315 ParseError::InvalidGroupNameBackref(id.to_string()),
316 ))
317 } else {
318 Err(Error::ParseError(ix, ParseError::InvalidGroupName))
320 }
321 }
322
323 fn parse_numbered_backref(&mut self, ix: usize) -> Result<(usize, Expr)> {
324 if let Some((end, group)) = parse_decimal(self.re, ix) {
325 if group < self.re.len() / 2 {
327 self.numeric_backrefs = true;
328 return Ok((end, Expr::Backref(group)));
329 }
330 }
331 return Err(Error::ParseError(ix, ParseError::InvalidBackref));
332 }
333
334 fn parse_escape(&mut self, ix: usize, in_class: bool) -> Result<(usize, Expr)> {
336 let bytes = self.re.as_bytes();
337 let Some(b) = bytes.get(ix + 1).copied() else {
338 return Err(Error::ParseError(ix, ParseError::TrailingBackslash));
339 };
340 let end = ix + 1 + codepoint_len(b);
341 Ok(if is_digit(b) {
342 return self.parse_numbered_backref(ix + 1);
343 } else if matches!(b, b'k') && !in_class {
344 if bytes.get(end) == Some(&b'\'') {
346 return self.parse_named_backref(end, "'", "'", true);
347 } else {
348 return self.parse_named_backref(end, "<", ">", true);
349 }
350 } else if b == b'A' && !in_class {
351 (end, Expr::Assertion(Assertion::StartText))
352 } else if b == b'z' && !in_class {
353 (end, Expr::Assertion(Assertion::EndText))
354 } else if b == b'b' && !in_class {
355 if bytes.get(end) == Some(&b'{') {
356 return Err(Error::ParseError(
358 ix,
359 ParseError::InvalidEscape(format!("\\{}", &self.re[ix + 1..end])),
360 ));
361 }
362 (end, Expr::Assertion(Assertion::WordBoundary))
363 } else if b == b'B' && !in_class {
364 if bytes.get(end) == Some(&b'{') {
365 return Err(Error::ParseError(
367 ix,
368 ParseError::InvalidEscape(format!("\\{}", &self.re[ix + 1..end])),
369 ));
370 }
371 (end, Expr::Assertion(Assertion::NotWordBoundary))
372 } else if b == b'<' && !in_class {
373 (end, Expr::Assertion(Assertion::LeftWordBoundary))
374 } else if b == b'>' && !in_class {
375 (end, Expr::Assertion(Assertion::RightWordBoundary))
376 } else if matches!(b | 32, b'd' | b's' | b'w') {
377 (
378 end,
379 Expr::Delegate {
380 inner: String::from(&self.re[ix..end]),
381 size: 1,
382 casei: self.flag(FLAG_CASEI),
383 },
384 )
385 } else if (b | 32) == b'h' {
386 let s = if b == b'h' {
387 "[0-9A-Fa-f]"
388 } else {
389 "[^0-9A-Fa-f]"
390 };
391 (
392 end,
393 Expr::Delegate {
394 inner: String::from(s),
395 size: 1,
396 casei: false,
397 },
398 )
399 } else if b == b'x' {
400 return self.parse_hex(end, 2);
401 } else if b == b'u' {
402 return self.parse_hex(end, 4);
403 } else if b == b'U' {
404 return self.parse_hex(end, 8);
405 } else if (b | 32) == b'p' && end != bytes.len() {
406 let mut end = end;
407 let b = bytes[end];
408 end += codepoint_len(b);
409 if b == b'{' {
410 loop {
411 if end == self.re.len() {
412 return Err(Error::ParseError(ix, ParseError::UnclosedUnicodeName));
413 }
414 let b = bytes[end];
415 if b == b'}' {
416 end += 1;
417 break;
418 }
419 end += codepoint_len(b);
420 }
421 }
422 (
423 end,
424 Expr::Delegate {
425 inner: String::from(&self.re[ix..end]),
426 size: 1,
427 casei: self.flag(FLAG_CASEI),
428 },
429 )
430 } else if b == b'K' && !in_class {
431 (end, Expr::KeepOut)
432 } else if b == b'G' && !in_class {
433 (end, Expr::ContinueFromPreviousMatchEnd)
434 } else {
435 (
437 end,
438 make_literal(match b {
439 b'a' => "\x07", b'b' => "\x08", b'f' => "\x0c", b'n' => "\n", b'r' => "\r", b't' => "\t", b'v' => "\x0b", b'e' => "\x1b", b' ' => " ",
448 b => {
449 let s = &self.re[ix + 1..end];
450 if b.is_ascii_alphabetic()
451 && !matches!(
452 b,
453 b'k' | b'A' | b'z' | b'b' | b'B' | b'<' | b'>' | b'K' | b'G'
454 )
455 {
456 return Err(Error::ParseError(
457 ix,
458 ParseError::InvalidEscape(format!("\\{}", s)),
459 ));
460 } else {
461 s
462 }
463 }
464 }),
465 )
466 })
467 }
468
469 fn parse_hex(&self, ix: usize, digits: usize) -> Result<(usize, Expr)> {
471 if ix >= self.re.len() {
472 return Err(Error::ParseError(ix, ParseError::InvalidHex));
474 }
475 let bytes = self.re.as_bytes();
476 let b = bytes[ix];
477 let (end, s) = if ix + digits <= self.re.len()
478 && bytes[ix..ix + digits].iter().all(|&b| is_hex_digit(b))
479 {
480 let end = ix + digits;
481 (end, &self.re[ix..end])
482 } else if b == b'{' {
483 let starthex = ix + 1;
484 let mut endhex = starthex;
485 loop {
486 if endhex == self.re.len() {
487 return Err(Error::ParseError(ix, ParseError::InvalidHex));
488 }
489 let b = bytes[endhex];
490 if endhex > starthex && b == b'}' {
491 break;
492 }
493 if is_hex_digit(b) && endhex < starthex + 8 {
494 endhex += 1;
495 } else {
496 return Err(Error::ParseError(ix, ParseError::InvalidHex));
497 }
498 }
499 (endhex + 1, &self.re[starthex..endhex])
500 } else {
501 return Err(Error::ParseError(ix, ParseError::InvalidHex));
502 };
503 let codepoint = u32::from_str_radix(s, 16).unwrap();
504 if let Some(c) = char::from_u32(codepoint) {
505 let mut inner = String::with_capacity(4);
506 inner.push(c);
507 Ok((
508 end,
509 Expr::Literal {
510 val: inner,
511 casei: self.flag(FLAG_CASEI),
512 },
513 ))
514 } else {
515 Err(Error::ParseError(ix, ParseError::InvalidCodepointValue))
516 }
517 }
518
519 fn parse_class(&mut self, ix: usize) -> Result<(usize, Expr)> {
520 let bytes = self.re.as_bytes();
521 let mut ix = ix + 1; let mut class = String::new();
523 let mut nest = 1;
524 class.push('[');
525
526 if bytes.get(ix) == Some(&b'^') {
528 class.push('^');
529 ix += 1;
530 }
531
532 if bytes.get(ix) == Some(&b']') {
534 class.push(']');
535 ix += 1;
536 }
537
538 loop {
539 if ix == self.re.len() {
540 return Err(Error::ParseError(ix, ParseError::InvalidClass));
541 }
542 let end = match bytes[ix] {
543 b'\\' => {
544 let (end, expr) = self.parse_escape(ix, true)?;
546 match expr {
547 Expr::Literal { val, .. } => {
548 debug_assert_eq!(val.chars().count(), 1);
549 escape_into(&val, &mut class);
550 }
551 Expr::Delegate { inner, .. } => {
552 class.push_str(&inner);
553 }
554 _ => {
555 return Err(Error::ParseError(ix, ParseError::InvalidClass));
556 }
557 }
558 end
559 }
560 b'[' => {
561 nest += 1;
562 class.push('[');
563 ix + 1
564 }
565 b']' => {
566 nest -= 1;
567 class.push(']');
568 if nest == 0 {
569 break;
570 }
571 ix + 1
572 }
573 b => {
574 let end = ix + codepoint_len(b);
575 class.push_str(&self.re[ix..end]);
576 end
577 }
578 };
579 ix = end;
580 }
581 let class = Expr::Delegate {
582 inner: class,
583 size: 1,
584 casei: self.flag(FLAG_CASEI),
585 };
586 let ix = ix + 1; Ok((ix, class))
588 }
589
590 fn parse_group(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
591 let depth = depth + 1;
592 if depth >= MAX_RECURSION {
593 return Err(Error::ParseError(ix, ParseError::RecursionExceeded));
594 }
595 let ix = self.optional_whitespace(ix + 1)?;
596 let (la, skip) = if self.re[ix..].starts_with("?=") {
597 (Some(LookAhead), 2)
598 } else if self.re[ix..].starts_with("?!") {
599 (Some(LookAheadNeg), 2)
600 } else if self.re[ix..].starts_with("?<=") {
601 (Some(LookBehind), 3)
602 } else if self.re[ix..].starts_with("?<!") {
603 (Some(LookBehindNeg), 3)
604 } else if self.re[ix..].starts_with("?<") {
605 self.curr_group += 1;
607 if let Some((id, skip)) = parse_id(&self.re[ix + 1..], "<", ">", false) {
608 self.named_groups.insert(id.to_string(), self.curr_group);
609 (None, skip + 1)
610 } else {
611 return Err(Error::ParseError(ix, ParseError::InvalidGroupName));
612 }
613 } else if self.re[ix..].starts_with("?P<") {
614 self.curr_group += 1; if let Some((id, skip)) = parse_id(&self.re[ix + 2..], "<", ">", false) {
617 self.named_groups.insert(id.to_string(), self.curr_group);
618 (None, skip + 2)
619 } else {
620 return Err(Error::ParseError(ix, ParseError::InvalidGroupName));
621 }
622 } else if self.re[ix..].starts_with("?P=") {
623 return self.parse_named_backref(ix + 3, "", ")", false);
625 } else if self.re[ix..].starts_with("?>") {
626 (None, 2)
627 } else if self.re[ix..].starts_with("?(") {
628 return self.parse_conditional(ix + 2, depth);
629 } else if self.re[ix..].starts_with('?') {
630 return self.parse_flags(ix, depth);
631 } else {
632 self.curr_group += 1; (None, 0)
634 };
635 let ix = ix + skip;
636 let (ix, child) = self.parse_re(ix, depth)?;
637 let ix = self.check_for_close_paren(ix)?;
638 let result = match (la, skip) {
639 (Some(la), _) => Expr::LookAround(Box::new(child), la),
640 (None, 2) => Expr::AtomicGroup(Box::new(child)),
641 _ => Expr::Group(Box::new(child)),
642 };
643 Ok((ix, result))
644 }
645
646 fn check_for_close_paren(&self, ix: usize) -> Result<usize> {
647 let ix = self.optional_whitespace(ix)?;
648 if ix == self.re.len() {
649 return Err(Error::ParseError(ix, ParseError::UnclosedOpenParen));
650 } else if self.re.as_bytes()[ix] != b')' {
651 return Err(Error::ParseError(
652 ix,
653 ParseError::GeneralParseError("expected close paren".to_string()),
654 ));
655 }
656 Ok(ix + 1)
657 }
658
659 fn parse_flags(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
661 let start = ix + 1;
662
663 fn unknown_flag(re: &str, start: usize, end: usize) -> Error {
664 let after_end = end + codepoint_len(re.as_bytes()[end]);
665 let s = format!("(?{}", &re[start..after_end]);
666 Error::ParseError(start, ParseError::UnknownFlag(s))
667 }
668
669 let mut ix = start;
670 let mut neg = false;
671 let oldflags = self.flags;
672 loop {
673 ix = self.optional_whitespace(ix)?;
674 if ix == self.re.len() {
675 return Err(Error::ParseError(ix, ParseError::UnclosedOpenParen));
676 }
677 let b = self.re.as_bytes()[ix];
678 match b {
679 b'i' => self.update_flag(FLAG_CASEI, neg),
680 b'm' => self.update_flag(FLAG_MULTI, neg),
681 b's' => self.update_flag(FLAG_DOTNL, neg),
682 b'U' => self.update_flag(FLAG_SWAP_GREED, neg),
683 b'x' => self.update_flag(FLAG_IGNORE_SPACE, neg),
684 b'u' => {
685 if neg {
686 return Err(Error::ParseError(ix, ParseError::NonUnicodeUnsupported));
687 }
688 }
689 b'-' => {
690 if neg {
691 return Err(unknown_flag(self.re, start, ix));
692 }
693 neg = true;
694 }
695 b')' => {
696 if ix == start || neg && ix == start + 1 {
697 return Err(unknown_flag(self.re, start, ix));
698 }
699 return Ok((ix + 1, Expr::Empty));
700 }
701 b':' => {
702 if neg && ix == start + 1 {
703 return Err(unknown_flag(self.re, start, ix));
704 }
705 ix += 1;
706 let (ix, child) = self.parse_re(ix, depth)?;
707 if ix == self.re.len() {
708 return Err(Error::ParseError(ix, ParseError::UnclosedOpenParen));
709 } else if self.re.as_bytes()[ix] != b')' {
710 return Err(Error::ParseError(
711 ix,
712 ParseError::GeneralParseError("expected close paren".to_string()),
713 ));
714 };
715 self.flags = oldflags;
716 return Ok((ix + 1, child));
717 }
718 _ => return Err(unknown_flag(self.re, start, ix)),
719 }
720 ix += 1;
721 }
722 }
723
724 fn parse_conditional(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
726 if ix >= self.re.len() {
727 return Err(Error::ParseError(ix, ParseError::UnclosedOpenParen));
728 }
729 let bytes = self.re.as_bytes();
730 let b = bytes[ix];
732 let (mut next, condition) = if is_digit(b) {
733 self.parse_numbered_backref(ix)?
734 } else if b == b'\'' {
735 self.parse_named_backref(ix, "'", "'", true)?
736 } else if b == b'<' {
737 self.parse_named_backref(ix, "<", ">", true)?
738 } else {
739 self.parse_re(ix, depth)?
740 };
741 next = self.check_for_close_paren(next)?;
742 let (end, child) = self.parse_re(next, depth)?;
743 if end == next {
744 if let Expr::Backref(group) = condition {
746 let after = self.check_for_close_paren(end)?;
747 return Ok((after, Expr::BackrefExistsCondition(group)));
748 } else {
749 return Err(Error::ParseError(
750 end,
751 ParseError::GeneralParseError(
752 "expected conditional to be a backreference or at least an expression for when the condition is true".to_string()
753 )
754 ));
755 }
756 }
757 let if_true: Expr;
758 let mut if_false: Expr = Expr::Empty;
759 if let Expr::Alt(mut alternatives) = child {
760 if_true = alternatives.remove(0);
762 if alternatives.len() == 1 {
764 if_false = alternatives.pop().expect("expected 2 alternatives");
765 } else {
766 if_false = Expr::Alt(alternatives);
768 }
769 } else {
770 if_true = child;
772 }
773 let inner_condition = if let Expr::Backref(group) = condition {
774 Expr::BackrefExistsCondition(group)
775 } else {
776 condition
777 };
778
779 let after = self.check_for_close_paren(end)?;
780 Ok((
781 after,
782 if if_true == Expr::Empty && if_false == Expr::Empty {
783 inner_condition
784 } else {
785 Expr::Conditional {
786 condition: Box::new(inner_condition),
787 true_branch: Box::new(if_true),
788 false_branch: Box::new(if_false),
789 }
790 },
791 ))
792 }
793
794 fn flag(&self, flag: u32) -> bool {
795 (self.flags & flag) != 0
796 }
797
798 fn update_flag(&mut self, flag: u32, neg: bool) {
799 if neg {
800 self.flags &= !flag;
801 } else {
802 self.flags |= flag;
803 }
804 }
805
806 fn optional_whitespace(&self, mut ix: usize) -> Result<usize> {
807 let bytes = self.re.as_bytes();
808 loop {
809 if ix == self.re.len() {
810 return Ok(ix);
811 }
812 match bytes[ix] {
813 b'#' if self.flag(FLAG_IGNORE_SPACE) => {
814 match bytes[ix..].iter().position(|&c| c == b'\n') {
815 Some(x) => ix += x + 1,
816 None => return Ok(self.re.len()),
817 }
818 }
819 b' ' | b'\r' | b'\n' | b'\t' if self.flag(FLAG_IGNORE_SPACE) => ix += 1,
820 b'(' if bytes[ix..].starts_with(b"(?#") => {
821 ix += 3;
822 loop {
823 if ix >= self.re.len() {
824 return Err(Error::ParseError(ix, ParseError::UnclosedOpenParen));
825 }
826 match bytes[ix] {
827 b')' => {
828 ix += 1;
829 break;
830 }
831 b'\\' => ix += 2,
832 _ => ix += 1,
833 }
834 }
835 }
836 _ => return Ok(ix),
837 }
838 }
839 }
840}
841
842pub(crate) fn parse_decimal(s: &str, ix: usize) -> Option<(usize, usize)> {
844 let mut end = ix;
845 while end < s.len() && is_digit(s.as_bytes()[end]) {
846 end += 1;
847 }
848 usize::from_str_radix(&s[ix..end], 10)
849 .ok()
850 .map(|val| (end, val))
851}
852
853pub(crate) fn parse_id<'a>(
857 s: &'a str,
858 open: &'_ str,
859 close: &'_ str,
860 allow_relative: bool,
861) -> Option<(&'a str, usize)> {
862 debug_assert!(!close.starts_with(is_id_char));
863
864 if !s.starts_with(open) {
865 return None;
866 }
867
868 let id_start = open.len();
869 let mut iter = s[id_start..].char_indices().peekable();
870 let after_id = if allow_relative && iter.next_if(|(_, ch)| *ch == '-').is_some() {
871 iter.find(|(_, ch)| !ch.is_ascii_digit())
872 } else {
873 iter.find(|(_, ch)| !is_id_char(*ch))
874 };
875 let id_len = match after_id.map(|(i, _)| i) {
876 Some(id_len) if s[id_start + id_len..].starts_with(close) => Some(id_len),
877 None if close.is_empty() => Some(s.len()),
878 _ => None,
879 };
880 match id_len {
881 Some(0) => None,
882 Some(id_len) => {
883 let id_end = id_start + id_len;
884 Some((&s[id_start..id_end], id_end + close.len()))
885 }
886 _ => None,
887 }
888}
889
890fn is_id_char(c: char) -> bool {
891 c.is_alphanumeric() || c == '_'
892}
893
894fn is_digit(b: u8) -> bool {
895 b'0' <= b && b <= b'9'
896}
897
898fn is_hex_digit(b: u8) -> bool {
899 is_digit(b) || (b'a' <= (b | 32) && (b | 32) <= b'f')
900}
901
902pub(crate) fn make_literal(s: &str) -> Expr {
903 Expr::Literal {
904 val: String::from(s),
905 casei: false,
906 }
907}
908
909#[cfg(test)]
910mod tests {
911 use alloc::boxed::Box;
912 use alloc::string::{String, ToString};
913 use alloc::{format, vec};
914
915 use crate::parse::{make_literal, parse_id};
916 use crate::LookAround::*;
917 use crate::{Assertion, Expr};
918
919 fn p(s: &str) -> Expr {
920 Expr::parse_tree(s).unwrap().expr
921 }
922
923 #[cfg_attr(feature = "track_caller", track_caller)]
924 fn fail(s: &str) {
925 assert!(
926 Expr::parse_tree(s).is_err(),
927 "Expected parse error, but was: {:?}",
928 Expr::parse_tree(s)
929 );
930 }
931
932 #[cfg_attr(feature = "track_caller", track_caller)]
933 fn assert_error(re: &str, expected_error: &str) {
934 let result = Expr::parse_tree(re);
935 assert!(
936 result.is_err(),
937 "Expected parse error, but was: {:?}",
938 result
939 );
940 assert_eq!(&format!("{}", result.err().unwrap()), expected_error);
941 }
942
943 #[test]
944 fn empty() {
945 assert_eq!(p(""), Expr::Empty);
946 }
947
948 #[test]
949 fn any() {
950 assert_eq!(p("."), Expr::Any { newline: false });
951 assert_eq!(p("(?s:.)"), Expr::Any { newline: true });
952 }
953
954 #[test]
955 fn start_text() {
956 assert_eq!(p("^"), Expr::Assertion(Assertion::StartText));
957 }
958
959 #[test]
960 fn end_text() {
961 assert_eq!(p("$"), Expr::Assertion(Assertion::EndText));
962 }
963
964 #[test]
965 fn literal() {
966 assert_eq!(p("a"), make_literal("a"));
967 }
968
969 #[test]
970 fn literal_special() {
971 assert_eq!(p("}"), make_literal("}"));
972 assert_eq!(p("]"), make_literal("]"));
973 }
974
975 #[test]
976 fn parse_id_test() {
977 assert_eq!(parse_id("foo.", "", "", true), Some(("foo", 3)));
978 assert_eq!(parse_id("{foo}", "{", "}", true), Some(("foo", 5)));
979 assert_eq!(parse_id("{foo.", "{", "}", true), None);
980 assert_eq!(parse_id("{foo", "{", "}", true), None);
981 assert_eq!(parse_id("{}", "{", "}", true), None);
982 assert_eq!(parse_id("", "", "", true), None);
983 assert_eq!(parse_id("{-1}", "{", "}", true), Some(("-1", 4)));
984 assert_eq!(parse_id("{-1}", "{", "}", false), None);
985 assert_eq!(parse_id("{-a}", "{", "}", true), None);
986 }
987
988 #[test]
989 fn literal_unescaped_opening_curly() {
990 assert_eq!(p("{"), make_literal("{"));
992 assert_eq!(p("({)"), Expr::Group(Box::new(make_literal("{"),)));
993 assert_eq!(
994 p("a|{"),
995 Expr::Alt(vec![make_literal("a"), make_literal("{"),])
996 );
997 assert_eq!(
998 p("{{2}"),
999 Expr::Repeat {
1000 child: Box::new(make_literal("{")),
1001 lo: 2,
1002 hi: 2,
1003 greedy: true
1004 }
1005 );
1006 }
1007
1008 #[test]
1009 fn literal_escape() {
1010 assert_eq!(p("\\'"), make_literal("'"));
1011 assert_eq!(p("\\\""), make_literal("\""));
1012 assert_eq!(p("\\ "), make_literal(" "));
1013 assert_eq!(p("\\xA0"), make_literal("\u{A0}"));
1014 assert_eq!(p("\\x{1F4A9}"), make_literal("\u{1F4A9}"));
1015 assert_eq!(p("\\x{000000B7}"), make_literal("\u{B7}"));
1016 assert_eq!(p("\\u21D2"), make_literal("\u{21D2}"));
1017 assert_eq!(p("\\u{21D2}"), make_literal("\u{21D2}"));
1018 assert_eq!(p("\\u21D2x"), p("\u{21D2}x"));
1019 assert_eq!(p("\\U0001F60A"), make_literal("\u{1F60A}"));
1020 assert_eq!(p("\\U{0001F60A}"), make_literal("\u{1F60A}"));
1021 }
1022
1023 #[test]
1024 fn hex_escape() {
1025 assert_eq!(
1026 p("\\h"),
1027 Expr::Delegate {
1028 inner: String::from("[0-9A-Fa-f]"),
1029 size: 1,
1030 casei: false
1031 }
1032 );
1033 assert_eq!(
1034 p("\\H"),
1035 Expr::Delegate {
1036 inner: String::from("[^0-9A-Fa-f]"),
1037 size: 1,
1038 casei: false
1039 }
1040 );
1041 }
1042
1043 #[test]
1044 fn invalid_escape() {
1045 assert_error(
1046 "\\",
1047 "Parsing error at position 0: Backslash without following character",
1048 );
1049 assert_error("\\q", "Parsing error at position 0: Invalid escape: \\q");
1050 assert_error("\\xAG", "Parsing error at position 2: Invalid hex escape");
1051 assert_error("\\xA", "Parsing error at position 2: Invalid hex escape");
1052 assert_error("\\x{}", "Parsing error at position 2: Invalid hex escape");
1053 assert_error("\\x{AG}", "Parsing error at position 2: Invalid hex escape");
1054 assert_error("\\x{42", "Parsing error at position 2: Invalid hex escape");
1055 assert_error(
1056 "\\x{D800}",
1057 "Parsing error at position 2: Invalid codepoint for hex or unicode escape",
1058 );
1059 assert_error(
1060 "\\x{110000}",
1061 "Parsing error at position 2: Invalid codepoint for hex or unicode escape",
1062 );
1063 assert_error("\\u123", "Parsing error at position 2: Invalid hex escape");
1064 assert_error("\\u123x", "Parsing error at position 2: Invalid hex escape");
1065 assert_error("\\u{}", "Parsing error at position 2: Invalid hex escape");
1066 assert_error(
1067 "\\U1234567",
1068 "Parsing error at position 2: Invalid hex escape",
1069 );
1070 assert_error("\\U{}", "Parsing error at position 2: Invalid hex escape");
1071 }
1072
1073 #[test]
1074 fn concat() {
1075 assert_eq!(
1076 p("ab"),
1077 Expr::Concat(vec![make_literal("a"), make_literal("b"),])
1078 );
1079 }
1080
1081 #[test]
1082 fn alt() {
1083 assert_eq!(
1084 p("a|b"),
1085 Expr::Alt(vec![make_literal("a"), make_literal("b"),])
1086 );
1087 }
1088
1089 #[test]
1090 fn group() {
1091 assert_eq!(p("(a)"), Expr::Group(Box::new(make_literal("a"),)));
1092 }
1093
1094 #[test]
1095 fn group_repeat() {
1096 assert_eq!(
1097 p("(a){2}"),
1098 Expr::Repeat {
1099 child: Box::new(Expr::Group(Box::new(make_literal("a")))),
1100 lo: 2,
1101 hi: 2,
1102 greedy: true
1103 }
1104 );
1105 }
1106
1107 #[test]
1108 fn repeat() {
1109 assert_eq!(
1110 p("a{2,42}"),
1111 Expr::Repeat {
1112 child: Box::new(make_literal("a")),
1113 lo: 2,
1114 hi: 42,
1115 greedy: true
1116 }
1117 );
1118 assert_eq!(
1119 p("a{2,}"),
1120 Expr::Repeat {
1121 child: Box::new(make_literal("a")),
1122 lo: 2,
1123 hi: usize::MAX,
1124 greedy: true
1125 }
1126 );
1127 assert_eq!(
1128 p("a{2}"),
1129 Expr::Repeat {
1130 child: Box::new(make_literal("a")),
1131 lo: 2,
1132 hi: 2,
1133 greedy: true
1134 }
1135 );
1136 assert_eq!(
1137 p("a{,2}"),
1138 Expr::Repeat {
1139 child: Box::new(make_literal("a")),
1140 lo: 0,
1141 hi: 2,
1142 greedy: true
1143 }
1144 );
1145
1146 assert_eq!(
1147 p("a{2,42}?"),
1148 Expr::Repeat {
1149 child: Box::new(make_literal("a")),
1150 lo: 2,
1151 hi: 42,
1152 greedy: false
1153 }
1154 );
1155 assert_eq!(
1156 p("a{2,}?"),
1157 Expr::Repeat {
1158 child: Box::new(make_literal("a")),
1159 lo: 2,
1160 hi: usize::MAX,
1161 greedy: false
1162 }
1163 );
1164 assert_eq!(
1165 p("a{2}?"),
1166 Expr::Repeat {
1167 child: Box::new(make_literal("a")),
1168 lo: 2,
1169 hi: 2,
1170 greedy: false
1171 }
1172 );
1173 assert_eq!(
1174 p("a{,2}?"),
1175 Expr::Repeat {
1176 child: Box::new(make_literal("a")),
1177 lo: 0,
1178 hi: 2,
1179 greedy: false
1180 }
1181 );
1182 }
1183
1184 #[test]
1185 fn invalid_repeat() {
1186 assert_eq!(
1188 p("a{"),
1189 Expr::Concat(vec![make_literal("a"), make_literal("{"),])
1190 );
1191 assert_eq!(
1192 p("a{6"),
1193 Expr::Concat(vec![
1194 make_literal("a"),
1195 make_literal("{"),
1196 make_literal("6"),
1197 ])
1198 );
1199 assert_eq!(
1200 p("a{6,"),
1201 Expr::Concat(vec![
1202 make_literal("a"),
1203 make_literal("{"),
1204 make_literal("6"),
1205 make_literal(","),
1206 ])
1207 );
1208 }
1209
1210 #[test]
1211 fn delegate_zero() {
1212 assert_eq!(p("\\b"), Expr::Assertion(Assertion::WordBoundary),);
1213 assert_eq!(p("\\B"), Expr::Assertion(Assertion::NotWordBoundary),);
1214 }
1215
1216 #[test]
1217 fn delegate_named_group() {
1218 assert_eq!(
1219 p("\\p{Greek}"),
1220 Expr::Delegate {
1221 inner: String::from("\\p{Greek}"),
1222 size: 1,
1223 casei: false
1224 }
1225 );
1226 assert_eq!(
1227 p("\\pL"),
1228 Expr::Delegate {
1229 inner: String::from("\\pL"),
1230 size: 1,
1231 casei: false
1232 }
1233 );
1234 assert_eq!(
1235 p("\\P{Greek}"),
1236 Expr::Delegate {
1237 inner: String::from("\\P{Greek}"),
1238 size: 1,
1239 casei: false
1240 }
1241 );
1242 assert_eq!(
1243 p("\\PL"),
1244 Expr::Delegate {
1245 inner: String::from("\\PL"),
1246 size: 1,
1247 casei: false
1248 }
1249 );
1250 assert_eq!(
1251 p("(?i)\\p{Ll}"),
1252 Expr::Delegate {
1253 inner: String::from("\\p{Ll}"),
1254 size: 1,
1255 casei: true
1256 }
1257 );
1258 }
1259
1260 #[test]
1261 fn backref() {
1262 assert_eq!(
1263 p("(.)\\1"),
1264 Expr::Concat(vec![
1265 Expr::Group(Box::new(Expr::Any { newline: false })),
1266 Expr::Backref(1),
1267 ])
1268 );
1269 }
1270
1271 #[test]
1272 fn named_backref() {
1273 assert_eq!(
1274 p("(?<i>.)\\k<i>"),
1275 Expr::Concat(vec![
1276 Expr::Group(Box::new(Expr::Any { newline: false })),
1277 Expr::Backref(1),
1278 ])
1279 );
1280 }
1281
1282 #[test]
1283 fn relative_backref() {
1284 assert_eq!(
1285 p("(a)(.)\\k<-1>"),
1286 Expr::Concat(vec![
1287 Expr::Group(Box::new(make_literal("a"))),
1288 Expr::Group(Box::new(Expr::Any { newline: false })),
1289 Expr::Backref(2)
1290 ])
1291 );
1292 fail("(?P<->.)");
1293 fail("(.)(?P=-)")
1294 }
1295
1296 #[test]
1297 fn lookaround() {
1298 assert_eq!(
1299 p("(?=a)"),
1300 Expr::LookAround(Box::new(make_literal("a")), LookAhead)
1301 );
1302 assert_eq!(
1303 p("(?!a)"),
1304 Expr::LookAround(Box::new(make_literal("a")), LookAheadNeg)
1305 );
1306 assert_eq!(
1307 p("(?<=a)"),
1308 Expr::LookAround(Box::new(make_literal("a")), LookBehind)
1309 );
1310 assert_eq!(
1311 p("(?<!a)"),
1312 Expr::LookAround(Box::new(make_literal("a")), LookBehindNeg)
1313 );
1314 }
1315
1316 #[test]
1317 fn shy_group() {
1318 assert_eq!(
1319 p("(?:ab)c"),
1320 Expr::Concat(vec![
1321 Expr::Concat(vec![make_literal("a"), make_literal("b"),]),
1322 make_literal("c"),
1323 ])
1324 );
1325 }
1326
1327 #[test]
1328 fn flag_state() {
1329 assert_eq!(p("(?s)."), Expr::Any { newline: true });
1330 assert_eq!(p("(?s:(?-s:.))"), Expr::Any { newline: false });
1331 assert_eq!(
1332 p("(?s:.)."),
1333 Expr::Concat(vec![
1334 Expr::Any { newline: true },
1335 Expr::Any { newline: false },
1336 ])
1337 );
1338 assert_eq!(
1339 p("(?:(?s).)."),
1340 Expr::Concat(vec![
1341 Expr::Any { newline: true },
1342 Expr::Any { newline: false },
1343 ])
1344 );
1345 }
1346
1347 #[test]
1348 fn flag_multiline() {
1349 assert_eq!(p("^"), Expr::Assertion(Assertion::StartText));
1350 assert_eq!(
1351 p("(?m:^)"),
1352 Expr::Assertion(Assertion::StartLine { crlf: false })
1353 );
1354 assert_eq!(p("$"), Expr::Assertion(Assertion::EndText));
1355 assert_eq!(
1356 p("(?m:$)"),
1357 Expr::Assertion(Assertion::EndLine { crlf: false })
1358 );
1359 }
1360
1361 #[test]
1362 fn flag_swap_greed() {
1363 assert_eq!(p("a*"), p("(?U:a*?)"));
1364 assert_eq!(p("a*?"), p("(?U:a*)"));
1365 }
1366
1367 #[test]
1368 fn invalid_flags() {
1369 assert!(Expr::parse_tree("(?").is_err());
1370 assert!(Expr::parse_tree("(?)").is_err());
1371 assert!(Expr::parse_tree("(?-)").is_err());
1372 assert!(Expr::parse_tree("(?-:a)").is_err());
1373 assert!(Expr::parse_tree("(?q:a)").is_err());
1374 }
1375
1376 #[test]
1377 fn lifetime() {
1378 assert_eq!(
1379 p("\\'[a-zA-Z_][a-zA-Z0-9_]*(?!\\')\\b"),
1380 Expr::Concat(vec![
1381 make_literal("'"),
1382 Expr::Delegate {
1383 inner: String::from("[a-zA-Z_]"),
1384 size: 1,
1385 casei: false
1386 },
1387 Expr::Repeat {
1388 child: Box::new(Expr::Delegate {
1389 inner: String::from("[a-zA-Z0-9_]"),
1390 size: 1,
1391 casei: false
1392 }),
1393 lo: 0,
1394 hi: usize::MAX,
1395 greedy: true
1396 },
1397 Expr::LookAround(Box::new(make_literal("'")), LookAheadNeg),
1398 Expr::Assertion(Assertion::WordBoundary),
1399 ])
1400 );
1401 }
1402
1403 #[test]
1404 fn ignore_whitespace() {
1405 assert_eq!(p("(?x: )"), p(""));
1406 assert_eq!(p("(?x) | "), p("|"));
1407 assert_eq!(p("(?x: a )"), p("a"));
1408 assert_eq!(p("(?x: a # ) bobby tables\n b )"), p("ab"));
1409 assert_eq!(p("(?x: a | b )"), p("a|b"));
1410 assert_eq!(p("(?x: ( a b ) )"), p("(ab)"));
1411 assert_eq!(p("(?x: a + )"), p("a+"));
1412 assert_eq!(p("(?x: a {2} )"), p("a{2}"));
1413 assert_eq!(p("(?x: a { 2 } )"), p("a{2}"));
1414 assert_eq!(p("(?x: a { 2 , } )"), p("a{2,}"));
1415 assert_eq!(p("(?x: a { , 2 } )"), p("a{,2}"));
1416 assert_eq!(p("(?x: a { 2 , 3 } )"), p("a{2,3}"));
1417 assert_eq!(p("(?x: a { 2 , 3 } ? )"), p("a{2,3}?"));
1418 assert_eq!(p("(?x: ( ? i : . ) )"), p("(?i:.)"));
1419 assert_eq!(p("(?x: ( ?= a ) )"), p("(?=a)"));
1420 assert_eq!(p("(?x: [ ] )"), p("[ ]"));
1421 assert_eq!(p("(?x: [ ^] )"), p("[ ^]"));
1422 assert_eq!(p("(?x: [a - z] )"), p("[a - z]"));
1423 assert_eq!(p("(?x: [ \\] \\\\] )"), p("[ \\] \\\\]"));
1424 assert_eq!(p("(?x: a\\ b )"), p("a b"));
1425 assert_eq!(p("(?x: a (?-x:#) b )"), p("a#b"));
1426 }
1427
1428 #[test]
1429 fn comments() {
1430 assert_eq!(p(r"ab(?# comment)"), p("ab"));
1431 assert_eq!(p(r"ab(?#)"), p("ab"));
1432 assert_eq!(p(r"(?# comment 1)(?# comment 2)ab"), p("ab"));
1433 assert_eq!(p(r"ab(?# comment \))c"), p("abc"));
1434 assert_eq!(p(r"ab(?# comment \\)c"), p("abc"));
1435 assert_eq!(p(r"ab(?# comment ()c"), p("abc"));
1436 assert_eq!(p(r"ab(?# comment)*"), p("ab*"));
1437 fail(r"ab(?# comment");
1438 fail(r"ab(?# comment\");
1439 }
1440
1441 #[test]
1442 fn atomic_group() {
1443 assert_eq!(p("(?>a)"), Expr::AtomicGroup(Box::new(make_literal("a"))));
1444 }
1445
1446 #[test]
1447 fn possessive() {
1448 assert_eq!(
1449 p("a++"),
1450 Expr::AtomicGroup(Box::new(Expr::Repeat {
1451 child: Box::new(make_literal("a")),
1452 lo: 1,
1453 hi: usize::MAX,
1454 greedy: true
1455 }))
1456 );
1457 assert_eq!(
1458 p("a*+"),
1459 Expr::AtomicGroup(Box::new(Expr::Repeat {
1460 child: Box::new(make_literal("a")),
1461 lo: 0,
1462 hi: usize::MAX,
1463 greedy: true
1464 }))
1465 );
1466 assert_eq!(
1467 p("a?+"),
1468 Expr::AtomicGroup(Box::new(Expr::Repeat {
1469 child: Box::new(make_literal("a")),
1470 lo: 0,
1471 hi: 1,
1472 greedy: true
1473 }))
1474 );
1475 }
1476
1477 #[test]
1478 fn invalid_backref() {
1479 fail(".\\12345678"); fail(".\\c"); }
1483
1484 #[test]
1485 fn invalid_group_name_backref() {
1486 assert_error(
1487 "\\k<id>(?<id>.)",
1488 "Parsing error at position 2: Invalid group name in back reference: id",
1489 );
1490 }
1491
1492 #[test]
1493 fn named_backref_only() {
1494 assert_error("(?<id>.)\\1", "Error compiling regex: Numbered backref/call not allowed because named group was used, use a named backref instead");
1495 assert_error("(a)\\1(?<name>b)", "Error compiling regex: Numbered backref/call not allowed because named group was used, use a named backref instead");
1496 }
1497
1498 #[test]
1499 fn invalid_group_name() {
1500 assert_error(
1501 "(?<id)",
1502 "Parsing error at position 1: Could not parse group name",
1503 );
1504 assert_error(
1505 "(?<>)",
1506 "Parsing error at position 1: Could not parse group name",
1507 );
1508 assert_error(
1509 "(?<#>)",
1510 "Parsing error at position 1: Could not parse group name",
1511 );
1512 assert_error(
1513 "\\kxxx<id>",
1514 "Parsing error at position 2: Could not parse group name",
1515 );
1516 assert_error(
1518 "\\k<id-2>",
1519 "Parsing error at position 2: Could not parse group name",
1520 );
1521 assert_error(
1522 "\\k<-id>",
1523 "Parsing error at position 2: Could not parse group name",
1524 );
1525 }
1526
1527 #[test]
1528 fn unknown_flag() {
1529 assert_error(
1530 "(?-:a)",
1531 "Parsing error at position 2: Unknown group flag: (?-:",
1532 );
1533 assert_error(
1534 "(?)",
1535 "Parsing error at position 2: Unknown group flag: (?)",
1536 );
1537 assert_error(
1538 "(?--)",
1539 "Parsing error at position 2: Unknown group flag: (?--",
1540 );
1541 assert_error(
1543 "(?\u{1F60A})",
1544 "Parsing error at position 2: Unknown group flag: (?\u{1F60A}",
1545 );
1546 }
1547
1548 #[test]
1549 fn no_quantifiers_on_lookarounds() {
1550 assert_error(
1551 "(?=hello)+",
1552 "Parsing error at position 9: Target of repeat operator is invalid",
1553 );
1554 assert_error(
1555 "(?<!hello)*",
1556 "Parsing error at position 10: Target of repeat operator is invalid",
1557 );
1558 assert_error(
1559 "(?<=hello){2,3}",
1560 "Parsing error at position 14: Target of repeat operator is invalid",
1561 );
1562 assert_error(
1563 "(?!hello)?",
1564 "Parsing error at position 9: Target of repeat operator is invalid",
1565 );
1566 assert_error(
1567 "^?",
1568 "Parsing error at position 1: Target of repeat operator is invalid",
1569 );
1570 assert_error(
1571 "${2}",
1572 "Parsing error at position 3: Target of repeat operator is invalid",
1573 );
1574 assert_error(
1575 "(?m)^?",
1576 "Parsing error at position 5: Target of repeat operator is invalid",
1577 );
1578 assert_error(
1579 "(?m)${2}",
1580 "Parsing error at position 7: Target of repeat operator is invalid",
1581 );
1582 assert_error(
1583 "(a|b|?)",
1584 "Parsing error at position 5: Target of repeat operator is invalid",
1585 );
1586 }
1587
1588 #[test]
1589 fn keepout() {
1590 assert_eq!(
1591 p("a\\Kb"),
1592 Expr::Concat(vec![make_literal("a"), Expr::KeepOut, make_literal("b"),])
1593 );
1594 }
1595
1596 #[test]
1597 fn backref_exists_condition() {
1598 assert_eq!(
1599 p("(h)?(?(1))"),
1600 Expr::Concat(vec![
1601 Expr::Repeat {
1602 child: Box::new(Expr::Group(Box::new(make_literal("h")))),
1603 lo: 0,
1604 hi: 1,
1605 greedy: true
1606 },
1607 Expr::BackrefExistsCondition(1)
1608 ])
1609 );
1610 assert_eq!(
1611 p("(?<h>h)?(?('h'))"),
1612 Expr::Concat(vec![
1613 Expr::Repeat {
1614 child: Box::new(Expr::Group(Box::new(make_literal("h")))),
1615 lo: 0,
1616 hi: 1,
1617 greedy: true
1618 },
1619 Expr::BackrefExistsCondition(1)
1620 ])
1621 );
1622 }
1623
1624 #[test]
1625 fn conditional_non_backref_validity_check_without_branches() {
1626 assert_error(
1627 "(?(foo))",
1628 "Parsing error at position 7: General parsing error: expected conditional to be a backreference or at least an expression for when the condition is true",
1629 );
1630 }
1631
1632 #[test]
1633 fn conditional_invalid_target_of_repeat_operator() {
1634 assert_error(
1635 r"(?(?=\d)\w|!)",
1636 "Parsing error at position 3: Target of repeat operator is invalid",
1637 );
1638 }
1639
1640 #[test]
1641 fn backref_condition_with_one_two_or_three_branches() {
1642 assert_eq!(
1643 p("(h)?(?(1)i|x)"),
1644 Expr::Concat(vec![
1645 Expr::Repeat {
1646 child: Box::new(Expr::Group(Box::new(make_literal("h")))),
1647 lo: 0,
1648 hi: 1,
1649 greedy: true
1650 },
1651 Expr::Conditional {
1652 condition: Box::new(Expr::BackrefExistsCondition(1)),
1653 true_branch: Box::new(make_literal("i")),
1654 false_branch: Box::new(make_literal("x")),
1655 },
1656 ])
1657 );
1658
1659 assert_eq!(
1660 p("(h)?(?(1)i)"),
1661 Expr::Concat(vec![
1662 Expr::Repeat {
1663 child: Box::new(Expr::Group(Box::new(make_literal("h")))),
1664 lo: 0,
1665 hi: 1,
1666 greedy: true
1667 },
1668 Expr::Conditional {
1669 condition: Box::new(Expr::BackrefExistsCondition(1)),
1670 true_branch: Box::new(make_literal("i")),
1671 false_branch: Box::new(Expr::Empty),
1672 },
1673 ])
1674 );
1675
1676 assert_eq!(
1677 p("(h)?(?(1)ii|xy|z)"),
1678 Expr::Concat(vec![
1679 Expr::Repeat {
1680 child: Box::new(Expr::Group(Box::new(make_literal("h")))),
1681 lo: 0,
1682 hi: 1,
1683 greedy: true
1684 },
1685 Expr::Conditional {
1686 condition: Box::new(Expr::BackrefExistsCondition(1)),
1687 true_branch: Box::new(Expr::Concat(
1688 vec![make_literal("i"), make_literal("i"),]
1689 )),
1690 false_branch: Box::new(Expr::Alt(vec![
1691 Expr::Concat(vec![make_literal("x"), make_literal("y"),]),
1692 make_literal("z"),
1693 ])),
1694 },
1695 ])
1696 );
1697
1698 assert_eq!(
1699 p("(?<cap>h)?(?(<cap>)ii|xy|z)"),
1700 Expr::Concat(vec![
1701 Expr::Repeat {
1702 child: Box::new(Expr::Group(Box::new(make_literal("h")))),
1703 lo: 0,
1704 hi: 1,
1705 greedy: true
1706 },
1707 Expr::Conditional {
1708 condition: Box::new(Expr::BackrefExistsCondition(1)),
1709 true_branch: Box::new(Expr::Concat(
1710 vec![make_literal("i"), make_literal("i"),]
1711 )),
1712 false_branch: Box::new(Expr::Alt(vec![
1713 Expr::Concat(vec![make_literal("x"), make_literal("y"),]),
1714 make_literal("z"),
1715 ])),
1716 },
1717 ])
1718 );
1719 }
1720
1721 #[test]
1722 fn conditional() {
1723 assert_eq!(
1724 p("((?(a)b|c))(\\1)"),
1725 Expr::Concat(vec![
1726 Expr::Group(Box::new(Expr::Conditional {
1727 condition: Box::new(make_literal("a")),
1728 true_branch: Box::new(make_literal("b")),
1729 false_branch: Box::new(make_literal("c"))
1730 })),
1731 Expr::Group(Box::new(Expr::Backref(1)))
1732 ])
1733 );
1734
1735 assert_eq!(
1736 p(r"^(?(\d)abc|\d!)$"),
1737 Expr::Concat(vec![
1738 Expr::Assertion(Assertion::StartText),
1739 Expr::Conditional {
1740 condition: Box::new(Expr::Delegate {
1741 inner: "\\d".to_string(),
1742 size: 1,
1743 casei: false,
1744 }),
1745 true_branch: Box::new(Expr::Concat(vec![
1746 make_literal("a"),
1747 make_literal("b"),
1748 make_literal("c"),
1749 ])),
1750 false_branch: Box::new(Expr::Concat(vec![
1751 Expr::Delegate {
1752 inner: "\\d".to_string(),
1753 size: 1,
1754 casei: false,
1755 },
1756 make_literal("!"),
1757 ])),
1758 },
1759 Expr::Assertion(Assertion::EndText),
1760 ])
1761 );
1762
1763 assert_eq!(
1764 p(r"(?((?=\d))\w|!)"),
1765 Expr::Conditional {
1766 condition: Box::new(Expr::LookAround(
1767 Box::new(Expr::Delegate {
1768 inner: "\\d".to_string(),
1769 size: 1,
1770 casei: false
1771 }),
1772 LookAhead
1773 )),
1774 true_branch: Box::new(Expr::Delegate {
1775 inner: "\\w".to_string(),
1776 size: 1,
1777 casei: false,
1778 }),
1779 false_branch: Box::new(make_literal("!")),
1780 },
1781 );
1782
1783 assert_eq!(
1784 p(r"(?((ab))c|d)"),
1785 Expr::Conditional {
1786 condition: Box::new(Expr::Group(Box::new(Expr::Concat(vec![
1787 make_literal("a"),
1788 make_literal("b"),
1789 ]),))),
1790 true_branch: Box::new(make_literal("c")),
1791 false_branch: Box::new(make_literal("d")),
1792 },
1793 );
1794 }
1795
1796 #[test]
1798 fn fuzz_1() {
1799 p(r"\ä");
1800 }
1801
1802 #[test]
1803 fn fuzz_2() {
1804 p(r"\pä");
1805 }
1806
1807 #[test]
1808 fn fuzz_3() {
1809 fail(r"(?()^");
1810 fail(r#"!w(?()\"Kuz>"#);
1811 }
1812
1813 #[test]
1814 fn fuzz_4() {
1815 fail(r"\u{2}(?(2)");
1816 }
1817}