fancy_regex/
parse.rs

1// Copyright 2016 The Fancy Regex Authors.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21//! A regex parser yielding an AST.
22
23use alloc::boxed::Box;
24use alloc::string::{String, ToString};
25use alloc::vec::Vec;
26use alloc::{format, vec};
27
28use bit_set::BitSet;
29use core::convert::TryInto;
30use core::usize;
31use regex_syntax::escape_into;
32
33use crate::{codepoint_len, CompileError, Error, Expr, ParseError, Result, MAX_RECURSION};
34use crate::{Assertion, LookAround::*};
35
36const FLAG_CASEI: u32 = 1;
37const FLAG_MULTI: u32 = 1 << 1;
38const FLAG_DOTNL: u32 = 1 << 2;
39const FLAG_SWAP_GREED: u32 = 1 << 3;
40const FLAG_IGNORE_SPACE: u32 = 1 << 4;
41const FLAG_UNICODE: u32 = 1 << 5;
42
43#[cfg(not(feature = "std"))]
44pub(crate) type NamedGroups = alloc::collections::BTreeMap<String, usize>;
45#[cfg(feature = "std")]
46pub(crate) type NamedGroups = std::collections::HashMap<String, usize>;
47
48#[derive(Debug)]
49pub struct ExprTree {
50    pub expr: Expr,
51    pub backrefs: BitSet,
52    pub named_groups: NamedGroups,
53}
54
55#[derive(Debug)]
56pub(crate) struct Parser<'a> {
57    re: &'a str, // source
58    backrefs: BitSet,
59    flags: u32,
60    named_groups: NamedGroups,
61    numeric_backrefs: bool,
62    curr_group: usize, // need to keep track of which group number we're parsing
63}
64
65impl<'a> Parser<'a> {
66    /// Parse the regex and return an expression (AST) and a bit set with the indexes of groups
67    /// that are referenced by backrefs.
68    pub(crate) fn parse(re: &str) -> Result<ExprTree> {
69        let mut p = Parser::new(re);
70        let (ix, expr) = p.parse_re(0, 0)?;
71        if ix < re.len() {
72            return Err(Error::ParseError(
73                ix,
74                ParseError::GeneralParseError("end of string not reached".to_string()),
75            ));
76        }
77        Ok(ExprTree {
78            expr,
79            backrefs: Default::default(),
80            named_groups: p.named_groups,
81        })
82    }
83
84    fn new(re: &str) -> Parser<'_> {
85        Parser {
86            re,
87            backrefs: Default::default(),
88            named_groups: Default::default(),
89            numeric_backrefs: false,
90            flags: FLAG_UNICODE,
91            curr_group: 0,
92        }
93    }
94
95    fn parse_re(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
96        let (ix, child) = self.parse_branch(ix, depth)?;
97        let mut ix = self.optional_whitespace(ix)?;
98        if self.re[ix..].starts_with('|') {
99            let mut children = vec![child];
100            while self.re[ix..].starts_with('|') {
101                ix += 1;
102                let (next, child) = self.parse_branch(ix, depth)?;
103                children.push(child);
104                ix = self.optional_whitespace(next)?;
105            }
106            return Ok((ix, Expr::Alt(children)));
107        }
108        // can't have numeric backrefs and named backrefs
109        if self.numeric_backrefs && !self.named_groups.is_empty() {
110            return Err(Error::CompileError(CompileError::NamedBackrefOnly));
111        }
112        Ok((ix, child))
113    }
114
115    fn parse_branch(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
116        let mut children = Vec::new();
117        let mut ix = ix;
118        while ix < self.re.len() {
119            let (next, child) = self.parse_piece(ix, depth)?;
120            if next == ix {
121                break;
122            }
123            if child != Expr::Empty {
124                children.push(child);
125            }
126            ix = next;
127        }
128        match children.len() {
129            0 => Ok((ix, Expr::Empty)),
130            1 => Ok((ix, children.pop().unwrap())),
131            _ => Ok((ix, Expr::Concat(children))),
132        }
133    }
134
135    fn parse_piece(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
136        let (ix, child) = self.parse_atom(ix, depth)?;
137        let mut ix = self.optional_whitespace(ix)?;
138        if ix < self.re.len() {
139            // fail when child is empty?
140            let (lo, hi) = match self.re.as_bytes()[ix] {
141                b'?' => (0, 1),
142                b'*' => (0, usize::MAX),
143                b'+' => (1, usize::MAX),
144                b'{' => {
145                    match self.parse_repeat(ix) {
146                        Ok((next, lo, hi)) => {
147                            ix = next - 1;
148                            (lo, hi)
149                        }
150                        Err(_) => {
151                            // Invalid repeat syntax, which results in `{` being treated as a literal
152                            return Ok((ix, child));
153                        }
154                    }
155                }
156                _ => return Ok((ix, child)),
157            };
158            if !self.is_repeatable(&child) {
159                return Err(Error::ParseError(ix, ParseError::TargetNotRepeatable));
160            }
161            ix += 1;
162            ix = self.optional_whitespace(ix)?;
163            let mut greedy = true;
164            if ix < self.re.len() && self.re.as_bytes()[ix] == b'?' {
165                greedy = false;
166                ix += 1;
167            }
168            greedy ^= self.flag(FLAG_SWAP_GREED);
169            let mut node = Expr::Repeat {
170                child: Box::new(child),
171                lo,
172                hi,
173                greedy,
174            };
175            if ix < self.re.len() && self.re.as_bytes()[ix] == b'+' {
176                ix += 1;
177                node = Expr::AtomicGroup(Box::new(node));
178            }
179            return Ok((ix, node));
180        }
181        Ok((ix, child))
182    }
183
184    fn is_repeatable(&self, child: &Expr) -> bool {
185        match child {
186            Expr::LookAround(_, _) => false,
187            Expr::Empty => false,
188            Expr::Assertion(_) => false,
189            _ => true,
190        }
191    }
192
193    // ix, lo, hi
194    fn parse_repeat(&self, ix: usize) -> Result<(usize, usize, usize)> {
195        let ix = self.optional_whitespace(ix + 1)?; // skip opening '{'
196        let bytes = self.re.as_bytes();
197        if ix == self.re.len() {
198            return Err(Error::ParseError(ix, ParseError::InvalidRepeat));
199        }
200        let mut end = ix;
201        let lo = if bytes[ix] == b',' {
202            0
203        } else if let Some((next, lo)) = parse_decimal(self.re, ix) {
204            end = next;
205            lo
206        } else {
207            return Err(Error::ParseError(ix, ParseError::InvalidRepeat));
208        };
209        let ix = self.optional_whitespace(end)?; // past lo number
210        if ix == self.re.len() {
211            return Err(Error::ParseError(ix, ParseError::InvalidRepeat));
212        }
213        end = ix;
214        let hi = match bytes[ix] {
215            b'}' => lo,
216            b',' => {
217                end = self.optional_whitespace(ix + 1)?; // past ','
218                if let Some((next, hi)) = parse_decimal(self.re, end) {
219                    end = next;
220                    hi
221                } else {
222                    usize::MAX
223                }
224            }
225            _ => return Err(Error::ParseError(ix, ParseError::InvalidRepeat)),
226        };
227        let ix = self.optional_whitespace(end)?; // past hi number
228        if ix == self.re.len() || bytes[ix] != b'}' {
229            return Err(Error::ParseError(ix, ParseError::InvalidRepeat));
230        }
231        Ok((ix + 1, lo, hi))
232    }
233
234    fn parse_atom(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
235        let ix = self.optional_whitespace(ix)?;
236        if ix == self.re.len() {
237            return Ok((ix, Expr::Empty));
238        }
239        match self.re.as_bytes()[ix] {
240            b'.' => Ok((
241                ix + 1,
242                Expr::Any {
243                    newline: self.flag(FLAG_DOTNL),
244                },
245            )),
246            b'^' => Ok((
247                ix + 1,
248                if self.flag(FLAG_MULTI) {
249                    // TODO: support crlf flag
250                    Expr::Assertion(Assertion::StartLine { crlf: false })
251                } else {
252                    Expr::Assertion(Assertion::StartText)
253                },
254            )),
255            b'$' => Ok((
256                ix + 1,
257                if self.flag(FLAG_MULTI) {
258                    // TODO: support crlf flag
259                    Expr::Assertion(Assertion::EndLine { crlf: false })
260                } else {
261                    Expr::Assertion(Assertion::EndText)
262                },
263            )),
264            b'(' => self.parse_group(ix, depth),
265            b'\\' => {
266                let (next, expr) = self.parse_escape(ix, false)?;
267                if let Expr::Backref(group) = expr {
268                    self.backrefs.insert(group);
269                }
270                Ok((next, expr))
271            }
272            b'+' | b'*' | b'?' | b'|' | b')' => Ok((ix, Expr::Empty)),
273            b'[' => self.parse_class(ix),
274            b => {
275                // TODO: maybe want to match multiple codepoints?
276                let next = ix + codepoint_len(b);
277                Ok((
278                    next,
279                    Expr::Literal {
280                        val: String::from(&self.re[ix..next]),
281                        casei: self.flag(FLAG_CASEI),
282                    },
283                ))
284            }
285        }
286    }
287
288    fn parse_named_backref(
289        &self,
290        ix: usize,
291        open: &str,
292        close: &str,
293        allow_relative: bool,
294    ) -> Result<(usize, Expr)> {
295        if let Some((id, skip)) = parse_id(&self.re[ix..], open, close, allow_relative) {
296            let group = if let Some(group) = self.named_groups.get(id) {
297                Some(*group)
298            } else if let Ok(group) = id.parse::<isize>() {
299                group.try_into().map_or_else(
300                    |_| {
301                        // relative backref
302                        self.curr_group.checked_add_signed(group + 1)
303                    },
304                    |group| Some(group),
305                )
306            } else {
307                None
308            };
309            if let Some(group) = group {
310                return Ok((ix + skip, Expr::Backref(group)));
311            }
312            // here the name is parsed but it is invalid
313            Err(Error::ParseError(
314                ix,
315                ParseError::InvalidGroupNameBackref(id.to_string()),
316            ))
317        } else {
318            // in this case the name can't be parsed
319            Err(Error::ParseError(ix, ParseError::InvalidGroupName))
320        }
321    }
322
323    fn parse_numbered_backref(&mut self, ix: usize) -> Result<(usize, Expr)> {
324        if let Some((end, group)) = parse_decimal(self.re, ix) {
325            // protect BitSet against unreasonably large value
326            if group < self.re.len() / 2 {
327                self.numeric_backrefs = true;
328                return Ok((end, Expr::Backref(group)));
329            }
330        }
331        return Err(Error::ParseError(ix, ParseError::InvalidBackref));
332    }
333
334    // ix points to \ character
335    fn parse_escape(&mut self, ix: usize, in_class: bool) -> Result<(usize, Expr)> {
336        let bytes = self.re.as_bytes();
337        let Some(b) = bytes.get(ix + 1).copied() else {
338            return Err(Error::ParseError(ix, ParseError::TrailingBackslash));
339        };
340        let end = ix + 1 + codepoint_len(b);
341        Ok(if is_digit(b) {
342            return self.parse_numbered_backref(ix + 1);
343        } else if matches!(b, b'k') && !in_class {
344            // Named backref: \k<name>
345            if bytes.get(end) == Some(&b'\'') {
346                return self.parse_named_backref(end, "'", "'", true);
347            } else {
348                return self.parse_named_backref(end, "<", ">", true);
349            }
350        } else if b == b'A' && !in_class {
351            (end, Expr::Assertion(Assertion::StartText))
352        } else if b == b'z' && !in_class {
353            (end, Expr::Assertion(Assertion::EndText))
354        } else if b == b'b' && !in_class {
355            if bytes.get(end) == Some(&b'{') {
356                // Support for \b{...} is not implemented yet
357                return Err(Error::ParseError(
358                    ix,
359                    ParseError::InvalidEscape(format!("\\{}", &self.re[ix + 1..end])),
360                ));
361            }
362            (end, Expr::Assertion(Assertion::WordBoundary))
363        } else if b == b'B' && !in_class {
364            if bytes.get(end) == Some(&b'{') {
365                // Support for \b{...} is not implemented yet
366                return Err(Error::ParseError(
367                    ix,
368                    ParseError::InvalidEscape(format!("\\{}", &self.re[ix + 1..end])),
369                ));
370            }
371            (end, Expr::Assertion(Assertion::NotWordBoundary))
372        } else if b == b'<' && !in_class {
373            (end, Expr::Assertion(Assertion::LeftWordBoundary))
374        } else if b == b'>' && !in_class {
375            (end, Expr::Assertion(Assertion::RightWordBoundary))
376        } else if matches!(b | 32, b'd' | b's' | b'w') {
377            (
378                end,
379                Expr::Delegate {
380                    inner: String::from(&self.re[ix..end]),
381                    size: 1,
382                    casei: self.flag(FLAG_CASEI),
383                },
384            )
385        } else if (b | 32) == b'h' {
386            let s = if b == b'h' {
387                "[0-9A-Fa-f]"
388            } else {
389                "[^0-9A-Fa-f]"
390            };
391            (
392                end,
393                Expr::Delegate {
394                    inner: String::from(s),
395                    size: 1,
396                    casei: false,
397                },
398            )
399        } else if b == b'x' {
400            return self.parse_hex(end, 2);
401        } else if b == b'u' {
402            return self.parse_hex(end, 4);
403        } else if b == b'U' {
404            return self.parse_hex(end, 8);
405        } else if (b | 32) == b'p' && end != bytes.len() {
406            let mut end = end;
407            let b = bytes[end];
408            end += codepoint_len(b);
409            if b == b'{' {
410                loop {
411                    if end == self.re.len() {
412                        return Err(Error::ParseError(ix, ParseError::UnclosedUnicodeName));
413                    }
414                    let b = bytes[end];
415                    if b == b'}' {
416                        end += 1;
417                        break;
418                    }
419                    end += codepoint_len(b);
420                }
421            }
422            (
423                end,
424                Expr::Delegate {
425                    inner: String::from(&self.re[ix..end]),
426                    size: 1,
427                    casei: self.flag(FLAG_CASEI),
428                },
429            )
430        } else if b == b'K' && !in_class {
431            (end, Expr::KeepOut)
432        } else if b == b'G' && !in_class {
433            (end, Expr::ContinueFromPreviousMatchEnd)
434        } else {
435            // printable ASCII (including space, see issue #29)
436            (
437                end,
438                make_literal(match b {
439                    b'a' => "\x07", // BEL
440                    b'b' => "\x08", // BS
441                    b'f' => "\x0c", // FF
442                    b'n' => "\n",   // LF
443                    b'r' => "\r",   // CR
444                    b't' => "\t",   // TAB
445                    b'v' => "\x0b", // VT
446                    b'e' => "\x1b", // ESC
447                    b' ' => " ",
448                    b => {
449                        let s = &self.re[ix + 1..end];
450                        if b.is_ascii_alphabetic()
451                            && !matches!(
452                                b,
453                                b'k' | b'A' | b'z' | b'b' | b'B' | b'<' | b'>' | b'K' | b'G'
454                            )
455                        {
456                            return Err(Error::ParseError(
457                                ix,
458                                ParseError::InvalidEscape(format!("\\{}", s)),
459                            ));
460                        } else {
461                            s
462                        }
463                    }
464                }),
465            )
466        })
467    }
468
469    // ix points after '\x', eg to 'A0' or '{12345}', or after `\u` or `\U`
470    fn parse_hex(&self, ix: usize, digits: usize) -> Result<(usize, Expr)> {
471        if ix >= self.re.len() {
472            // Incomplete escape sequence
473            return Err(Error::ParseError(ix, ParseError::InvalidHex));
474        }
475        let bytes = self.re.as_bytes();
476        let b = bytes[ix];
477        let (end, s) = if ix + digits <= self.re.len()
478            && bytes[ix..ix + digits].iter().all(|&b| is_hex_digit(b))
479        {
480            let end = ix + digits;
481            (end, &self.re[ix..end])
482        } else if b == b'{' {
483            let starthex = ix + 1;
484            let mut endhex = starthex;
485            loop {
486                if endhex == self.re.len() {
487                    return Err(Error::ParseError(ix, ParseError::InvalidHex));
488                }
489                let b = bytes[endhex];
490                if endhex > starthex && b == b'}' {
491                    break;
492                }
493                if is_hex_digit(b) && endhex < starthex + 8 {
494                    endhex += 1;
495                } else {
496                    return Err(Error::ParseError(ix, ParseError::InvalidHex));
497                }
498            }
499            (endhex + 1, &self.re[starthex..endhex])
500        } else {
501            return Err(Error::ParseError(ix, ParseError::InvalidHex));
502        };
503        let codepoint = u32::from_str_radix(s, 16).unwrap();
504        if let Some(c) = char::from_u32(codepoint) {
505            let mut inner = String::with_capacity(4);
506            inner.push(c);
507            Ok((
508                end,
509                Expr::Literal {
510                    val: inner,
511                    casei: self.flag(FLAG_CASEI),
512                },
513            ))
514        } else {
515            Err(Error::ParseError(ix, ParseError::InvalidCodepointValue))
516        }
517    }
518
519    fn parse_class(&mut self, ix: usize) -> Result<(usize, Expr)> {
520        let bytes = self.re.as_bytes();
521        let mut ix = ix + 1; // skip opening '['
522        let mut class = String::new();
523        let mut nest = 1;
524        class.push('[');
525
526        // Negated character class
527        if bytes.get(ix) == Some(&b'^') {
528            class.push('^');
529            ix += 1;
530        }
531
532        // `]` does not have to be escaped after opening `[` or `[^`
533        if bytes.get(ix) == Some(&b']') {
534            class.push(']');
535            ix += 1;
536        }
537
538        loop {
539            if ix == self.re.len() {
540                return Err(Error::ParseError(ix, ParseError::InvalidClass));
541            }
542            let end = match bytes[ix] {
543                b'\\' => {
544                    // We support more escapes than regex, so parse it ourselves before delegating.
545                    let (end, expr) = self.parse_escape(ix, true)?;
546                    match expr {
547                        Expr::Literal { val, .. } => {
548                            debug_assert_eq!(val.chars().count(), 1);
549                            escape_into(&val, &mut class);
550                        }
551                        Expr::Delegate { inner, .. } => {
552                            class.push_str(&inner);
553                        }
554                        _ => {
555                            return Err(Error::ParseError(ix, ParseError::InvalidClass));
556                        }
557                    }
558                    end
559                }
560                b'[' => {
561                    nest += 1;
562                    class.push('[');
563                    ix + 1
564                }
565                b']' => {
566                    nest -= 1;
567                    class.push(']');
568                    if nest == 0 {
569                        break;
570                    }
571                    ix + 1
572                }
573                b => {
574                    let end = ix + codepoint_len(b);
575                    class.push_str(&self.re[ix..end]);
576                    end
577                }
578            };
579            ix = end;
580        }
581        let class = Expr::Delegate {
582            inner: class,
583            size: 1,
584            casei: self.flag(FLAG_CASEI),
585        };
586        let ix = ix + 1; // skip closing ']'
587        Ok((ix, class))
588    }
589
590    fn parse_group(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
591        let depth = depth + 1;
592        if depth >= MAX_RECURSION {
593            return Err(Error::ParseError(ix, ParseError::RecursionExceeded));
594        }
595        let ix = self.optional_whitespace(ix + 1)?;
596        let (la, skip) = if self.re[ix..].starts_with("?=") {
597            (Some(LookAhead), 2)
598        } else if self.re[ix..].starts_with("?!") {
599            (Some(LookAheadNeg), 2)
600        } else if self.re[ix..].starts_with("?<=") {
601            (Some(LookBehind), 3)
602        } else if self.re[ix..].starts_with("?<!") {
603            (Some(LookBehindNeg), 3)
604        } else if self.re[ix..].starts_with("?<") {
605            // Named capture group using Oniguruma syntax: (?<name>...)
606            self.curr_group += 1;
607            if let Some((id, skip)) = parse_id(&self.re[ix + 1..], "<", ">", false) {
608                self.named_groups.insert(id.to_string(), self.curr_group);
609                (None, skip + 1)
610            } else {
611                return Err(Error::ParseError(ix, ParseError::InvalidGroupName));
612            }
613        } else if self.re[ix..].starts_with("?P<") {
614            // Named capture group using Python syntax: (?P<name>...)
615            self.curr_group += 1; // this is a capture group
616            if let Some((id, skip)) = parse_id(&self.re[ix + 2..], "<", ">", false) {
617                self.named_groups.insert(id.to_string(), self.curr_group);
618                (None, skip + 2)
619            } else {
620                return Err(Error::ParseError(ix, ParseError::InvalidGroupName));
621            }
622        } else if self.re[ix..].starts_with("?P=") {
623            // Backref using Python syntax: (?P=name)
624            return self.parse_named_backref(ix + 3, "", ")", false);
625        } else if self.re[ix..].starts_with("?>") {
626            (None, 2)
627        } else if self.re[ix..].starts_with("?(") {
628            return self.parse_conditional(ix + 2, depth);
629        } else if self.re[ix..].starts_with('?') {
630            return self.parse_flags(ix, depth);
631        } else {
632            self.curr_group += 1; // this is a capture group
633            (None, 0)
634        };
635        let ix = ix + skip;
636        let (ix, child) = self.parse_re(ix, depth)?;
637        let ix = self.check_for_close_paren(ix)?;
638        let result = match (la, skip) {
639            (Some(la), _) => Expr::LookAround(Box::new(child), la),
640            (None, 2) => Expr::AtomicGroup(Box::new(child)),
641            _ => Expr::Group(Box::new(child)),
642        };
643        Ok((ix, result))
644    }
645
646    fn check_for_close_paren(&self, ix: usize) -> Result<usize> {
647        let ix = self.optional_whitespace(ix)?;
648        if ix == self.re.len() {
649            return Err(Error::ParseError(ix, ParseError::UnclosedOpenParen));
650        } else if self.re.as_bytes()[ix] != b')' {
651            return Err(Error::ParseError(
652                ix,
653                ParseError::GeneralParseError("expected close paren".to_string()),
654            ));
655        }
656        Ok(ix + 1)
657    }
658
659    // ix points to `?` in `(?`
660    fn parse_flags(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
661        let start = ix + 1;
662
663        fn unknown_flag(re: &str, start: usize, end: usize) -> Error {
664            let after_end = end + codepoint_len(re.as_bytes()[end]);
665            let s = format!("(?{}", &re[start..after_end]);
666            Error::ParseError(start, ParseError::UnknownFlag(s))
667        }
668
669        let mut ix = start;
670        let mut neg = false;
671        let oldflags = self.flags;
672        loop {
673            ix = self.optional_whitespace(ix)?;
674            if ix == self.re.len() {
675                return Err(Error::ParseError(ix, ParseError::UnclosedOpenParen));
676            }
677            let b = self.re.as_bytes()[ix];
678            match b {
679                b'i' => self.update_flag(FLAG_CASEI, neg),
680                b'm' => self.update_flag(FLAG_MULTI, neg),
681                b's' => self.update_flag(FLAG_DOTNL, neg),
682                b'U' => self.update_flag(FLAG_SWAP_GREED, neg),
683                b'x' => self.update_flag(FLAG_IGNORE_SPACE, neg),
684                b'u' => {
685                    if neg {
686                        return Err(Error::ParseError(ix, ParseError::NonUnicodeUnsupported));
687                    }
688                }
689                b'-' => {
690                    if neg {
691                        return Err(unknown_flag(self.re, start, ix));
692                    }
693                    neg = true;
694                }
695                b')' => {
696                    if ix == start || neg && ix == start + 1 {
697                        return Err(unknown_flag(self.re, start, ix));
698                    }
699                    return Ok((ix + 1, Expr::Empty));
700                }
701                b':' => {
702                    if neg && ix == start + 1 {
703                        return Err(unknown_flag(self.re, start, ix));
704                    }
705                    ix += 1;
706                    let (ix, child) = self.parse_re(ix, depth)?;
707                    if ix == self.re.len() {
708                        return Err(Error::ParseError(ix, ParseError::UnclosedOpenParen));
709                    } else if self.re.as_bytes()[ix] != b')' {
710                        return Err(Error::ParseError(
711                            ix,
712                            ParseError::GeneralParseError("expected close paren".to_string()),
713                        ));
714                    };
715                    self.flags = oldflags;
716                    return Ok((ix + 1, child));
717                }
718                _ => return Err(unknown_flag(self.re, start, ix)),
719            }
720            ix += 1;
721        }
722    }
723
724    // ix points to after the last ( in (?(
725    fn parse_conditional(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
726        if ix >= self.re.len() {
727            return Err(Error::ParseError(ix, ParseError::UnclosedOpenParen));
728        }
729        let bytes = self.re.as_bytes();
730        // get the character after the open paren
731        let b = bytes[ix];
732        let (mut next, condition) = if is_digit(b) {
733            self.parse_numbered_backref(ix)?
734        } else if b == b'\'' {
735            self.parse_named_backref(ix, "'", "'", true)?
736        } else if b == b'<' {
737            self.parse_named_backref(ix, "<", ">", true)?
738        } else {
739            self.parse_re(ix, depth)?
740        };
741        next = self.check_for_close_paren(next)?;
742        let (end, child) = self.parse_re(next, depth)?;
743        if end == next {
744            // Backreference validity checker
745            if let Expr::Backref(group) = condition {
746                let after = self.check_for_close_paren(end)?;
747                return Ok((after, Expr::BackrefExistsCondition(group)));
748            } else {
749                return Err(Error::ParseError(
750                    end,
751                    ParseError::GeneralParseError(
752                        "expected conditional to be a backreference or at least an expression for when the condition is true".to_string()
753                    )
754                ));
755            }
756        }
757        let if_true: Expr;
758        let mut if_false: Expr = Expr::Empty;
759        if let Expr::Alt(mut alternatives) = child {
760            // the truth branch will be the first alternative
761            if_true = alternatives.remove(0);
762            // if there is only one alternative left, take it out the Expr::Alt
763            if alternatives.len() == 1 {
764                if_false = alternatives.pop().expect("expected 2 alternatives");
765            } else {
766                // otherwise the remaining branches become the false branch
767                if_false = Expr::Alt(alternatives);
768            }
769        } else {
770            // there is only one branch - the truth branch. i.e. "if" without "else"
771            if_true = child;
772        }
773        let inner_condition = if let Expr::Backref(group) = condition {
774            Expr::BackrefExistsCondition(group)
775        } else {
776            condition
777        };
778
779        let after = self.check_for_close_paren(end)?;
780        Ok((
781            after,
782            if if_true == Expr::Empty && if_false == Expr::Empty {
783                inner_condition
784            } else {
785                Expr::Conditional {
786                    condition: Box::new(inner_condition),
787                    true_branch: Box::new(if_true),
788                    false_branch: Box::new(if_false),
789                }
790            },
791        ))
792    }
793
794    fn flag(&self, flag: u32) -> bool {
795        (self.flags & flag) != 0
796    }
797
798    fn update_flag(&mut self, flag: u32, neg: bool) {
799        if neg {
800            self.flags &= !flag;
801        } else {
802            self.flags |= flag;
803        }
804    }
805
806    fn optional_whitespace(&self, mut ix: usize) -> Result<usize> {
807        let bytes = self.re.as_bytes();
808        loop {
809            if ix == self.re.len() {
810                return Ok(ix);
811            }
812            match bytes[ix] {
813                b'#' if self.flag(FLAG_IGNORE_SPACE) => {
814                    match bytes[ix..].iter().position(|&c| c == b'\n') {
815                        Some(x) => ix += x + 1,
816                        None => return Ok(self.re.len()),
817                    }
818                }
819                b' ' | b'\r' | b'\n' | b'\t' if self.flag(FLAG_IGNORE_SPACE) => ix += 1,
820                b'(' if bytes[ix..].starts_with(b"(?#") => {
821                    ix += 3;
822                    loop {
823                        if ix >= self.re.len() {
824                            return Err(Error::ParseError(ix, ParseError::UnclosedOpenParen));
825                        }
826                        match bytes[ix] {
827                            b')' => {
828                                ix += 1;
829                                break;
830                            }
831                            b'\\' => ix += 2,
832                            _ => ix += 1,
833                        }
834                    }
835                }
836                _ => return Ok(ix),
837            }
838        }
839    }
840}
841
842// return (ix, value)
843pub(crate) fn parse_decimal(s: &str, ix: usize) -> Option<(usize, usize)> {
844    let mut end = ix;
845    while end < s.len() && is_digit(s.as_bytes()[end]) {
846        end += 1;
847    }
848    usize::from_str_radix(&s[ix..end], 10)
849        .ok()
850        .map(|val| (end, val))
851}
852
853/// Attempts to parse an identifier between the specified opening and closing
854/// delimiters.  On success, returns `Some((id, skip))`, where `skip` is how much
855/// of the string was used.
856pub(crate) fn parse_id<'a>(
857    s: &'a str,
858    open: &'_ str,
859    close: &'_ str,
860    allow_relative: bool,
861) -> Option<(&'a str, usize)> {
862    debug_assert!(!close.starts_with(is_id_char));
863
864    if !s.starts_with(open) {
865        return None;
866    }
867
868    let id_start = open.len();
869    let mut iter = s[id_start..].char_indices().peekable();
870    let after_id = if allow_relative && iter.next_if(|(_, ch)| *ch == '-').is_some() {
871        iter.find(|(_, ch)| !ch.is_ascii_digit())
872    } else {
873        iter.find(|(_, ch)| !is_id_char(*ch))
874    };
875    let id_len = match after_id.map(|(i, _)| i) {
876        Some(id_len) if s[id_start + id_len..].starts_with(close) => Some(id_len),
877        None if close.is_empty() => Some(s.len()),
878        _ => None,
879    };
880    match id_len {
881        Some(0) => None,
882        Some(id_len) => {
883            let id_end = id_start + id_len;
884            Some((&s[id_start..id_end], id_end + close.len()))
885        }
886        _ => None,
887    }
888}
889
890fn is_id_char(c: char) -> bool {
891    c.is_alphanumeric() || c == '_'
892}
893
894fn is_digit(b: u8) -> bool {
895    b'0' <= b && b <= b'9'
896}
897
898fn is_hex_digit(b: u8) -> bool {
899    is_digit(b) || (b'a' <= (b | 32) && (b | 32) <= b'f')
900}
901
902pub(crate) fn make_literal(s: &str) -> Expr {
903    Expr::Literal {
904        val: String::from(s),
905        casei: false,
906    }
907}
908
909#[cfg(test)]
910mod tests {
911    use alloc::boxed::Box;
912    use alloc::string::{String, ToString};
913    use alloc::{format, vec};
914
915    use crate::parse::{make_literal, parse_id};
916    use crate::LookAround::*;
917    use crate::{Assertion, Expr};
918
919    fn p(s: &str) -> Expr {
920        Expr::parse_tree(s).unwrap().expr
921    }
922
923    #[cfg_attr(feature = "track_caller", track_caller)]
924    fn fail(s: &str) {
925        assert!(
926            Expr::parse_tree(s).is_err(),
927            "Expected parse error, but was: {:?}",
928            Expr::parse_tree(s)
929        );
930    }
931
932    #[cfg_attr(feature = "track_caller", track_caller)]
933    fn assert_error(re: &str, expected_error: &str) {
934        let result = Expr::parse_tree(re);
935        assert!(
936            result.is_err(),
937            "Expected parse error, but was: {:?}",
938            result
939        );
940        assert_eq!(&format!("{}", result.err().unwrap()), expected_error);
941    }
942
943    #[test]
944    fn empty() {
945        assert_eq!(p(""), Expr::Empty);
946    }
947
948    #[test]
949    fn any() {
950        assert_eq!(p("."), Expr::Any { newline: false });
951        assert_eq!(p("(?s:.)"), Expr::Any { newline: true });
952    }
953
954    #[test]
955    fn start_text() {
956        assert_eq!(p("^"), Expr::Assertion(Assertion::StartText));
957    }
958
959    #[test]
960    fn end_text() {
961        assert_eq!(p("$"), Expr::Assertion(Assertion::EndText));
962    }
963
964    #[test]
965    fn literal() {
966        assert_eq!(p("a"), make_literal("a"));
967    }
968
969    #[test]
970    fn literal_special() {
971        assert_eq!(p("}"), make_literal("}"));
972        assert_eq!(p("]"), make_literal("]"));
973    }
974
975    #[test]
976    fn parse_id_test() {
977        assert_eq!(parse_id("foo.", "", "", true), Some(("foo", 3)));
978        assert_eq!(parse_id("{foo}", "{", "}", true), Some(("foo", 5)));
979        assert_eq!(parse_id("{foo.", "{", "}", true), None);
980        assert_eq!(parse_id("{foo", "{", "}", true), None);
981        assert_eq!(parse_id("{}", "{", "}", true), None);
982        assert_eq!(parse_id("", "", "", true), None);
983        assert_eq!(parse_id("{-1}", "{", "}", true), Some(("-1", 4)));
984        assert_eq!(parse_id("{-1}", "{", "}", false), None);
985        assert_eq!(parse_id("{-a}", "{", "}", true), None);
986    }
987
988    #[test]
989    fn literal_unescaped_opening_curly() {
990        // `{` in position where quantifier is not allowed results in literal `{`
991        assert_eq!(p("{"), make_literal("{"));
992        assert_eq!(p("({)"), Expr::Group(Box::new(make_literal("{"),)));
993        assert_eq!(
994            p("a|{"),
995            Expr::Alt(vec![make_literal("a"), make_literal("{"),])
996        );
997        assert_eq!(
998            p("{{2}"),
999            Expr::Repeat {
1000                child: Box::new(make_literal("{")),
1001                lo: 2,
1002                hi: 2,
1003                greedy: true
1004            }
1005        );
1006    }
1007
1008    #[test]
1009    fn literal_escape() {
1010        assert_eq!(p("\\'"), make_literal("'"));
1011        assert_eq!(p("\\\""), make_literal("\""));
1012        assert_eq!(p("\\ "), make_literal(" "));
1013        assert_eq!(p("\\xA0"), make_literal("\u{A0}"));
1014        assert_eq!(p("\\x{1F4A9}"), make_literal("\u{1F4A9}"));
1015        assert_eq!(p("\\x{000000B7}"), make_literal("\u{B7}"));
1016        assert_eq!(p("\\u21D2"), make_literal("\u{21D2}"));
1017        assert_eq!(p("\\u{21D2}"), make_literal("\u{21D2}"));
1018        assert_eq!(p("\\u21D2x"), p("\u{21D2}x"));
1019        assert_eq!(p("\\U0001F60A"), make_literal("\u{1F60A}"));
1020        assert_eq!(p("\\U{0001F60A}"), make_literal("\u{1F60A}"));
1021    }
1022
1023    #[test]
1024    fn hex_escape() {
1025        assert_eq!(
1026            p("\\h"),
1027            Expr::Delegate {
1028                inner: String::from("[0-9A-Fa-f]"),
1029                size: 1,
1030                casei: false
1031            }
1032        );
1033        assert_eq!(
1034            p("\\H"),
1035            Expr::Delegate {
1036                inner: String::from("[^0-9A-Fa-f]"),
1037                size: 1,
1038                casei: false
1039            }
1040        );
1041    }
1042
1043    #[test]
1044    fn invalid_escape() {
1045        assert_error(
1046            "\\",
1047            "Parsing error at position 0: Backslash without following character",
1048        );
1049        assert_error("\\q", "Parsing error at position 0: Invalid escape: \\q");
1050        assert_error("\\xAG", "Parsing error at position 2: Invalid hex escape");
1051        assert_error("\\xA", "Parsing error at position 2: Invalid hex escape");
1052        assert_error("\\x{}", "Parsing error at position 2: Invalid hex escape");
1053        assert_error("\\x{AG}", "Parsing error at position 2: Invalid hex escape");
1054        assert_error("\\x{42", "Parsing error at position 2: Invalid hex escape");
1055        assert_error(
1056            "\\x{D800}",
1057            "Parsing error at position 2: Invalid codepoint for hex or unicode escape",
1058        );
1059        assert_error(
1060            "\\x{110000}",
1061            "Parsing error at position 2: Invalid codepoint for hex or unicode escape",
1062        );
1063        assert_error("\\u123", "Parsing error at position 2: Invalid hex escape");
1064        assert_error("\\u123x", "Parsing error at position 2: Invalid hex escape");
1065        assert_error("\\u{}", "Parsing error at position 2: Invalid hex escape");
1066        assert_error(
1067            "\\U1234567",
1068            "Parsing error at position 2: Invalid hex escape",
1069        );
1070        assert_error("\\U{}", "Parsing error at position 2: Invalid hex escape");
1071    }
1072
1073    #[test]
1074    fn concat() {
1075        assert_eq!(
1076            p("ab"),
1077            Expr::Concat(vec![make_literal("a"), make_literal("b"),])
1078        );
1079    }
1080
1081    #[test]
1082    fn alt() {
1083        assert_eq!(
1084            p("a|b"),
1085            Expr::Alt(vec![make_literal("a"), make_literal("b"),])
1086        );
1087    }
1088
1089    #[test]
1090    fn group() {
1091        assert_eq!(p("(a)"), Expr::Group(Box::new(make_literal("a"),)));
1092    }
1093
1094    #[test]
1095    fn group_repeat() {
1096        assert_eq!(
1097            p("(a){2}"),
1098            Expr::Repeat {
1099                child: Box::new(Expr::Group(Box::new(make_literal("a")))),
1100                lo: 2,
1101                hi: 2,
1102                greedy: true
1103            }
1104        );
1105    }
1106
1107    #[test]
1108    fn repeat() {
1109        assert_eq!(
1110            p("a{2,42}"),
1111            Expr::Repeat {
1112                child: Box::new(make_literal("a")),
1113                lo: 2,
1114                hi: 42,
1115                greedy: true
1116            }
1117        );
1118        assert_eq!(
1119            p("a{2,}"),
1120            Expr::Repeat {
1121                child: Box::new(make_literal("a")),
1122                lo: 2,
1123                hi: usize::MAX,
1124                greedy: true
1125            }
1126        );
1127        assert_eq!(
1128            p("a{2}"),
1129            Expr::Repeat {
1130                child: Box::new(make_literal("a")),
1131                lo: 2,
1132                hi: 2,
1133                greedy: true
1134            }
1135        );
1136        assert_eq!(
1137            p("a{,2}"),
1138            Expr::Repeat {
1139                child: Box::new(make_literal("a")),
1140                lo: 0,
1141                hi: 2,
1142                greedy: true
1143            }
1144        );
1145
1146        assert_eq!(
1147            p("a{2,42}?"),
1148            Expr::Repeat {
1149                child: Box::new(make_literal("a")),
1150                lo: 2,
1151                hi: 42,
1152                greedy: false
1153            }
1154        );
1155        assert_eq!(
1156            p("a{2,}?"),
1157            Expr::Repeat {
1158                child: Box::new(make_literal("a")),
1159                lo: 2,
1160                hi: usize::MAX,
1161                greedy: false
1162            }
1163        );
1164        assert_eq!(
1165            p("a{2}?"),
1166            Expr::Repeat {
1167                child: Box::new(make_literal("a")),
1168                lo: 2,
1169                hi: 2,
1170                greedy: false
1171            }
1172        );
1173        assert_eq!(
1174            p("a{,2}?"),
1175            Expr::Repeat {
1176                child: Box::new(make_literal("a")),
1177                lo: 0,
1178                hi: 2,
1179                greedy: false
1180            }
1181        );
1182    }
1183
1184    #[test]
1185    fn invalid_repeat() {
1186        // Invalid repeat syntax results in literal
1187        assert_eq!(
1188            p("a{"),
1189            Expr::Concat(vec![make_literal("a"), make_literal("{"),])
1190        );
1191        assert_eq!(
1192            p("a{6"),
1193            Expr::Concat(vec![
1194                make_literal("a"),
1195                make_literal("{"),
1196                make_literal("6"),
1197            ])
1198        );
1199        assert_eq!(
1200            p("a{6,"),
1201            Expr::Concat(vec![
1202                make_literal("a"),
1203                make_literal("{"),
1204                make_literal("6"),
1205                make_literal(","),
1206            ])
1207        );
1208    }
1209
1210    #[test]
1211    fn delegate_zero() {
1212        assert_eq!(p("\\b"), Expr::Assertion(Assertion::WordBoundary),);
1213        assert_eq!(p("\\B"), Expr::Assertion(Assertion::NotWordBoundary),);
1214    }
1215
1216    #[test]
1217    fn delegate_named_group() {
1218        assert_eq!(
1219            p("\\p{Greek}"),
1220            Expr::Delegate {
1221                inner: String::from("\\p{Greek}"),
1222                size: 1,
1223                casei: false
1224            }
1225        );
1226        assert_eq!(
1227            p("\\pL"),
1228            Expr::Delegate {
1229                inner: String::from("\\pL"),
1230                size: 1,
1231                casei: false
1232            }
1233        );
1234        assert_eq!(
1235            p("\\P{Greek}"),
1236            Expr::Delegate {
1237                inner: String::from("\\P{Greek}"),
1238                size: 1,
1239                casei: false
1240            }
1241        );
1242        assert_eq!(
1243            p("\\PL"),
1244            Expr::Delegate {
1245                inner: String::from("\\PL"),
1246                size: 1,
1247                casei: false
1248            }
1249        );
1250        assert_eq!(
1251            p("(?i)\\p{Ll}"),
1252            Expr::Delegate {
1253                inner: String::from("\\p{Ll}"),
1254                size: 1,
1255                casei: true
1256            }
1257        );
1258    }
1259
1260    #[test]
1261    fn backref() {
1262        assert_eq!(
1263            p("(.)\\1"),
1264            Expr::Concat(vec![
1265                Expr::Group(Box::new(Expr::Any { newline: false })),
1266                Expr::Backref(1),
1267            ])
1268        );
1269    }
1270
1271    #[test]
1272    fn named_backref() {
1273        assert_eq!(
1274            p("(?<i>.)\\k<i>"),
1275            Expr::Concat(vec![
1276                Expr::Group(Box::new(Expr::Any { newline: false })),
1277                Expr::Backref(1),
1278            ])
1279        );
1280    }
1281
1282    #[test]
1283    fn relative_backref() {
1284        assert_eq!(
1285            p("(a)(.)\\k<-1>"),
1286            Expr::Concat(vec![
1287                Expr::Group(Box::new(make_literal("a"))),
1288                Expr::Group(Box::new(Expr::Any { newline: false })),
1289                Expr::Backref(2)
1290            ])
1291        );
1292        fail("(?P<->.)");
1293        fail("(.)(?P=-)")
1294    }
1295
1296    #[test]
1297    fn lookaround() {
1298        assert_eq!(
1299            p("(?=a)"),
1300            Expr::LookAround(Box::new(make_literal("a")), LookAhead)
1301        );
1302        assert_eq!(
1303            p("(?!a)"),
1304            Expr::LookAround(Box::new(make_literal("a")), LookAheadNeg)
1305        );
1306        assert_eq!(
1307            p("(?<=a)"),
1308            Expr::LookAround(Box::new(make_literal("a")), LookBehind)
1309        );
1310        assert_eq!(
1311            p("(?<!a)"),
1312            Expr::LookAround(Box::new(make_literal("a")), LookBehindNeg)
1313        );
1314    }
1315
1316    #[test]
1317    fn shy_group() {
1318        assert_eq!(
1319            p("(?:ab)c"),
1320            Expr::Concat(vec![
1321                Expr::Concat(vec![make_literal("a"), make_literal("b"),]),
1322                make_literal("c"),
1323            ])
1324        );
1325    }
1326
1327    #[test]
1328    fn flag_state() {
1329        assert_eq!(p("(?s)."), Expr::Any { newline: true });
1330        assert_eq!(p("(?s:(?-s:.))"), Expr::Any { newline: false });
1331        assert_eq!(
1332            p("(?s:.)."),
1333            Expr::Concat(vec![
1334                Expr::Any { newline: true },
1335                Expr::Any { newline: false },
1336            ])
1337        );
1338        assert_eq!(
1339            p("(?:(?s).)."),
1340            Expr::Concat(vec![
1341                Expr::Any { newline: true },
1342                Expr::Any { newline: false },
1343            ])
1344        );
1345    }
1346
1347    #[test]
1348    fn flag_multiline() {
1349        assert_eq!(p("^"), Expr::Assertion(Assertion::StartText));
1350        assert_eq!(
1351            p("(?m:^)"),
1352            Expr::Assertion(Assertion::StartLine { crlf: false })
1353        );
1354        assert_eq!(p("$"), Expr::Assertion(Assertion::EndText));
1355        assert_eq!(
1356            p("(?m:$)"),
1357            Expr::Assertion(Assertion::EndLine { crlf: false })
1358        );
1359    }
1360
1361    #[test]
1362    fn flag_swap_greed() {
1363        assert_eq!(p("a*"), p("(?U:a*?)"));
1364        assert_eq!(p("a*?"), p("(?U:a*)"));
1365    }
1366
1367    #[test]
1368    fn invalid_flags() {
1369        assert!(Expr::parse_tree("(?").is_err());
1370        assert!(Expr::parse_tree("(?)").is_err());
1371        assert!(Expr::parse_tree("(?-)").is_err());
1372        assert!(Expr::parse_tree("(?-:a)").is_err());
1373        assert!(Expr::parse_tree("(?q:a)").is_err());
1374    }
1375
1376    #[test]
1377    fn lifetime() {
1378        assert_eq!(
1379            p("\\'[a-zA-Z_][a-zA-Z0-9_]*(?!\\')\\b"),
1380            Expr::Concat(vec![
1381                make_literal("'"),
1382                Expr::Delegate {
1383                    inner: String::from("[a-zA-Z_]"),
1384                    size: 1,
1385                    casei: false
1386                },
1387                Expr::Repeat {
1388                    child: Box::new(Expr::Delegate {
1389                        inner: String::from("[a-zA-Z0-9_]"),
1390                        size: 1,
1391                        casei: false
1392                    }),
1393                    lo: 0,
1394                    hi: usize::MAX,
1395                    greedy: true
1396                },
1397                Expr::LookAround(Box::new(make_literal("'")), LookAheadNeg),
1398                Expr::Assertion(Assertion::WordBoundary),
1399            ])
1400        );
1401    }
1402
1403    #[test]
1404    fn ignore_whitespace() {
1405        assert_eq!(p("(?x: )"), p(""));
1406        assert_eq!(p("(?x) | "), p("|"));
1407        assert_eq!(p("(?x: a )"), p("a"));
1408        assert_eq!(p("(?x: a # ) bobby tables\n b )"), p("ab"));
1409        assert_eq!(p("(?x: a | b )"), p("a|b"));
1410        assert_eq!(p("(?x: ( a b ) )"), p("(ab)"));
1411        assert_eq!(p("(?x: a + )"), p("a+"));
1412        assert_eq!(p("(?x: a {2} )"), p("a{2}"));
1413        assert_eq!(p("(?x: a { 2 } )"), p("a{2}"));
1414        assert_eq!(p("(?x: a { 2 , } )"), p("a{2,}"));
1415        assert_eq!(p("(?x: a { , 2 } )"), p("a{,2}"));
1416        assert_eq!(p("(?x: a { 2 , 3 } )"), p("a{2,3}"));
1417        assert_eq!(p("(?x: a { 2 , 3 } ? )"), p("a{2,3}?"));
1418        assert_eq!(p("(?x: ( ? i : . ) )"), p("(?i:.)"));
1419        assert_eq!(p("(?x: ( ?= a ) )"), p("(?=a)"));
1420        assert_eq!(p("(?x: [ ] )"), p("[ ]"));
1421        assert_eq!(p("(?x: [ ^] )"), p("[ ^]"));
1422        assert_eq!(p("(?x: [a - z] )"), p("[a - z]"));
1423        assert_eq!(p("(?x: [ \\] \\\\] )"), p("[ \\] \\\\]"));
1424        assert_eq!(p("(?x: a\\ b )"), p("a b"));
1425        assert_eq!(p("(?x: a (?-x:#) b )"), p("a#b"));
1426    }
1427
1428    #[test]
1429    fn comments() {
1430        assert_eq!(p(r"ab(?# comment)"), p("ab"));
1431        assert_eq!(p(r"ab(?#)"), p("ab"));
1432        assert_eq!(p(r"(?# comment 1)(?# comment 2)ab"), p("ab"));
1433        assert_eq!(p(r"ab(?# comment \))c"), p("abc"));
1434        assert_eq!(p(r"ab(?# comment \\)c"), p("abc"));
1435        assert_eq!(p(r"ab(?# comment ()c"), p("abc"));
1436        assert_eq!(p(r"ab(?# comment)*"), p("ab*"));
1437        fail(r"ab(?# comment");
1438        fail(r"ab(?# comment\");
1439    }
1440
1441    #[test]
1442    fn atomic_group() {
1443        assert_eq!(p("(?>a)"), Expr::AtomicGroup(Box::new(make_literal("a"))));
1444    }
1445
1446    #[test]
1447    fn possessive() {
1448        assert_eq!(
1449            p("a++"),
1450            Expr::AtomicGroup(Box::new(Expr::Repeat {
1451                child: Box::new(make_literal("a")),
1452                lo: 1,
1453                hi: usize::MAX,
1454                greedy: true
1455            }))
1456        );
1457        assert_eq!(
1458            p("a*+"),
1459            Expr::AtomicGroup(Box::new(Expr::Repeat {
1460                child: Box::new(make_literal("a")),
1461                lo: 0,
1462                hi: usize::MAX,
1463                greedy: true
1464            }))
1465        );
1466        assert_eq!(
1467            p("a?+"),
1468            Expr::AtomicGroup(Box::new(Expr::Repeat {
1469                child: Box::new(make_literal("a")),
1470                lo: 0,
1471                hi: 1,
1472                greedy: true
1473            }))
1474        );
1475    }
1476
1477    #[test]
1478    fn invalid_backref() {
1479        // only syntactic tests; see similar test in analyze module
1480        fail(".\\12345678"); // unreasonably large number
1481        fail(".\\c"); // not decimal
1482    }
1483
1484    #[test]
1485    fn invalid_group_name_backref() {
1486        assert_error(
1487            "\\k<id>(?<id>.)",
1488            "Parsing error at position 2: Invalid group name in back reference: id",
1489        );
1490    }
1491
1492    #[test]
1493    fn named_backref_only() {
1494        assert_error("(?<id>.)\\1", "Error compiling regex: Numbered backref/call not allowed because named group was used, use a named backref instead");
1495        assert_error("(a)\\1(?<name>b)", "Error compiling regex: Numbered backref/call not allowed because named group was used, use a named backref instead");
1496    }
1497
1498    #[test]
1499    fn invalid_group_name() {
1500        assert_error(
1501            "(?<id)",
1502            "Parsing error at position 1: Could not parse group name",
1503        );
1504        assert_error(
1505            "(?<>)",
1506            "Parsing error at position 1: Could not parse group name",
1507        );
1508        assert_error(
1509            "(?<#>)",
1510            "Parsing error at position 1: Could not parse group name",
1511        );
1512        assert_error(
1513            "\\kxxx<id>",
1514            "Parsing error at position 2: Could not parse group name",
1515        );
1516        // "-" can only be at the start for a relative backref
1517        assert_error(
1518            "\\k<id-2>",
1519            "Parsing error at position 2: Could not parse group name",
1520        );
1521        assert_error(
1522            "\\k<-id>",
1523            "Parsing error at position 2: Could not parse group name",
1524        );
1525    }
1526
1527    #[test]
1528    fn unknown_flag() {
1529        assert_error(
1530            "(?-:a)",
1531            "Parsing error at position 2: Unknown group flag: (?-:",
1532        );
1533        assert_error(
1534            "(?)",
1535            "Parsing error at position 2: Unknown group flag: (?)",
1536        );
1537        assert_error(
1538            "(?--)",
1539            "Parsing error at position 2: Unknown group flag: (?--",
1540        );
1541        // Check that we don't split on char boundary
1542        assert_error(
1543            "(?\u{1F60A})",
1544            "Parsing error at position 2: Unknown group flag: (?\u{1F60A}",
1545        );
1546    }
1547
1548    #[test]
1549    fn no_quantifiers_on_lookarounds() {
1550        assert_error(
1551            "(?=hello)+",
1552            "Parsing error at position 9: Target of repeat operator is invalid",
1553        );
1554        assert_error(
1555            "(?<!hello)*",
1556            "Parsing error at position 10: Target of repeat operator is invalid",
1557        );
1558        assert_error(
1559            "(?<=hello){2,3}",
1560            "Parsing error at position 14: Target of repeat operator is invalid",
1561        );
1562        assert_error(
1563            "(?!hello)?",
1564            "Parsing error at position 9: Target of repeat operator is invalid",
1565        );
1566        assert_error(
1567            "^?",
1568            "Parsing error at position 1: Target of repeat operator is invalid",
1569        );
1570        assert_error(
1571            "${2}",
1572            "Parsing error at position 3: Target of repeat operator is invalid",
1573        );
1574        assert_error(
1575            "(?m)^?",
1576            "Parsing error at position 5: Target of repeat operator is invalid",
1577        );
1578        assert_error(
1579            "(?m)${2}",
1580            "Parsing error at position 7: Target of repeat operator is invalid",
1581        );
1582        assert_error(
1583            "(a|b|?)",
1584            "Parsing error at position 5: Target of repeat operator is invalid",
1585        );
1586    }
1587
1588    #[test]
1589    fn keepout() {
1590        assert_eq!(
1591            p("a\\Kb"),
1592            Expr::Concat(vec![make_literal("a"), Expr::KeepOut, make_literal("b"),])
1593        );
1594    }
1595
1596    #[test]
1597    fn backref_exists_condition() {
1598        assert_eq!(
1599            p("(h)?(?(1))"),
1600            Expr::Concat(vec![
1601                Expr::Repeat {
1602                    child: Box::new(Expr::Group(Box::new(make_literal("h")))),
1603                    lo: 0,
1604                    hi: 1,
1605                    greedy: true
1606                },
1607                Expr::BackrefExistsCondition(1)
1608            ])
1609        );
1610        assert_eq!(
1611            p("(?<h>h)?(?('h'))"),
1612            Expr::Concat(vec![
1613                Expr::Repeat {
1614                    child: Box::new(Expr::Group(Box::new(make_literal("h")))),
1615                    lo: 0,
1616                    hi: 1,
1617                    greedy: true
1618                },
1619                Expr::BackrefExistsCondition(1)
1620            ])
1621        );
1622    }
1623
1624    #[test]
1625    fn conditional_non_backref_validity_check_without_branches() {
1626        assert_error(
1627            "(?(foo))",
1628            "Parsing error at position 7: General parsing error: expected conditional to be a backreference or at least an expression for when the condition is true",
1629        );
1630    }
1631
1632    #[test]
1633    fn conditional_invalid_target_of_repeat_operator() {
1634        assert_error(
1635            r"(?(?=\d)\w|!)",
1636            "Parsing error at position 3: Target of repeat operator is invalid",
1637        );
1638    }
1639
1640    #[test]
1641    fn backref_condition_with_one_two_or_three_branches() {
1642        assert_eq!(
1643            p("(h)?(?(1)i|x)"),
1644            Expr::Concat(vec![
1645                Expr::Repeat {
1646                    child: Box::new(Expr::Group(Box::new(make_literal("h")))),
1647                    lo: 0,
1648                    hi: 1,
1649                    greedy: true
1650                },
1651                Expr::Conditional {
1652                    condition: Box::new(Expr::BackrefExistsCondition(1)),
1653                    true_branch: Box::new(make_literal("i")),
1654                    false_branch: Box::new(make_literal("x")),
1655                },
1656            ])
1657        );
1658
1659        assert_eq!(
1660            p("(h)?(?(1)i)"),
1661            Expr::Concat(vec![
1662                Expr::Repeat {
1663                    child: Box::new(Expr::Group(Box::new(make_literal("h")))),
1664                    lo: 0,
1665                    hi: 1,
1666                    greedy: true
1667                },
1668                Expr::Conditional {
1669                    condition: Box::new(Expr::BackrefExistsCondition(1)),
1670                    true_branch: Box::new(make_literal("i")),
1671                    false_branch: Box::new(Expr::Empty),
1672                },
1673            ])
1674        );
1675
1676        assert_eq!(
1677            p("(h)?(?(1)ii|xy|z)"),
1678            Expr::Concat(vec![
1679                Expr::Repeat {
1680                    child: Box::new(Expr::Group(Box::new(make_literal("h")))),
1681                    lo: 0,
1682                    hi: 1,
1683                    greedy: true
1684                },
1685                Expr::Conditional {
1686                    condition: Box::new(Expr::BackrefExistsCondition(1)),
1687                    true_branch: Box::new(Expr::Concat(
1688                        vec![make_literal("i"), make_literal("i"),]
1689                    )),
1690                    false_branch: Box::new(Expr::Alt(vec![
1691                        Expr::Concat(vec![make_literal("x"), make_literal("y"),]),
1692                        make_literal("z"),
1693                    ])),
1694                },
1695            ])
1696        );
1697
1698        assert_eq!(
1699            p("(?<cap>h)?(?(<cap>)ii|xy|z)"),
1700            Expr::Concat(vec![
1701                Expr::Repeat {
1702                    child: Box::new(Expr::Group(Box::new(make_literal("h")))),
1703                    lo: 0,
1704                    hi: 1,
1705                    greedy: true
1706                },
1707                Expr::Conditional {
1708                    condition: Box::new(Expr::BackrefExistsCondition(1)),
1709                    true_branch: Box::new(Expr::Concat(
1710                        vec![make_literal("i"), make_literal("i"),]
1711                    )),
1712                    false_branch: Box::new(Expr::Alt(vec![
1713                        Expr::Concat(vec![make_literal("x"), make_literal("y"),]),
1714                        make_literal("z"),
1715                    ])),
1716                },
1717            ])
1718        );
1719    }
1720
1721    #[test]
1722    fn conditional() {
1723        assert_eq!(
1724            p("((?(a)b|c))(\\1)"),
1725            Expr::Concat(vec![
1726                Expr::Group(Box::new(Expr::Conditional {
1727                    condition: Box::new(make_literal("a")),
1728                    true_branch: Box::new(make_literal("b")),
1729                    false_branch: Box::new(make_literal("c"))
1730                })),
1731                Expr::Group(Box::new(Expr::Backref(1)))
1732            ])
1733        );
1734
1735        assert_eq!(
1736            p(r"^(?(\d)abc|\d!)$"),
1737            Expr::Concat(vec![
1738                Expr::Assertion(Assertion::StartText),
1739                Expr::Conditional {
1740                    condition: Box::new(Expr::Delegate {
1741                        inner: "\\d".to_string(),
1742                        size: 1,
1743                        casei: false,
1744                    }),
1745                    true_branch: Box::new(Expr::Concat(vec![
1746                        make_literal("a"),
1747                        make_literal("b"),
1748                        make_literal("c"),
1749                    ])),
1750                    false_branch: Box::new(Expr::Concat(vec![
1751                        Expr::Delegate {
1752                            inner: "\\d".to_string(),
1753                            size: 1,
1754                            casei: false,
1755                        },
1756                        make_literal("!"),
1757                    ])),
1758                },
1759                Expr::Assertion(Assertion::EndText),
1760            ])
1761        );
1762
1763        assert_eq!(
1764            p(r"(?((?=\d))\w|!)"),
1765            Expr::Conditional {
1766                condition: Box::new(Expr::LookAround(
1767                    Box::new(Expr::Delegate {
1768                        inner: "\\d".to_string(),
1769                        size: 1,
1770                        casei: false
1771                    }),
1772                    LookAhead
1773                )),
1774                true_branch: Box::new(Expr::Delegate {
1775                    inner: "\\w".to_string(),
1776                    size: 1,
1777                    casei: false,
1778                }),
1779                false_branch: Box::new(make_literal("!")),
1780            },
1781        );
1782
1783        assert_eq!(
1784            p(r"(?((ab))c|d)"),
1785            Expr::Conditional {
1786                condition: Box::new(Expr::Group(Box::new(Expr::Concat(vec![
1787                    make_literal("a"),
1788                    make_literal("b"),
1789                ]),))),
1790                true_branch: Box::new(make_literal("c")),
1791                false_branch: Box::new(make_literal("d")),
1792            },
1793        );
1794    }
1795
1796    // found by cargo fuzz, then minimized
1797    #[test]
1798    fn fuzz_1() {
1799        p(r"\ä");
1800    }
1801
1802    #[test]
1803    fn fuzz_2() {
1804        p(r"\pä");
1805    }
1806
1807    #[test]
1808    fn fuzz_3() {
1809        fail(r"(?()^");
1810        fail(r#"!w(?()\"Kuz>"#);
1811    }
1812
1813    #[test]
1814    fn fuzz_4() {
1815        fail(r"\u{2}(?(2)");
1816    }
1817}