regex_lite/hir/
parse.rs

Help
1use core::cell::{Cell, RefCell};
2
3use alloc::{
4    boxed::Box,
5    string::{String, ToString},
6    vec,
7    vec::Vec,
8};
9
10use crate::{
11    error::Error,
12    hir::{self, Config, Flags, Hir, HirKind},
13};
14
15// These are all of the errors that can occur while parsing a regex. Unlike
16// regex-syntax, our errors are not particularly great. They are just enough
17// to get a general sense of what went wrong. But in exchange, the error
18// reporting mechanism is *much* simpler than what's in regex-syntax.
19//
20// By convention, we use each of these messages in exactly one place. That
21// way, every branch that leads to an error has a unique message. This in turn
22// means that given a message, one can precisely identify which part of the
23// parser reported it.
24//
25// Finally, we give names to each message so that we can reference them in
26// tests.
27const ERR_TOO_MUCH_NESTING: &str = "pattern has too much nesting";
28const ERR_TOO_MANY_CAPTURES: &str = "too many capture groups";
29const ERR_DUPLICATE_CAPTURE_NAME: &str = "duplicate capture group name";
30const ERR_UNCLOSED_GROUP: &str = "found open group without closing ')'";
31const ERR_UNCLOSED_GROUP_QUESTION: &str =
32    "expected closing ')', but got end of pattern";
33const ERR_UNOPENED_GROUP: &str = "found closing ')' without matching '('";
34const ERR_LOOK_UNSUPPORTED: &str = "look-around is not supported";
35const ERR_EMPTY_FLAGS: &str = "empty flag directive '(?)' is not allowed";
36const ERR_MISSING_GROUP_NAME: &str =
37    "expected capture group name, but got end of pattern";
38const ERR_INVALID_GROUP_NAME: &str = "invalid group name";
39const ERR_UNCLOSED_GROUP_NAME: &str =
40    "expected end of capture group name, but got end of pattern";
41const ERR_EMPTY_GROUP_NAME: &str = "empty capture group names are not allowed";
42const ERR_FLAG_UNRECOGNIZED: &str = "unrecognized inline flag";
43const ERR_FLAG_REPEATED_NEGATION: &str =
44    "inline flag negation cannot be repeated";
45const ERR_FLAG_DUPLICATE: &str = "duplicate inline flag is not allowed";
46const ERR_FLAG_UNEXPECTED_EOF: &str =
47    "expected ':' or ')' to end inline flags, but got end of pattern";
48const ERR_FLAG_DANGLING_NEGATION: &str =
49    "inline flags cannot end with negation directive";
50const ERR_DECIMAL_NO_DIGITS: &str =
51    "expected decimal number, but found no digits";
52const ERR_DECIMAL_INVALID: &str = "got invalid decimal number";
53const ERR_HEX_BRACE_INVALID_DIGIT: &str =
54    "expected hexadecimal number in braces, but got non-hex digit";
55const ERR_HEX_BRACE_UNEXPECTED_EOF: &str =
56    "expected hexadecimal number, but saw end of pattern before closing brace";
57const ERR_HEX_BRACE_EMPTY: &str =
58    "expected hexadecimal number in braces, but got no digits";
59const ERR_HEX_BRACE_INVALID: &str = "got invalid hexadecimal number in braces";
60const ERR_HEX_FIXED_UNEXPECTED_EOF: &str =
61    "expected fixed length hexadecimal number, but saw end of pattern first";
62const ERR_HEX_FIXED_INVALID_DIGIT: &str =
63    "expected fixed length hexadecimal number, but got non-hex digit";
64const ERR_HEX_FIXED_INVALID: &str =
65    "got invalid fixed length hexadecimal number";
66const ERR_HEX_UNEXPECTED_EOF: &str =
67    "expected hexadecimal number, but saw end of pattern first";
68const ERR_ESCAPE_UNEXPECTED_EOF: &str =
69    "saw start of escape sequence, but saw end of pattern before it finished";
70const ERR_BACKREF_UNSUPPORTED: &str = "backreferences are not supported";
71const ERR_UNICODE_CLASS_UNSUPPORTED: &str =
72    "Unicode character classes are not supported";
73const ERR_ESCAPE_UNRECOGNIZED: &str = "unrecognized escape sequence";
74const ERR_POSIX_CLASS_UNRECOGNIZED: &str =
75    "unrecognized POSIX character class";
76const ERR_UNCOUNTED_REP_SUB_MISSING: &str =
77    "uncounted repetition operator must be applied to a sub-expression";
78const ERR_COUNTED_REP_SUB_MISSING: &str =
79    "counted repetition operator must be applied to a sub-expression";
80const ERR_COUNTED_REP_UNCLOSED: &str =
81    "found unclosed counted repetition operator";
82const ERR_COUNTED_REP_MIN_UNCLOSED: &str =
83    "found incomplete and unclosed counted repetition operator";
84const ERR_COUNTED_REP_COMMA_UNCLOSED: &str =
85    "found counted repetition operator with a comma that is unclosed";
86const ERR_COUNTED_REP_MIN_MAX_UNCLOSED: &str =
87    "found counted repetition with min and max that is unclosed";
88const ERR_COUNTED_REP_INVALID: &str =
89    "expected closing brace for counted repetition, but got something else";
90const ERR_COUNTED_REP_INVALID_RANGE: &str =
91    "found counted repetition with a min bigger than its max";
92const ERR_CLASS_UNCLOSED_AFTER_ITEM: &str =
93    "non-empty character class has no closing bracket";
94const ERR_CLASS_INVALID_RANGE_ITEM: &str =
95    "character class ranges must start and end with a single character";
96const ERR_CLASS_INVALID_ITEM: &str =
97    "invalid escape sequence in character class";
98const ERR_CLASS_UNCLOSED_AFTER_DASH: &str =
99    "non-empty character class has no closing bracket after dash";
100const ERR_CLASS_UNCLOSED_AFTER_NEGATION: &str =
101    "negated character class has no closing bracket";
102const ERR_CLASS_UNCLOSED_AFTER_CLOSING: &str =
103    "character class begins with literal ']' but has no closing bracket";
104const ERR_CLASS_INVALID_RANGE: &str = "invalid range in character class";
105const ERR_CLASS_UNCLOSED: &str = "found unclosed character class";
106const ERR_CLASS_NEST_UNSUPPORTED: &str =
107    "nested character classes are not supported";
108const ERR_CLASS_INTERSECTION_UNSUPPORTED: &str =
109    "character class intersection is not supported";
110const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str =
111    "character class difference is not supported";
112const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str =
113    "character class symmetric difference is not supported";
114const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str =
115    "special word boundary assertion is unclosed or has an invalid character";
116const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str =
117    "special word boundary assertion is unrecognized";
118const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str =
119    "found start of special word boundary or repetition without an end";
120
121/// A regular expression parser.
122///
123/// This parses a string representation of a regular expression into an
124/// abstract syntax tree. The size of the tree is proportional to the length
125/// of the regular expression pattern.
126///
127/// A `Parser` can be configured in more detail via a [`ParserBuilder`].
128#[derive(Clone, Debug)]
129pub(super) struct Parser<'a> {
130    /// The configuration of the parser as given by the caller.
131    config: Config,
132    /// The pattern we're parsing as given by the caller.
133    pattern: &'a str,
134    /// The call depth of the parser. This is incremented for each
135    /// sub-expression parsed. Its peak value is the maximum nesting of the
136    /// pattern.
137    depth: Cell<u32>,
138    /// The current position of the parser.
139    pos: Cell<usize>,
140    /// The current codepoint of the parser. The codepoint corresponds to the
141    /// codepoint encoded in `pattern` beginning at `pos`.
142    ///
143    /// This is `None` if and only if `pos == pattern.len()`.
144    char: Cell<Option<char>>,
145    /// The current capture index.
146    capture_index: Cell<u32>,
147    /// The flags that are currently set.
148    flags: RefCell<Flags>,
149    /// A sorted sequence of capture names. This is used to detect duplicate
150    /// capture names and report an error if one is detected.
151    capture_names: RefCell<Vec<String>>,
152}
153
154/// The constructor and a variety of helper routines.
155impl<'a> Parser<'a> {
156    /// Build a parser from this configuration with the given pattern.
157    pub(super) fn new(config: Config, pattern: &'a str) -> Parser<'a> {
158        Parser {
159            config,
160            pattern,
161            depth: Cell::new(0),
162            pos: Cell::new(0),
163            char: Cell::new(pattern.chars().next()),
164            capture_index: Cell::new(0),
165            flags: RefCell::new(config.flags),
166            capture_names: RefCell::new(vec![]),
167        }
168    }
169
170    /// Returns the full pattern string that we're parsing.
171    fn pattern(&self) -> &str {
172        self.pattern
173    }
174
175    /// Return the current byte offset of the parser.
176    ///
177    /// The offset starts at `0` from the beginning of the regular expression
178    /// pattern string.
179    fn pos(&self) -> usize {
180        self.pos.get()
181    }
182
183    /// Increments the call depth of the parser.
184    ///
185    /// If the call depth would exceed the configured nest limit, then this
186    /// returns an error.
187    ///
188    /// This returns the old depth.
189    fn increment_depth(&self) -> Result<u32, Error> {
190        let old = self.depth.get();
191        if old > self.config.nest_limit {
192            return Err(Error::new(ERR_TOO_MUCH_NESTING));
193        }
194        // OK because our depth starts at 0, and we return an error if it
195        // ever reaches the limit. So the call depth can never exceed u32::MAX.
196        let new = old.checked_add(1).unwrap();
197        self.depth.set(new);
198        Ok(old)
199    }
200
201    /// Decrements the call depth of the parser.
202    ///
203    /// This panics if the current depth is 0.
204    fn decrement_depth(&self) {
205        let old = self.depth.get();
206        // If this fails then the caller has a bug in how they're incrementing
207        // and decrementing the depth of the parser's call stack.
208        let new = old.checked_sub(1).unwrap();
209        self.depth.set(new);
210    }
211
212    /// Return the codepoint at the current position of the parser.
213    ///
214    /// This panics if the parser is positioned at the end of the pattern.
215    fn char(&self) -> char {
216        self.char.get().expect("codepoint, but parser is done")
217    }
218
219    /// Returns true if the next call to `bump` would return false.
220    fn is_done(&self) -> bool {
221        self.pos() == self.pattern.len()
222    }
223
224    /// Returns the flags that are current set for this regex.
225    fn flags(&self) -> Flags {
226        *self.flags.borrow()
227    }
228
229    /// Bump the parser to the next Unicode scalar value.
230    ///
231    /// If the end of the input has been reached, then `false` is returned.
232    fn bump(&self) -> bool {
233        if self.is_done() {
234            return false;
235        }
236        self.pos.set(self.pos() + self.char().len_utf8());
237        self.char.set(self.pattern()[self.pos()..].chars().next());
238        self.char.get().is_some()
239    }
240
241    /// If the substring starting at the current position of the parser has
242    /// the given prefix, then bump the parser to the character immediately
243    /// following the prefix and return true. Otherwise, don't bump the parser
244    /// and return false.
245    fn bump_if(&self, prefix: &str) -> bool {
246        if self.pattern()[self.pos()..].starts_with(prefix) {
247            for _ in 0..prefix.chars().count() {
248                self.bump();
249            }
250            true
251        } else {
252            false
253        }
254    }
255
256    /// Bump the parser, and if the `x` flag is enabled, bump through any
257    /// subsequent spaces. Return true if and only if the parser is not done.
258    fn bump_and_bump_space(&self) -> bool {
259        if !self.bump() {
260            return false;
261        }
262        self.bump_space();
263        !self.is_done()
264    }
265
266    /// If the `x` flag is enabled (i.e., whitespace insensitivity with
267    /// comments), then this will advance the parser through all whitespace
268    /// and comments to the next non-whitespace non-comment byte.
269    ///
270    /// If the `x` flag is disabled, then this is a no-op.
271    ///
272    /// This should be used selectively throughout the parser where
273    /// arbitrary whitespace is permitted when the `x` flag is enabled. For
274    /// example, `{   5  , 6}` is equivalent to `{5,6}`.
275    fn bump_space(&self) {
276        if !self.flags().ignore_whitespace {
277            return;
278        }
279        while !self.is_done() {
280            if self.char().is_whitespace() {
281                self.bump();
282            } else if self.char() == '#' {
283                self.bump();
284                while !self.is_done() {
285                    let c = self.char();
286                    self.bump();
287                    if c == '\n' {
288                        break;
289                    }
290                }
291            } else {
292                break;
293            }
294        }
295    }
296
297    /// Peek at the next character in the input without advancing the parser.
298    ///
299    /// If the input has been exhausted, then this returns `None`.
300    fn peek(&self) -> Option<char> {
301        if self.is_done() {
302            return None;
303        }
304        self.pattern()[self.pos() + self.char().len_utf8()..].chars().next()
305    }
306
307    /// Peeks at the next character in the pattern from the current offset, and
308    /// will ignore spaces when the parser is in whitespace insensitive mode.
309    fn peek_space(&self) -> Option<char> {
310        if !self.flags().ignore_whitespace {
311            return self.peek();
312        }
313        if self.is_done() {
314            return None;
315        }
316        let mut start = self.pos() + self.char().len_utf8();
317        let mut in_comment = false;
318        for (i, ch) in self.pattern()[start..].char_indices() {
319            if ch.is_whitespace() {
320                continue;
321            } else if !in_comment && ch == '#' {
322                in_comment = true;
323            } else if in_comment && ch == '\n' {
324                in_comment = false;
325            } else {
326                start += i;
327                break;
328            }
329        }
330        self.pattern()[start..].chars().next()
331    }
332
333    /// Return the next capturing index. Each subsequent call increments the
334    /// internal index. Since the way capture indices are computed is a public
335    /// API guarantee, use of this routine depends on the parser being depth
336    /// first and left-to-right.
337    ///
338    /// If the capture limit is exceeded, then an error is returned.
339    fn next_capture_index(&self) -> Result<u32, Error> {
340        let current = self.capture_index.get();
341        let next = current
342            .checked_add(1)
343            .ok_or_else(|| Error::new(ERR_TOO_MANY_CAPTURES))?;
344        self.capture_index.set(next);
345        Ok(next)
346    }
347
348    /// Adds the given capture name to this parser. If this capture name has
349    /// already been used, then an error is returned.
350    fn add_capture_name(&self, name: &str) -> Result<(), Error> {
351        let mut names = self.capture_names.borrow_mut();
352        match names.binary_search_by(|n| name.cmp(n)) {
353            Ok(_) => Err(Error::new(ERR_DUPLICATE_CAPTURE_NAME)),
354            Err(i) => {
355                names.insert(i, name.to_string());
356                Ok(())
357            }
358        }
359    }
360
361    /// Returns true if and only if the parser is positioned at a look-around
362    /// prefix. The conditions under which this returns true must always
363    /// correspond to a regular expression that would otherwise be consider
364    /// invalid.
365    ///
366    /// This should only be called immediately after parsing the opening of
367    /// a group or a set of flags.
368    fn is_lookaround_prefix(&self) -> bool {
369        self.bump_if("?=")
370            || self.bump_if("?!")
371            || self.bump_if("?<=")
372            || self.bump_if("?<!")
373    }
374}
375
376/// The actual parser. We try to break out each kind of regex syntax into its
377/// own routine.
378impl<'a> Parser<'a> {
379    pub(super) fn parse(&self) -> Result<Hir, Error> {
380        let hir = self.parse_inner()?;
381        // While we also check nesting during parsing, that only checks the
382        // number of recursive parse calls. It does not necessarily cover
383        // all possible recursive nestings of the Hir itself. For example,
384        // repetition operators don't require recursive parse calls. So one
385        // can stack them arbitrarily without overflowing the stack in the
386        // *parser*. But then if one recurses over the resulting Hir, a stack
387        // overflow is possible. So here we check the Hir nesting level
388        // thoroughly to ensure it isn't nested too deeply.
389        //
390        // Note that we do still need the nesting limit check in the parser as
391        // well, since that will avoid overflowing the stack during parse time
392        // before the complete Hir value is constructed.
393        check_hir_nesting(&hir, self.config.nest_limit)?;
394        Ok(hir)
395    }
396
397    fn parse_inner(&self) -> Result<Hir, Error> {
398        let depth = self.increment_depth()?;
399        let mut alternates = vec![];
400        let mut concat = vec![];
401        loop {
402            self.bump_space();
403            if self.is_done() {
404                break;
405            }
406            match self.char() {
407                '(' => {
408                    // Save the old flags and reset them only when we close
409                    // the group.
410                    let oldflags = *self.flags.borrow();
411                    if let Some(sub) = self.parse_group()? {
412                        concat.push(sub);
413                        // We only reset them here because if 'parse_group'
414                        // returns None, then that means it handled a flag
415                        // directive, e.g., '(?ism)'. And the whole point is
416                        // that those flags remain active until either disabled
417                        // or the end of the pattern or current group.
418                        *self.flags.borrow_mut() = oldflags;
419                    }
420                    if self.char.get() != Some(')') {
421                        return Err(Error::new(ERR_UNCLOSED_GROUP));
422                    }
423                    self.bump();
424                }
425                ')' => {
426                    if depth == 0 {
427                        return Err(Error::new(ERR_UNOPENED_GROUP));
428                    }
429                    break;
430                }
431                '|' => {
432                    alternates.push(Hir::concat(core::mem::take(&mut concat)));
433                    self.bump();
434                }
435                '[' => concat.push(self.parse_class()?),
436                '?' | '*' | '+' => {
437                    concat = self.parse_uncounted_repetition(concat)?;
438                }
439                '{' => {
440                    concat = self.parse_counted_repetition(concat)?;
441                }
442                _ => concat.push(self.parse_primitive()?),
443            }
444        }
445        self.decrement_depth();
446        alternates.push(Hir::concat(concat));
447        // N.B. This strips off the "alternation" if there's only one branch.
448        Ok(Hir::alternation(alternates))
449    }
450
451    /// Parses a "primitive" pattern. A primitive is any expression that does
452    /// not contain any sub-expressions.
453    ///
454    /// This assumes the parser is pointing at the beginning of the primitive.
455    fn parse_primitive(&self) -> Result<Hir, Error> {
456        let ch = self.char();
457        self.bump();
458        match ch {
459            '\\' => self.parse_escape(),
460            '.' => Ok(self.hir_dot()),
461            '^' => Ok(self.hir_anchor_start()),
462            '$' => Ok(self.hir_anchor_end()),
463            ch => Ok(self.hir_char(ch)),
464        }
465    }
466
467    /// Parse an escape sequence. This always results in a "primitive" HIR,
468    /// that is, an HIR with no sub-expressions.
469    ///
470    /// This assumes the parser is positioned at the start of the sequence,
471    /// immediately *after* the `\`. It advances the parser to the first
472    /// position immediately following the escape sequence.
473    fn parse_escape(&self) -> Result<Hir, Error> {
474        if self.is_done() {
475            return Err(Error::new(ERR_ESCAPE_UNEXPECTED_EOF));
476        }
477        let ch = self.char();
478        // Put some of the more complicated routines into helpers.
479        match ch {
480            '0'..='9' => return Err(Error::new(ERR_BACKREF_UNSUPPORTED)),
481            'p' | 'P' => {
482                return Err(Error::new(ERR_UNICODE_CLASS_UNSUPPORTED))
483            }
484            'x' | 'u' | 'U' => return self.parse_hex(),
485            'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
486                return Ok(self.parse_perl_class());
487            }
488            _ => {}
489        }
490
491        // Handle all of the one letter sequences inline.
492        self.bump();
493        if hir::is_meta_character(ch) || hir::is_escapeable_character(ch) {
494            return Ok(self.hir_char(ch));
495        }
496        let special = |ch| Ok(self.hir_char(ch));
497        match ch {
498            'a' => special('\x07'),
499            'f' => special('\x0C'),
500            't' => special('\t'),
501            'n' => special('\n'),
502            'r' => special('\r'),
503            'v' => special('\x0B'),
504            'A' => Ok(Hir::look(hir::Look::Start)),
505            'z' => Ok(Hir::look(hir::Look::End)),
506            'b' => {
507                let mut hir = Hir::look(hir::Look::Word);
508                if !self.is_done() && self.char() == '{' {
509                    if let Some(special) =
510                        self.maybe_parse_special_word_boundary()?
511                    {
512                        hir = special;
513                    }
514                }
515                Ok(hir)
516            }
517            'B' => Ok(Hir::look(hir::Look::WordNegate)),
518            '<' => Ok(Hir::look(hir::Look::WordStart)),
519            '>' => Ok(Hir::look(hir::Look::WordEnd)),
520            _ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)),
521        }
522    }
523
524    /// Attempt to parse a specialty word boundary. That is, `\b{start}`,
525    /// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
526    ///
527    /// This is similar to `maybe_parse_ascii_class` in that, in most cases,
528    /// if it fails it will just return `None` with no error. This is done
529    /// because `\b{5}` is a valid expression and we want to let that be parsed
530    /// by the existing counted repetition parsing code. (I thought about just
531    /// invoking the counted repetition code from here, but it seemed a little
532    /// ham-fisted.)
533    ///
534    /// Unlike `maybe_parse_ascii_class` though, this can return an error.
535    /// Namely, if we definitely know it isn't a counted repetition, then we
536    /// return an error specific to the specialty word boundaries.
537    ///
538    /// This assumes the parser is positioned at a `{` immediately following
539    /// a `\b`. When `None` is returned, the parser is returned to the position
540    /// at which it started: pointing at a `{`.
541    ///
542    /// The position given should correspond to the start of the `\b`.
543    fn maybe_parse_special_word_boundary(&self) -> Result<Option<Hir>, Error> {
544        assert_eq!(self.char(), '{');
545
546        let is_valid_char = |c| match c {
547            'A'..='Z' | 'a'..='z' | '-' => true,
548            _ => false,
549        };
550        let start = self.pos();
551        if !self.bump_and_bump_space() {
552            return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF));
553        }
554        // This is one of the critical bits: if the first non-whitespace
555        // character isn't in [-A-Za-z] (i.e., this can't be a special word
556        // boundary), then we bail and let the counted repetition parser deal
557        // with this.
558        if !is_valid_char(self.char()) {
559            self.pos.set(start);
560            self.char.set(Some('{'));
561            return Ok(None);
562        }
563
564        // Now collect up our chars until we see a '}'.
565        let mut scratch = String::new();
566        while !self.is_done() && is_valid_char(self.char()) {
567            scratch.push(self.char());
568            self.bump_and_bump_space();
569        }
570        if self.is_done() || self.char() != '}' {
571            return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED));
572        }
573        self.bump();
574        let kind = match scratch.as_str() {
575            "start" => hir::Look::WordStart,
576            "end" => hir::Look::WordEnd,
577            "start-half" => hir::Look::WordStartHalf,
578            "end-half" => hir::Look::WordEndHalf,
579            _ => {
580                return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED))
581            }
582        };
583        Ok(Some(Hir::look(kind)))
584    }
585
586    /// Parse a hex representation of a Unicode codepoint. This handles both
587    /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
588    /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
589    /// the first character immediately following the hexadecimal literal.
590    fn parse_hex(&self) -> Result<Hir, Error> {
591        let digit_len = match self.char() {
592            'x' => 2,
593            'u' => 4,
594            'U' => 8,
595            unk => unreachable!(
596                "invalid start of fixed length hexadecimal number {}",
597                unk
598            ),
599        };
600        if !self.bump_and_bump_space() {
601            return Err(Error::new(ERR_HEX_UNEXPECTED_EOF));
602        }
603        if self.char() == '{' {
604            self.parse_hex_brace()
605        } else {
606            self.parse_hex_digits(digit_len)
607        }
608    }
609
610    /// Parse an N-digit hex representation of a Unicode codepoint. This
611    /// expects the parser to be positioned at the first digit and will advance
612    /// the parser to the first character immediately following the escape
613    /// sequence.
614    ///
615    /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`)
616    /// or 8 (for `\UNNNNNNNN`).
617    fn parse_hex_digits(&self, digit_len: usize) -> Result<Hir, Error> {
618        let mut scratch = String::new();
619        for i in 0..digit_len {
620            if i > 0 && !self.bump_and_bump_space() {
621                return Err(Error::new(ERR_HEX_FIXED_UNEXPECTED_EOF));
622            }
623            if !is_hex(self.char()) {
624                return Err(Error::new(ERR_HEX_FIXED_INVALID_DIGIT));
625            }
626            scratch.push(self.char());
627        }
628        // The final bump just moves the parser past the literal, which may
629        // be EOF.
630        self.bump_and_bump_space();
631        match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) {
632            None => Err(Error::new(ERR_HEX_FIXED_INVALID)),
633            Some(ch) => Ok(self.hir_char(ch)),
634        }
635    }
636
637    /// Parse a hex representation of any Unicode scalar value. This expects
638    /// the parser to be positioned at the opening brace `{` and will advance
639    /// the parser to the first character following the closing brace `}`.
640    fn parse_hex_brace(&self) -> Result<Hir, Error> {
641        let mut scratch = String::new();
642        while self.bump_and_bump_space() && self.char() != '}' {
643            if !is_hex(self.char()) {
644                return Err(Error::new(ERR_HEX_BRACE_INVALID_DIGIT));
645            }
646            scratch.push(self.char());
647        }
648        if self.is_done() {
649            return Err(Error::new(ERR_HEX_BRACE_UNEXPECTED_EOF));
650        }
651        assert_eq!(self.char(), '}');
652        self.bump_and_bump_space();
653
654        if scratch.is_empty() {
655            return Err(Error::new(ERR_HEX_BRACE_EMPTY));
656        }
657        match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) {
658            None => Err(Error::new(ERR_HEX_BRACE_INVALID)),
659            Some(ch) => Ok(self.hir_char(ch)),
660        }
661    }
662
663    /// Parse a decimal number into a u32 while trimming leading and trailing
664    /// whitespace.
665    ///
666    /// This expects the parser to be positioned at the first position where
667    /// a decimal digit could occur. This will advance the parser to the byte
668    /// immediately following the last contiguous decimal digit.
669    ///
670    /// If no decimal digit could be found or if there was a problem parsing
671    /// the complete set of digits into a u32, then an error is returned.
672    fn parse_decimal(&self) -> Result<u32, Error> {
673        let mut scratch = String::new();
674        while !self.is_done() && self.char().is_whitespace() {
675            self.bump();
676        }
677        while !self.is_done() && '0' <= self.char() && self.char() <= '9' {
678            scratch.push(self.char());
679            self.bump_and_bump_space();
680        }
681        while !self.is_done() && self.char().is_whitespace() {
682            self.bump_and_bump_space();
683        }
684        let digits = scratch.as_str();
685        if digits.is_empty() {
686            return Err(Error::new(ERR_DECIMAL_NO_DIGITS));
687        }
688        match u32::from_str_radix(digits, 10).ok() {
689            Some(n) => Ok(n),
690            None => Err(Error::new(ERR_DECIMAL_INVALID)),
691        }
692    }
693
694    /// Parses an uncounted repetition operator. An uncounted repetition
695    /// operator includes `?`, `*` and `+`, but does not include the `{m,n}`
696    /// syntax. The current character should be one of `?`, `*` or `+`. Any
697    /// other character will result in a panic.
698    ///
699    /// This assumes that the parser is currently positioned at the repetition
700    /// operator and advances the parser to the first character after the
701    /// operator. (Note that the operator may include a single additional `?`,
702    /// which makes the operator ungreedy.)
703    ///
704    /// The caller should include the concatenation that is being built. The
705    /// concatenation returned includes the repetition operator applied to the
706    /// last expression in the given concatenation.
707    ///
708    /// If the concatenation is empty, then this returns an error.
709    fn parse_uncounted_repetition(
710        &self,
711        mut concat: Vec<Hir>,
712    ) -> Result<Vec<Hir>, Error> {
713        let sub = match concat.pop() {
714            Some(hir) => Box::new(hir),
715            None => {
716                return Err(Error::new(ERR_UNCOUNTED_REP_SUB_MISSING));
717            }
718        };
719        let (min, max) = match self.char() {
720            '?' => (0, Some(1)),
721            '*' => (0, None),
722            '+' => (1, None),
723            unk => unreachable!("unrecognized repetition operator '{}'", unk),
724        };
725        let mut greedy = true;
726        if self.bump() && self.char() == '?' {
727            greedy = false;
728            self.bump();
729        }
730        if self.flags().swap_greed {
731            greedy = !greedy;
732        }
733        concat.push(Hir::repetition(hir::Repetition {
734            min,
735            max,
736            greedy,
737            sub,
738        }));
739        Ok(concat)
740    }
741
742    /// Parses a counted repetition operation. A counted repetition operator
743    /// corresponds to the `{m,n}` syntax, and does not include the `?`, `*` or
744    /// `+` operators.
745    ///
746    /// This assumes that the parser is currently at the opening `{` and
747    /// advances the parser to the first character after the operator. (Note
748    /// that the operator may include a single additional `?`, which makes the
749    /// operator ungreedy.)
750    ///
751    /// The caller should include the concatenation that is being built. The
752    /// concatenation returned includes the repetition operator applied to the
753    /// last expression in the given concatenation.
754    ///
755    /// If the concatenation is empty, then this returns an error.
756    fn parse_counted_repetition(
757        &self,
758        mut concat: Vec<Hir>,
759    ) -> Result<Vec<Hir>, Error> {
760        assert_eq!(self.char(), '{', "expected opening brace");
761        let sub = match concat.pop() {
762            Some(hir) => Box::new(hir),
763            None => {
764                return Err(Error::new(ERR_COUNTED_REP_SUB_MISSING));
765            }
766        };
767        if !self.bump_and_bump_space() {
768            return Err(Error::new(ERR_COUNTED_REP_UNCLOSED));
769        }
770        let min = self.parse_decimal()?;
771        let mut max = Some(min);
772        if self.is_done() {
773            return Err(Error::new(ERR_COUNTED_REP_MIN_UNCLOSED));
774        }
775        if self.char() == ',' {
776            if !self.bump_and_bump_space() {
777                return Err(Error::new(ERR_COUNTED_REP_COMMA_UNCLOSED));
778            }
779            if self.char() != '}' {
780                max = Some(self.parse_decimal()?);
781            } else {
782                max = None;
783            }
784            if self.is_done() {
785                return Err(Error::new(ERR_COUNTED_REP_MIN_MAX_UNCLOSED));
786            }
787        }
788        if self.char() != '}' {
789            return Err(Error::new(ERR_COUNTED_REP_INVALID));
790        }
791
792        let mut greedy = true;
793        if self.bump_and_bump_space() && self.char() == '?' {
794            greedy = false;
795            self.bump();
796        }
797        if self.flags().swap_greed {
798            greedy = !greedy;
799        }
800
801        if max.map_or(false, |max| min > max) {
802            return Err(Error::new(ERR_COUNTED_REP_INVALID_RANGE));
803        }
804        concat.push(Hir::repetition(hir::Repetition {
805            min,
806            max,
807            greedy,
808            sub,
809        }));
810        Ok(concat)
811    }
812
813    /// Parses the part of a pattern that starts with a `(`. This is usually
814    /// a group sub-expression, but might just be a directive that enables
815    /// (or disables) certain flags.
816    ///
817    /// This assumes the parser is pointing at the opening `(`.
818    fn parse_group(&self) -> Result<Option<Hir>, Error> {
819        assert_eq!(self.char(), '(');
820        self.bump_and_bump_space();
821        if self.is_lookaround_prefix() {
822            return Err(Error::new(ERR_LOOK_UNSUPPORTED));
823        }
824        if self.bump_if("?P<") || self.bump_if("?<") {
825            let index = self.next_capture_index()?;
826            let name = Some(Box::from(self.parse_capture_name()?));
827            let sub = Box::new(self.parse_inner()?);
828            let cap = hir::Capture { index, name, sub };
829            Ok(Some(Hir::capture(cap)))
830        } else if self.bump_if("?") {
831            if self.is_done() {
832                return Err(Error::new(ERR_UNCLOSED_GROUP_QUESTION));
833            }
834            let start = self.pos();
835            // The flags get reset in the top-level 'parse' routine.
836            *self.flags.borrow_mut() = self.parse_flags()?;
837            let consumed = self.pos() - start;
838            if self.char() == ')' {
839                // We don't allow empty flags, e.g., `(?)`.
840                if consumed == 0 {
841                    return Err(Error::new(ERR_EMPTY_FLAGS));
842                }
843                Ok(None)
844            } else {
845                assert_eq!(':', self.char());
846                self.bump();
847                self.parse_inner().map(Some)
848            }
849        } else {
850            let index = self.next_capture_index()?;
851            let sub = Box::new(self.parse_inner()?);
852            let cap = hir::Capture { index, name: None, sub };
853            Ok(Some(Hir::capture(cap)))
854        }
855    }
856
857    /// Parses a capture group name. Assumes that the parser is positioned at
858    /// the first character in the name following the opening `<` (and may
859    /// possibly be EOF). This advances the parser to the first character
860    /// following the closing `>`.
861    fn parse_capture_name(&self) -> Result<&str, Error> {
862        if self.is_done() {
863            return Err(Error::new(ERR_MISSING_GROUP_NAME));
864        }
865        let start = self.pos();
866        loop {
867            if self.char() == '>' {
868                break;
869            }
870            if !is_capture_char(self.char(), self.pos() == start) {
871                return Err(Error::new(ERR_INVALID_GROUP_NAME));
872            }
873            if !self.bump() {
874                break;
875            }
876        }
877        let end = self.pos();
878        if self.is_done() {
879            return Err(Error::new(ERR_UNCLOSED_GROUP_NAME));
880        }
881        assert_eq!(self.char(), '>');
882        self.bump();
883        let name = &self.pattern()[start..end];
884        if name.is_empty() {
885            return Err(Error::new(ERR_EMPTY_GROUP_NAME));
886        }
887        self.add_capture_name(name)?;
888        Ok(name)
889    }
890
891    /// Parse a sequence of flags starting at the current character.
892    ///
893    /// This advances the parser to the character immediately following the
894    /// flags, which is guaranteed to be either `:` or `)`.
895    ///
896    /// # Errors
897    ///
898    /// If any flags are duplicated, then an error is returned.
899    ///
900    /// If the negation operator is used more than once, then an error is
901    /// returned.
902    ///
903    /// If no flags could be found or if the negation operation is not followed
904    /// by any flags, then an error is returned.
905    fn parse_flags(&self) -> Result<Flags, Error> {
906        let mut flags = *self.flags.borrow();
907        let mut negate = false;
908        // Keeps track of whether the previous flag item was a '-'. We use this
909        // to detect whether there is a dangling '-', which is invalid.
910        let mut last_was_negation = false;
911        // A set to keep track of the flags we've seen. Since all flags are
912        // ASCII, we only need 128 bytes.
913        let mut seen = [false; 128];
914        while self.char() != ':' && self.char() != ')' {
915            if self.char() == '-' {
916                last_was_negation = true;
917                if negate {
918                    return Err(Error::new(ERR_FLAG_REPEATED_NEGATION));
919                }
920                negate = true;
921            } else {
922                last_was_negation = false;
923                self.parse_flag(&mut flags, negate)?;
924                // OK because every valid flag is ASCII, and we're only here if
925                // the flag is valid.
926                let flag_byte = u8::try_from(self.char()).unwrap();
927                if seen[usize::from(flag_byte)] {
928                    return Err(Error::new(ERR_FLAG_DUPLICATE));
929                }
930                seen[usize::from(flag_byte)] = true;
931            }
932            if !self.bump() {
933                return Err(Error::new(ERR_FLAG_UNEXPECTED_EOF));
934            }
935        }
936        if last_was_negation {
937            return Err(Error::new(ERR_FLAG_DANGLING_NEGATION));
938        }
939        Ok(flags)
940    }
941
942    /// Parse the current character as a flag. Do not advance the parser.
943    ///
944    /// This sets the appropriate boolean value in place on the set of flags
945    /// given. The boolean is inverted when `negate` is true.
946    ///
947    /// # Errors
948    ///
949    /// If the flag is not recognized, then an error is returned.
950    fn parse_flag(
951        &self,
952        flags: &mut Flags,
953        negate: bool,
954    ) -> Result<(), Error> {
955        let enabled = !negate;
956        match self.char() {
957            'i' => flags.case_insensitive = enabled,
958            'm' => flags.multi_line = enabled,
959            's' => flags.dot_matches_new_line = enabled,
960            'U' => flags.swap_greed = enabled,
961            'R' => flags.crlf = enabled,
962            'x' => flags.ignore_whitespace = enabled,
963            // We make a special exception for this flag where we let it
964            // through as a recognized flag, but treat it as a no-op. This in
965            // practice retains some compatibility with the regex crate. It is
966            // a little suspect to do this, but for example, '(?-u:\b).+' in
967            // the regex crate is equivalent to '\b.+' in regex-lite.
968            'u' => {}
969            _ => return Err(Error::new(ERR_FLAG_UNRECOGNIZED)),
970        }
971        Ok(())
972    }
973
974    /// Parse a standard character class consisting primarily of characters or
975    /// character ranges.
976    ///
977    /// This assumes the parser is positioned at the opening `[`. If parsing
978    /// is successful, then the parser is advanced to the position immediately
979    /// following the closing `]`.
980    fn parse_class(&self) -> Result<Hir, Error> {
981        assert_eq!(self.char(), '[');
982
983        let mut union = vec![];
984        if !self.bump_and_bump_space() {
985            return Err(Error::new(ERR_CLASS_UNCLOSED));
986        }
987        // Determine whether the class is negated or not.
988        let negate = if self.char() != '^' {
989            false
990        } else {
991            if !self.bump_and_bump_space() {
992                return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_NEGATION));
993            }
994            true
995        };
996        // Accept any number of `-` as literal `-`.
997        while self.char() == '-' {
998            union.push(hir::ClassRange { start: '-', end: '-' });
999            if !self.bump_and_bump_space() {
1000                return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH));
1001            }
1002        }
1003        // If `]` is the *first* char in a set, then interpret it as a literal
1004        // `]`. That is, an empty class is impossible to write.
1005        if union.is_empty() && self.char() == ']' {
1006            union.push(hir::ClassRange { start: ']', end: ']' });
1007            if !self.bump_and_bump_space() {
1008                return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_CLOSING));
1009            }
1010        }
1011        loop {
1012            self.bump_space();
1013            if self.is_done() {
1014                return Err(Error::new(ERR_CLASS_UNCLOSED));
1015            }
1016            match self.char() {
1017                '[' => {
1018                    // Attempt to treat this as the beginning of a POSIX class.
1019                    // If POSIX class parsing fails, then the parser backs up
1020                    // to `[`.
1021                    if let Some(class) = self.maybe_parse_posix_class() {
1022                        union.extend_from_slice(&class.ranges);
1023                        continue;
1024                    }
1025                    // ... otherwise we don't support nested classes.
1026                    return Err(Error::new(ERR_CLASS_NEST_UNSUPPORTED));
1027                }
1028                ']' => {
1029                    self.bump();
1030                    let mut class = hir::Class::new(union);
1031                    // Note that we must apply case folding before negation!
1032                    // Consider `(?i)[^x]`. If we applied negation first, then
1033                    // the result would be the character class that matched any
1034                    // Unicode scalar value.
1035                    if self.flags().case_insensitive {
1036                        class.ascii_case_fold();
1037                    }
1038                    if negate {
1039                        class.negate();
1040                    }
1041                    return Ok(Hir::class(class));
1042                }
1043                '&' if self.peek() == Some('&') => {
1044                    return Err(Error::new(
1045                        ERR_CLASS_INTERSECTION_UNSUPPORTED,
1046                    ));
1047                }
1048                '-' if self.peek() == Some('-') => {
1049                    return Err(Error::new(ERR_CLASS_DIFFERENCE_UNSUPPORTED));
1050                }
1051                '~' if self.peek() == Some('~') => {
1052                    return Err(Error::new(
1053                        ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED,
1054                    ));
1055                }
1056                _ => self.parse_class_range(&mut union)?,
1057            }
1058        }
1059    }
1060
1061    /// Parse a single primitive item in a character class set. The item to
1062    /// be parsed can either be one of a simple literal character, a range
1063    /// between two simple literal characters or a "primitive" character
1064    /// class like `\w`.
1065    ///
1066    /// If an invalid escape is found, or if a character class is found where
1067    /// a simple literal is expected (e.g., in a range), then an error is
1068    /// returned.
1069    ///
1070    /// Otherwise, the range (or ranges) are appended to the given union of
1071    /// ranges.
1072    fn parse_class_range(
1073        &self,
1074        union: &mut Vec<hir::ClassRange>,
1075    ) -> Result<(), Error> {
1076        let prim1 = self.parse_class_item()?;
1077        self.bump_space();
1078        if self.is_done() {
1079            return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_ITEM));
1080        }
1081        // If the next char isn't a `-`, then we don't have a range.
1082        // There are two exceptions. If the char after a `-` is a `]`, then
1083        // `-` is interpreted as a literal `-`. Alternatively, if the char
1084        // after a `-` is a `-`, then `--` corresponds to a "difference"
1085        // operation. (Which we don't support in regex-lite, but error about
1086        // specifically in an effort to be loud about differences between the
1087        // main regex crate where possible.)
1088        if self.char() != '-'
1089            || self.peek_space() == Some(']')
1090            || self.peek_space() == Some('-')
1091        {
1092            union.extend_from_slice(&into_class_item_ranges(prim1)?);
1093            return Ok(());
1094        }
1095        // OK, now we're parsing a range, so bump past the `-` and parse the
1096        // second half of the range.
1097        if !self.bump_and_bump_space() {
1098            return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH));
1099        }
1100        let prim2 = self.parse_class_item()?;
1101        let range = hir::ClassRange {
1102            start: into_class_item_range(prim1)?,
1103            end: into_class_item_range(prim2)?,
1104        };
1105        if range.start > range.end {
1106            return Err(Error::new(ERR_CLASS_INVALID_RANGE));
1107        }
1108        union.push(range);
1109        Ok(())
1110    }
1111
1112    /// Parse a single item in a character class as a primitive, where the
1113    /// primitive either consists of a verbatim literal or a single escape
1114    /// sequence.
1115    ///
1116    /// This assumes the parser is positioned at the beginning of a primitive,
1117    /// and advances the parser to the first position after the primitive if
1118    /// successful.
1119    ///
1120    /// Note that it is the caller's responsibility to report an error if an
1121    /// illegal primitive was parsed.
1122    fn parse_class_item(&self) -> Result<Hir, Error> {
1123        let ch = self.char();
1124        self.bump();
1125        if ch == '\\' {
1126            self.parse_escape()
1127        } else {
1128            Ok(Hir::char(ch))
1129        }
1130    }
1131
1132    /// Attempt to parse a POSIX character class, e.g., `[:alnum:]`.
1133    ///
1134    /// This assumes the parser is positioned at the opening `[`.
1135    ///
1136    /// If no valid POSIX character class could be found, then this does not
1137    /// advance the parser and `None` is returned. Otherwise, the parser is
1138    /// advanced to the first byte following the closing `]` and the
1139    /// corresponding POSIX class is returned.
1140    fn maybe_parse_posix_class(&self) -> Option<hir::Class> {
1141        // POSIX character classes are interesting from a parsing perspective
1142        // because parsing cannot fail with any interesting error. For example,
1143        // in order to use an POSIX character class, it must be enclosed in
1144        // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
1145        // of it as "POSIX character classes have the syntax `[:NAME:]` which
1146        // can only appear within character brackets." This means that things
1147        // like `[[:lower:]A]` are legal constructs.
1148        //
1149        // However, if one types an incorrect POSIX character class, e.g.,
1150        // `[[:loower:]]`, then we treat that as if it were normal nested
1151        // character class containing the characters `:elorw`. (Which isn't
1152        // supported and results in an error in regex-lite.) One might argue
1153        // that we should return an error instead since the repeated colons
1154        // give away the intent to write an POSIX class. But what if the user
1155        // typed `[[:lower]]` instead? How can we tell that was intended to be
1156        // a POSXI class and not just a normal nested class?
1157        //
1158        // Reasonable people can probably disagree over this, but for better
1159        // or worse, we implement semantics that never fails at the expense of
1160        // better failure modes.
1161        assert_eq!(self.char(), '[');
1162
1163        // If parsing fails, then we back up the parser to this starting point.
1164        let start_pos = self.pos();
1165        let start_char = self.char.get();
1166        let reset = || {
1167            self.pos.set(start_pos);
1168            self.char.set(start_char);
1169        };
1170
1171        let mut negated = false;
1172        if !self.bump() || self.char() != ':' {
1173            reset();
1174            return None;
1175        }
1176        if !self.bump() {
1177            reset();
1178            return None;
1179        }
1180        if self.char() == '^' {
1181            negated = true;
1182            if !self.bump() {
1183                reset();
1184                return None;
1185            }
1186        }
1187        let name_start = self.pos();
1188        while self.char() != ':' && self.bump() {}
1189        if self.is_done() {
1190            reset();
1191            return None;
1192        }
1193        let name = &self.pattern()[name_start..self.pos()];
1194        if !self.bump_if(":]") {
1195            reset();
1196            return None;
1197        }
1198        if let Ok(ranges) = posix_class(name) {
1199            let mut class = hir::Class::new(ranges);
1200            if negated {
1201                class.negate();
1202            }
1203            return Some(class);
1204        }
1205        reset();
1206        None
1207    }
1208
1209    /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the
1210    /// parser is currently at a valid character class name and will be
1211    /// advanced to the character immediately following the class.
1212    fn parse_perl_class(&self) -> Hir {
1213        let ch = self.char();
1214        self.bump();
1215        let mut class = hir::Class::new(match ch {
1216            'd' | 'D' => posix_class("digit").unwrap(),
1217            's' | 'S' => posix_class("space").unwrap(),
1218            'w' | 'W' => posix_class("word").unwrap(),
1219            unk => unreachable!("invalid Perl class \\{}", unk),
1220        });
1221        if ch.is_ascii_uppercase() {
1222            class.negate();
1223        }
1224        Hir::class(class)
1225    }
1226
1227    fn hir_dot(&self) -> Hir {
1228        if self.flags().dot_matches_new_line {
1229            Hir::class(hir::Class::new([hir::ClassRange {
1230                start: '\x00',
1231                end: '\u{10FFFF}',
1232            }]))
1233        } else if self.flags().crlf {
1234            Hir::class(hir::Class::new([
1235                hir::ClassRange { start: '\x00', end: '\x09' },
1236                hir::ClassRange { start: '\x0B', end: '\x0C' },
1237                hir::ClassRange { start: '\x0E', end: '\u{10FFFF}' },
1238            ]))
1239        } else {
1240            Hir::class(hir::Class::new([
1241                hir::ClassRange { start: '\x00', end: '\x09' },
1242                hir::ClassRange { start: '\x0B', end: '\u{10FFFF}' },
1243            ]))
1244        }
1245    }
1246
1247    fn hir_anchor_start(&self) -> Hir {
1248        let look = if self.flags().multi_line {
1249            if self.flags().crlf {
1250                hir::Look::StartCRLF
1251            } else {
1252                hir::Look::StartLF
1253            }
1254        } else {
1255            hir::Look::Start
1256        };
1257        Hir::look(look)
1258    }
1259
1260    fn hir_anchor_end(&self) -> Hir {
1261        let look = if self.flags().multi_line {
1262            if self.flags().crlf {
1263                hir::Look::EndCRLF
1264            } else {
1265                hir::Look::EndLF
1266            }
1267        } else {
1268            hir::Look::End
1269        };
1270        Hir::look(look)
1271    }
1272
1273    fn hir_char(&self, ch: char) -> Hir {
1274        if self.flags().case_insensitive {
1275            let this = hir::ClassRange { start: ch, end: ch };
1276            if let Some(folded) = this.ascii_case_fold() {
1277                return Hir::class(hir::Class::new([this, folded]));
1278            }
1279        }
1280        Hir::char(ch)
1281    }
1282}
1283
1284/// This checks the depth of the given `Hir` value, and if it exceeds the given
1285/// limit, then an error is returned.
1286fn check_hir_nesting(hir: &Hir, limit: u32) -> Result<(), Error> {
1287    fn recurse(hir: &Hir, limit: u32, depth: u32) -> Result<(), Error> {
1288        if depth > limit {
1289            return Err(Error::new(ERR_TOO_MUCH_NESTING));
1290        }
1291        let Some(next_depth) = depth.checked_add(1) else {
1292            return Err(Error::new(ERR_TOO_MUCH_NESTING));
1293        };
1294        match *hir.kind() {
1295            HirKind::Empty
1296            | HirKind::Char(_)
1297            | HirKind::Class(_)
1298            | HirKind::Look(_) => Ok(()),
1299            HirKind::Repetition(hir::Repetition { ref sub, .. }) => {
1300                recurse(sub, limit, next_depth)
1301            }
1302            HirKind::Capture(hir::Capture { ref sub, .. }) => {
1303                recurse(sub, limit, next_depth)
1304            }
1305            HirKind::Concat(ref subs) | HirKind::Alternation(ref subs) => {
1306                for sub in subs.iter() {
1307                    recurse(sub, limit, next_depth)?;
1308                }
1309                Ok(())
1310            }
1311        }
1312    }
1313    recurse(hir, limit, 0)
1314}
1315
1316/// Converts the given Hir to a literal char if the Hir is just a single
1317/// character. Otherwise this returns an error.
1318///
1319/// This is useful in contexts where you can only accept a single character,
1320/// but where it is convenient to parse something more general. For example,
1321/// parsing a single part of a character class range. It's useful to reuse
1322/// the literal parsing code, but that code can itself return entire classes
1323/// which can't be used as the start/end of a class range.
1324fn into_class_item_range(hir: Hir) -> Result<char, Error> {
1325    match hir.kind {
1326        HirKind::Char(ch) => Ok(ch),
1327        _ => Err(Error::new(ERR_CLASS_INVALID_RANGE_ITEM)),
1328    }
1329}
1330
1331fn into_class_item_ranges(
1332    mut hir: Hir,
1333) -> Result<Vec<hir::ClassRange>, Error> {
1334    match core::mem::replace(&mut hir.kind, HirKind::Empty) {
1335        HirKind::Char(ch) => Ok(vec![hir::ClassRange { start: ch, end: ch }]),
1336        HirKind::Class(hir::Class { ranges }) => Ok(ranges),
1337        _ => Err(Error::new(ERR_CLASS_INVALID_ITEM)),
1338    }
1339}
1340
1341/// Returns an iterator of character class ranges for the given named POSIX
1342/// character class. If no such character class exists for the name given, then
1343/// an error is returned.
1344fn posix_class(
1345    kind: &str,
1346) -> Result<impl Iterator<Item = hir::ClassRange>, Error> {
1347    let slice: &'static [(u8, u8)] = match kind {
1348        "alnum" => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')],
1349        "alpha" => &[(b'A', b'Z'), (b'a', b'z')],
1350        "ascii" => &[(b'\x00', b'\x7F')],
1351        "blank" => &[(b'\t', b'\t'), (b' ', b' ')],
1352        "cntrl" => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')],
1353        "digit" => &[(b'0', b'9')],
1354        "graph" => &[(b'!', b'~')],
1355        "lower" => &[(b'a', b'z')],
1356        "print" => &[(b' ', b'~')],
1357        "punct" => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')],
1358        "space" => &[
1359            (b'\t', b'\t'),
1360            (b'\n', b'\n'),
1361            (b'\x0B', b'\x0B'),
1362            (b'\x0C', b'\x0C'),
1363            (b'\r', b'\r'),
1364            (b' ', b' '),
1365        ],
1366        "upper" => &[(b'A', b'Z')],
1367        "word" => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')],
1368        "xdigit" => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')],
1369        _ => return Err(Error::new(ERR_POSIX_CLASS_UNRECOGNIZED)),
1370    };
1371    Ok(slice.iter().map(|&(start, end)| hir::ClassRange {
1372        start: char::from(start),
1373        end: char::from(end),
1374    }))
1375}
1376
1377/// Returns true if the given character is a hexadecimal digit.
1378fn is_hex(c: char) -> bool {
1379    ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
1380}
1381
1382/// Returns true if the given character is a valid in a capture group name.
1383///
1384/// If `first` is true, then `c` is treated as the first character in the
1385/// group name (which must be alphabetic or underscore).
1386fn is_capture_char(c: char, first: bool) -> bool {
1387    if first {
1388        c == '_' || c.is_alphabetic()
1389    } else {
1390        c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
1391    }
1392}
1393
1394#[cfg(test)]
1395mod tests {
1396    use super::*;
1397
1398    fn p(pattern: &str) -> Hir {
1399        Parser::new(Config::default(), pattern).parse_inner().unwrap()
1400    }
1401
1402    fn perr(pattern: &str) -> String {
1403        Parser::new(Config::default(), pattern)
1404            .parse_inner()
1405            .unwrap_err()
1406            .to_string()
1407    }
1408
1409    fn class<I: IntoIterator<Item = (char, char)>>(it: I) -> Hir {
1410        Hir::class(hir::Class::new(
1411            it.into_iter().map(|(start, end)| hir::ClassRange { start, end }),
1412        ))
1413    }
1414
1415    fn singles<I: IntoIterator<Item = char>>(it: I) -> Hir {
1416        Hir::class(hir::Class::new(
1417            it.into_iter().map(|ch| hir::ClassRange { start: ch, end: ch }),
1418        ))
1419    }
1420
1421    fn posix(name: &str) -> Hir {
1422        Hir::class(hir::Class::new(posix_class(name).unwrap()))
1423    }
1424
1425    fn cap(index: u32, sub: Hir) -> Hir {
1426        Hir::capture(hir::Capture { index, name: None, sub: Box::new(sub) })
1427    }
1428
1429    fn named_cap(index: u32, name: &str, sub: Hir) -> Hir {
1430        Hir::capture(hir::Capture {
1431            index,
1432            name: Some(Box::from(name)),
1433            sub: Box::new(sub),
1434        })
1435    }
1436
1437    #[test]
1438    fn ok_literal() {
1439        assert_eq!(p("a"), Hir::char('a'));
1440        assert_eq!(p("ab"), Hir::concat(vec![Hir::char('a'), Hir::char('b')]));
1441        assert_eq!(p("💩"), Hir::char('💩'));
1442    }
1443
1444    #[test]
1445    fn ok_meta_escapes() {
1446        assert_eq!(p(r"\*"), Hir::char('*'));
1447        assert_eq!(p(r"\+"), Hir::char('+'));
1448        assert_eq!(p(r"\?"), Hir::char('?'));
1449        assert_eq!(p(r"\|"), Hir::char('|'));
1450        assert_eq!(p(r"\("), Hir::char('('));
1451        assert_eq!(p(r"\)"), Hir::char(')'));
1452        assert_eq!(p(r"\^"), Hir::char('^'));
1453        assert_eq!(p(r"\$"), Hir::char('$'));
1454        assert_eq!(p(r"\["), Hir::char('['));
1455        assert_eq!(p(r"\]"), Hir::char(']'));
1456    }
1457
1458    #[test]
1459    fn ok_special_escapes() {
1460        assert_eq!(p(r"\a"), Hir::char('\x07'));
1461        assert_eq!(p(r"\f"), Hir::char('\x0C'));
1462        assert_eq!(p(r"\t"), Hir::char('\t'));
1463        assert_eq!(p(r"\n"), Hir::char('\n'));
1464        assert_eq!(p(r"\r"), Hir::char('\r'));
1465        assert_eq!(p(r"\v"), Hir::char('\x0B'));
1466        assert_eq!(p(r"\A"), Hir::look(hir::Look::Start));
1467        assert_eq!(p(r"\z"), Hir::look(hir::Look::End));
1468        assert_eq!(p(r"\b"), Hir::look(hir::Look::Word));
1469        assert_eq!(p(r"\B"), Hir::look(hir::Look::WordNegate));
1470    }
1471
1472    #[test]
1473    fn ok_hex() {
1474        // fixed length
1475        assert_eq!(p(r"\x41"), Hir::char('A'));
1476        assert_eq!(p(r"\u2603"), Hir::char('☃'));
1477        assert_eq!(p(r"\U0001F4A9"), Hir::char('💩'));
1478        // braces
1479        assert_eq!(p(r"\x{1F4A9}"), Hir::char('💩'));
1480        assert_eq!(p(r"\u{1F4A9}"), Hir::char('💩'));
1481        assert_eq!(p(r"\U{1F4A9}"), Hir::char('💩'));
1482    }
1483
1484    #[test]
1485    fn ok_perl() {
1486        assert_eq!(p(r"\d"), posix("digit"));
1487        assert_eq!(p(r"\s"), posix("space"));
1488        assert_eq!(p(r"\w"), posix("word"));
1489
1490        let negated = |name| {
1491            let mut class = hir::Class::new(posix_class(name).unwrap());
1492            class.negate();
1493            Hir::class(class)
1494        };
1495        assert_eq!(p(r"\D"), negated("digit"));
1496        assert_eq!(p(r"\S"), negated("space"));
1497        assert_eq!(p(r"\W"), negated("word"));
1498    }
1499
1500    #[test]
1501    fn ok_flags_and_primitives() {
1502        assert_eq!(p(r"a"), Hir::char('a'));
1503        assert_eq!(p(r"(?i:a)"), singles(['A', 'a']));
1504
1505        assert_eq!(p(r"^"), Hir::look(hir::Look::Start));
1506        assert_eq!(p(r"(?m:^)"), Hir::look(hir::Look::StartLF));
1507        assert_eq!(p(r"(?mR:^)"), Hir::look(hir::Look::StartCRLF));
1508
1509        assert_eq!(p(r"$"), Hir::look(hir::Look::End));
1510        assert_eq!(p(r"(?m:$)"), Hir::look(hir::Look::EndLF));
1511        assert_eq!(p(r"(?mR:$)"), Hir::look(hir::Look::EndCRLF));
1512
1513        assert_eq!(p(r"."), class([('\x00', '\x09'), ('\x0B', '\u{10FFFF}')]));
1514        assert_eq!(
1515            p(r"(?R:.)"),
1516            class([
1517                ('\x00', '\x09'),
1518                ('\x0B', '\x0C'),
1519                ('\x0E', '\u{10FFFF}'),
1520            ])
1521        );
1522        assert_eq!(p(r"(?s:.)"), class([('\x00', '\u{10FFFF}')]));
1523        assert_eq!(p(r"(?sR:.)"), class([('\x00', '\u{10FFFF}')]));
1524    }
1525
1526    #[test]
1527    fn ok_alternate() {
1528        assert_eq!(
1529            p(r"a|b"),
1530            Hir::alternation(vec![Hir::char('a'), Hir::char('b')])
1531        );
1532        assert_eq!(
1533            p(r"(?:a|b)"),
1534            Hir::alternation(vec![Hir::char('a'), Hir::char('b')])
1535        );
1536
1537        assert_eq!(
1538            p(r"(a|b)"),
1539            cap(1, Hir::alternation(vec![Hir::char('a'), Hir::char('b')]))
1540        );
1541        assert_eq!(
1542            p(r"(?<foo>a|b)"),
1543            named_cap(
1544                1,
1545                "foo",
1546                Hir::alternation(vec![Hir::char('a'), Hir::char('b')])
1547            )
1548        );
1549
1550        assert_eq!(
1551            p(r"a|b|c"),
1552            Hir::alternation(vec![
1553                Hir::char('a'),
1554                Hir::char('b'),
1555                Hir::char('c')
1556            ])
1557        );
1558
1559        assert_eq!(
1560            p(r"ax|by|cz"),
1561            Hir::alternation(vec![
1562                Hir::concat(vec![Hir::char('a'), Hir::char('x')]),
1563                Hir::concat(vec![Hir::char('b'), Hir::char('y')]),
1564                Hir::concat(vec![Hir::char('c'), Hir::char('z')]),
1565            ])
1566        );
1567        assert_eq!(
1568            p(r"(ax|(by|(cz)))"),
1569            cap(
1570                1,
1571                Hir::alternation(vec![
1572                    Hir::concat(vec![Hir::char('a'), Hir::char('x')]),
1573                    cap(
1574                        2,
1575                        Hir::alternation(vec![
1576                            Hir::concat(vec![Hir::char('b'), Hir::char('y')]),
1577                            cap(
1578                                3,
1579                                Hir::concat(vec![
1580                                    Hir::char('c'),
1581                                    Hir::char('z')
1582                                ])
1583                            ),
1584                        ])
1585                    ),
1586                ])
1587            )
1588        );
1589
1590        assert_eq!(
1591            p(r"|"),
1592            Hir::alternation(vec![Hir::empty(), Hir::empty()])
1593        );
1594        assert_eq!(
1595            p(r"||"),
1596            Hir::alternation(vec![Hir::empty(), Hir::empty(), Hir::empty()])
1597        );
1598
1599        assert_eq!(
1600            p(r"a|"),
1601            Hir::alternation(vec![Hir::char('a'), Hir::empty()])
1602        );
1603        assert_eq!(
1604            p(r"|a"),
1605            Hir::alternation(vec![Hir::empty(), Hir::char('a')])
1606        );
1607
1608        assert_eq!(
1609            p(r"(|)"),
1610            cap(1, Hir::alternation(vec![Hir::empty(), Hir::empty()]))
1611        );
1612        assert_eq!(
1613            p(r"(a|)"),
1614            cap(1, Hir::alternation(vec![Hir::char('a'), Hir::empty()]))
1615        );
1616        assert_eq!(
1617            p(r"(|a)"),
1618            cap(1, Hir::alternation(vec![Hir::empty(), Hir::char('a')]))
1619        );
1620    }
1621
1622    #[test]
1623    fn ok_flag_group() {
1624        assert_eq!(
1625            p("a(?i:b)"),
1626            Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])])
1627        );
1628    }
1629
1630    #[test]
1631    fn ok_flag_directive() {
1632        assert_eq!(p("(?i)a"), singles(['A', 'a']));
1633        assert_eq!(p("a(?i)"), Hir::char('a'));
1634        assert_eq!(
1635            p("a(?i)b"),
1636            Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])])
1637        );
1638        assert_eq!(
1639            p("a(?i)a(?-i)a"),
1640            Hir::concat(vec![
1641                Hir::char('a'),
1642                singles(['A', 'a']),
1643                Hir::char('a'),
1644            ])
1645        );
1646        assert_eq!(
1647            p("a(?:(?i)a)a"),
1648            Hir::concat(vec![
1649                Hir::char('a'),
1650                singles(['A', 'a']),
1651                Hir::char('a'),
1652            ])
1653        );
1654        assert_eq!(
1655            p("a((?i)a)a"),
1656            Hir::concat(vec![
1657                Hir::char('a'),
1658                cap(1, singles(['A', 'a'])),
1659                Hir::char('a'),
1660            ])
1661        );
1662    }
1663
1664    #[test]
1665    fn ok_uncounted_repetition() {
1666        assert_eq!(
1667            p(r"a?"),
1668            Hir::repetition(hir::Repetition {
1669                min: 0,
1670                max: Some(1),
1671                greedy: true,
1672                sub: Box::new(Hir::char('a')),
1673            }),
1674        );
1675        assert_eq!(
1676            p(r"a*"),
1677            Hir::repetition(hir::Repetition {
1678                min: 0,
1679                max: None,
1680                greedy: true,
1681                sub: Box::new(Hir::char('a')),
1682            }),
1683        );
1684        assert_eq!(
1685            p(r"a+"),
1686            Hir::repetition(hir::Repetition {
1687                min: 1,
1688                max: None,
1689                greedy: true,
1690                sub: Box::new(Hir::char('a')),
1691            }),
1692        );
1693
1694        assert_eq!(
1695            p(r"a??"),
1696            Hir::repetition(hir::Repetition {
1697                min: 0,
1698                max: Some(1),
1699                greedy: false,
1700                sub: Box::new(Hir::char('a')),
1701            }),
1702        );
1703        assert_eq!(
1704            p(r"a*?"),
1705            Hir::repetition(hir::Repetition {
1706                min: 0,
1707                max: None,
1708                greedy: false,
1709                sub: Box::new(Hir::char('a')),
1710            }),
1711        );
1712        assert_eq!(
1713            p(r"a+?"),
1714            Hir::repetition(hir::Repetition {
1715                min: 1,
1716                max: None,
1717                greedy: false,
1718                sub: Box::new(Hir::char('a')),
1719            }),
1720        );
1721
1722        assert_eq!(
1723            p(r"a?b"),
1724            Hir::concat(vec![
1725                Hir::repetition(hir::Repetition {
1726                    min: 0,
1727                    max: Some(1),
1728                    greedy: true,
1729                    sub: Box::new(Hir::char('a')),
1730                }),
1731                Hir::char('b'),
1732            ]),
1733        );
1734
1735        assert_eq!(
1736            p(r"ab?"),
1737            Hir::concat(vec![
1738                Hir::char('a'),
1739                Hir::repetition(hir::Repetition {
1740                    min: 0,
1741                    max: Some(1),
1742                    greedy: true,
1743                    sub: Box::new(Hir::char('b')),
1744                }),
1745            ]),
1746        );
1747
1748        assert_eq!(
1749            p(r"(?:ab)?"),
1750            Hir::repetition(hir::Repetition {
1751                min: 0,
1752                max: Some(1),
1753                greedy: true,
1754                sub: Box::new(Hir::concat(vec![
1755                    Hir::char('a'),
1756                    Hir::char('b')
1757                ])),
1758            }),
1759        );
1760
1761        assert_eq!(
1762            p(r"(ab)?"),
1763            Hir::repetition(hir::Repetition {
1764                min: 0,
1765                max: Some(1),
1766                greedy: true,
1767                sub: Box::new(cap(
1768                    1,
1769                    Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1770                )),
1771            }),
1772        );
1773
1774        assert_eq!(
1775            p(r"|a?"),
1776            Hir::alternation(vec![
1777                Hir::empty(),
1778                Hir::repetition(hir::Repetition {
1779                    min: 0,
1780                    max: Some(1),
1781                    greedy: true,
1782                    sub: Box::new(Hir::char('a')),
1783                })
1784            ]),
1785        );
1786    }
1787
1788    #[test]
1789    fn ok_counted_repetition() {
1790        assert_eq!(
1791            p(r"a{5}"),
1792            Hir::repetition(hir::Repetition {
1793                min: 5,
1794                max: Some(5),
1795                greedy: true,
1796                sub: Box::new(Hir::char('a')),
1797            }),
1798        );
1799        assert_eq!(
1800            p(r"a{5}?"),
1801            Hir::repetition(hir::Repetition {
1802                min: 5,
1803                max: Some(5),
1804                greedy: false,
1805                sub: Box::new(Hir::char('a')),
1806            }),
1807        );
1808
1809        assert_eq!(
1810            p(r"a{5,}"),
1811            Hir::repetition(hir::Repetition {
1812                min: 5,
1813                max: None,
1814                greedy: true,
1815                sub: Box::new(Hir::char('a')),
1816            }),
1817        );
1818
1819        assert_eq!(
1820            p(r"a{5,9}"),
1821            Hir::repetition(hir::Repetition {
1822                min: 5,
1823                max: Some(9),
1824                greedy: true,
1825                sub: Box::new(Hir::char('a')),
1826            }),
1827        );
1828
1829        assert_eq!(
1830            p(r"ab{5}c"),
1831            Hir::concat(vec![
1832                Hir::char('a'),
1833                Hir::repetition(hir::Repetition {
1834                    min: 5,
1835                    max: Some(5),
1836                    greedy: true,
1837                    sub: Box::new(Hir::char('b')),
1838                }),
1839                Hir::char('c'),
1840            ]),
1841        );
1842
1843        assert_eq!(
1844            p(r"a{ 5 }"),
1845            Hir::repetition(hir::Repetition {
1846                min: 5,
1847                max: Some(5),
1848                greedy: true,
1849                sub: Box::new(Hir::char('a')),
1850            }),
1851        );
1852        assert_eq!(
1853            p(r"a{ 5 , 9 }"),
1854            Hir::repetition(hir::Repetition {
1855                min: 5,
1856                max: Some(9),
1857                greedy: true,
1858                sub: Box::new(Hir::char('a')),
1859            }),
1860        );
1861    }
1862
1863    #[test]
1864    fn ok_group_unnamed() {
1865        assert_eq!(p("(a)"), cap(1, Hir::char('a')));
1866        assert_eq!(
1867            p("(ab)"),
1868            cap(1, Hir::concat(vec![Hir::char('a'), Hir::char('b')]))
1869        );
1870    }
1871
1872    #[test]
1873    fn ok_group_named() {
1874        assert_eq!(p("(?P<foo>a)"), named_cap(1, "foo", Hir::char('a')));
1875        assert_eq!(p("(?<foo>a)"), named_cap(1, "foo", Hir::char('a')));
1876
1877        assert_eq!(
1878            p("(?P<foo>ab)"),
1879            named_cap(
1880                1,
1881                "foo",
1882                Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1883            )
1884        );
1885        assert_eq!(
1886            p("(?<foo>ab)"),
1887            named_cap(
1888                1,
1889                "foo",
1890                Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1891            )
1892        );
1893
1894        assert_eq!(p(r"(?<a>z)"), named_cap(1, "a", Hir::char('z')));
1895        assert_eq!(p(r"(?P<a>z)"), named_cap(1, "a", Hir::char('z')));
1896
1897        assert_eq!(p(r"(?<a_1>z)"), named_cap(1, "a_1", Hir::char('z')));
1898        assert_eq!(p(r"(?P<a_1>z)"), named_cap(1, "a_1", Hir::char('z')));
1899
1900        assert_eq!(p(r"(?<a.1>z)"), named_cap(1, "a.1", Hir::char('z')));
1901        assert_eq!(p(r"(?P<a.1>z)"), named_cap(1, "a.1", Hir::char('z')));
1902
1903        assert_eq!(p(r"(?<a[1]>z)"), named_cap(1, "a[1]", Hir::char('z')));
1904        assert_eq!(p(r"(?P<a[1]>z)"), named_cap(1, "a[1]", Hir::char('z')));
1905
1906        assert_eq!(p(r"(?<a¾>z)"), named_cap(1, "a¾", Hir::char('z')));
1907        assert_eq!(p(r"(?P<a¾>z)"), named_cap(1, "a¾", Hir::char('z')));
1908
1909        assert_eq!(p(r"(?<名字>z)"), named_cap(1, "名字", Hir::char('z')));
1910        assert_eq!(p(r"(?P<名字>z)"), named_cap(1, "名字", Hir::char('z')));
1911    }
1912
1913    #[test]
1914    fn ok_class() {
1915        assert_eq!(p(r"[a]"), singles(['a']));
1916        assert_eq!(p(r"[a\]]"), singles(['a', ']']));
1917        assert_eq!(p(r"[a\-z]"), singles(['a', '-', 'z']));
1918        assert_eq!(p(r"[ab]"), class([('a', 'b')]));
1919        assert_eq!(p(r"[a-]"), singles(['a', '-']));
1920        assert_eq!(p(r"[-a]"), singles(['a', '-']));
1921        assert_eq!(p(r"[--a]"), singles(['a', '-']));
1922        assert_eq!(p(r"[---a]"), singles(['a', '-']));
1923        assert_eq!(p(r"[[:alnum:]]"), posix("alnum"));
1924        assert_eq!(p(r"[\w]"), posix("word"));
1925        assert_eq!(p(r"[a\wz]"), posix("word"));
1926        assert_eq!(p(r"[\s\S]"), class([('\x00', '\u{10FFFF}')]));
1927        assert_eq!(p(r"[^\s\S]"), Hir::fail());
1928        assert_eq!(p(r"[a-cx-z]"), class([('a', 'c'), ('x', 'z')]));
1929        assert_eq!(p(r"[☃-⛄]"), class([('☃', '⛄')]));
1930        assert_eq!(p(r"[]]"), singles([']']));
1931        assert_eq!(p(r"[]a]"), singles([']', 'a']));
1932        assert_eq!(p(r"[]\[]"), singles(['[', ']']));
1933        assert_eq!(p(r"[\[]"), singles(['[']));
1934
1935        assert_eq!(p(r"(?i)[a]"), singles(['A', 'a']));
1936        assert_eq!(p(r"(?i)[A]"), singles(['A', 'a']));
1937        assert_eq!(p(r"(?i)[k]"), singles(['K', 'k']));
1938        assert_eq!(p(r"(?i)[s]"), singles(['S', 's']));
1939        assert_eq!(p(r"(?i)[β]"), singles(['β']));
1940
1941        assert_eq!(p(r"[^^]"), class([('\x00', ']'), ('_', '\u{10FFFF}')]));
1942        assert_eq!(
1943            p(r"[^-a]"),
1944            class([('\x00', ','), ('.', '`'), ('b', '\u{10FFFF}')])
1945        );
1946
1947        assert_eq!(
1948            p(r"[-]a]"),
1949            Hir::concat(vec![singles(['-']), Hir::char('a'), Hir::char(']')])
1950        );
1951    }
1952
1953    #[test]
1954    fn ok_verbatim() {
1955        assert_eq!(
1956            p(r"(?x)a{5,9} ?"),
1957            Hir::repetition(hir::Repetition {
1958                min: 5,
1959                max: Some(9),
1960                greedy: false,
1961                sub: Box::new(Hir::char('a')),
1962            })
1963        );
1964        assert_eq!(p(r"(?x)[   a]"), singles(['a']));
1965        assert_eq!(
1966            p(r"(?x)[ ^  a]"),
1967            class([('\x00', '`'), ('b', '\u{10FFFF}')])
1968        );
1969        assert_eq!(p(r"(?x)[ - a]"), singles(['a', '-']));
1970        assert_eq!(p(r"(?x)[ ] a]"), singles([']', 'a']));
1971
1972        assert_eq!(
1973            p(r"(?x)a b"),
1974            Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1975        );
1976        assert_eq!(
1977            p(r"(?x)a b(?-x)a b"),
1978            Hir::concat(vec![
1979                Hir::char('a'),
1980                Hir::char('b'),
1981                Hir::char('a'),
1982                Hir::char(' '),
1983                Hir::char('b'),
1984            ])
1985        );
1986        assert_eq!(
1987            p(r"a (?x:a )a "),
1988            Hir::concat(vec![
1989                Hir::char('a'),
1990                Hir::char(' '),
1991                Hir::char('a'),
1992                Hir::char('a'),
1993                Hir::char(' '),
1994            ])
1995        );
1996        assert_eq!(
1997            p(r"(?x)( ?P<foo> a )"),
1998            named_cap(1, "foo", Hir::char('a')),
1999        );
2000        assert_eq!(p(r"(?x)(  a )"), cap(1, Hir::char('a')));
2001        assert_eq!(p(r"(?x)(   ?:  a )"), Hir::char('a'));
2002        assert_eq!(p(r"(?x)\x { 53 }"), Hir::char('\x53'));
2003        assert_eq!(p(r"(?x)\ "), Hir::char(' '));
2004    }
2005
2006    #[test]
2007    fn ok_comments() {
2008        let pat = "(?x)
2009# This is comment 1.
2010foo # This is comment 2.
2011  # This is comment 3.
2012bar
2013# This is comment 4.";
2014        assert_eq!(
2015            p(pat),
2016            Hir::concat(vec![
2017                Hir::char('f'),
2018                Hir::char('o'),
2019                Hir::char('o'),
2020                Hir::char('b'),
2021                Hir::char('a'),
2022                Hir::char('r'),
2023            ])
2024        );
2025    }
2026
2027    #[test]
2028    fn err_standard() {
2029        assert_eq!(
2030            ERR_TOO_MUCH_NESTING,
2031            perr("(((((((((((((((((((((((((((((((((((((((((((((((((((a)))))))))))))))))))))))))))))))))))))))))))))))))))"),
2032        );
2033        // This one is tricky, because the only way it can happen is if the
2034        // number of captures overflows u32. Perhaps we should allow setting a
2035        // lower limit?
2036        // assert_eq!(ERR_TOO_MANY_CAPTURES, perr(""));
2037        assert_eq!(ERR_DUPLICATE_CAPTURE_NAME, perr(r"(?P<a>y)(?P<a>z)"));
2038        assert_eq!(ERR_UNCLOSED_GROUP, perr("("));
2039        assert_eq!(ERR_UNCLOSED_GROUP_QUESTION, perr("(?"));
2040        assert_eq!(ERR_UNOPENED_GROUP, perr(")"));
2041        assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?=a)"));
2042        assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?!a)"));
2043        assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<=a)"));
2044        assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<!a)"));
2045        assert_eq!(ERR_EMPTY_FLAGS, perr(r"(?)"));
2046        assert_eq!(ERR_MISSING_GROUP_NAME, perr(r"(?P<"));
2047        assert_eq!(ERR_MISSING_GROUP_NAME, perr(r"(?<"));
2048        assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?P<1abc>z)"));
2049        assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<1abc>z)"));
2050        assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾>z)"));
2051        assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾a>z)"));
2052        assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<☃>z)"));
2053        assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<a☃>z)"));
2054        assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?P<foo"));
2055        assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?<foo"));
2056        assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?P<>z)"));
2057        assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?<>z)"));
2058        assert_eq!(ERR_FLAG_UNRECOGNIZED, perr(r"(?z:foo)"));
2059        assert_eq!(ERR_FLAG_REPEATED_NEGATION, perr(r"(?s-i-R)"));
2060        assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?isi)"));
2061        assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?is-i)"));
2062        assert_eq!(ERR_FLAG_UNEXPECTED_EOF, perr(r"(?is"));
2063        assert_eq!(ERR_FLAG_DANGLING_NEGATION, perr(r"(?is-:foo)"));
2064        assert_eq!(ERR_HEX_BRACE_INVALID_DIGIT, perr(r"\x{Z}"));
2065        assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{"));
2066        assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{A"));
2067        assert_eq!(ERR_HEX_BRACE_EMPTY, perr(r"\x{}"));
2068        assert_eq!(ERR_HEX_BRACE_INVALID, perr(r"\x{FFFFFFFFFFFFFFFFF}"));
2069        assert_eq!(ERR_HEX_FIXED_UNEXPECTED_EOF, perr(r"\xA"));
2070        assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZ"));
2071        assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZA"));
2072        assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xAZ"));
2073        assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\uD800"));
2074        assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\UFFFFFFFF"));
2075        assert_eq!(ERR_HEX_UNEXPECTED_EOF, perr(r"\x"));
2076        assert_eq!(ERR_ESCAPE_UNEXPECTED_EOF, perr(r"\"));
2077        assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\0"));
2078        assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\1"));
2079        assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\8"));
2080        assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL"));
2081        assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}"));
2082        assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i"));
2083        assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?"));
2084        assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*"));
2085        assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+"));
2086        assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(+)"));
2087        assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"|?"));
2088        assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(?i)?"));
2089        assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"{5}"));
2090        assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"({5})"));
2091        assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"(?i){5}"));
2092        assert_eq!(ERR_COUNTED_REP_UNCLOSED, perr(r"a{"));
2093        assert_eq!(ERR_COUNTED_REP_MIN_UNCLOSED, perr(r"a{5"));
2094        assert_eq!(ERR_COUNTED_REP_COMMA_UNCLOSED, perr(r"a{5,"));
2095        assert_eq!(ERR_COUNTED_REP_MIN_MAX_UNCLOSED, perr(r"a{5,6"));
2096        assert_eq!(ERR_COUNTED_REP_INVALID, perr(r"a{5,6Z"));
2097        assert_eq!(ERR_COUNTED_REP_INVALID_RANGE, perr(r"a{6,5}"));
2098        assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{}"));
2099        assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{]}"));
2100        assert_eq!(ERR_DECIMAL_INVALID, perr(r"a{999999999999999}"));
2101        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"[a"));
2102        assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[\w-a]"));
2103        assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[a-\w]"));
2104        assert_eq!(ERR_CLASS_INVALID_ITEM, perr(r"[\b]"));
2105        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"[a-"));
2106        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_NEGATION, perr(r"[^"));
2107        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_CLOSING, perr(r"[]"));
2108        assert_eq!(ERR_CLASS_INVALID_RANGE, perr(r"[z-a]"));
2109        assert_eq!(ERR_CLASS_UNCLOSED, perr(r"["));
2110        assert_eq!(ERR_CLASS_UNCLOSED, perr(r"[a-z"));
2111        assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[a-z[A-Z]]"));
2112        assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[[:alnum]]"));
2113        assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]"));
2114        assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]"));
2115        assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]"));
2116        assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo"));
2117        assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo!}"));
2118        assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, perr(r"\b{foo}"));
2119        assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"\b{"));
2120        assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"(?x)\b{ "));
2121    }
2122
2123    #[test]
2124    fn err_verbatim() {
2125        // See: https://github.com/rust-lang/regex/issues/792
2126        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[-#]"));
2127        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"(?x)[a "));
2128        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[a- "));
2129        assert_eq!(ERR_CLASS_UNCLOSED, perr(r"(?x)[         "));
2130    }
2131
2132    // This tests a bug fix where the nest limit checker wasn't decrementing
2133    // its depth during post-traversal, which causes long regexes to trip
2134    // the default limit too aggressively.
2135    #[test]
2136    fn regression_454_nest_too_big() {
2137        let pattern = r#"
2138        2(?:
2139          [45]\d{3}|
2140          7(?:
2141            1[0-267]|
2142            2[0-289]|
2143            3[0-29]|
2144            4[01]|
2145            5[1-3]|
2146            6[013]|
2147            7[0178]|
2148            91
2149          )|
2150          8(?:
2151            0[125]|
2152            [139][1-6]|
2153            2[0157-9]|
2154            41|
2155            6[1-35]|
2156            7[1-5]|
2157            8[1-8]|
2158            90
2159          )|
2160          9(?:
2161            0[0-2]|
2162            1[0-4]|
2163            2[568]|
2164            3[3-6]|
2165            5[5-7]|
2166            6[0167]|
2167            7[15]|
2168            8[0146-9]
2169          )
2170        )\d{4}
2171        "#;
2172        p(pattern);
2173    }
2174
2175    // This tests that we treat a trailing `-` in a character class as a
2176    // literal `-` even when whitespace mode is enabled and there is whitespace
2177    // after the trailing `-`.
2178    #[test]
2179    fn regression_455_trailing_dash_ignore_whitespace() {
2180        p("(?x)[ / - ]");
2181        p("(?x)[ a - ]");
2182        p("(?x)[
2183            a
2184            - ]
2185        ");
2186        p("(?x)[
2187            a # wat
2188            - ]
2189        ");
2190
2191        perr("(?x)[ / -");
2192        perr("(?x)[ / - ");
2193        perr(
2194            "(?x)[
2195            / -
2196        ",
2197        );
2198        perr(
2199            "(?x)[
2200            / - # wat
2201        ",
2202        );
2203    }
2204
2205    #[test]
2206    fn regression_capture_indices() {
2207        let got = p(r"(a|ab|c|bcd){4,10}(d*)");
2208        assert_eq!(
2209            got,
2210            Hir::concat(vec![
2211                Hir::repetition(hir::Repetition {
2212                    min: 4,
2213                    max: Some(10),
2214                    greedy: true,
2215                    sub: Box::new(cap(
2216                        1,
2217                        Hir::alternation(vec![
2218                            Hir::char('a'),
2219                            Hir::concat(vec![Hir::char('a'), Hir::char('b')]),
2220                            Hir::char('c'),
2221                            Hir::concat(vec![
2222                                Hir::char('b'),
2223                                Hir::char('c'),
2224                                Hir::char('d')
2225                            ]),
2226                        ])
2227                    ))
2228                }),
2229                cap(
2230                    2,
2231                    Hir::repetition(hir::Repetition {
2232                        min: 0,
2233                        max: None,
2234                        greedy: true,
2235                        sub: Box::new(Hir::char('d')),
2236                    })
2237                ),
2238            ])
2239        );
2240    }
2241}
regex_lite/hir/parse.rs

regex_lite/hir/
parse.rs