fancy_regex/
lib.rs

1// Copyright 2016 The Fancy Regex Authors.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21/*!
22An implementation of regexes, supporting a relatively rich set of features, including backreferences
23and lookaround.
24
It builds on top of the excellent [regex] crate. If you are not
familiar with it, make sure you read its documentation first; you may find that you don't even need fancy-regex.
27
If your regex, or parts of it, do not use any special features, the matching is delegated to the
regex crate. That means it has linear runtime. But if you use "fancy" features such as
30backreferences or look-around, an engine with backtracking needs to be used. In that case, the regex
31can be slow and take exponential time to run because of what is called "catastrophic backtracking".
32This depends on the regex and the input.
33
34# Usage
35
36The API should feel very similar to the regex crate, and involves compiling a regex and then using
37it to find matches in text.
38
39## Example: Matching text
40
41An example with backreferences to check if a text consists of two identical words:
42
43```rust
44use fancy_regex::Regex;
45
46let re = Regex::new(r"^(\w+) (\1)$").unwrap();
47let result = re.is_match("foo foo");
48
49assert!(result.is_ok());
50let did_match = result.unwrap();
51assert!(did_match);
52```
53
54Note that like in the regex crate, the regex needs anchors like `^` and `$` to match against the
55entire input text.
56
57## Example: Finding the position of matches
58
59```rust
60use fancy_regex::Regex;
61
62let re = Regex::new(r"(\d)\1").unwrap();
63let result = re.find("foo 22");
64
65assert!(result.is_ok(), "execution was successful");
66let match_option = result.unwrap();
67
68assert!(match_option.is_some(), "found a match");
69let m = match_option.unwrap();
70
71assert_eq!(m.start(), 4);
72assert_eq!(m.end(), 6);
73assert_eq!(m.as_str(), "22");
74```
75
76## Example: Capturing groups
77
78```rust
79use fancy_regex::Regex;
80
81let re = Regex::new(r"(?<!AU)\$(\d+)").unwrap();
82let result = re.captures("AU$10, $20");
83
84let captures = result.expect("Error running regex").expect("No match found");
85let group = captures.get(1).expect("No group");
86assert_eq!(group.as_str(), "20");
87```
88
89## Example: Splitting text
90
91```rust
92use fancy_regex::Regex;
93
94let re = Regex::new(r"[ \t]+").unwrap();
95let target = "a b \t  c\td    e";
96let fields: Vec<&str> = re.split(target).map(|x| x.unwrap()).collect();
97assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
98
99let fields: Vec<&str> = re.splitn(target, 3).map(|x| x.unwrap()).collect();
100assert_eq!(fields, vec!["a", "b", "c\td    e"]);
101```
102
103# Features
104
105This crate supports several optional features that can be enabled or disabled:
106
107- **`std`** (enabled by default): Enables standard library support. Disable for `no_std` environments.
108- **`unicode`** (enabled by default): Enables Unicode support for character classes and word boundaries.
109- **`perf`** (enabled by default): Enables performance optimizations in the underlying regex engine.
110- **`variable-lookbehinds`** (enabled by default): Enables support for variable-length lookbehind
111  assertions (e.g., `(?<=a+)`). Without this feature, only constant-length lookbehinds are supported.
112  This feature uses reverse DFA matching from the `regex-automata` crate to efficiently handle
113  variable-length patterns that don't use backreferences or other fancy features.
114
115# Syntax
116
117The regex syntax is based on the [regex] crate's, with some additional supported syntax.
118
119Escapes:
120
121`\h`
122: hex digit (`[0-9A-Fa-f]`) \
123`\H`
124: not hex digit (`[^0-9A-Fa-f]`) \
125`\e`
126: escape control character (`\x1B`) \
127`\K`
128: keep text matched so far out of the overall match ([docs](https://www.regular-expressions.info/keep.html))\
129`\G`
130: anchor to where the previous match ended ([docs](https://www.regular-expressions.info/continue.html))\
131`\Z`
132: anchor to the end of the text before any trailing newlines\
133`\O`
134: any character including newline
135
136Backreferences:
137
138`\1`
139: match the exact string that the first capture group matched \
140`\2`
141: backref to the second capture group, etc
142
143Named capture groups:
144
145`(?<name>exp)`
146: match *exp*, creating capture group named *name* \
147`\k<name>`
148: match the exact string that the capture group named *name* matched \
149`(?P<name>exp)`
150: same as `(?<name>exp)` for compatibility with Python, etc. \
151`(?P=name)`
152: same as `\k<name>` for compatibility with Python, etc.
153
154Look-around assertions for matching without changing the current position:
155
156`(?=exp)`
157: look-ahead, succeeds if *exp* matches to the right of the current position \
158`(?!exp)`
159: negative look-ahead, succeeds if *exp* doesn't match to the right \
160`(?<=exp)`
161: look-behind, succeeds if *exp* matches to the left of the current position \
162`(?<!exp)`
163: negative look-behind, succeeds if *exp* doesn't match to the left
164
165**Note**: Look-behind assertions with variable length (e.g., `(?<=a+)`) are supported with the
166`variable-lookbehinds` feature (enabled by default). Without this feature, only constant-length
167look-behinds are supported. Variable-length look-behinds with backreferences or other "fancy"
168features are not currently supported.
169
170Atomic groups using `(?>exp)` to prevent backtracking within `exp`, e.g.:
171
172```
173# use fancy_regex::Regex;
174let re = Regex::new(r"^a(?>bc|b)c$").unwrap();
175assert!(re.is_match("abcc").unwrap());
176// Doesn't match because `|b` is never tried because of the atomic group
177assert!(!re.is_match("abc").unwrap());
178```
179
180Conditionals - if/then/else:
181
182`(?(1))`
183: continue only if first capture group matched \
184`(?(<name>))`
185: continue only if capture group named *name* matched \
186`(?(1)true_branch|false_branch)`
187: if the first capture group matched then execute the true_branch regex expression, else execute false_branch ([docs](https://www.regular-expressions.info/conditional.html)) \
188`(?(condition)true_branch|false_branch)`
189: if the condition matches then execute the true_branch regex expression, else execute false_branch from the point just before the condition was evaluated
190
191[regex]: https://crates.io/crates/regex
192*/
193
194#![deny(missing_docs)]
195#![deny(missing_debug_implementations)]
196#![cfg_attr(not(feature = "std"), no_std)]
197
198extern crate alloc;
199
200use alloc::borrow::Cow;
201use alloc::boxed::Box;
202use alloc::string::{String, ToString};
203use alloc::sync::Arc;
204use alloc::vec;
205use alloc::vec::Vec;
206
207use core::convert::TryFrom;
208use core::fmt;
209use core::fmt::{Debug, Formatter};
210use core::ops::{Index, Range};
211use core::str::FromStr;
212use regex_automata::meta::Regex as RaRegex;
213use regex_automata::util::captures::Captures as RaCaptures;
214use regex_automata::util::syntax::Config as SyntaxConfig;
215use regex_automata::Input as RaInput;
216
217mod analyze;
218mod compile;
219mod error;
220mod expand;
221mod optimize;
222mod parse;
223mod parse_flags;
224mod replacer;
225mod vm;
226
227use crate::analyze::analyze;
228use crate::analyze::can_compile_as_anchored;
229use crate::compile::compile;
230use crate::optimize::optimize;
231use crate::parse::{ExprTree, NamedGroups, Parser};
232use crate::parse_flags::*;
233use crate::vm::{Prog, OPTION_SKIPPED_EMPTY_MATCH};
234
235pub use crate::error::{CompileError, Error, ParseError, Result, RuntimeError};
236pub use crate::expand::Expander;
237pub use crate::replacer::{NoExpand, Replacer, ReplacerRef};
238
// NOTE(review): not referenced in the visible part of this file; presumably an
// upper bound on some recursion depth used further down — confirm at the use site.
const MAX_RECURSION: usize = 64;
240
241// the public API
242
/// A builder for a `Regex` to allow configuring options.
///
/// The builder is a thin wrapper around the internal options value; `build`
/// clones those options and compiles a `Regex` from the copy.
#[derive(Debug)]
pub struct RegexBuilder(RegexOptions);
246
/// A compiled regular expression.
#[derive(Clone)]
pub struct Regex {
    /// The matching backend selected when the regex was compiled.
    inner: RegexImpl,
    /// Named capture groups produced by the parser, held behind an `Arc`.
    named_groups: Arc<NamedGroups>,
}
253
// Separate enum because we don't want to expose any of this
#[derive(Clone)]
enum RegexImpl {
    // Used when analysis reports the pattern as not "hard": matching is
    // delegated to a `regex_automata` meta regex.
    // Do we want to box this? It's pretty big...
    Wrap {
        /// The delegate regex compiled from the re-serialized pattern.
        inner: RaRegex,
        /// Options the regex was built with (includes the original pattern).
        options: RegexOptions,
        /// Some optimizations avoid the VM, but need to use an extra capture group to represent the match boundaries
        explicit_capture_group_0: bool,
        /// The re-serialized pattern string that was handed to the delegate.
        debug_pattern: String,
    },
    // Used for "hard" patterns: matching runs a compiled program on the VM.
    Fancy {
        /// The compiled program executed by `vm::run`.
        prog: Arc<Prog>,
        /// Value of `info.end_group()` at compile time.
        // NOTE(review): presumably the total number of capture groups — confirm.
        n_groups: usize,
        /// Options the regex was built with (includes the original pattern).
        options: RegexOptions,
    },
}
271
/// A single match of a regex or group in an input text
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct Match<'t> {
    /// The text that was searched.
    text: &'t str,
    /// Offset in `text` where the match begins.
    start: usize,
    /// Offset in `text` where the match ends; equal to `start` for an empty match.
    end: usize,
}
279
/// An iterator over all non-overlapping matches for a particular string.
///
/// The iterator yields a `Result<Match>`. The iterator stops when no more
/// matches can be found.
///
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
/// lifetime of the matched string.
#[derive(Debug)]
pub struct Matches<'r, 't> {
    /// The regex used for every search.
    re: &'r Regex,
    /// The text being searched.
    text: &'t str,
    /// Position the next search starts from. It is set to `text.len() + 1`
    /// after an error has been yielded so that iteration stops.
    last_end: usize,
    /// End position of the most recently yielded match, if there was one.
    last_match: Option<usize>,
}
294
295impl<'r, 't> Matches<'r, 't> {
296    /// Return the text being searched.
297    pub fn text(&self) -> &'t str {
298        self.text
299    }
300
301    /// Return the underlying regex.
302    pub fn regex(&self) -> &'r Regex {
303        self.re
304    }
305}
306
307impl<'r, 't> Iterator for Matches<'r, 't> {
308    type Item = Result<Match<'t>>;
309
310    /// Adapted from the `regex` crate. Calls `find_from_pos` repeatedly.
311    /// Ignores empty matches immediately after a match.
312    fn next(&mut self) -> Option<Self::Item> {
313        if self.last_end > self.text.len() {
314            return None;
315        }
316
317        let option_flags = if let Some(last_match) = self.last_match {
318            if self.last_end > last_match {
319                OPTION_SKIPPED_EMPTY_MATCH
320            } else {
321                0
322            }
323        } else {
324            0
325        };
326        let mat =
327            match self
328                .re
329                .find_from_pos_with_option_flags(self.text, self.last_end, option_flags)
330            {
331                Err(error) => {
332                    // Stop on first error: If an error is encountered, return it, and set the "last match position"
333                    // to the string length, so that the next next() call will return None, to prevent an infinite loop.
334                    self.last_end = self.text.len() + 1;
335                    return Some(Err(error));
336                }
337                Ok(None) => return None,
338                Ok(Some(mat)) => mat,
339            };
340
341        if mat.start == mat.end {
342            // This is an empty match. To ensure we make progress, start
343            // the next search at the smallest possible starting position
344            // of the next match following this one.
345            self.last_end = next_utf8(self.text, mat.end);
346            // Don't accept empty matches immediately following a match.
347            // Just move on to the next match.
348            if Some(mat.end) == self.last_match {
349                return self.next();
350            }
351        } else {
352            self.last_end = mat.end;
353        }
354
355        self.last_match = Some(mat.end);
356
357        Some(Ok(mat))
358    }
359}
360
/// An iterator that yields all non-overlapping capture groups matching a
/// particular regular expression.
///
/// The iterator stops when no more matches can be found.
///
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
/// lifetime of the matched string.
///
/// The wrapped `Matches` value is used purely as state (regex, text and
/// search positions); its own `next` is not what drives this iterator.
#[derive(Debug)]
pub struct CaptureMatches<'r, 't>(Matches<'r, 't>);
370
371impl<'r, 't> CaptureMatches<'r, 't> {
372    /// Return the text being searched.
373    pub fn text(&self) -> &'t str {
374        self.0.text
375    }
376
377    /// Return the underlying regex.
378    pub fn regex(&self) -> &'r Regex {
379        self.0.re
380    }
381}
382
383impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
384    type Item = Result<Captures<'t>>;
385
386    /// Adapted from the `regex` crate. Calls `captures_from_pos` repeatedly.
387    /// Ignores empty matches immediately after a match.
388    fn next(&mut self) -> Option<Self::Item> {
389        if self.0.last_end > self.0.text.len() {
390            return None;
391        }
392
393        let captures = match self.0.re.captures_from_pos(self.0.text, self.0.last_end) {
394            Err(error) => {
395                // Stop on first error: If an error is encountered, return it, and set the "last match position"
396                // to the string length, so that the next next() call will return None, to prevent an infinite loop.
397                self.0.last_end = self.0.text.len() + 1;
398                return Some(Err(error));
399            }
400            Ok(None) => return None,
401            Ok(Some(captures)) => captures,
402        };
403
404        let mat = captures
405            .get(0)
406            .expect("`Captures` is expected to have entire match at 0th position");
407        if mat.start == mat.end {
408            self.0.last_end = next_utf8(self.0.text, mat.end);
409            if Some(mat.end) == self.0.last_match {
410                return self.next();
411            }
412        } else {
413            self.0.last_end = mat.end;
414        }
415
416        self.0.last_match = Some(mat.end);
417
418        Some(Ok(captures))
419    }
420}
421
/// A set of capture groups found for a regex.
#[derive(Debug)]
pub struct Captures<'t> {
    /// Backend-specific storage of the captured positions.
    inner: CapturesImpl<'t>,
    /// Named capture groups, held behind an `Arc`.
    // NOTE(review): presumably shared with the `Regex` that produced these captures — confirm.
    named_groups: Arc<NamedGroups>,
}
428
// Backend-specific capture storage; the two variants carry the same names as
// the two variants of `RegexImpl`.
#[derive(Debug)]
enum CapturesImpl<'t> {
    Wrap {
        /// The text that was searched.
        text: &'t str,
        /// Capture locations as reported by `regex_automata`.
        locations: RaCaptures,
        /// Some optimizations avoid the VM but need an extra capture group to represent the match boundaries.
        /// Therefore what is actually capture group 1 should be treated as capture group 0, and all other
        /// capture groups should have their index reduced by one as well to line up with what the pattern specifies.
        explicit_capture_group_0: bool,
    },
    Fancy {
        /// The text that was searched.
        text: &'t str,
        // NOTE(review): presumably the positions saved by the VM for each
        // capture group — confirm against the code that fills this vector.
        saves: Vec<usize>,
    },
}
444
/// Iterator for captured groups in order in which they appear in the regex.
#[derive(Debug)]
pub struct SubCaptureMatches<'c, 't> {
    /// The captures being iterated over.
    caps: &'c Captures<'t>,
    // NOTE(review): presumably the index of the next group to yield — confirm
    // against the `Iterator` impl, which is outside this view.
    i: usize,
}
451
/// An iterator over all substrings delimited by a regex.
///
/// This iterator yields `Result<&'h str>`, where each item is a substring of the
/// target string that is delimited by matches of the regular expression. It stops when there
/// are no more substrings to yield.
///
/// `'r` is the lifetime of the compiled regular expression, and `'h` is the
/// lifetime of the target string being split.
///
/// This iterator can be created by the [`Regex::split`] method.
#[derive(Debug)]
pub struct Split<'r, 'h> {
    /// Iterator over the delimiter matches.
    matches: Matches<'r, 'h>,
    /// Offset in `target` where the next substring begins. It is set to
    /// `target.len() + 1` once the final substring has been yielded.
    next_start: usize,
    /// The string being split.
    target: &'h str,
}
468
469impl<'r, 'h> Iterator for Split<'r, 'h> {
470    type Item = Result<&'h str>;
471
472    /// Returns the next substring that results from splitting the target string by the regex.
473    ///
474    /// If no more matches are found, returns the remaining part of the string,
475    /// or `None` if all substrings have been yielded.
476    fn next(&mut self) -> Option<Result<&'h str>> {
477        match self.matches.next() {
478            None => {
479                let len = self.target.len();
480                if self.next_start > len {
481                    // No more substrings to return
482                    None
483                } else {
484                    // Return the last part of the target string
485                    // Next call will return None
486                    let part = &self.target[self.next_start..len];
487                    self.next_start = len + 1;
488                    Some(Ok(part))
489                }
490            }
491            // Return the next substring
492            Some(Ok(m)) => {
493                let part = &self.target[self.next_start..m.start()];
494                self.next_start = m.end();
495                Some(Ok(part))
496            }
497            Some(Err(e)) => Some(Err(e)),
498        }
499    }
500}
501
// Marker impl: declares that `Split` keeps returning `None` once it has
// returned `None` for the first time.
impl<'r, 'h> core::iter::FusedIterator for Split<'r, 'h> {}
503
/// An iterator over at most `N` substrings delimited by a regex.
///
/// This iterator yields `Result<&'h str>`, where each item is a substring of the
/// target that is delimited by matches of the regular expression. It stops either when
/// there are no more substrings to yield, or after `N` substrings have been yielded.
///
/// The `N`th substring is the remaining part of the target.
///
/// `'r` is the lifetime of the compiled regular expression, and `'h` is the
/// lifetime of the target string being split.
///
/// This iterator can be created by the [`Regex::splitn`] method.
#[derive(Debug)]
pub struct SplitN<'r, 'h> {
    /// The unlimited splitter that produces all but the last substring.
    splits: Split<'r, 'h>,
    /// Number of `next` calls that may still yield an item; decremented on
    /// every call until it reaches zero.
    limit: usize,
}
521
522impl<'r, 'h> Iterator for SplitN<'r, 'h> {
523    type Item = Result<&'h str>;
524
525    /// Returns the next substring resulting from splitting the target by the regex,
526    /// limited to `N` splits.
527    ///
528    /// Returns `None` if no more matches are found or if the limit is reached after yielding
529    /// the remaining part of the target.
530    fn next(&mut self) -> Option<Result<&'h str>> {
531        if self.limit == 0 {
532            // Limit reached. No more substrings available.
533            return None;
534        }
535
536        // Decrement the limit for each split.
537        self.limit -= 1;
538        if self.limit > 0 {
539            return self.splits.next();
540        }
541
542        // Nth split
543        let len = self.splits.target.len();
544        if self.splits.next_start > len {
545            // No more substrings available.
546            None
547        } else {
548            // Return the remaining part of the target
549            let start = self.splits.next_start;
550            self.splits.next_start = len + 1;
551            Some(Ok(&self.splits.target[start..len]))
552        }
553    }
554
555    fn size_hint(&self) -> (usize, Option<usize>) {
556        (0, Some(self.limit))
557    }
558}
559
// Marker impl: once `limit` has reached zero, `next` returns `None` on every
// subsequent call, because `limit` is never increased again.
impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {}
561
// Everything needed to compile a `Regex`; filled in by `RegexBuilder`.
#[derive(Clone, Debug)]
struct RegexOptions {
    /// The pattern exactly as supplied by the caller.
    pattern: String,
    /// Syntax flags (case insensitivity, multi-line, whitespace, dot-newline, Unicode).
    syntaxc: SyntaxConfig,
    /// Backtracking budget for fancy regexes; defaults to `1_000_000`.
    backtrack_limit: usize,
    /// Optional size limit forwarded to the delegate regex; `None` unless set on the builder.
    delegate_size_limit: Option<usize>,
    /// Optional DFA size limit forwarded to the delegate regex; `None` unless set on the builder.
    delegate_dfa_size_limit: Option<usize>,
    /// Whether Oniguruma-compatible behavior was requested; defaults to `false`.
    oniguruma_mode: bool,
}
571
572impl RegexOptions {
573    fn get_flag_value(flag_value: bool, enum_value: u32) -> u32 {
574        if flag_value {
575            enum_value
576        } else {
577            0
578        }
579    }
580
581    fn compute_flags(&self) -> u32 {
582        let insensitive = Self::get_flag_value(self.syntaxc.get_case_insensitive(), FLAG_CASEI);
583        let multiline = Self::get_flag_value(self.syntaxc.get_multi_line(), FLAG_MULTI);
584        let whitespace =
585            Self::get_flag_value(self.syntaxc.get_ignore_whitespace(), FLAG_IGNORE_SPACE);
586        let dotnl = Self::get_flag_value(self.syntaxc.get_dot_matches_new_line(), FLAG_DOTNL);
587        let unicode = Self::get_flag_value(self.syntaxc.get_unicode(), FLAG_UNICODE);
588        let oniguruma_mode = Self::get_flag_value(self.oniguruma_mode, FLAG_ONIGURUMA_MODE);
589
590        insensitive | multiline | whitespace | dotnl | unicode | unicode | oniguruma_mode
591    }
592}
593
594impl Default for RegexOptions {
595    fn default() -> Self {
596        RegexOptions {
597            pattern: String::new(),
598            syntaxc: SyntaxConfig::default(),
599            backtrack_limit: 1_000_000,
600            delegate_size_limit: None,
601            delegate_dfa_size_limit: None,
602            oniguruma_mode: false,
603        }
604    }
605}
606
607impl RegexBuilder {
608    /// Create a new regex builder with a regex pattern.
609    ///
610    /// If the pattern is invalid, the call to `build` will fail later.
611    pub fn new(pattern: &str) -> Self {
612        let mut builder = RegexBuilder(RegexOptions::default());
613        builder.0.pattern = pattern.to_string();
614        builder
615    }
616
617    /// Build the `Regex`.
618    ///
619    /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed.
620    pub fn build(&self) -> Result<Regex> {
621        Regex::new_options(self.0.clone())
622    }
623
624    fn set_config(&mut self, func: impl Fn(SyntaxConfig) -> SyntaxConfig) -> &mut Self {
625        self.0.syntaxc = func(self.0.syntaxc);
626        self
627    }
628
629    /// Override default case insensitive
630    /// this is to enable/disable casing via builder instead of a flag within
631    /// the raw string provided to the regex builder
632    ///
633    /// Default is false
634    pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
635        self.set_config(|x| x.case_insensitive(yes))
636    }
637
638    /// Enable multi-line regex
639    pub fn multi_line(&mut self, yes: bool) -> &mut Self {
640        self.set_config(|x| x.multi_line(yes))
641    }
642
643    /// Allow ignore whitespace
644    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
645        self.set_config(|x| x.ignore_whitespace(yes))
646    }
647
648    /// Enable or disable the "dot matches any character" flag.
649    /// When this is enabled, `.` will match any character. When it's disabled, then `.` will match any character
650    /// except for a new line character.
651    pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
652        self.set_config(|x| x.dot_matches_new_line(yes))
653    }
654
655    /// Enable verbose mode in the regular expression.
656    ///
657    /// The same as ignore_whitespace
658    ///
659    /// When enabled, verbose mode permits insigificant whitespace in many
660    /// places in the regular expression, as well as comments. Comments are
661    /// started using `#` and continue until the end of the line.
662    ///
663    /// By default, this is disabled. It may be selectively enabled in the
664    /// regular expression by using the `x` flag regardless of this setting.
665    pub fn verbose_mode(&mut self, yes: bool) -> &mut Self {
666        self.set_config(|x| x.ignore_whitespace(yes))
667    }
668
669    /// Enable or disable the Unicode flag (`u`) by default.
670    ///
671    /// By default this is **enabled**. It may alternatively be selectively
672    /// disabled in the regular expression itself via the `u` flag.
673    ///
674    /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by
675    /// default), a regular expression will fail to parse if Unicode mode is
676    /// disabled and a sub-expression could possibly match invalid UTF-8.
677    ///
678    /// **WARNING**: Unicode mode can greatly increase the size of the compiled
679    /// DFA, which can noticeably impact both memory usage and compilation
680    /// time. This is especially noticeable if your regex contains character
681    /// classes like `\w` that are impacted by whether Unicode is enabled or
682    /// not. If Unicode is not necessary, you are encouraged to disable it.
683    pub fn unicode_mode(&mut self, yes: bool) -> &mut Self {
684        self.set_config(|x| x.unicode(yes))
685    }
686
687    /// Limit for how many times backtracking should be attempted for fancy regexes (where
688    /// backtracking is used). If this limit is exceeded, execution returns an error with
689    /// [`Error::BacktrackLimitExceeded`](enum.Error.html#variant.BacktrackLimitExceeded).
690    /// This is for preventing a regex with catastrophic backtracking to run for too long.
691    ///
692    /// Default is `1_000_000` (1 million).
693    pub fn backtrack_limit(&mut self, limit: usize) -> &mut Self {
694        self.0.backtrack_limit = limit;
695        self
696    }
697
698    /// Set the approximate size limit of the compiled regular expression.
699    ///
700    /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used
701    /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As
702    /// such the actual limit is closer to `<number of delegated regexes> * delegate_size_limit`.
703    pub fn delegate_size_limit(&mut self, limit: usize) -> &mut Self {
704        self.0.delegate_size_limit = Some(limit);
705        self
706    }
707
708    /// Set the approximate size of the cache used by the DFA.
709    ///
710    /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used
711    /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As
712    /// such the actual limit is closer to `<number of delegated regexes> *
713    /// delegate_dfa_size_limit`.
714    pub fn delegate_dfa_size_limit(&mut self, limit: usize) -> &mut Self {
715        self.0.delegate_dfa_size_limit = Some(limit);
716        self
717    }
718
719    /// Attempts to better match [Oniguruma](https://github.com/kkos/oniguruma)'s default behavior
720    ///
721    /// Currently this amounts to changing behavior with:
722    ///
723    /// # Left and right word bounds
724    ///
725    /// `fancy-regex` follows the default of other regex engines such as the `regex` crate itself
726    /// where `\<` and `\>` correspond to a _left_ and _right_ word-bound respectively. This
727    /// differs from Oniguruma's defaults which treat them as matching the literals `<` and `>`.
728    /// When this option is set using `\<` and `\>` in the pattern will match the literals
729    /// `<` and `>` instead of word bounds.
730    ///
731    /// ## Example
732    ///
733    /// ```
734    /// use fancy_regex::{Regex, RegexBuilder};
735    ///
736    /// let haystack = "turbo::<Fish>";
737    /// let regex = r"\<\w*\>";
738    ///
739    /// // By default `\<` and `\>` will match the start and end of a word boundary
740    /// let word_bounds_regex = Regex::new(regex).unwrap();
741    /// let word_bounds = word_bounds_regex.find(haystack).unwrap().unwrap();
742    /// assert_eq!(word_bounds.as_str(), "turbo");
743    ///
744    /// // With the option set they instead match the literal `<` and `>` characters
745    /// let literals_regex = RegexBuilder::new(regex).oniguruma_mode(true).build().unwrap();
746    /// let literals = literals_regex.find(haystack).unwrap().unwrap();
747    /// assert_eq!(literals.as_str(), "<Fish>");
748    /// ```
749    pub fn oniguruma_mode(&mut self, yes: bool) -> &mut Self {
750        self.0.oniguruma_mode = yes;
751        self
752    }
753}
754
755impl fmt::Debug for Regex {
756    /// Shows the original regular expression.
757    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
758        write!(f, "{}", self.as_str())
759    }
760}
761
762impl fmt::Display for Regex {
763    /// Shows the original regular expression
764    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
765        write!(f, "{}", self.as_str())
766    }
767}
768
769impl FromStr for Regex {
770    type Err = Error;
771
772    /// Attempts to parse a string into a regular expression
773    fn from_str(s: &str) -> Result<Regex> {
774        Regex::new(s)
775    }
776}
777
778impl Regex {
779    /// Parse and compile a regex with default options, see `RegexBuilder`.
780    ///
781    /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed.
782    pub fn new(re: &str) -> Result<Regex> {
783        let options = RegexOptions {
784            pattern: re.to_string(),
785            ..RegexOptions::default()
786        };
787        Self::new_options(options)
788    }
789
790    fn new_options(options: RegexOptions) -> Result<Regex> {
791        let mut tree = Expr::parse_tree_with_flags(&options.pattern, options.compute_flags())?;
792
793        // try to optimize the expression tree
794        let requires_capture_group_fixup = optimize(&mut tree);
795        let info = analyze(&tree, requires_capture_group_fixup)?;
796
797        if !info.hard {
798            // easy case, wrap regex
799
800            // we do our own to_str because escapes are different
801            // NOTE: there is a good opportunity here to use Hir to avoid regex-automata re-parsing it
802            let mut re_cooked = String::new();
803            tree.expr.to_str(&mut re_cooked, 0);
804            let inner = compile::compile_inner(&re_cooked, &options)?;
805            return Ok(Regex {
806                inner: RegexImpl::Wrap {
807                    inner,
808                    options: RegexOptions {
809                        pattern: options.pattern,
810                        ..options
811                    },
812                    explicit_capture_group_0: requires_capture_group_fixup,
813                    debug_pattern: re_cooked,
814                },
815                named_groups: Arc::new(tree.named_groups),
816            });
817        }
818
819        let prog = compile(&info, can_compile_as_anchored(&tree.expr))?;
820        Ok(Regex {
821            inner: RegexImpl::Fancy {
822                prog: Arc::new(prog),
823                n_groups: info.end_group(),
824                options,
825            },
826            named_groups: Arc::new(tree.named_groups),
827        })
828    }
829
830    /// Returns the original string of this regex.
831    pub fn as_str(&self) -> &str {
832        match &self.inner {
833            RegexImpl::Wrap { options, .. } => &options.pattern,
834            RegexImpl::Fancy { options, .. } => &options.pattern,
835        }
836    }
837
838    /// Check if the regex matches the input text.
839    ///
840    /// # Example
841    ///
842    /// Test if some text contains the same word twice:
843    ///
844    /// ```rust
845    /// # use fancy_regex::Regex;
846    ///
847    /// let re = Regex::new(r"(\w+) \1").unwrap();
848    /// assert!(re.is_match("mirror mirror on the wall").unwrap());
849    /// ```
850    pub fn is_match(&self, text: &str) -> Result<bool> {
851        match &self.inner {
852            RegexImpl::Wrap { inner, .. } => Ok(inner.is_match(text)),
853            RegexImpl::Fancy { prog, options, .. } => {
854                let result = vm::run(prog, text, 0, 0, options)?;
855                Ok(result.is_some())
856            }
857        }
858    }
859
860    /// Returns an iterator for each successive non-overlapping match in `text`.
861    ///
862    /// If you have capturing groups in your regex that you want to extract, use the [Regex::captures_iter()]
863    /// method.
864    ///
865    /// # Example
866    ///
867    /// Find all words followed by an exclamation point:
868    ///
869    /// ```rust
870    /// # use fancy_regex::Regex;
871    ///
872    /// let re = Regex::new(r"\w+(?=!)").unwrap();
873    /// let mut matches = re.find_iter("so fancy! even with! iterators!");
874    /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "fancy");
875    /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "with");
876    /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "iterators");
877    /// assert!(matches.next().is_none());
878    /// ```
879    pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> {
880        Matches {
881            re: self,
882            text,
883            last_end: 0,
884            last_match: None,
885        }
886    }
887
888    /// Find the first match in the input text.
889    ///
890    /// If you have capturing groups in your regex that you want to extract, use the [Regex::captures()]
891    /// method.
892    ///
893    /// # Example
894    ///
895    /// Find a word that is followed by an exclamation point:
896    ///
897    /// ```rust
898    /// # use fancy_regex::Regex;
899    ///
900    /// let re = Regex::new(r"\w+(?=!)").unwrap();
901    /// assert_eq!(re.find("so fancy!").unwrap().unwrap().as_str(), "fancy");
902    /// ```
903    pub fn find<'t>(&self, text: &'t str) -> Result<Option<Match<'t>>> {
904        self.find_from_pos(text, 0)
905    }
906
907    /// Returns the first match in `text`, starting from the specified byte position `pos`.
908    ///
909    /// # Examples
910    ///
911    /// Finding match starting at a position:
912    ///
913    /// ```
914    /// # use fancy_regex::Regex;
915    /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
916    /// let text = "1 test 123\n2 foo";
917    /// let mat = re.find_from_pos(text, 7).unwrap().unwrap();
918    ///
919    /// assert_eq!(mat.start(), 11);
920    /// assert_eq!(mat.end(), 12);
921    /// ```
922    ///
923    /// Note that in some cases this is not the same as using the `find`
924    /// method and passing a slice of the string, see [Regex::captures_from_pos()] for details.
925    pub fn find_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Match<'t>>> {
926        self.find_from_pos_with_option_flags(text, pos, 0)
927    }
928
929    fn find_from_pos_with_option_flags<'t>(
930        &self,
931        text: &'t str,
932        pos: usize,
933        option_flags: u32,
934    ) -> Result<Option<Match<'t>>> {
935        match &self.inner {
936            RegexImpl::Wrap {
937                inner,
938                explicit_capture_group_0,
939                ..
940            } => {
941                if !*explicit_capture_group_0 {
942                    Ok(inner
943                        .search(&RaInput::new(text).span(pos..text.len()))
944                        .map(|m| Match::new(text, m.start(), m.end())))
945                } else {
946                    let mut locations = inner.create_captures();
947                    inner.captures(RaInput::new(text).span(pos..text.len()), &mut locations);
948                    Ok(locations.is_match().then(|| {
949                        Match::new(
950                            text,
951                            locations.get_group(1).unwrap().start,
952                            locations.get_group(1).unwrap().end,
953                        )
954                    }))
955                }
956            }
957            RegexImpl::Fancy { prog, options, .. } => {
958                let result = vm::run(prog, text, pos, option_flags, options)?;
959                Ok(result.map(|saves| Match::new(text, saves[0], saves[1])))
960            }
961        }
962    }
963
964    /// Returns an iterator over all the non-overlapping capture groups matched in `text`.
965    ///
966    /// # Examples
967    ///
968    /// Finding all matches and capturing parts of each:
969    ///
970    /// ```rust
971    /// # use fancy_regex::Regex;
972    ///
973    /// let re = Regex::new(r"(\d{4})-(\d{2})").unwrap();
974    /// let text = "It was between 2018-04 and 2020-01";
975    /// let mut all_captures = re.captures_iter(text);
976    ///
977    /// let first = all_captures.next().unwrap().unwrap();
978    /// assert_eq!(first.get(1).unwrap().as_str(), "2018");
979    /// assert_eq!(first.get(2).unwrap().as_str(), "04");
980    /// assert_eq!(first.get(0).unwrap().as_str(), "2018-04");
981    ///
982    /// let second = all_captures.next().unwrap().unwrap();
983    /// assert_eq!(second.get(1).unwrap().as_str(), "2020");
984    /// assert_eq!(second.get(2).unwrap().as_str(), "01");
985    /// assert_eq!(second.get(0).unwrap().as_str(), "2020-01");
986    ///
987    /// assert!(all_captures.next().is_none());
988    /// ```
989    pub fn captures_iter<'r, 't>(&'r self, text: &'t str) -> CaptureMatches<'r, 't> {
990        CaptureMatches(self.find_iter(text))
991    }
992
993    /// Returns the capture groups for the first match in `text`.
994    ///
995    /// If no match is found, then `Ok(None)` is returned.
996    ///
997    /// # Examples
998    ///
999    /// Finding matches and capturing parts of the match:
1000    ///
1001    /// ```rust
1002    /// # use fancy_regex::Regex;
1003    ///
1004    /// let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
1005    /// let text = "The date was 2018-04-07";
1006    /// let captures = re.captures(text).unwrap().unwrap();
1007    ///
1008    /// assert_eq!(captures.get(1).unwrap().as_str(), "2018");
1009    /// assert_eq!(captures.get(2).unwrap().as_str(), "04");
1010    /// assert_eq!(captures.get(3).unwrap().as_str(), "07");
1011    /// assert_eq!(captures.get(0).unwrap().as_str(), "2018-04-07");
1012    /// ```
1013    pub fn captures<'t>(&self, text: &'t str) -> Result<Option<Captures<'t>>> {
1014        self.captures_from_pos(text, 0)
1015    }
1016
1017    /// Returns the capture groups for the first match in `text`, starting from
1018    /// the specified byte position `pos`.
1019    ///
1020    /// # Examples
1021    ///
1022    /// Finding captures starting at a position:
1023    ///
1024    /// ```
1025    /// # use fancy_regex::Regex;
1026    /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
1027    /// let text = "1 test 123\n2 foo";
1028    /// let captures = re.captures_from_pos(text, 7).unwrap().unwrap();
1029    ///
1030    /// let group = captures.get(1).unwrap();
1031    /// assert_eq!(group.as_str(), "2");
1032    /// assert_eq!(group.start(), 11);
1033    /// assert_eq!(group.end(), 12);
1034    /// ```
1035    ///
1036    /// Note that in some cases this is not the same as using the `captures`
1037    /// method and passing a slice of the string, see the capture that we get
1038    /// when we do this:
1039    ///
1040    /// ```
1041    /// # use fancy_regex::Regex;
1042    /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
1043    /// let text = "1 test 123\n2 foo";
1044    /// let captures = re.captures(&text[7..]).unwrap().unwrap();
1045    /// assert_eq!(captures.get(1).unwrap().as_str(), "123");
1046    /// ```
1047    ///
1048    /// This matched the number "123" because it's at the beginning of the text
1049    /// of the string slice.
1050    ///
1051    pub fn captures_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Captures<'t>>> {
1052        let named_groups = self.named_groups.clone();
1053        match &self.inner {
1054            RegexImpl::Wrap {
1055                inner,
1056                explicit_capture_group_0,
1057                ..
1058            } => {
1059                let mut locations = inner.create_captures();
1060                inner.captures(RaInput::new(text).span(pos..text.len()), &mut locations);
1061                if locations.is_match() {
1062                    Ok(Some(Captures {
1063                        inner: CapturesImpl::Wrap {
1064                            text,
1065                            locations,
1066                            explicit_capture_group_0: *explicit_capture_group_0,
1067                        },
1068                        named_groups,
1069                    }))
1070                } else {
1071                    Ok(None)
1072                }
1073            }
1074            RegexImpl::Fancy {
1075                prog,
1076                n_groups,
1077                options,
1078                ..
1079            } => {
1080                let result = vm::run(prog, text, pos, 0, options)?;
1081                Ok(result.map(|mut saves| {
1082                    saves.truncate(n_groups * 2);
1083                    Captures {
1084                        inner: CapturesImpl::Fancy { text, saves },
1085                        named_groups,
1086                    }
1087                }))
1088            }
1089        }
1090    }
1091
1092    /// Returns the number of captures, including the implicit capture of the entire expression.
1093    pub fn captures_len(&self) -> usize {
1094        match &self.inner {
1095            RegexImpl::Wrap {
1096                inner,
1097                explicit_capture_group_0,
1098                ..
1099            } => inner.captures_len() - if *explicit_capture_group_0 { 1 } else { 0 },
1100            RegexImpl::Fancy { n_groups, .. } => *n_groups,
1101        }
1102    }
1103
1104    /// Returns an iterator over the capture names.
1105    pub fn capture_names(&self) -> CaptureNames<'_> {
1106        let mut names = Vec::new();
1107        names.resize(self.captures_len(), None);
1108        for (name, &i) in self.named_groups.iter() {
1109            names[i] = Some(name.as_str());
1110        }
1111        CaptureNames(names.into_iter())
1112    }
1113
1114    // for debugging only
1115    #[doc(hidden)]
1116    pub fn debug_print(&self, writer: &mut Formatter<'_>) -> fmt::Result {
1117        match &self.inner {
1118            RegexImpl::Wrap {
1119                debug_pattern,
1120                explicit_capture_group_0,
1121                ..
1122            } => {
1123                write!(
1124                    writer,
1125                    "wrapped Regex {:?}, explicit_capture_group_0: {:}",
1126                    debug_pattern, *explicit_capture_group_0
1127                )
1128            }
1129            RegexImpl::Fancy { prog, .. } => prog.debug_print(writer),
1130        }
1131    }
1132
1133    /// Replaces the leftmost-first match with the replacement provided.
1134    /// The replacement can be a regular string (where `$N` and `$name` are
1135    /// expanded to match capture groups) or a function that takes the matches'
1136    /// `Captures` and returns the replaced string.
1137    ///
1138    /// If no match is found, then a copy of the string is returned unchanged.
1139    ///
1140    /// # Replacement string syntax
1141    ///
1142    /// All instances of `$name` in the replacement text is replaced with the
1143    /// corresponding capture group `name`.
1144    ///
1145    /// `name` may be an integer corresponding to the index of the
1146    /// capture group (counted by order of opening parenthesis where `0` is the
1147    /// entire match) or it can be a name (consisting of letters, digits or
1148    /// underscores) corresponding to a named capture group.
1149    ///
1150    /// If `name` isn't a valid capture group (whether the name doesn't exist
1151    /// or isn't a valid index), then it is replaced with the empty string.
1152    ///
1153    /// The longest possible name is used. e.g., `$1a` looks up the capture
1154    /// group named `1a` and not the capture group at index `1`. To exert more
1155    /// precise control over the name, use braces, e.g., `${1}a`.
1156    ///
1157    /// To write a literal `$` use `$$`.
1158    ///
1159    /// # Examples
1160    ///
1161    /// Note that this function is polymorphic with respect to the replacement.
1162    /// In typical usage, this can just be a normal string:
1163    ///
1164    /// ```rust
1165    /// # use fancy_regex::Regex;
1166    /// let re = Regex::new("[^01]+").unwrap();
1167    /// assert_eq!(re.replace("1078910", ""), "1010");
1168    /// ```
1169    ///
1170    /// But anything satisfying the `Replacer` trait will work. For example,
1171    /// a closure of type `|&Captures| -> String` provides direct access to the
1172    /// captures corresponding to a match. This allows one to access
1173    /// capturing group matches easily:
1174    ///
1175    /// ```rust
1176    /// # use fancy_regex::{Regex, Captures};
1177    /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
1178    /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| {
1179    ///     format!("{} {}", &caps[2], &caps[1])
1180    /// });
1181    /// assert_eq!(result, "Bruce Springsteen");
1182    /// ```
1183    ///
1184    /// But this is a bit cumbersome to use all the time. Instead, a simple
1185    /// syntax is supported that expands `$name` into the corresponding capture
1186    /// group. Here's the last example, but using this expansion technique
1187    /// with named capture groups:
1188    ///
1189    /// ```rust
1190    /// # use fancy_regex::Regex;
1191    /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
1192    /// let result = re.replace("Springsteen, Bruce", "$first $last");
1193    /// assert_eq!(result, "Bruce Springsteen");
1194    /// ```
1195    ///
1196    /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
1197    /// would produce the same result. To write a literal `$` use `$$`.
1198    ///
1199    /// Sometimes the replacement string requires use of curly braces to
1200    /// delineate a capture group replacement and surrounding literal text.
1201    /// For example, if we wanted to join two words together with an
1202    /// underscore:
1203    ///
1204    /// ```rust
1205    /// # use fancy_regex::Regex;
1206    /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
1207    /// let result = re.replace("deep fried", "${first}_$second");
1208    /// assert_eq!(result, "deep_fried");
1209    /// ```
1210    ///
1211    /// Without the curly braces, the capture group name `first_` would be
1212    /// used, and since it doesn't exist, it would be replaced with the empty
1213    /// string.
1214    ///
1215    /// Finally, sometimes you just want to replace a literal string with no
1216    /// regard for capturing group expansion. This can be done by wrapping a
1217    /// byte string with `NoExpand`:
1218    ///
1219    /// ```rust
1220    /// # use fancy_regex::Regex;
1221    /// use fancy_regex::NoExpand;
1222    ///
1223    /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap();
1224    /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last"));
1225    /// assert_eq!(result, "$2 $last");
1226    /// ```
1227    pub fn replace<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> {
1228        self.replacen(text, 1, rep)
1229    }
1230
1231    /// Replaces all non-overlapping matches in `text` with the replacement
1232    /// provided. This is the same as calling `replacen` with `limit` set to
1233    /// `0`.
1234    ///
1235    /// See the documentation for `replace` for details on how to access
1236    /// capturing group matches in the replacement string.
1237    pub fn replace_all<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> {
1238        self.replacen(text, 0, rep)
1239    }
1240
1241    /// Replaces at most `limit` non-overlapping matches in `text` with the
1242    /// replacement provided. If `limit` is 0, then all non-overlapping matches
1243    /// are replaced.
1244    ///
1245    /// Will panic if any errors are encountered. Use `try_replacen`, which this
1246    /// function unwraps, if you want to handle errors.
1247    ///
1248    /// See the documentation for `replace` for details on how to access
1249    /// capturing group matches in the replacement string.
1250    ///
1251    pub fn replacen<'t, R: Replacer>(&self, text: &'t str, limit: usize, rep: R) -> Cow<'t, str> {
1252        self.try_replacen(text, limit, rep).unwrap()
1253    }
1254
1255    /// Replaces at most `limit` non-overlapping matches in `text` with the
1256    /// replacement provided. If `limit` is 0, then all non-overlapping matches
1257    /// are replaced.
1258    ///
1259    /// Propagates any errors encountered, such as `RuntimeError::BacktrackLimitExceeded`.
1260    ///
1261    /// See the documentation for `replace` for details on how to access
1262    /// capturing group matches in the replacement string.
1263    pub fn try_replacen<'t, R: Replacer>(
1264        &self,
1265        text: &'t str,
1266        limit: usize,
1267        mut rep: R,
1268    ) -> Result<Cow<'t, str>> {
1269        // If we know that the replacement doesn't have any capture expansions,
1270        // then we can fast path. The fast path can make a tremendous
1271        // difference:
1272        //
1273        //   1) We use `find_iter` instead of `captures_iter`. Not asking for
1274        //      captures generally makes the regex engines faster.
1275        //   2) We don't need to look up all of the capture groups and do
1276        //      replacements inside the replacement string. We just push it
1277        //      at each match and be done with it.
1278        if let Some(rep) = rep.no_expansion() {
1279            let mut it = self.find_iter(text).enumerate().peekable();
1280            if it.peek().is_none() {
1281                return Ok(Cow::Borrowed(text));
1282            }
1283            let mut new = String::with_capacity(text.len());
1284            let mut last_match = 0;
1285            for (i, m) in it {
1286                let m = m?;
1287
1288                if limit > 0 && i >= limit {
1289                    break;
1290                }
1291                new.push_str(&text[last_match..m.start()]);
1292                new.push_str(&rep);
1293                last_match = m.end();
1294            }
1295            new.push_str(&text[last_match..]);
1296            return Ok(Cow::Owned(new));
1297        }
1298
1299        // The slower path, which we use if the replacement needs access to
1300        // capture groups.
1301        let mut it = self.captures_iter(text).enumerate().peekable();
1302        if it.peek().is_none() {
1303            return Ok(Cow::Borrowed(text));
1304        }
1305        let mut new = String::with_capacity(text.len());
1306        let mut last_match = 0;
1307        for (i, cap) in it {
1308            let cap = cap?;
1309
1310            if limit > 0 && i >= limit {
1311                break;
1312            }
1313            // unwrap on 0 is OK because captures only reports matches
1314            let m = cap.get(0).unwrap();
1315            new.push_str(&text[last_match..m.start()]);
1316            rep.replace_append(&cap, &mut new);
1317            last_match = m.end();
1318        }
1319        new.push_str(&text[last_match..]);
1320        Ok(Cow::Owned(new))
1321    }
1322
1323    /// Splits the string by matches of the regex.
1324    ///
1325    /// Returns an iterator over the substrings of the target string
1326    ///  that *aren't* matched by the regex.
1327    ///
1328    /// # Example
1329    ///
1330    /// To split a string delimited by arbitrary amounts of spaces or tabs:
1331    ///
1332    /// ```rust
1333    /// # use fancy_regex::Regex;
1334    /// let re = Regex::new(r"[ \t]+").unwrap();
1335    /// let target = "a b \t  c\td    e";
1336    /// let fields: Vec<&str> = re.split(target).map(|x| x.unwrap()).collect();
1337    /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
1338    /// ```
1339    pub fn split<'r, 'h>(&'r self, target: &'h str) -> Split<'r, 'h> {
1340        Split {
1341            matches: self.find_iter(target),
1342            next_start: 0,
1343            target,
1344        }
1345    }
1346
1347    /// Splits the string by matches of the regex at most `limit` times.
1348    ///
1349    /// Returns an iterator over the substrings of the target string
1350    /// that *aren't* matched by the regex.
1351    ///
1352    /// The `N`th substring is the remaining part of the target.
1353    ///
1354    /// # Example
1355    ///
1356    /// To split a string delimited by arbitrary amounts of spaces or tabs
1357    /// 3 times:
1358    ///
1359    /// ```rust
1360    /// # use fancy_regex::Regex;
1361    /// let re = Regex::new(r"[ \t]+").unwrap();
1362    /// let target = "a b \t  c\td    e";
1363    /// let fields: Vec<&str> = re.splitn(target, 3).map(|x| x.unwrap()).collect();
1364    /// assert_eq!(fields, vec!["a", "b", "c\td    e"]);
1365    /// ```
1366    pub fn splitn<'r, 'h>(&'r self, target: &'h str, limit: usize) -> SplitN<'r, 'h> {
1367        SplitN {
1368            splits: self.split(target),
1369            limit,
1370        }
1371    }
1372}
1373
1374impl TryFrom<&str> for Regex {
1375    type Error = Error;
1376
1377    /// Attempts to parse a string into a regular expression
1378    fn try_from(s: &str) -> Result<Self> {
1379        Self::new(s)
1380    }
1381}
1382
1383impl TryFrom<String> for Regex {
1384    type Error = Error;
1385
1386    /// Attempts to parse a string into a regular expression
1387    fn try_from(s: String) -> Result<Self> {
1388        Self::new(&s)
1389    }
1390}
1391
1392impl<'t> Match<'t> {
1393    /// Returns the starting byte offset of the match in the text.
1394    #[inline]
1395    pub fn start(&self) -> usize {
1396        self.start
1397    }
1398
1399    /// Returns the ending byte offset of the match in the text.
1400    #[inline]
1401    pub fn end(&self) -> usize {
1402        self.end
1403    }
1404
1405    /// Returns the range over the starting and ending byte offsets of the match in text.
1406    #[inline]
1407    pub fn range(&self) -> Range<usize> {
1408        self.start..self.end
1409    }
1410
1411    /// Returns the matched text.
1412    #[inline]
1413    pub fn as_str(&self) -> &'t str {
1414        &self.text[self.start..self.end]
1415    }
1416
1417    /// Creates a new match from the given text and byte offsets.
1418    fn new(text: &'t str, start: usize, end: usize) -> Match<'t> {
1419        Match { text, start, end }
1420    }
1421}
1422
1423impl<'t> From<Match<'t>> for &'t str {
1424    fn from(m: Match<'t>) -> &'t str {
1425        m.as_str()
1426    }
1427}
1428
1429impl<'t> From<Match<'t>> for Range<usize> {
1430    fn from(m: Match<'t>) -> Range<usize> {
1431        m.range()
1432    }
1433}
1434
1435#[allow(clippy::len_without_is_empty)] // follow regex's API
1436impl<'t> Captures<'t> {
1437    /// Get the capture group by its index in the regex.
1438    ///
1439    /// If there is no match for that group or the index does not correspond to a group, `None` is
1440    /// returned. The index 0 returns the whole match.
1441    pub fn get(&self, i: usize) -> Option<Match<'t>> {
1442        match &self.inner {
1443            CapturesImpl::Wrap {
1444                text,
1445                locations,
1446                explicit_capture_group_0,
1447            } => locations
1448                .get_group(i + if *explicit_capture_group_0 { 1 } else { 0 })
1449                .map(|span| Match {
1450                    text,
1451                    start: span.start,
1452                    end: span.end,
1453                }),
1454            CapturesImpl::Fancy { text, saves } => {
1455                let slot = i * 2;
1456                if slot >= saves.len() {
1457                    return None;
1458                }
1459                let lo = saves[slot];
1460                if lo == usize::MAX {
1461                    return None;
1462                }
1463                let hi = saves[slot + 1];
1464                Some(Match {
1465                    text,
1466                    start: lo,
1467                    end: hi,
1468                })
1469            }
1470        }
1471    }
1472
1473    /// Returns the match for a named capture group.  Returns `None` the capture
1474    /// group did not match or if there is no group with the given name.
1475    pub fn name(&self, name: &str) -> Option<Match<'t>> {
1476        self.named_groups.get(name).and_then(|i| self.get(*i))
1477    }
1478
1479    /// Expands all instances of `$group` in `replacement` to the corresponding
1480    /// capture group `name`, and writes them to the `dst` buffer given.
1481    ///
1482    /// `group` may be an integer corresponding to the index of the
1483    /// capture group (counted by order of opening parenthesis where `\0` is the
1484    /// entire match) or it can be a name (consisting of letters, digits or
1485    /// underscores) corresponding to a named capture group.
1486    ///
1487    /// If `group` isn't a valid capture group (whether the name doesn't exist
1488    /// or isn't a valid index), then it is replaced with the empty string.
1489    ///
1490    /// The longest possible name is used. e.g., `$1a` looks up the capture
1491    /// group named `1a` and not the capture group at index `1`. To exert more
1492    /// precise control over the name, use braces, e.g., `${1}a`.
1493    ///
1494    /// To write a literal `$`, use `$$`.
1495    ///
1496    /// For more control over expansion, see [`Expander`].
1497    ///
1498    /// [`Expander`]: expand/struct.Expander.html
1499    pub fn expand(&self, replacement: &str, dst: &mut String) {
1500        Expander::default().append_expansion(dst, replacement, self);
1501    }
1502
1503    /// Iterate over the captured groups in order in which they appeared in the regex. The first
1504    /// capture corresponds to the whole match.
1505    pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
1506        SubCaptureMatches { caps: self, i: 0 }
1507    }
1508
1509    /// How many groups were captured. This is always at least 1 because group 0 returns the whole
1510    /// match.
1511    pub fn len(&self) -> usize {
1512        match &self.inner {
1513            CapturesImpl::Wrap {
1514                locations,
1515                explicit_capture_group_0,
1516                ..
1517            } => locations.group_len() - if *explicit_capture_group_0 { 1 } else { 0 },
1518            CapturesImpl::Fancy { saves, .. } => saves.len() / 2,
1519        }
1520    }
1521}
1522
1523/// Get a group by index.
1524///
1525/// `'t` is the lifetime of the matched text.
1526///
1527/// The text can't outlive the `Captures` object if this method is
1528/// used, because of how `Index` is defined (normally `a[i]` is part
1529/// of `a` and can't outlive it); to do that, use `get()` instead.
1530///
1531/// # Panics
1532///
1533/// If there is no group at the given index.
1534impl<'t> Index<usize> for Captures<'t> {
1535    type Output = str;
1536
1537    fn index(&self, i: usize) -> &str {
1538        self.get(i)
1539            .map(|m| m.as_str())
1540            .unwrap_or_else(|| panic!("no group at index '{}'", i))
1541    }
1542}
1543
1544/// Get a group by name.
1545///
1546/// `'t` is the lifetime of the matched text and `'i` is the lifetime
1547/// of the group name (the index).
1548///
1549/// The text can't outlive the `Captures` object if this method is
1550/// used, because of how `Index` is defined (normally `a[i]` is part
1551/// of `a` and can't outlive it); to do that, use `name` instead.
1552///
1553/// # Panics
1554///
1555/// If there is no group named by the given value.
1556impl<'t, 'i> Index<&'i str> for Captures<'t> {
1557    type Output = str;
1558
1559    fn index<'a>(&'a self, name: &'i str) -> &'a str {
1560        self.name(name)
1561            .map(|m| m.as_str())
1562            .unwrap_or_else(|| panic!("no group named '{}'", name))
1563    }
1564}
1565
1566impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
1567    type Item = Option<Match<'t>>;
1568
1569    fn next(&mut self) -> Option<Option<Match<'t>>> {
1570        if self.i < self.caps.len() {
1571            let result = self.caps.get(self.i);
1572            self.i += 1;
1573            Some(result)
1574        } else {
1575            None
1576        }
1577    }
1578}
1579
1580// TODO: might be nice to implement ExactSizeIterator etc for SubCaptures
1581
/// A node of the regular expression AST (abstract syntax tree).
///
/// This type is public for now, but that may change.
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum Expr {
    /// An empty expression, e.g. the last branch in `(a|b|)`
    Empty,
    /// Any character, regex `.`
    Any {
        /// Whether it also matches newlines or not
        newline: bool,
    },
    /// An assertion
    Assertion(Assertion),
    /// A literal string, e.g. `a`
    Literal {
        /// The string to match
        val: String,
        /// Whether the match is case-insensitive or not
        casei: bool,
    },
    /// Concatenation of multiple expressions, which must match in order, e.g. `a.` is a
    /// concatenation of the literal `a` and `.` for any character
    Concat(Vec<Expr>),
    /// Alternation of multiple expressions, one of which must match, e.g. `a|b` is an
    /// alternation where either the literal `a` or `b` must match
    Alt(Vec<Expr>),
    /// Capturing group of an expression, e.g. `(a.)` matches `a` and any character and
    /// "captures" (remembers) the match
    Group(Box<Expr>),
    /// Look-around (e.g. positive/negative look-ahead or look-behind) with an expression, e.g.
    /// `(?=a)` means the next character must be `a` (but the match is not consumed)
    LookAround(Box<Expr>, LookAround),
    /// Repetition of an expression, e.g. `a*` or `a+` or `a{1,3}`
    Repeat {
        /// The expression that is being repeated
        child: Box<Expr>,
        /// The minimum number of repetitions
        lo: usize,
        /// The maximum number of repetitions (or `usize::MAX`)
        hi: usize,
        /// Greedy means as much as possible is matched, e.g. `.*b` would match all of `abab`.
        /// Non-greedy means as little as possible, e.g. `.*?b` would match only `ab` in `abab`.
        greedy: bool,
    },
    /// Delegate a regex to the regex crate. This is used as a simplification so that we don't have
    /// to represent all the expressions in the AST, e.g. character classes.
    Delegate {
        /// The regex
        inner: String,
        /// How many characters the regex matches
        size: usize, // TODO: move into analysis result
        /// Whether the matching is case-insensitive or not
        casei: bool,
    },
    /// Back reference to a capture group, e.g. `\1` in `(abc|def)\1` references the captured group
    /// and the whole regex matches either `abcabc` or `defdef`.
    Backref {
        /// The capture group number being referenced
        group: usize,
        /// Whether the matching is case-insensitive or not
        casei: bool,
    },
    /// Back reference to a capture group at the specified relative recursion level.
    BackrefWithRelativeRecursionLevel {
        /// The capture group number being referenced
        group: usize,
        /// Relative recursion level
        relative_level: isize,
        /// Whether the matching is case-insensitive or not
        casei: bool,
    },
    /// Atomic non-capturing group, e.g. `(?>ab|a)` in text that contains `ab` will match `ab` and
    /// never backtrack and try `a`, even if matching fails after the atomic group.
    AtomicGroup(Box<Expr>),
    /// Keep the text matched so far out of the overall match
    KeepOut,
    /// Anchor to match at the position where the previous match ended
    ContinueFromPreviousMatchEnd,
    /// Conditional expression based on whether the numbered capture group matched or not
    BackrefExistsCondition(usize),
    /// If/Then/Else condition. If there is no Then/Else, these will just be empty expressions.
    Conditional {
        /// The conditional expression to evaluate
        condition: Box<Expr>,
        /// What to execute if the condition is true
        true_branch: Box<Expr>,
        /// What to execute if the condition is false
        false_branch: Box<Expr>,
    },
    /// Subroutine call to the specified group number
    SubroutineCall(usize),
    /// Unresolved subroutine call to the specified group name
    UnresolvedNamedSubroutineCall {
        /// The capture group name
        name: String,
        /// The position in the original regex pattern where the subroutine call is made
        ix: usize,
    },
}
1680
/// The kind of look-around assertion carried by a look-around expression.
///
/// The four variants cover both directions (ahead and behind) in their
/// positive and negated forms.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum LookAround {
    /// Positive look-ahead, e.g. `(?=a)`
    LookAhead,
    /// Negated look-ahead, e.g. `(?!a)`
    LookAheadNeg,
    /// Positive look-behind, e.g. `(?<=a)`
    LookBehind,
    /// Negated look-behind, e.g. `(?<!a)`
    LookBehindNeg,
}
1693
/// An iterator over capture names in a [Regex].  The iterator
/// returns the name of each group, or [None] if the group has
/// no name.  Because capture group 0 cannot have a name, the
/// first item returned is always [None].
pub struct CaptureNames<'r>(vec::IntoIter<Option<&'r str>>);

impl Debug for CaptureNames<'_> {
    /// Writes the fixed placeholder `<CaptureNames>`; the remaining names are not printed.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        f.write_str("<CaptureNames>")
    }
}

impl<'r> Iterator for CaptureNames<'r> {
    type Item = Option<&'r str>;

    /// Yields the next group name, or `None` once the underlying vector iterator is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }

    /// Forwards the bounds reported by the wrapped vector iterator, so consumers such as
    /// `collect` can size their allocation instead of seeing the default `(0, None)`.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}
1713
// Hand-rolled decimal formatting: appends the digits of `x` to `s` without
// going through the formatting machinery. The leading digits are emitted by
// the recursive call, so the common single-digit value needs no recursion.
fn push_usize(s: &mut String, x: usize) {
    let leading = x / 10;
    if leading != 0 {
        push_usize(s, leading);
    }
    // `x % 10` is always in 0..=9, so the sum stays within the ASCII digits.
    s.push((b'0' + (x % 10) as u8) as char);
}
1724
/// Returns `true` if `c` is one of the characters that get a backslash when quoting.
fn is_special(c: char) -> bool {
    // The full set of characters treated as special, written as a raw string so
    // the backslash stands for itself.
    const SPECIAL: &str = r"\.+*?()|[]{}^$#";
    SPECIAL.contains(c)
}
1731
1732fn push_quoted(buf: &mut String, s: &str) {
1733    for c in s.chars() {
1734        if is_special(c) {
1735            buf.push('\\');
1736        }
1737        buf.push(c);
1738    }
1739}
1740
1741/// Escapes special characters in `text` with '\\'.  Returns a string which, when interpreted
1742/// as a regex, matches exactly `text`.
1743pub fn escape(text: &str) -> Cow<'_, str> {
1744    // Using bytes() is OK because all special characters are single bytes.
1745    match text.bytes().filter(|&b| is_special(b as char)).count() {
1746        0 => Cow::Borrowed(text),
1747        n => {
1748            // The capacity calculation is exact because '\\' is a single byte.
1749            let mut buf = String::with_capacity(text.len() + n);
1750            push_quoted(&mut buf, text);
1751            Cow::Owned(buf)
1752        }
1753    }
1754}
1755
/// The kinds of assertion an expression can contain.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Assertion {
    /// Start of the input text
    StartText,
    /// End of the input text
    EndText,
    /// Start of a line
    StartLine {
        /// Whether CRLF mode is enabled
        crlf: bool,
    },
    /// End of a line
    EndLine {
        /// Whether CRLF mode is enabled
        crlf: bool,
    },
    /// Left word boundary
    LeftWordBoundary,
    /// Left word half boundary
    LeftWordHalfBoundary,
    /// Right word boundary
    RightWordBoundary,
    /// Right word half boundary
    RightWordHalfBoundary,
    /// Both word boundaries
    WordBoundary,
    /// Not word boundary
    NotWordBoundary,
}

impl Assertion {
    /// Returns `true` for every word-boundary assertion and `false` for the
    /// text and line anchors.
    pub(crate) fn is_hard(&self) -> bool {
        match *self {
            // these will make regex-automata use PikeVM
            Assertion::LeftWordBoundary
            | Assertion::LeftWordHalfBoundary
            | Assertion::RightWordBoundary
            | Assertion::RightWordHalfBoundary
            | Assertion::WordBoundary
            | Assertion::NotWordBoundary => true,
            Assertion::StartText
            | Assertion::EndText
            | Assertion::StartLine { .. }
            | Assertion::EndLine { .. } => false,
        }
    }
}
1802
1803impl Expr {
1804    /// Parse the regex and return an expression (AST) and a bit set with the indexes of groups
1805    /// that are referenced by backrefs.
1806    pub fn parse_tree(re: &str) -> Result<ExprTree> {
1807        Parser::parse(re)
1808    }
1809
1810    /// Parse the regex and return an expression (AST)
1811    /// Flags should be bit based based on flags
1812    pub fn parse_tree_with_flags(re: &str, flags: u32) -> Result<ExprTree> {
1813        Parser::parse_with_flags(re, flags)
1814    }
1815
1816    /// Convert expression to a regex string in the regex crate's syntax.
1817    ///
1818    /// # Panics
1819    ///
1820    /// Panics for expressions that are hard, i.e. can not be handled by the regex crate.
1821    pub fn to_str(&self, buf: &mut String, precedence: u8) {
1822        match *self {
1823            Expr::Empty => (),
1824            Expr::Any { newline } => buf.push_str(if newline { "(?s:.)" } else { "." }),
1825            Expr::Literal { ref val, casei } => {
1826                if casei {
1827                    buf.push_str("(?i:");
1828                }
1829                push_quoted(buf, val);
1830                if casei {
1831                    buf.push(')');
1832                }
1833            }
1834            Expr::Assertion(Assertion::StartText) => buf.push('^'),
1835            Expr::Assertion(Assertion::EndText) => buf.push('$'),
1836            Expr::Assertion(Assertion::StartLine { crlf: false }) => buf.push_str("(?m:^)"),
1837            Expr::Assertion(Assertion::EndLine { crlf: false }) => buf.push_str("(?m:$)"),
1838            Expr::Assertion(Assertion::StartLine { crlf: true }) => buf.push_str("(?Rm:^)"),
1839            Expr::Assertion(Assertion::EndLine { crlf: true }) => buf.push_str("(?Rm:$)"),
1840            Expr::Concat(ref children) => {
1841                if precedence > 1 {
1842                    buf.push_str("(?:");
1843                }
1844                for child in children {
1845                    child.to_str(buf, 2);
1846                }
1847                if precedence > 1 {
1848                    buf.push(')')
1849                }
1850            }
1851            Expr::Alt(ref children) => {
1852                if precedence > 0 {
1853                    buf.push_str("(?:");
1854                }
1855                for (i, child) in children.iter().enumerate() {
1856                    if i != 0 {
1857                        buf.push('|');
1858                    }
1859                    child.to_str(buf, 1);
1860                }
1861                if precedence > 0 {
1862                    buf.push(')');
1863                }
1864            }
1865            Expr::Group(ref child) => {
1866                buf.push('(');
1867                child.to_str(buf, 0);
1868                buf.push(')');
1869            }
1870            Expr::Repeat {
1871                ref child,
1872                lo,
1873                hi,
1874                greedy,
1875            } => {
1876                if precedence > 2 {
1877                    buf.push_str("(?:");
1878                }
1879                child.to_str(buf, 3);
1880                match (lo, hi) {
1881                    (0, 1) => buf.push('?'),
1882                    (0, usize::MAX) => buf.push('*'),
1883                    (1, usize::MAX) => buf.push('+'),
1884                    (lo, hi) => {
1885                        buf.push('{');
1886                        push_usize(buf, lo);
1887                        if lo != hi {
1888                            buf.push(',');
1889                            if hi != usize::MAX {
1890                                push_usize(buf, hi);
1891                            }
1892                        }
1893                        buf.push('}');
1894                    }
1895                }
1896                if !greedy {
1897                    buf.push('?');
1898                }
1899                if precedence > 2 {
1900                    buf.push(')');
1901                }
1902            }
1903            Expr::Delegate {
1904                ref inner, casei, ..
1905            } => {
1906                // at the moment, delegate nodes are just atoms
1907                if casei {
1908                    buf.push_str("(?i:");
1909                }
1910                buf.push_str(inner);
1911                if casei {
1912                    buf.push(')');
1913                }
1914            }
1915            _ => panic!("attempting to format hard expr {:?}", self),
1916        }
1917    }
1918}
1919
// Returns the byte index at which the code point in front of `ix` starts.
//
// precondition: ix > 0
fn prev_codepoint_ix(s: &str, mut ix: usize) -> usize {
    let bytes = s.as_bytes();
    ix -= 1;
    // Bytes of the form 0b10xx_xxxx are UTF-8 continuation bytes; keep stepping
    // back until a byte in 0..0x80 or 0xc0.. is found.
    while (bytes[ix] & 0xc0) == 0x80 {
        ix -= 1;
    }
    ix
}
1932
/// Maps a byte to a sequence length between 1 and 4, based only on which range the byte
/// falls in. Every byte value is mapped; no validation is performed.
fn codepoint_len(b: u8) -> usize {
    match b {
        0x00..=0x7f => 1,
        0x80..=0xdf => 2,
        0xe0..=0xef => 3,
        0xf0..=0xff => 4,
    }
}
1941
1942/// Returns the smallest possible index of the next valid UTF-8 sequence
1943/// starting after `i`.
1944/// Adapted from a function with the same name in the `regex` crate.
1945fn next_utf8(text: &str, i: usize) -> usize {
1946    let b = match text.as_bytes().get(i) {
1947        None => return i + 1,
1948        Some(&b) => b,
1949    };
1950    i + codepoint_len(b)
1951}
1952
1953// If this returns false, then there is no possible backref in the re
1954
1955// Both potential implementations are turned off, because we currently
1956// always need to do a deeper analysis because of 1-character
1957// look-behind. If we could call a find_from_pos method of regex::Regex,
1958// it would make sense to bring this back.
1959/*
1960pub fn detect_possible_backref(re: &str) -> bool {
1961    let mut last = b'\x00';
1962    for b in re.as_bytes() {
1963        if b'0' <= *b && *b <= b'9' && last == b'\\' { return true; }
1964        last = *b;
1965    }
1966    false
1967}
1968
1969pub fn detect_possible_backref(re: &str) -> bool {
1970    let mut bytes = re.as_bytes();
1971    loop {
1972        match memchr::memchr(b'\\', &bytes[..bytes.len() - 1]) {
1973            Some(i) => {
1974                bytes = &bytes[i + 1..];
1975                let c = bytes[0];
1976                if b'0' <= c && c <= b'9' { return true; }
1977            }
1978            None => return false
1979        }
1980    }
1981}
1982*/
1983
/// The internal module only exists so that the toy example can access internals for debugging and
/// experimenting.
///
/// It contains nothing but re-exports of items from other modules of this crate and is hidden
/// from the generated documentation.
#[doc(hidden)]
pub mod internal {
    // Re-exported from the `analyze` module.
    pub use crate::analyze::{analyze, can_compile_as_anchored};
    // Re-exported from the `compile` module.
    pub use crate::compile::compile;
    // Re-exported from the `optimize` module.
    pub use crate::optimize::optimize;
    // Flag constants re-exported from the `parse_flags` module.
    pub use crate::parse_flags::{
        FLAG_CASEI, FLAG_DOTNL, FLAG_IGNORE_SPACE, FLAG_MULTI, FLAG_ONIGURUMA_MODE, FLAG_UNICODE,
    };
    // Re-exported from the `vm` module.
    pub use crate::vm::{run_default, run_trace, Insn, Prog};
}
1996
#[cfg(test)]
mod tests {
    use alloc::borrow::Cow;
    use alloc::boxed::Box;
    use alloc::string::String;
    use alloc::{format, vec};

    use crate::parse::make_literal;
    use crate::{Expr, Regex, RegexImpl};

    //use detect_possible_backref;

    // tests for to_str

    /// Formats `e` at the lowest precedence and returns the resulting pattern.
    fn to_str(e: Expr) -> String {
        let mut rendered = String::new();
        e.to_str(&mut rendered, 0);
        rendered
    }

    /// An alternation inside a concatenation has to be grouped.
    #[test]
    fn to_str_concat_alt() {
        let alternation = Expr::Alt(vec![make_literal("a"), make_literal("b")]);
        let e = Expr::Concat(vec![alternation, make_literal("c")]);
        assert_eq!(to_str(e), "(?:a|b)c");
    }

    /// A concatenation under a repeat has to be grouped.
    #[test]
    fn to_str_rep_concat() {
        let sequence = Expr::Concat(vec![make_literal("a"), make_literal("b")]);
        let e = Expr::Repeat {
            child: Box::new(sequence),
            lo: 2,
            hi: 3,
            greedy: true,
        };
        assert_eq!(to_str(e), "(?:ab){2,3}");
    }

    /// A capture group delimits its alternation, so no extra group is added.
    #[test]
    fn to_str_group_alt() {
        let alternation = Expr::Alt(vec![make_literal("a"), make_literal("b")]);
        let e = Expr::Group(Box::new(alternation));
        assert_eq!(to_str(e), "(a|b)");
    }

    /// `as_str` and the `Debug` output both give back the original pattern.
    #[test]
    fn as_str_debug() {
        let pattern = r"(a+)b\1";
        let regex = Regex::new(pattern).unwrap();
        assert_eq!(pattern, regex.as_str());
        assert_eq!(pattern, format!("{:?}", regex));
    }

    /// The `Display` output gives back the original pattern.
    #[test]
    fn display() {
        let pattern = r"(a+)b\1";
        let regex = Regex::new(pattern).unwrap();
        assert_eq!(pattern, format!("{}", regex));
    }

    /// A regex can be built through `str::parse`.
    #[test]
    fn from_str() {
        let pattern = r"(a+)b\1";
        let regex = pattern.parse::<Regex>().unwrap();
        assert_eq!(regex.as_str(), pattern);
    }

    /// Every combination of bounds and greediness uses the expected operator.
    #[test]
    fn to_str_repeat() {
        let cases = [
            (2, 2, true, "a{2}"),
            (2, 2, false, "a{2}?"),
            (2, 3, true, "a{2,3}"),
            (2, 3, false, "a{2,3}?"),
            (2, usize::MAX, true, "a{2,}"),
            (2, usize::MAX, false, "a{2,}?"),
            (0, 1, true, "a?"),
            (0, 1, false, "a??"),
            (0, usize::MAX, true, "a*"),
            (0, usize::MAX, false, "a*?"),
            (1, usize::MAX, true, "a+"),
            (1, usize::MAX, false, "a+?"),
        ];

        for &(lo, hi, greedy, expected) in cases.iter() {
            let e = Expr::Repeat {
                child: Box::new(make_literal("a")),
                lo,
                hi,
                greedy,
            };
            assert_eq!(to_str(e), expected);
        }
    }

    #[test]
    fn escape() {
        // Input without special characters must come back borrowed, and punctuation that is
        // not special must be left alone.
        if let Cow::Borrowed(unchanged) = crate::escape("@foo") {
            assert_eq!(unchanged, "@foo");
        } else {
            panic!("Value should be borrowed.");
        }

        // Typical usage.
        let escaped = crate::escape("fo*o").into_owned();
        assert_eq!(escaped, "fo\\*o");

        // Multibyte characters are passed through untouched.
        let escaped = crate::escape("fø*ø").into_owned();
        assert_eq!(escaped, "fø\\*ø");
    }

    #[test]
    fn trailing_positive_lookahead_wrap_capture_group_fixup() {
        let pattern = r"a+(?=c)";
        let regex = pattern.parse::<Regex>().unwrap();
        let wraps_with_explicit_group_0 = matches!(
            regex.inner,
            RegexImpl::Wrap {
                explicit_capture_group_0: true,
                ..
            }
        );
        assert!(
            wraps_with_explicit_group_0,
            "trailing positive lookahead for an otherwise easy pattern should avoid going through the VM"
        );
        assert_eq!(pattern, regex.as_str());
        assert_eq!(pattern, format!("{:?}", regex));
    }

    #[test]
    fn easy_regex() {
        let pattern = r"(a+)b";
        let regex = pattern.parse::<Regex>().unwrap();
        let wraps_with_implicit_group_0 = matches!(
            regex.inner,
            RegexImpl::Wrap {
                explicit_capture_group_0: false,
                ..
            }
        );
        assert!(
            wraps_with_implicit_group_0,
            "easy pattern should avoid going through the VM, and capture group 0 should be implicit"
        );

        assert_eq!(pattern, regex.as_str());
        assert_eq!(pattern, format!("{:?}", regex));
    }

    #[test]
    fn hard_regex() {
        let pattern = r"(a+)(?>c)";
        let regex = pattern.parse::<Regex>().unwrap();
        let is_fancy = matches!(regex.inner, RegexImpl::Fancy { .. });
        assert!(is_fancy, "hard regex should be compiled into a VM");
        assert_eq!(pattern, regex.as_str());
        assert_eq!(pattern, format!("{:?}", regex));
    }

    // Disabled together with `detect_possible_backref`.
    /*
    #[test]
    fn detect_backref() {
        assert_eq!(detect_possible_backref("a0a1a2"), false);
        assert_eq!(detect_possible_backref("a0a1\\a2"), false);
        assert_eq!(detect_possible_backref("a0a\\1a2"), true);
        assert_eq!(detect_possible_backref("a0a1a2\\"), false);
    }
    */
}
// fancy_regex/lib.rs