fancy_regex/
analyze.rs

1// Copyright 2016 The Fancy Regex Authors.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21//! Analysis of regex expressions.
22
23use alloc::string::String;
24use alloc::vec::Vec;
25use core::cmp::min;
26
27use bit_set::BitSet;
28
29use crate::parse::ExprTree;
30use crate::{CompileError, Error, Expr, Result};
31
32#[derive(Debug)]
33pub struct Info<'a> {
34    pub(crate) start_group: usize,
35    pub(crate) end_group: usize,
36    pub(crate) min_size: usize,
37    pub(crate) const_size: bool,
38    pub(crate) hard: bool,
39    pub(crate) expr: &'a Expr,
40    pub(crate) children: Vec<Info<'a>>,
41}
42
43impl<'a> Info<'a> {
44    pub(crate) fn is_literal(&self) -> bool {
45        match *self.expr {
46            Expr::Literal { casei, .. } => !casei,
47            Expr::Concat(_) => self.children.iter().all(|child| child.is_literal()),
48            _ => false,
49        }
50    }
51
52    pub(crate) fn push_literal(&self, buf: &mut String) {
53        match *self.expr {
54            // could be more paranoid about checking casei
55            Expr::Literal { ref val, .. } => buf.push_str(val),
56            Expr::Concat(_) => {
57                for child in &self.children {
58                    child.push_literal(buf);
59                }
60            }
61            _ => panic!("push_literal called on non-literal"),
62        }
63    }
64}
65
66struct Analyzer<'a> {
67    backrefs: &'a BitSet,
68    group_ix: usize,
69}
70
71impl<'a> Analyzer<'a> {
72    fn visit(&mut self, expr: &'a Expr) -> Result<Info<'a>> {
73        let start_group = self.group_ix;
74        let mut children = Vec::new();
75        let mut min_size = 0;
76        let mut const_size = false;
77        let mut hard = false;
78        match *expr {
79            Expr::Assertion(assertion) if assertion.is_hard() => {
80                const_size = true;
81                hard = true;
82            }
83            Expr::Empty | Expr::Assertion(_) => {
84                const_size = true;
85            }
86            Expr::Any { .. } => {
87                min_size = 1;
88                const_size = true;
89            }
90            Expr::Literal { ref val, casei } => {
91                // right now each character in a literal gets its own node, that might change
92                min_size = 1;
93                const_size = literal_const_size(val, casei);
94            }
95            Expr::Concat(ref v) => {
96                const_size = true;
97                for child in v {
98                    let child_info = self.visit(child)?;
99                    min_size += child_info.min_size;
100                    const_size &= child_info.const_size;
101                    hard |= child_info.hard;
102                    children.push(child_info);
103                }
104            }
105            Expr::Alt(ref v) => {
106                let child_info = self.visit(&v[0])?;
107                min_size = child_info.min_size;
108                const_size = child_info.const_size;
109                hard = child_info.hard;
110                children.push(child_info);
111                for child in &v[1..] {
112                    let child_info = self.visit(child)?;
113                    const_size &= child_info.const_size && min_size == child_info.min_size;
114                    min_size = min(min_size, child_info.min_size);
115                    hard |= child_info.hard;
116                    children.push(child_info);
117                }
118            }
119            Expr::Group(ref child) => {
120                let group = self.group_ix;
121                self.group_ix += 1;
122                let child_info = self.visit(child)?;
123                min_size = child_info.min_size;
124                const_size = child_info.const_size;
125                // If there's a backref to this group, we potentially have to backtrack within the
126                // group. E.g. with `(x|xy)\1` and input `xyxy`, `x` matches but then the backref
127                // doesn't, so we have to backtrack and try `xy`.
128                hard = child_info.hard | self.backrefs.contains(group);
129                children.push(child_info);
130            }
131            Expr::LookAround(ref child, _) => {
132                let child_info = self.visit(child)?;
133                // min_size = 0
134                const_size = true;
135                hard = true;
136                children.push(child_info);
137            }
138            Expr::Repeat {
139                ref child, lo, hi, ..
140            } => {
141                let child_info = self.visit(child)?;
142                min_size = child_info.min_size * lo;
143                const_size = child_info.const_size && lo == hi;
144                hard = child_info.hard;
145                children.push(child_info);
146            }
147            Expr::Delegate { size, .. } => {
148                // currently only used for empty and single-char matches
149                min_size = size;
150                const_size = true;
151            }
152            Expr::Backref(group) => {
153                if group >= self.group_ix {
154                    return Err(Error::CompileError(CompileError::InvalidBackref));
155                }
156                hard = true;
157            }
158            Expr::AtomicGroup(ref child) => {
159                let child_info = self.visit(child)?;
160                min_size = child_info.min_size;
161                const_size = child_info.const_size;
162                hard = true; // TODO: possibly could weaken
163                children.push(child_info);
164            }
165            Expr::KeepOut => {
166                hard = true;
167                const_size = true;
168            }
169            Expr::ContinueFromPreviousMatchEnd => {
170                hard = true;
171                const_size = true;
172            }
173            Expr::BackrefExistsCondition(group) => {
174                if group >= self.group_ix {
175                    return Err(Error::CompileError(CompileError::InvalidBackref));
176                }
177                hard = true;
178                const_size = true;
179            }
180            Expr::Conditional {
181                ref condition,
182                ref true_branch,
183                ref false_branch,
184            } => {
185                hard = true;
186
187                let child_info_condition = self.visit(condition)?;
188                let child_info_truth = self.visit(true_branch)?;
189                let child_info_false = self.visit(false_branch)?;
190
191                min_size = child_info_condition.min_size
192                    + min(child_info_truth.min_size, child_info_false.min_size);
193                const_size = child_info_condition.const_size
194                    && child_info_truth.const_size
195                    && child_info_false.const_size
196                    // if the condition's size plus the truth branch's size is equal to the false branch's size then it's const size
197                    && child_info_condition.min_size + child_info_truth.min_size == child_info_false.min_size;
198
199                children.push(child_info_condition);
200                children.push(child_info_truth);
201                children.push(child_info_false);
202            }
203        };
204
205        Ok(Info {
206            expr,
207            children,
208            start_group,
209            end_group: self.group_ix,
210            min_size,
211            const_size,
212            hard,
213        })
214    }
215}
216
217fn literal_const_size(_: &str, _: bool) -> bool {
218    // Right now, regex doesn't do sophisticated case folding,
219    // test below will fail when that changes, then we need to
220    // do something fancier here.
221    true
222}
223
224/// Analyze the parsed expression to determine whether it requires fancy features.
225pub fn analyze<'a>(tree: &'a ExprTree) -> Result<Info<'a>> {
226    let mut analyzer = Analyzer {
227        backrefs: &tree.backrefs,
228        group_ix: 0,
229    };
230
231    analyzer.visit(&tree.expr)
232}
233
234#[cfg(test)]
235mod tests {
236    use super::analyze;
237    // use super::literal_const_size;
238    use crate::Expr;
239
240    // #[test]
241    // fn case_folding_safe() {
242    //     let re = regex::Regex::new("(?i:ß)").unwrap();
243    //     if re.is_match("SS") {
244    //         assert!(!literal_const_size("ß", true));
245    //     }
246
247    //     // Another tricky example, Armenian ECH YIWN
248    //     let re = regex::Regex::new("(?i:\\x{0587})").unwrap();
249    //     if re.is_match("\u{0565}\u{0582}") {
250    //         assert!(!literal_const_size("\u{0587}", true));
251    //     }
252    // }
253
254    #[test]
255    fn invalid_backref_1() {
256        assert!(analyze(&Expr::parse_tree(".\\0").unwrap()).is_err());
257    }
258
259    #[test]
260    fn invalid_backref_2() {
261        assert!(analyze(&Expr::parse_tree("(.\\1)").unwrap()).is_err());
262    }
263
264    #[test]
265    fn invalid_backref_3() {
266        assert!(analyze(&Expr::parse_tree("\\1(.)").unwrap()).is_err());
267    }
268
269    #[test]
270    fn is_literal() {
271        let tree = Expr::parse_tree("abc").unwrap();
272        let info = analyze(&tree).unwrap();
273        assert_eq!(info.is_literal(), true);
274    }
275
276    #[test]
277    fn is_literal_with_repeat() {
278        let tree = Expr::parse_tree("abc*").unwrap();
279        let info = analyze(&tree).unwrap();
280        assert_eq!(info.is_literal(), false);
281    }
282}