1use core::{cell::RefCell, mem::size_of};
2
3use alloc::{string::String, sync::Arc, vec, vec::Vec};
4
5use crate::{
6 error::Error,
7 hir::{self, Hir, HirKind},
8 int::U32,
9};
10
11pub(crate) type StateID = u32;
12
13#[derive(Clone, Copy, Debug)]
14pub(crate) struct Config {
15 pub(crate) size_limit: Option<usize>,
16}
17
18impl Default for Config {
19 fn default() -> Config {
20 Config { size_limit: Some(10 * (1 << 20)) }
21 }
22}
23
24#[derive(Clone)]
25pub(crate) struct NFA {
26 pattern: String,
30 states: Vec<State>,
32 start: StateID,
34 is_start_anchored: bool,
36 is_match_empty: bool,
38 static_explicit_captures_len: Option<usize>,
41 cap_name_to_index: CaptureNameMap,
43 cap_index_to_name: Vec<Option<Arc<str>>>,
46 memory_extra: usize,
51}
52
53impl NFA {
54 pub(crate) fn new(
56 config: Config,
57 pattern: String,
58 hir: &Hir,
59 ) -> Result<NFA, Error> {
60 Compiler::new(config, pattern).compile(hir)
61 }
62
63 pub(crate) fn pattern(&self) -> &str {
65 &self.pattern
66 }
67
68 pub(crate) fn state(&self, id: StateID) -> &State {
74 &self.states[id.as_usize()]
75 }
76
77 pub(crate) fn len(&self) -> usize {
79 self.states.len()
80 }
81
82 pub(crate) fn start(&self) -> StateID {
84 self.start
85 }
86
87 pub(crate) fn to_index(&self, name: &str) -> Option<usize> {
90 self.cap_name_to_index.get(name).cloned().map(|i| i.as_usize())
91 }
92
93 pub(crate) fn capture_names(&self) -> CaptureNames<'_> {
104 CaptureNames { it: self.cap_index_to_name.iter() }
105 }
106
107 pub(crate) fn group_len(&self) -> usize {
110 self.cap_index_to_name.len()
111 }
112
113 pub(crate) fn is_start_anchored(&self) -> bool {
116 self.is_start_anchored
117 }
118
119 pub(crate) fn static_explicit_captures_len(&self) -> Option<usize> {
123 self.static_explicit_captures_len
124 }
125
126 fn memory_usage(&self) -> usize {
128 (self.states.len() * size_of::<State>())
129 + (self.cap_index_to_name.len() * size_of::<Option<Arc<str>>>())
130 + self.memory_extra
131 }
132}
133
134impl core::fmt::Debug for NFA {
135 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
136 writeln!(f, "NFA(")?;
137 writeln!(f, "pattern: {}", self.pattern)?;
138 for (sid, state) in self.states.iter().enumerate() {
139 writeln!(f, "{:07?}: {:?}", sid, state)?;
140 }
141 writeln!(f, ")")?;
142 Ok(())
143 }
144}
145
146#[derive(Clone, Debug)]
151pub(crate) struct CaptureNames<'a> {
152 it: core::slice::Iter<'a, Option<Arc<str>>>,
153}
154
155impl<'a> Iterator for CaptureNames<'a> {
156 type Item = Option<&'a str>;
157
158 fn next(&mut self) -> Option<Option<&'a str>> {
159 self.it.next().map(|n| n.as_deref())
160 }
161}
162
163#[derive(Clone, Eq, PartialEq)]
164pub(crate) enum State {
165 Char { target: StateID, ch: char },
166 Ranges { target: StateID, ranges: Vec<(char, char)> },
167 Splits { targets: Vec<StateID>, reverse: bool },
168 Goto { target: StateID, look: Option<hir::Look> },
169 Capture { target: StateID, slot: u32 },
170 Fail,
171 Match,
172}
173
174impl State {
175 fn memory_usage(&self) -> usize {
177 match *self {
178 State::Char { .. }
179 | State::Goto { .. }
180 | State::Capture { .. }
181 | State::Fail { .. }
182 | State::Match => 0,
183 State::Splits { ref targets, .. } => {
184 targets.len() * size_of::<StateID>()
185 }
186 State::Ranges { ref ranges, .. } => {
187 ranges.len() * size_of::<(char, char)>()
188 }
189 }
190 }
191
192 pub(crate) fn iter_splits<'a>(
195 splits: &'a [StateID],
196 reverse: bool,
197 ) -> impl Iterator<Item = StateID> + 'a {
198 let mut it = splits.iter();
199 core::iter::from_fn(move || {
200 if reverse { it.next_back() } else { it.next() }.copied()
201 })
202 }
203}
204
205impl core::fmt::Debug for State {
206 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
207 match *self {
208 State::Char { target, ch } => {
209 write!(f, "{:?} => {:?}", ch, target)
210 }
211 State::Ranges { target, ref ranges } => {
212 for (i, &(start, end)) in ranges.iter().enumerate() {
213 if i > 0 {
214 write!(f, ", ")?;
215 }
216 write!(f, "{:?}-{:?} => {:?}", start, end, target)?;
217 }
218 Ok(())
219 }
220 State::Splits { ref targets, reverse } => {
221 write!(f, "splits(")?;
222 for (i, sid) in
223 State::iter_splits(targets, reverse).enumerate()
224 {
225 if i > 0 {
226 write!(f, ", ")?;
227 }
228 write!(f, "{:?}", sid)?;
229 }
230 write!(f, ")")
231 }
232 State::Goto { target, look: None } => {
233 write!(f, "goto({:?})", target)
234 }
235 State::Goto { target, look: Some(look) } => {
236 write!(f, "{:?} => {:?}", look, target)
237 }
238 State::Capture { target, slot } => {
239 write!(f, "capture(slot={:?}) => {:?}", slot, target,)
240 }
241 State::Fail => write!(f, "FAIL"),
242 State::Match => {
243 write!(f, "MATCH")
244 }
245 }
246 }
247}
248
249#[cfg(feature = "std")]
255type CaptureNameMap = std::collections::HashMap<Arc<str>, u32>;
256#[cfg(not(feature = "std"))]
257type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, u32>;
258
259#[derive(Debug)]
260struct Compiler {
261 config: Config,
262 nfa: RefCell<NFA>,
263}
264
265impl Compiler {
266 fn new(config: Config, pattern: String) -> Compiler {
267 let nfa = RefCell::new(NFA {
268 pattern,
269 states: vec![],
270 start: 0,
271 is_start_anchored: false,
272 is_match_empty: false,
273 static_explicit_captures_len: None,
274 cap_name_to_index: CaptureNameMap::default(),
275 cap_index_to_name: vec![],
276 memory_extra: 0,
277 });
278 Compiler { config, nfa }
279 }
280
281 fn compile(self, hir: &Hir) -> Result<NFA, Error> {
282 self.nfa.borrow_mut().is_start_anchored = hir.is_start_anchored();
283 self.nfa.borrow_mut().is_match_empty = hir.is_match_empty();
284 self.nfa.borrow_mut().static_explicit_captures_len =
285 hir.static_explicit_captures_len();
286 let compiled = self.c_capture(0, None, hir)?;
287 let mat = self.add(State::Match)?;
288 self.patch(compiled.end, mat)?;
289 self.nfa.borrow_mut().start = compiled.start;
290 Ok(self.nfa.into_inner())
291 }
292
293 fn c(&self, hir: &Hir) -> Result<ThompsonRef, Error> {
294 match *hir.kind() {
295 HirKind::Empty => self.c_empty(),
296 HirKind::Char(ch) => self.c_char(ch),
297 HirKind::Class(ref class) => self.c_class(class),
298 HirKind::Look(ref look) => self.c_look(look),
299 HirKind::Repetition(ref rep) => self.c_repetition(rep),
300 HirKind::Capture(ref cap) => {
301 self.c_capture(cap.index, cap.name.as_deref(), &cap.sub)
302 }
303 HirKind::Concat(ref subs) => {
304 self.c_concat(subs.iter().map(|s| self.c(s)))
305 }
306 HirKind::Alternation(ref subs) => {
307 self.c_alternation(subs.iter().map(|s| self.c(s)))
308 }
309 }
310 }
311
312 fn c_fail(&self) -> Result<ThompsonRef, Error> {
314 let id = self.add(State::Fail)?;
315 Ok(ThompsonRef { start: id, end: id })
316 }
317
318 fn c_empty(&self) -> Result<ThompsonRef, Error> {
324 let id = self.add_empty()?;
325 Ok(ThompsonRef { start: id, end: id })
326 }
327
328 fn c_char(&self, ch: char) -> Result<ThompsonRef, Error> {
330 let id = self.add(State::Char { target: 0, ch })?;
331 Ok(ThompsonRef { start: id, end: id })
332 }
333
334 fn c_class(&self, class: &hir::Class) -> Result<ThompsonRef, Error> {
338 let id = if class.ranges.is_empty() {
339 self.add(State::Fail)
345 } else {
346 let ranges =
347 class.ranges.iter().map(|r| (r.start, r.end)).collect();
348 self.add(State::Ranges { target: 0, ranges })
349 }?;
350 Ok(ThompsonRef { start: id, end: id })
351 }
352
353 fn c_look(&self, look: &hir::Look) -> Result<ThompsonRef, Error> {
356 let id = self.add(State::Goto { target: 0, look: Some(*look) })?;
357 Ok(ThompsonRef { start: id, end: id })
358 }
359
360 fn c_repetition(
363 &self,
364 rep: &hir::Repetition,
365 ) -> Result<ThompsonRef, Error> {
366 match (rep.min, rep.max) {
367 (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy),
368 (min, None) => self.c_at_least(&rep.sub, rep.greedy, min),
369 (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min),
370 (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max),
371 }
372 }
373
374 fn c_bounded(
381 &self,
382 hir: &Hir,
383 greedy: bool,
384 min: u32,
385 max: u32,
386 ) -> Result<ThompsonRef, Error> {
387 let prefix = self.c_exactly(hir, min)?;
388 if min == max {
389 return Ok(prefix);
390 }
391
392 let empty = self.add_empty()?;
422 let mut prev_end = prefix.end;
423 for _ in min..max {
424 let splits =
425 self.add(State::Splits { targets: vec![], reverse: !greedy })?;
426 let compiled = self.c(hir)?;
427 self.patch(prev_end, splits)?;
428 self.patch(splits, compiled.start)?;
429 self.patch(splits, empty)?;
430 prev_end = compiled.end;
431 }
432 self.patch(prev_end, empty)?;
433 Ok(ThompsonRef { start: prefix.start, end: empty })
434 }
435
436 fn c_at_least(
444 &self,
445 hir: &Hir,
446 greedy: bool,
447 n: u32,
448 ) -> Result<ThompsonRef, Error> {
449 if n == 0 {
450 if !hir.is_match_empty() {
455 let splits = self.add(State::Splits {
456 targets: vec![],
457 reverse: !greedy,
458 })?;
459 let compiled = self.c(hir)?;
460 self.patch(splits, compiled.start)?;
461 self.patch(compiled.end, splits)?;
462 return Ok(ThompsonRef { start: splits, end: splits });
463 }
464
465 let compiled = self.c(hir)?;
474 let plus =
475 self.add(State::Splits { targets: vec![], reverse: !greedy })?;
476 self.patch(compiled.end, plus)?;
477 self.patch(plus, compiled.start)?;
478
479 let question =
480 self.add(State::Splits { targets: vec![], reverse: !greedy })?;
481 let empty = self.add_empty()?;
482 self.patch(question, compiled.start)?;
483 self.patch(question, empty)?;
484 self.patch(plus, empty)?;
485 Ok(ThompsonRef { start: question, end: empty })
486 } else if n == 1 {
487 let compiled = self.c(hir)?;
488 let splits =
489 self.add(State::Splits { targets: vec![], reverse: !greedy })?;
490 self.patch(compiled.end, splits)?;
491 self.patch(splits, compiled.start)?;
492 Ok(ThompsonRef { start: compiled.start, end: splits })
493 } else {
494 let prefix = self.c_exactly(hir, n - 1)?;
495 let last = self.c(hir)?;
496 let splits =
497 self.add(State::Splits { targets: vec![], reverse: !greedy })?;
498 self.patch(prefix.end, last.start)?;
499 self.patch(last.end, splits)?;
500 self.patch(splits, last.start)?;
501 Ok(ThompsonRef { start: prefix.start, end: splits })
502 }
503 }
504
505 fn c_zero_or_one(
512 &self,
513 hir: &Hir,
514 greedy: bool,
515 ) -> Result<ThompsonRef, Error> {
516 let splits =
517 self.add(State::Splits { targets: vec![], reverse: !greedy })?;
518 let compiled = self.c(hir)?;
519 let empty = self.add_empty()?;
520 self.patch(splits, compiled.start)?;
521 self.patch(splits, empty)?;
522 self.patch(compiled.end, empty)?;
523 Ok(ThompsonRef { start: splits, end: empty })
524 }
525
526 fn c_exactly(&self, hir: &Hir, n: u32) -> Result<ThompsonRef, Error> {
528 self.c_concat((0..n).map(|_| self.c(hir)))
529 }
530
531 fn c_capture(
535 &self,
536 index: u32,
537 name: Option<&str>,
538 hir: &Hir,
539 ) -> Result<ThompsonRef, Error> {
540 let existing_groups_len = self.nfa.borrow().cap_index_to_name.len();
545 for _ in 0..(index.as_usize().saturating_sub(existing_groups_len)) {
546 self.nfa.borrow_mut().cap_index_to_name.push(None);
547 }
548 if index.as_usize() >= existing_groups_len {
549 if let Some(name) = name {
550 let name = Arc::from(name);
551 let mut nfa = self.nfa.borrow_mut();
552 nfa.cap_name_to_index.insert(Arc::clone(&name), index);
553 nfa.cap_index_to_name.push(Some(Arc::clone(&name)));
554 nfa.memory_extra += name.len() + size_of::<u32>();
556 } else {
557 self.nfa.borrow_mut().cap_index_to_name.push(None);
558 }
559 }
560
561 let Some(slot) = index.checked_mul(2) else {
562 return Err(Error::new("capture group slots exhausted"));
563 };
564 let start = self.add(State::Capture { target: 0, slot })?;
565 let inner = self.c(hir)?;
566 let Some(slot) = slot.checked_add(1) else {
567 return Err(Error::new("capture group slots exhausted"));
568 };
569 let end = self.add(State::Capture { target: 0, slot })?;
570 self.patch(start, inner.start)?;
571 self.patch(inner.end, end)?;
572
573 Ok(ThompsonRef { start, end })
574 }
575
576 fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
580 where
581 I: Iterator<Item = Result<ThompsonRef, Error>>,
582 {
583 let ThompsonRef { start, mut end } = match it.next() {
584 Some(result) => result?,
585 None => return self.c_empty(),
586 };
587 for result in it {
588 let compiled = result?;
589 self.patch(end, compiled.start)?;
590 end = compiled.end;
591 }
592 Ok(ThompsonRef { start, end })
593 }
594
595 fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
603 where
604 I: Iterator<Item = Result<ThompsonRef, Error>>,
605 {
606 let first = match it.next() {
607 None => return self.c_fail(),
608 Some(result) => result?,
609 };
610 let second = match it.next() {
611 None => return Ok(first),
612 Some(result) => result?,
613 };
614
615 let splits =
616 self.add(State::Splits { targets: vec![], reverse: false })?;
617 let end = self.add_empty()?;
618 self.patch(splits, first.start)?;
619 self.patch(first.end, end)?;
620 self.patch(splits, second.start)?;
621 self.patch(second.end, end)?;
622 for result in it {
623 let compiled = result?;
624 self.patch(splits, compiled.start)?;
625 self.patch(compiled.end, end)?;
626 }
627 Ok(ThompsonRef { start: splits, end })
628 }
629
630 fn add_empty(&self) -> Result<StateID, Error> {
637 self.add(State::Goto { target: 0, look: None })
638 }
639
640 fn add(&self, state: State) -> Result<StateID, Error> {
644 let id = u32::try_from(self.nfa.borrow().states.len())
645 .map_err(|_| Error::new("exhausted state IDs, too many states"))?;
646 self.nfa.borrow_mut().memory_extra += state.memory_usage();
647 self.nfa.borrow_mut().states.push(state);
648 self.check_size_limit()?;
649 Ok(id)
650 }
651
652 fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> {
666 let mut new_memory_extra = self.nfa.borrow().memory_extra;
667 match self.nfa.borrow_mut().states[from.as_usize()] {
668 State::Char { ref mut target, .. } => {
669 *target = to;
670 }
671 State::Ranges { ref mut target, .. } => {
672 *target = to;
673 }
674 State::Splits { ref mut targets, .. } => {
675 targets.push(to);
676 new_memory_extra += size_of::<StateID>();
677 }
678 State::Goto { ref mut target, .. } => {
679 *target = to;
680 }
681 State::Capture { ref mut target, .. } => {
682 *target = to;
683 }
684 State::Fail | State::Match => {}
685 }
686 if new_memory_extra != self.nfa.borrow().memory_extra {
687 self.nfa.borrow_mut().memory_extra = new_memory_extra;
688 self.check_size_limit()?;
689 }
690 Ok(())
691 }
692
693 fn check_size_limit(&self) -> Result<(), Error> {
697 if let Some(limit) = self.config.size_limit {
698 if self.nfa.borrow().memory_usage() > limit {
699 return Err(Error::new("compiled regex exceeded size limit"));
700 }
701 }
702 Ok(())
703 }
704}
705
706#[derive(Clone, Copy, Debug)]
710struct ThompsonRef {
711 start: StateID,
712 end: StateID,
713}