use alloc::string::String;
use alloc::vec::Vec;
use core::usize;
use regex_automata::meta::Regex as RaRegex;
use regex_automata::meta::{Builder as RaBuilder, Config as RaConfig};
#[cfg(all(test, feature = "std"))]
use std::{collections::BTreeMap, sync::RwLock};

use crate::analyze::Info;
use crate::vm::{Insn, Prog};
use crate::LookAround::*;
use crate::{CompileError, Error, Expr, LookAround, RegexOptions, Result};

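/// Accumulates VM instructions and tracks how many save slots the program
/// needs: two per capture group up front, plus extra slots handed out on
/// demand for repeats and lookarounds.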
struct VMBuilder {
    prog: Vec<Insn>,
    n_saves: usize,
}

impl VMBuilder {
    fn new(max_group: usize) -> VMBuilder {
        VMBuilder {
            prog: Vec::new(),
            n_saves: max_group * 2,
        }
    }

    fn build(self) -> Prog {
        Prog::new(self.prog, self.n_saves)
    }

    fn newsave(&mut self) -> usize {
        let result = self.n_saves;
        self.n_saves += 1;
        result
    }

    fn pc(&self) -> usize {
        self.prog.len()
    }

    fn add(&mut self, insn: Insn) {
        self.prog.push(insn);
    }

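    // Jump targets of Jmp, Split, and Repeat* instructions are emitted as
    // placeholders and back-patched with the helpers below once the real
    // target pc is known.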
    fn set_jmp_target(&mut self, jmp_pc: usize, target: usize) {
        match self.prog[jmp_pc] {
            Insn::Jmp(ref mut next) => *next = target,
            _ => panic!("mutating instruction other than Jmp"),
        }
    }

    fn set_split_target(&mut self, split_pc: usize, target: usize, second: bool) {
        match self.prog[split_pc] {
            Insn::Split(_, ref mut y) if second => *y = target,
            Insn::Split(ref mut x, _) => *x = target,
            _ => panic!("mutating instruction other than Split"),
        }
    }

    fn set_repeat_target(&mut self, repeat_pc: usize, target: usize) {
        match self.prog[repeat_pc] {
            Insn::RepeatGr { ref mut next, .. }
            | Insn::RepeatNg { ref mut next, .. }
            | Insn::RepeatEpsilonGr { ref mut next, .. }
            | Insn::RepeatEpsilonNg { ref mut next, .. } => *next = target,
            _ => panic!("mutating instruction other than Repeat"),
        }
    }
}

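/// Walks the analyzed `Info` tree and emits VM instructions, delegating
/// subexpressions that don't need backtracking features to the wrapped
/// regex engine.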
struct Compiler {
    b: VMBuilder,
    options: RegexOptions,
}

impl Compiler {
    fn new(max_group: usize) -> Compiler {
        Compiler {
            b: VMBuilder::new(max_group),
            options: Default::default(),
        }
    }

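    /// Compile a single node. `hard` means the surrounding context requires
    /// this node to be compiled to VM instructions; when neither the context
    /// nor the node itself is hard, the whole subtree is delegated instead.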
    fn visit(&mut self, info: &Info<'_>, hard: bool) -> Result<()> {
        if !hard && !info.hard {
            return self.compile_delegate(info);
        }
        match *info.expr {
            Expr::Empty => (),
            Expr::Literal { ref val, casei } => {
                if !casei {
                    self.b.add(Insn::Lit(val.clone()));
                } else {
                    self.compile_delegate(info)?;
                }
            }
            Expr::Any { newline: true } => {
                self.b.add(Insn::Any);
            }
            Expr::Any { newline: false } => {
                self.b.add(Insn::AnyNoNL);
            }
            Expr::Concat(_) => {
                self.compile_concat(info, hard)?;
            }
            Expr::Alt(_) => {
                let count = info.children.len();
                self.compile_alt(count, |compiler, i| compiler.visit(&info.children[i], hard))?;
            }
            Expr::Group(_) => {
                let group = info.start_group;
                self.b.add(Insn::Save(group * 2));
                self.visit(&info.children[0], hard)?;
                self.b.add(Insn::Save(group * 2 + 1));
            }
            Expr::Repeat { lo, hi, greedy, .. } => {
                self.compile_repeat(info, lo, hi, greedy, hard)?;
            }
            Expr::LookAround(_, la) => {
                self.compile_lookaround(info, la)?;
            }
            Expr::Backref(group) => {
                self.b.add(Insn::Backref(group * 2));
            }
            Expr::BackrefExistsCondition(group) => {
                self.b.add(Insn::BackrefExistsCondition(group));
            }
            Expr::AtomicGroup(_) => {
                self.b.add(Insn::BeginAtomic);
                self.visit(&info.children[0], false)?;
                self.b.add(Insn::EndAtomic);
            }
            Expr::Delegate { .. } => {
                self.compile_delegate(info)?;
            }
            Expr::Assertion(assertion) => {
                self.b.add(Insn::Assertion(assertion));
            }
            Expr::KeepOut => {
                self.b.add(Insn::Save(0));
            }
            Expr::ContinueFromPreviousMatchEnd => {
                self.b.add(Insn::ContinueFromPreviousMatchEnd);
            }
            Expr::Conditional { .. } => {
                self.compile_conditional(|compiler, i| compiler.visit(&info.children[i], hard))?;
            }
        }
        Ok(())
    }

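    // An alternation compiles to a chain of Splits, one per alternative except
    // the last, with each non-final branch ending in a Jmp to the common
    // continuation. For `a|b|c` (see `jumps_for_alternation` below):
    //
    //   0: Split(1, 3)
    //   1: Lit("a")
    //   2: Jmp(7)
    //   3: Split(4, 6)
    //   4: Lit("b")
    //   5: Jmp(7)
    //   6: Lit("c")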
    fn compile_alt<F>(&mut self, count: usize, mut handle_alternative: F) -> Result<()>
    where
        F: FnMut(&mut Compiler, usize) -> Result<()>,
    {
        let mut jmps = Vec::new();
        let mut last_pc = usize::MAX;
        for i in 0..count {
            let has_next = i != count - 1;
            let pc = self.b.pc();
            if has_next {
                self.b.add(Insn::Split(pc + 1, usize::MAX));
            }
            if last_pc != usize::MAX {
                self.b.set_split_target(last_pc, pc, true);
            }
            last_pc = pc;

            handle_alternative(self, i)?;

            if has_next {
                let pc = self.b.pc();
                jmps.push(pc);
                self.b.add(Insn::Jmp(0));
            }
        }
        let next_pc = self.b.pc();
        for jmp_pc in jmps {
            self.b.set_jmp_target(jmp_pc, next_pc);
        }
        Ok(())
    }

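    // A conditional `(?(condition)true|false)` compiles to an atomic group
    // around the condition, a Split whose second branch is the false arm, the
    // true arm, and a Jmp over the false arm; see
    // `conditional_expression_can_be_compiled` below for the exact layout.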
    fn compile_conditional<F>(&mut self, mut handle_child: F) -> Result<()>
    where
        F: FnMut(&mut Compiler, usize) -> Result<()>,
    {
        self.b.add(Insn::BeginAtomic);

        let split_pc = self.b.pc();
        self.b.add(Insn::Split(split_pc + 1, usize::MAX));

        handle_child(self, 0)?;

        self.b.add(Insn::EndAtomic);

        handle_child(self, 1)?;
        let jump_over_false_pc = self.b.pc();
        self.b.add(Insn::Jmp(0));

        self.b.set_split_target(split_pc, self.b.pc(), true);
        handle_child(self, 2)?;

        self.b.set_jmp_target(jump_over_false_pc, self.b.pc());

        Ok(())
    }

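    // For a concatenation, the longest prefix of const-size, non-hard children
    // is delegated as one block, as is the longest non-hard suffix (which must
    // also be const-size when compiling in a hard context); the children in
    // between are compiled individually in hard mode.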
    fn compile_concat(&mut self, info: &Info<'_>, hard: bool) -> Result<()> {
        let prefix_end = info
            .children
            .iter()
            .take_while(|c| c.const_size && !c.hard)
            .count();

        let suffix_len = if !hard {
            info.children[prefix_end..]
                .iter()
                .rev()
                .take_while(|c| !c.hard)
                .count()
        } else {
            info.children[prefix_end..]
                .iter()
                .rev()
                .take_while(|c| c.const_size && !c.hard)
                .count()
        };
        let suffix_begin = info.children.len() - suffix_len;

        self.compile_delegates(&info.children[..prefix_end])?;

        for child in info.children[prefix_end..suffix_begin].iter() {
            self.visit(child, true)?;
        }

        self.compile_delegates(&info.children[suffix_begin..])
    }

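    // Repeats are compiled in four shapes: `e?` as a single Split around the
    // body, unbounded repeats of a possibly-empty body as RepeatEpsilonGr/Ng
    // (which guard against looping on an empty match), `e*`/`e+` as Split/Jmp
    // loops, and bounded `e{lo,hi}` as RepeatGr/Ng with a save slot holding
    // the iteration count.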
    fn compile_repeat(
        &mut self,
        info: &Info<'_>,
        lo: usize,
        hi: usize,
        greedy: bool,
        hard: bool,
    ) -> Result<()> {
        let child = &info.children[0];
        if lo == 0 && hi == 1 {
            let pc = self.b.pc();
            self.b.add(Insn::Split(pc + 1, pc + 1));
            self.visit(child, hard)?;
            let next_pc = self.b.pc();
            self.b.set_split_target(pc, next_pc, greedy);
            return Ok(());
        }
        let hard = hard | info.hard;
        if hi == usize::MAX && child.min_size == 0 {
            let repeat = self.b.newsave();
            let check = self.b.newsave();
            self.b.add(Insn::Save0(repeat));
            let pc = self.b.pc();
            if greedy {
                self.b.add(Insn::RepeatEpsilonGr {
                    lo,
                    next: usize::MAX,
                    repeat,
                    check,
                });
            } else {
                self.b.add(Insn::RepeatEpsilonNg {
                    lo,
                    next: usize::MAX,
                    repeat,
                    check,
                });
            }
            self.visit(child, hard)?;
            self.b.add(Insn::Jmp(pc));
            let next_pc = self.b.pc();
            self.b.set_repeat_target(pc, next_pc);
        } else if lo == 0 && hi == usize::MAX {
            let pc = self.b.pc();
            self.b.add(Insn::Split(pc + 1, pc + 1));
            self.visit(child, hard)?;
            self.b.add(Insn::Jmp(pc));
            let next_pc = self.b.pc();
            self.b.set_split_target(pc, next_pc, greedy);
        } else if lo == 1 && hi == usize::MAX {
            let pc = self.b.pc();
            self.visit(child, hard)?;
            let next = self.b.pc() + 1;
            let (x, y) = if greedy { (pc, next) } else { (next, pc) };
            self.b.add(Insn::Split(x, y));
        } else {
            let repeat = self.b.newsave();
            self.b.add(Insn::Save0(repeat));
            let pc = self.b.pc();
            if greedy {
                self.b.add(Insn::RepeatGr {
                    lo,
                    hi,
                    next: usize::MAX,
                    repeat,
                });
            } else {
                self.b.add(Insn::RepeatNg {
                    lo,
                    hi,
                    next: usize::MAX,
                    repeat,
                });
            }
            self.visit(child, hard)?;
            self.b.add(Insn::Jmp(pc));
            let next_pc = self.b.pc();
            self.b.set_repeat_target(pc, next_pc);
        }
        Ok(())
    }

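    // Look-behinds must be const-size so the VM can step back by the exact
    // length before matching the inner expression; a variable-size alternation
    // is handled by compiling each (const-size) alternative as its own
    // look-behind.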
    fn compile_lookaround(&mut self, info: &Info<'_>, la: LookAround) -> Result<()> {
        let inner = &info.children[0];
        match la {
            LookBehind => {
                if let Info {
                    const_size: false,
                    expr: &Expr::Alt(_),
                    ..
                } = inner
                {
                    let alternatives = &inner.children;
                    self.compile_alt(alternatives.len(), |compiler, i| {
                        let alternative = &alternatives[i];
                        compiler.compile_positive_lookaround(alternative, la)
                    })
                } else {
                    self.compile_positive_lookaround(inner, la)
                }
            }
            LookBehindNeg => {
                if let Info {
                    const_size: false,
                    expr: &Expr::Alt(_),
                    ..
                } = inner
                {
                    let alternatives = &inner.children;
                    for alternative in alternatives {
                        self.compile_negative_lookaround(alternative, la)?;
                    }
                    Ok(())
                } else {
                    self.compile_negative_lookaround(inner, la)
                }
            }
            LookAhead => self.compile_positive_lookaround(inner, la),
            LookAheadNeg => self.compile_negative_lookaround(inner, la),
        }
    }

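    // A positive lookaround saves the current input position, matches the
    // inner expression, and restores the position afterwards.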
    fn compile_positive_lookaround(&mut self, inner: &Info<'_>, la: LookAround) -> Result<()> {
        let save = self.b.newsave();
        self.b.add(Insn::Save(save));
        self.compile_lookaround_inner(inner, la)?;
        self.b.add(Insn::Restore(save));
        Ok(())
    }

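    // A negative lookaround compiles to a Split: the first branch runs the
    // inner expression and, if it matches, hits FailNegativeLookAround; the
    // second branch is patched to point just past the lookaround.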
    fn compile_negative_lookaround(&mut self, inner: &Info<'_>, la: LookAround) -> Result<()> {
        let pc = self.b.pc();
        self.b.add(Insn::Split(pc + 1, usize::MAX));
        self.compile_lookaround_inner(inner, la)?;
        self.b.add(Insn::FailNegativeLookAround);
        let next_pc = self.b.pc();
        self.b.set_split_target(pc, next_pc, true);
        Ok(())
    }

    fn compile_lookaround_inner(&mut self, inner: &Info<'_>, la: LookAround) -> Result<()> {
        if la == LookBehind || la == LookBehindNeg {
            if !inner.const_size {
                return Err(Error::CompileError(CompileError::LookBehindNotConst));
            }
            self.b.add(Insn::GoBack(inner.min_size));
        }
        self.visit(inner, false)
    }

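    // A run of delegated children collapses into a single Lit if every child
    // is a literal, and into a single Delegate instruction otherwise.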
    fn compile_delegates(&mut self, infos: &[Info<'_>]) -> Result<()> {
        if infos.is_empty() {
            return Ok(());
        }
        if infos.iter().all(|e| e.is_literal()) {
            let mut val = String::new();
            for info in infos {
                info.push_literal(&mut val);
            }
            self.b.add(Insn::Lit(val));
            return Ok(());
        }

        let mut delegate_builder = DelegateBuilder::new();
        for info in infos {
            delegate_builder.push(info);
        }
        let delegate = delegate_builder.build(&self.options)?;

        self.b.add(delegate);
        Ok(())
    }

    fn compile_delegate(&mut self, info: &Info) -> Result<()> {
        let insn = if info.is_literal() {
            let mut val = String::new();
            info.push_literal(&mut val);
            Insn::Lit(val)
        } else {
            DelegateBuilder::new().push(info).build(&self.options)?
        };
        self.b.add(insn);
        Ok(())
    }
}

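// Test-only map from a delegated regex's Debug representation back to the
// pattern it was built from, so the tests below can assert on what was
// delegated.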
#[cfg(all(test, feature = "std"))]
static PATTERN_MAPPING: RwLock<BTreeMap<String, String>> = RwLock::new(BTreeMap::new());

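/// Compile a pattern that is delegated to `regex_automata`, applying the
/// configured size limits and syntax options.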
pub(crate) fn compile_inner(inner_re: &str, options: &RegexOptions) -> Result<RaRegex> {
    let mut config = RaConfig::new();
    if let Some(size_limit) = options.delegate_size_limit {
        config = config.nfa_size_limit(Some(size_limit));
    }
    if let Some(dfa_size_limit) = options.delegate_dfa_size_limit {
        config = config.dfa_size_limit(Some(dfa_size_limit));
    }

    let re = RaBuilder::new()
        .configure(config)
        .syntax(options.syntaxc)
        .build(inner_re)
        .map_err(CompileError::InnerError)
        .map_err(Error::CompileError)?;

    #[cfg(all(test, feature = "std"))]
    PATTERN_MAPPING
        .write()
        .unwrap()
        .insert(format!("{:?}", re), inner_re.to_owned());

    Ok(re)
}

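/// Compile the analyzed expression tree into a `Prog` for the backtracking VM.
///
/// A minimal sketch of the pipeline, mirroring the `compile_prog` test helper
/// below (`Expr::parse_tree` and `analyze` are crate-internal):
///
/// ```ignore
/// let tree = Expr::parse_tree("(?=ab*)c")?;
/// let info = analyze(&tree)?;
/// let prog = compile(&info)?;
/// ```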
pub fn compile(info: &Info<'_>) -> Result<Prog> {
    let mut c = Compiler::new(info.end_group);
    c.visit(info, false)?;
    c.b.add(Insn::End);
    Ok(c.b.build())
}

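/// Accumulates one or more consecutive delegated subexpressions into a single
/// inner pattern, tracking the range of capture groups they cover.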
struct DelegateBuilder {
    re: String,
    min_size: usize,
    const_size: bool,
    start_group: Option<usize>,
    end_group: usize,
}

impl DelegateBuilder {
    fn new() -> Self {
        Self {
            re: String::new(),
            min_size: 0,
            const_size: true,
            start_group: None,
            end_group: 0,
        }
    }

    fn push(&mut self, info: &Info<'_>) -> &mut DelegateBuilder {
        self.min_size += info.min_size;
        self.const_size &= info.const_size;
        if self.start_group.is_none() {
            self.start_group = Some(info.start_group);
        }
        self.end_group = info.end_group;

        info.expr.to_str(&mut self.re, 1);
        self
    }

    fn build(&self, options: &RegexOptions) -> Result<Insn> {
        let start_group = self.start_group.expect("Expected at least one expression");
        let end_group = self.end_group;

        let compiled = compile_inner(&self.re, options)?;

        Ok(Insn::Delegate {
            inner: compiled,
            start_group,
            end_group,
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::analyze::analyze;
    use crate::parse::ExprTree;
    use crate::vm::Insn::*;
    use alloc::vec;
    use bit_set::BitSet;
    use matches::assert_matches;

    #[test]
    fn jumps_for_alternation() {
        let tree = ExprTree {
            expr: Expr::Alt(vec![
                Expr::Literal {
                    val: "a".into(),
                    casei: false,
                },
                Expr::Literal {
                    val: "b".into(),
                    casei: false,
                },
                Expr::Literal {
                    val: "c".into(),
                    casei: false,
                },
            ]),
            backrefs: BitSet::new(),
            named_groups: Default::default(),
        };
        let info = analyze(&tree).unwrap();

        let mut c = Compiler::new(0);
        c.visit(&info, true).unwrap();
        c.b.add(Insn::End);

        let prog = c.b.prog;

        assert_eq!(prog.len(), 8, "prog: {:?}", prog);
        assert_matches!(prog[0], Split(1, 3));
        assert_matches!(prog[1], Lit(ref l) if l == "a");
        assert_matches!(prog[2], Jmp(7));
        assert_matches!(prog[3], Split(4, 6));
        assert_matches!(prog[4], Lit(ref l) if l == "b");
        assert_matches!(prog[5], Jmp(7));
        assert_matches!(prog[6], Lit(ref l) if l == "c");
        assert_matches!(prog[7], End);
    }

    #[cfg_attr(not(feature = "std"), ignore = "this test needs std")]
    #[test]
    fn look_around_pattern_can_be_delegated() {
        let prog = compile_prog("(?=ab*)c");

        assert_eq!(prog.len(), 5, "prog: {:?}", prog);
        assert_matches!(prog[0], Save(0));
        assert_delegate(&prog[1], "ab*");
        assert_matches!(prog[2], Restore(0));
        assert_matches!(prog[3], Lit(ref l) if l == "c");
        assert_matches!(prog[4], End);
    }

    #[cfg_attr(not(feature = "std"), ignore = "this test needs std")]
    #[test]
    fn easy_concat_can_delegate_end() {
        let prog = compile_prog("(?!x)(?:a|ab)x*");

        assert_eq!(prog.len(), 5, "prog: {:?}", prog);
        assert_matches!(prog[0], Split(1, 3));
        assert_matches!(prog[1], Lit(ref l) if l == "x");
        assert_matches!(prog[2], FailNegativeLookAround);
        assert_delegate(&prog[3], "(?:a|ab)x*");
        assert_matches!(prog[4], End);
    }

    #[cfg_attr(not(feature = "std"), ignore = "this test needs std")]
    #[test]
    fn hard_concat_can_delegate_const_size_end() {
        let prog = compile_prog("(?:(?!x)(?:a|b)c)x*");

        assert_eq!(prog.len(), 6, "prog: {:?}", prog);
        assert_matches!(prog[0], Split(1, 3));
        assert_matches!(prog[1], Lit(ref l) if l == "x");
        assert_matches!(prog[2], FailNegativeLookAround);
        assert_delegate(&prog[3], "(?:a|b)c");
        assert_delegate(&prog[4], "x*");
        assert_matches!(prog[5], End);
    }

    #[cfg_attr(not(feature = "std"), ignore = "this test needs std")]
    #[test]
    fn hard_concat_can_not_delegate_variable_end() {
        let prog = compile_prog("(?:(?!x)(?:a|ab))x*");

        assert_eq!(prog.len(), 9, "prog: {:?}", prog);
        assert_matches!(prog[0], Split(1, 3));
        assert_matches!(prog[1], Lit(ref l) if l == "x");
        assert_matches!(prog[2], FailNegativeLookAround);
        assert_matches!(prog[3], Split(4, 6));
        assert_matches!(prog[4], Lit(ref l) if l == "a");
        assert_matches!(prog[5], Jmp(7));
        assert_matches!(prog[6], Lit(ref l) if l == "ab");
        assert_delegate(&prog[7], "x*");
        assert_matches!(prog[8], End);
    }

    #[test]
    fn conditional_expression_can_be_compiled() {
        let prog = compile_prog(r"(?(ab)c|d)");

        assert_eq!(prog.len(), 8, "prog: {:?}", prog);

        assert_matches!(prog[0], BeginAtomic);
        assert_matches!(prog[1], Split(2, 6));
        assert_matches!(prog[2], Lit(ref l) if l == "ab");
        assert_matches!(prog[3], EndAtomic);
        assert_matches!(prog[4], Lit(ref l) if l == "c");
        assert_matches!(prog[5], Jmp(7));
        assert_matches!(prog[6], Lit(ref l) if l == "d");
        assert_matches!(prog[7], End);
    }

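    // Parse, analyze, and compile a pattern, returning the raw instruction
    // list.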
    fn compile_prog(re: &str) -> Vec<Insn> {
        let tree = Expr::parse_tree(re).unwrap();
        let info = analyze(&tree).unwrap();
        let prog = compile(&info).unwrap();
        prog.body
    }

    #[cfg(feature = "std")]
    fn assert_delegate(insn: &Insn, re: &str) {
        match insn {
            Insn::Delegate { inner, .. } => {
                assert_eq!(
                    PATTERN_MAPPING
                        .read()
                        .unwrap()
                        .get(&alloc::format!("{:?}", inner))
                        .unwrap(),
                    re
                );
            }
            _ => {
                panic!("Expected Insn::Delegate but was {:#?}", insn);
            }
        }
    }

    #[cfg(not(feature = "std"))]
    fn assert_delegate(_: &Insn, _: &str) {}
733}