regex_lite/
interpolate.rs

1/*!
2Provides routines for interpolating capture group references.
3
4That is, if a replacement string contains references like `$foo` or `${foo1}`,
5then they are replaced with the corresponding capture values for the groups
6named `foo` and `foo1`, respectively. Similarly, syntax like `$1` and `${1}`
7is supported as well, with `1` corresponding to a capture group index and not
8a name.
9
This module provides the free function [`string`], which interpolates Rust
Unicode strings. A byte string counterpart, `bytes`, exists in this file only
as commented-out code and is not currently available.
12
13# Format
14
15These routines support two different kinds of capture references: unbraced and
16braced.
17
For the unbraced format, the format supported is `$ref` where `ref` is one or
more characters in the class `[0-9A-Za-z_]`. `ref` is always the longest
possible parse. So for example, `$1a` corresponds to the capture group named
`1a` and not the capture group at index `1`. If `ref` matches `^[0-9]+$`, then
it is treated as a capture group index itself and not a name.
23
24For the braced format, the format supported is `${ref}` where `ref` can be any
25sequence of bytes except for `}`. If no closing brace occurs, then it is not
26considered a capture reference. As with the unbraced format, if `ref` matches
27`^[0-9]+$`, then it is treated as a capture group index and not a name.
28
The braced format is useful for exerting precise control over the name of the
capture reference. For example, `${1}a` corresponds to the capture group
reference `1` followed by the letter `a`, whereas `$1a` (as mentioned above)
corresponds to the capture group reference `1a`. The braced format is also
useful for expressing capture group names that use characters not supported by
the unbraced format. For example, `${foo[bar].baz}` refers to the capture group
named `foo[bar].baz`.
36
37If a capture group reference is found and it does not refer to a valid capture
38group, then it will be replaced with the empty string.
39
40To write a literal `$`, use `$$`.
41
42To be clear, and as exhibited via the type signatures in the routines in this
43module, it is impossible for a replacement string to be invalid. A replacement
44string may not have the intended semantics, but the interpolation procedure
45itself can never fail.
46*/
47
48use alloc::string::String;
49
50/// Accepts a replacement string and interpolates capture references with their
51/// corresponding values.
52///
53/// `append` should be a function that appends the string value of a capture
54/// group at a particular index to the string given. If the capture group
55/// index is invalid, then nothing should be appended.
56///
57/// `name_to_index` should be a function that maps a capture group name to a
58/// capture group index. If the given name doesn't exist, then `None` should
59/// be returned.
60///
61/// Finally, `dst` is where the final interpolated contents should be written.
62/// If `replacement` contains no capture group references, then `dst` will be
63/// equivalent to `replacement`.
64///
65/// See the [module documentation](self) for details about the format
66/// supported.
67pub fn string(
68    mut replacement: &str,
69    mut append: impl FnMut(usize, &mut String),
70    mut name_to_index: impl FnMut(&str) -> Option<usize>,
71    dst: &mut String,
72) {
73    while !replacement.is_empty() {
74        match replacement.find('$') {
75            None => break,
76            Some(i) => {
77                dst.push_str(&replacement[..i]);
78                replacement = &replacement[i..];
79            }
80        }
81        // Handle escaping of '$'.
82        if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') {
83            dst.push_str("$");
84            replacement = &replacement[2..];
85            continue;
86        }
87        debug_assert!(!replacement.is_empty());
88        let cap_ref = match find_cap_ref(replacement.as_bytes()) {
89            Some(cap_ref) => cap_ref,
90            None => {
91                dst.push_str("$");
92                replacement = &replacement[1..];
93                continue;
94            }
95        };
96        replacement = &replacement[cap_ref.end..];
97        match cap_ref.cap {
98            Ref::Number(i) => append(i, dst),
99            Ref::Named(name) => {
100                if let Some(i) = name_to_index(name) {
101                    append(i, dst);
102                }
103            }
104        }
105    }
106    dst.push_str(replacement);
107}
108
109/*
110This should be uncommented and used if we ever provide public APIs for
111searching `&[u8]`.
112
113/// Accepts a replacement byte string and interpolates capture references with
114/// their corresponding values.
115///
116/// `append` should be a function that appends the byte string value of a
117/// capture group at a particular index to the byte string given. If the
118/// capture group index is invalid, then nothing should be appended.
119///
120/// `name_to_index` should be a function that maps a capture group name to a
121/// capture group index. If the given name doesn't exist, then `None` should
122/// be returned.
123///
124/// Finally, `dst` is where the final interpolated contents should be written.
125/// If `replacement` contains no capture group references, then `dst` will be
126/// equivalent to `replacement`.
127///
128/// See the [module documentation](self) for details about the format
129/// supported.
130pub fn bytes(
131    mut replacement: &[u8],
132    mut append: impl FnMut(usize, &mut Vec<u8>),
133    mut name_to_index: impl FnMut(&str) -> Option<usize>,
134    dst: &mut Vec<u8>,
135) {
136    while !replacement.is_empty() {
137        match replacement.iter().position(|&b| b == b'$') {
138            None => break,
139            Some(i) => {
140                dst.extend_from_slice(&replacement[..i]);
141                replacement = &replacement[i..];
142            }
143        }
144        // Handle escaping of '$'.
145        if replacement.get(1).map_or(false, |&b| b == b'$') {
146            dst.push(b'$');
147            replacement = &replacement[2..];
148            continue;
149        }
150        debug_assert!(!replacement.is_empty());
151        let cap_ref = match find_cap_ref(replacement) {
152            Some(cap_ref) => cap_ref,
153            None => {
154                dst.push(b'$');
155                replacement = &replacement[1..];
156                continue;
157            }
158        };
159        replacement = &replacement[cap_ref.end..];
160        match cap_ref.cap {
161            Ref::Number(i) => append(i, dst),
162            Ref::Named(name) => {
163                if let Some(i) = name_to_index(name) {
164                    append(i, dst);
165                }
166            }
167        }
168    }
169    dst.extend_from_slice(replacement);
170}
171*/
172
/// `CaptureRef` represents a reference to a capture group inside some text.
/// The reference is either a capture group name or a number.
///
/// It is also tagged with the position in the text following the
/// capture reference.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct CaptureRef<'a> {
    /// The group being referred to, either by name or by index.
    cap: Ref<'a>,
    /// The byte offset immediately after the reference, measured from the
    /// start of the slice that was parsed (i.e., from the leading `$`).
    end: usize,
}
183
/// A reference to a capture group in some text.
///
/// e.g., `$2`, `$foo`, `${foo}`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Ref<'a> {
    /// A reference by capture group name. The name is borrowed from the
    /// text that was parsed.
    Named(&'a str),
    /// A reference by capture group index.
    Number(usize),
}
192
193impl<'a> From<&'a str> for Ref<'a> {
194    fn from(x: &'a str) -> Ref<'a> {
195        Ref::Named(x)
196    }
197}
198
199impl From<usize> for Ref<'static> {
200    fn from(x: usize) -> Ref<'static> {
201        Ref::Number(x)
202    }
203}
204
205/// Parses a possible reference to a capture group name in the given text,
206/// starting at the beginning of `replacement`.
207///
208/// If no such valid reference could be found, None is returned.
209///
210/// Note that this returns a "possible" reference because this routine doesn't
211/// know whether the reference is to a valid group or not. If it winds up not
212/// being a valid reference, then it should be replaced with the empty string.
213fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
214    let mut i = 0;
215    let rep: &[u8] = replacement;
216    if rep.len() <= 1 || rep[0] != b'$' {
217        return None;
218    }
219    i += 1;
220    if rep[i] == b'{' {
221        return find_cap_ref_braced(rep, i + 1);
222    }
223    let mut cap_end = i;
224    while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) {
225        cap_end += 1;
226    }
227    if cap_end == i {
228        return None;
229    }
230    // We just verified that the range 0..cap_end is valid ASCII, so it must
231    // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
232    // check via an unchecked conversion or by parsing the number straight from
233    // &[u8].
234    let cap = core::str::from_utf8(&rep[i..cap_end])
235        .expect("valid UTF-8 capture name");
236    Some(CaptureRef {
237        cap: match cap.parse::<usize>() {
238            Ok(i) => Ref::Number(i),
239            Err(_) => Ref::Named(cap),
240        },
241        end: cap_end,
242    })
243}
244
245/// Looks for a braced reference, e.g., `${foo1}`. This assumes that an opening
246/// brace has been found at `i-1` in `rep`. This then looks for a closing
247/// brace and returns the capture reference within the brace.
248fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
249    assert_eq!(b'{', rep[i.checked_sub(1).unwrap()]);
250    let start = i;
251    while rep.get(i).map_or(false, |&b| b != b'}') {
252        i += 1;
253    }
254    if !rep.get(i).map_or(false, |&b| b == b'}') {
255        return None;
256    }
257    // When looking at braced names, we don't put any restrictions on the name,
258    // so it's possible it could be invalid UTF-8. But a capture group name
259    // can never be invalid UTF-8, so if we have invalid UTF-8, then we can
260    // safely return None.
261    let cap = match core::str::from_utf8(&rep[start..i]) {
262        Err(_) => return None,
263        Ok(cap) => cap,
264    };
265    Some(CaptureRef {
266        cap: match cap.parse::<usize>() {
267            Ok(i) => Ref::Number(i),
268            Err(_) => Ref::Named(cap),
269        },
270        end: i + 1,
271    })
272}
273
/// Reports whether `b` may appear in a capture name written without braces,
/// i.e., whether it is an ASCII letter, an ASCII digit or an underscore.
fn is_valid_cap_letter(b: u8) -> bool {
    b == b'_' || b.is_ascii_alphanumeric()
}
282
// Unit tests for parsing capture references (`find_cap_ref`) and for
// end-to-end interpolation (`super::string`).
#[cfg(test)]
mod tests {
    use alloc::{string::String, vec, vec::Vec};

    use super::{find_cap_ref, CaptureRef};

    // Generates a `#[test]` function named `$name` that runs `find_cap_ref`
    // on the bytes of `$text`.
    //
    // The two-argument form asserts that no reference is found. The
    // three-argument form asserts that the reference found equals `$capref`.
    macro_rules! find {
        ($name:ident, $text:expr) => {
            #[test]
            fn $name() {
                assert_eq!(None, find_cap_ref($text.as_bytes()));
            }
        };
        ($name:ident, $text:expr, $capref:expr) => {
            #[test]
            fn $name() {
                assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
            }
        };
    }

    // Builds the expected `CaptureRef`. A string literal becomes a named
    // reference and an integer literal becomes a numbered reference, via the
    // `From` impls on `Ref`. `$pos` is the expected `end` offset.
    macro_rules! c {
        ($name_or_number:expr, $pos:expr) => {
            CaptureRef { cap: $name_or_number.into(), end: $pos }
        };
    }

    find!(find_cap_ref1, "$foo", c!("foo", 4));
    find!(find_cap_ref2, "${foo}", c!("foo", 6));
    find!(find_cap_ref3, "$0", c!(0, 2));
    find!(find_cap_ref4, "$5", c!(5, 2));
    find!(find_cap_ref5, "$10", c!(10, 3));
    // See https://github.com/rust-lang/regex/pull/585
    // for more on characters following numbers
    find!(find_cap_ref6, "$42a", c!("42a", 4));
    find!(find_cap_ref7, "${42}a", c!(42, 5));
    // An unclosed brace is not a reference.
    find!(find_cap_ref8, "${42");
    find!(find_cap_ref9, "${42 ");
    // The reference must start at the very beginning of the text.
    find!(find_cap_ref10, " $0 ");
    find!(find_cap_ref11, "$");
    find!(find_cap_ref12, " ");
    find!(find_cap_ref13, "");
    // `-` ends an unbraced reference, but `_` is part of it.
    find!(find_cap_ref14, "$1-$2", c!(1, 2));
    find!(find_cap_ref15, "$1_$2", c!("1_", 3));
    find!(find_cap_ref16, "$x-$y", c!("x", 2));
    find!(find_cap_ref17, "$x_$y", c!("x_", 3));
    // Braced references accept characters the unbraced form does not. The
    // expected `end` is a byte offset, so multi-byte characters advance it by
    // more than one.
    find!(find_cap_ref18, "${#}", c!("#", 4));
    find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
    find!(find_cap_ref20, "${¾}", c!("¾", 5));
    find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
    find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
    find!(find_cap_ref23, "${☃}", c!("☃", 6));
    find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
    find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
    find!(find_cap_ref26, "${名字}", c!("名字", 9));

    // Runs `super::string` on `replacement` and returns the interpolated
    // result.
    //
    // `name_to_index` maps group names to group indices, and `caps` holds the
    // text of each group, indexed by group index. An index with no entry in
    // `caps` appends nothing.
    fn interpolate_string(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        // Sorted by name so the lookup below can use a binary search.
        name_to_index.sort_by_key(|x| x.0);

        let mut dst = String::new();
        super::string(
            replacement,
            |i, dst| {
                if let Some(&s) = caps.get(i) {
                    dst.push_str(s);
                }
            },
            |name| -> Option<usize> {
                name_to_index
                    .binary_search_by_key(&name, |x| x.0)
                    .ok()
                    .map(|i| name_to_index[i].1)
            },
            &mut dst,
        );
        dst
    }

    /*
    fn interpolate_bytes(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        name_to_index.sort_by_key(|x| x.0);

        let mut dst = vec![];
        super::bytes(
            replacement.as_bytes(),
            |i, dst| {
                if let Some(&s) = caps.get(i) {
                    dst.extend_from_slice(s.as_bytes());
                }
            },
            |name| -> Option<usize> {
                name_to_index
                    .binary_search_by_key(&name, |x| x.0)
                    .ok()
                    .map(|i| name_to_index[i].1)
            },
            &mut dst,
        );
        String::from_utf8(dst).unwrap()
    }
    */

    // Generates a `#[test]` function named `$name` asserting that
    // interpolating `$hay` with the name map `$map` and group values `$caps`
    // produces `$expected`.
    macro_rules! interp {
        ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => {
            #[test]
            fn $name() {
                assert_eq!(
                    $expected,
                    interpolate_string($map, $caps, $hay),
                    "interpolate::string failed",
                );
                /*
                assert_eq!(
                    $expected,
                    interpolate_bytes($map, $caps, $hay),
                    "interpolate::bytes failed",
                );
                */
            }
        };
    }

    interp!(
        interp1,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo test",
        "test xxx test",
    );

    // `$footest` is parsed as the name `footest`, which is not in the map, so
    // the reference is replaced with nothing.
    interp!(
        interp2,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$footest",
        "test",
    );

    interp!(
        interp3,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${foo}test",
        "testxxxtest",
    );

    // `$2test` is parsed as the name `2test` and not as index `2`.
    interp!(
        interp4,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$2test",
        "test",
    );

    interp!(
        interp5,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${2}test",
        "testxxxtest",
    );

    // `$$` is an escaped `$`, so `foo` is left as literal text.
    interp!(
        interp6,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $$foo test",
        "test $foo test",
    );

    interp!(
        interp7,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo",
        "test xxx",
    );

    interp!(
        interp8,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "$foo test",
        "xxx test",
    );

    interp!(
        interp9,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $bar$foo",
        "test yyyxxx",
    );

    // A `$` that does not start a reference is kept literally.
    interp!(
        interp10,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $ test",
        "test $ test",
    );

    // In the next four cases the braced name is not in the map, so the
    // reference is replaced with nothing.
    interp!(
        interp11,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${} test",
        "test  test",
    );

    interp!(
        interp12,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${ } test",
        "test  test",
    );

    interp!(
        interp13,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a b} test",
        "test  test",
    );

    interp!(
        interp14,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a} test",
        "test  test",
    );

    // This is a funny case where a braced reference is never closed, but
    // within the unclosed braced reference, there is an unbraced reference.
    // In this case, the braced reference is just treated literally and the
    // unbraced reference is found.
    interp!(
        interp15,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${wat $bar ok",
        "test ${wat yyy ok",
    );
}