unicode_normalization/
stream_safe.rs

1use core::iter::FusedIterator;
2
3use crate::lookups::{
4    canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
5    stream_safe_trailing_nonstarters,
6};
7use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
8use crate::tables::stream_safe_leading_nonstarters;
9
/// U+034F COMBINING GRAPHEME JOINER: the character `StreamSafe` emits to break
/// up a run of non-starters that would otherwise grow past the limit.
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
/// Largest number of consecutive non-starters (counted in NFKD) that
/// `StreamSafe` lets through before it inserts a joiner.
pub(crate) const MAX_NONSTARTERS: usize = 30;
12
/// An iterator adaptor applying the Stream-Safe Text Process ([UAX15-D4]).
///
/// This iterator keeps track of how many non-starters there have been since
/// the last starter in *NFKD* and will emit a Combining Grapheme Joiner
/// (U+034F) if the count exceeds 30.
///
/// [UAX15-D4]: https://www.unicode.org/reports/tr15/#UAX15-D4
#[derive(Clone, Debug)]
pub struct StreamSafe<I> {
    /// Underlying source of characters.
    iter: I,
    /// Number of consecutive non-starters (in NFKD) at the end of what has
    /// been emitted so far.
    nonstarter_count: usize,
    /// Character pulled from `iter` but held back for one step because a
    /// Combining Grapheme Joiner was emitted in its place.
    buffer: Option<char>,
}
23
24impl<I: Iterator<Item = char>> StreamSafe<I> {
25    /// Create a new stream safe iterator.
26    ///
27    /// Note that this iterator can also be obtained by directly calling [`.stream_safe()`](crate::UnicodeNormalization::stream_safe)
28    /// on the iterator.
29    #[inline]
30    pub fn new(iter: I) -> Self {
31        Self {
32            iter,
33            nonstarter_count: 0,
34            buffer: None,
35        }
36    }
37}
38
39impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
40    type Item = char;
41
42    #[inline]
43    fn next(&mut self) -> Option<char> {
44        let next_ch = self.buffer.take().or_else(|| self.iter.next())?;
45        let d = classify_nonstarters(next_ch);
46        if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
47            // Since we're emitting a CGJ, the suffix of the emitted string in NFKD has no trailing
48            // nonstarters, so we can reset the counter to zero. Put `next_ch` back into the
49            // iterator (via `self.buffer`), and we'll reclassify it next iteration.
50            self.nonstarter_count = 0;
51            self.buffer = Some(next_ch);
52            return Some(COMBINING_GRAPHEME_JOINER);
53        }
54
55        // Is the character all nonstarters in NFKD? If so, increment our counter of contiguous
56        // nonstarters in NKFD.
57        if d.leading_nonstarters == d.decomposition_len {
58            self.nonstarter_count += d.decomposition_len;
59        }
60        // Otherwise, reset the counter to the decomposition's number of trailing nonstarters.
61        else {
62            self.nonstarter_count = d.trailing_nonstarters;
63        }
64        Some(next_ch)
65    }
66}
67
68impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for StreamSafe<I> {}
69
/// Summary of how a single character contributes to runs of non-starters,
/// as produced by `classify_nonstarters`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct Decomposition {
    /// Number of non-starters at the beginning of the character's decomposition.
    pub(crate) leading_nonstarters: usize,
    /// Number of non-starters at the end of the character's decomposition.
    pub(crate) trailing_nonstarters: usize,
    /// Total number of characters in the decomposition.
    pub(crate) decomposition_len: usize,
}
76
77#[inline]
78pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
79    // As usual, fast path for ASCII (which is always a starter)
80    if c <= '\x7f' {
81        return Decomposition {
82            leading_nonstarters: 0,
83            trailing_nonstarters: 0,
84            decomposition_len: 1,
85        };
86    }
87    // Next, special case Hangul, since it's not handled by our tables.
88    if is_hangul_syllable(c) {
89        return Decomposition {
90            leading_nonstarters: 0,
91            trailing_nonstarters: 0,
92            decomposition_len: hangul_decomposition_length(c),
93        };
94    }
95    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
96    match decomp {
97        Some(decomp) => Decomposition {
98            leading_nonstarters: stream_safe_leading_nonstarters(c),
99            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
100            decomposition_len: decomp.len(),
101        },
102        None => {
103            let is_nonstarter = canonical_combining_class(c) != 0;
104            let nonstarter = if is_nonstarter { 1 } else { 0 };
105            Decomposition {
106                leading_nonstarters: nonstarter,
107                trailing_nonstarters: nonstarter,
108                decomposition_len: 1,
109            }
110        }
111    }
112}
113
114#[cfg(test)]
115mod tests {
116    use super::{classify_nonstarters, StreamSafe};
117    use crate::lookups::canonical_combining_class;
118    use crate::normalize::decompose_compatible;
119
120    #[cfg(not(feature = "std"))]
121    use alloc::{string::String, vec::Vec};
122
123    use core::char;
124
125    fn stream_safe(s: &str) -> String {
126        StreamSafe::new(s.chars()).collect()
127    }
128
129    #[test]
130    fn test_simple() {
131        let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
132        assert_eq!(stream_safe(technically_okay), technically_okay);
133
134        let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
135        let fixed_it = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}ngerzone";
136        assert_eq!(stream_safe(too_much), fixed_it);
137
138        let woah_nelly = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
139        let its_cool = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{034f}\u{031d}\u{032e}ngerzone";
140        assert_eq!(stream_safe(woah_nelly), its_cool);
141    }
142
143    #[test]
144    fn test_all_nonstarters() {
145        let s = "\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}";
146        let expected = "\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{034F}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}";
147        assert_eq!(stream_safe(s), expected);
148    }
149
150    #[test]
151    fn test_classify_nonstarters() {
152        // Highest character in the `compat_fully_decomp` table is 2FA1D
153        for ch in 0..0x2FA1E {
154            let ch = match char::from_u32(ch) {
155                Some(c) => c,
156                None => continue,
157            };
158            let c = classify_nonstarters(ch);
159            let mut s = Vec::new();
160            decompose_compatible(ch, |c| s.push(c));
161
162            assert_eq!(s.len(), c.decomposition_len);
163
164            let num_leading = s
165                .iter()
166                .take_while(|&c| canonical_combining_class(*c) != 0)
167                .count();
168            let num_trailing = s
169                .iter()
170                .rev()
171                .take_while(|&c| canonical_combining_class(*c) != 0)
172                .count();
173
174            assert_eq!(num_leading, c.leading_nonstarters);
175            assert_eq!(num_trailing, c.trailing_nonstarters);
176        }
177    }
178}