unicode_normalization/
stream_safe.rs
1use core::iter::FusedIterator;
2
3use crate::lookups::{
4 canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
5 stream_safe_trailing_nonstarters,
6};
7use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
8use crate::tables::stream_safe_leading_nonstarters;
9
/// Longest run of consecutive nonstarters that `StreamSafe` lets through
/// before it breaks the run by emitting a `COMBINING_GRAPHEME_JOINER`.
pub(crate) const MAX_NONSTARTERS: usize = 30;

/// U+034F, the character `StreamSafe` inserts in front of a character whose
/// leading nonstarters would push the current run past `MAX_NONSTARTERS`.
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
12
/// Iterator adapter that passes the characters of the wrapped iterator
/// through unchanged, inserting a Combining Grapheme Joiner (U+034F) whenever
/// the run of consecutive nonstarters would otherwise exceed 30.
///
/// Nonstarters are counted over each character's decomposition rather than
/// over the raw characters, so a single input character may contribute more
/// than one nonstarter to the run.
#[derive(Clone, Debug)]
pub struct StreamSafe<I> {
    /// The wrapped source of characters.
    iter: I,
    /// Number of nonstarters seen since the last starter (or since the last
    /// joiner that was emitted).
    nonstarter_count: usize,
    /// A character already pulled from `iter` that has to wait one step
    /// because a joiner was emitted in its place.
    buffer: Option<char>,
}

impl<I> StreamSafe<I> {
    /// Wraps `iter`, starting with an empty nonstarter run and nothing
    /// buffered.
    pub(crate) fn new(iter: I) -> Self {
        Self {
            iter,
            nonstarter_count: 0,
            buffer: None,
        }
    }
}
31
32impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
33 type Item = char;
34
35 #[inline]
36 fn next(&mut self) -> Option<char> {
37 let next_ch = match self.buffer.take().or_else(|| self.iter.next()) {
38 None => return None,
39 Some(c) => c,
40 };
41 let d = classify_nonstarters(next_ch);
42 if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
43 self.nonstarter_count = 0;
47 self.buffer = Some(next_ch);
48 return Some(COMBINING_GRAPHEME_JOINER);
49 }
50
51 if d.leading_nonstarters == d.decomposition_len {
54 self.nonstarter_count += d.decomposition_len;
55 }
56 else {
58 self.nonstarter_count = d.trailing_nonstarters;
59 }
60 Some(next_ch)
61 }
62}
63
64impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for StreamSafe<I> {}
65
/// Nonstarter summary of a single character's decomposition, as returned by
/// `classify_nonstarters`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct Decomposition {
    /// Number of nonstarters at the start of the decomposition.
    pub(crate) leading_nonstarters: usize,
    /// Number of nonstarters at the end of the decomposition.
    pub(crate) trailing_nonstarters: usize,
    /// Total number of characters in the decomposition.
    pub(crate) decomposition_len: usize,
}
72
73#[inline]
74pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
75 if c <= '\x7f' {
77 return Decomposition {
78 leading_nonstarters: 0,
79 trailing_nonstarters: 0,
80 decomposition_len: 1,
81 };
82 }
83 if is_hangul_syllable(c) {
85 return Decomposition {
86 leading_nonstarters: 0,
87 trailing_nonstarters: 0,
88 decomposition_len: hangul_decomposition_length(c),
89 };
90 }
91 let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
92 match decomp {
93 Some(decomp) => Decomposition {
94 leading_nonstarters: stream_safe_leading_nonstarters(c),
95 trailing_nonstarters: stream_safe_trailing_nonstarters(c),
96 decomposition_len: decomp.len(),
97 },
98 None => {
99 let is_nonstarter = canonical_combining_class(c) != 0;
100 let nonstarter = if is_nonstarter { 1 } else { 0 };
101 Decomposition {
102 leading_nonstarters: nonstarter,
103 trailing_nonstarters: nonstarter,
104 decomposition_len: 1,
105 }
106 }
107 }
108}
109
#[cfg(test)]
mod tests {
    use super::{classify_nonstarters, StreamSafe};
    use crate::lookups::canonical_combining_class;
    use crate::normalize::decompose_compatible;

    #[cfg(not(feature = "std"))]
    use alloc::{string::String, vec::Vec};

    use core::char;

    /// The joiner the adapter is expected to insert.
    const CGJ: &str = "\u{034F}";

    /// Runs `s` through the adapter and collects the output.
    fn stream_safe(s: &str) -> String {
        StreamSafe::new(s.chars()).collect()
    }

    /// Builds the string made of the consecutive code points `first..=last`.
    fn marks(first: u32, last: u32) -> String {
        (first..=last).filter_map(char::from_u32).collect()
    }

    /// Builds "Da" + `marks` + "ngerzone".
    fn danger(marks: &str) -> String {
        let mut s = String::from("Da");
        s.push_str(marks);
        s.push_str("ngerzone");
        s
    }

    /// True when `c` has a nonzero canonical combining class.
    fn is_nonstarter(c: &&char) -> bool {
        canonical_combining_class(**c) != 0
    }

    #[test]
    fn test_simple() {
        // U+0300..=U+031D is a run of exactly 30 combining marks.
        let thirty = marks(0x0300, 0x031D);
        let twenty_nine = marks(0x0300, 0x031C);
        assert_eq!(thirty.chars().count(), 30);
        assert_eq!(twenty_nine.chars().count(), 29);

        // Thirty nonstarters in a row are still allowed through untouched.
        let technically_okay = danger(&thirty);
        assert_eq!(stream_safe(&technically_okay), technically_okay);

        // The thirty-first nonstarter gets a joiner in front of it.
        let too_much = danger(&[thirty.as_str(), "\u{032e}"].concat());
        let fixed_it = danger(&[thirty.as_str(), CGJ, "\u{032e}"].concat());
        assert_eq!(stream_safe(&too_much), fixed_it);

        // Two runs of 31 back to back: after the first joiner, U+032E opens
        // the next run, so the second joiner lands before U+031D.
        let woah_nelly =
            danger(&[thirty.as_str(), "\u{032e}", thirty.as_str(), "\u{032e}"].concat());
        let its_cool = danger(
            &[
                thirty.as_str(),
                CGJ,
                "\u{032e}",
                twenty_nine.as_str(),
                CGJ,
                "\u{031d}\u{032e}",
            ]
            .concat(),
        );
        assert_eq!(stream_safe(&woah_nelly), its_cool);
    }

    #[test]
    fn test_all_nonstarters() {
        // Forty nonstarters with no starter at all: the joiner goes in after
        // the first thirty and the remaining ten follow it.
        let grave = "\u{0300}";
        let s = grave.repeat(40);
        let first_run = grave.repeat(30);
        let second_run = grave.repeat(10);
        let expected = [first_run.as_str(), CGJ, second_run.as_str()].concat();
        assert_eq!(stream_safe(&s), expected);
    }

    #[test]
    fn test_classify_nonstarters() {
        // NOTE(review): 0x2FA1E is presumably one past the highest code point
        // that has a decomposition entry; confirm against the tables.
        // Values that are not valid `char`s are skipped.
        for ch in (0..0x2FA1E).filter_map(char::from_u32) {
            let class = classify_nonstarters(ch);
            let mut decomposed = Vec::new();
            decompose_compatible(ch, |c| decomposed.push(c));

            assert_eq!(
                decomposed.len(),
                class.decomposition_len,
                "decomposition length of U+{:04X}",
                ch as u32
            );

            let num_leading = decomposed.iter().take_while(is_nonstarter).count();
            let num_trailing = decomposed.iter().rev().take_while(is_nonstarter).count();

            assert_eq!(
                num_leading,
                class.leading_nonstarters,
                "leading nonstarters of U+{:04X}",
                ch as u32
            );
            assert_eq!(
                num_trailing,
                class.trailing_nonstarters,
                "trailing nonstarters of U+{:04X}",
                ch as u32
            );
        }
    }
}