bstr/unicode/word.rs
1use regex_automata::{dfa::Automaton, Anchored, Input};
2
3use crate::{
4 ext_slice::ByteSlice,
5 unicode::fsm::{
6 simple_word_fwd::SIMPLE_WORD_FWD, word_break_fwd::WORD_BREAK_FWD,
7 },
8 utf8,
9};
10
11/// An iterator over words in a byte string.
12///
13/// This iterator is typically constructed by
14/// [`ByteSlice::words`](trait.ByteSlice.html#method.words).
15///
16/// This is similar to the [`WordsWithBreaks`](struct.WordsWithBreaks.html)
17/// iterator, except it only returns elements that contain a "word" character.
18/// A word character is defined by UTS #18 (Annex C) to be the combination
19/// of the `Alphabetic` and `Join_Control` properties, along with the
20/// `Decimal_Number`, `Mark` and `Connector_Punctuation` general categories.
21///
22/// Since words are made up of one or more codepoints, this iterator yields
23/// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
24/// are [substituted](index.html#handling-of-invalid-utf-8).
25///
26/// This iterator yields words in accordance with the default word boundary
27/// rules specified in
28/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
29/// In particular, this may not be suitable for Japanese and Chinese scripts
30/// that do not use spaces between words.
31#[derive(Clone, Debug)]
32pub struct Words<'a>(WordsWithBreaks<'a>);
33
34impl<'a> Words<'a> {
35 pub(crate) fn new(bs: &'a [u8]) -> Words<'a> {
36 Words(WordsWithBreaks::new(bs))
37 }
38
39 /// View the underlying data as a subslice of the original data.
40 ///
41 /// The slice returned has the same lifetime as the original slice, and so
42 /// the iterator can continue to be used while this exists.
43 ///
44 /// # Examples
45 ///
46 /// ```
47 /// use bstr::ByteSlice;
48 ///
49 /// let mut it = b"foo bar baz".words();
50 ///
51 /// assert_eq!(b"foo bar baz", it.as_bytes());
52 /// it.next();
53 /// it.next();
54 /// assert_eq!(b" baz", it.as_bytes());
55 /// it.next();
56 /// assert_eq!(b"", it.as_bytes());
57 /// ```
58 #[inline]
59 pub fn as_bytes(&self) -> &'a [u8] {
60 self.0.as_bytes()
61 }
62}
63
64impl<'a> Iterator for Words<'a> {
65 type Item = &'a str;
66
67 #[inline]
68 fn next(&mut self) -> Option<&'a str> {
69 while let Some(word) = self.0.next() {
70 let input =
71 Input::new(word).anchored(Anchored::Yes).earliest(true);
72 if SIMPLE_WORD_FWD.try_search_fwd(&input).unwrap().is_some() {
73 return Some(word);
74 }
75 }
76 None
77 }
78}
79
80/// An iterator over words in a byte string and their byte index positions.
81///
82/// This iterator is typically constructed by
83/// [`ByteSlice::word_indices`](trait.ByteSlice.html#method.word_indices).
84///
85/// This is similar to the
86/// [`WordsWithBreakIndices`](struct.WordsWithBreakIndices.html) iterator,
87/// except it only returns elements that contain a "word" character. A
88/// word character is defined by UTS #18 (Annex C) to be the combination
89/// of the `Alphabetic` and `Join_Control` properties, along with the
90/// `Decimal_Number`, `Mark` and `Connector_Punctuation` general categories.
91///
92/// Since words are made up of one or more codepoints, this iterator
93/// yields `&str` elements (along with their start and end byte offsets).
94/// When invalid UTF-8 is encountered, replacement codepoints are
95/// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
96/// indices yielded by this iterator may not correspond to the length of the
97/// word yielded with those indices. For example, when this iterator encounters
98/// `\xFF` in the byte string, then it will yield a pair of indices ranging
99/// over a single byte, but will provide an `&str` equivalent to `"\u{FFFD}"`,
100/// which is three bytes in length. However, when given only valid UTF-8, then
101/// all indices are in exact correspondence with their paired word.
102///
103/// This iterator yields words in accordance with the default word boundary
104/// rules specified in
105/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
106/// In particular, this may not be suitable for Japanese and Chinese scripts
107/// that do not use spaces between words.
108#[derive(Clone, Debug)]
109pub struct WordIndices<'a>(WordsWithBreakIndices<'a>);
110
111impl<'a> WordIndices<'a> {
112 pub(crate) fn new(bs: &'a [u8]) -> WordIndices<'a> {
113 WordIndices(WordsWithBreakIndices::new(bs))
114 }
115
116 /// View the underlying data as a subslice of the original data.
117 ///
118 /// The slice returned has the same lifetime as the original slice, and so
119 /// the iterator can continue to be used while this exists.
120 ///
121 /// # Examples
122 ///
123 /// ```
124 /// use bstr::ByteSlice;
125 ///
126 /// let mut it = b"foo bar baz".word_indices();
127 ///
128 /// assert_eq!(b"foo bar baz", it.as_bytes());
129 /// it.next();
130 /// it.next();
131 /// assert_eq!(b" baz", it.as_bytes());
132 /// it.next();
133 /// it.next();
134 /// assert_eq!(b"", it.as_bytes());
135 /// ```
136 #[inline]
137 pub fn as_bytes(&self) -> &'a [u8] {
138 self.0.as_bytes()
139 }
140}
141
142impl<'a> Iterator for WordIndices<'a> {
143 type Item = (usize, usize, &'a str);
144
145 #[inline]
146 fn next(&mut self) -> Option<(usize, usize, &'a str)> {
147 while let Some((start, end, word)) = self.0.next() {
148 let input =
149 Input::new(word).anchored(Anchored::Yes).earliest(true);
150 if SIMPLE_WORD_FWD.try_search_fwd(&input).unwrap().is_some() {
151 return Some((start, end, word));
152 }
153 }
154 None
155 }
156}
157
158/// An iterator over all word breaks in a byte string.
159///
160/// This iterator is typically constructed by
161/// [`ByteSlice::words_with_breaks`](trait.ByteSlice.html#method.words_with_breaks).
162///
163/// This iterator yields not only all words, but the content that comes between
164/// words. In particular, if all elements yielded by this iterator are
165/// concatenated, then the result is the original string (subject to Unicode
166/// replacement codepoint substitutions).
167///
168/// Since words are made up of one or more codepoints, this iterator yields
169/// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
170/// are [substituted](index.html#handling-of-invalid-utf-8).
171///
172/// This iterator yields words in accordance with the default word boundary
173/// rules specified in
174/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
175/// In particular, this may not be suitable for Japanese and Chinese scripts
176/// that do not use spaces between words.
177#[derive(Clone, Debug)]
178pub struct WordsWithBreaks<'a> {
179 bs: &'a [u8],
180}
181
182impl<'a> WordsWithBreaks<'a> {
183 pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreaks<'a> {
184 WordsWithBreaks { bs }
185 }
186
187 /// View the underlying data as a subslice of the original data.
188 ///
189 /// The slice returned has the same lifetime as the original slice, and so
190 /// the iterator can continue to be used while this exists.
191 ///
192 /// # Examples
193 ///
194 /// ```
195 /// use bstr::ByteSlice;
196 ///
197 /// let mut it = b"foo bar baz".words_with_breaks();
198 ///
199 /// assert_eq!(b"foo bar baz", it.as_bytes());
200 /// it.next();
201 /// assert_eq!(b" bar baz", it.as_bytes());
202 /// it.next();
203 /// it.next();
204 /// assert_eq!(b" baz", it.as_bytes());
205 /// it.next();
206 /// it.next();
207 /// assert_eq!(b"", it.as_bytes());
208 /// ```
209 #[inline]
210 pub fn as_bytes(&self) -> &'a [u8] {
211 self.bs
212 }
213}
214
215impl<'a> Iterator for WordsWithBreaks<'a> {
216 type Item = &'a str;
217
218 #[inline]
219 fn next(&mut self) -> Option<&'a str> {
220 let (word, size) = decode_word(self.bs);
221 if size == 0 {
222 return None;
223 }
224 self.bs = &self.bs[size..];
225 Some(word)
226 }
227}
228
229/// An iterator over all word breaks in a byte string, along with their byte
230/// index positions.
231///
232/// This iterator is typically constructed by
233/// [`ByteSlice::words_with_break_indices`](trait.ByteSlice.html#method.words_with_break_indices).
234///
235/// This iterator yields not only all words, but the content that comes between
236/// words. In particular, if all elements yielded by this iterator are
237/// concatenated, then the result is the original string (subject to Unicode
238/// replacement codepoint substitutions).
239///
240/// Since words are made up of one or more codepoints, this iterator
241/// yields `&str` elements (along with their start and end byte offsets).
242/// When invalid UTF-8 is encountered, replacement codepoints are
243/// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
244/// indices yielded by this iterator may not correspond to the length of the
245/// word yielded with those indices. For example, when this iterator encounters
246/// `\xFF` in the byte string, then it will yield a pair of indices ranging
247/// over a single byte, but will provide an `&str` equivalent to `"\u{FFFD}"`,
248/// which is three bytes in length. However, when given only valid UTF-8, then
249/// all indices are in exact correspondence with their paired word.
250///
251/// This iterator yields words in accordance with the default word boundary
252/// rules specified in
253/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
254/// In particular, this may not be suitable for Japanese and Chinese scripts
255/// that do not use spaces between words.
256#[derive(Clone, Debug)]
257pub struct WordsWithBreakIndices<'a> {
258 bs: &'a [u8],
259 forward_index: usize,
260}
261
262impl<'a> WordsWithBreakIndices<'a> {
263 pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> {
264 WordsWithBreakIndices { bs, forward_index: 0 }
265 }
266
267 /// View the underlying data as a subslice of the original data.
268 ///
269 /// The slice returned has the same lifetime as the original slice, and so
270 /// the iterator can continue to be used while this exists.
271 ///
272 /// # Examples
273 ///
274 /// ```
275 /// use bstr::ByteSlice;
276 ///
277 /// let mut it = b"foo bar baz".words_with_break_indices();
278 ///
279 /// assert_eq!(b"foo bar baz", it.as_bytes());
280 /// it.next();
281 /// assert_eq!(b" bar baz", it.as_bytes());
282 /// it.next();
283 /// it.next();
284 /// assert_eq!(b" baz", it.as_bytes());
285 /// it.next();
286 /// it.next();
287 /// assert_eq!(b"", it.as_bytes());
288 /// ```
289 #[inline]
290 pub fn as_bytes(&self) -> &'a [u8] {
291 self.bs
292 }
293}
294
295impl<'a> Iterator for WordsWithBreakIndices<'a> {
296 type Item = (usize, usize, &'a str);
297
298 #[inline]
299 fn next(&mut self) -> Option<(usize, usize, &'a str)> {
300 let index = self.forward_index;
301 let (word, size) = decode_word(self.bs);
302 if size == 0 {
303 return None;
304 }
305 self.bs = &self.bs[size..];
306 self.forward_index += size;
307 Some((index, index + size, word))
308 }
309}
310
311fn decode_word(bs: &[u8]) -> (&str, usize) {
312 if bs.is_empty() {
313 ("", 0)
314 } else if let Some(hm) = {
315 let input = Input::new(bs).anchored(Anchored::Yes);
316 WORD_BREAK_FWD.try_search_fwd(&input).unwrap()
317 } {
318 // Safe because a match can only occur for valid UTF-8.
319 let word = unsafe { bs[..hm.offset()].to_str_unchecked() };
320 (word, word.len())
321 } else {
322 const INVALID: &'static str = "\u{FFFD}";
323 // No match on non-empty bytes implies we found invalid UTF-8.
324 let (_, size) = utf8::decode_lossy(bs);
325 (INVALID, size)
326 }
327}
328
329#[cfg(all(test, feature = "std"))]
330mod tests {
331 use alloc::{vec, vec::Vec};
332
333 #[cfg(not(miri))]
334 use ucd_parse::WordBreakTest;
335
336 use crate::ext_slice::ByteSlice;
337
338 #[test]
339 #[cfg(not(miri))]
340 fn forward_ucd() {
341 for (i, test) in ucdtests().into_iter().enumerate() {
342 let given = test.words.concat();
343 let got = words(given.as_bytes());
344 assert_eq!(
345 test.words,
346 got,
347 "\n\nword forward break test {} failed:\n\
348 given: {:?}\n\
349 expected: {:?}\n\
350 got: {:?}\n",
351 i,
352 given,
353 strs_to_bstrs(&test.words),
354 strs_to_bstrs(&got),
355 );
356 }
357 }
358
359 // Some additional tests that don't seem to be covered by the UCD tests.
360 //
361 // It's pretty amazing that the UCD tests miss these cases. I only found
362 // them by running this crate's segmenter and ICU's segmenter on the same
363 // text and comparing the output.
364 #[test]
365 fn forward_additional() {
366 assert_eq!(vec!["a", ".", " ", "Y"], words(b"a. Y"));
367 assert_eq!(vec!["r", ".", " ", "Yo"], words(b"r. Yo"));
368 assert_eq!(
369 vec!["whatsoever", ".", " ", "You", " ", "may"],
370 words(b"whatsoever. You may")
371 );
372 assert_eq!(
373 vec!["21stcentury'syesterday"],
374 words(b"21stcentury'syesterday")
375 );
376
377 assert_eq!(vec!["Bonta_", "'", "s"], words(b"Bonta_'s"));
378 assert_eq!(vec!["_vhat's"], words(b"_vhat's"));
379 assert_eq!(vec!["__on'anima"], words(b"__on'anima"));
380 assert_eq!(vec!["123_", "'", "4"], words(b"123_'4"));
381 assert_eq!(vec!["_123'4"], words(b"_123'4"));
382 assert_eq!(vec!["__12'345"], words(b"__12'345"));
383
384 assert_eq!(
385 vec!["tomorrowat4", ":", "00", ","],
386 words(b"tomorrowat4:00,")
387 );
388 assert_eq!(vec!["RS1", "'", "s"], words(b"RS1's"));
389 assert_eq!(vec!["X38"], words(b"X38"));
390
391 assert_eq!(vec!["4abc", ":", "00", ","], words(b"4abc:00,"));
392 assert_eq!(vec!["12S", "'", "1"], words(b"12S'1"));
393 assert_eq!(vec!["1XY"], words(b"1XY"));
394
395 assert_eq!(vec!["\u{FEFF}", "Ты"], words("\u{FEFF}Ты".as_bytes()));
396
397 // Tests that Vithkuqi works, which was introduced in Unicode 14.
398 // This test fails prior to Unicode 14.
399 assert_eq!(
400 vec!["\u{10570}\u{10597}"],
401 words("\u{10570}\u{10597}".as_bytes())
402 );
403 }
404
405 fn words(bytes: &[u8]) -> Vec<&str> {
406 bytes.words_with_breaks().collect()
407 }
408
409 #[cfg(not(miri))]
410 fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
411 strs.iter().map(|s| s.as_ref().as_bytes()).collect()
412 }
413
414 /// Return all of the UCD for word breaks.
415 #[cfg(not(miri))]
416 fn ucdtests() -> Vec<WordBreakTest> {
417 const TESTDATA: &'static str = include_str!("data/WordBreakTest.txt");
418
419 let mut tests = vec![];
420 for mut line in TESTDATA.lines() {
421 line = line.trim();
422 if line.starts_with("#") || line.contains("surrogate") {
423 continue;
424 }
425 tests.push(line.parse().unwrap());
426 }
427 tests
428 }
429}