// bstr/unicode/sentence.rs

1use regex_automata::{dfa::Automaton, Anchored, Input};
2
3use crate::{
4    ext_slice::ByteSlice,
5    unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD, utf8,
6};
7
/// An iterator over sentences in a byte string.
///
/// This iterator is typically constructed by
/// [`ByteSlice::sentences`](trait.ByteSlice.html#method.sentences).
///
/// Sentences typically include their trailing punctuation and whitespace.
///
/// Since sentences are made up of one or more codepoints, this iterator yields
/// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
/// are [substituted](index.html#handling-of-invalid-utf-8).
///
/// This iterator yields sentences in accordance with the default sentence
/// boundary rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
#[derive(Clone, Debug)]
pub struct Sentences<'a> {
    /// The bytes that have not yet been yielded as sentences.
    bs: &'a [u8],
}

27impl<'a> Sentences<'a> {
28    pub(crate) fn new(bs: &'a [u8]) -> Sentences<'a> {
29        Sentences { bs }
30    }
31
32    /// View the underlying data as a subslice of the original data.
33    ///
34    /// The slice returned has the same lifetime as the original slice, and so
35    /// the iterator can continue to be used while this exists.
36    ///
37    /// # Examples
38    ///
39    /// ```
40    /// use bstr::ByteSlice;
41    ///
42    /// let mut it = b"I want this. Not that. Right now.".sentences();
43    ///
44    /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes());
45    /// it.next();
46    /// assert_eq!(b"Not that. Right now.", it.as_bytes());
47    /// it.next();
48    /// it.next();
49    /// assert_eq!(b"", it.as_bytes());
50    /// ```
51    #[inline]
52    pub fn as_bytes(&self) -> &'a [u8] {
53        self.bs
54    }
55}
56
57impl<'a> Iterator for Sentences<'a> {
58    type Item = &'a str;
59
60    #[inline]
61    fn next(&mut self) -> Option<&'a str> {
62        let (sentence, size) = decode_sentence(self.bs);
63        if size == 0 {
64            return None;
65        }
66        self.bs = &self.bs[size..];
67        Some(sentence)
68    }
69}
70
/// An iterator over sentences in a byte string, along with their byte offsets.
///
/// This iterator is typically constructed by
/// [`ByteSlice::sentence_indices`](trait.ByteSlice.html#method.sentence_indices).
///
/// Sentences typically include their trailing punctuation and whitespace.
///
/// Since sentences are made up of one or more codepoints, this iterator
/// yields `&str` elements (along with their start and end byte offsets).
/// When invalid UTF-8 is encountered, replacement codepoints are
/// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
/// indices yielded by this iterator may not correspond to the length of the
/// sentence yielded with those indices. For example, when this iterator
/// encounters `\xFF` in the byte string, then it will yield a pair of indices
/// ranging over a single byte, but will provide an `&str` equivalent to
/// `"\u{FFFD}"`, which is three bytes in length. However, when given only
/// valid UTF-8, then all indices are in exact correspondence with their paired
/// sentence.
///
/// This iterator yields sentences in accordance with the default sentence
/// boundary rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
#[derive(Clone, Debug)]
pub struct SentenceIndices<'a> {
    /// The bytes that have not yet been yielded as sentences.
    bs: &'a [u8],
    /// Number of bytes consumed so far, i.e. the start offset of the next
    /// sentence.
    forward_index: usize,
}

99impl<'a> SentenceIndices<'a> {
100    pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> {
101        SentenceIndices { bs, forward_index: 0 }
102    }
103
104    /// View the underlying data as a subslice of the original data.
105    ///
106    /// The slice returned has the same lifetime as the original slice, and so
107    /// the iterator can continue to be used while this exists.
108    ///
109    /// # Examples
110    ///
111    /// ```
112    /// use bstr::ByteSlice;
113    ///
114    /// let mut it = b"I want this. Not that. Right now.".sentence_indices();
115    ///
116    /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes());
117    /// it.next();
118    /// assert_eq!(b"Not that. Right now.", it.as_bytes());
119    /// it.next();
120    /// it.next();
121    /// assert_eq!(b"", it.as_bytes());
122    /// ```
123    #[inline]
124    pub fn as_bytes(&self) -> &'a [u8] {
125        self.bs
126    }
127}
128
129impl<'a> Iterator for SentenceIndices<'a> {
130    type Item = (usize, usize, &'a str);
131
132    #[inline]
133    fn next(&mut self) -> Option<(usize, usize, &'a str)> {
134        let index = self.forward_index;
135        let (word, size) = decode_sentence(self.bs);
136        if size == 0 {
137            return None;
138        }
139        self.bs = &self.bs[size..];
140        self.forward_index += size;
141        Some((index, index + size, word))
142    }
143}
144
145fn decode_sentence(bs: &[u8]) -> (&str, usize) {
146    if bs.is_empty() {
147        ("", 0)
148    } else if let Some(hm) = {
149        let input = Input::new(bs).anchored(Anchored::Yes);
150        SENTENCE_BREAK_FWD.try_search_fwd(&input).unwrap()
151    } {
152        // Safe because a match can only occur for valid UTF-8.
153        let sentence = unsafe { bs[..hm.offset()].to_str_unchecked() };
154        (sentence, sentence.len())
155    } else {
156        const INVALID: &'static str = "\u{FFFD}";
157        // No match on non-empty bytes implies we found invalid UTF-8.
158        let (_, size) = utf8::decode_lossy(bs);
159        (INVALID, size)
160    }
161}
162
#[cfg(all(test, feature = "std"))]
mod tests {
    use alloc::{vec, vec::Vec};

    #[cfg(not(miri))]
    use ucd_parse::SentenceBreakTest;

    use crate::ext_slice::ByteSlice;

    /// Check forward sentence iteration against every case in the UCD
    /// sentence break test data.
    #[test]
    #[cfg(not(miri))]
    fn forward_ucd() {
        for (i, test) in ucdtests().into_iter().enumerate() {
            let given = test.sentences.concat();
            let got = sentences(given.as_bytes());
            assert_eq!(
                test.sentences,
                got,
                "\n\nsentence forward break test {} failed:\n\
                 given:    {:?}\n\
                 expected: {:?}\n\
                 got:      {:?}\n",
                i,
                given,
                strs_to_bstrs(&test.sentences),
                strs_to_bstrs(&got),
            );
        }
    }

    // Some additional tests that don't seem to be covered by the UCD tests.
    #[test]
    fn forward_additional() {
        assert_eq!(vec!["a.. ", "A"], sentences(b"a.. A"));
        assert_eq!(vec!["a.. a"], sentences(b"a.. a"));

        assert_eq!(vec!["a... ", "A"], sentences(b"a... A"));
        assert_eq!(vec!["a... a"], sentences(b"a... a"));

        assert_eq!(vec!["a...,..., a"], sentences(b"a...,..., a"));
    }

    /// Collect every sentence yielded for `bytes`.
    fn sentences(bytes: &[u8]) -> Vec<&str> {
        bytes.sentences().collect()
    }

    /// View each string as raw bytes, for use in failure messages.
    #[cfg(not(miri))]
    fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
        strs.iter().map(|s| s.as_ref().as_bytes()).collect()
    }

    /// Return all of the UCD for sentence breaks.
    ///
    /// Blank lines, comment lines, and lines mentioning surrogates are
    /// skipped; every remaining line must parse as a `SentenceBreakTest`.
    #[cfg(not(miri))]
    fn ucdtests() -> Vec<SentenceBreakTest> {
        const TESTDATA: &str = include_str!("data/SentenceBreakTest.txt");

        let mut tests = vec![];
        for line in TESTDATA.lines().map(str::trim) {
            if line.is_empty()
                || line.starts_with('#')
                || line.contains("surrogate")
            {
                continue;
            }
            tests.push(line.parse().unwrap());
        }
        tests
    }
}