quick_xml/reader/
slice_reader.rs

//! This is an implementation of [`Reader`] for reading from a `&[u8]` as the
//! underlying byte stream. This implementation does not need an intermediate
//! buffer, because events can borrow directly from the byte slice itself.
4
5use std::borrow::Cow;
6
7#[cfg(feature = "encoding")]
8use crate::reader::EncodingRef;
9#[cfg(feature = "encoding")]
10use encoding_rs::{Encoding, UTF_8};
11
12use crate::errors::{Error, Result};
13use crate::events::Event;
14use crate::name::QName;
15use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};
16
17use memchr;
18
19/// This is an implementation for reading from a `&[u8]` as underlying byte stream.
20/// This implementation supports not using an intermediate buffer as the byte slice
21/// itself can be used to borrow from.
22impl<'a> Reader<&'a [u8]> {
23    /// Creates an XML reader from a string slice.
24    #[allow(clippy::should_implement_trait)]
25    pub fn from_str(s: &'a str) -> Self {
26        // Rust strings are guaranteed to be UTF-8, so lock the encoding
27        #[cfg(feature = "encoding")]
28        {
29            let mut reader = Self::from_reader(s.as_bytes());
30            reader.state.encoding = EncodingRef::Explicit(UTF_8);
31            reader
32        }
33
34        #[cfg(not(feature = "encoding"))]
35        Self::from_reader(s.as_bytes())
36    }
37
38    /// Read an event that borrows from the input rather than a buffer.
39    ///
40    /// There is no asynchronous `read_event_async()` version of this function,
41    /// because it is not necessary -- the contents are already in memory and no IO
42    /// is needed, therefore there is no potential for blocking.
43    ///
44    /// # Examples
45    ///
46    /// ```
47    /// # use pretty_assertions::assert_eq;
48    /// use quick_xml::events::Event;
49    /// use quick_xml::reader::Reader;
50    ///
51    /// let mut reader = Reader::from_str(r#"
52    ///     <tag1 att1 = "test">
53    ///        <tag2><!--Test comment-->Test</tag2>
54    ///        <tag2>Test 2</tag2>
55    ///     </tag1>
56    /// "#);
57    /// reader.trim_text(true);
58    ///
59    /// let mut count = 0;
60    /// let mut txt = Vec::new();
61    /// loop {
62    ///     match reader.read_event().unwrap() {
63    ///         Event::Start(e) => count += 1,
64    ///         Event::Text(e) => txt.push(e.unescape().unwrap().into_owned()),
65    ///         Event::Eof => break,
66    ///         _ => (),
67    ///     }
68    /// }
69    /// assert_eq!(count, 3);
70    /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
71    /// ```
72    #[inline]
73    pub fn read_event(&mut self) -> Result<Event<'a>> {
74        self.read_event_impl(())
75    }
76
77    /// Reads until end element is found. This function is supposed to be called
78    /// after you already read a [`Start`] event.
79    ///
80    /// Returns a span that cover content between `>` of an opening tag and `<` of
81    /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
82    /// this method was called after reading expanded [`Start`] event.
83    ///
84    /// Manages nested cases where parent and child elements have the _literally_
85    /// same name.
86    ///
87    /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`]
88    /// will be returned. In particularly, that error will be returned if you call
89    /// this method without consuming the corresponding [`Start`] event first.
90    ///
91    /// The `end` parameter should contain name of the end element _in the reader
92    /// encoding_. It is good practice to always get that parameter using
93    /// [`BytesStart::to_end()`] method.
94    ///
95    /// The correctness of the skipped events does not checked, if you disabled
96    /// the [`check_end_names`] option.
97    ///
98    /// There is no asynchronous `read_to_end_async()` version of this function,
99    /// because it is not necessary -- the contents are already in memory and no IO
100    /// is needed, therefore there is no potential for blocking.
101    ///
102    /// # Namespaces
103    ///
104    /// While the `Reader` does not support namespace resolution, namespaces
105    /// does not change the algorithm for comparing names. Although the names
106    /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
107    /// same namespace, are semantically equivalent, `</b:name>` cannot close
108    /// `<a:name>`, because according to [the specification]
109    ///
110    /// > The end of every element that begins with a **start-tag** MUST be marked
111    /// > by an **end-tag** containing a name that echoes the element's type as
112    /// > given in the **start-tag**
113    ///
114    /// # Examples
115    ///
116    /// This example shows, how you can skip XML content after you read the
117    /// start event.
118    ///
119    /// ```
120    /// # use pretty_assertions::assert_eq;
121    /// use quick_xml::events::{BytesStart, Event};
122    /// use quick_xml::reader::Reader;
123    ///
124    /// let mut reader = Reader::from_str(r#"
125    ///     <outer>
126    ///         <inner>
127    ///             <inner></inner>
128    ///             <inner/>
129    ///             <outer></outer>
130    ///             <outer/>
131    ///         </inner>
132    ///     </outer>
133    /// "#);
134    /// reader.trim_text(true);
135    ///
136    /// let start = BytesStart::new("outer");
137    /// let end   = start.to_end().into_owned();
138    ///
139    /// // First, we read a start event...
140    /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
141    ///
142    /// // ...then, we could skip all events to the corresponding end event.
143    /// // This call will correctly handle nested <outer> elements.
144    /// // Note, however, that this method does not handle namespaces.
145    /// reader.read_to_end(end.name()).unwrap();
146    ///
147    /// // At the end we should get an Eof event, because we ate the whole XML
148    /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
149    /// ```
150    ///
151    /// [`Start`]: Event::Start
152    /// [`End`]: Event::End
153    /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
154    /// [`expand_empty_elements`]: Self::expand_empty_elements
155    /// [`check_end_names`]: Self::check_end_names
156    /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
157    pub fn read_to_end(&mut self, end: QName) -> Result<Span> {
158        Ok(read_to_end!(self, end, (), read_event_impl, {}))
159    }
160
161    /// Reads content between start and end tags, including any markup. This
162    /// function is supposed to be called after you already read a [`Start`] event.
163    ///
164    /// Manages nested cases where parent and child elements have the _literally_
165    /// same name.
166    ///
167    /// This method does not unescape read data, instead it returns content
168    /// "as is" of the XML document. This is because it has no idea what text
169    /// it reads, and if, for example, it contains CDATA section, attempt to
170    /// unescape it content will spoil data.
171    ///
172    /// Any text will be decoded using the XML current [`decoder()`].
173    ///
174    /// Actually, this method perform the following code:
175    ///
176    /// ```ignore
177    /// let span = reader.read_to_end(end)?;
178    /// let text = reader.decoder().decode(&reader.inner_slice[span]);
179    /// ```
180    ///
181    /// # Examples
182    ///
183    /// This example shows, how you can read a HTML content from your XML document.
184    ///
185    /// ```
186    /// # use pretty_assertions::assert_eq;
187    /// # use std::borrow::Cow;
188    /// use quick_xml::events::{BytesStart, Event};
189    /// use quick_xml::reader::Reader;
190    ///
191    /// let mut reader = Reader::from_str("
192    ///     <html>
193    ///         <title>This is a HTML text</title>
194    ///         <p>Usual XML rules does not apply inside it
195    ///         <p>For example, elements not needed to be &quot;closed&quot;
196    ///     </html>
197    /// ");
198    /// reader.trim_text(true);
199    ///
200    /// let start = BytesStart::new("html");
201    /// let end   = start.to_end().into_owned();
202    ///
203    /// // First, we read a start event...
204    /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
205    /// // ...and disable checking of end names because we expect HTML further...
206    /// reader.check_end_names(false);
207    ///
208    /// // ...then, we could read text content until close tag.
209    /// // This call will correctly handle nested <html> elements.
210    /// let text = reader.read_text(end.name()).unwrap();
211    /// assert_eq!(text, Cow::Borrowed(r#"
212    ///         <title>This is a HTML text</title>
213    ///         <p>Usual XML rules does not apply inside it
214    ///         <p>For example, elements not needed to be &quot;closed&quot;
215    ///     "#));
216    /// assert!(matches!(text, Cow::Borrowed(_)));
217    ///
218    /// // Now we can enable checks again
219    /// reader.check_end_names(true);
220    ///
221    /// // At the end we should get an Eof event, because we ate the whole XML
222    /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
223    /// ```
224    ///
225    /// [`Start`]: Event::Start
226    /// [`decoder()`]: Self::decoder()
227    pub fn read_text(&mut self, end: QName) -> Result<Cow<'a, str>> {
228        // self.reader will be changed, so store original reference
229        let buffer = self.reader;
230        let span = self.read_to_end(end)?;
231
232        self.decoder().decode(&buffer[0..span.len()])
233    }
234}
235
236////////////////////////////////////////////////////////////////////////////////////////////////////
237
238/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer
239/// that will be borrowed by events. This implementation provides a zero-copy deserialization
240impl<'a> XmlSource<'a, ()> for &'a [u8] {
241    #[cfg(not(feature = "encoding"))]
242    fn remove_utf8_bom(&mut self) -> Result<()> {
243        if self.starts_with(crate::encoding::UTF8_BOM) {
244            *self = &self[crate::encoding::UTF8_BOM.len()..];
245        }
246        Ok(())
247    }
248
249    #[cfg(feature = "encoding")]
250    fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>> {
251        if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) {
252            *self = &self[bom_len..];
253            return Ok(Some(enc));
254        }
255        Ok(None)
256    }
257
258    fn read_bytes_until(
259        &mut self,
260        byte: u8,
261        _buf: (),
262        position: &mut usize,
263    ) -> Result<Option<&'a [u8]>> {
264        // search byte must be within the ascii range
265        debug_assert!(byte.is_ascii());
266        if self.is_empty() {
267            return Ok(None);
268        }
269
270        Ok(Some(if let Some(i) = memchr::memchr(byte, self) {
271            *position += i + 1;
272            let bytes = &self[..i];
273            *self = &self[i + 1..];
274            bytes
275        } else {
276            *position += self.len();
277            let bytes = &self[..];
278            *self = &[];
279            bytes
280        }))
281    }
282
283    fn read_bang_element(
284        &mut self,
285        _buf: (),
286        position: &mut usize,
287    ) -> Result<Option<(BangType, &'a [u8])>> {
288        // Peeked one bang ('!') before being called, so it's guaranteed to
289        // start with it.
290        debug_assert_eq!(self[0], b'!');
291
292        let bang_type = BangType::new(self[1..].first().copied())?;
293
294        if let Some((bytes, i)) = bang_type.parse(&[], self) {
295            *position += i;
296            *self = &self[i..];
297            return Ok(Some((bang_type, bytes)));
298        }
299
300        // Note: Do not update position, so the error points to
301        // somewhere sane rather than at the EOF
302        Err(bang_type.to_err())
303    }
304
305    fn read_element(&mut self, _buf: (), position: &mut usize) -> Result<Option<&'a [u8]>> {
306        if self.is_empty() {
307            return Ok(None);
308        }
309
310        let mut state = ReadElementState::Elem;
311
312        if let Some((bytes, i)) = state.change(self) {
313            // Position now just after the `>` symbol
314            *position += i;
315            *self = &self[i..];
316            return Ok(Some(bytes));
317        }
318
319        // Note: Do not update position, so the error points to a sane place
320        // rather than at the EOF.
321        Err(Error::UnexpectedEof("Element".to_string()))
322
323        // FIXME: Figure out why the other one works without UnexpectedEof
324    }
325
326    fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
327        let whitespaces = self
328            .iter()
329            .position(|b| !is_whitespace(*b))
330            .unwrap_or(self.len());
331        *position += whitespaces;
332        *self = &self[whitespaces..];
333        Ok(())
334    }
335
336    fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool> {
337        // search byte must be within the ascii range
338        debug_assert!(byte.is_ascii());
339        if self.first() == Some(&byte) {
340            *self = &self[1..];
341            *position += 1;
342            Ok(true)
343        } else {
344            Ok(false)
345        }
346    }
347
348    fn peek_one(&mut self) -> Result<Option<u8>> {
349        Ok(self.first().copied())
350    }
351}
352
#[cfg(test)]
mod test {
    use crate::reader::test::check;
    use crate::reader::XmlSource;

    /// Buffer constructor for the shared checks: hands the value received
    /// from a test back unchanged.
    fn identity<B>(buffer: B) -> B {
        buffer
    }

    // Instantiates the shared reader checks for the slice source, which reads
    // without an intermediate buffer (`()`)
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        ()
    );

    #[cfg(feature = "encoding")]
    mod encoding {
        use crate::events::Event;
        use crate::reader::Reader;
        use encoding_rs::UTF_8;
        use pretty_assertions::assert_eq;

        /// A `Reader` created with `from_str` must keep the UTF-8 encoding
        /// even if the XML declaration names another one.
        #[test]
        fn str_always_has_utf8() {
            let mut xml = Reader::from_str("<?xml encoding='UTF-16'?>");

            // UTF-8 before anything was read...
            assert_eq!(xml.decoder().encoding(), UTF_8);
            // ...and still UTF-8 once the declaration has been consumed
            xml.read_event().unwrap();
            assert_eq!(xml.decoder().encoding(), UTF_8);

            assert_eq!(xml.read_event().unwrap(), Event::Eof);
        }
    }
}