quick_xml/reader/
buffered_reader.rs

1//! This is an implementation of [`Reader`] for reading from a [`BufRead`] as
2//! underlying byte stream.
3
4use std::fs::File;
5use std::io::{self, BufRead, BufReader};
6use std::path::Path;
7
8use memchr;
9
10use crate::errors::{Error, Result};
11use crate::events::Event;
12use crate::name::QName;
13use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};
14
/// Generates the method bodies of an `XmlSource` implementation for a
/// buffered byte source.
///
/// Invoked without arguments, the macro emits synchronous methods that call
/// `fill_buf()` and `consume()` directly on `self`.
///
/// Invoked as `impl_buffered_source!($lf, $reader, $async, $await)`:
///
/// - every method is prefixed with `$async`;
/// - the methods that return a slice of `buf` get a `<$lf>` lifetime parameter;
/// - the buffered reader is accessed through the field `self.$reader`;
/// - each `fill_buf()` / `peek_one()` call is followed by `.$await`.
///
/// The expansion names the lifetime `'b` in its signatures, so that lifetime
/// has to be declared at the invocation site (the synchronous implementation
/// in this file declares it on the `impl<'b, ...>` header).
///
/// All methods retry `fill_buf()` when it fails with
/// [`io::ErrorKind::Interrupted`]; any other I/O error is returned wrapped in
/// [`Error::Io`].
macro_rules! impl_buffered_source {
    ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
        /// Consumes a UTF-8 byte order mark if the currently buffered bytes
        /// start with one. Only compiled without the `encoding` feature.
        #[cfg(not(feature = "encoding"))]
        $($async)? fn remove_utf8_bom(&mut self) -> Result<()> {
            use crate::encoding::UTF8_BOM;

            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => {
                        if n.starts_with(UTF8_BOM) {
                            self $(.$reader)? .consume(UTF8_BOM.len());
                        }
                        Ok(())
                    },
                    // An interrupted read is not a failure: try again
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(Error::Io(e.into())),
                };
            }
        }

        /// Passes the currently buffered bytes to
        /// `crate::encoding::detect_encoding`. When it reports an encoding
        /// together with a BOM length, that many bytes are consumed and the
        /// encoding is returned; otherwise nothing is consumed and `None` is
        /// returned. Only compiled with the `encoding` feature.
        #[cfg(feature = "encoding")]
        $($async)? fn detect_encoding(&mut self) -> Result<Option<&'static encoding_rs::Encoding>> {
            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) {
                        self $(.$reader)? .consume(bom_len);
                        Ok(Some(enc))
                    } else {
                        Ok(None)
                    },
                    // An interrupted read is not a failure: try again
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(Error::Io(e.into())),
                };
            }
        }

        /// Appends bytes from the source to `buf` until `byte` is found or the
        /// source is exhausted.
        ///
        /// The matched `byte` is consumed from the source but is not copied
        /// into `buf`. `position` is advanced by the number of bytes consumed,
        /// including on an I/O error.
        ///
        /// Returns `Ok(None)` if nothing was consumed, otherwise the part of
        /// `buf` that was appended by this call.
        #[inline]
        $($async)? fn read_bytes_until $(<$lf>)? (
            &mut self,
            byte: u8,
            buf: &'b mut Vec<u8>,
            position: &mut usize,
        ) -> Result<Option<&'b [u8]>> {
            // search byte must be within the ascii range
            debug_assert!(byte.is_ascii());

            let mut read = 0;
            let mut done = false;
            let start = buf.len();
            while !done {
                let used = {
                    let available = match self $(.$reader)? .fill_buf() $(.$await)? {
                        // Empty buffer means end of input
                        Ok(n) if n.is_empty() => break,
                        Ok(n) => n,
                        Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                        Err(e) => {
                            *position += read;
                            return Err(Error::Io(e.into()));
                        }
                    };

                    match memchr::memchr(byte, available) {
                        Some(i) => {
                            buf.extend_from_slice(&available[..i]);
                            done = true;
                            // `+ 1` also consumes the delimiter itself
                            i + 1
                        }
                        None => {
                            buf.extend_from_slice(available);
                            available.len()
                        }
                    }
                };
                self $(.$reader)? .consume(used);
                read += used;
            }
            *position += read;

            if read == 0 {
                Ok(None)
            } else {
                Ok(Some(&buf[start..]))
            }
        }

        /// Reads a construct that starts with `!` into `buf`.
        ///
        /// The caller must already have peeked the `!`; it is pushed to `buf`
        /// and consumed unconditionally. The byte that follows selects the
        /// [`BangType`], whose `parse` method decides where the construct ends.
        ///
        /// On success `position` is advanced by the number of consumed bytes
        /// and the kind together with the appended part of `buf` is returned.
        /// If the source ends first, the error produced by
        /// `BangType::to_err` is returned and `position` is left untouched.
        /// On an I/O error `position` is advanced by the bytes consumed so far.
        $($async)? fn read_bang_element $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut usize,
        ) -> Result<Option<(BangType, &'b [u8])>> {
            // Peeked one bang ('!') before being called, so it's guaranteed to
            // start with it.
            let start = buf.len();
            let mut read = 1;
            buf.push(b'!');
            self $(.$reader)? .consume(1);

            let bang_type = BangType::new(self.peek_one() $(.$await)? ?)?;

            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    // Note: Do not update position, so the error points to
                    // somewhere sane rather than at the EOF
                    Ok(n) if n.is_empty() => return Err(bang_type.to_err()),
                    Ok(available) => {
                        // We only parse from start because we don't want to consider
                        // whatever is in the buffer before the bang element
                        if let Some((consumed, used)) = bang_type.parse(&buf[start..], available) {
                            buf.extend_from_slice(consumed);

                            self $(.$reader)? .consume(used);
                            read += used;

                            *position += read;
                            break;
                        } else {
                            // End not found yet: keep everything and read more
                            buf.extend_from_slice(available);

                            let used = available.len();
                            self $(.$reader)? .consume(used);
                            read += used;
                        }
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return Err(Error::Io(e.into()));
                    }
                }
            }

            // `read` starts at 1 (the consumed `!`) and only grows, so there is
            // always content to return once the loop has been left via `break`.
            Ok(Some((bang_type, &buf[start..])))
        }

        /// Reads the content of a tag into `buf`.
        ///
        /// Buffered chunks are fed to a [`ReadElementState`] that starts in the
        /// `Elem` state. When `change` reports a match, the bytes it returns
        /// are appended to `buf`, the reported number of bytes is consumed and
        /// `position` is advanced by everything consumed in this call.
        ///
        /// If the source ends before a match, reading stops without advancing
        /// `position`. Returns `Ok(None)` if nothing was consumed, otherwise
        /// the part of `buf` appended by this call.
        #[inline]
        $($async)? fn read_element $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut usize,
        ) -> Result<Option<&'b [u8]>> {
            let mut state = ReadElementState::Elem;
            let mut read = 0;

            let start = buf.len();
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    // Empty buffer means end of input
                    Ok(n) if n.is_empty() => break,
                    Ok(available) => {
                        if let Some((consumed, used)) = state.change(available) {
                            buf.extend_from_slice(consumed);

                            self $(.$reader)? .consume(used);
                            read += used;

                            // Position now just after the `>` symbol
                            *position += read;
                            break;
                        } else {
                            // The `>` symbol not yet found, continue reading
                            buf.extend_from_slice(available);

                            let used = available.len();
                            self $(.$reader)? .consume(used);
                            read += used;
                        }
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return Err(Error::Io(e.into()));
                    }
                };
            }

            if read == 0 {
                Ok(None)
            } else {
                Ok(Some(&buf[start..]))
            }
        }

        /// Consumes bytes for which `is_whitespace` returns `true`, advancing
        /// `position` by the number of bytes skipped. Stops at the first other
        /// byte or when the source has no more data.
        $($async)? fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => {
                        let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len());
                        if count > 0 {
                            self $(.$reader)? .consume(count);
                            *position += count;
                            // The whitespace run may continue in the next chunk
                            continue;
                        } else {
                            Ok(())
                        }
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(Error::Io(e.into())),
                };
            }
        }

        /// Consumes the next byte if it equals `byte`.
        ///
        /// Returns `Ok(true)` and advances `position` by one when the byte was
        /// consumed; returns `Ok(false)` and consumes nothing otherwise
        /// (including at the end of input).
        $($async)? fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool> {
            // search byte must be within the ascii range
            debug_assert!(byte.is_ascii());

            match self.peek_one() $(.$await)? ? {
                Some(b) if b == byte => {
                    *position += 1;
                    self $(.$reader)? .consume(1);
                    Ok(true)
                }
                _ => Ok(false),
            }
        }

        /// Returns the next byte without consuming it, or `Ok(None)` if the
        /// source has no more data.
        $($async)? fn peek_one(&mut self) -> Result<Option<u8>> {
            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) if n.is_empty() => Ok(None),
                    Ok(n) => Ok(Some(n[0])),
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(Error::Io(e.into())),
                };
            }
        }
    };
}
245
246// Make it public for use in async implementations
247pub(super) use impl_buffered_source;
248
249/// Implementation of `XmlSource` for any `BufRead` reader using a user-given
250/// `Vec<u8>` as buffer that will be borrowed by events.
251impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec<u8>> for R {
252    impl_buffered_source!();
253}
254
255////////////////////////////////////////////////////////////////////////////////////////////////////
256
257/// This is an implementation for reading from a [`BufRead`] as underlying byte stream.
258impl<R: BufRead> Reader<R> {
259    /// Reads the next `Event`.
260    ///
261    /// This is the main entry point for reading XML `Event`s.
262    ///
263    /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow`
264    /// internally).
265    ///
266    /// Having the possibility to control the internal buffers gives you some additional benefits
267    /// such as:
268    ///
269    /// - Reduce the number of allocations by reusing the same buffer. For constrained systems,
270    ///   you can call `buf.clear()` once you are done with processing the event (typically at the
271    ///   end of your loop).
272    /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`).
273    ///
274    /// # Examples
275    ///
276    /// ```
277    /// # use pretty_assertions::assert_eq;
278    /// use quick_xml::events::Event;
279    /// use quick_xml::reader::Reader;
280    ///
281    /// let xml = r#"<tag1 att1 = "test">
282    ///                 <tag2><!--Test comment-->Test</tag2>
283    ///                 <tag2>Test 2</tag2>
284    ///              </tag1>"#;
285    /// let mut reader = Reader::from_str(xml);
286    /// reader.trim_text(true);
287    /// let mut count = 0;
288    /// let mut buf = Vec::new();
289    /// let mut txt = Vec::new();
290    /// loop {
291    ///     match reader.read_event_into(&mut buf) {
292    ///         Ok(Event::Start(_)) => count += 1,
293    ///         Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
294    ///         Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
295    ///         Ok(Event::Eof) => break,
296    ///         _ => (),
297    ///     }
298    ///     buf.clear();
299    /// }
300    /// assert_eq!(count, 3);
301    /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
302    /// ```
303    #[inline]
304    pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
305        self.read_event_impl(buf)
306    }
307
308    /// Reads until end element is found using provided buffer as intermediate
309    /// storage for events content. This function is supposed to be called after
310    /// you already read a [`Start`] event.
311    ///
312    /// Returns a span that cover content between `>` of an opening tag and `<` of
313    /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
314    /// this method was called after reading expanded [`Start`] event.
315    ///
316    /// Manages nested cases where parent and child elements have the _literally_
317    /// same name.
318    ///
319    /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`]
320    /// will be returned. In particularly, that error will be returned if you call
321    /// this method without consuming the corresponding [`Start`] event first.
322    ///
323    /// If your reader created from a string slice or byte array slice, it is
324    /// better to use [`read_to_end()`] method, because it will not copy bytes
325    /// into intermediate buffer.
326    ///
327    /// The provided `buf` buffer will be filled only by one event content at time.
328    /// Before reading of each event the buffer will be cleared. If you know an
329    /// appropriate size of each event, you can preallocate the buffer to reduce
330    /// number of reallocations.
331    ///
332    /// The `end` parameter should contain name of the end element _in the reader
333    /// encoding_. It is good practice to always get that parameter using
334    /// [`BytesStart::to_end()`] method.
335    ///
336    /// The correctness of the skipped events does not checked, if you disabled
337    /// the [`check_end_names`] option.
338    ///
339    /// # Namespaces
340    ///
341    /// While the `Reader` does not support namespace resolution, namespaces
342    /// does not change the algorithm for comparing names. Although the names
343    /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
344    /// same namespace, are semantically equivalent, `</b:name>` cannot close
345    /// `<a:name>`, because according to [the specification]
346    ///
347    /// > The end of every element that begins with a **start-tag** MUST be marked
348    /// > by an **end-tag** containing a name that echoes the element's type as
349    /// > given in the **start-tag**
350    ///
351    /// # Examples
352    ///
353    /// This example shows, how you can skip XML content after you read the
354    /// start event.
355    ///
356    /// ```
357    /// # use pretty_assertions::assert_eq;
358    /// use quick_xml::events::{BytesStart, Event};
359    /// use quick_xml::reader::Reader;
360    ///
361    /// let mut reader = Reader::from_str(r#"
362    ///     <outer>
363    ///         <inner>
364    ///             <inner></inner>
365    ///             <inner/>
366    ///             <outer></outer>
367    ///             <outer/>
368    ///         </inner>
369    ///     </outer>
370    /// "#);
371    /// reader.trim_text(true);
372    /// let mut buf = Vec::new();
373    ///
374    /// let start = BytesStart::new("outer");
375    /// let end   = start.to_end().into_owned();
376    ///
377    /// // First, we read a start event...
378    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
379    ///
380    /// // ...then, we could skip all events to the corresponding end event.
381    /// // This call will correctly handle nested <outer> elements.
382    /// // Note, however, that this method does not handle namespaces.
383    /// reader.read_to_end_into(end.name(), &mut buf).unwrap();
384    ///
385    /// // At the end we should get an Eof event, because we ate the whole XML
386    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
387    /// ```
388    ///
389    /// [`Start`]: Event::Start
390    /// [`End`]: Event::End
391    /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
392    /// [`read_to_end()`]: Self::read_to_end
393    /// [`expand_empty_elements`]: Self::expand_empty_elements
394    /// [`check_end_names`]: Self::check_end_names
395    /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
396    pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
397        Ok(read_to_end!(self, end, buf, read_event_impl, {
398            buf.clear();
399        }))
400    }
401}
402
403impl Reader<BufReader<File>> {
404    /// Creates an XML reader from a file path.
405    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
406        let file = File::open(path)?;
407        let reader = BufReader::new(file);
408        Ok(Self::from_reader(reader))
409    }
410}
411
#[cfg(test)]
mod test {
    use crate::reader::test::{check, small_buffers};
    use crate::reader::XmlSource;

    /// Default buffer constructor: hands the byte array from the test back
    /// unchanged.
    fn identity<T>(input: T) -> T {
        input
    }

    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        &mut Vec::new()
    );

    small_buffers!(
        #[test]
        read_event_into: std::io::BufReader<_>
    );

    #[cfg(feature = "encoding")]
    mod encoding {
        use crate::events::Event;
        use crate::reader::Reader;
        use encoding_rs::{UTF_16LE, UTF_8, WINDOWS_1251};
        use pretty_assertions::assert_eq;

        /// The input starts with a UTF-16LE BOM, but its XML declaration names
        /// windows-1251. Checks that the decoder starts as UTF-8 and reports
        /// windows-1251 once the declaration has been read.
        #[test]
        fn bom_detected() {
            let xml: &[u8] = b"\xFF\xFE<?xml encoding='windows-1251'?>";
            let mut reader = Reader::from_reader(xml);
            let mut events = Vec::new();

            // Nothing has been read yet
            assert_eq!(reader.decoder().encoding(), UTF_8);

            reader.read_event_into(&mut events).unwrap();
            assert_eq!(reader.decoder().encoding(), WINDOWS_1251);

            assert_eq!(reader.read_event_into(&mut events).unwrap(), Event::Eof);
        }

        /// Checks that the encoding is changed by an XML declaration, but only
        /// by the first one: a second declaration leaves it as it was.
        #[test]
        fn xml_declaration() {
            let xml: &[u8] = b"<?xml encoding='UTF-16'?><?xml encoding='windows-1251'?>";
            let mut reader = Reader::from_reader(xml);
            let mut events = Vec::new();

            // Nothing has been read yet
            assert_eq!(reader.decoder().encoding(), UTF_8);

            // First declaration switches the decoder
            reader.read_event_into(&mut events).unwrap();
            assert_eq!(reader.decoder().encoding(), UTF_16LE);

            // Second declaration must not switch it again
            reader.read_event_into(&mut events).unwrap();
            assert_eq!(reader.decoder().encoding(), UTF_16LE);

            assert_eq!(reader.read_event_into(&mut events).unwrap(), Event::Eof);
        }
    }
}