quick_xml/reader/
mod.rs

1//! Contains high-level interface for a pull-based XML parser.
2
3#[cfg(feature = "encoding")]
4use encoding_rs::Encoding;
5use std::ops::Range;
6
7use crate::encoding::Decoder;
8use crate::errors::{Error, Result};
9use crate::events::Event;
10use crate::reader::state::ReaderState;
11
12use memchr;
13
/// Generates the option-setter methods of a reader.
///
/// The optional `$holder` identifier names a field through which the parser
/// `state` is reached (`self.$holder.state`). When it is omitted, the generated
/// methods access `self.state` directly.
///
/// Every generated method stores the value in the corresponding field of the
/// state and returns `&mut Self`, so calls can be chained.
macro_rules! configure_methods {
    ($($holder:ident)?) => {
        /// Changes whether empty elements should be split into a [`Start`] and an [`End`] event.
        ///
        /// When set to `true`, all [`Empty`] events produced by a self-closing tag like `<tag/>` are
        /// expanded into a [`Start`] event followed by an [`End`] event. When set to `false` (the
        /// default), those tags are represented by an [`Empty`] event instead.
        ///
        /// Note that setting this to `true` will lead to additional allocations that
        /// are needed to store the tag name for an [`End`] event. However, if
        /// [`check_end_names`] is also set, only one additional allocation will be
        /// performed that supports both these options.
        ///
        /// (`false` by default)
        ///
        /// [`Empty`]: Event::Empty
        /// [`Start`]: Event::Start
        /// [`End`]: Event::End
        /// [`check_end_names`]: Self::check_end_names
        pub fn expand_empty_elements(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .state.expand_empty_elements = val;
            self
        }

        /// Changes whether whitespace before and after character data should be removed.
        ///
        /// When set to `true`, all [`Text`] events are trimmed.
        /// If the event is empty after that, it will not be pushed.
        ///
        /// Changing this option automatically changes the [`trim_text_end`] option,
        /// because both the leading and the trailing trim flags are set to `val`.
        ///
        /// (`false` by default).
        ///
        /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
        ///
        /// WARNING: With this option every text event will be trimmed, which is
        /// incorrect behavior when text events are delimited by comments, processing
        /// instructions or CDATA sections. To trim data correctly, manually apply
        /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
        /// only to the necessary events.
        /// </div>
        ///
        /// [`Text`]: Event::Text
        /// [`trim_text_end`]: Self::trim_text_end
        /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
        /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
        pub fn trim_text(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .state.trim_text_start = val;
            self $(.$holder)? .state.trim_text_end = val;
            self
        }

        /// Changes whether whitespace after character data should be removed.
        ///
        /// When set to `true`, trailing whitespace is trimmed in [`Text`] events.
        /// If the event is empty after that, it will not be pushed.
        ///
        /// (`false` by default).
        ///
        /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
        ///
        /// WARNING: With this option every text event will be trimmed, which is
        /// incorrect behavior when text events are delimited by comments, processing
        /// instructions or CDATA sections. To trim data correctly, manually apply
        /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
        /// only to the necessary events.
        /// </div>
        ///
        /// [`Text`]: Event::Text
        /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
        /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
        pub fn trim_text_end(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .state.trim_text_end = val;
            self
        }

        /// Changes whether trailing whitespace after the markup name is trimmed in closing tags
        /// such as `</a >`.
        ///
        /// If `true`, the emitted [`End`] event is stripped of trailing whitespace after the
        /// markup name.
        ///
        /// Note that if this is set to `false` and `check_end_names` is `true`, the comparison of
        /// markup names is going to fail erroneously if a closing tag contains trailing whitespace.
        ///
        /// (`true` by default)
        ///
        /// [`End`]: Event::End
        pub fn trim_markup_names_in_closing_tags(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .state.trim_markup_names_in_closing_tags = val;
            self
        }

        /// Changes whether mismatched closing tag names should be detected.
        ///
        /// Note that start and end tags [should match literally][spec]; they cannot
        /// have different prefixes even if both prefixes resolve to the same namespace.
        /// The XML
        ///
        /// ```xml
        /// <outer xmlns="namespace" xmlns:p="namespace">
        /// </p:outer>
        /// ```
        ///
        /// is not valid, even though semantically the start tag is the same as the
        /// end tag. The reason is that namespaces are an extension of the original
        /// XML specification (without namespaces) and it should be backward-compatible.
        ///
        /// When set to `false`, it won't check if a closing tag matches the corresponding opening tag.
        /// For example, `<mytag></different_tag>` will be permitted.
        ///
        /// If the XML is known to be sane (already processed, etc.) this saves extra time.
        ///
        /// Note that the emitted [`End`] event will not be modified if this is disabled, i.e. it will
        /// contain the data of the mismatched end tag.
        ///
        /// Note that setting this to `true` will lead to additional allocations that
        /// are needed to store the tag name for an [`End`] event. However, if
        /// [`expand_empty_elements`] is also set, only one additional allocation will be
        /// performed that supports both these options.
        ///
        /// (`true` by default)
        ///
        /// [spec]: https://www.w3.org/TR/xml11/#dt-etag
        /// [`End`]: Event::End
        /// [`expand_empty_elements`]: Self::expand_empty_elements
        pub fn check_end_names(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .state.check_end_names = val;
            self
        }

        /// Changes whether comments should be validated.
        ///
        /// When set to `true`, every [`Comment`] event will be checked for not containing `--`, which
        /// is not allowed in XML comments. Most of the time we don't want comments at all so we don't
        /// really care about comment correctness, thus the default value is `false` to improve
        /// performance.
        ///
        /// (`false` by default)
        ///
        /// [`Comment`]: Event::Comment
        pub fn check_comments(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .state.check_comments = val;
            self
        }
    };
}
160
/// Drives the parser state machine until one event (or an error) is produced.
///
/// The macro expands to an expression that evaluates to the result of reading
/// a single event. If that result is an error or [`Event::Eof`], the parser is
/// moved to the `Exit` state before the result is returned.
///
/// # Parameters
/// - `$self`: object that owns the parser `state`
/// - `$buf`: buffer variable passed to the reading methods; it is reassigned
///   when `$read_until_open` hands the buffer back without producing an event
/// - `$reader`: expression that evaluates to the source of data
/// - `$read_until_open`: name of the method of `$self` that is called in the
///   `Init` and `ClosedTag` states
/// - `$read_until_close`: name of the method of `$self` that is called in the
///   `OpenedTag` state
/// - `$await`: optional identifier that is appended as `.$await` to the calls
///   of the reading methods
macro_rules! read_event_impl {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_until_open:ident,
        $read_until_close:ident
        $(, $await:ident)?
    ) => {{
        let event = loop {
            match $self.state.state {
                ParseState::Init => { // Go to OpenedTag state
                    // If the encoding was set explicitly, we do not need to detect it.
                    // For example, explicit UTF-8 is set automatically if the Reader was
                    // created using `from_str`. But we still need to remove the BOM for
                    // consistency with the path taken when the `encoding` feature is disabled
                    #[cfg(feature = "encoding")]
                    if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
                        if $self.state.encoding.can_be_refined() {
                            $self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding);
                        }
                    }

                    // Removes UTF-8 BOM if it is present
                    #[cfg(not(feature = "encoding"))]
                    $reader.remove_utf8_bom() $(.$await)? ?;

                    // Go to OpenedTag state. The inner `Err` carries the buffer back
                    // when no event was produced, so the loop runs one more iteration
                    match $self.$read_until_open($buf) $(.$await)? {
                        Ok(Ok(ev)) => break Ok(ev),
                        Ok(Err(b)) => $buf = b,
                        Err(err)   => break Err(err),
                    }
                },
                ParseState::ClosedTag => { // Go to OpenedTag state
                    // Same handling as in the `Init` state, but without the
                    // encoding detection / BOM removal step
                    match $self.$read_until_open($buf) $(.$await)? {
                        Ok(Ok(ev)) => break Ok(ev),
                        Ok(Err(b)) => $buf = b,
                        Err(err)   => break Err(err),
                    }
                },
                // Go to ClosedTag state in next two arms
                ParseState::OpenedTag => break $self.$read_until_close($buf) $(.$await)?,
                ParseState::Empty => break $self.state.close_expanded_empty(),
                // Once in the `Exit` state, only `Eof` is reported
                ParseState::Exit => break Ok(Event::Eof),
            };
        };
        // An error or the end of input finishes parsing: switch to the `Exit` state
        match event {
            Err(_) | Ok(Event::Eof) => $self.state.state = ParseState::Exit,
            _ => {}
        }
        event
    }};
}
214
/// Read bytes up to `<` and skip it.
///
/// If the current byte (after skipping all space characters if
/// [`ReaderState::trim_text_start`] is `true`) is already `<`, that byte is
/// consumed and the buffer is handed back to the caller in the inner `Err`
/// without producing an event, so that the caller can continue with parsing of
/// the markup. Otherwise all bytes up to the `<` symbol are read and returned
/// as a text event; the reader stays at the position just after the `<` symbol.
/// If the input is exhausted, [`Event::Eof`] is returned.
///
/// Moves parser to the `OpenedTag` state.
///
/// This code is executed in two cases:
/// - after start of parsing just after skipping BOM if it is present
/// - after parsing `</tag>` or `<tag>`
///
/// # Parameters
/// - `$self`: object that owns the parser `state`
/// - `$buf`: buffer passed to the source; returned back in the inner `Err`
///   when no event was produced
/// - `$reader`: expression that evaluates to the source of data
/// - `$read_event`: accepted by the macro, but not used in its body
/// - `$await`: optional identifier that is appended as `.$await` to the calls
///   of the source methods
macro_rules! read_until_open {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_event:ident
        $(, $await:ident)?
    ) => {{
        // The state is switched before any reading is attempted
        $self.state.state = ParseState::OpenedTag;

        if $self.state.trim_text_start {
            $reader.skip_whitespace(&mut $self.state.offset) $(.$await)? ?;
        }

        // If we are already at the `<` symbol, do not try to return an empty Text event
        if $reader.skip_one(b'<', &mut $self.state.offset) $(.$await)? ? {
            // Pass $buf to the next iteration of the parsing loop
            return Ok(Err($buf));
        }

        match $reader
            .read_bytes_until(b'<', $buf, &mut $self.state.offset)
            $(.$await)?
        {
            // Return Text event with `bytes` content
            Ok(Some(bytes)) => $self.state.emit_text(bytes).map(Ok),
            // Nothing was read because the input is exhausted
            Ok(None) => Ok(Ok(Event::Eof)),
            Err(e) => Err(e),
        }
    }};
}
254
/// Read bytes up to the `>` and skip it. This method is expected to be called
/// after seeing the `<` symbol and skipping it. Inspects the next (current)
/// symbol without consuming it and returns an appropriate [`Event`]:
///
/// |Symbol |Event
/// |-------|-------------------------------------
/// |`!`    |[`Comment`], [`CData`] or [`DocType`]
/// |`/`    |[`End`]
/// |`?`    |[`PI`]
/// |_other_|[`Start`] or [`Empty`]
///
/// If the input ends before the inspected symbol, or the source reports that
/// nothing was read, [`Event::Eof`] is returned.
///
/// Moves parser to the `ClosedTag` state.
///
/// # Parameters
/// - `$self`: object that owns the parser `state`
/// - `$buf`: buffer passed to the source
/// - `$reader`: expression that evaluates to the source of data
/// - `$await`: optional identifier that is appended as `.$await` to the calls
///   of the source methods
///
/// [`Comment`]: Event::Comment
/// [`CData`]: Event::CData
/// [`DocType`]: Event::DocType
/// [`End`]: Event::End
/// [`PI`]: Event::PI
/// [`Start`]: Event::Start
/// [`Empty`]: Event::Empty
macro_rules! read_until_close {
    (
        $self:ident, $buf:ident,
        $reader:expr
        $(, $await:ident)?
    ) => {{
        // The state is switched before any reading is attempted
        $self.state.state = ParseState::ClosedTag;

        match $reader.peek_one() $(.$await)? {
            // `<!` - comment, CDATA or DOCTYPE declaration
            Ok(Some(b'!')) => match $reader
                .read_bang_element($buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(None) => Ok(Event::Eof),
                Ok(Some((bang_type, bytes))) => $self.state.emit_bang(bang_type, bytes),
                Err(e) => Err(e),
            },
            // `</` - closing tag
            Ok(Some(b'/')) => match $reader
                .read_bytes_until(b'>', $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(None) => Ok(Event::Eof),
                Ok(Some(bytes)) => $self.state.emit_end(bytes),
                Err(e) => Err(e),
            },
            // `<?` - processing instruction
            Ok(Some(b'?')) => match $reader
                .read_bytes_until(b'>', $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(None) => Ok(Event::Eof),
                Ok(Some(bytes)) => $self.state.emit_question_mark(bytes),
                Err(e) => Err(e),
            },
            // `<...` - opening or self-closed tag
            Ok(Some(_)) => match $reader
                .read_element($buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(None) => Ok(Event::Eof),
                Ok(Some(bytes)) => $self.state.emit_start(bytes),
                Err(e) => Err(e),
            },
            // Input ended right after the `<` symbol
            Ok(None) => Ok(Event::Eof),
            Err(e) => Err(e),
        }
    }};
}
325
/// Generalization of `read_to_end` method for buffered and borrowed readers.
///
/// Reads events until the end tag with the name `$end` that is not paired with
/// a start tag of the same name read by this macro. The macro expands to an
/// expression that evaluates to the range `start..end`, where `start` is the
/// buffer position before the first read and `end` is the buffer position
/// taken just before the matching end tag was read.
///
/// If reading of an event fails, the error is returned from the enclosing
/// function. If [`Event::Eof`] is reached first, the enclosing function
/// returns [`Error::UnexpectedEof`].
///
/// # Parameters
/// - `$self`: reader that provides `buffer_position()`, `decoder()` and the
///   `$read_event` method
/// - `$end`: name of the tag to search for; it is compared with `name()` of
///   the start and end events
/// - `$buf`: buffer passed to each call of `$read_event`
/// - `$read_event`: name of the method of `$self` used to read the next event
/// - `$clear`: code block executed at the beginning of every loop iteration
/// - `$await`: optional identifier that is appended as `.$await` to the call
///   of `$read_event`
macro_rules! read_to_end {
    (
        $self:expr, $end:expr, $buf:expr,
        $read_event:ident,
        // Code block that performs clearing of internal buffer after read of each event
        $clear:block
        $(, $await:ident)?
    ) => {{
        let start = $self.buffer_position();
        // Count of read start tags named `$end` that are not closed yet
        let mut depth = 0;
        loop {
            $clear
            // Position before the event, so the matching end tag is excluded from the range
            let end = $self.buffer_position();
            match $self.$read_event($buf) $(.$await)? {
                Err(e) => return Err(e),

                // Nested element with the same name: its end tag must be skipped
                Ok(Event::Start(e)) if e.name() == $end => depth += 1,
                Ok(Event::End(e)) if e.name() == $end => {
                    if depth == 0 {
                        break start..end;
                    }
                    depth -= 1;
                }
                Ok(Event::Eof) => {
                    let name = $self.decoder().decode($end.as_ref());
                    return Err(Error::UnexpectedEof(format!("</{:?}>", name)));
                }
                _ => (),
            }
        }
    }};
}
359
360#[cfg(feature = "async-tokio")]
361mod async_tokio;
362mod buffered_reader;
363mod ns_reader;
364mod slice_reader;
365mod state;
366
367pub use ns_reader::NsReader;
368
/// A range of the input, in bytes, that corresponds to some piece of XML.
pub type Span = Range<usize>;
371
372////////////////////////////////////////////////////////////////////////////////////////////////////
373
/// Possible reader states. The state transition diagram (`true` and `false` show
/// the value of the [`Reader::expand_empty_elements()`] option):
///
/// ```mermaid
/// flowchart LR
///   subgraph _
///     direction LR
///
///     Init      -- "(no event)"\n                                       --> OpenedTag
///     OpenedTag -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> ClosedTag
///     ClosedTag -- "#lt;false#gt;\n(no event)"\nText                    --> OpenedTag
///   end
///   ClosedTag -- "#lt;true#gt;"\nStart --> Empty
///   Empty     -- End                   --> ClosedTag
///   _ -. Eof .-> Exit
/// ```
#[derive(Clone)]
enum ParseState {
    /// Initial state in which the reader stays after creation. Transition from that
    /// state could produce a `Text`, `Decl`, `Comment` or `Start` event. The next
    /// state is always `OpenedTag`. The reader will never return to this state. The
    /// event emitted during transition to `OpenedTag` is a `Text` event if the
    /// first symbol is not `<`, otherwise no event is emitted.
    Init,
    /// State after seeing the `<` symbol. Depending on the next symbol all other
    /// events could be generated.
    ///
    /// After generating one event the reader moves to the `ClosedTag` state.
    OpenedTag,
    /// State in which the reader searches for the `<` symbol of a markup. All bytes
    /// before that symbol will be returned in the [`Event::Text`] event. After that
    /// the reader moves to the `OpenedTag` state.
    ClosedTag,
    /// This state is used only if option [`expand_empty_elements`] is set to `true`.
    /// The reader enters this state when it is in a `ClosedTag` state and emits an
    /// [`Event::Start`] event. The next event emitted will be an [`Event::End`],
    /// after which the reader returns to the `ClosedTag` state.
    ///
    /// [`expand_empty_elements`]: ReaderState::expand_empty_elements
    Empty,
    /// The reader enters this state when an `Eof` event is generated or an error
    /// occurs. This is the last state, the reader stays in it forever.
    Exit,
}
418
/// A reference to an encoding together with information about how it was retrieved.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy)]
enum EncodingRef {
    /// Encoding was implicitly assumed to have a specified value. It can be refined
    /// using BOM or by the XML declaration event (`<?xml encoding=... ?>`)
    Implicit(&'static Encoding),
    /// Encoding was explicitly set to the desired value. It can be changed
    /// neither by BOM nor by parsing the XML declaration (`<?xml encoding=... ?>`)
    Explicit(&'static Encoding),
    /// Encoding was detected from a byte order mark (BOM) or by the first bytes
    /// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
    BomDetected(&'static Encoding),
    /// Encoding was detected using the XML declaration event (`<?xml encoding=... ?>`).
    /// It can no longer change
    XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the encoding stored in this reference, regardless of the way
    /// in which it was obtained.
    #[inline]
    fn encoding(&self) -> &'static Encoding {
        // Every variant wraps the same kind of payload, so one arm serves them all
        match *self {
            Self::Implicit(encoding)
            | Self::Explicit(encoding)
            | Self::BomDetected(encoding)
            | Self::XmlDetected(encoding) => encoding,
        }
    }

    /// Returns `true` if the encoding is still allowed to be replaced by
    /// a more precise one, i.e. it was only assumed or detected from a BOM.
    #[inline]
    fn can_be_refined(&self) -> bool {
        match *self {
            Self::Explicit(_) | Self::XmlDetected(_) => false,
            Self::Implicit(_) | Self::BomDetected(_) => true,
        }
    }
}
465
466////////////////////////////////////////////////////////////////////////////////////////////////////
467
/// A low level encoding-agnostic XML event reader.
///
/// Consumes bytes and streams XML [`Event`]s.
///
/// This reader does not manage namespace declarations and is not able to resolve
/// prefixes. If you want these features, use the [`NsReader`].
///
/// # Examples
///
/// ```
/// use quick_xml::events::Event;
/// use quick_xml::reader::Reader;
///
/// let xml = r#"<tag1 att1 = "test">
///                 <tag2><!--Test comment-->Test</tag2>
///                 <tag2>Test 2</tag2>
///              </tag1>"#;
/// let mut reader = Reader::from_str(xml);
/// reader.trim_text(true);
///
/// let mut count = 0;
/// let mut txt = Vec::new();
/// let mut buf = Vec::new();
///
/// // The `Reader` does not implement `Iterator` because it outputs borrowed data (`Cow`s)
/// loop {
///     // NOTE: this is the generic case when we don't know about the input BufRead.
///     // when the input is a &str or a &[u8], we don't actually need to use another
///     // buffer, we could directly call `reader.read_event()`
///     match reader.read_event_into(&mut buf) {
///         Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
///         // exits the loop when reaching end of file
///         Ok(Event::Eof) => break,
///
///         Ok(Event::Start(e)) => {
///             match e.name().as_ref() {
///                 b"tag1" => println!("attributes values: {:?}",
///                                     e.attributes().map(|a| a.unwrap().value)
///                                     .collect::<Vec<_>>()),
///                 b"tag2" => count += 1,
///                 _ => (),
///             }
///         }
///         Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
///
///         // There are several other `Event`s we do not consider here
///         _ => (),
///     }
///     // if we don't keep a borrow elsewhere, we can clear the buffer to keep memory usage low
///     buf.clear();
/// }
/// ```
///
/// [`NsReader`]: crate::reader::NsReader
#[derive(Clone)]
pub struct Reader<R> {
    /// Source of the data to parse
    reader: R,
    /// Configuration and current parse state
    state: ReaderState,
}
529
530/// Builder methods
531impl<R> Reader<R> {
532    /// Creates a `Reader` that reads from a given reader.
533    pub fn from_reader(reader: R) -> Self {
534        Self {
535            reader,
536            state: ReaderState::default(),
537        }
538    }
539
540    configure_methods!();
541}
542
543/// Getters
544impl<R> Reader<R> {
545    /// Consumes `Reader` returning the underlying reader
546    ///
547    /// Can be used to compute line and column of a parsing error position
548    ///
549    /// # Examples
550    ///
551    /// ```
552    /// # use pretty_assertions::assert_eq;
553    /// use std::{str, io::Cursor};
554    /// use quick_xml::events::Event;
555    /// use quick_xml::reader::Reader;
556    ///
557    /// let xml = r#"<tag1 att1 = "test">
558    ///                 <tag2><!--Test comment-->Test</tag2>
559    ///                 <tag3>Test 2</tag3>
560    ///              </tag1>"#;
561    /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
562    /// let mut buf = Vec::new();
563    ///
564    /// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
565    ///     let end_pos = reader.buffer_position();
566    ///     let mut cursor = reader.into_inner();
567    ///     let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
568    ///         .expect("can't make a string");
569    ///     let mut line = 1;
570    ///     let mut column = 0;
571    ///     for c in s.chars() {
572    ///         if c == '\n' {
573    ///             line += 1;
574    ///             column = 0;
575    ///         } else {
576    ///             column += 1;
577    ///         }
578    ///     }
579    ///     (line, column)
580    /// }
581    ///
582    /// loop {
583    ///     match reader.read_event_into(&mut buf) {
584    ///         Ok(Event::Start(ref e)) => match e.name().as_ref() {
585    ///             b"tag1" | b"tag2" => (),
586    ///             tag => {
587    ///                 assert_eq!(b"tag3", tag);
588    ///                 assert_eq!((3, 22), into_line_and_column(reader));
589    ///                 break;
590    ///             }
591    ///         },
592    ///         Ok(Event::Eof) => unreachable!(),
593    ///         _ => (),
594    ///     }
595    ///     buf.clear();
596    /// }
597    /// ```
598    pub fn into_inner(self) -> R {
599        self.reader
600    }
601
602    /// Gets a reference to the underlying reader.
603    pub fn get_ref(&self) -> &R {
604        &self.reader
605    }
606
607    /// Gets a mutable reference to the underlying reader.
608    pub fn get_mut(&mut self) -> &mut R {
609        &mut self.reader
610    }
611
612    /// Gets the current byte position in the input data.
613    ///
614    /// Useful when debugging errors.
615    pub fn buffer_position(&self) -> usize {
616        // when internal state is OpenedTag, we have actually read until '<',
617        // which we don't want to show
618        if let ParseState::OpenedTag = self.state.state {
619            self.state.offset - 1
620        } else {
621            self.state.offset
622        }
623    }
624
625    /// Get the decoder, used to decode bytes, read by this reader, to the strings.
626    ///
627    /// If [`encoding`] feature is enabled, the used encoding may change after
628    /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
629    ///
630    /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
631    /// defaults to UTF-8.
632    ///
633    /// [`encoding`]: ../index.html#encoding
634    #[inline]
635    pub fn decoder(&self) -> Decoder {
636        self.state.decoder()
637    }
638}
639
/// Private sync reading methods
impl<R> Reader<R> {
    /// Read text into the given buffer, and return an event that borrows from
    /// either that buffer or from the input itself, based on the type of the
    /// reader.
    ///
    /// The body is the expansion of the `read_event_impl!` macro, which calls
    /// [`read_until_open`](Self::read_until_open) and
    /// [`read_until_close`](Self::read_until_close) of this type.
    fn read_event_impl<'i, B>(&mut self, mut buf: B) -> Result<Event<'i>>
    where
        R: XmlSource<'i, B>,
    {
        read_event_impl!(self, buf, self.reader, read_until_open, read_until_close)
    }

    /// Read until `<` is found, moves reader to an `OpenedTag` state and returns a `Text` event.
    ///
    /// Returns inner `Ok` if the loop should be broken and an event returned.
    /// Returns inner `Err` with the same `buf` when no event was produced; the
    /// buffer is passed back this way because Rust borrowck stumbles upon this
    /// case in particular.
    fn read_until_open<'i, B>(&mut self, buf: B) -> Result<std::result::Result<Event<'i>, B>>
    where
        R: XmlSource<'i, B>,
    {
        read_until_open!(self, buf, self.reader, read_event_impl)
    }

    /// Private function to read until `>` is found. This function expects that
    /// it was called just after encountering a `<` symbol.
    fn read_until_close<'i, B>(&mut self, buf: B) -> Result<Event<'i>>
    where
        R: XmlSource<'i, B>,
    {
        read_until_close!(self, buf, self.reader)
    }
}
672
673////////////////////////////////////////////////////////////////////////////////////////////////////
674
675/// Represents an input for a reader that can return borrowed data.
676///
677/// There are two implementors of this trait: generic one that read data from
678/// `Self`, copies some part of it into a provided buffer of type `B` and then
679/// returns data that borrow from that buffer.
680///
681/// The other implementor is for `&[u8]` and instead of copying data returns
682/// borrowed data from `Self` instead. This implementation allows zero-copy
683/// deserialization.
684///
685/// # Parameters
686/// - `'r`: lifetime of a buffer from which events will borrow
687/// - `B`: a type of a buffer that can be used to store data read from `Self` and
688///   from which events can borrow
689trait XmlSource<'r, B> {
690    /// Removes UTF-8 BOM if it is present
691    #[cfg(not(feature = "encoding"))]
692    fn remove_utf8_bom(&mut self) -> Result<()>;
693
694    /// Determines encoding from the start of input and removes BOM if it is present
695    #[cfg(feature = "encoding")]
696    fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>>;
697
698    /// Read input until `byte` is found or end of input is reached.
699    ///
700    /// Returns a slice of data read up to `byte`, which does not include into result.
701    /// If input (`Self`) is exhausted, returns `None`.
702    ///
703    /// # Example
704    ///
705    /// ```ignore
706    /// let mut position = 0;
707    /// let mut input = b"abc*def".as_ref();
708    /// //                    ^= 4
709    ///
710    /// assert_eq!(
711    ///     input.read_bytes_until(b'*', (), &mut position).unwrap(),
712    ///     Some(b"abc".as_ref())
713    /// );
714    /// assert_eq!(position, 4); // position after the symbol matched
715    /// ```
716    ///
717    /// # Parameters
718    /// - `byte`: Byte for search
719    /// - `buf`: Buffer that could be filled from an input (`Self`) and
720    ///   from which [events] could borrow their data
721    /// - `position`: Will be increased by amount of bytes consumed
722    ///
723    /// [events]: crate::events::Event
724    fn read_bytes_until(
725        &mut self,
726        byte: u8,
727        buf: B,
728        position: &mut usize,
729    ) -> Result<Option<&'r [u8]>>;
730
731    /// Read input until comment, CDATA or processing instruction is finished.
732    ///
733    /// This method expect that `<` already was read.
734    ///
735    /// Returns a slice of data read up to end of comment, CDATA or processing
736    /// instruction (`>`), which does not include into result.
737    ///
738    /// If input (`Self`) is exhausted and nothing was read, returns `None`.
739    ///
740    /// # Parameters
741    /// - `buf`: Buffer that could be filled from an input (`Self`) and
742    ///   from which [events] could borrow their data
743    /// - `position`: Will be increased by amount of bytes consumed
744    ///
745    /// [events]: crate::events::Event
746    fn read_bang_element(
747        &mut self,
748        buf: B,
749        position: &mut usize,
750    ) -> Result<Option<(BangType, &'r [u8])>>;
751
752    /// Read input until XML element is closed by approaching a `>` symbol.
753    /// Returns `Some(buffer)` that contains a data between `<` and `>` or
754    /// `None` if end-of-input was reached and nothing was read.
755    ///
756    /// Derived from `read_until`, but modified to handle XML attributes
757    /// using a minimal state machine.
758    ///
759    /// Attribute values are [defined] as follows:
760    /// ```plain
761    /// AttValue := '"' (([^<&"]) | Reference)* '"'
762    ///           | "'" (([^<&']) | Reference)* "'"
763    /// ```
764    /// (`Reference` is something like `&quot;`, but we don't care about
765    /// escaped characters at this level)
766    ///
767    /// # Parameters
768    /// - `buf`: Buffer that could be filled from an input (`Self`) and
769    ///   from which [events] could borrow their data
770    /// - `position`: Will be increased by amount of bytes consumed
771    ///
772    /// [defined]: https://www.w3.org/TR/xml11/#NT-AttValue
773    /// [events]: crate::events::Event
774    fn read_element(&mut self, buf: B, position: &mut usize) -> Result<Option<&'r [u8]>>;
775
776    /// Consume and discard all the whitespace until the next non-whitespace
777    /// character or EOF.
778    ///
779    /// # Parameters
780    /// - `position`: Will be increased by amount of bytes consumed
781    fn skip_whitespace(&mut self, position: &mut usize) -> Result<()>;
782
783    /// Consumes and discards the next byte of input if it is equal to `byte`.
784    /// Returns `true` if it matched.
785    ///
786    /// # Parameters
787    /// - `position`: Will be increased by 1 if the byte is matched
788    fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool>;
789
790    /// Returns the next byte without consuming it, so that future `read_*` calls
791    /// will still include it. On EOF, returns `None`.
792    fn peek_one(&mut self) -> Result<Option<u8>>;
793}
794
/// Kinds of markup that can begin with the `<!` sequence
#[derive(Debug, PartialEq)]
enum BangType {
    /// A CDATA section: `<![CDATA[...]]>`
    CData,
    /// A comment: `<!--...-->`
    Comment,
    /// A document type declaration: `<!DOCTYPE...>`
    DocType,
}
805impl BangType {
806    #[inline(always)]
807    fn new(byte: Option<u8>) -> Result<Self> {
808        Ok(match byte {
809            Some(b'[') => Self::CData,
810            Some(b'-') => Self::Comment,
811            Some(b'D') | Some(b'd') => Self::DocType,
812            Some(b) => return Err(Error::UnexpectedBang(b)),
813            None => return Err(Error::UnexpectedEof("Bang".to_string())),
814        })
815    }
816
817    /// If element is finished, returns its content up to `>` symbol and
818    /// an index of this symbol, otherwise returns `None`
819    ///
820    /// # Parameters
821    /// - `buf`: buffer with data consumed on previous iterations
822    /// - `chunk`: data read on current iteration and not yet consumed from reader
823    #[inline(always)]
824    fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
825        for i in memchr::memchr_iter(b'>', chunk) {
826            match self {
827                // Need to read at least 6 symbols (`!---->`) for properly finished comment
828                // <!----> - XML comment
829                //  012345 - i
830                Self::Comment if buf.len() + i > 4 => {
831                    if chunk[..i].ends_with(b"--") {
832                        // We cannot strip last `--` from the buffer because we need it in case of
833                        // check_comments enabled option. XML standard requires that comment
834                        // will not end with `--->` sequence because this is a special case of
835                        // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
836                        return Some((&chunk[..i], i + 1)); // +1 for `>`
837                    }
838                    // End sequence `-|->` was splitted at |
839                    //        buf --/   \-- chunk
840                    if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
841                        return Some((&chunk[..i], i + 1)); // +1 for `>`
842                    }
843                    // End sequence `--|>` was splitted at |
844                    //         buf --/   \-- chunk
845                    if i == 0 && buf.ends_with(b"--") {
846                        return Some((&[], i + 1)); // +1 for `>`
847                    }
848                }
849                Self::Comment => {}
850                Self::CData => {
851                    if chunk[..i].ends_with(b"]]") {
852                        return Some((&chunk[..i], i + 1)); // +1 for `>`
853                    }
854                    // End sequence `]|]>` was splitted at |
855                    //        buf --/   \-- chunk
856                    if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' {
857                        return Some((&chunk[..i], i + 1)); // +1 for `>`
858                    }
859                    // End sequence `]]|>` was splitted at |
860                    //         buf --/   \-- chunk
861                    if i == 0 && buf.ends_with(b"]]") {
862                        return Some((&[], i + 1)); // +1 for `>`
863                    }
864                }
865                Self::DocType => {
866                    let content = &chunk[..i];
867                    let balance = memchr::memchr2_iter(b'<', b'>', content)
868                        .map(|p| if content[p] == b'<' { 1i32 } else { -1 })
869                        .sum::<i32>();
870                    if balance == 0 {
871                        return Some((content, i + 1)); // +1 for `>`
872                    }
873                }
874            }
875        }
876        None
877    }
878    #[inline]
879    fn to_err(&self) -> Error {
880        let bang_str = match self {
881            Self::CData => "CData",
882            Self::Comment => "Comment",
883            Self::DocType => "DOCTYPE",
884        };
885        Error::UnexpectedEof(bang_str.to_string())
886    }
887}
888
/// Scanner position tracked by the state machine of [`XmlSource::read_element`]
#[derive(Clone, Copy)]
enum ReadElementState {
    /// Inside the element but outside of any attribute value; the initial state
    Elem,
    /// Inside an attribute value that is delimited by `'`
    SingleQ,
    /// Inside an attribute value that is delimited by `"`
    DoubleQ,
}
899impl ReadElementState {
900    /// Changes state by analyzing part of input.
901    /// Returns a tuple with part of chunk up to element closing symbol `>`
902    /// and a position after that symbol or `None` if such symbol was not found
903    #[inline(always)]
904    fn change<'b>(&mut self, chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
905        for i in memchr::memchr3_iter(b'>', b'\'', b'"', chunk) {
906            *self = match (*self, chunk[i]) {
907                // only allowed to match `>` while we are in state `Elem`
908                (Self::Elem, b'>') => return Some((&chunk[..i], i + 1)),
909                (Self::Elem, b'\'') => Self::SingleQ,
910                (Self::Elem, b'\"') => Self::DoubleQ,
911
912                // the only end_byte that gets us out if the same character
913                (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => Self::Elem,
914
915                // all other bytes: no state change
916                _ => *self,
917            };
918        }
919        None
920    }
921}
922
/// Tells whether the byte is a whitespace: a blank (` `), a tab (`\t`),
/// a new line (`\n`) or a carriage return (`\r`)
#[inline]
pub(crate) const fn is_whitespace(b: u8) -> bool {
    b == b' ' || b == b'\t' || b == b'\n' || b == b'\r'
}
928
929////////////////////////////////////////////////////////////////////////////////////////////////////
930
931#[cfg(test)]
932mod test {
933    /// Checks the internal implementation of the various reader methods
934    macro_rules! check {
935        (
936            #[$test:meta]
937            $read_event:ident,
938            $read_until_close:ident,
939            // constructor of the XML source on which internal functions will be called
940            $source:path,
941            // constructor of the buffer to which read data will stored
942            $buf:expr
943            $(, $async:ident, $await:ident)?
944        ) => {
945            mod read_bytes_until {
946                use super::*;
947                // Use Bytes for printing bytes as strings for ASCII range
948                use crate::utils::Bytes;
949                use pretty_assertions::assert_eq;
950
951                /// Checks that search in the empty buffer returns `None`
952                #[$test]
953                $($async)? fn empty() {
954                    let buf = $buf;
955                    let mut position = 0;
956                    let mut input = b"".as_ref();
957                    //                ^= 0
958
959                    assert_eq!(
960                        $source(&mut input)
961                            .read_bytes_until(b'*', buf, &mut position)
962                            $(.$await)?
963                            .unwrap()
964                            .map(Bytes),
965                        None
966                    );
967                    assert_eq!(position, 0);
968                }
969
970                /// Checks that search in the buffer non-existent value returns entire buffer
971                /// as a result and set `position` to `len()`
972                #[$test]
973                $($async)? fn non_existent() {
974                    let buf = $buf;
975                    let mut position = 0;
976                    let mut input = b"abcdef".as_ref();
977                    //                      ^= 6
978
979                    assert_eq!(
980                        $source(&mut input)
981                            .read_bytes_until(b'*', buf, &mut position)
982                            $(.$await)?
983                            .unwrap()
984                            .map(Bytes),
985                        Some(Bytes(b"abcdef"))
986                    );
987                    assert_eq!(position, 6);
988                }
989
990                /// Checks that search in the buffer an element that is located in the front of
991                /// buffer returns empty slice as a result and set `position` to one symbol
992                /// after match (`1`)
993                #[$test]
994                $($async)? fn at_the_start() {
995                    let buf = $buf;
996                    let mut position = 0;
997                    let mut input = b"*abcdef".as_ref();
998                    //                 ^= 1
999
1000                    assert_eq!(
1001                        $source(&mut input)
1002                            .read_bytes_until(b'*', buf, &mut position)
1003                            $(.$await)?
1004                            .unwrap()
1005                            .map(Bytes),
1006                        Some(Bytes(b""))
1007                    );
1008                    assert_eq!(position, 1); // position after the symbol matched
1009                }
1010
1011                /// Checks that search in the buffer an element that is located in the middle of
1012                /// buffer returns slice before that symbol as a result and set `position` to one
1013                /// symbol after match
1014                #[$test]
1015                $($async)? fn inside() {
1016                    let buf = $buf;
1017                    let mut position = 0;
1018                    let mut input = b"abc*def".as_ref();
1019                    //                    ^= 4
1020
1021                    assert_eq!(
1022                        $source(&mut input)
1023                            .read_bytes_until(b'*', buf, &mut position)
1024                            $(.$await)?
1025                            .unwrap()
1026                            .map(Bytes),
1027                        Some(Bytes(b"abc"))
1028                    );
1029                    assert_eq!(position, 4); // position after the symbol matched
1030                }
1031
1032                /// Checks that search in the buffer an element that is located in the end of
1033                /// buffer returns slice before that symbol as a result and set `position` to one
1034                /// symbol after match (`len()`)
1035                #[$test]
1036                $($async)? fn in_the_end() {
1037                    let buf = $buf;
1038                    let mut position = 0;
1039                    let mut input = b"abcdef*".as_ref();
1040                    //                       ^= 7
1041
1042                    assert_eq!(
1043                        $source(&mut input)
1044                            .read_bytes_until(b'*', buf, &mut position)
1045                            $(.$await)?
1046                            .unwrap()
1047                            .map(Bytes),
1048                        Some(Bytes(b"abcdef"))
1049                    );
1050                    assert_eq!(position, 7); // position after the symbol matched
1051                }
1052            }
1053
1054            mod read_bang_element {
1055                use super::*;
1056
1057                /// Checks that reading CDATA content works correctly
1058                mod cdata {
1059                    use super::*;
1060                    use crate::errors::Error;
1061                    use crate::reader::BangType;
1062                    use crate::utils::Bytes;
1063                    use pretty_assertions::assert_eq;
1064
1065                    /// Checks that if input begins like CDATA element, but CDATA start sequence
1066                    /// is not finished, parsing ends with an error
1067                    #[$test]
1068                    #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"]
1069                    $($async)? fn not_properly_start() {
1070                        let buf = $buf;
1071                        let mut position = 0;
1072                        let mut input = b"![]]>other content".as_ref();
1073                        //                ^= 0
1074
1075                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1076                            Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1077                            x => assert!(
1078                                false,
1079                                r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1080                                x
1081                            ),
1082                        }
1083                        assert_eq!(position, 0);
1084                    }
1085
1086                    /// Checks that if CDATA startup sequence was matched, but an end sequence
1087                    /// is not found, parsing ends with an error
1088                    #[$test]
1089                    $($async)? fn not_closed() {
1090                        let buf = $buf;
1091                        let mut position = 0;
1092                        let mut input = b"![CDATA[other content".as_ref();
1093                        //                ^= 0
1094
1095                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1096                            Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1097                            x => assert!(
1098                                false,
1099                                r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1100                                x
1101                            ),
1102                        }
1103                        assert_eq!(position, 0);
1104                    }
1105
1106                    /// Checks that CDATA element without content inside parsed successfully
1107                    #[$test]
1108                    $($async)? fn empty() {
1109                        let buf = $buf;
1110                        let mut position = 0;
1111                        let mut input = b"![CDATA[]]>other content".as_ref();
1112                        //                           ^= 11
1113
1114                        assert_eq!(
1115                            $source(&mut input)
1116                                .read_bang_element(buf, &mut position)
1117                                $(.$await)?
1118                                .unwrap()
1119                                .map(|(ty, data)| (ty, Bytes(data))),
1120                            Some((BangType::CData, Bytes(b"![CDATA[]]")))
1121                        );
1122                        assert_eq!(position, 11);
1123                    }
1124
1125                    /// Checks that CDATA element with content parsed successfully.
1126                    /// Additionally checks that sequences inside CDATA that may look like
1127                    /// a CDATA end sequence do not interrupt CDATA parsing
1128                    #[$test]
1129                    $($async)? fn with_content() {
1130                        let buf = $buf;
1131                        let mut position = 0;
1132                        let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref();
1133                        //                                            ^= 28
1134
1135                        assert_eq!(
1136                            $source(&mut input)
1137                                .read_bang_element(buf, &mut position)
1138                                $(.$await)?
1139                                .unwrap()
1140                                .map(|(ty, data)| (ty, Bytes(data))),
1141                            Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]")))
1142                        );
1143                        assert_eq!(position, 28);
1144                    }
1145                }
1146
1147                /// Checks that reading XML comments works correctly. According to the [specification],
1148                /// comment data can contain any sequence except `--`:
1149                ///
1150                /// ```peg
1151                /// comment = '<!--' (!'--' char)* '-->';
1152                /// char = [#x1-#x2C]
1153                ///      / [#x2E-#xD7FF]
1154                ///      / [#xE000-#xFFFD]
1155                ///      / [#x10000-#x10FFFF]
1156                /// ```
1157                ///
1158                /// This limitation, however, is simply a poorly designed part of the specification
1159                /// (maybe for the purpose of building an LL(1) XML parser) and quick-xml does not check for
1160                /// the presence of these sequences by default. These tests allow such content.
1161                ///
1162                /// [specification]: https://www.w3.org/TR/xml11/#dt-comment
1163                mod comment {
1164                    use super::*;
1165                    use crate::errors::Error;
1166                    use crate::reader::BangType;
1167                    use crate::utils::Bytes;
1168                    use pretty_assertions::assert_eq;
1169
1170                    #[$test]
1171                    #[ignore = "start comment sequence fully checked outside of `read_bang_element`"]
1172                    $($async)? fn not_properly_start() {
1173                        let buf = $buf;
1174                        let mut position = 0;
1175                        let mut input = b"!- -->other content".as_ref();
1176                        //                ^= 0
1177
1178                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1179                            Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1180                            x => assert!(
1181                                false,
1182                                r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1183                                x
1184                            ),
1185                        }
1186                        assert_eq!(position, 0);
1187                    }
1188
1189                    #[$test]
1190                    $($async)? fn not_properly_end() {
1191                        let buf = $buf;
1192                        let mut position = 0;
1193                        let mut input = b"!->other content".as_ref();
1194                        //                ^= 0
1195
1196                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1197                            Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1198                            x => assert!(
1199                                false,
1200                                r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1201                                x
1202                            ),
1203                        }
1204                        assert_eq!(position, 0);
1205                    }
1206
1207                    #[$test]
1208                    $($async)? fn not_closed1() {
1209                        let buf = $buf;
1210                        let mut position = 0;
1211                        let mut input = b"!--other content".as_ref();
1212                        //                ^= 0
1213
1214                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1215                            Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1216                            x => assert!(
1217                                false,
1218                                r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1219                                x
1220                            ),
1221                        }
1222                        assert_eq!(position, 0);
1223                    }
1224
1225                    #[$test]
1226                    $($async)? fn not_closed2() {
1227                        let buf = $buf;
1228                        let mut position = 0;
1229                        let mut input = b"!-->other content".as_ref();
1230                        //                ^= 0
1231
1232                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1233                            Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1234                            x => assert!(
1235                                false,
1236                                r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1237                                x
1238                            ),
1239                        }
1240                        assert_eq!(position, 0);
1241                    }
1242
1243                    #[$test]
1244                    $($async)? fn not_closed3() {
1245                        let buf = $buf;
1246                        let mut position = 0;
1247                        let mut input = b"!--->other content".as_ref();
1248                        //                ^= 0
1249
1250                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1251                            Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1252                            x => assert!(
1253                                false,
1254                                r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1255                                x
1256                            ),
1257                        }
1258                        assert_eq!(position, 0);
1259                    }
1260
1261                    #[$test]
1262                    $($async)? fn empty() {
1263                        let buf = $buf;
1264                        let mut position = 0;
1265                        let mut input = b"!---->other content".as_ref();
1266                        //                      ^= 6
1267
1268                        assert_eq!(
1269                            $source(&mut input)
1270                                .read_bang_element(buf, &mut position)
1271                                $(.$await)?
1272                                .unwrap()
1273                                .map(|(ty, data)| (ty, Bytes(data))),
1274                            Some((BangType::Comment, Bytes(b"!----")))
1275                        );
1276                        assert_eq!(position, 6);
1277                    }
1278
1279                    #[$test]
1280                    $($async)? fn with_content() {
1281                        let buf = $buf;
1282                        let mut position = 0;
1283                        let mut input = b"!--->comment<--->other content".as_ref();
1284                        //                                 ^= 17
1285
1286                        assert_eq!(
1287                            $source(&mut input)
1288                                .read_bang_element(buf, &mut position)
1289                                $(.$await)?
1290                                .unwrap()
1291                                .map(|(ty, data)| (ty, Bytes(data))),
1292                            Some((BangType::Comment, Bytes(b"!--->comment<---")))
1293                        );
1294                        assert_eq!(position, 17);
1295                    }
1296                }
1297
1298                /// Checks that reading DOCTYPE definition works correctly
1299                mod doctype {
1300                    use super::*;
1301
1302                    mod uppercase {
1303                        use super::*;
1304                        use crate::errors::Error;
1305                        use crate::reader::BangType;
1306                        use crate::utils::Bytes;
1307                        use pretty_assertions::assert_eq;
1308
1309                        #[$test]
1310                        $($async)? fn not_properly_start() {
1311                            let buf = $buf;
1312                            let mut position = 0;
1313                            let mut input = b"!D other content".as_ref();
1314                            //                ^= 0
1315
1316                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1317                                Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1318                                x => assert!(
1319                                    false,
1320                                    r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1321                                    x
1322                                ),
1323                            }
1324                            assert_eq!(position, 0);
1325                        }
1326
1327                        #[$test]
1328                        $($async)? fn without_space() {
1329                            let buf = $buf;
1330                            let mut position = 0;
1331                            let mut input = b"!DOCTYPEother content".as_ref();
1332                            //                ^= 0
1333
1334                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1335                                Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1336                                x => assert!(
1337                                    false,
1338                                    r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1339                                    x
1340                                ),
1341                            }
1342                            assert_eq!(position, 0);
1343                        }
1344
1345                        #[$test]
1346                        $($async)? fn empty() {
1347                            let buf = $buf;
1348                            let mut position = 0;
1349                            let mut input = b"!DOCTYPE>other content".as_ref();
1350                            //                         ^= 9
1351
1352                            assert_eq!(
1353                                $source(&mut input)
1354                                    .read_bang_element(buf, &mut position)
1355                                    $(.$await)?
1356                                    .unwrap()
1357                                    .map(|(ty, data)| (ty, Bytes(data))),
1358                                Some((BangType::DocType, Bytes(b"!DOCTYPE")))
1359                            );
1360                            assert_eq!(position, 9);
1361                        }
1362
1363                        #[$test]
1364                        $($async)? fn not_closed() {
1365                            let buf = $buf;
1366                            let mut position = 0;
1367                            let mut input = b"!DOCTYPE other content".as_ref();
1368                            //                ^= 0
1369
1370                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1371                                Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1372                                x => assert!(
1373                                    false,
1374                                    r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1375                                    x
1376                                ),
1377                            }
1378                            assert_eq!(position, 0);
1379                        }
1380                    }
1381
1382                    mod lowercase {
1383                        use super::*;
1384                        use crate::errors::Error;
1385                        use crate::reader::BangType;
1386                        use crate::utils::Bytes;
1387                        use pretty_assertions::assert_eq;
1388
1389                        #[$test]
1390                        $($async)? fn not_properly_start() {
1391                            let buf = $buf;
1392                            let mut position = 0;
1393                            let mut input = b"!d other content".as_ref();
1394                            //                ^= 0
1395
1396                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1397                                Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1398                                x => assert!(
1399                                    false,
1400                                    r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1401                                    x
1402                                ),
1403                            }
1404                            assert_eq!(position, 0);
1405                        }
1406
1407                        #[$test]
1408                        $($async)? fn without_space() {
1409                            let buf = $buf;
1410                            let mut position = 0;
1411                            let mut input = b"!doctypeother content".as_ref();
1412                            //                ^= 0
1413
1414                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1415                                Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1416                                x => assert!(
1417                                    false,
1418                                    r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1419                                    x
1420                                ),
1421                            }
1422                            assert_eq!(position, 0);
1423                        }
1424
1425                        #[$test]
1426                        $($async)? fn empty() {
1427                            let buf = $buf;
1428                            let mut position = 0;
1429                            let mut input = b"!doctype>other content".as_ref();
1430                            //                         ^= 9
1431
1432                            assert_eq!(
1433                                $source(&mut input)
1434                                    .read_bang_element(buf, &mut position)
1435                                    $(.$await)?
1436                                    .unwrap()
1437                                    .map(|(ty, data)| (ty, Bytes(data))),
1438                                Some((BangType::DocType, Bytes(b"!doctype")))
1439                            );
1440                            assert_eq!(position, 9);
1441                        }
1442
1443                        #[$test]
1444                        $($async)? fn not_closed() {
1445                            let buf = $buf;
1446                            let mut position = 0;
1447                            let mut input = b"!doctype other content".as_ref();
1448                            //                ^= 0
1449
1450                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1451                                Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1452                                x => assert!(
1453                                    false,
1454                                    r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1455                                    x
1456                                ),
1457                            }
1458                            assert_eq!(position, 0);
1459                        }
1460                    }
1461                }
1462            }
1463
1464            mod read_element {
1465                use super::*;
1466                use crate::utils::Bytes;
1467                use pretty_assertions::assert_eq;
1468
1469                /// Checks that nothing was read from empty buffer
1470                #[$test]
1471                $($async)? fn empty() {
1472                    let buf = $buf;
1473                    let mut position = 0;
1474                    let mut input = b"".as_ref();
1475                    //                ^= 0
1476
1477                    assert_eq!(
1478                        $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1479                        None
1480                    );
1481                    assert_eq!(position, 0);
1482                }
1483
1484                mod open {
1485                    use super::*;
1486                    use crate::utils::Bytes;
1487                    use pretty_assertions::assert_eq;
1488
1489                    #[$test]
1490                    $($async)? fn empty_tag() {
1491                        let buf = $buf;
1492                        let mut position = 0;
1493                        let mut input = b">".as_ref();
1494                        //                 ^= 1
1495
1496                        assert_eq!(
1497                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1498                            Some(Bytes(b""))
1499                        );
1500                        assert_eq!(position, 1);
1501                    }
1502
1503                    #[$test]
1504                    $($async)? fn normal() {
1505                        let buf = $buf;
1506                        let mut position = 0;
1507                        let mut input = b"tag>".as_ref();
1508                        //                    ^= 4
1509
1510                        assert_eq!(
1511                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1512                            Some(Bytes(b"tag"))
1513                        );
1514                        assert_eq!(position, 4);
1515                    }
1516
1517                    #[$test]
1518                    $($async)? fn empty_ns_empty_tag() {
1519                        let buf = $buf;
1520                        let mut position = 0;
1521                        let mut input = b":>".as_ref();
1522                        //                  ^= 2
1523
1524                        assert_eq!(
1525                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1526                            Some(Bytes(b":"))
1527                        );
1528                        assert_eq!(position, 2);
1529                    }
1530
1531                    #[$test]
1532                    $($async)? fn empty_ns() {
1533                        let buf = $buf;
1534                        let mut position = 0;
1535                        let mut input = b":tag>".as_ref();
1536                        //                     ^= 5
1537
1538                        assert_eq!(
1539                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1540                            Some(Bytes(b":tag"))
1541                        );
1542                        assert_eq!(position, 5);
1543                    }
1544
1545                    #[$test]
1546                    $($async)? fn with_attributes() {
1547                        let buf = $buf;
1548                        let mut position = 0;
1549                        let mut input = br#"tag  attr-1=">"  attr2  =  '>'  3attr>"#.as_ref();
1550                        //                                                        ^= 38
1551
1552                        assert_eq!(
1553                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1554                            Some(Bytes(br#"tag  attr-1=">"  attr2  =  '>'  3attr"#))
1555                        );
1556                        assert_eq!(position, 38);
1557                    }
1558                }
1559
1560                mod self_closed {
1561                    use super::*;
1562                    use crate::utils::Bytes;
1563                    use pretty_assertions::assert_eq;
1564
1565                    #[$test]
1566                    $($async)? fn empty_tag() {
1567                        let buf = $buf;
1568                        let mut position = 0;
1569                        let mut input = b"/>".as_ref();
1570                        //                  ^= 2
1571
1572                        assert_eq!(
1573                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1574                            Some(Bytes(b"/"))
1575                        );
1576                        assert_eq!(position, 2);
1577                    }
1578
1579                    #[$test]
1580                    $($async)? fn normal() {
1581                        let buf = $buf;
1582                        let mut position = 0;
1583                        let mut input = b"tag/>".as_ref();
1584                        //                     ^= 5
1585
1586                        assert_eq!(
1587                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1588                            Some(Bytes(b"tag/"))
1589                        );
1590                        assert_eq!(position, 5);
1591                    }
1592
1593                    #[$test]
1594                    $($async)? fn empty_ns_empty_tag() {
1595                        let buf = $buf;
1596                        let mut position = 0;
1597                        let mut input = b":/>".as_ref();
1598                        //                   ^= 3
1599
1600                        assert_eq!(
1601                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1602                            Some(Bytes(b":/"))
1603                        );
1604                        assert_eq!(position, 3);
1605                    }
1606
1607                    #[$test]
1608                    $($async)? fn empty_ns() {
1609                        let buf = $buf;
1610                        let mut position = 0;
1611                        let mut input = b":tag/>".as_ref();
1612                        //                      ^= 6
1613
1614                        assert_eq!(
1615                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1616                            Some(Bytes(b":tag/"))
1617                        );
1618                        assert_eq!(position, 6);
1619                    }
1620
1621                    #[$test]
1622                    $($async)? fn with_attributes() {
1623                        let buf = $buf;
1624                        let mut position = 0;
1625                        let mut input = br#"tag  attr-1="/>"  attr2  =  '/>'  3attr/>"#.as_ref();
1626                        //                                                           ^= 41
1627
1628                        assert_eq!(
1629                            $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1630                            Some(Bytes(br#"tag  attr-1="/>"  attr2  =  '/>'  3attr/"#))
1631                        );
1632                        assert_eq!(position, 41);
1633                    }
1634                }
1635            }
1636
1637            mod issue_344 {
1638                use crate::errors::Error;
1639                use crate::reader::Reader;
1640
1641                #[$test]
1642                $($async)? fn cdata() {
1643                    let mut reader = Reader::from_str("![]]>");
1644
1645                    match reader.$read_until_close($buf) $(.$await)? {
1646                        Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1647                        x => assert!(
1648                            false,
1649                            r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1650                            x
1651                        ),
1652                    }
1653                }
1654
1655                #[$test]
1656                $($async)? fn comment() {
1657                    let mut reader = Reader::from_str("!- -->");
1658
1659                    match reader.$read_until_close($buf) $(.$await)? {
1660                        Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1661                        x => assert!(
1662                            false,
1663                            r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1664                            x
1665                        ),
1666                    }
1667                }
1668
1669                #[$test]
1670                $($async)? fn doctype_uppercase() {
1671                    let mut reader = Reader::from_str("!D>");
1672
1673                    match reader.$read_until_close($buf) $(.$await)? {
1674                        Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1675                        x => assert!(
1676                            false,
1677                            r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1678                            x
1679                        ),
1680                    }
1681                }
1682
1683                #[$test]
1684                $($async)? fn doctype_lowercase() {
1685                    let mut reader = Reader::from_str("!d>");
1686
1687                    match reader.$read_until_close($buf) $(.$await)? {
1688                        Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1689                        x => assert!(
1690                            false,
1691                            r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1692                            x
1693                        ),
1694                    }
1695                }
1696            }
1697
1698            /// Ensures, that no empty `Text` events are generated
1699            mod $read_event {
1700                use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
1701                use crate::reader::Reader;
1702                use pretty_assertions::assert_eq;
1703
1704                /// When `encoding` feature is enabled, encoding should be detected
1705                /// from BOM (UTF-8) and BOM should be stripped.
1706                ///
1707                /// When `encoding` feature is disabled, UTF-8 is assumed and BOM
1708                /// character should be stripped for consistency
1709                #[$test]
1710                $($async)? fn bom_from_reader() {
1711                    let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes());
1712
1713                    assert_eq!(
1714                        reader.$read_event($buf) $(.$await)? .unwrap(),
1715                        Event::Text(BytesText::from_escaped("\u{feff}"))
1716                    );
1717
1718                    assert_eq!(
1719                        reader.$read_event($buf) $(.$await)? .unwrap(),
1720                        Event::Eof
1721                    );
1722                }
1723
1724                /// When parsing from &str, encoding is fixed (UTF-8), so
1725                /// - when `encoding` feature is disabled, the behavior the
1726                ///   same as in `bom_from_reader` text
1727                /// - when `encoding` feature is enabled, the behavior should
1728                ///   stay consistent, so the first BOM character is stripped
1729                #[$test]
1730                $($async)? fn bom_from_str() {
1731                    let mut reader = Reader::from_str("\u{feff}\u{feff}");
1732
1733                    assert_eq!(
1734                        reader.$read_event($buf) $(.$await)? .unwrap(),
1735                        Event::Text(BytesText::from_escaped("\u{feff}"))
1736                    );
1737
1738                    assert_eq!(
1739                        reader.$read_event($buf) $(.$await)? .unwrap(),
1740                        Event::Eof
1741                    );
1742                }
1743
1744                #[$test]
1745                $($async)? fn declaration() {
1746                    let mut reader = Reader::from_str("<?xml ?>");
1747
1748                    assert_eq!(
1749                        reader.$read_event($buf) $(.$await)? .unwrap(),
1750                        Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
1751                    );
1752                }
1753
1754                #[$test]
1755                $($async)? fn doctype() {
1756                    let mut reader = Reader::from_str("<!DOCTYPE x>");
1757
1758                    assert_eq!(
1759                        reader.$read_event($buf) $(.$await)? .unwrap(),
1760                        Event::DocType(BytesText::from_escaped("x"))
1761                    );
1762                }
1763
1764                #[$test]
1765                $($async)? fn processing_instruction() {
1766                    let mut reader = Reader::from_str("<?xml-stylesheet?>");
1767
1768                    assert_eq!(
1769                        reader.$read_event($buf) $(.$await)? .unwrap(),
1770                        Event::PI(BytesText::from_escaped("xml-stylesheet"))
1771                    );
1772                }
1773
1774                #[$test]
1775                $($async)? fn start() {
1776                    let mut reader = Reader::from_str("<tag>");
1777
1778                    assert_eq!(
1779                        reader.$read_event($buf) $(.$await)? .unwrap(),
1780                        Event::Start(BytesStart::new("tag"))
1781                    );
1782                }
1783
1784                #[$test]
1785                $($async)? fn end() {
1786                    let mut reader = Reader::from_str("</tag>");
1787                    // Because we expect invalid XML, do not check that
1788                    // the end name paired with the start name
1789                    reader.check_end_names(false);
1790
1791                    assert_eq!(
1792                        reader.$read_event($buf) $(.$await)? .unwrap(),
1793                        Event::End(BytesEnd::new("tag"))
1794                    );
1795                }
1796
1797                #[$test]
1798                $($async)? fn empty() {
1799                    let mut reader = Reader::from_str("<tag/>");
1800
1801                    assert_eq!(
1802                        reader.$read_event($buf) $(.$await)? .unwrap(),
1803                        Event::Empty(BytesStart::new("tag"))
1804                    );
1805                }
1806
1807                #[$test]
1808                $($async)? fn text() {
1809                    let mut reader = Reader::from_str("text");
1810
1811                    assert_eq!(
1812                        reader.$read_event($buf) $(.$await)? .unwrap(),
1813                        Event::Text(BytesText::from_escaped("text"))
1814                    );
1815                }
1816
1817                #[$test]
1818                $($async)? fn cdata() {
1819                    let mut reader = Reader::from_str("<![CDATA[]]>");
1820
1821                    assert_eq!(
1822                        reader.$read_event($buf) $(.$await)? .unwrap(),
1823                        Event::CData(BytesCData::new(""))
1824                    );
1825                }
1826
1827                #[$test]
1828                $($async)? fn comment() {
1829                    let mut reader = Reader::from_str("<!---->");
1830
1831                    assert_eq!(
1832                        reader.$read_event($buf) $(.$await)? .unwrap(),
1833                        Event::Comment(BytesText::from_escaped(""))
1834                    );
1835                }
1836
1837                #[$test]
1838                $($async)? fn eof() {
1839                    let mut reader = Reader::from_str("");
1840
1841                    assert_eq!(
1842                        reader.$read_event($buf) $(.$await)? .unwrap(),
1843                        Event::Eof
1844                    );
1845                }
1846            }
1847        };
1848    }
1849
1850    /// Tests for https://github.com/tafia/quick-xml/issues/469
1851    macro_rules! small_buffers {
1852        (
1853            #[$test:meta]
1854            $read_event:ident: $BufReader:ty
1855            $(, $async:ident, $await:ident)?
1856        ) => {
1857            mod small_buffers {
1858                use crate::events::{BytesCData, BytesDecl, BytesStart, BytesText, Event};
1859                use crate::reader::Reader;
1860                use pretty_assertions::assert_eq;
1861
1862                #[$test]
1863                $($async)? fn decl() {
1864                    let xml = "<?xml ?>";
1865                    //         ^^^^^^^ data that fit into buffer
1866                    let size = xml.match_indices("?>").next().unwrap().0 + 1;
1867                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1868                    let mut reader = Reader::from_reader(br);
1869                    let mut buf = Vec::new();
1870
1871                    assert_eq!(
1872                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1873                        Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
1874                    );
1875                    assert_eq!(
1876                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1877                        Event::Eof
1878                    );
1879                }
1880
1881                #[$test]
1882                $($async)? fn pi() {
1883                    let xml = "<?pi?>";
1884                    //         ^^^^^ data that fit into buffer
1885                    let size = xml.match_indices("?>").next().unwrap().0 + 1;
1886                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1887                    let mut reader = Reader::from_reader(br);
1888                    let mut buf = Vec::new();
1889
1890                    assert_eq!(
1891                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1892                        Event::PI(BytesText::new("pi"))
1893                    );
1894                    assert_eq!(
1895                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1896                        Event::Eof
1897                    );
1898                }
1899
1900                #[$test]
1901                $($async)? fn empty() {
1902                    let xml = "<empty/>";
1903                    //         ^^^^^^^ data that fit into buffer
1904                    let size = xml.match_indices("/>").next().unwrap().0 + 1;
1905                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1906                    let mut reader = Reader::from_reader(br);
1907                    let mut buf = Vec::new();
1908
1909                    assert_eq!(
1910                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1911                        Event::Empty(BytesStart::new("empty"))
1912                    );
1913                    assert_eq!(
1914                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1915                        Event::Eof
1916                    );
1917                }
1918
1919                #[$test]
1920                $($async)? fn cdata1() {
1921                    let xml = "<![CDATA[cdata]]>";
1922                    //         ^^^^^^^^^^^^^^^ data that fit into buffer
1923                    let size = xml.match_indices("]]>").next().unwrap().0 + 1;
1924                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1925                    let mut reader = Reader::from_reader(br);
1926                    let mut buf = Vec::new();
1927
1928                    assert_eq!(
1929                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1930                        Event::CData(BytesCData::new("cdata"))
1931                    );
1932                    assert_eq!(
1933                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1934                        Event::Eof
1935                    );
1936                }
1937
1938                #[$test]
1939                $($async)? fn cdata2() {
1940                    let xml = "<![CDATA[cdata]]>";
1941                    //         ^^^^^^^^^^^^^^^^ data that fit into buffer
1942                    let size = xml.match_indices("]]>").next().unwrap().0 + 2;
1943                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1944                    let mut reader = Reader::from_reader(br);
1945                    let mut buf = Vec::new();
1946
1947                    assert_eq!(
1948                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1949                        Event::CData(BytesCData::new("cdata"))
1950                    );
1951                    assert_eq!(
1952                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1953                        Event::Eof
1954                    );
1955                }
1956
1957                #[$test]
1958                $($async)? fn comment1() {
1959                    let xml = "<!--comment-->";
1960                    //         ^^^^^^^^^^^^ data that fit into buffer
1961                    let size = xml.match_indices("-->").next().unwrap().0 + 1;
1962                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1963                    let mut reader = Reader::from_reader(br);
1964                    let mut buf = Vec::new();
1965
1966                    assert_eq!(
1967                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1968                        Event::Comment(BytesText::new("comment"))
1969                    );
1970                    assert_eq!(
1971                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1972                        Event::Eof
1973                    );
1974                }
1975
1976                #[$test]
1977                $($async)? fn comment2() {
1978                    let xml = "<!--comment-->";
1979                    //         ^^^^^^^^^^^^^ data that fit into buffer
1980                    let size = xml.match_indices("-->").next().unwrap().0 + 2;
1981                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1982                    let mut reader = Reader::from_reader(br);
1983                    let mut buf = Vec::new();
1984
1985                    assert_eq!(
1986                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1987                        Event::Comment(BytesText::new("comment"))
1988                    );
1989                    assert_eq!(
1990                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1991                        Event::Eof
1992                    );
1993                }
1994            }
1995        };
1996    }
1997
1998    // Export macros for the child modules:
1999    // - buffered_reader
2000    // - slice_reader
2001    pub(super) use check;
2002    pub(super) use small_buffers;
2003}