quick_xml/reader/
state.rs

1#[cfg(feature = "encoding")]
2use encoding_rs::UTF_8;
3
4use crate::encoding::Decoder;
5use crate::errors::{Error, Result};
6use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
7#[cfg(feature = "encoding")]
8use crate::reader::EncodingRef;
9use crate::reader::{is_whitespace, BangType, ParseState};
10
11use memchr;
12
13/// A struct that holds a current reader state and a parser configuration.
14/// It is independent on a way of reading data: the reader feed data into it and
15/// get back produced [`Event`]s.
16#[derive(Clone)]
17pub(super) struct ReaderState {
18    /// Number of bytes read from the source of data since the reader was created
19    pub offset: usize,
20    /// Defines how to process next byte
21    pub state: ParseState,
22    /// Expand empty element into an opening and closing element
23    pub expand_empty_elements: bool,
24    /// Trims leading whitespace in Text events, skip the element if text is empty
25    pub trim_text_start: bool,
26    /// Trims trailing whitespace in Text events.
27    pub trim_text_end: bool,
28    /// Trims trailing whitespaces from markup names in closing tags `</a >`
29    pub trim_markup_names_in_closing_tags: bool,
30    /// Check if [`Event::End`] nodes match last [`Event::Start`] node
31    pub check_end_names: bool,
32    /// Check if comments contains `--` (false per default)
33    pub check_comments: bool,
34    /// All currently Started elements which didn't have a matching
35    /// End element yet.
36    ///
37    /// For an XML
38    ///
39    /// ```xml
40    /// <root><one/><inner attr="value">|<tag></inner></root>
41    /// ```
42    /// when cursor at the `|` position buffer contains:
43    ///
44    /// ```text
45    /// rootinner
46    /// ^   ^
47    /// ```
48    ///
49    /// The `^` symbols shows which positions stored in the [`Self::opened_starts`]
50    /// (0 and 4 in that case).
51    opened_buffer: Vec<u8>,
52    /// Opened name start indexes into [`Self::opened_buffer`]. See documentation
53    /// for that field for details
54    opened_starts: Vec<usize>,
55
56    #[cfg(feature = "encoding")]
57    /// Reference to the encoding used to read an XML
58    pub encoding: EncodingRef,
59}
60
61impl ReaderState {
62    /// Trims whitespaces from `bytes`, if required, and returns a [`Text`] event.
63    ///
64    /// # Parameters
65    /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
66    ///
67    /// [`Text`]: Event::Text
68    pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> Result<Event<'b>> {
69        let mut content = bytes;
70
71        if self.trim_text_end {
72            // Skip the ending '<'
73            let len = bytes
74                .iter()
75                .rposition(|&b| !is_whitespace(b))
76                .map_or_else(|| bytes.len(), |p| p + 1);
77            content = &bytes[..len];
78        }
79
80        Ok(Event::Text(BytesText::wrap(content, self.decoder())))
81    }
82
83    /// reads `BytesElement` starting with a `!`,
84    /// return `Comment`, `CData` or `DocType` event
85    pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
86        let uncased_starts_with = |string: &[u8], prefix: &[u8]| {
87            string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
88        };
89
90        let len = buf.len();
91        match bang_type {
92            BangType::Comment if buf.starts_with(b"!--") => {
93                debug_assert!(buf.ends_with(b"--"));
94                if self.check_comments {
95                    // search if '--' not in comments
96                    if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2])
97                        .position(|p| buf[3 + p + 1] == b'-')
98                    {
99                        self.offset += len - p;
100                        return Err(Error::UnexpectedToken("--".to_string()));
101                    }
102                }
103                Ok(Event::Comment(BytesText::wrap(
104                    &buf[3..len - 2],
105                    self.decoder(),
106                )))
107            }
108            BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
109                debug_assert!(buf.ends_with(b"]]"));
110                Ok(Event::CData(BytesCData::wrap(
111                    &buf[8..len - 2],
112                    self.decoder(),
113                )))
114            }
115            BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
116                let start = buf[8..]
117                    .iter()
118                    .position(|b| !is_whitespace(*b))
119                    .unwrap_or(len - 8);
120                if start + 8 >= len {
121                    return Err(Error::EmptyDocType);
122                }
123                Ok(Event::DocType(BytesText::wrap(
124                    &buf[8 + start..],
125                    self.decoder(),
126                )))
127            }
128            _ => Err(bang_type.to_err()),
129        }
130    }
131
132    /// Wraps content of `buf` into the [`Event::End`] event. Does the check that
133    /// end name matches the last opened start name if `self.check_end_names` is set.
134    pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
135        // Strip the `/` character. `content` contains data between `</` and `>`
136        let content = &buf[1..];
137        // XML standard permits whitespaces after the markup name in closing tags.
138        // Let's strip them from the buffer before comparing tag names.
139        let name = if self.trim_markup_names_in_closing_tags {
140            if let Some(pos_end_name) = content.iter().rposition(|&b| !is_whitespace(b)) {
141                &content[..pos_end_name + 1]
142            } else {
143                content
144            }
145        } else {
146            content
147        };
148
149        let decoder = self.decoder();
150        let mismatch_err = |expected: String, found: &[u8], offset: &mut usize| {
151            *offset -= buf.len();
152            Err(Error::EndEventMismatch {
153                expected,
154                found: decoder.decode(found).unwrap_or_default().into_owned(),
155            })
156        };
157
158        // Get the index in self.opened_buffer of the name of the last opened tag
159        match self.opened_starts.pop() {
160            Some(start) => {
161                if self.check_end_names {
162                    let expected = &self.opened_buffer[start..];
163                    if name != expected {
164                        let expected = decoder.decode(expected).unwrap_or_default().into_owned();
165                        // #513: In order to allow error recovery we should drop content of the buffer
166                        self.opened_buffer.truncate(start);
167
168                        return mismatch_err(expected, name, &mut self.offset);
169                    }
170                }
171
172                self.opened_buffer.truncate(start);
173            }
174            None => {
175                if self.check_end_names {
176                    return mismatch_err("".to_string(), &buf[1..], &mut self.offset);
177                }
178            }
179        }
180
181        Ok(Event::End(BytesEnd::wrap(name.into())))
182    }
183
184    /// reads `BytesElement` starting with a `?`,
185    /// return `Decl` or `PI` event
186    pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
187        let len = buf.len();
188        if len > 2 && buf[len - 1] == b'?' {
189            if len > 5 && &buf[1..4] == b"xml" && is_whitespace(buf[4]) {
190                let event = BytesDecl::from_start(BytesStart::wrap(&buf[1..len - 1], 3));
191
192                // Try getting encoding from the declaration event
193                #[cfg(feature = "encoding")]
194                if self.encoding.can_be_refined() {
195                    if let Some(encoding) = event.encoder() {
196                        self.encoding = EncodingRef::XmlDetected(encoding);
197                    }
198                }
199
200                Ok(Event::Decl(event))
201            } else {
202                Ok(Event::PI(BytesText::wrap(&buf[1..len - 1], self.decoder())))
203            }
204        } else {
205            self.offset -= len;
206            Err(Error::UnexpectedEof("XmlDecl".to_string()))
207        }
208    }
209
210    /// Converts content of a tag to a `Start` or an `Empty` event
211    ///
212    /// # Parameters
213    /// - `content`: Content of a tag between `<` and `>`
214    pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Result<Event<'b>> {
215        let len = content.len();
216        let name_end = content
217            .iter()
218            .position(|&b| is_whitespace(b))
219            .unwrap_or(len);
220        if let Some(&b'/') = content.last() {
221            // This is self-closed tag `<something/>`
222            let name_len = if name_end < len { name_end } else { len - 1 };
223            let event = BytesStart::wrap(&content[..len - 1], name_len);
224
225            if self.expand_empty_elements {
226                self.state = ParseState::Empty;
227                self.opened_starts.push(self.opened_buffer.len());
228                self.opened_buffer.extend(&content[..name_len]);
229                Ok(Event::Start(event))
230            } else {
231                Ok(Event::Empty(event))
232            }
233        } else {
234            // #514: Always store names event when .check_end_names == false,
235            // because checks can be temporary disabled and when they would be
236            // enabled, we should have that information
237            self.opened_starts.push(self.opened_buffer.len());
238            self.opened_buffer.extend(&content[..name_end]);
239            Ok(Event::Start(BytesStart::wrap(content, name_end)))
240        }
241    }
242
243    #[inline]
244    pub fn close_expanded_empty(&mut self) -> Result<Event<'static>> {
245        self.state = ParseState::ClosedTag;
246        let name = self
247            .opened_buffer
248            .split_off(self.opened_starts.pop().unwrap());
249        Ok(Event::End(BytesEnd::wrap(name.into())))
250    }
251
252    /// Get the decoder, used to decode bytes, read by this reader, to the strings.
253    ///
254    /// If [`encoding`] feature is enabled, the used encoding may change after
255    /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
256    ///
257    /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
258    /// defaults to UTF-8.
259    ///
260    /// [`encoding`]: ../../index.html#encoding
261    pub fn decoder(&self) -> Decoder {
262        Decoder {
263            #[cfg(feature = "encoding")]
264            encoding: self.encoding.encoding(),
265        }
266    }
267}
268
269impl Default for ReaderState {
270    fn default() -> Self {
271        Self {
272            offset: 0,
273            state: ParseState::Init,
274            expand_empty_elements: false,
275            trim_text_start: false,
276            trim_text_end: false,
277            trim_markup_names_in_closing_tags: true,
278            check_end_names: true,
279            check_comments: false,
280            opened_buffer: Vec::new(),
281            opened_starts: Vec::new(),
282
283            #[cfg(feature = "encoding")]
284            encoding: EncodingRef::Implicit(UTF_8),
285        }
286    }
287}