xmlparser/
stream.rs

1use core::char;
2use core::cmp;
3use core::ops::Range;
4use core::str;
5
6use crate::{
7    StreamError,
8    StrSpan,
9    TextPos,
10    XmlByteExt,
11    XmlCharExt,
12};
13
14type Result<T> = ::core::result::Result<T, StreamError>;
15
16
/// A parsed XML [Reference](https://www.w3.org/TR/xml/#NT-Reference).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Reference<'a> {
    /// An entity reference, carrying the entity name found between `&` and `;`.
    ///
    /// <https://www.w3.org/TR/xml/#NT-EntityRef>
    Entity(&'a str),

    /// A character reference, already resolved to the character it denotes.
    ///
    /// <https://www.w3.org/TR/xml/#NT-CharRef>
    Char(char),
}
30
31
32/// A streaming XML parsing interface.
33#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
34pub struct Stream<'a> {
35    pos: usize,
36    end: usize,
37    span: StrSpan<'a>,
38}
39
40impl<'a> From<&'a str> for Stream<'a> {
41    #[inline]
42    fn from(text: &'a str) -> Self {
43        Stream {
44            pos: 0,
45            end: text.len(),
46            span: text.into(),
47        }
48    }
49}
50
51impl<'a> From<StrSpan<'a>> for Stream<'a> {
52    #[inline]
53    fn from(span: StrSpan<'a>) -> Self {
54        Stream {
55            pos: 0,
56            end: span.as_str().len(),
57            span,
58        }
59    }
60}
61
62impl<'a> Stream<'a> {
63    /// Creates a new stream from a specified `text` substring.
64    #[inline]
65    pub fn from_substr(text: &'a str, fragment: Range<usize>) -> Self {
66        Stream {
67            pos: fragment.start,
68            end: fragment.end,
69            span: text.into(),
70        }
71    }
72
73    /// Returns an underling string span.
74    #[inline]
75    pub fn span(&self) -> StrSpan<'a> {
76        self.span
77    }
78
79    /// Returns current position.
80    #[inline]
81    pub fn pos(&self) -> usize {
82        self.pos
83    }
84
85    /// Sets current position equal to the end.
86    ///
87    /// Used to indicate end of parsing on error.
88    #[inline]
89    pub fn jump_to_end(&mut self) {
90        self.pos = self.end;
91    }
92
93    /// Checks if the stream is reached the end.
94    ///
95    /// Any [`pos()`] value larger than original text length indicates stream end.
96    ///
97    /// Accessing stream after reaching end via safe methods will produce
98    /// an `UnexpectedEndOfStream` error.
99    ///
100    /// Accessing stream after reaching end via *_unchecked methods will produce
101    /// a Rust's bound checking error.
102    ///
103    /// [`pos()`]: #method.pos
104    #[inline]
105    pub fn at_end(&self) -> bool {
106        self.pos >= self.end
107    }
108
109    /// Returns a byte from a current stream position.
110    ///
111    /// # Errors
112    ///
113    /// - `UnexpectedEndOfStream`
114    #[inline]
115    pub fn curr_byte(&self) -> Result<u8> {
116        if self.at_end() {
117            return Err(StreamError::UnexpectedEndOfStream);
118        }
119
120        Ok(self.curr_byte_unchecked())
121    }
122
123    /// Returns a byte from a current stream position.
124    ///
125    /// # Panics
126    ///
127    /// - if the current position is after the end of the data
128    #[inline]
129    pub fn curr_byte_unchecked(&self) -> u8 {
130        self.span.as_bytes()[self.pos]
131    }
132
133    /// Returns a next byte from a current stream position.
134    ///
135    /// # Errors
136    ///
137    /// - `UnexpectedEndOfStream`
138    #[inline]
139    pub fn next_byte(&self) -> Result<u8> {
140        if self.pos + 1 >= self.end {
141            return Err(StreamError::UnexpectedEndOfStream);
142        }
143
144        Ok(self.span.as_bytes()[self.pos + 1])
145    }
146
147    /// Advances by `n` bytes.
148    ///
149    /// # Examples
150    ///
151    /// ```rust,should_panic
152    /// use xmlparser::Stream;
153    ///
154    /// let mut s = Stream::from("text");
155    /// s.advance(2); // ok
156    /// s.advance(20); // will cause a panic via debug_assert!().
157    /// ```
158    #[inline]
159    pub fn advance(&mut self, n: usize) {
160        debug_assert!(self.pos + n <= self.end);
161        self.pos += n;
162    }
163
164    /// Checks that the stream starts with a selected text.
165    ///
166    /// We are using `&[u8]` instead of `&str` for performance reasons.
167    ///
168    /// # Examples
169    ///
170    /// ```
171    /// use xmlparser::Stream;
172    ///
173    /// let mut s = Stream::from("Some text.");
174    /// s.advance(5);
175    /// assert_eq!(s.starts_with(b"text"), true);
176    /// assert_eq!(s.starts_with(b"long"), false);
177    /// ```
178    #[inline]
179    pub fn starts_with(&self, text: &[u8]) -> bool {
180        self.span.as_bytes()[self.pos..self.end].starts_with(text)
181    }
182
183    /// Consumes the current byte if it's equal to the provided byte.
184    ///
185    /// # Errors
186    ///
187    /// - `InvalidChar`
188    /// - `UnexpectedEndOfStream`
189    ///
190    /// # Examples
191    ///
192    /// ```
193    /// use xmlparser::Stream;
194    ///
195    /// let mut s = Stream::from("Some text.");
196    /// assert!(s.consume_byte(b'S').is_ok());
197    /// assert!(s.consume_byte(b'o').is_ok());
198    /// assert!(s.consume_byte(b'm').is_ok());
199    /// assert!(s.consume_byte(b'q').is_err());
200    /// ```
201    pub fn consume_byte(&mut self, c: u8) -> Result<()> {
202        let curr = self.curr_byte()?;
203        if curr != c {
204            return Err(StreamError::InvalidChar(curr, c, self.gen_text_pos()));
205        }
206
207        self.advance(1);
208        Ok(())
209    }
210
211    /// Tries to consume the current byte if it's equal to the provided byte.
212    ///
213    /// Unlike `consume_byte()` will not return any errors.
214    pub fn try_consume_byte(&mut self, c: u8) -> bool {
215        match self.curr_byte() {
216            Ok(b) if b == c => {
217                self.advance(1);
218                true
219            }
220            _ => false,
221        }
222    }
223
224    /// Skips selected string.
225    ///
226    /// # Errors
227    ///
228    /// - `InvalidString`
229    pub fn skip_string(&mut self, text: &'static [u8]) -> Result<()> {
230        if !self.starts_with(text) {
231            let pos = self.gen_text_pos();
232
233            // Assume that all input `text` are valid UTF-8 strings, so unwrap is safe.
234            let expected = str::from_utf8(text).unwrap();
235
236            return Err(StreamError::InvalidString(expected, pos));
237        }
238
239        self.advance(text.len());
240        Ok(())
241    }
242
243    /// Consumes bytes by the predicate and returns them.
244    ///
245    /// The result can be empty.
246    #[inline]
247    pub fn consume_bytes<F>(&mut self, f: F) -> StrSpan<'a>
248        where F: Fn(&Stream, u8) -> bool
249    {
250        let start = self.pos;
251        self.skip_bytes(f);
252        self.slice_back(start)
253    }
254
255    /// Skips bytes by the predicate.
256    pub fn skip_bytes<F>(&mut self, f: F)
257        where F: Fn(&Stream, u8) -> bool
258    {
259        while !self.at_end() && f(self, self.curr_byte_unchecked()) {
260            self.advance(1);
261        }
262    }
263
264    /// Consumes chars by the predicate and returns them.
265    ///
266    /// The result can be empty.
267    #[inline]
268    pub fn consume_chars<F>(&mut self, f: F) -> Result<StrSpan<'a>>
269        where F: Fn(&Stream, char) -> bool
270    {
271        let start = self.pos;
272        self.skip_chars(f)?;
273        Ok(self.slice_back(start))
274    }
275
276    /// Skips chars by the predicate.
277    #[inline]
278    pub fn skip_chars<F>(&mut self, f: F) -> Result<()>
279        where F: Fn(&Stream, char) -> bool
280    {
281        for c in self.chars() {
282            if !c.is_xml_char() {
283                return Err(StreamError::NonXmlChar(c, self.gen_text_pos()));
284            } else if f(self, c) {
285                self.advance(c.len_utf8());
286            } else {
287                break;
288            }
289        }
290
291        Ok(())
292    }
293
294    #[inline]
295    pub(crate) fn chars(&self) -> str::Chars<'a> {
296        self.span.as_str()[self.pos..self.end].chars()
297    }
298
299    /// Slices data from `pos` to the current position.
300    #[inline]
301    pub fn slice_back(&self, pos: usize) -> StrSpan<'a> {
302        self.span.slice_region(pos, self.pos)
303    }
304
305    /// Slices data from the current position to the end.
306    #[inline]
307    pub fn slice_tail(&self) -> StrSpan<'a> {
308        self.span.slice_region(self.pos, self.end)
309    }
310
311    /// Skips whitespaces.
312    ///
313    /// Accepted values: `' ' \n \r \t`.
314    #[inline]
315    pub fn skip_spaces(&mut self) {
316        while !self.at_end() && self.curr_byte_unchecked().is_xml_space() {
317            self.advance(1);
318        }
319    }
320
321    /// Checks if the stream is starts with a space.
322    #[inline]
323    pub fn starts_with_space(&self) -> bool {
324        !self.at_end() && self.curr_byte_unchecked().is_xml_space()
325    }
326
327    /// Consumes whitespaces.
328    ///
329    /// Like [`skip_spaces()`], but checks that first char is actually a space.
330    ///
331    /// [`skip_spaces()`]: #method.skip_spaces
332    ///
333    /// # Errors
334    ///
335    /// - `InvalidSpace`
336    pub fn consume_spaces(&mut self) -> Result<()> {
337        if self.at_end() {
338            return Err(StreamError::UnexpectedEndOfStream);
339        }
340
341        if !self.starts_with_space() {
342            return Err(StreamError::InvalidSpace(self.curr_byte_unchecked(), self.gen_text_pos()));
343        }
344
345        self.skip_spaces();
346        Ok(())
347    }
348
349    /// Consumes an XML character reference if there is one.
350    ///
351    /// On error will reset the position to the original.
352    pub fn try_consume_reference(&mut self) -> Option<Reference<'a>> {
353        let start = self.pos();
354
355        // Consume reference on a substream.
356        let mut s = self.clone();
357        match s.consume_reference() {
358            Ok(r) => {
359                // If the current data is a reference than advance the current stream
360                // by number of bytes read by substream.
361                self.advance(s.pos() - start);
362                Some(r)
363            }
364            Err(_) => {
365                None
366            }
367        }
368    }
369
370    /// Consumes an XML reference.
371    ///
372    /// Consumes according to: <https://www.w3.org/TR/xml/#NT-Reference>
373    ///
374    /// # Errors
375    ///
376    /// - `InvalidReference`
377    pub fn consume_reference(&mut self) -> Result<Reference<'a>> {
378        self._consume_reference().map_err(|_| StreamError::InvalidReference)
379    }
380
381    #[inline(never)]
382    fn _consume_reference(&mut self) -> Result<Reference<'a>> {
383        if !self.try_consume_byte(b'&') {
384            return Err(StreamError::InvalidReference);
385        }
386
387        let reference = if self.try_consume_byte(b'#') {
388            let (value, radix) = if self.try_consume_byte(b'x') {
389                let value = self.consume_bytes(|_, c| c.is_xml_hex_digit()).as_str();
390                (value, 16)
391            } else {
392                let value = self.consume_bytes(|_, c| c.is_xml_digit()).as_str();
393                (value, 10)
394            };
395
396            let n = u32::from_str_radix(value, radix).map_err(|_| StreamError::InvalidReference)?;
397
398            let c = char::from_u32(n).unwrap_or('\u{FFFD}');
399            if !c.is_xml_char() {
400                return Err(StreamError::InvalidReference);
401            }
402
403            Reference::Char(c)
404        } else {
405            let name = self.consume_name()?;
406            match name.as_str() {
407                "quot" => Reference::Char('"'),
408                "amp"  => Reference::Char('&'),
409                "apos" => Reference::Char('\''),
410                "lt"   => Reference::Char('<'),
411                "gt"   => Reference::Char('>'),
412                _ => Reference::Entity(name.as_str()),
413            }
414        };
415
416        self.consume_byte(b';')?;
417
418        Ok(reference)
419    }
420
421    /// Consumes an XML name and returns it.
422    ///
423    /// Consumes according to: <https://www.w3.org/TR/xml/#NT-Name>
424    ///
425    /// # Errors
426    ///
427    /// - `InvalidName` - if name is empty or starts with an invalid char
428    /// - `UnexpectedEndOfStream`
429    pub fn consume_name(&mut self) -> Result<StrSpan<'a>> {
430        let start = self.pos();
431        self.skip_name()?;
432
433        let name = self.slice_back(start);
434        if name.is_empty() {
435            return Err(StreamError::InvalidName);
436        }
437
438        Ok(name)
439    }
440
441    /// Skips an XML name.
442    ///
443    /// The same as `consume_name()`, but does not return a consumed name.
444    ///
445    /// # Errors
446    ///
447    /// - `InvalidName` - if name is empty or starts with an invalid char
448    pub fn skip_name(&mut self) -> Result<()> {
449        let mut iter = self.chars();
450        if let Some(c) = iter.next() {
451            if c.is_xml_name_start() {
452                self.advance(c.len_utf8());
453            } else {
454                return Err(StreamError::InvalidName);
455            }
456        }
457
458        for c in iter {
459            if c.is_xml_name() {
460                self.advance(c.len_utf8());
461            } else {
462                break;
463            }
464        }
465
466        Ok(())
467    }
468
469    /// Consumes a qualified XML name and returns it.
470    ///
471    /// Consumes according to: <https://www.w3.org/TR/xml-names/#ns-qualnames>
472    ///
473    /// # Errors
474    ///
475    /// - `InvalidName` - if name is empty or starts with an invalid char
476    #[inline(never)]
477    pub fn consume_qname(&mut self) -> Result<(StrSpan<'a>, StrSpan<'a>)> {
478        let start = self.pos();
479
480        let mut splitter = None;
481
482        while !self.at_end() {
483            // Check for ASCII first for performance reasons.
484            let b = self.curr_byte_unchecked();
485            if b < 128 {
486                if b == b':' {
487                    if splitter.is_none() {
488                        splitter = Some(self.pos());
489                        self.advance(1);
490                    } else {
491                        // Multiple `:` is an error.
492                        return Err(StreamError::InvalidName);
493                    }
494                } else if b.is_xml_name() {
495                    self.advance(1);
496                } else {
497                    break;
498                }
499            } else {
500                // Fallback to Unicode code point.
501                match self.chars().nth(0) {
502                    Some(c) if c.is_xml_name() => {
503                        self.advance(c.len_utf8());
504                    }
505                    _ => break,
506                }
507            }
508        }
509
510        let (prefix, local) = if let Some(splitter) = splitter {
511            let prefix = self.span().slice_region(start, splitter);
512            let local = self.slice_back(splitter + 1);
513            (prefix, local)
514        } else {
515            let local = self.slice_back(start);
516            ("".into(), local)
517        };
518
519        // Prefix must start with a `NameStartChar`.
520        if let Some(c) = prefix.as_str().chars().nth(0) {
521            if !c.is_xml_name_start() {
522                return Err(StreamError::InvalidName);
523            }
524        }
525
526        // Local name must start with a `NameStartChar`.
527        if let Some(c) = local.as_str().chars().nth(0) {
528            if !c.is_xml_name_start() {
529                return Err(StreamError::InvalidName);
530            }
531        } else {
532            // If empty - error.
533            return Err(StreamError::InvalidName);
534        }
535
536        Ok((prefix, local))
537    }
538
539    /// Consumes `=`.
540    ///
541    /// Consumes according to: <https://www.w3.org/TR/xml/#NT-Eq>
542    ///
543    /// # Errors
544    ///
545    /// - `InvalidChar`
546    /// - `UnexpectedEndOfStream`
547    pub fn consume_eq(&mut self) -> Result<()> {
548        self.skip_spaces();
549        self.consume_byte(b'=')?;
550        self.skip_spaces();
551
552        Ok(())
553    }
554
555    /// Consumes quote.
556    ///
557    /// Consumes `'` or `"` and returns it.
558    ///
559    /// # Errors
560    ///
561    /// - `InvalidQuote`
562    /// - `UnexpectedEndOfStream`
563    pub fn consume_quote(&mut self) -> Result<u8> {
564        let c = self.curr_byte()?;
565        if c == b'\'' || c == b'"' {
566            self.advance(1);
567            Ok(c)
568        } else {
569            Err(StreamError::InvalidQuote(c, self.gen_text_pos()))
570        }
571    }
572
573    /// Calculates a current absolute position.
574    ///
575    /// This operation is very expensive. Use only for errors.
576    #[inline(never)]
577    pub fn gen_text_pos(&self) -> TextPos {
578        let text = self.span.as_str();
579        let end = self.pos;
580
581        let row = Self::calc_curr_row(text, end);
582        let col = Self::calc_curr_col(text, end);
583        TextPos::new(row, col)
584    }
585
586    /// Calculates an absolute position at `pos`.
587    ///
588    /// This operation is very expensive. Use only for errors.
589    ///
590    /// # Examples
591    ///
592    /// ```
593    /// let s = xmlparser::Stream::from("text");
594    ///
595    /// assert_eq!(s.gen_text_pos_from(2), xmlparser::TextPos::new(1, 3));
596    /// assert_eq!(s.gen_text_pos_from(9999), xmlparser::TextPos::new(1, 5));
597    /// ```
598    #[inline(never)]
599    pub fn gen_text_pos_from(&self, pos: usize) -> TextPos {
600        let mut s = self.clone();
601        s.pos = cmp::min(pos, s.span.as_str().len());
602        s.gen_text_pos()
603    }
604
605    fn calc_curr_row(text: &str, end: usize) -> u32 {
606        let mut row = 1;
607        for c in &text.as_bytes()[..end] {
608            if *c == b'\n' {
609                row += 1;
610            }
611        }
612
613        row
614    }
615
616    fn calc_curr_col(text: &str, end: usize) -> u32 {
617        let mut col = 1;
618        for c in text[..end].chars().rev() {
619            if c == '\n' {
620                break;
621            } else {
622                col += 1;
623            }
624        }
625
626        col
627    }
628}