xmlparser/
lib.rs

1/*!
2*xmlparser* is a low-level, pull-based, zero-allocation
3[XML 1.0](https://www.w3.org/TR/xml/) parser.
4
5## Example
6
7```rust
8for token in xmlparser::Tokenizer::from("<tagname name='value'/>") {
9    println!("{:?}", token);
10}
11```
12
13## Why a new library?
14
15This library is basically a low-level XML tokenizer that preserves the positions of the tokens
16and is not intended to be used directly.
17If you are looking for a higher level solution, check out
18[roxmltree](https://github.com/RazrFalcon/roxmltree).
19
20## Benefits
21
22- All tokens contain `StrSpan` structs which represent the position of the substring
23  in the original document.
- Good error processing. All error types contain the position (line:column) where the error occurred.
25- No heap allocations.
26- No dependencies.
27- Tiny. ~1400 LOC and ~30KiB in the release build according to `cargo-bloat`.
28- Supports `no_std` builds. To use without the standard library, disable the default features.
29
30## Limitations
31
32- Currently, only ENTITY objects are parsed from the DOCTYPE. All others are ignored.
33- No tree structure validation. So an XML like `<root><child></root></child>`
34  or a string without root element
35  will be parsed without errors. You should check for this manually.
36  On the other hand `<a/><a/>` will lead to an error.
- Duplicate attributes are not treated as an error. So XML like `<item a="v1" a="v2"/>`
  will be parsed without errors. You should check for this manually.
39- UTF-8 only.
40
41## Safety
42
43- The library must not panic. Any panic is considered a critical bug
44  and should be reported.
45- The library forbids unsafe code.
46*/
47
48#![no_std]
49
50#![forbid(unsafe_code)]
51#![warn(missing_docs)]
52#![allow(ellipsis_inclusive_range_patterns)]
53
54#[cfg(feature = "std")]
55#[macro_use]
56extern crate std;
57
58
// Evaluates to `true` when the expression matches the given pattern and to
// `false` otherwise. The pattern is captured as raw tokens, so alternatives
// (`A | B`) and `if` guards are accepted as well.
macro_rules! matches {
    ($value:expr, $($pat:tt)+) => {
        match $value {
            $($pat)+ => true,
            _ => false,
        }
    };
}
67
68
69mod error;
70mod stream;
71mod strspan;
72mod xmlchar;
73
74pub use crate::error::*;
75pub use crate::stream::*;
76pub use crate::strspan::*;
77pub use crate::xmlchar::*;
78
79
/// An XML token.
///
/// The `StrSpan` fields refer to substrings of the original document.
/// The diagrams below show which part of the input each field covers;
/// fields marked with `?` are optional (`Option` in the variant).
#[allow(missing_docs)]
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum Token<'a> {
    /// Declaration token.
    ///
    /// `standalone` is `Some(true)` for `yes` and `Some(false)` for `no`.
    ///
    /// ```text
    /// <?xml version='1.0' encoding='UTF-8' standalone='yes'?>
    ///                ---                                      - version
    ///                               -----                     - encoding?
    ///                                                  ---    - standalone?
    /// ------------------------------------------------------- - span
    /// ```
    Declaration {
        version: StrSpan<'a>,
        encoding: Option<StrSpan<'a>>,
        standalone: Option<bool>,
        span: StrSpan<'a>,
    },

    /// Processing instruction token.
    ///
    /// `content` is `None` when the instruction has no content.
    ///
    /// ```text
    /// <?target content?>
    ///   ------           - target
    ///          -------   - content?
    /// ------------------ - span
    /// ```
    ProcessingInstruction {
        target: StrSpan<'a>,
        content: Option<StrSpan<'a>>,
        span: StrSpan<'a>,
    },

    /// Comment token.
    ///
    /// ```text
    /// <!-- text -->
    ///     ------    - text
    /// ------------- - span
    /// ```
    Comment {
        text: StrSpan<'a>,
        span: StrSpan<'a>,
    },

    /// DOCTYPE start token.
    ///
    /// Emitted when the DOCTYPE opens an internal subset with `[`.
    ///
    /// ```text
    /// <!DOCTYPE greeting SYSTEM "hello.dtd" [
    ///           --------                      - name
    ///                    ------------------   - external_id?
    /// --------------------------------------- - span
    /// ```
    DtdStart {
        name: StrSpan<'a>,
        external_id: Option<ExternalId<'a>>,
        span: StrSpan<'a>,
    },

    /// Empty DOCTYPE token.
    ///
    /// Emitted when the DOCTYPE is closed with `>` without an internal subset.
    ///
    /// ```text
    /// <!DOCTYPE greeting SYSTEM "hello.dtd">
    ///           --------                     - name
    ///                    ------------------  - external_id?
    /// -------------------------------------- - span
    /// ```
    EmptyDtd {
        name: StrSpan<'a>,
        external_id: Option<ExternalId<'a>>,
        span: StrSpan<'a>,
    },

    /// ENTITY token.
    ///
    /// Can appear only inside the DTD.
    ///
    /// ```text
    /// <!ENTITY ns_extend "http://test.com">
    ///          ---------                    - name
    ///                     ---------------   - definition
    /// ------------------------------------- - span
    /// ```
    EntityDeclaration {
        name: StrSpan<'a>,
        definition: EntityDefinition<'a>,
        span: StrSpan<'a>,
    },

    /// DOCTYPE end token.
    ///
    /// ```text
    /// <!DOCTYPE svg [
    ///    ...
    /// ]>
    /// -- - span
    /// ```
    DtdEnd {
        span: StrSpan<'a>,
    },

    /// Element start token.
    ///
    /// Covers only `<` and the qualified name; attributes and the tag end
    /// are reported as separate tokens.
    ///
    /// ```text
    /// <ns:elem attr="value"/>
    ///  --                     - prefix
    ///     ----                - local
    /// --------                - span
    /// ```
    ElementStart {
        prefix: StrSpan<'a>,
        local: StrSpan<'a>,
        span: StrSpan<'a>,
    },

    /// Attribute token.
    ///
    /// ```text
    /// <elem ns:attr="value"/>
    ///       --              - prefix
    ///          ----         - local
    ///                -----  - value
    ///       --------------- - span
    /// ```
    Attribute {
        prefix: StrSpan<'a>,
        local: StrSpan<'a>,
        value: StrSpan<'a>,
        span: StrSpan<'a>,
    },

    /// Element end token.
    ///
    /// ```text
    /// <ns:elem>text</ns:elem>
    ///                         - ElementEnd::Open
    ///         -               - span
    /// ```
    ///
    /// ```text
    /// <ns:elem>text</ns:elem>
    ///                -- ----  - ElementEnd::Close(prefix, local)
    ///              ---------- - span
    /// ```
    ///
    /// ```text
    /// <ns:elem/>
    ///                         - ElementEnd::Empty
    ///         --              - span
    /// ```
    ElementEnd {
        end: ElementEnd<'a>,
        span: StrSpan<'a>,
    },

    /// Text token.
    ///
    /// Contains text between elements including whitespaces.
    /// Basically everything between `>` and `<`.
    /// Except `]]>`, which is not allowed and will lead to an error.
    ///
    /// ```text
    /// <p> text </p>
    ///    ------     - text
    /// ```
    ///
    /// The token span is equal to the `text`.
    Text {
        text: StrSpan<'a>,
    },

    /// CDATA token.
    ///
    /// ```text
    /// <p><![CDATA[text]]></p>
    ///             ----        - text
    ///    ----------------     - span
    /// ```
    Cdata {
        text: StrSpan<'a>,
        span: StrSpan<'a>,
    },
}
264
265
/// `ElementEnd` token.
///
/// Describes how a tag was terminated; carried by `Token::ElementEnd`.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum ElementEnd<'a> {
    /// Indicates `>`, the end of a start tag.
    Open,
    /// Indicates `</name>`. Holds the prefix and the local name
    /// of the closing tag, in that order.
    Close(StrSpan<'a>, StrSpan<'a>),
    /// Indicates `/>`, the end of an empty-element tag.
    Empty,
}
276
277
/// Representation of the [ExternalID](https://www.w3.org/TR/xml/#NT-ExternalID) value.
///
/// The spans hold the literals without their surrounding quotes.
#[allow(missing_docs)]
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum ExternalId<'a> {
    // `SYSTEM "literal"`: the single quoted literal.
    System(StrSpan<'a>),
    // `PUBLIC "literal1" "literal2"`: both quoted literals, in source order.
    Public(StrSpan<'a>, StrSpan<'a>),
}
285
286
/// Representation of the [EntityDef](https://www.w3.org/TR/xml/#NT-EntityDef) value.
#[allow(missing_docs)]
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum EntityDefinition<'a> {
    // A quoted entity value; the span excludes the quotes.
    EntityValue(StrSpan<'a>),
    // A `SYSTEM`/`PUBLIC` external identifier.
    ExternalId(ExternalId<'a>),
}
294
295
// Result of a tokenizer-level operation; failures are reported as `Error`.
type Result<T> = core::result::Result<T, Error>;
// Result of a stream-level operation; failures are reported as `StreamError`.
type StreamResult<T> = core::result::Result<T, StreamError>;
298
299
// Position of the tokenizer within the document structure.
#[derive(Clone, Copy, PartialEq)]
enum State {
    // Initial state: an optional `<?xml ...?>` declaration may follow.
    Declaration,
    // After the declaration: DOCTYPE, comments, PIs and whitespace are accepted.
    AfterDeclaration,
    // Inside the DOCTYPE internal subset, until `]` S? `>`.
    Dtd,
    // After the DOCTYPE: comments, PIs, whitespace or the first element.
    AfterDtd,
    // Inside element content: child elements, text, CDATA, comments, PIs.
    Elements,
    // Inside a start tag, after its name: attributes until `>` or `/>`.
    Attributes,
    // After the outermost element was closed: comments, PIs and whitespace only.
    AfterElements,
    // Terminal state, entered after an error; no more tokens are produced.
    End,
}
311
312
/// Tokenizer for the XML structure.
///
/// Implements `Iterator`, yielding `Result<Token, Error>` items.
/// After the first error no further tokens are produced.
pub struct Tokenizer<'a> {
    // Cursor over the input text.
    stream: Stream<'a>,
    // Current position in the document structure.
    state: State,
    // Number of currently open elements: incremented on `>` of a start tag,
    // decremented (never below zero) on a closing tag.
    depth: usize,
    // Set by `from_fragment`; when `true`, reaching depth 0 does not switch
    // the tokenizer to `State::AfterElements`.
    fragment_parsing: bool,
}
320
321impl<'a> From<&'a str> for Tokenizer<'a> {
322    #[inline]
323    fn from(text: &'a str) -> Self {
324        let mut stream = Stream::from(text);
325
326        // Skip UTF-8 BOM.
327        if stream.starts_with(&[0xEF, 0xBB, 0xBF]) {
328            stream.advance(3);
329        }
330
331        Tokenizer {
332            stream,
333            state: State::Declaration,
334            depth: 0,
335            fragment_parsing: false,
336        }
337    }
338}
339
340
// Evaluates `$call` and, on failure, wraps the stream error into the
// `Error::$variant` variant together with the text position generated for
// the stream offset recorded *before* the call was made.
macro_rules! map_err_at {
    ($call:expr, $s:expr, $variant:ident) => {{
        let origin = $s.pos();
        match $call {
            Ok(value) => Ok(value),
            Err(cause) => Err(Error::$variant(cause, $s.gen_text_pos_from(origin))),
        }
    }};
}
349
350impl<'a> Tokenizer<'a> {
351    /// Enables document fragment parsing.
352    ///
353    /// By default, `xmlparser` will check for DTD, root element, etc.
354    /// But if we have to parse an XML fragment, it will lead to an error.
355    /// This method switches the parser to the root element content parsing mode,
356    /// so it will treat any data as a content of the root element.
357    pub fn from_fragment(full_text: &'a str, fragment: core::ops::Range<usize>) -> Self {
358        Tokenizer {
359            stream: Stream::from_substr(full_text, fragment),
360            state: State::Elements,
361            depth: 0,
362            fragment_parsing: true,
363        }
364    }
365
366    fn parse_next_impl(&mut self) -> Option<Result<Token<'a>>> {
367        let s = &mut self.stream;
368
369        if s.at_end() {
370            return None;
371        }
372
373        let start = s.pos();
374
375        match self.state {
376            State::Declaration => {
377                self.state = State::AfterDeclaration;
378                if s.starts_with(b"<?xml ") {
379                    Some(Self::parse_declaration(s))
380                } else {
381                    None
382                }
383            }
384            State::AfterDeclaration => {
385                if s.starts_with(b"<!DOCTYPE") {
386                    let t = Self::parse_doctype(s);
387                    match t {
388                        Ok(Token::DtdStart { .. }) => self.state = State::Dtd,
389                        Ok(Token::EmptyDtd { .. }) => self.state = State::AfterDtd,
390                        _ => {}
391                    }
392
393                    Some(t)
394                } else if s.starts_with(b"<!--") {
395                    Some(Self::parse_comment(s))
396                } else if s.starts_with(b"<?") {
397                    if s.starts_with(b"<?xml ") {
398                        Some(Err(Error::UnknownToken(s.gen_text_pos())))
399                    } else {
400                        Some(Self::parse_pi(s))
401                    }
402                } else if s.starts_with_space() {
403                    s.skip_spaces();
404                    None
405                } else {
406                    self.state = State::AfterDtd;
407                    None
408                }
409            }
410            State::Dtd => {
411                if s.starts_with(b"<!ENTITY") {
412                    Some(Self::parse_entity_decl(s))
413                } else if s.starts_with(b"<!--") {
414                    Some(Self::parse_comment(s))
415                } else if s.starts_with(b"<?") {
416                    if s.starts_with(b"<?xml ") {
417                        Some(Err(Error::UnknownToken(s.gen_text_pos())))
418                    } else {
419                        Some(Self::parse_pi(s))
420                    }
421                } else if s.starts_with(b"]") {
422                    // DTD ends with ']' S? '>', therefore we have to skip possible spaces.
423                    s.advance(1);
424                    s.skip_spaces();
425                    match s.curr_byte() {
426                        Ok(b'>') => {
427                            self.state = State::AfterDtd;
428                            s.advance(1);
429                            Some(Ok(Token::DtdEnd { span: s.slice_back(start) }))
430                        }
431                        Ok(c) => {
432                            let e = StreamError::InvalidChar(c, b'>', s.gen_text_pos());
433                            Some(Err(Error::InvalidDoctype(e, s.gen_text_pos_from(start))))
434                        }
435                        Err(_) => {
436                            let e = StreamError::UnexpectedEndOfStream;
437                            Some(Err(Error::InvalidDoctype(e, s.gen_text_pos_from(start))))
438                        }
439                    }
440                } else if s.starts_with_space() {
441                    s.skip_spaces();
442                    None
443                } else if    s.starts_with(b"<!ELEMENT")
444                          || s.starts_with(b"<!ATTLIST")
445                          || s.starts_with(b"<!NOTATION")
446                {
447                    if Self::consume_decl(s).is_err() {
448                        let pos = s.gen_text_pos_from(start);
449                        Some(Err(Error::UnknownToken(pos)))
450                    } else {
451                        None
452                    }
453                } else {
454                    Some(Err(Error::UnknownToken(s.gen_text_pos())))
455                }
456            }
457            State::AfterDtd => {
458                if s.starts_with(b"<!--") {
459                    Some(Self::parse_comment(s))
460                } else if s.starts_with(b"<?") {
461                    if s.starts_with(b"<?xml ") {
462                        Some(Err(Error::UnknownToken(s.gen_text_pos())))
463                    } else {
464                        Some(Self::parse_pi(s))
465                    }
466                } else if s.starts_with(b"<!") {
467                    Some(Err(Error::UnknownToken(s.gen_text_pos())))
468                } else if s.starts_with(b"<") {
469                    self.state = State::Attributes;
470                    Some(Self::parse_element_start(s))
471                } else if s.starts_with_space() {
472                    s.skip_spaces();
473                    None
474                } else {
475                    Some(Err(Error::UnknownToken(s.gen_text_pos())))
476                }
477            }
478            State::Elements => {
479                // Use `match` only here, because only this section is performance-critical.
480                match s.curr_byte() {
481                    Ok(b'<') => {
482                        match s.next_byte() {
483                            Ok(b'!') => {
484                                if s.starts_with(b"<!--") {
485                                    Some(Self::parse_comment(s))
486                                } else if s.starts_with(b"<![CDATA[") {
487                                    Some(Self::parse_cdata(s))
488                                } else {
489                                    Some(Err(Error::UnknownToken(s.gen_text_pos())))
490                                }
491                            }
492                            Ok(b'?') => {
493                                if !s.starts_with(b"<?xml ") {
494                                    Some(Self::parse_pi(s))
495                                } else {
496                                    Some(Err(Error::UnknownToken(s.gen_text_pos())))
497                                }
498                            }
499                            Ok(b'/') => {
500                                if self.depth > 0 {
501                                    self.depth -= 1;
502                                }
503
504                                if self.depth == 0 && !self.fragment_parsing {
505                                    self.state = State::AfterElements;
506                                } else {
507                                    self.state = State::Elements;
508                                }
509
510                                Some(Self::parse_close_element(s))
511                            }
512                            Ok(_) => {
513                                self.state = State::Attributes;
514                                Some(Self::parse_element_start(s))
515                            }
516                            Err(_) => {
517                                return Some(Err(Error::UnknownToken(s.gen_text_pos())));
518                            }
519                        }
520                    }
521                    Ok(_) => {
522                        Some(Self::parse_text(s))
523                    }
524                    Err(_) => {
525                        Some(Err(Error::UnknownToken(s.gen_text_pos())))
526                    }
527                }
528            }
529            State::Attributes => {
530                let t = Self::parse_attribute(s);
531
532                if let Ok(Token::ElementEnd { end, .. }) = t {
533                    if end == ElementEnd::Open {
534                        self.depth += 1;
535                    }
536
537                    if self.depth == 0 && !self.fragment_parsing {
538                        self.state = State::AfterElements;
539                    } else {
540                        self.state = State::Elements;
541                    }
542                }
543
544                Some(t.map_err(|e| Error::InvalidAttribute(e, s.gen_text_pos_from(start))))
545            }
546            State::AfterElements => {
547                if s.starts_with(b"<!--") {
548                    Some(Self::parse_comment(s))
549                } else if s.starts_with(b"<?") {
550                    if s.starts_with(b"<?xml ") {
551                        Some(Err(Error::UnknownToken(s.gen_text_pos())))
552                    } else {
553                        Some(Self::parse_pi(s))
554                    }
555                } else if s.starts_with_space() {
556                    s.skip_spaces();
557                    None
558                } else {
559                    Some(Err(Error::UnknownToken(s.gen_text_pos())))
560                }
561            }
562            State::End => {
563                None
564            }
565        }
566    }
567
    /// Parses an XML declaration. A stream error is wrapped into
    /// `Error::InvalidDeclaration` along with the text position generated
    /// for the offset at which the declaration starts.
    fn parse_declaration(s: &mut Stream<'a>) -> Result<Token<'a>> {
        map_err_at!(Self::parse_declaration_impl(s), s, InvalidDeclaration)
    }
571
572    // XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
573    fn parse_declaration_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
574        fn consume_spaces(s: &mut Stream) -> StreamResult<()> {
575            if s.starts_with_space() {
576                s.skip_spaces();
577            } else if !s.starts_with(b"?>") && !s.at_end() {
578                return Err(StreamError::InvalidSpace(s.curr_byte_unchecked(), s.gen_text_pos()));
579            }
580
581            Ok(())
582        }
583
584        let start = s.pos();
585        s.advance(6);
586
587        let version = Self::parse_version_info(s)?;
588        consume_spaces(s)?;
589
590        let encoding = Self::parse_encoding_decl(s)?;
591        if encoding.is_some() {
592            consume_spaces(s)?;
593        }
594
595        let standalone = Self::parse_standalone(s)?;
596
597        s.skip_spaces();
598        s.skip_string(b"?>")?;
599
600        let span = s.slice_back(start);
601        Ok(Token::Declaration { version, encoding, standalone, span })
602    }
603
604    // VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
605    // VersionNum  ::= '1.' [0-9]+
606    fn parse_version_info(s: &mut Stream<'a>) -> StreamResult<StrSpan<'a>> {
607        s.skip_spaces();
608        s.skip_string(b"version")?;
609        s.consume_eq()?;
610        let quote = s.consume_quote()?;
611
612        let start = s.pos();
613        s.skip_string(b"1.")?;
614        s.skip_bytes(|_, c| c.is_xml_digit());
615        let ver = s.slice_back(start);
616
617        s.consume_byte(quote)?;
618
619        Ok(ver)
620    }
621
622    // EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
623    // EncName      ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
624    fn parse_encoding_decl(s: &mut Stream<'a>) -> StreamResult<Option<StrSpan<'a>>> {
625        if !s.starts_with(b"encoding") {
626            return Ok(None);
627        }
628
629        s.advance(8);
630        s.consume_eq()?;
631        let quote = s.consume_quote()?;
632        // [A-Za-z] ([A-Za-z0-9._] | '-')*
633        // TODO: check that first byte is [A-Za-z]
634        let name = s.consume_bytes(|_, c| {
635               c.is_xml_letter()
636            || c.is_xml_digit()
637            || c == b'.'
638            || c == b'-'
639            || c == b'_'
640        });
641        s.consume_byte(quote)?;
642
643        Ok(Some(name))
644    }
645
646    // SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
647    fn parse_standalone(s: &mut Stream<'a>) -> StreamResult<Option<bool>> {
648        if !s.starts_with(b"standalone") {
649            return Ok(None);
650        }
651
652        s.advance(10);
653        s.consume_eq()?;
654        let quote = s.consume_quote()?;
655
656        let start = s.pos();
657        let value = s.consume_name()?.as_str();
658
659        let flag = match value {
660            "yes" => true,
661            "no" => false,
662            _ => {
663                let pos = s.gen_text_pos_from(start);
664
665                return Err(StreamError::InvalidString("yes', 'no", pos));
666            }
667        };
668
669        s.consume_byte(quote)?;
670
671        Ok(Some(flag))
672    }
673
674    fn parse_comment(s: &mut Stream<'a>) -> Result<Token<'a>> {
675        let start = s.pos();
676        Self::parse_comment_impl(s)
677            .map_err(|e| Error::InvalidComment(e, s.gen_text_pos_from(start)))
678    }
679
680    // '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
681    fn parse_comment_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
682        let start = s.pos();
683        s.advance(4);
684        let text = s.consume_chars(|s, c| !(c == '-' && s.starts_with(b"-->")))?;
685        s.skip_string(b"-->")?;
686
687        if text.as_str().contains("--") {
688            return Err(StreamError::InvalidCommentData);
689        }
690
691        if text.as_str().ends_with('-') {
692            return Err(StreamError::InvalidCommentEnd);
693        }
694
695        let span = s.slice_back(start);
696
697        Ok(Token::Comment { text, span })
698    }
699
    /// Parses a processing instruction. A stream error is wrapped into
    /// `Error::InvalidPI` along with the text position generated
    /// for the offset at which the instruction starts.
    fn parse_pi(s: &mut Stream<'a>) -> Result<Token<'a>> {
        map_err_at!(Self::parse_pi_impl(s), s, InvalidPI)
    }
703
704    // PI       ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
705    // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
706    fn parse_pi_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
707        let start = s.pos();
708        s.advance(2);
709        let target = s.consume_name()?;
710        s.skip_spaces();
711        let content = s.consume_chars(|s, c| !(c == '?' && s.starts_with(b"?>")))?;
712        let content = if !content.is_empty() {
713            Some(content)
714        } else {
715            None
716        };
717
718        s.skip_string(b"?>")?;
719
720        let span = s.slice_back(start);
721
722        Ok(Token::ProcessingInstruction { target, content, span })
723    }
724
    /// Parses the start of a DOCTYPE. A stream error is wrapped into
    /// `Error::InvalidDoctype` along with the text position generated
    /// for the offset at which the DOCTYPE starts.
    fn parse_doctype(s: &mut Stream<'a>) -> Result<Token<'a>> {
        map_err_at!(Self::parse_doctype_impl(s), s, InvalidDoctype)
    }
728
729    // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
730    fn parse_doctype_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
731        let start = s.pos();
732        s.advance(9);
733
734        s.consume_spaces()?;
735        let name = s.consume_name()?;
736        s.skip_spaces();
737
738        let external_id = Self::parse_external_id(s)?;
739        s.skip_spaces();
740
741        let c = s.curr_byte()?;
742        if c != b'[' && c !=  b'>' {
743            static EXPECTED: &[u8] = &[b'[', b'>'];
744            return Err(StreamError::InvalidCharMultiple(c, EXPECTED, s.gen_text_pos()));
745        }
746
747        s.advance(1);
748
749        let span = s.slice_back(start);
750        if c == b'[' {
751            Ok(Token::DtdStart { name, external_id, span })
752        } else {
753            Ok(Token::EmptyDtd { name, external_id, span })
754        }
755    }
756
757    // ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
758    fn parse_external_id(s: &mut Stream<'a>) -> StreamResult<Option<ExternalId<'a>>> {
759        let v = if s.starts_with(b"SYSTEM") || s.starts_with(b"PUBLIC") {
760            let start = s.pos();
761            s.advance(6);
762            let id = s.slice_back(start);
763
764            s.consume_spaces()?;
765            let quote = s.consume_quote()?;
766            let literal1 = s.consume_bytes(|_, c| c != quote);
767            s.consume_byte(quote)?;
768
769            let v = if id.as_str() == "SYSTEM" {
770                ExternalId::System(literal1)
771            } else {
772                s.consume_spaces()?;
773                let quote = s.consume_quote()?;
774                let literal2 = s.consume_bytes(|_, c| c != quote);
775                s.consume_byte(quote)?;
776
777                ExternalId::Public(literal1, literal2)
778            };
779
780            Some(v)
781        } else {
782            None
783        };
784
785        Ok(v)
786    }
787
    /// Parses an ENTITY declaration. A stream error is wrapped into
    /// `Error::InvalidEntity` along with the text position generated
    /// for the offset at which the declaration starts.
    fn parse_entity_decl(s: &mut Stream<'a>) -> Result<Token<'a>> {
        map_err_at!(Self::parse_entity_decl_impl(s), s, InvalidEntity)
    }
791
792    // EntityDecl  ::= GEDecl | PEDecl
793    // GEDecl      ::= '<!ENTITY' S Name S EntityDef S? '>'
794    // PEDecl      ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
795    fn parse_entity_decl_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
796        let start = s.pos();
797        s.advance(8);
798
799        s.consume_spaces()?;
800
801        let is_ge = if s.try_consume_byte(b'%') {
802            s.consume_spaces()?;
803            false
804        } else {
805            true
806        };
807
808        let name = s.consume_name()?;
809        s.consume_spaces()?;
810        let definition = Self::parse_entity_def(s, is_ge)?;
811        s.skip_spaces();
812        s.consume_byte(b'>')?;
813
814        let span = s.slice_back(start);
815
816        Ok(Token::EntityDeclaration { name, definition, span })
817    }
818
819    // EntityDef   ::= EntityValue | (ExternalID NDataDecl?)
820    // PEDef       ::= EntityValue | ExternalID
821    // EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' |  "'" ([^%&']
822    //                             | PEReference | Reference)* "'"
823    // ExternalID  ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
824    // NDataDecl   ::= S 'NDATA' S Name
825    fn parse_entity_def(s: &mut Stream<'a>, is_ge: bool) -> StreamResult<EntityDefinition<'a>> {
826        let c = s.curr_byte()?;
827        match c {
828            b'"' | b'\'' => {
829                let quote = s.consume_quote()?;
830                let value = s.consume_bytes(|_, c| c != quote);
831                s.consume_byte(quote)?;
832
833                Ok(EntityDefinition::EntityValue(value))
834            }
835            b'S' | b'P' => {
836                if let Some(id) = Self::parse_external_id(s)? {
837                    if is_ge {
838                        s.skip_spaces();
839                        if s.starts_with(b"NDATA") {
840                            s.advance(5);
841                            s.consume_spaces()?;
842                            s.skip_name()?;
843                            // TODO: NDataDecl is not supported
844                        }
845                    }
846
847                    Ok(EntityDefinition::ExternalId(id))
848                } else {
849                    Err(StreamError::InvalidExternalID)
850                }
851            }
852            _ => {
853                static EXPECTED: &[u8] = &[b'"', b'\'', b'S', b'P'];
854                let pos = s.gen_text_pos();
855                Err(StreamError::InvalidCharMultiple(c, EXPECTED, pos))
856            }
857        }
858    }
859
    // Skips a declaration whose content is ignored: everything up to the
    // next `>` is skipped, then the `>` itself is consumed.
    // Fails when no `>` can be consumed.
    fn consume_decl(s: &mut Stream) -> StreamResult<()> {
        s.skip_bytes(|_, c| c != b'>');
        s.consume_byte(b'>')?;
        Ok(())
    }
865
    /// Parses a CDATA section. A stream error is wrapped into
    /// `Error::InvalidCdata` along with the text position generated
    /// for the offset at which the section starts.
    fn parse_cdata(s: &mut Stream<'a>) -> Result<Token<'a>> {
        map_err_at!(Self::parse_cdata_impl(s), s, InvalidCdata)
    }
869
870    // CDSect  ::= CDStart CData CDEnd
871    // CDStart ::= '<![CDATA['
872    // CData   ::= (Char* - (Char* ']]>' Char*))
873    // CDEnd   ::= ']]>'
874    fn parse_cdata_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
875        let start = s.pos();
876        s.advance(9);
877        let text = s.consume_chars(|s, c| !(c == ']' && s.starts_with(b"]]>")))?;
878        s.skip_string(b"]]>")?;
879        let span = s.slice_back(start);
880        Ok(Token::Cdata { text, span })
881    }
882
    /// Parses the start of an element tag. A stream error is wrapped into
    /// `Error::InvalidElement` along with the text position generated
    /// for the offset at which the tag starts.
    fn parse_element_start(s: &mut Stream<'a>) -> Result<Token<'a>> {
        map_err_at!(Self::parse_element_start_impl(s), s, InvalidElement)
    }
886
887    // '<' Name (S Attribute)* S? '>'
888    fn parse_element_start_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
889        let start = s.pos();
890        s.advance(1);
891        let (prefix, local) = s.consume_qname()?;
892        let span = s.slice_back(start);
893
894        Ok(Token::ElementStart { prefix, local, span })
895    }
896
    /// Parses a closing tag. A stream error is wrapped into
    /// `Error::InvalidElement` along with the text position generated
    /// for the offset at which the tag starts.
    fn parse_close_element(s: &mut Stream<'a>) -> Result<Token<'a>> {
        map_err_at!(Self::parse_close_element_impl(s), s, InvalidElement)
    }
900
901    // '</' Name S? '>'
902    fn parse_close_element_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
903        let start = s.pos();
904        s.advance(2);
905
906        let (prefix, tag_name) = s.consume_qname()?;
907        s.skip_spaces();
908        s.consume_byte(b'>')?;
909
910        let span = s.slice_back(start);
911
912        Ok(Token::ElementEnd { end: ElementEnd::Close(prefix, tag_name), span })
913    }
914
915    // Name Eq AttValue
916    fn parse_attribute(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
917        let attr_start = s.pos();
918        let has_space = s.starts_with_space();
919        s.skip_spaces();
920
921        if let Ok(c) = s.curr_byte() {
922            let start = s.pos();
923
924            match c {
925                b'/' => {
926                    s.advance(1);
927                    s.consume_byte(b'>')?;
928                    let span = s.slice_back(start);
929                    return Ok(Token::ElementEnd { end: ElementEnd::Empty, span });
930                }
931                b'>' => {
932                    s.advance(1);
933                    let span = s.slice_back(start);
934                    return Ok(Token::ElementEnd { end: ElementEnd::Open, span });
935                }
936                _ => {}
937            }
938        }
939
940        if !has_space {
941            if !s.at_end() {
942                return Err(StreamError::InvalidSpace(
943                    s.curr_byte_unchecked(), s.gen_text_pos_from(attr_start))
944                );
945            } else {
946                return Err(StreamError::UnexpectedEndOfStream);
947            }
948        }
949
950        let start = s.pos();
951
952        let (prefix, local) = s.consume_qname()?;
953        s.consume_eq()?;
954        let quote = s.consume_quote()?;
955        let quote_c = quote as char;
956        // The attribute value must not contain the < character.
957        let value = s.consume_chars(|_, c| c != quote_c && c != '<')?;
958        s.consume_byte(quote)?;
959        let span = s.slice_back(start);
960
961        Ok(Token::Attribute { prefix, local, value, span })
962    }
963
    /// Parses character data between tags. A stream error is wrapped into
    /// `Error::InvalidCharData` along with the text position generated
    /// for the offset at which the text starts.
    fn parse_text(s: &mut Stream<'a>) -> Result<Token<'a>> {
        map_err_at!(Self::parse_text_impl(s), s, InvalidCharData)
    }
967
968    fn parse_text_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
969        let text = s.consume_chars(|_, c| c != '<')?;
970
971        // According to the spec, `]]>` must not appear inside a Text node.
972        // https://www.w3.org/TR/xml/#syntax
973        //
974        // Search for `>` first, since it's a bit faster than looking for `]]>`.
975        if text.as_str().contains('>') {
976            if text.as_str().contains("]]>") {
977                return Err(StreamError::InvalidCharacterData);
978            }
979        }
980
981        Ok(Token::Text { text })
982    }
983}
984
985impl<'a> Iterator for Tokenizer<'a> {
986    type Item = Result<Token<'a>>;
987
988    #[inline]
989    fn next(&mut self) -> Option<Self::Item> {
990        let mut t = None;
991        while !self.stream.at_end() && self.state != State::End && t.is_none() {
992            t = self.parse_next_impl();
993        }
994
995        if let Some(Err(_)) = t {
996            self.stream.jump_to_end();
997            self.state = State::End;
998        }
999
1000        t
1001    }
1002}