1#![no_std]
49
50#![forbid(unsafe_code)]
51#![warn(missing_docs)]
52#![allow(ellipsis_inclusive_range_patterns)]
53
54#[cfg(feature = "std")]
55#[macro_use]
56extern crate std;
57
58
// Crate-local `matches!`: expands to a `match` that yields `true` when
// `$expression` matches the given pattern tokens and `false` otherwise.
// The pattern is captured as raw token trees, so alternatives (`A | B`)
// are accepted as written.
// NOTE(review): presumably kept so the crate builds on compilers that
// predate `core::matches!` — confirm before removing.
macro_rules! matches {
    ($expression:expr, $($pattern:tt)+) => {
        match $expression {
            $($pattern)+ => true,
            _ => false
        }
    }
}
67
68
69mod error;
70mod stream;
71mod strspan;
72mod xmlchar;
73
74pub use crate::error::*;
75pub use crate::stream::*;
76pub use crate::strspan::*;
77pub use crate::xmlchar::*;
78
79
/// A single token produced by [`Tokenizer`].
///
/// Every variant borrows from the input text through `StrSpan`; nothing is
/// copied. Unless noted otherwise, `span` covers the whole token as it
/// appears in the input.
#[allow(missing_docs)]
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum Token<'a> {
    /// XML declaration: `<?xml version="1.x" encoding="..." standalone="yes|no"?>`.
    ///
    /// `version` is the quoted version text, `encoding` and `standalone`
    /// are `None` when the corresponding pseudo-attribute is absent.
    Declaration {
        version: StrSpan<'a>,
        encoding: Option<StrSpan<'a>>,
        standalone: Option<bool>,
        span: StrSpan<'a>,
    },

    /// Processing instruction: `<?target content?>`.
    ///
    /// `content` is `None` when nothing follows the target.
    ProcessingInstruction {
        target: StrSpan<'a>,
        content: Option<StrSpan<'a>>,
        span: StrSpan<'a>,
    },

    /// Comment: `<!--text-->`. `text` is everything between the delimiters.
    Comment {
        text: StrSpan<'a>,
        span: StrSpan<'a>,
    },

    /// Start of a document type declaration that has an internal subset:
    /// `<!DOCTYPE name [external id] [`.
    DtdStart {
        name: StrSpan<'a>,
        external_id: Option<ExternalId<'a>>,
        span: StrSpan<'a>,
    },

    /// Document type declaration without an internal subset:
    /// `<!DOCTYPE name [external id]>`.
    EmptyDtd {
        name: StrSpan<'a>,
        external_id: Option<ExternalId<'a>>,
        span: StrSpan<'a>,
    },

    /// Entity declaration inside a DTD: `<!ENTITY name definition>`.
    EntityDeclaration {
        name: StrSpan<'a>,
        definition: EntityDefinition<'a>,
        span: StrSpan<'a>,
    },

    /// End of the DTD internal subset: `]` followed by optional spaces and `>`.
    DtdEnd {
        span: StrSpan<'a>,
    },

    /// Start of an element tag: `<prefix:local`.
    ///
    /// Attributes and the tag end are reported as separate tokens.
    ElementStart {
        prefix: StrSpan<'a>,
        local: StrSpan<'a>,
        span: StrSpan<'a>,
    },

    /// Attribute: `prefix:local="value"`.
    ///
    /// `value` excludes the quotes; `span` starts at the attribute name,
    /// after any leading whitespace.
    Attribute {
        prefix: StrSpan<'a>,
        local: StrSpan<'a>,
        value: StrSpan<'a>,
        span: StrSpan<'a>,
    },

    /// End of an element tag; `end` tells which kind (`>`, `/>` or `</name>`).
    ElementEnd {
        end: ElementEnd<'a>,
        span: StrSpan<'a>,
    },

    /// Character data between tags, up to (not including) the next `<`.
    Text {
        text: StrSpan<'a>,
    },

    /// CDATA section: `<![CDATA[text]]>`. `text` excludes the delimiters.
    Cdata {
        text: StrSpan<'a>,
        span: StrSpan<'a>,
    },
}
264
265
/// The kind of tag end reported by `Token::ElementEnd`.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum ElementEnd<'a> {
    /// `>` closing a start tag; the element has content and is now open.
    Open,
    /// A closing tag `</prefix:local>`, carrying the prefix and local name.
    Close(StrSpan<'a>, StrSpan<'a>),
    /// `/>` closing a self-contained (empty) element.
    Empty,
}
276
277
/// An external identifier as found in a doctype or entity declaration.
///
/// The spans hold the quoted literals without their quotes.
#[allow(missing_docs)]
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum ExternalId<'a> {
    /// `SYSTEM "literal"`.
    System(StrSpan<'a>),
    /// `PUBLIC "literal1" "literal2"`.
    Public(StrSpan<'a>, StrSpan<'a>),
}
285
286
/// The definition part of an `<!ENTITY ...>` declaration.
#[allow(missing_docs)]
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum EntityDefinition<'a> {
    /// A quoted replacement value; the span excludes the quotes.
    EntityValue(StrSpan<'a>),
    /// A reference to an external resource (`SYSTEM` / `PUBLIC`).
    ExternalId(ExternalId<'a>),
}
294
295
// Private shorthand for token-level results, which fail with the crate's `Error`.
type Result<T> = core::result::Result<T, Error>;
// Private shorthand for low-level stream operations, which fail with `StreamError`.
type StreamResult<T> = core::result::Result<T, StreamError>;
298
299
// Position of the tokenizer within the document structure. It decides which
// constructs `parse_next_impl` accepts at the current stream position.
#[derive(Clone, Copy, PartialEq)]
enum State {
    // Initial state for whole documents: an optional `<?xml ` declaration.
    Declaration,
    // Prolog before the doctype: `<!DOCTYPE`, comments, PIs and whitespace.
    AfterDeclaration,
    // Inside the DTD internal subset, until the closing `]` ... `>`.
    Dtd,
    // Prolog after the doctype: comments, PIs, whitespace or the first element.
    AfterDtd,
    // Element content: tags, text, comments, PIs and CDATA sections.
    Elements,
    // Inside a start tag, after the element name: attributes or the tag end.
    Attributes,
    // After the outermost element was closed: comments, PIs and whitespace only.
    AfterElements,
    // Terminal state entered after an error; no further tokens are produced.
    End,
}
311
312
/// Pull tokenizer over XML text.
///
/// Tokens are obtained through the `Iterator` implementation; each item is
/// either a [`Token`] borrowing from the input or an error.
pub struct Tokenizer<'a> {
    // Cursor over the input text.
    stream: Stream<'a>,
    // Which part of the document structure is currently being parsed.
    state: State,
    // Number of currently open elements; incremented on `>` of a start tag
    // and decremented on a closing tag.
    depth: usize,
    // Set by `from_fragment`; when true the tokenizer stays in element
    // content even after `depth` returns to zero.
    fragment_parsing: bool,
}
320
321impl<'a> From<&'a str> for Tokenizer<'a> {
322 #[inline]
323 fn from(text: &'a str) -> Self {
324 let mut stream = Stream::from(text);
325
326 if stream.starts_with(&[0xEF, 0xBB, 0xBF]) {
328 stream.advance(3);
329 }
330
331 Tokenizer {
332 stream,
333 state: State::Declaration,
334 depth: 0,
335 fragment_parsing: false,
336 }
337 }
338}
339
340
// Evaluates `$fun` (a call returning a stream-level result) and, on failure,
// wraps the error into `Error::$err` together with the text position of the
// stream location recorded *before* the call. `$stream` is evaluated once
// before `$fun` to record that location and again inside the error closure.
macro_rules! map_err_at {
    ($fun:expr, $stream:expr, $err:ident) => {{
        let start = $stream.pos();
        $fun.map_err(|e|
            Error::$err(e, $stream.gen_text_pos_from(start))
        )
    }}
}
349
350impl<'a> Tokenizer<'a> {
351 pub fn from_fragment(full_text: &'a str, fragment: core::ops::Range<usize>) -> Self {
358 Tokenizer {
359 stream: Stream::from_substr(full_text, fragment),
360 state: State::Elements,
361 depth: 0,
362 fragment_parsing: true,
363 }
364 }
365
366 fn parse_next_impl(&mut self) -> Option<Result<Token<'a>>> {
367 let s = &mut self.stream;
368
369 if s.at_end() {
370 return None;
371 }
372
373 let start = s.pos();
374
375 match self.state {
376 State::Declaration => {
377 self.state = State::AfterDeclaration;
378 if s.starts_with(b"<?xml ") {
379 Some(Self::parse_declaration(s))
380 } else {
381 None
382 }
383 }
384 State::AfterDeclaration => {
385 if s.starts_with(b"<!DOCTYPE") {
386 let t = Self::parse_doctype(s);
387 match t {
388 Ok(Token::DtdStart { .. }) => self.state = State::Dtd,
389 Ok(Token::EmptyDtd { .. }) => self.state = State::AfterDtd,
390 _ => {}
391 }
392
393 Some(t)
394 } else if s.starts_with(b"<!--") {
395 Some(Self::parse_comment(s))
396 } else if s.starts_with(b"<?") {
397 if s.starts_with(b"<?xml ") {
398 Some(Err(Error::UnknownToken(s.gen_text_pos())))
399 } else {
400 Some(Self::parse_pi(s))
401 }
402 } else if s.starts_with_space() {
403 s.skip_spaces();
404 None
405 } else {
406 self.state = State::AfterDtd;
407 None
408 }
409 }
410 State::Dtd => {
411 if s.starts_with(b"<!ENTITY") {
412 Some(Self::parse_entity_decl(s))
413 } else if s.starts_with(b"<!--") {
414 Some(Self::parse_comment(s))
415 } else if s.starts_with(b"<?") {
416 if s.starts_with(b"<?xml ") {
417 Some(Err(Error::UnknownToken(s.gen_text_pos())))
418 } else {
419 Some(Self::parse_pi(s))
420 }
421 } else if s.starts_with(b"]") {
422 s.advance(1);
424 s.skip_spaces();
425 match s.curr_byte() {
426 Ok(b'>') => {
427 self.state = State::AfterDtd;
428 s.advance(1);
429 Some(Ok(Token::DtdEnd { span: s.slice_back(start) }))
430 }
431 Ok(c) => {
432 let e = StreamError::InvalidChar(c, b'>', s.gen_text_pos());
433 Some(Err(Error::InvalidDoctype(e, s.gen_text_pos_from(start))))
434 }
435 Err(_) => {
436 let e = StreamError::UnexpectedEndOfStream;
437 Some(Err(Error::InvalidDoctype(e, s.gen_text_pos_from(start))))
438 }
439 }
440 } else if s.starts_with_space() {
441 s.skip_spaces();
442 None
443 } else if s.starts_with(b"<!ELEMENT")
444 || s.starts_with(b"<!ATTLIST")
445 || s.starts_with(b"<!NOTATION")
446 {
447 if Self::consume_decl(s).is_err() {
448 let pos = s.gen_text_pos_from(start);
449 Some(Err(Error::UnknownToken(pos)))
450 } else {
451 None
452 }
453 } else {
454 Some(Err(Error::UnknownToken(s.gen_text_pos())))
455 }
456 }
457 State::AfterDtd => {
458 if s.starts_with(b"<!--") {
459 Some(Self::parse_comment(s))
460 } else if s.starts_with(b"<?") {
461 if s.starts_with(b"<?xml ") {
462 Some(Err(Error::UnknownToken(s.gen_text_pos())))
463 } else {
464 Some(Self::parse_pi(s))
465 }
466 } else if s.starts_with(b"<!") {
467 Some(Err(Error::UnknownToken(s.gen_text_pos())))
468 } else if s.starts_with(b"<") {
469 self.state = State::Attributes;
470 Some(Self::parse_element_start(s))
471 } else if s.starts_with_space() {
472 s.skip_spaces();
473 None
474 } else {
475 Some(Err(Error::UnknownToken(s.gen_text_pos())))
476 }
477 }
478 State::Elements => {
479 match s.curr_byte() {
481 Ok(b'<') => {
482 match s.next_byte() {
483 Ok(b'!') => {
484 if s.starts_with(b"<!--") {
485 Some(Self::parse_comment(s))
486 } else if s.starts_with(b"<![CDATA[") {
487 Some(Self::parse_cdata(s))
488 } else {
489 Some(Err(Error::UnknownToken(s.gen_text_pos())))
490 }
491 }
492 Ok(b'?') => {
493 if !s.starts_with(b"<?xml ") {
494 Some(Self::parse_pi(s))
495 } else {
496 Some(Err(Error::UnknownToken(s.gen_text_pos())))
497 }
498 }
499 Ok(b'/') => {
500 if self.depth > 0 {
501 self.depth -= 1;
502 }
503
504 if self.depth == 0 && !self.fragment_parsing {
505 self.state = State::AfterElements;
506 } else {
507 self.state = State::Elements;
508 }
509
510 Some(Self::parse_close_element(s))
511 }
512 Ok(_) => {
513 self.state = State::Attributes;
514 Some(Self::parse_element_start(s))
515 }
516 Err(_) => {
517 return Some(Err(Error::UnknownToken(s.gen_text_pos())));
518 }
519 }
520 }
521 Ok(_) => {
522 Some(Self::parse_text(s))
523 }
524 Err(_) => {
525 Some(Err(Error::UnknownToken(s.gen_text_pos())))
526 }
527 }
528 }
529 State::Attributes => {
530 let t = Self::parse_attribute(s);
531
532 if let Ok(Token::ElementEnd { end, .. }) = t {
533 if end == ElementEnd::Open {
534 self.depth += 1;
535 }
536
537 if self.depth == 0 && !self.fragment_parsing {
538 self.state = State::AfterElements;
539 } else {
540 self.state = State::Elements;
541 }
542 }
543
544 Some(t.map_err(|e| Error::InvalidAttribute(e, s.gen_text_pos_from(start))))
545 }
546 State::AfterElements => {
547 if s.starts_with(b"<!--") {
548 Some(Self::parse_comment(s))
549 } else if s.starts_with(b"<?") {
550 if s.starts_with(b"<?xml ") {
551 Some(Err(Error::UnknownToken(s.gen_text_pos())))
552 } else {
553 Some(Self::parse_pi(s))
554 }
555 } else if s.starts_with_space() {
556 s.skip_spaces();
557 None
558 } else {
559 Some(Err(Error::UnknownToken(s.gen_text_pos())))
560 }
561 }
562 State::End => {
563 None
564 }
565 }
566 }
567
568 fn parse_declaration(s: &mut Stream<'a>) -> Result<Token<'a>> {
569 map_err_at!(Self::parse_declaration_impl(s), s, InvalidDeclaration)
570 }
571
572 fn parse_declaration_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
574 fn consume_spaces(s: &mut Stream) -> StreamResult<()> {
575 if s.starts_with_space() {
576 s.skip_spaces();
577 } else if !s.starts_with(b"?>") && !s.at_end() {
578 return Err(StreamError::InvalidSpace(s.curr_byte_unchecked(), s.gen_text_pos()));
579 }
580
581 Ok(())
582 }
583
584 let start = s.pos();
585 s.advance(6);
586
587 let version = Self::parse_version_info(s)?;
588 consume_spaces(s)?;
589
590 let encoding = Self::parse_encoding_decl(s)?;
591 if encoding.is_some() {
592 consume_spaces(s)?;
593 }
594
595 let standalone = Self::parse_standalone(s)?;
596
597 s.skip_spaces();
598 s.skip_string(b"?>")?;
599
600 let span = s.slice_back(start);
601 Ok(Token::Declaration { version, encoding, standalone, span })
602 }
603
604 fn parse_version_info(s: &mut Stream<'a>) -> StreamResult<StrSpan<'a>> {
607 s.skip_spaces();
608 s.skip_string(b"version")?;
609 s.consume_eq()?;
610 let quote = s.consume_quote()?;
611
612 let start = s.pos();
613 s.skip_string(b"1.")?;
614 s.skip_bytes(|_, c| c.is_xml_digit());
615 let ver = s.slice_back(start);
616
617 s.consume_byte(quote)?;
618
619 Ok(ver)
620 }
621
622 fn parse_encoding_decl(s: &mut Stream<'a>) -> StreamResult<Option<StrSpan<'a>>> {
625 if !s.starts_with(b"encoding") {
626 return Ok(None);
627 }
628
629 s.advance(8);
630 s.consume_eq()?;
631 let quote = s.consume_quote()?;
632 let name = s.consume_bytes(|_, c| {
635 c.is_xml_letter()
636 || c.is_xml_digit()
637 || c == b'.'
638 || c == b'-'
639 || c == b'_'
640 });
641 s.consume_byte(quote)?;
642
643 Ok(Some(name))
644 }
645
646 fn parse_standalone(s: &mut Stream<'a>) -> StreamResult<Option<bool>> {
648 if !s.starts_with(b"standalone") {
649 return Ok(None);
650 }
651
652 s.advance(10);
653 s.consume_eq()?;
654 let quote = s.consume_quote()?;
655
656 let start = s.pos();
657 let value = s.consume_name()?.as_str();
658
659 let flag = match value {
660 "yes" => true,
661 "no" => false,
662 _ => {
663 let pos = s.gen_text_pos_from(start);
664
665 return Err(StreamError::InvalidString("yes', 'no", pos));
666 }
667 };
668
669 s.consume_byte(quote)?;
670
671 Ok(Some(flag))
672 }
673
674 fn parse_comment(s: &mut Stream<'a>) -> Result<Token<'a>> {
675 let start = s.pos();
676 Self::parse_comment_impl(s)
677 .map_err(|e| Error::InvalidComment(e, s.gen_text_pos_from(start)))
678 }
679
680 fn parse_comment_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
682 let start = s.pos();
683 s.advance(4);
684 let text = s.consume_chars(|s, c| !(c == '-' && s.starts_with(b"-->")))?;
685 s.skip_string(b"-->")?;
686
687 if text.as_str().contains("--") {
688 return Err(StreamError::InvalidCommentData);
689 }
690
691 if text.as_str().ends_with('-') {
692 return Err(StreamError::InvalidCommentEnd);
693 }
694
695 let span = s.slice_back(start);
696
697 Ok(Token::Comment { text, span })
698 }
699
700 fn parse_pi(s: &mut Stream<'a>) -> Result<Token<'a>> {
701 map_err_at!(Self::parse_pi_impl(s), s, InvalidPI)
702 }
703
704 fn parse_pi_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
707 let start = s.pos();
708 s.advance(2);
709 let target = s.consume_name()?;
710 s.skip_spaces();
711 let content = s.consume_chars(|s, c| !(c == '?' && s.starts_with(b"?>")))?;
712 let content = if !content.is_empty() {
713 Some(content)
714 } else {
715 None
716 };
717
718 s.skip_string(b"?>")?;
719
720 let span = s.slice_back(start);
721
722 Ok(Token::ProcessingInstruction { target, content, span })
723 }
724
725 fn parse_doctype(s: &mut Stream<'a>) -> Result<Token<'a>> {
726 map_err_at!(Self::parse_doctype_impl(s), s, InvalidDoctype)
727 }
728
729 fn parse_doctype_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
731 let start = s.pos();
732 s.advance(9);
733
734 s.consume_spaces()?;
735 let name = s.consume_name()?;
736 s.skip_spaces();
737
738 let external_id = Self::parse_external_id(s)?;
739 s.skip_spaces();
740
741 let c = s.curr_byte()?;
742 if c != b'[' && c != b'>' {
743 static EXPECTED: &[u8] = &[b'[', b'>'];
744 return Err(StreamError::InvalidCharMultiple(c, EXPECTED, s.gen_text_pos()));
745 }
746
747 s.advance(1);
748
749 let span = s.slice_back(start);
750 if c == b'[' {
751 Ok(Token::DtdStart { name, external_id, span })
752 } else {
753 Ok(Token::EmptyDtd { name, external_id, span })
754 }
755 }
756
757 fn parse_external_id(s: &mut Stream<'a>) -> StreamResult<Option<ExternalId<'a>>> {
759 let v = if s.starts_with(b"SYSTEM") || s.starts_with(b"PUBLIC") {
760 let start = s.pos();
761 s.advance(6);
762 let id = s.slice_back(start);
763
764 s.consume_spaces()?;
765 let quote = s.consume_quote()?;
766 let literal1 = s.consume_bytes(|_, c| c != quote);
767 s.consume_byte(quote)?;
768
769 let v = if id.as_str() == "SYSTEM" {
770 ExternalId::System(literal1)
771 } else {
772 s.consume_spaces()?;
773 let quote = s.consume_quote()?;
774 let literal2 = s.consume_bytes(|_, c| c != quote);
775 s.consume_byte(quote)?;
776
777 ExternalId::Public(literal1, literal2)
778 };
779
780 Some(v)
781 } else {
782 None
783 };
784
785 Ok(v)
786 }
787
788 fn parse_entity_decl(s: &mut Stream<'a>) -> Result<Token<'a>> {
789 map_err_at!(Self::parse_entity_decl_impl(s), s, InvalidEntity)
790 }
791
792 fn parse_entity_decl_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
796 let start = s.pos();
797 s.advance(8);
798
799 s.consume_spaces()?;
800
801 let is_ge = if s.try_consume_byte(b'%') {
802 s.consume_spaces()?;
803 false
804 } else {
805 true
806 };
807
808 let name = s.consume_name()?;
809 s.consume_spaces()?;
810 let definition = Self::parse_entity_def(s, is_ge)?;
811 s.skip_spaces();
812 s.consume_byte(b'>')?;
813
814 let span = s.slice_back(start);
815
816 Ok(Token::EntityDeclaration { name, definition, span })
817 }
818
819 fn parse_entity_def(s: &mut Stream<'a>, is_ge: bool) -> StreamResult<EntityDefinition<'a>> {
826 let c = s.curr_byte()?;
827 match c {
828 b'"' | b'\'' => {
829 let quote = s.consume_quote()?;
830 let value = s.consume_bytes(|_, c| c != quote);
831 s.consume_byte(quote)?;
832
833 Ok(EntityDefinition::EntityValue(value))
834 }
835 b'S' | b'P' => {
836 if let Some(id) = Self::parse_external_id(s)? {
837 if is_ge {
838 s.skip_spaces();
839 if s.starts_with(b"NDATA") {
840 s.advance(5);
841 s.consume_spaces()?;
842 s.skip_name()?;
843 }
845 }
846
847 Ok(EntityDefinition::ExternalId(id))
848 } else {
849 Err(StreamError::InvalidExternalID)
850 }
851 }
852 _ => {
853 static EXPECTED: &[u8] = &[b'"', b'\'', b'S', b'P'];
854 let pos = s.gen_text_pos();
855 Err(StreamError::InvalidCharMultiple(c, EXPECTED, pos))
856 }
857 }
858 }
859
860 fn consume_decl(s: &mut Stream) -> StreamResult<()> {
861 s.skip_bytes(|_, c| c != b'>');
862 s.consume_byte(b'>')?;
863 Ok(())
864 }
865
866 fn parse_cdata(s: &mut Stream<'a>) -> Result<Token<'a>> {
867 map_err_at!(Self::parse_cdata_impl(s), s, InvalidCdata)
868 }
869
870 fn parse_cdata_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
875 let start = s.pos();
876 s.advance(9);
877 let text = s.consume_chars(|s, c| !(c == ']' && s.starts_with(b"]]>")))?;
878 s.skip_string(b"]]>")?;
879 let span = s.slice_back(start);
880 Ok(Token::Cdata { text, span })
881 }
882
883 fn parse_element_start(s: &mut Stream<'a>) -> Result<Token<'a>> {
884 map_err_at!(Self::parse_element_start_impl(s), s, InvalidElement)
885 }
886
887 fn parse_element_start_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
889 let start = s.pos();
890 s.advance(1);
891 let (prefix, local) = s.consume_qname()?;
892 let span = s.slice_back(start);
893
894 Ok(Token::ElementStart { prefix, local, span })
895 }
896
897 fn parse_close_element(s: &mut Stream<'a>) -> Result<Token<'a>> {
898 map_err_at!(Self::parse_close_element_impl(s), s, InvalidElement)
899 }
900
901 fn parse_close_element_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
903 let start = s.pos();
904 s.advance(2);
905
906 let (prefix, tag_name) = s.consume_qname()?;
907 s.skip_spaces();
908 s.consume_byte(b'>')?;
909
910 let span = s.slice_back(start);
911
912 Ok(Token::ElementEnd { end: ElementEnd::Close(prefix, tag_name), span })
913 }
914
915 fn parse_attribute(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
917 let attr_start = s.pos();
918 let has_space = s.starts_with_space();
919 s.skip_spaces();
920
921 if let Ok(c) = s.curr_byte() {
922 let start = s.pos();
923
924 match c {
925 b'/' => {
926 s.advance(1);
927 s.consume_byte(b'>')?;
928 let span = s.slice_back(start);
929 return Ok(Token::ElementEnd { end: ElementEnd::Empty, span });
930 }
931 b'>' => {
932 s.advance(1);
933 let span = s.slice_back(start);
934 return Ok(Token::ElementEnd { end: ElementEnd::Open, span });
935 }
936 _ => {}
937 }
938 }
939
940 if !has_space {
941 if !s.at_end() {
942 return Err(StreamError::InvalidSpace(
943 s.curr_byte_unchecked(), s.gen_text_pos_from(attr_start))
944 );
945 } else {
946 return Err(StreamError::UnexpectedEndOfStream);
947 }
948 }
949
950 let start = s.pos();
951
952 let (prefix, local) = s.consume_qname()?;
953 s.consume_eq()?;
954 let quote = s.consume_quote()?;
955 let quote_c = quote as char;
956 let value = s.consume_chars(|_, c| c != quote_c && c != '<')?;
958 s.consume_byte(quote)?;
959 let span = s.slice_back(start);
960
961 Ok(Token::Attribute { prefix, local, value, span })
962 }
963
964 fn parse_text(s: &mut Stream<'a>) -> Result<Token<'a>> {
965 map_err_at!(Self::parse_text_impl(s), s, InvalidCharData)
966 }
967
968 fn parse_text_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
969 let text = s.consume_chars(|_, c| c != '<')?;
970
971 if text.as_str().contains('>') {
976 if text.as_str().contains("]]>") {
977 return Err(StreamError::InvalidCharacterData);
978 }
979 }
980
981 Ok(Token::Text { text })
982 }
983}
984
985impl<'a> Iterator for Tokenizer<'a> {
986 type Item = Result<Token<'a>>;
987
988 #[inline]
989 fn next(&mut self) -> Option<Self::Item> {
990 let mut t = None;
991 while !self.stream.at_end() && self.state != State::End && t.is_none() {
992 t = self.parse_next_impl();
993 }
994
995 if let Some(Err(_)) = t {
996 self.stream.jump_to_end();
997 self.state = State::End;
998 }
999
1000 t
1001 }
1002}