1use super::{unescape, EndTag, StartTag};
15use std::collections::{HashMap, VecDeque};
16use std::error::Error;
17use std::fmt;
18use std::iter::Iterator;
19use std::mem;
20
/// Events produced by the `Parser` iterator while it consumes XML input.
#[derive(PartialEq, Eq, Debug)]
pub enum Event {
    /// A processing instruction; carries the text found between `<?` and `?>`.
    PI(String),
    /// The start of an element (also emitted for self-closing elements).
    ElementStart(StartTag),
    /// The end of an element (also emitted for self-closing elements).
    ElementEnd(EndTag),
    /// Character data found outside of tags, with entities unescaped.
    Characters(String),
    /// The raw content of a `<![CDATA[ ... ]]>` section.
    CDATA(String),
    /// The text of a comment, i.e. what lies between `<!--` and `-->`.
    Comment(String),
}
37
/// Error reported by the parser, together with the position it occurred at.
#[derive(PartialEq, Debug, Clone)]
#[allow(missing_copy_implementations)]
pub struct ParserError {
    /// Line of the input on which the error was detected (the parser starts counting at 1).
    pub line: u32,
    /// Column within `line`; reset to 0 after every newline.
    pub col: u32,
    /// Static, human-readable reason for the failure.
    pub msg: &'static str,
}

impl Error for ParserError {
    /// Returns the static failure reason, without position information.
    fn description(&self) -> &str {
        let reason: &'static str = self.msg;
        reason
    }
}

impl fmt::Display for ParserError {
    /// Formats the error as a single line containing position and reason.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let ParserError { line, col, msg } = self;
        write!(
            f,
            "Parse error; Line: {}, Column: {}, Reason: {}",
            line, col, msg,
        )
    }
}
65
/// States of the parser's state machine; `Parser::parse_character` dispatches
/// every input character to the handler that belongs to the current state.
enum State {
    /// Between tags; characters are collected as text until a `<` arrives.
    OutsideTag,
    /// Directly after `<`; the next character selects the kind of markup.
    TagOpened,
    /// Inside `<? ... ?>`.
    InProcessingInstructions,
    /// Reading the name of a start tag.
    InTagName,
    /// Reading the name of an end tag (after `</`).
    InCloseTagName,
    /// Inside a start tag, after its name, between attributes.
    InTag,
    /// Reading an attribute name, up to the `=`.
    InAttrName,
    /// Reading a quoted attribute value, up to the matching delimiter.
    InAttrValue,
    /// After `=`, waiting for the opening `'` or `"` of the value.
    ExpectDelimiter,
    /// After the `/` of a self-closing tag; only `>` is accepted.
    ExpectClose,
    /// After an end tag's name; only whitespace or `>` is accepted.
    ExpectSpaceOrClose,
    /// Directly after `<!`; decides between comment, CDATA and DOCTYPE.
    InExclamationMark,
    /// Matching the `CDATA[` part of `<![CDATA[`.
    InCDATAOpening,
    /// Inside a CDATA section, looking for `]]>`.
    InCDATA,
    /// After `<!-`, waiting for the second `-`.
    InCommentOpening,
    /// Inside a comment body, looking for `--`.
    InComment1,
    /// After `--` inside a comment; only `>` is accepted.
    InComment2,
    /// Inside `<!DOCTYPE ...>`, whose content is skipped.
    InDoctype,
}
87
/// Incremental XML parser: input is supplied with `feed_str` and the
/// resulting events are pulled through the `Iterator` implementation.
pub struct Parser {
    /// Current line, starting at 1; incremented on every `'\n'`.
    line: u32,
    /// Current column; reset to 0 on every `'\n'`.
    col: u32,
    /// Set once an error has been returned; the iterator then only yields `None`.
    has_error: bool,
    /// Characters that were fed but not yet parsed.
    data: VecDeque<char>,
    /// Accumulates the token currently being read (text, name, value, ...).
    buf: String,
    /// Stack of prefix -> namespace URI maps; the first entry holds the
    /// built-in `xml` and `xmlns` bindings.
    namespaces: Vec<HashMap<String, String>>,
    /// Attributes collected for the current start tag as
    /// `(local name, prefix, value)`.
    attributes: Vec<(String, Option<String>, String)>,
    /// Current state of the state machine.
    st: State,
    /// `(prefix, local name)` of the element whose tag is being parsed.
    name: Option<(Option<String>, String)>,
    /// `(prefix, local name)` of the attribute whose value is being parsed.
    attr: Option<(Option<String>, String)>,
    /// Quote character that opened the current attribute value.
    delim: Option<char>,
    /// Small multi-purpose counter; its meaning depends on the current state
    /// (e.g. matched prefix length, or number of trailing `]` / `-` seen).
    level: u8,
}
120
121impl Parser {
122 pub fn new() -> Parser {
124 let mut ns = HashMap::with_capacity(2);
125 ns.insert(
127 "xml".to_owned(),
128 "http://www.w3.org/XML/1998/namespace".to_owned(),
129 );
130 ns.insert(
131 "xmlns".to_owned(),
132 "http://www.w3.org/2000/xmlns/".to_owned(),
133 );
134
135 Parser {
136 line: 1,
137 col: 0,
138 has_error: false,
139 data: VecDeque::with_capacity(4096),
140 buf: String::new(),
141 namespaces: vec![ns],
142 attributes: Vec::new(),
143 st: State::OutsideTag,
144 name: None,
145 attr: None,
146 delim: None,
147 level: 0,
148 }
149 }
150
151 pub fn feed_str(&mut self, data: &str) {
153 self.data.extend(data.chars());
154 }
155}
156
157impl Iterator for Parser {
158 type Item = Result<Event, ParserError>;
159
160 fn next(&mut self) -> Option<Result<Event, ParserError>> {
161 if self.has_error {
162 return None;
163 }
164
165 loop {
166 let c = match self.data.pop_front() {
167 Some(c) => c,
168 None => return None,
169 };
170
171 if c == '\n' {
172 self.line += 1;
173 self.col = 0;
174 } else {
175 self.col += 1;
176 }
177
178 match self.parse_character(c) {
179 Ok(None) => continue,
180 Ok(Some(event)) => {
181 return Some(Ok(event));
182 }
183 Err(e) => {
184 self.has_error = true;
185 return Some(Err(e));
186 }
187 }
188 }
189 }
190}
191
/// Splits a qualified name at its first `:` into `(prefix, local name)`.
///
/// A name without a colon has no prefix and is returned unchanged as the
/// local part.
#[inline]
fn parse_qname(mut qname: String) -> (Option<String>, String) {
    match qname.find(':') {
        None => (None, qname),
        Some(colon) => {
            let local = qname[colon + 1..].to_owned();
            // Keep only the prefix in the original allocation.
            qname.truncate(colon);
            (Some(qname), local)
        }
    }
}
203
204fn unescape_owned(input: String) -> Result<String, String> {
205 if input.find('&').is_none() {
206 Ok(input)
207 } else {
208 unescape(&input)
209 }
210}
211
212impl Parser {
213 fn namespace_for_prefix(&self, prefix: &str) -> Option<String> {
217 for ns in self.namespaces.iter().rev() {
218 if let Some(namespace) = ns.get(prefix) {
219 if namespace.is_empty() {
220 return None;
221 }
222 return Some(namespace.clone());
223 }
224 }
225 None
226 }
227
228 fn take_buf(&mut self) -> String {
229 self.buf.split_off(0)
230 }
231
232 fn error(&self, msg: &'static str) -> Result<Option<Event>, ParserError> {
233 Err(ParserError {
234 line: self.line,
235 col: self.col,
236 msg,
237 })
238 }
239
240 fn parse_character(&mut self, c: char) -> Result<Option<Event>, ParserError> {
241 match self.st {
243 State::OutsideTag => self.outside_tag(c),
244 State::TagOpened => self.tag_opened(c),
245 State::InProcessingInstructions => self.in_processing_instructions(c),
246 State::InTagName => self.in_tag_name(c),
247 State::InCloseTagName => self.in_close_tag_name(c),
248 State::InTag => self.in_tag(c),
249 State::InAttrName => self.in_attr_name(c),
250 State::InAttrValue => self.in_attr_value(c),
251 State::ExpectDelimiter => self.expect_delimiter(c),
252 State::ExpectClose => self.expect_close(c),
253 State::ExpectSpaceOrClose => self.expect_space_or_close(c),
254 State::InExclamationMark => self.in_exclamation_mark(c),
255 State::InCDATAOpening => self.in_cdata_opening(c),
256 State::InCDATA => self.in_cdata(c),
257 State::InCommentOpening => self.in_comment_opening(c),
258 State::InComment1 => self.in_comment1(c),
259 State::InComment2 => self.in_comment2(c),
260 State::InDoctype => self.in_doctype(c),
261 }
262 }
263
264 fn outside_tag(&mut self, c: char) -> Result<Option<Event>, ParserError> {
267 match c {
268 '<' if self.buf.is_empty() => self.st = State::TagOpened,
269 '<' => {
270 self.st = State::TagOpened;
271 let buf = match unescape_owned(self.take_buf()) {
272 Ok(unescaped) => unescaped,
273 Err(_) => return self.error("Found invalid entity"),
274 };
275 return Ok(Some(Event::Characters(buf)));
276 }
277 _ => self.buf.push(c),
278 }
279 Ok(None)
280 }
281
282 fn tag_opened(&mut self, c: char) -> Result<Option<Event>, ParserError> {
288 self.st = match c {
289 '?' => State::InProcessingInstructions,
290 '!' => State::InExclamationMark,
291 '/' => State::InCloseTagName,
292 _ => {
293 self.buf.push(c);
294 State::InTagName
295 }
296 };
297 Ok(None)
298 }
299
300 fn in_processing_instructions(&mut self, c: char) -> Result<Option<Event>, ParserError> {
303 match c {
304 '?' => {
305 self.level = 1;
306 self.buf.push(c);
307 }
308 '>' if self.level == 1 => {
309 self.level = 0;
310 self.st = State::OutsideTag;
311 let _ = self.buf.pop();
312 let buf = self.take_buf();
313 return Ok(Some(Event::PI(buf)));
314 }
315 _ => self.buf.push(c),
316 }
317 Ok(None)
318 }
319
320 fn in_tag_name(&mut self, c: char) -> Result<Option<Event>, ParserError> {
325 match c {
326 '/' | '>' => {
327 let (prefix, name) = parse_qname(self.take_buf());
328 let ns = match prefix {
329 None => self.namespace_for_prefix(""),
330 Some(ref pre) => match self.namespace_for_prefix(&pre) {
331 None => return self.error("Unbound namespace prefix in tag name"),
332 ns => ns,
333 },
334 };
335
336 self.namespaces.push(HashMap::new());
337 self.st = if c == '/' {
338 self.name = Some((prefix.clone(), name.clone()));
339 State::ExpectClose
340 } else {
341 State::OutsideTag
342 };
343
344 return Ok(Some(Event::ElementStart(StartTag {
345 name,
346 ns,
347 prefix,
348 attributes: HashMap::new(),
349 })));
350 }
351 ' ' | '\t' | '\r' | '\n' => {
352 self.namespaces.push(HashMap::new());
353 self.name = Some(parse_qname(self.take_buf()));
354 self.st = State::InTag;
355 }
356 _ => self.buf.push(c),
357 }
358 Ok(None)
359 }
360
361 fn in_close_tag_name(&mut self, c: char) -> Result<Option<Event>, ParserError> {
365 match c {
366 ' ' | '\t' | '\r' | '\n' | '>' => {
367 let (prefix, name) = parse_qname(self.take_buf());
368
369 let ns = match prefix {
370 None => self.namespace_for_prefix(""),
371 Some(ref pre) => match self.namespace_for_prefix(&pre) {
372 None => return self.error("Unbound namespace prefix in tag name"),
373 ns => ns,
374 },
375 };
376
377 self.namespaces.pop();
378 self.st = if c == '>' {
379 State::OutsideTag
380 } else {
381 State::ExpectSpaceOrClose
382 };
383
384 Ok(Some(Event::ElementEnd(EndTag { name, ns, prefix })))
385 }
386 _ => {
387 self.buf.push(c);
388 Ok(None)
389 }
390 }
391 }
392
393 fn in_tag(&mut self, c: char) -> Result<Option<Event>, ParserError> {
398 match c {
399 '/' | '>' => {
400 let attributes = mem::replace(&mut self.attributes, Vec::new());
401 let (prefix, name) = self
402 .name
403 .take()
404 .expect("Internal error: No element name set");
405 let ns = match prefix {
406 None => self.namespace_for_prefix(""),
407 Some(ref pre) => match self.namespace_for_prefix(&pre) {
408 None => return self.error("Unbound namespace prefix in tag name"),
409 ns => ns,
410 },
411 };
412
413 let mut attributes_map: HashMap<(String, Option<String>), String> = HashMap::new();
414
415 for (name, ns, value) in attributes {
418 let ns = match ns {
419 None => None,
420 Some(ref prefix) => match self.namespace_for_prefix(&prefix) {
421 None => {
422 return self.error("Unbound namespace prefix in attribute name")
423 }
424 ns => ns,
425 },
426 };
427 if attributes_map.insert((name, ns), value).is_some() {
428 return self.error("Duplicate attribute");
429 }
430 }
431
432 self.st = if c == '/' {
433 self.name = Some((prefix.clone(), name.clone()));
434 State::ExpectClose
435 } else {
436 State::OutsideTag
437 };
438
439 return Ok(Some(Event::ElementStart(StartTag {
440 name,
441 ns,
442 prefix,
443 attributes: attributes_map,
444 })));
445 }
446 ' ' | '\t' | '\r' | '\n' => (),
447 _ => {
448 self.buf.push(c);
449 self.st = State::InAttrName;
450 }
451 }
452 Ok(None)
453 }
454
455 fn in_attr_name(&mut self, c: char) -> Result<Option<Event>, ParserError> {
458 match c {
459 '=' => {
460 self.level = 0;
461 self.attr = Some(parse_qname(self.take_buf()));
462 self.st = State::ExpectDelimiter;
463 }
464 ' ' | '\t' | '\r' | '\n' => self.level = 1,
465 _ if self.level == 0 => self.buf.push(c),
466 _ => return self.error("Space occured in attribute name"),
467 }
468 Ok(None)
469 }
470
471 fn in_attr_value(&mut self, c: char) -> Result<Option<Event>, ParserError> {
474 if c == self
475 .delim
476 .expect("Internal error: In attribute value, but no delimiter set")
477 {
478 self.delim = None;
479 self.st = State::InTag;
480 let attr = self.attr.take();
481 let (prefix, name) =
482 attr.expect("Internal error: In attribute value, but no attribute name set");
483 let value = match unescape_owned(self.take_buf()) {
484 Ok(unescaped) => unescaped,
485 Err(_) => return self.error("Found invalid entity"),
486 };
487
488 let last = self
489 .namespaces
490 .last_mut()
491 .expect("Internal error: Empty namespace stack");
492 match prefix {
493 None if name == "xmlns" => {
494 last.insert(String::new(), value.clone());
495 }
496 Some(ref prefix) if prefix == "xmlns" => {
497 last.insert(name.clone(), value.clone());
498 }
499 _ => (),
500 }
501
502 self.attributes.push((name, prefix, value));
503 } else {
504 self.buf.push(c);
505 }
506 Ok(None)
507 }
508
509 fn expect_delimiter(&mut self, c: char) -> Result<Option<Event>, ParserError> {
512 match c {
513 '"' | '\'' => {
514 self.delim = Some(c);
515 self.st = State::InAttrValue;
516 }
517 ' ' | '\t' | '\r' | '\n' => (),
518 _ => return self.error("Attribute value not enclosed in ' or \""),
519 }
520 Ok(None)
521 }
522
523 fn expect_close(&mut self, c: char) -> Result<Option<Event>, ParserError> {
526 match c {
527 '>' => {
528 self.st = State::OutsideTag;
529 let (prefix, name) = self
530 .name
531 .take()
532 .expect("Internal error: No element name set");
533 let ns = match prefix {
534 None => self.namespace_for_prefix(""),
535 Some(ref pre) => match self.namespace_for_prefix(&pre) {
536 None => return self.error("Unbound namespace prefix in tag name"),
537 ns => ns,
538 },
539 };
540 self.namespaces.pop();
541 Ok(Some(Event::ElementEnd(EndTag { name, ns, prefix })))
542 }
543 _ => self.error("Expected '>' to close tag"),
544 }
545 }
546
547 fn expect_space_or_close(&mut self, c: char) -> Result<Option<Event>, ParserError> {
550 match c {
551 ' ' | '\t' | '\r' | '\n' => Ok(None),
552 '>' => {
553 self.st = State::OutsideTag;
554 Ok(None)
555 }
556 _ => self.error("Expected '>' to close tag, or LWS"),
557 }
558 }
559
560 fn in_exclamation_mark(&mut self, c: char) -> Result<Option<Event>, ParserError> {
565 self.st = match c {
566 '-' => State::InCommentOpening,
567 '[' => State::InCDATAOpening,
568 'D' => State::InDoctype,
569 _ => return self.error("Malformed XML"),
570 };
571 Ok(None)
572 }
573
574 fn in_cdata_opening(&mut self, c: char) -> Result<Option<Event>, ParserError> {
577 static CDATA_PATTERN: [char; 6] = ['C', 'D', 'A', 'T', 'A', '['];
578 if c == CDATA_PATTERN[self.level as usize] {
579 self.level += 1;
580 } else {
581 return self.error("Invalid CDATA opening sequence");
582 }
583
584 if self.level == 6 {
585 self.level = 0;
586 self.st = State::InCDATA;
587 }
588 Ok(None)
589 }
590
591 fn in_cdata(&mut self, c: char) -> Result<Option<Event>, ParserError> {
594 match c {
595 ']' => {
596 self.buf.push(c);
597 self.level += 1;
598 }
599 '>' if self.level >= 2 => {
600 self.st = State::OutsideTag;
601 self.level = 0;
602 let len = self.buf.len();
603 self.buf.truncate(len - 2);
604 let buf = self.take_buf();
605 return Ok(Some(Event::CDATA(buf)));
606 }
607 _ => {
608 self.buf.push(c);
609 self.level = 0;
610 }
611 }
612 Ok(None)
613 }
614
615 fn in_comment_opening(&mut self, c: char) -> Result<Option<Event>, ParserError> {
618 if c == '-' {
619 self.st = State::InComment1;
620 self.level = 0;
621 Ok(None)
622 } else {
623 self.error("Expected 2nd '-' to start comment")
624 }
625 }
626
627 fn in_comment1(&mut self, c: char) -> Result<Option<Event>, ParserError> {
630 if c == '-' {
631 self.level += 1;
632 } else {
633 self.level = 0;
634 }
635
636 if self.level == 2 {
637 self.level = 0;
638 self.st = State::InComment2;
639 }
640
641 self.buf.push(c);
642
643 Ok(None)
644 }
645
646 fn in_comment2(&mut self, c: char) -> Result<Option<Event>, ParserError> {
649 if c != '>' {
650 self.error("No more than one adjacent '-' allowed in a comment")
651 } else {
652 self.st = State::OutsideTag;
653 let len = self.buf.len();
654 self.buf.truncate(len - 2);
655 let buf = self.take_buf();
656 Ok(Some(Event::Comment(buf)))
657 }
658 }
659
660 fn in_doctype(&mut self, c: char) -> Result<Option<Event>, ParserError> {
663 static DOCTYPE_PATTERN: [char; 6] = ['O', 'C', 'T', 'Y', 'P', 'E'];
664 match self.level {
665 0..=5 => {
666 if c == DOCTYPE_PATTERN[self.level as usize] {
667 self.level += 1;
668 } else {
669 return self.error("Invalid DOCTYPE");
670 }
671 }
672 6 => {
673 match c {
674 ' ' | '\t' | '\r' | '\n' => (),
675 _ => return self.error("Invalid DOCTYPE"),
676 }
677 self.level += 1;
678 }
679 _ if c == '>' => {
680 self.level = 0;
681 self.st = State::OutsideTag;
682 }
683 _ => (),
684 }
685 Ok(None)
686 }
687}
688
#[cfg(test)]
mod parser_tests {
    use std::collections::HashMap;

    use super::super::{EndTag, Event, ParserError, StartTag};
    use super::Parser;

    type Parsed = Result<Event, ParserError>;
    type Attributes = HashMap<(String, Option<String>), String>;

    /// Feeds `input` to a fresh parser and collects everything it yields.
    fn parse_all(input: &str) -> Vec<Parsed> {
        let mut parser = Parser::new();
        parser.feed_str(input);
        parser.collect()
    }

    /// Builds the expected `ElementStart` event.
    fn start(name: &str, ns: Option<&str>, prefix: Option<&str>, attributes: Attributes) -> Parsed {
        Ok(Event::ElementStart(StartTag {
            name: name.to_owned(),
            ns: ns.map(str::to_owned),
            prefix: prefix.map(str::to_owned),
            attributes,
        }))
    }

    /// Builds the expected `ElementEnd` event.
    fn end(name: &str, ns: Option<&str>, prefix: Option<&str>) -> Parsed {
        Ok(Event::ElementEnd(EndTag {
            name: name.to_owned(),
            ns: ns.map(str::to_owned),
            prefix: prefix.map(str::to_owned),
        }))
    }

    #[test]
    fn test_start_tag() {
        assert_eq!(
            parse_all("<a>"),
            vec![start("a", None, None, HashMap::new())],
        );
    }

    #[test]
    fn test_end_tag() {
        assert_eq!(parse_all("</a>"), vec![end("a", None, None)]);
    }

    #[test]
    fn test_self_closing_with_space() {
        assert_eq!(
            parse_all("<register />"),
            vec![
                start("register", None, None, HashMap::new()),
                end("register", None, None),
            ],
        );
    }

    #[test]
    fn test_self_closing_without_space() {
        assert_eq!(
            parse_all("<register/>"),
            vec![
                start("register", None, None, HashMap::new()),
                end("register", None, None),
            ],
        );
    }

    #[test]
    fn test_self_closing_namespace() {
        let mut attr: Attributes = HashMap::new();
        attr.insert(
            (
                "foo".to_owned(),
                Some("http://www.w3.org/2000/xmlns/".to_owned()),
            ),
            "urn:foo".to_owned(),
        );

        assert_eq!(
            parse_all("<foo:a xmlns:foo='urn:foo'/>"),
            vec![
                start("a", Some("urn:foo"), Some("foo"), attr),
                end("a", Some("urn:foo"), Some("foo")),
            ],
        );
    }

    #[test]
    fn test_pi() {
        assert_eq!(
            parse_all("<?xml version='1.0' encoding='utf-8'?>"),
            vec![Ok(Event::PI(
                "xml version='1.0' encoding='utf-8'".to_owned()
            ))],
        );
    }

    #[test]
    fn test_comment() {
        assert_eq!(
            parse_all("<!--Nothing to see-->"),
            vec![Ok(Event::Comment("Nothing to see".to_owned()))],
        );
    }

    #[test]
    fn test_cdata() {
        assert_eq!(
            parse_all("<![CDATA[<html><head><title>x</title></head><body/></html>]]>"),
            vec![Ok(Event::CDATA(
                "<html><head><title>x</title></head><body/></html>".to_owned()
            ))],
        );
    }

    #[test]
    fn test_characters() {
        let events = parse_all("<text>Hello World, it's a nice day</text>");
        // Only the text event in the middle is checked in detail.
        assert_eq!(events.len(), 3);
        assert_eq!(
            events[1],
            Ok(Event::Characters("Hello World, it's a nice day".to_owned())),
        );
    }

    #[test]
    fn test_doctype() {
        // A DOCTYPE declaration is skipped without producing any event.
        assert_eq!(parse_all("<!DOCTYPE html>").len(), 0);
    }
}
885}