aws_smithy_xml/
decode.rs

1/*
2 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 * SPDX-License-Identifier: Apache-2.0
4 */
5
6use crate::unescape::unescape;
7use std::borrow::Cow;
8use std::convert::TryFrom;
9use std::error::Error;
10use std::fmt::{Display, Formatter};
11use xmlparser::{ElementEnd, Token, Tokenizer};
12
/// Nesting level of a token within a document.
///
/// The start and end tokens of the root element are reported at depth 0.
pub type Depth = usize;
14
15// in general, these errors are just for reporting what happened, there isn't
16// much value in lots of different match variants
17
18#[derive(Debug)]
19enum XmlDecodeErrorKind {
20    InvalidXml(xmlparser::Error),
21    InvalidEscape { esc: String },
22    Custom(Cow<'static, str>),
23    Unhandled(Box<dyn std::error::Error + Send + Sync + 'static>),
24}
25
26#[derive(Debug)]
27pub struct XmlDecodeError {
28    kind: XmlDecodeErrorKind,
29}
30
31impl Display for XmlDecodeError {
32    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
33        match &self.kind {
34            XmlDecodeErrorKind::InvalidXml(_) => write!(f, "XML parse error"),
35            XmlDecodeErrorKind::InvalidEscape { esc } => write!(f, "invalid XML escape: {}", esc),
36            XmlDecodeErrorKind::Custom(msg) => write!(f, "error parsing XML: {}", msg),
37            XmlDecodeErrorKind::Unhandled(_) => write!(f, "error parsing XML"),
38        }
39    }
40}
41
42impl Error for XmlDecodeError {
43    fn source(&self) -> Option<&(dyn Error + 'static)> {
44        match &self.kind {
45            XmlDecodeErrorKind::InvalidXml(source) => Some(source as _),
46            XmlDecodeErrorKind::Unhandled(source) => Some(source.as_ref() as _),
47            XmlDecodeErrorKind::InvalidEscape { .. } | XmlDecodeErrorKind::Custom(..) => None,
48        }
49    }
50}
51
52impl XmlDecodeError {
53    pub(crate) fn invalid_xml(error: xmlparser::Error) -> Self {
54        Self {
55            kind: XmlDecodeErrorKind::InvalidXml(error),
56        }
57    }
58
59    pub(crate) fn invalid_escape(esc: impl Into<String>) -> Self {
60        Self {
61            kind: XmlDecodeErrorKind::InvalidEscape { esc: esc.into() },
62        }
63    }
64
65    pub fn custom(msg: impl Into<Cow<'static, str>>) -> Self {
66        Self {
67            kind: XmlDecodeErrorKind::Custom(msg.into()),
68        }
69    }
70
71    pub fn unhandled(error: impl Into<Box<dyn Error + Send + Sync + 'static>>) -> Self {
72        Self {
73            kind: XmlDecodeErrorKind::Unhandled(error.into()),
74        }
75    }
76}
77
/// An XML name split into its prefix and local part.
///
/// For `<foo:bar>` the prefix is `foo` and the local part is `bar`; an
/// unprefixed name has an empty prefix.
#[derive(Debug, PartialEq)]
pub struct Name<'a> {
    /// Part before the `:` (empty when the name has no prefix)
    pub prefix: &'a str,
    /// Part after the `:` (the whole name when there is no prefix)
    pub local: &'a str,
}

impl Name<'_> {
    /// Check whether this name matches `tag_name`, given as `prefix:local` or just `local`
    ///
    /// When `tag_name` contains no `:` only the local part is compared and this
    /// name's prefix is ignored. Otherwise `tag_name` is split at its first `:`
    /// and both parts must be equal.
    pub fn matches(&self, tag_name: &str) -> bool {
        // `splitn(2, ..)` always yields at least one piece; a second piece
        // exists only when `tag_name` contains a `:`.
        let mut pieces = tag_name.splitn(2, ':');
        match (pieces.next(), pieces.next()) {
            (Some(prefix), Some(local)) => self.prefix == prefix && self.local == local,
            _ => self.local == tag_name,
        }
    }
}
98
99#[derive(Debug, PartialEq)]
100pub struct Attr<'a> {
101    name: Name<'a>,
102    // attribute values can be escaped (e.g. with double quotes, so we need a Cow)
103    value: Cow<'a, str>,
104}
105
106#[derive(Debug, PartialEq)]
107pub struct StartEl<'a> {
108    name: Name<'a>,
109    attributes: Vec<Attr<'a>>,
110    closed: bool,
111    depth: Depth,
112}
113
114/// Xml Start Element
115///
116/// ```xml
117/// <a:b   c="d">
118///  ^^^   ^^^^^
119///  name  attributes
120/// ```
121impl<'a> StartEl<'a> {
122    pub fn depth(&self) -> Depth {
123        self.depth
124    }
125
126    fn new(local: &'a str, prefix: &'a str, depth: Depth) -> Self {
127        Self {
128            name: Name { prefix, local },
129            attributes: vec![],
130            closed: false,
131            depth,
132        }
133    }
134
135    /// Retrieve an attribute with a given key
136    ///
137    /// key `prefix:local` combined as a str, joined by a `:`
138    pub fn attr<'b>(&'b self, key: &'b str) -> Option<&'b str> {
139        self.attributes
140            .iter()
141            .find(|attr| attr.name.matches(key))
142            .map(|attr| attr.value.as_ref())
143    }
144
145    /// Returns whether this `StartEl` matches a given name
146    /// in `prefix:local` form.
147    pub fn matches(&self, pat: &str) -> bool {
148        self.name.matches(pat)
149    }
150
151    /// Local component of this element's name
152    ///
153    /// ```xml
154    /// <foo:bar>
155    ///      ^^^
156    /// ```
157    pub fn local(&self) -> &str {
158        self.name.local
159    }
160
161    /// Prefix component of this elements name (or empty string)
162    /// ```xml
163    /// <foo:bar>
164    ///  ^^^
165    /// ```
166    pub fn prefix(&self) -> &str {
167        self.name.prefix
168    }
169
170    /// Returns true of `el` at `depth` is a match for this `start_el`
171    fn end_el(&self, el: ElementEnd<'_>, depth: Depth) -> bool {
172        if depth != self.depth {
173            return false;
174        }
175        match el {
176            ElementEnd::Open => false,
177            ElementEnd::Close(prefix, local) => {
178                prefix.as_str() == self.name.prefix && local.as_str() == self.name.local
179            }
180            ElementEnd::Empty => false,
181        }
182    }
183}
184
185/// Xml Document abstraction
186///
187/// This document wraps a lazy tokenizer with depth tracking.
188/// Constructing a document is essentially free.
189pub struct Document<'a> {
190    tokenizer: Tokenizer<'a>,
191    depth: Depth,
192}
193
194impl<'a> TryFrom<&'a [u8]> for Document<'a> {
195    type Error = XmlDecodeError;
196
197    fn try_from(value: &'a [u8]) -> Result<Self, Self::Error> {
198        Ok(Document::new(
199            std::str::from_utf8(value).map_err(XmlDecodeError::unhandled)?,
200        ))
201    }
202}
203
204impl<'inp> Document<'inp> {
205    pub fn new(doc: &'inp str) -> Self {
206        Document {
207            tokenizer: Tokenizer::from(doc),
208            depth: 0,
209        }
210    }
211
212    /// "Depth first" iterator
213    ///
214    /// Unlike [`next_tag()`](ScopedDecoder::next_tag), this method returns the next
215    /// start element regardless of depth. This is useful to give a pointer into the middle
216    /// of a document to start reading.
217    ///
218    /// ```xml
219    /// <Response> <-- first call returns this:
220    ///    <A> <-- next call
221    ///      <Nested /> <-- next call returns this
222    ///      <MoreNested>hello</MoreNested> <-- then this:
223    ///    </A>
224    ///    <B/> <-- second call to next_tag returns this
225    /// </Response>
226    /// ```
227    pub fn next_start_element<'a>(&'a mut self) -> Option<StartEl<'inp>> {
228        next_start_element(self)
229    }
230
231    /// A scoped reader for the entire document
232    pub fn root_element<'a>(&'a mut self) -> Result<ScopedDecoder<'inp, 'a>, XmlDecodeError> {
233        let start_el = self
234            .next_start_element()
235            .ok_or_else(|| XmlDecodeError::custom("no root element"))?;
236        Ok(ScopedDecoder {
237            doc: self,
238            start_el,
239            terminated: false,
240        })
241    }
242
243    /// A scoped reader for a specific tag
244    ///
245    /// This method is necessary for when you need to return a ScopedDecoder from a function
246    /// since normally the stacked-ownership that `next_tag()` uses would prevent returning a reference
247    /// to a field owned by the current function
248    pub fn scoped_to<'a>(&'a mut self, start_el: StartEl<'inp>) -> ScopedDecoder<'inp, 'a> {
249        ScopedDecoder {
250            doc: self,
251            start_el,
252            terminated: false,
253        }
254    }
255}
256
257/// A new-type wrapper around `Token` to prevent the wrapped third party type from showing up in
258/// public API
259#[derive(Debug)]
260pub struct XmlToken<'inp>(Token<'inp>);
261
262/// Depth tracking iterator
263///
264/// ```xml
265/// <a> <- startel depth 0
266///   <b> <- startel depth 1
267///     <c> <- startel depth 2
268///     </c> <- endel depth 2
269///   </b> <- endel depth 1
270/// </a> <- endel depth 0
271/// ```
272impl<'inp> Iterator for Document<'inp> {
273    type Item = Result<(XmlToken<'inp>, Depth), XmlDecodeError>;
274    fn next<'a>(&'a mut self) -> Option<Result<(XmlToken<'inp>, Depth), XmlDecodeError>> {
275        let tok = self.tokenizer.next()?;
276        let tok = match tok {
277            Err(e) => return Some(Err(XmlDecodeError::invalid_xml(e))),
278            Ok(tok) => tok,
279        };
280        // depth bookkeeping
281        match tok {
282            Token::ElementEnd {
283                end: ElementEnd::Close(_, _),
284                ..
285            } => {
286                self.depth -= 1;
287            }
288            Token::ElementEnd {
289                end: ElementEnd::Empty,
290                ..
291            } => self.depth -= 1,
292            t @ Token::ElementStart { .. } => {
293                self.depth += 1;
294                // We want the startel and endel to have the same depth, but after the opener,
295                // the parser will be at depth 1. Return the previous depth:
296                return Some(Ok((XmlToken(t), self.depth - 1)));
297            }
298            _ => {}
299        }
300        Some(Ok((XmlToken(tok), self.depth)))
301    }
302}
303
304/// XmlTag Abstraction
305///
306/// ScopedDecoder represents a tag-scoped view into an XML document. Methods
307/// on `ScopedDecoder` return `None` when the current tag has been exhausted.
308pub struct ScopedDecoder<'inp, 'a> {
309    doc: &'a mut Document<'inp>,
310    start_el: StartEl<'inp>,
311    terminated: bool,
312}
313
314/// When a scoped decoder is dropped, its entire scope is consumed so that the
315/// next read begins at the next tag at the same depth.
316impl Drop for ScopedDecoder<'_, '_> {
317    fn drop(&mut self) {
318        for _ in self {}
319    }
320}
321
322impl<'inp> ScopedDecoder<'inp, '_> {
323    /// The start element for this scope
324    pub fn start_el<'a>(&'a self) -> &'a StartEl<'inp> {
325        &self.start_el
326    }
327
328    /// Returns the next top-level tag in this scope
329    /// The returned reader will fully read the tag during its lifetime. If it is dropped without
330    /// the data being read, the reader will be advanced until the matching close tag. If you read
331    /// an element with `next_tag()` and you want to ignore it, simply drop the resulting `ScopeDecoder`.
332    ///
333    /// ```xml
334    /// <Response> <-- scoped reader on this tag
335    ///    <A> <-- first call to next_tag returns this
336    ///      <Nested /> <-- to get inner data, call `next_tag` on the returned decoder for `A`
337    ///      <MoreNested>hello</MoreNested>
338    ///    </A>
339    ///    <B/> <-- second call to next_tag returns this
340    /// </Response>
341    /// ```
342    pub fn next_tag<'a>(&'a mut self) -> Option<ScopedDecoder<'inp, 'a>> {
343        let next_tag = next_start_element(self)?;
344        Some(self.nested_decoder(next_tag))
345    }
346
347    fn nested_decoder<'a>(&'a mut self, start_el: StartEl<'inp>) -> ScopedDecoder<'inp, 'a> {
348        ScopedDecoder {
349            doc: self.doc,
350            start_el,
351            terminated: false,
352        }
353    }
354}
355
356impl<'inp, 'a> Iterator for ScopedDecoder<'inp, 'a> {
357    type Item = Result<(XmlToken<'inp>, Depth), XmlDecodeError>;
358
359    fn next(&mut self) -> Option<Self::Item> {
360        if self.start_el.closed {
361            self.terminated = true;
362        }
363        if self.terminated {
364            return None;
365        }
366        let (tok, depth) = match self.doc.next() {
367            Some(Ok((tok, depth))) => (tok, depth),
368            other => return other,
369        };
370
371        match tok.0 {
372            Token::ElementEnd { end, .. } if self.start_el.end_el(end, depth) => {
373                self.terminated = true;
374                return None;
375            }
376            _ => {}
377        }
378        Some(Ok((tok, depth)))
379    }
380}
381
382/// Load the next start element out of a depth-tagged token iterator
383fn next_start_element<'a, 'inp>(
384    tokens: &'a mut impl Iterator<Item = Result<(XmlToken<'inp>, Depth), XmlDecodeError>>,
385) -> Option<StartEl<'inp>> {
386    let mut out = StartEl::new("", "", 0);
387    loop {
388        match tokens.next()? {
389            Ok((XmlToken(Token::ElementStart { local, prefix, .. }), depth)) => {
390                out.name.local = local.as_str();
391                out.name.prefix = prefix.as_str();
392                out.depth = depth;
393            }
394            Ok((
395                XmlToken(Token::Attribute {
396                    prefix,
397                    local,
398                    value,
399                    ..
400                }),
401                _,
402            )) => out.attributes.push(Attr {
403                name: Name {
404                    local: local.as_str(),
405                    prefix: prefix.as_str(),
406                },
407                value: unescape(value.as_str()).ok()?,
408            }),
409            Ok((
410                XmlToken(Token::ElementEnd {
411                    end: ElementEnd::Open,
412                    ..
413                }),
414                _,
415            )) => break,
416            Ok((
417                XmlToken(Token::ElementEnd {
418                    end: ElementEnd::Empty,
419                    ..
420                }),
421                _,
422            )) => {
423                out.closed = true;
424                break;
425            }
426            _ => {}
427        }
428    }
429    Some(out)
430}
431
432/// Returns the data element at the current position
433///
434/// If the current position is not a data element (and is instead a `<start-element>`) an error
435/// will be returned
436pub fn try_data<'a, 'inp>(
437    tokens: &'a mut impl Iterator<Item = Result<(XmlToken<'inp>, Depth), XmlDecodeError>>,
438) -> Result<Cow<'inp, str>, XmlDecodeError> {
439    loop {
440        match tokens.next().map(|opt| opt.map(|opt| opt.0)) {
441            None => return Ok(Cow::Borrowed("")),
442            Some(Ok(XmlToken(Token::Text { text }))) => return unescape(text.as_str()),
443            Some(Ok(e @ XmlToken(Token::ElementStart { .. }))) => {
444                return Err(XmlDecodeError::custom(format!(
445                    "looking for a data element, found: {:?}",
446                    e
447                )))
448            }
449            Some(Err(e)) => return Err(e),
450            _ => {}
451        }
452    }
453}
454
// Unit tests for the decoder. Several of them rely on a `ScopedDecoder`
// consuming the rest of its scope when it is dropped, so the exact point at
// which temporaries are dropped matters.
#[cfg(test)]
mod test {
    use crate::decode::{try_data, Attr, Depth, Document, Name, StartEl};

    // Test helper: builds the `StartEl` expected for a self-closing element.
    fn closed<'a>(local: &'a str, prefix: &'a str, depth: Depth) -> StartEl<'a> {
        let mut s = StartEl::new(local, prefix, depth);
        s.closed = true;
        s
    }

    // The root scope yields its single child and then ends.
    #[test]
    fn scoped_tokens() {
        let xml = r#"<Response><A></A></Response>"#;
        let mut doc = Document::new(xml);
        let mut root = doc.root_element().expect("valid document");
        assert_eq!(root.start_el().local(), "Response");
        assert_eq!(root.next_tag().expect("tag exists").start_el().local(), "A");
        assert!(root.next_tag().is_none());
    }

    // A nested element with the same name as the root must not end the root scope.
    #[test]
    fn handle_depth_properly() {
        let xml = r#"<Response><Response></Response><A/></Response>"#;
        let mut doc = Document::new(xml);
        let mut scoped = doc.root_element().expect("valid document");
        assert_eq!(
            scoped.next_tag().unwrap().start_el(),
            &StartEl::new("Response", "", 1)
        );
        let closed_a = closed("A", "", 1);
        assert_eq!(scoped.next_tag().unwrap().start_el(), &closed_a);
        assert!(scoped.next_tag().is_none())
    }

    // A self-closing root is marked closed and has no children.
    #[test]
    fn self_closing() {
        let xml = r#"<Response/>"#;
        let mut doc = Document::new(xml);
        let mut scoped = doc.root_element().expect("valid doc");
        assert!(scoped.start_el.closed);
        assert!(scoped.next_tag().is_none())
    }

    #[test]
    fn terminate_scope() {
        let xml = r#"<Response><Struct><A></A><Also/></Struct><More/></Response>"#;
        let mut doc = Document::new(xml);
        let mut response_iter = doc.root_element().expect("valid doc");
        let mut struct_iter = response_iter.next_tag().unwrap();
        assert_eq!(
            struct_iter.next_tag().as_ref().map(|t| t.start_el()),
            Some(&StartEl::new("A", "", 2))
        );
        // When the inner iterator is dropped it reads to the end of its scope.
        // This guards against accidentally leaving a node only partly read.
        drop(struct_iter);
        assert_eq!(
            response_iter.next_tag().unwrap().start_el(),
            &closed("More", "", 1)
        );
    }

    // Asking for data where a start element comes first is an error.
    #[test]
    fn read_data_invalid() {
        let xml = r#"<Response><A></A></Response>"#;
        let mut doc = Document::new(xml);
        let mut resp = doc.root_element().unwrap();
        try_data(&mut resp).expect_err("no data");
    }

    // Text directly inside the scope is returned as data.
    #[test]
    fn read_data() {
        let xml = r#"<Response>hello</Response>"#;
        let mut doc = Document::new(xml);
        let mut scoped = doc.root_element().unwrap();
        assert_eq!(try_data(&mut scoped).unwrap(), "hello");
    }

    /// Whitespace within an element is preserved
    #[test]
    fn read_data_whitespace() {
        let xml = r#"<Response> hello </Response>"#;
        let mut doc = Document::new(xml);
        let mut scoped = doc.root_element().unwrap();
        assert_eq!(try_data(&mut scoped).unwrap(), " hello ");
    }

    // Whitespace between tags is skipped when looking for the next tag, while
    // the text inside `<A>` is still returned unchanged.
    #[test]
    fn ignore_insignificant_whitespace() {
        let xml = r#"<Response>   <A>  </A>    </Response>"#;
        let mut doc = Document::new(xml);
        let mut resp = doc.root_element().unwrap();
        let mut a = resp.next_tag().expect("should be a");
        let data = try_data(&mut a).expect("valid");
        assert_eq!(data, "  ");
    }

    // Attributes are collected with their prefix and local name kept separate.
    #[test]
    fn read_attributes() {
        let xml = r#"<Response xsi:type="CanonicalUser">hello</Response>"#;
        let mut tokenizer = Document::new(xml);
        let root = tokenizer.root_element().unwrap();

        assert_eq!(
            root.start_el().attributes,
            vec![Attr {
                name: Name {
                    prefix: "xsi",
                    local: "type"
                },
                value: "CanonicalUser".into()
            }]
        )
    }

    // Both element text and attribute values are unescaped.
    #[test]
    fn unescape_data() {
        let xml = r#"<Response key="&quot;hey&quot;>">&gt;</Response>"#;
        let mut doc = Document::new(xml);
        let mut root = doc.root_element().unwrap();
        assert_eq!(try_data(&mut root).unwrap(), ">");
        assert_eq!(root.start_el().attr("key"), Some("\"hey\">"));
    }

    // A self-closing child has no children of its own, and dropping its
    // decoder leaves the following sibling readable.
    #[test]
    fn nested_self_closer() {
        let xml = r#"<XmlListsInputOutput>
                <stringList/>
                <stringSet></stringSet>
        </XmlListsInputOutput>"#;
        let mut doc = Document::new(xml);
        let mut root = doc.root_element().unwrap();
        let mut string_list = root.next_tag().unwrap();
        assert_eq!(string_list.start_el(), &closed("stringList", "", 1));
        assert!(string_list.next_tag().is_none());
        drop(string_list);
        assert_eq!(
            root.next_tag().unwrap().start_el(),
            &StartEl::new("stringSet", "", 1)
        );
    }

    #[test]
    fn confusing_nested_same_name_tag() {
        // An inner `b` which could be mistaken for the close of the outer `b`
        // if depth is not properly tracked:
        let root_tags = &["a", "b", "c", "d"];
        let xml = r#"<XmlListsInputOutput>
                <a/>
                <b>
                  <c/>
                  <b></b>
                  <here/>
                </b>
                <c></c>
                <d>more</d>
        </XmlListsInputOutput>"#;
        let mut doc = Document::new(xml);
        let mut root = doc.root_element().unwrap();
        let mut cmp = vec![];
        // Each `tag` is dropped at the end of its iteration, which skips the
        // rest of that element before the next sibling is read.
        while let Some(tag) = root.next_tag() {
            cmp.push(tag.start_el().local().to_owned());
        }
        assert_eq!(root_tags, cmp.as_slice());
    }
}