aws_smithy_xml/
unescape.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
/*
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0
 */

use crate::decode::XmlDecodeError;
use std::borrow::Cow;

/// Unescape XML encoded characters
///
/// This function will unescape:
/// - The 5 predefined entities: `&lt;`, `&gt;`, `&amp;`, `&quot;`, and `&apos;`
/// - Decimal character references, e.g. `&#123;`
/// - Hexadecimal character references, e.g. `&#xA;`
///
/// If no escape sequences are present, `Cow::Borrowed` will be returned, avoiding the need
/// to copy the string.
///
/// # Errors
///
/// Returns an invalid-escape `XmlDecodeError` when:
/// - an `&` is not followed by a terminating `;`
/// - the text between `&` and `;` is neither a predefined entity nor a numeric reference
/// - a numeric reference is empty, carries a sign, or contains a digit outside its radix
/// - the decoded number is not a valid Unicode scalar value
pub(crate) fn unescape(s: &str) -> Result<Cow<'_, str>, XmlDecodeError> {
    // no &, no need to unescape anything
    if !s.contains('&') {
        return Ok(Cow::Borrowed(s));
    }
    // Every accepted escape is at least as long as the UTF-8 encoding of the character it
    // decodes to, so the input length is a sufficient capacity and no regrowth is needed.
    let mut res = String::with_capacity(s.len());
    // could consider memchr as performance optimization
    let mut sections = s.split('&');
    // push content before the first &
    if let Some(prefix) = sections.next() {
        res.push_str(prefix);
    }
    // each remaining section is the text that immediately followed an `&`
    for section in sections {
        // entities look like &<somedata>;
        let (entity, rest) = match section.split_once(';') {
            Some(parts) => parts,
            None => return Err(XmlDecodeError::invalid_escape("unterminated pattern")),
        };
        match entity {
            "lt" => res.push('<'),
            "gt" => res.push('>'),
            "amp" => res.push('&'),
            "quot" => res.push('"'),
            "apos" => res.push('\''),
            other => {
                let (digits, radix) = if let Some(hex) = other.strip_prefix("#x") {
                    // e.g. &#xD;
                    (hex, 16)
                } else if let Some(dec) = other.strip_prefix('#') {
                    // e.g. &#123;
                    (dec, 10)
                } else {
                    return Err(XmlDecodeError::invalid_escape(other));
                };
                // `u32::from_str_radix` tolerates a leading `+`, but an XML character
                // reference consists of digits only, so reject the sign explicitly.
                let char_code = Some(digits)
                    .filter(|d| !d.starts_with('+'))
                    .and_then(|d| u32::from_str_radix(d, radix).ok())
                    .ok_or_else(|| {
                        XmlDecodeError::invalid_escape(format!(
                            "expected numeric escape in base {}; got: {}",
                            radix, digits
                        ))
                    })?;
                // surrogates and values above U+10FFFF are not characters
                let chr = std::char::from_u32(char_code).ok_or_else(|| {
                    XmlDecodeError::invalid_escape(format!("invalid char code: {}", char_code))
                })?;
                res.push(chr);
            }
        }
        // push everything from the `;` to the next `&`
        res.push_str(rest);
    }
    Ok(Cow::Owned(res))
}

/// Unit and property tests for [`unescape`](crate::unescape::unescape).
#[cfg(test)]
mod test {
    use crate::unescape::unescape;
    use std::borrow::Cow;

    /// The five predefined entities decode to their literal characters.
    #[test]
    fn basic_unescape() {
        assert_eq!(
            unescape("&lt; &gt; &apos; &quot; &amp;").unwrap(),
            "< > ' \" &"
        );
        assert_eq!(
            unescape("Since a &gt; b, b is less than a").unwrap(),
            "Since a > b, b is less than a"
        );
    }

    /// Input without any `&` is handed back as a borrow.
    #[test]
    fn no_need_to_escape() {
        assert_eq!(unescape("hello 🍕!").unwrap(), Cow::Borrowed("hello 🍕!"));
    }

    /// Adjacent entities, stray `;` characters, and an escaped ampersand followed by
    /// entity-like text (which must be decoded only once).
    #[test]
    fn complex_unescape() {
        // Test cases adapted from Apache Commons StringEscapeUtilsTest.java
        assert_eq!(
            unescape("a&lt;b&gt;c&quot;d&apos;e&amp;f;;").unwrap(),
            "a<b>c\"d'e&f;;"
        );
        assert_eq!(unescape("&amp;lt;").unwrap(), "&lt;")
    }

    /// Decimal and hexadecimal character references.
    #[test]
    fn newline_encoding() {
        assert_eq!(unescape("&#10;").unwrap(), "\n");
        assert_eq!(unescape("&#xD;").unwrap(), "\r");
        assert_eq!(unescape("&#123;").unwrap(), "{");
    }

    /// End-of-line characters given as hex references are decoded verbatim.
    #[test]
    fn xml_eol_encoding() {
        assert_eq!(unescape("&#xA; &#xA;").unwrap(), "\n \n");
        assert_eq!(
            unescape("a&#xD;&#xA; b&#xA; c&#xD;").unwrap(),
            "a\r\n b\n c\r"
        );
        assert_eq!(
            unescape("a&#xD;&#x85; b&#x85;").unwrap(),
            "a\r\u{0085} b\u{0085}"
        );
        assert_eq!(
            unescape("a&#xD;&#x2028; b&#x85; c&#x2028;").unwrap(),
            "a\r\u{2028} b\u{0085} c\u{2028}"
        );
    }

    /// Malformed escapes are reported as errors instead of being passed through.
    #[test]
    fn invalid_escapes() {
        unescape("&lte;").expect_err("lte does not make a ≤");
        unescape("&lt").expect_err("unterminated escape sequence");
        unescape("&#Q1234;").expect_err("Q does not began a numeric sequence");
        unescape("&#3.14;").expect_err("decimal escape");
        // terminated with `;` so that the failure comes from hex parsing rather than from
        // the missing terminator
        unescape("&#xZZ;").expect_err("Z is not hex");
        unescape("&#;").expect_err("empty decimal escape");
        unescape("&#x;").expect_err("empty hex escape");
        unescape("&#x110000;").expect_err("beyond the Unicode range");
        unescape("&#xD800;").expect_err("surrogates are not characters");
        unescape("here is a & but without an escape sequence...").expect_err("naked &");
    }

    use proptest::prelude::*;
    proptest! {
        /// Arbitrary input must never panic.
        #[test]
        fn no_panics(s: String) {
            let unescaped = unescape(&s);
            // if the string contained an `&`, it either had to be rewritten into an owned
            // string or rejected; it must never come back as a borrow of the input
            if s.contains('&') {
                assert!(
                    matches!(unescaped, Ok(Cow::Owned(_)) | Err(_))
                );
            }
        }
    }
}