http_types/mime/
parse.rs

1use std::borrow::Cow;
2use std::fmt;
3
4use super::{Mime, ParamName, ParamValue};
5
6/// Parse a string into a mime type.
7/// Follows the [WHATWG MIME parsing algorithm](https://mimesniff.spec.whatwg.org/#parsing-a-mime-type)
8pub(crate) fn parse(input: &str) -> crate::Result<Mime> {
9    // 1
10    let input = input.trim_matches(is_http_whitespace_char);
11
12    // 3.
13    let (basetype, input) = collect_code_point_sequence_char(input, '/');
14
15    // 4.
16    crate::ensure!(!basetype.is_empty(), "MIME type should not be empty");
17    crate::ensure!(
18        basetype.chars().all(is_http_token_code_point),
19        "MIME type should ony contain valid HTTP token code points"
20    );
21
22    // 5.
23    crate::ensure!(!input.is_empty(), "MIME must contain a sub type");
24
25    // 6.
26    let input = &input[1..];
27
28    // 7.
29    let (subtype, input) = collect_code_point_sequence_char(input, ';');
30
31    // 8.
32    let subtype = subtype.trim_end_matches(is_http_whitespace_char);
33
34    // 9.
35    crate::ensure!(!subtype.is_empty(), "MIME sub type should not be empty");
36    crate::ensure!(
37        subtype.chars().all(is_http_token_code_point),
38        "MIME sub type should ony contain valid HTTP token code points"
39    );
40
41    // 10.
42    let basetype = basetype.to_ascii_lowercase();
43    let subtype = subtype.to_ascii_lowercase();
44    let mut params = vec![];
45    let mut is_utf8 = false;
46
47    // 11.
48    let mut input = input;
49    while !input.is_empty() {
50        // 1.
51        input = &input[1..];
52
53        // 2.
54        input = input.trim_start_matches(is_http_whitespace_char);
55
56        // 3.
57        let (parameter_name, new_input) =
58            collect_code_point_sequence_slice(input, &[';', '='] as &[char]);
59        input = new_input;
60
61        // 4.
62        let parameter_name = parameter_name.to_ascii_lowercase();
63
64        if input.is_empty() {
65            // 6.
66            break;
67        } else {
68            // 5.
69            if input.starts_with(';') {
70                continue;
71            } else {
72                // It's a '='
73                input = &input[1..];
74            }
75        }
76
77        let parameter_value = if input.starts_with('"') {
78            // 8.
79            // implementation of https://fetch.spec.whatwg.org/#collect-an-http-quoted-string
80            let (parameter_value, new_input) = collect_http_quoted_string(input);
81            let (_, new_input) = collect_code_point_sequence_char(new_input, ';');
82            input = new_input;
83            parameter_value
84        } else {
85            // 9.
86            let (parameter_value, new_input) = collect_code_point_sequence_char(input, ';');
87            input = new_input;
88            let parameter_value = parameter_value.trim_end_matches(is_http_whitespace_char);
89            if parameter_value.is_empty() {
90                continue;
91            }
92            parameter_value.to_owned()
93        };
94
95        // 10.
96        if parameter_name == "charset" && parameter_value == "utf-8" {
97            is_utf8 = true;
98        } else if !parameter_name.is_empty()
99            && parameter_name.chars().all(is_http_token_code_point)
100            && parameter_value
101                .chars()
102                .all(is_http_quoted_string_token_code_point)
103        {
104            let name = ParamName(parameter_name.into());
105            let value = ParamValue(parameter_value.into());
106            if !params.iter().any(|(k, _)| k == &name) {
107                params.push((name, value));
108            }
109        }
110    }
111
112    Ok(Mime {
113        essence: Cow::Owned(format!("{}/{}", &basetype, &subtype)),
114        basetype: basetype.into(),
115        subtype: subtype.into(),
116        params,
117        is_utf8,
118    })
119}
120
/// Returns `true` when `c` is an
/// [HTTP token code point](https://mimesniff.spec.whatwg.org/#http-token-code-point):
/// an ASCII letter, an ASCII digit, or one of the punctuation characters
/// listed in `PUNCTUATION` below.
fn is_http_token_code_point(c: char) -> bool {
    const PUNCTUATION: &str = "!#$%&'*+-.^_`|~";
    c.is_ascii_alphanumeric() || PUNCTUATION.contains(c)
}
143
/// Returns `true` when `c` is an
/// [HTTP quoted-string token code point](https://mimesniff.spec.whatwg.org/#http-quoted-string-token-code-point):
/// a tab, a code point from U+0020 to U+007E, or one from U+0080 to U+00FF.
fn is_http_quoted_string_token_code_point(c: char) -> bool {
    let is_printable_ascii = (' '..='~').contains(&c);
    let is_upper_latin1 = ('\u{80}'..='\u{FF}').contains(&c);
    c == '\t' || is_printable_ascii || is_upper_latin1
}
148
/// Returns `true` when `c` is [HTTP whitespace](https://fetch.spec.whatwg.org/#http-whitespace):
/// line feed, carriage return, tab or space.
fn is_http_whitespace_char(c: char) -> bool {
    ['\n', '\r', '\t', ' '].contains(&c)
}
153
/// [code point sequence collection](https://infra.spec.whatwg.org/#collect-a-sequence-of-code-points)
///
/// Splits `input` in front of the first `delimiter`. The second half starts
/// with the delimiter itself, or is empty when `input` contains none.
fn collect_code_point_sequence_char(input: &str, delimiter: char) -> (&str, &str) {
    let cut = match input.find(delimiter) {
        Some(index) => index,
        None => input.len(),
    };
    input.split_at(cut)
}
158
/// [code point sequence collection](https://infra.spec.whatwg.org/#collect-a-sequence-of-code-points)
///
/// Like the single-delimiter variant, but stops in front of the first
/// character that matches any entry of `delimiter`.
fn collect_code_point_sequence_slice<'a>(input: &'a str, delimiter: &[char]) -> (&'a str, &'a str) {
    let end = input.find(delimiter).unwrap_or(input.len());
    (&input[..end], &input[end..])
}
163
/// [HTTP quoted string collection](https://fetch.spec.whatwg.org/#collect-an-http-quoted-string)
///
/// Assumes that the first char is '"'
///
/// Returns the unescaped contents of the quoted string together with the
/// input that follows the closing quote. A backslash makes the following
/// character literal; a backslash at the very end of the input is kept as a
/// backslash. When the closing quote is missing, everything up to the end of
/// the input belongs to the value and the returned remainder is empty.
fn collect_http_quoted_string(mut input: &str) -> (String, &str) {
    // 2.
    let mut value = String::new();
    // 4. Skip the opening quote.
    input = &input[1..];
    // 5.
    loop {
        // 1. Everything in front of the next quote or backslash is literal.
        let stop = input.find(&['"', '\\'][..]).unwrap_or(input.len());
        let (literal, rest) = input.split_at(stop);
        value.push_str(literal);

        let mut chars = rest.chars();
        match chars.next() {
            // 2. End of input without a closing quote: all of it has been
            // consumed, so the remainder must be empty.
            None => {
                input = rest;
                break;
            }
            // 5. A backslash escapes the code point that follows it.
            Some('\\') => match chars.next() {
                Some(escaped) => {
                    value.push(escaped);
                    input = chars.as_str();
                }
                None => {
                    // A trailing backslash stands for itself.
                    value.push('\\');
                    input = chars.as_str();
                    break;
                }
            },
            // 6. The closing quote.
            Some(_) => {
                input = chars.as_str();
                break;
            }
        }
    }
    (value, input)
}
206
207/// Implementation of [WHATWG MIME serialization algorithm](https://mimesniff.spec.whatwg.org/#serializing-a-mime-type)
208pub(crate) fn format(mime_type: &Mime, f: &mut fmt::Formatter<'_>) -> fmt::Result {
209    write!(f, "{}", &mime_type.essence)?;
210    if mime_type.is_utf8 {
211        write!(f, ";charset=utf-8")?;
212    }
213    for (name, value) in mime_type.params.iter() {
214        if value.0.chars().all(is_http_token_code_point) && !value.0.is_empty() {
215            write!(f, ";{}={}", name, value)?;
216        } else {
217            let value = value
218                .0
219                .chars()
220                .flat_map(|c| match c {
221                    '"' | '\\' => EscapeMimeValue::backslash(c),
222                    c => EscapeMimeValue::char(c),
223                })
224                .collect::<String>();
225            write!(f, ";{}=\"{}\"", name, value)?;
226        }
227    }
228    Ok(())
229}
230
/// Iterator over the characters needed to write one parameter-value
/// character: either the character alone, or a backslash followed by it.
struct EscapeMimeValue {
    state: EscapeMimeValueState,
}

impl EscapeMimeValue {
    /// Yields `\` and then `c`.
    fn backslash(c: char) -> Self {
        Self {
            state: EscapeMimeValueState::Backslash(c),
        }
    }

    /// Yields `c` unchanged.
    fn char(c: char) -> Self {
        Self {
            state: EscapeMimeValueState::Char(c),
        }
    }
}

/// What an [`EscapeMimeValue`] still has to produce.
#[derive(Clone, Debug)]
enum EscapeMimeValueState {
    /// Nothing is left.
    Done,
    /// Only the character itself is left.
    Char(char),
    /// The escaping backslash and then the character are left.
    Backslash(char),
}

impl Iterator for EscapeMimeValue {
    type Item = char;

    fn next(&mut self) -> Option<char> {
        // Work out what to emit now and which state comes afterwards.
        let (emitted, following) = match self.state {
            EscapeMimeValueState::Done => return None,
            EscapeMimeValueState::Char(c) => (c, EscapeMimeValueState::Done),
            EscapeMimeValueState::Backslash(c) => ('\\', EscapeMimeValueState::Char(c)),
        };
        self.state = following;
        Some(emitted)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = match self.state {
            EscapeMimeValueState::Done => 0,
            EscapeMimeValueState::Char(_) => 1,
            EscapeMimeValueState::Backslash(_) => 2,
        };
        (remaining, Some(remaining))
    }
}
281
/// Smoke test: a few `text/html` spellings parse to the same type and
/// subtype, and inputs without a usable subtype are rejected.
#[test]
fn test() {
    // (input, whether a `charset` parameter equal to "utf-8" is expected)
    let accepted = [
        ("text/html", false),
        // technically invalid mime, but allow anyway
        ("text/html;", false),
        ("text/html; charset=utf-8", true),
        ("text/html; charset=utf-8;", true),
    ];
    for &(input, expects_utf8) in accepted.iter() {
        let mime = parse(input).unwrap();
        assert_eq!(mime.basetype(), "text");
        assert_eq!(mime.subtype(), "html");
        if expects_utf8 {
            assert_eq!(mime.param("charset").unwrap(), "utf-8");
        }
    }

    for &rejected in ["text", "text/", "t/"].iter() {
        assert!(parse(rejected).is_err());
    }
    assert!(parse("t/h").is_ok());
}
308
/// Web Platform tests for MIME type parsing
/// From https://github.com/web-platform-tests/wpt/blob/master/mimesniff/mime-types/resources/mime-types.json
///
/// Each case parses an input and compares the re-serialized MIME type with the
/// expected output. The expected encoding labels are passed along but not
/// verified yet.
#[test]
fn whatwag_tests() {
    /// Parses `input` and checks that it serializes back to `expected`.
    fn assert_parse(input: &str, expected: &str) {
        let actual = parse(input).unwrap();
        assert_eq!(actual.to_string(), expected, "input: {:?}", input);
    }

    /// Checks that `input` is rejected.
    fn assert_fails(input: &str) {
        assert!(
            parse(input).is_err(),
            "expected {:?} to be rejected",
            input
        );
    }

    /// Same as `assert_parse`; `_encoding` is accepted but currently unused.
    fn assert_parse_and_encoding(
        input: &str,
        expected: &str,
        _encoding: impl Into<Option<&'static str>>,
    ) {
        //TODO: check encoding
        assert_parse(input, expected);
    }

    // Basics
    assert_parse_and_encoding("text/html;charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding("TEXT/HTML;CHARSET=GBK", "text/html;charset=GBK", "GBK");

    // Legacy comment syntax
    assert_parse_and_encoding("text/html;charset=gbk(", "text/html;charset=\"gbk(\"", None);
    assert_parse_and_encoding(
        "text/html;x=(;charset=gbk",
        "text/html;x=\"(\";charset=gbk",
        "GBK",
    );

    // Duplicate parameter
    assert_parse_and_encoding(
        "text/html;charset=gbk;charset=windows-1255",
        "text/html;charset=gbk",
        "GBK",
    );
    assert_parse_and_encoding(
        "text/html;charset=();charset=GBK",
        "text/html;charset=\"()\"",
        None,
    );

    // Spaces
    assert_parse_and_encoding("text/html;charset =gbk", "text/html", None);
    assert_parse_and_encoding("text/html ;charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding("text/html; charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding(
        "text/html;charset= gbk",
        "text/html;charset=\" gbk\"",
        "GBK",
    );
    assert_parse_and_encoding(
        "text/html;charset= \"gbk\"",
        "text/html;charset=\" \\\"gbk\\\"\"",
        None,
    );

    // 0x0B and 0x0C
    assert_parse_and_encoding("text/html;charset=\u{000B}gbk", "text/html", None);
    assert_parse_and_encoding("text/html;charset=\u{000C}gbk", "text/html", None);
    assert_parse_and_encoding("text/html;\u{000B}charset=gbk", "text/html", None);
    assert_parse_and_encoding("text/html;\u{000C}charset=gbk", "text/html", None);

    // Single quotes are a token, not a delimiter
    assert_parse_and_encoding("text/html;charset='gbk'", "text/html;charset='gbk'", None);
    assert_parse_and_encoding("text/html;charset='gbk", "text/html;charset='gbk", None);
    assert_parse_and_encoding("text/html;charset=gbk'", "text/html;charset=gbk'", None);
    assert_parse_and_encoding(
        "text/html;charset=';charset=GBK",
        "text/html;charset='",
        None,
    );

    // Invalid parameters
    assert_parse_and_encoding("text/html;test;charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding(
        "text/html;test=;charset=gbk",
        "text/html;charset=gbk",
        "GBK",
    );
    assert_parse_and_encoding("text/html;';charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding("text/html;\";charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding("text/html ; ; charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding("text/html;;;;charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding(
        "text/html;charset= \"\u{007F};charset=GBK",
        "text/html;charset=GBK",
        "GBK",
    );
    assert_parse_and_encoding(
        "text/html;charset=\"\u{007F};charset=foo\";charset=GBK",
        "text/html;charset=GBK",
        "GBK",
    );

    // Double quotes
    assert_parse_and_encoding("text/html;charset=\"gbk\"", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding("text/html;charset=\"gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding(
        "text/html;charset=gbk\"",
        "text/html;charset=\"gbk\\\"\"",
        None,
    );
    assert_parse_and_encoding(
        "text/html;charset=\" gbk\"",
        "text/html;charset=\" gbk\"",
        "GBK",
    );
    assert_parse_and_encoding(
        "text/html;charset=\"gbk \"",
        "text/html;charset=\"gbk \"",
        "GBK",
    );
    assert_parse_and_encoding(
        "text/html;charset=\"\\ gbk\"",
        "text/html;charset=\" gbk\"",
        "GBK",
    );
    assert_parse_and_encoding(
        "text/html;charset=\"\\g\\b\\k\"",
        "text/html;charset=gbk",
        "GBK",
    );
    assert_parse_and_encoding("text/html;charset=\"gbk\"x", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding(
        "text/html;charset=\"\";charset=GBK",
        "text/html;charset=\"\"",
        None,
    );
    assert_parse_and_encoding(
        "text/html;charset=\";charset=GBK",
        "text/html;charset=\";charset=GBK\"",
        None,
    );

    // Unexpected code points
    assert_parse_and_encoding(
        "text/html;charset={gbk}",
        "text/html;charset=\"{gbk}\"",
        None,
    );

    // 13 x 10 = 130 characters, i.e. longer than 127.
    let long_token = "0123456789".repeat(13);

    // Parameter name longer than 127
    let long_parameter = format!("text/html;{}=x;charset=gbk", long_token);
    assert_parse_and_encoding(&long_parameter, &long_parameter, "GBK");

    // type/subtype longer than 127
    let long_essence = format!("{0}/{0}", long_token);
    assert_parse(&long_essence, &long_essence);

    // Valid
    // Every HTTP token code point: type, subtype and parameter name come back
    // lowercased, the parameter value keeps its case.
    let token = "!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    let lowered = "!#$%&'*+-.^_`|~0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz";
    assert_parse(
        &format!("{0}/{0};{0}={0}", token),
        &format!("{0}/{0};{0}={1}", lowered, token),
    );

    // Every code point allowed in a quoted value: tab, U+0020..=U+007E (with
    // `"` and `\` escaped) and U+0080..=U+00FF. The input is already in
    // serialized form, so it must round-trip unchanged.
    let mut all_quoted = String::from(
        "x/x;x=\"\t !\\\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~",
    );
    all_quoted.extend((0x80u8..=0xFF).map(char::from));
    all_quoted.push('"');
    assert_parse(&all_quoted, &all_quoted);

    // End-of-file handling
    assert_parse("x/x;test", "x/x");
    assert_parse("x/x;test=\"\\", "x/x;test=\"\\\\\"");

    // Whitespace (not handled by generated-mime-types.json or above)
    assert_parse("x/x;x= ", "x/x");
    assert_parse("x/x;x=\t", "x/x");
    assert_parse("x/x\n\r\t ;x=x", "x/x;x=x");
    assert_parse("\n\r\t x/x;x=x\n\r\t ", "x/x;x=x");
    assert_parse("x/x;\n\r\t x=x\n\r\t ;x=y", "x/x;x=x");

    // Latin1
    assert_parse_and_encoding(
        "text/html;test=\u{00FF};charset=gbk",
        "text/html;test=\"\u{00FF}\";charset=gbk",
        "GBK",
    );

    // >Latin1
    assert_parse("x/x;test=\u{FFFD};x=x", "x/x;x=x");

    // Failure
    assert_fails("\u{000B}x/x");
    assert_fails("\u{000C}x/x");
    assert_fails("x/x\u{000B}");
    assert_fails("x/x\u{000C}");
    assert_fails("");
    assert_fails("\t");
    assert_fails("/");
    assert_fails("bogus");
    assert_fails("bogus/");
    assert_fails("bogus/ ");
    assert_fails("bogus/bogus/;");
    assert_fails("</>");
    assert_fails("(/)");
    assert_fails("ÿ/ÿ");
    assert_fails("text/html(;doesnot=matter");
    assert_fails("{/}");
    assert_fails("\u{0100}/\u{0100}");
    assert_fails("text /html");
    assert_fails("text/ html");
    assert_fails("\"text/html\"");
}
504    assert_fails("text /html");
505    assert_fails("text/ html");
506    assert_fails("\"text/html\"");
507}