1use std::borrow::Cow;
2use std::fmt;
3
4use super::{Mime, ParamName, ParamValue};
5
6pub(crate) fn parse(input: &str) -> crate::Result<Mime> {
9 let input = input.trim_matches(is_http_whitespace_char);
11
12 let (basetype, input) = collect_code_point_sequence_char(input, '/');
14
15 crate::ensure!(!basetype.is_empty(), "MIME type should not be empty");
17 crate::ensure!(
18 basetype.chars().all(is_http_token_code_point),
19 "MIME type should ony contain valid HTTP token code points"
20 );
21
22 crate::ensure!(!input.is_empty(), "MIME must contain a sub type");
24
25 let input = &input[1..];
27
28 let (subtype, input) = collect_code_point_sequence_char(input, ';');
30
31 let subtype = subtype.trim_end_matches(is_http_whitespace_char);
33
34 crate::ensure!(!subtype.is_empty(), "MIME sub type should not be empty");
36 crate::ensure!(
37 subtype.chars().all(is_http_token_code_point),
38 "MIME sub type should ony contain valid HTTP token code points"
39 );
40
41 let basetype = basetype.to_ascii_lowercase();
43 let subtype = subtype.to_ascii_lowercase();
44 let mut params = vec![];
45 let mut is_utf8 = false;
46
47 let mut input = input;
49 while !input.is_empty() {
50 input = &input[1..];
52
53 input = input.trim_start_matches(is_http_whitespace_char);
55
56 let (parameter_name, new_input) =
58 collect_code_point_sequence_slice(input, &[';', '='] as &[char]);
59 input = new_input;
60
61 let parameter_name = parameter_name.to_ascii_lowercase();
63
64 if input.is_empty() {
65 break;
67 } else {
68 if input.starts_with(';') {
70 continue;
71 } else {
72 input = &input[1..];
74 }
75 }
76
77 let parameter_value = if input.starts_with('"') {
78 let (parameter_value, new_input) = collect_http_quoted_string(input);
81 let (_, new_input) = collect_code_point_sequence_char(new_input, ';');
82 input = new_input;
83 parameter_value
84 } else {
85 let (parameter_value, new_input) = collect_code_point_sequence_char(input, ';');
87 input = new_input;
88 let parameter_value = parameter_value.trim_end_matches(is_http_whitespace_char);
89 if parameter_value.is_empty() {
90 continue;
91 }
92 parameter_value.to_owned()
93 };
94
95 if parameter_name == "charset" && parameter_value == "utf-8" {
97 is_utf8 = true;
98 } else if !parameter_name.is_empty()
99 && parameter_name.chars().all(is_http_token_code_point)
100 && parameter_value
101 .chars()
102 .all(is_http_quoted_string_token_code_point)
103 {
104 let name = ParamName(parameter_name.into());
105 let value = ParamValue(parameter_value.into());
106 if !params.iter().any(|(k, _)| k == &name) {
107 params.push((name, value));
108 }
109 }
110 }
111
112 Ok(Mime {
113 essence: Cow::Owned(format!("{}/{}", &basetype, &subtype)),
114 basetype: basetype.into(),
115 subtype: subtype.into(),
116 params,
117 is_utf8,
118 })
119}
120
/// Returns `true` when `c` is an HTTP token code point: an ASCII letter, an
/// ASCII digit, or one of the punctuation characters ``!#$%&'*+-.^_`|~``.
fn is_http_token_code_point(c: char) -> bool {
    const TOKEN_PUNCTUATION: &str = "!#$%&'*+-.^_`|~";
    c.is_ascii_alphanumeric() || TOKEN_PUNCTUATION.contains(c)
}
143
/// Returns `true` when `c` may appear inside an HTTP quoted string: a tab, a
/// code point from U+0020 (space) to U+007E (`~`), or one from U+0080 to
/// U+00FF, all ranges inclusive.
fn is_http_quoted_string_token_code_point(c: char) -> bool {
    let printable_ascii = (' '..='~').contains(&c);
    let upper_latin1 = ('\u{80}'..='\u{FF}').contains(&c);
    c == '\t' || printable_ascii || upper_latin1
}
148
/// Returns `true` when `c` is HTTP whitespace: line feed, carriage return,
/// tab, or space.
fn is_http_whitespace_char(c: char) -> bool {
    ['\n', '\r', '\t', ' '].contains(&c)
}
153
/// Splits `input` at the first occurrence of `delimiter`.
///
/// Returns the text before the delimiter and the remainder, which starts
/// with the delimiter itself. When the delimiter does not occur, the first
/// element is all of `input` and the remainder is empty.
fn collect_code_point_sequence_char(input: &str, delimiter: char) -> (&str, &str) {
    match input.find(delimiter) {
        Some(split_index) => input.split_at(split_index),
        None => input.split_at(input.len()),
    }
}
158
/// Splits `input` at the first code point that matches any of `delimiter`.
///
/// Returns the text before that code point and the remainder, which starts
/// with the matched code point. When none of the delimiters occur, the first
/// element is all of `input` and the remainder is empty.
fn collect_code_point_sequence_slice<'a>(input: &'a str, delimiter: &[char]) -> (&'a str, &'a str) {
    let split_index = input.find(delimiter).unwrap_or(input.len());
    (&input[..split_index], &input[split_index..])
}
163
/// Collects an HTTP quoted string from the start of `input`.
///
/// `input` is expected to begin with the opening `"`; its first code point is
/// skipped unconditionally. Inside the string a backslash escapes the code
/// point that follows it. A backslash that is the very last code point is
/// kept as a literal backslash.
///
/// Returns the unescaped value and the unconsumed remainder of `input`, which
/// starts just after the closing quote. When the input ends before a closing
/// quote is found, everything has been consumed and the remainder is empty.
fn collect_http_quoted_string(input: &str) -> (String, &str) {
    let mut value = String::new();

    // Skip the opening quote without assuming a one-byte first character.
    let mut opening = input.chars();
    opening.next();
    let mut rest = opening.as_str();

    loop {
        // Copy everything up to the next quote or backslash verbatim.
        let stop = rest.find(&['"', '\\'][..]).unwrap_or(rest.len());
        let (literal, tail) = rest.split_at(stop);
        value.push_str(literal);

        let mut chars = tail.chars();
        match chars.next() {
            // Unterminated string: the whole input was consumed. The
            // remainder must be empty so the caller does not re-parse text
            // that is already part of `value`.
            None => {
                rest = tail;
                break;
            }
            Some('\\') => match chars.next() {
                // Escaped code point: keep it literally and carry on.
                Some(escaped) => {
                    value.push(escaped);
                    rest = chars.as_str();
                }
                // Trailing backslash: keep the backslash itself.
                None => {
                    value.push('\\');
                    rest = chars.as_str();
                    break;
                }
            },
            // Closing quote.
            Some(_) => {
                rest = chars.as_str();
                break;
            }
        }
    }

    (value, rest)
}
206
207pub(crate) fn format(mime_type: &Mime, f: &mut fmt::Formatter<'_>) -> fmt::Result {
209 write!(f, "{}", &mime_type.essence)?;
210 if mime_type.is_utf8 {
211 write!(f, ";charset=utf-8")?;
212 }
213 for (name, value) in mime_type.params.iter() {
214 if value.0.chars().all(is_http_token_code_point) && !value.0.is_empty() {
215 write!(f, ";{}={}", name, value)?;
216 } else {
217 let value = value
218 .0
219 .chars()
220 .flat_map(|c| match c {
221 '"' | '\\' => EscapeMimeValue::backslash(c),
222 c => EscapeMimeValue::char(c),
223 })
224 .collect::<String>();
225 write!(f, ";{}=\"{}\"", name, value)?;
226 }
227 }
228 Ok(())
229}
230
/// What an [`EscapeMimeValue`] still has left to yield.
#[derive(Clone, Debug)]
enum EscapeMimeValueState {
    /// Nothing left; the iterator is exhausted.
    Done,
    /// Only the character itself is left.
    Char(char),
    /// A backslash is left, followed by the character.
    Backslash(char),
}

/// Iterator over the characters that represent a single character inside a
/// quoted parameter value: either the character alone, or a backslash
/// followed by the character.
struct EscapeMimeValue {
    state: EscapeMimeValueState,
}

impl EscapeMimeValue {
    /// Creates an iterator that yields `\` and then `c`.
    fn backslash(c: char) -> Self {
        Self {
            state: EscapeMimeValueState::Backslash(c),
        }
    }

    /// Creates an iterator that yields only `c`.
    fn char(c: char) -> Self {
        Self {
            state: EscapeMimeValueState::Char(c),
        }
    }
}

impl Iterator for EscapeMimeValue {
    type Item = char;

    fn next(&mut self) -> Option<char> {
        // Work out what to emit and which state follows, then commit both.
        let (emitted, following) = match self.state {
            EscapeMimeValueState::Backslash(c) => (Some('\\'), EscapeMimeValueState::Char(c)),
            EscapeMimeValueState::Char(c) => (Some(c), EscapeMimeValueState::Done),
            EscapeMimeValueState::Done => (None, EscapeMimeValueState::Done),
        };
        self.state = following;
        emitted
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // The number of remaining items is known exactly from the state.
        let remaining = match self.state {
            EscapeMimeValueState::Done => 0,
            EscapeMimeValueState::Char(_) => 1,
            EscapeMimeValueState::Backslash(_) => 2,
        };
        (remaining, Some(remaining))
    }
}
281
/// Basic checks for `parse`: type and subtype extraction, tolerance of a
/// trailing `;`, the `charset` parameter, and rejection of incomplete input.
#[test]
fn test() {
    // With or without a trailing ';' there are no parameters to look at.
    for input in &["text/html", "text/html;"] {
        let mime = parse(input).unwrap();
        assert_eq!(mime.basetype(), "text");
        assert_eq!(mime.subtype(), "html");
    }

    // A utf-8 charset must be reported, with or without a trailing ';'.
    for input in &["text/html; charset=utf-8", "text/html; charset=utf-8;"] {
        let mime = parse(input).unwrap();
        assert_eq!(mime.basetype(), "text");
        assert_eq!(mime.subtype(), "html");
        assert_eq!(mime.param("charset").unwrap(), "utf-8");
    }

    // A missing '/' or an empty subtype is rejected.
    for input in &["text", "text/", "t/"] {
        assert!(parse(input).is_err());
    }

    // One-character type and subtype are enough.
    assert!(parse("t/h").is_ok());
}
308
/// Parse-then-serialize round trips and rejection cases for `parse`.
///
/// NOTE(review): the cases appear to mirror the WHATWG MIME type test data
/// (the function name looks like a misspelling of "WHATWG"); confirm against
/// the upstream fixtures before adding or changing cases.
#[test]
fn whatwag_tests() {
    /// Parses `input` and checks that it serializes to `expected`.
    fn assert_parse(input: &str, expected: &str) {
        let actual = parse(input).unwrap();
        assert_eq!(actual.to_string(), expected);
    }

    /// Checks that `input` is rejected by the parser.
    fn assert_fails(input: &str) {
        assert!(parse(input).is_err());
    }

    /// Same as `assert_parse`; the encoding argument is accepted but not
    /// checked.
    fn assert_parse_and_encoding(
        input: &str,
        expected: &str,
        _encoding: impl Into<Option<&'static str>>,
    ) {
        assert_parse(input, expected);
    }

    // Basics: names are lowercased, values keep their case.
    assert_parse_and_encoding("text/html;charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding("TEXT/HTML;CHARSET=GBK", "text/html;charset=GBK", "GBK");

    // Values that are not plain tokens are serialized quoted.
    assert_parse_and_encoding("text/html;charset=gbk(", "text/html;charset=\"gbk(\"", None);
    assert_parse_and_encoding(
        "text/html;x=(;charset=gbk",
        "text/html;x=\"(\";charset=gbk",
        "GBK",
    );

    // Duplicate parameter names: the first occurrence wins.
    assert_parse_and_encoding(
        "text/html;charset=gbk;charset=windows-1255",
        "text/html;charset=gbk",
        "GBK",
    );
    assert_parse_and_encoding(
        "text/html;charset=();charset=GBK",
        "text/html;charset=\"()\"",
        None,
    );

    // Whitespace around names and values.
    assert_parse_and_encoding("text/html;charset =gbk", "text/html", None);
    assert_parse_and_encoding("text/html ;charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding("text/html; charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding(
        "text/html;charset= gbk",
        "text/html;charset=\" gbk\"",
        "GBK",
    );
    assert_parse_and_encoding(
        "text/html;charset= \"gbk\"",
        "text/html;charset=\" \\\"gbk\\\"\"",
        None,
    );

    // U+000B and U+000C are not HTTP whitespace, so the parameter is dropped.
    assert_parse_and_encoding("text/html;charset=\u{000B}gbk", "text/html", None);
    assert_parse_and_encoding("text/html;charset=\u{000C}gbk", "text/html", None);
    assert_parse_and_encoding("text/html;\u{000B}charset=gbk", "text/html", None);
    assert_parse_and_encoding("text/html;\u{000C}charset=gbk", "text/html", None);

    // Single quotes are ordinary token characters.
    assert_parse_and_encoding("text/html;charset='gbk'", "text/html;charset='gbk'", None);
    assert_parse_and_encoding("text/html;charset='gbk", "text/html;charset='gbk", None);
    assert_parse_and_encoding("text/html;charset=gbk'", "text/html;charset=gbk'", None);
    assert_parse_and_encoding(
        "text/html;charset=';charset=GBK",
        "text/html;charset='",
        None,
    );

    // Invalid or empty parameters are skipped.
    assert_parse_and_encoding("text/html;test;charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding(
        "text/html;test=;charset=gbk",
        "text/html;charset=gbk",
        "GBK",
    );
    assert_parse_and_encoding("text/html;';charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding("text/html;\";charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding("text/html ; ; charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding("text/html;;;;charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding(
        "text/html;charset= \"\u{007F};charset=GBK",
        "text/html;charset=GBK",
        "GBK",
    );
    assert_parse_and_encoding(
        "text/html;charset=\"\u{007F};charset=foo\";charset=GBK",
        "text/html;charset=GBK",
        "GBK",
    );

    // Double-quoted values.
    assert_parse_and_encoding("text/html;charset=\"gbk\"", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding("text/html;charset=\"gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding(
        "text/html;charset=gbk\"",
        "text/html;charset=\"gbk\\\"\"",
        None,
    );
    assert_parse_and_encoding(
        "text/html;charset=\" gbk\"",
        "text/html;charset=\" gbk\"",
        "GBK",
    );
    assert_parse_and_encoding(
        "text/html;charset=\"gbk \"",
        "text/html;charset=\"gbk \"",
        "GBK",
    );
    assert_parse_and_encoding(
        "text/html;charset=\"\\ gbk\"",
        "text/html;charset=\" gbk\"",
        "GBK",
    );
    assert_parse_and_encoding(
        "text/html;charset=\"\\g\\b\\k\"",
        "text/html;charset=gbk",
        "GBK",
    );
    assert_parse_and_encoding("text/html;charset=\"gbk\"x", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding(
        "text/html;charset=\"\";charset=GBK",
        "text/html;charset=\"\"",
        None,
    );
    assert_parse_and_encoding(
        "text/html;charset=\";charset=GBK",
        "text/html;charset=\";charset=GBK\"",
        None,
    );

    // Unexpected code points.
    assert_parse_and_encoding(
        "text/html;charset={gbk}",
        "text/html;charset=\"{gbk}\"",
        None,
    );

    // Long names round-trip unchanged, so input and expected share a binding.
    let long_parameter_name = "text/html;0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789=x;charset=gbk";
    assert_parse_and_encoding(long_parameter_name, long_parameter_name, "GBK");

    let long_essence = "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789/0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789";
    assert_parse(long_essence, long_essence);

    // Every HTTP token code point; type, subtype and name are lowercased.
    assert_parse("!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz/!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz;!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz=!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", "!#$%&'*+-.^_`|~0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz/!#$%&'*+-.^_`|~0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz;!#$%&'*+-.^_`|~0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz=!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");

    // Every HTTP quoted-string code point round-trips unchanged. One binding
    // serves as input and expected value. The trailing backslashes only
    // continue the literal on the next line; they add nothing to the string.
    let quoted_string_code_points = "x/x;x=\"\t !\\\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\
        \u{0080}\u{0081}\u{0082}\u{0083}\u{0084}\u{0085}\u{0086}\u{0087}\u{0088}\u{0089}\u{008A}\u{008B}\u{008C}\u{008D}\u{008E}\u{008F}\
        \u{0090}\u{0091}\u{0092}\u{0093}\u{0094}\u{0095}\u{0096}\u{0097}\u{0098}\u{0099}\u{009A}\u{009B}\u{009C}\u{009D}\u{009E}\u{009F}\
        \u{00A0}\u{00A1}\u{00A2}\u{00A3}\u{00A4}\u{00A5}\u{00A6}\u{00A7}\u{00A8}\u{00A9}\u{00AA}\u{00AB}\u{00AC}\u{00AD}\u{00AE}\u{00AF}\
        \u{00B0}\u{00B1}\u{00B2}\u{00B3}\u{00B4}\u{00B5}\u{00B6}\u{00B7}\u{00B8}\u{00B9}\u{00BA}\u{00BB}\u{00BC}\u{00BD}\u{00BE}\u{00BF}\
        \u{00C0}\u{00C1}\u{00C2}\u{00C3}\u{00C4}\u{00C5}\u{00C6}\u{00C7}\u{00C8}\u{00C9}\u{00CA}\u{00CB}\u{00CC}\u{00CD}\u{00CE}\u{00CF}\
        \u{00D0}\u{00D1}\u{00D2}\u{00D3}\u{00D4}\u{00D5}\u{00D6}\u{00D7}\u{00D8}\u{00D9}\u{00DA}\u{00DB}\u{00DC}\u{00DD}\u{00DE}\u{00DF}\
        \u{00E0}\u{00E1}\u{00E2}\u{00E3}\u{00E4}\u{00E5}\u{00E6}\u{00E7}\u{00E8}\u{00E9}\u{00EA}\u{00EB}\u{00EC}\u{00ED}\u{00EE}\u{00EF}\
        \u{00F0}\u{00F1}\u{00F2}\u{00F3}\u{00F4}\u{00F5}\u{00F6}\u{00F7}\u{00F8}\u{00F9}\u{00FA}\u{00FB}\u{00FC}\u{00FD}\u{00FE}\u{00FF}\"";
    assert_parse(quoted_string_code_points, quoted_string_code_points);

    // Parameters cut short by the end of input.
    assert_parse("x/x;test", "x/x");
    assert_parse("x/x;test=\"\\", "x/x;test=\"\\\\\"");

    // Whitespace-only values and whitespace around the essence.
    assert_parse("x/x;x= ", "x/x");
    assert_parse("x/x;x=\t", "x/x");
    assert_parse("x/x\n\r\t ;x=x", "x/x;x=x");
    assert_parse("\n\r\t x/x;x=x\n\r\t ", "x/x;x=x");
    assert_parse("x/x;\n\r\t x=x\n\r\t ;x=y", "x/x;x=x");

    // U+00FF is valid inside a quoted value.
    assert_parse_and_encoding(
        "text/html;test=\u{00FF};charset=gbk",
        "text/html;test=\"\u{00FF}\";charset=gbk",
        "GBK",
    );

    // Code points above U+00FF make the parameter invalid.
    assert_parse("x/x;test=\u{FFFD};x=x", "x/x;x=x");

    // Inputs that are not MIME types at all.
    assert_fails("\u{000B}x/x");
    assert_fails("\u{000C}x/x");
    assert_fails("x/x\u{000B}");
    assert_fails("x/x\u{000C}");
    assert_fails("");
    assert_fails("\t");
    assert_fails("/");
    assert_fails("bogus");
    assert_fails("bogus/");
    assert_fails("bogus/ ");
    assert_fails("bogus/bogus/;");
    assert_fails("</>");
    assert_fails("(/)");
    assert_fails("ÿ/ÿ");
    assert_fails("text/html(;doesnot=matter");
    assert_fails("{/}");
    assert_fails("\u{0100}/\u{0100}");
    assert_fails("text /html");
    assert_fails("text/ html");
    assert_fails("\"text/html\"");
}