encoding/
label.rs
1use all;
8use types::EncodingRef;
9
10pub fn encoding_from_whatwg_label(label: &str) -> Option<EncodingRef> {
13 let label = label.trim_matches(&[' ', '\n', '\r', '\t', '\x0C'][..]);
14 let label: String =
15 label.chars().map(|c| match c { 'A'...'Z' => (c as u8 + 32) as char, _ => c }).collect();
16 match &label[..] {
17 "unicode-1-1-utf-8" |
18 "utf-8" |
19 "utf8" =>
20 Some(all::UTF_8 as EncodingRef),
21 "866" |
22 "cp866" |
23 "csibm866" |
24 "ibm866" =>
25 Some(all::IBM866 as EncodingRef),
26 "csisolatin2" |
27 "iso-8859-2" |
28 "iso-ir-101" |
29 "iso8859-2" |
30 "iso88592" |
31 "iso_8859-2" |
32 "iso_8859-2:1987" |
33 "l2" |
34 "latin2" =>
35 Some(all::ISO_8859_2 as EncodingRef),
36 "csisolatin3" |
37 "iso-8859-3" |
38 "iso-ir-109" |
39 "iso8859-3" |
40 "iso88593" |
41 "iso_8859-3" |
42 "iso_8859-3:1988" |
43 "l3" |
44 "latin3" =>
45 Some(all::ISO_8859_3 as EncodingRef),
46 "csisolatin4" |
47 "iso-8859-4" |
48 "iso-ir-110" |
49 "iso8859-4" |
50 "iso88594" |
51 "iso_8859-4" |
52 "iso_8859-4:1988" |
53 "l4" |
54 "latin4" =>
55 Some(all::ISO_8859_4 as EncodingRef),
56 "csisolatincyrillic" |
57 "cyrillic" |
58 "iso-8859-5" |
59 "iso-ir-144" |
60 "iso8859-5" |
61 "iso88595" |
62 "iso_8859-5" |
63 "iso_8859-5:1988" =>
64 Some(all::ISO_8859_5 as EncodingRef),
65 "arabic" |
66 "asmo-708" |
67 "csiso88596e" |
68 "csiso88596i" |
69 "csisolatinarabic" |
70 "ecma-114" |
71 "iso-8859-6" |
72 "iso-8859-6-e" |
73 "iso-8859-6-i" |
74 "iso-ir-127" |
75 "iso8859-6" |
76 "iso88596" |
77 "iso_8859-6" |
78 "iso_8859-6:1987" =>
79 Some(all::ISO_8859_6 as EncodingRef),
80 "csisolatingreek" |
81 "ecma-118" |
82 "elot_928" |
83 "greek" |
84 "greek8" |
85 "iso-8859-7" |
86 "iso-ir-126" |
87 "iso8859-7" |
88 "iso88597" |
89 "iso_8859-7" |
90 "iso_8859-7:1987" |
91 "sun_eu_greek" =>
92 Some(all::ISO_8859_7 as EncodingRef),
93 "csiso88598e" |
94 "csisolatinhebrew" |
95 "hebrew" |
96 "iso-8859-8" |
97 "iso-8859-8-e" |
98 "iso-ir-138" |
99 "iso8859-8" |
100 "iso88598" |
101 "iso_8859-8" |
102 "iso_8859-8:1988" |
103 "visual" =>
104 Some(all::ISO_8859_8 as EncodingRef),
105 "csiso88598i" |
106 "iso-8859-8-i" |
107 "logical" =>
108 Some(all::whatwg::ISO_8859_8_I as EncodingRef),
109 "csisolatin6" |
110 "iso-8859-10" |
111 "iso-ir-157" |
112 "iso8859-10" |
113 "iso885910" |
114 "l6" |
115 "latin6" =>
116 Some(all::ISO_8859_10 as EncodingRef),
117 "iso-8859-13" |
118 "iso8859-13" |
119 "iso885913" =>
120 Some(all::ISO_8859_13 as EncodingRef),
121 "iso-8859-14" |
122 "iso8859-14" |
123 "iso885914" =>
124 Some(all::ISO_8859_14 as EncodingRef),
125 "csisolatin9" |
126 "iso-8859-15" |
127 "iso8859-15" |
128 "iso885915" |
129 "iso_8859-15" |
130 "l9" =>
131 Some(all::ISO_8859_15 as EncodingRef),
132 "iso-8859-16" =>
133 Some(all::ISO_8859_16 as EncodingRef),
134 "cskoi8r" |
135 "koi" |
136 "koi8" |
137 "koi8-r" |
138 "koi8_r" =>
139 Some(all::KOI8_R as EncodingRef),
140 "koi8-u" =>
141 Some(all::KOI8_U as EncodingRef),
142 "csmacintosh" |
143 "mac" |
144 "macintosh" |
145 "x-mac-roman" =>
146 Some(all::MAC_ROMAN as EncodingRef),
147 "dos-874" |
148 "iso-8859-11" |
149 "iso8859-11" |
150 "iso885911" |
151 "tis-620" |
152 "windows-874" =>
153 Some(all::WINDOWS_874 as EncodingRef),
154 "cp1250" |
155 "windows-1250" |
156 "x-cp1250" =>
157 Some(all::WINDOWS_1250 as EncodingRef),
158 "cp1251" |
159 "windows-1251" |
160 "x-cp1251" =>
161 Some(all::WINDOWS_1251 as EncodingRef),
162 "ansi_x3.4-1968" |
163 "ascii" |
164 "cp1252" |
165 "cp819" |
166 "csisolatin1" |
167 "ibm819" |
168 "iso-8859-1" |
169 "iso-ir-100" |
170 "iso8859-1" |
171 "iso88591" |
172 "iso_8859-1" |
173 "iso_8859-1:1987" |
174 "l1" |
175 "latin1" |
176 "us-ascii" |
177 "windows-1252" |
178 "x-cp1252" =>
179 Some(all::WINDOWS_1252 as EncodingRef),
180 "cp1253" |
181 "windows-1253" |
182 "x-cp1253" =>
183 Some(all::WINDOWS_1253 as EncodingRef),
184 "cp1254" |
185 "csisolatin5" |
186 "iso-8859-9" |
187 "iso-ir-148" |
188 "iso8859-9" |
189 "iso88599" |
190 "iso_8859-9" |
191 "iso_8859-9:1989" |
192 "l5" |
193 "latin5" |
194 "windows-1254" |
195 "x-cp1254" =>
196 Some(all::WINDOWS_1254 as EncodingRef),
197 "cp1255" |
198 "windows-1255" |
199 "x-cp1255" =>
200 Some(all::WINDOWS_1255 as EncodingRef),
201 "cp1256" |
202 "windows-1256" |
203 "x-cp1256" =>
204 Some(all::WINDOWS_1256 as EncodingRef),
205 "cp1257" |
206 "windows-1257" |
207 "x-cp1257" =>
208 Some(all::WINDOWS_1257 as EncodingRef),
209 "cp1258" |
210 "windows-1258" |
211 "x-cp1258" =>
212 Some(all::WINDOWS_1258 as EncodingRef),
213 "x-mac-cyrillic" |
214 "x-mac-ukrainian" =>
215 Some(all::MAC_CYRILLIC as EncodingRef),
216 "chinese" |
217 "csgb2312" |
218 "csiso58gb231280" |
219 "gb2312" |
220 "gb_2312" |
221 "gb_2312-80" |
222 "gbk" |
223 "iso-ir-58" |
224 "x-gbk" =>
225 Some(all::GBK as EncodingRef),
226 "gb18030" =>
227 Some(all::GB18030 as EncodingRef),
228 "big5" |
229 "big5-hkscs" |
230 "cn-big5" |
231 "csbig5" |
232 "x-x-big5" =>
233 Some(all::BIG5_2003 as EncodingRef),
234 "cseucpkdfmtjapanese" |
235 "euc-jp" |
236 "x-euc-jp" =>
237 Some(all::EUC_JP as EncodingRef),
238 "csiso2022jp" |
239 "iso-2022-jp" =>
240 Some(all::ISO_2022_JP as EncodingRef),
241 "csshiftjis" |
242 "ms_kanji" |
243 "shift-jis" |
244 "shift_jis" |
245 "sjis" |
246 "windows-31j" |
247 "x-sjis" =>
248 Some(all::WINDOWS_31J as EncodingRef),
249 "cseuckr" |
250 "csksc56011987" |
251 "euc-kr" |
252 "iso-ir-149" |
253 "korean" |
254 "ks_c_5601-1987" |
255 "ks_c_5601-1989" |
256 "ksc5601" |
257 "ksc_5601" |
258 "windows-949" =>
259 Some(all::WINDOWS_949 as EncodingRef),
260 "csiso2022kr" |
261 "hz-gb-2312" |
262 "iso-2022-kr" |
263 "iso-2022-cn" |
264 "iso-2022-cn-ext" =>
265 Some(all::whatwg::REPLACEMENT as EncodingRef),
266 "utf-16be" =>
267 Some(all::UTF_16BE as EncodingRef),
268 "utf-16" |
269 "utf-16le" =>
270 Some(all::UTF_16LE as EncodingRef),
271 "x-user-defined" =>
272 Some(all::whatwg::X_USER_DEFINED as EncodingRef),
273 _ => None
274 }
275}
276
277pub fn encoding_from_windows_code_page(cp: usize) -> Option<EncodingRef> {
281 match cp {
282 65001 => Some(all::UTF_8 as EncodingRef),
283 866 => Some(all::IBM866 as EncodingRef),
284 28591 => Some(all::ISO_8859_1 as EncodingRef),
285 28592 => Some(all::ISO_8859_2 as EncodingRef),
286 28593 => Some(all::ISO_8859_3 as EncodingRef),
287 28594 => Some(all::ISO_8859_4 as EncodingRef),
288 28595 => Some(all::ISO_8859_5 as EncodingRef),
289 28596 => Some(all::ISO_8859_6 as EncodingRef),
290 28597 => Some(all::ISO_8859_7 as EncodingRef),
291 28598 => Some(all::ISO_8859_8 as EncodingRef),
292 38598 => Some(all::whatwg::ISO_8859_8_I as EncodingRef),
293 28603 => Some(all::ISO_8859_13 as EncodingRef),
294 28605 => Some(all::ISO_8859_15 as EncodingRef),
295 20866 => Some(all::KOI8_R as EncodingRef),
296 21866 => Some(all::KOI8_U as EncodingRef),
297 10000 => Some(all::MAC_ROMAN as EncodingRef),
298 874 => Some(all::WINDOWS_874 as EncodingRef),
299 1250 => Some(all::WINDOWS_1250 as EncodingRef),
300 1251 => Some(all::WINDOWS_1251 as EncodingRef),
301 1252 => Some(all::WINDOWS_1252 as EncodingRef),
302 1253 => Some(all::WINDOWS_1253 as EncodingRef),
303 1254 => Some(all::WINDOWS_1254 as EncodingRef),
304 1255 => Some(all::WINDOWS_1255 as EncodingRef),
305 1256 => Some(all::WINDOWS_1256 as EncodingRef),
306 1257 => Some(all::WINDOWS_1257 as EncodingRef),
307 1258 => Some(all::WINDOWS_1258 as EncodingRef),
308 1259 => Some(all::MAC_CYRILLIC as EncodingRef),
309 936 | 54936 => Some(all::GB18030 as EncodingRef), 52936 => Some(all::HZ as EncodingRef),
311 950 => Some(all::BIG5_2003 as EncodingRef),
312 20932 => Some(all::EUC_JP as EncodingRef),
313 50220 => Some(all::ISO_2022_JP as EncodingRef),
314 932 => Some(all::WINDOWS_31J as EncodingRef),
315 949 => Some(all::WINDOWS_949 as EncodingRef),
316 1201 => Some(all::UTF_16BE as EncodingRef),
317 1200 => Some(all::UTF_16LE as EncodingRef),
318 _ => None
319 }
320}
321
#[cfg(test)]
mod tests {
    extern crate test;
    use all;
    use super::encoding_from_whatwg_label;

    /// Exercises label normalization (ASCII-only trimming and case folding) and checks that
    /// each encoding's own WHATWG name resolves to an encoding reporting that same name.
    #[test]
    fn test_encoding_from_whatwg_label() {
        // Exact, upper-case and whitespace-padded spellings are all accepted.
        assert!(encoding_from_whatwg_label("utf-8").is_some());
        assert!(encoding_from_whatwg_label("UTF-8").is_some());
        assert!(encoding_from_whatwg_label("\t\n\x0C\r utf-8\t\n\x0C\r ").is_some());
        assert!(encoding_from_whatwg_label("\u{A0}utf-8").is_none(),
                "Non-ASCII whitespace should not be trimmed");
        assert!(encoding_from_whatwg_label("greek").is_some());
        assert!(encoding_from_whatwg_label("gree\u{212A}").is_none(),
                "Case-insensitive matching should be ASCII only. Kelvin sign does not match k.");

        for encoding in all::encodings() {
            // Encodings without a WHATWG name are skipped, and so is "replacement",
            // which the lookup table does not list as a label of its own.
            let whatwg_name = match encoding.whatwg_name() {
                Some(name) if name != "replacement" => name,
                _ => continue,
            };
            let resolved = encoding_from_whatwg_label(whatwg_name).and_then(|e| e.whatwg_name());
            assert_eq!(resolved, Some(whatwg_name));
        }
    }

    /// Benchmarks a lookup of a label that is absent from the table.
    #[bench]
    fn bench_encoding_from_whatwg_label(bencher: &mut test::Bencher) {
        bencher.iter(|| {
            let outcome = encoding_from_whatwg_label("iso-8859-bazinga");
            test::black_box(outcome)
        })
    }
}
356