encoding/
label.rs

1// This is a part of rust-encoding.
2// Copyright (c) 2013-2015, Kang Seonghoon.
3// See README.md and LICENSE.txt for details.
4
5//! An interface for retrieving an encoding (or a set of encodings) from a string/numeric label.
6
7use all;
8use types::EncodingRef;
9
10/// Returns an encoding from given label, defined in the WHATWG Encoding standard, if any.
11/// Implements "get an encoding" algorithm: http://encoding.spec.whatwg.org/#concept-encoding-get
12pub fn encoding_from_whatwg_label(label: &str) -> Option<EncodingRef> {
13    let label = label.trim_matches(&[' ', '\n', '\r', '\t', '\x0C'][..]);
14    let label: String =
15        label.chars().map(|c| match c { 'A'...'Z' => (c as u8 + 32) as char, _ => c }).collect();
16    match &label[..] {
17        "unicode-1-1-utf-8" |
18        "utf-8" |
19        "utf8" =>
20            Some(all::UTF_8 as EncodingRef),
21        "866" |
22        "cp866" |
23        "csibm866" |
24        "ibm866" =>
25            Some(all::IBM866 as EncodingRef),
26        "csisolatin2" |
27        "iso-8859-2" |
28        "iso-ir-101" |
29        "iso8859-2" |
30        "iso88592" |
31        "iso_8859-2" |
32        "iso_8859-2:1987" |
33        "l2" |
34        "latin2" =>
35            Some(all::ISO_8859_2 as EncodingRef),
36        "csisolatin3" |
37        "iso-8859-3" |
38        "iso-ir-109" |
39        "iso8859-3" |
40        "iso88593" |
41        "iso_8859-3" |
42        "iso_8859-3:1988" |
43        "l3" |
44        "latin3" =>
45            Some(all::ISO_8859_3 as EncodingRef),
46        "csisolatin4" |
47        "iso-8859-4" |
48        "iso-ir-110" |
49        "iso8859-4" |
50        "iso88594" |
51        "iso_8859-4" |
52        "iso_8859-4:1988" |
53        "l4" |
54        "latin4" =>
55            Some(all::ISO_8859_4 as EncodingRef),
56        "csisolatincyrillic" |
57        "cyrillic" |
58        "iso-8859-5" |
59        "iso-ir-144" |
60        "iso8859-5" |
61        "iso88595" |
62        "iso_8859-5" |
63        "iso_8859-5:1988" =>
64            Some(all::ISO_8859_5 as EncodingRef),
65        "arabic" |
66        "asmo-708" |
67        "csiso88596e" |
68        "csiso88596i" |
69        "csisolatinarabic" |
70        "ecma-114" |
71        "iso-8859-6" |
72        "iso-8859-6-e" |
73        "iso-8859-6-i" |
74        "iso-ir-127" |
75        "iso8859-6" |
76        "iso88596" |
77        "iso_8859-6" |
78        "iso_8859-6:1987" =>
79            Some(all::ISO_8859_6 as EncodingRef),
80        "csisolatingreek" |
81        "ecma-118" |
82        "elot_928" |
83        "greek" |
84        "greek8" |
85        "iso-8859-7" |
86        "iso-ir-126" |
87        "iso8859-7" |
88        "iso88597" |
89        "iso_8859-7" |
90        "iso_8859-7:1987" |
91        "sun_eu_greek" =>
92            Some(all::ISO_8859_7 as EncodingRef),
93        "csiso88598e" |
94        "csisolatinhebrew" |
95        "hebrew" |
96        "iso-8859-8" |
97        "iso-8859-8-e" |
98        "iso-ir-138" |
99        "iso8859-8" |
100        "iso88598" |
101        "iso_8859-8" |
102        "iso_8859-8:1988" |
103        "visual" =>
104            Some(all::ISO_8859_8 as EncodingRef),
105        "csiso88598i" |
106        "iso-8859-8-i" |
107        "logical" =>
108            Some(all::whatwg::ISO_8859_8_I as EncodingRef),
109        "csisolatin6" |
110        "iso-8859-10" |
111        "iso-ir-157" |
112        "iso8859-10" |
113        "iso885910" |
114        "l6" |
115        "latin6" =>
116            Some(all::ISO_8859_10 as EncodingRef),
117        "iso-8859-13" |
118        "iso8859-13" |
119        "iso885913" =>
120            Some(all::ISO_8859_13 as EncodingRef),
121        "iso-8859-14" |
122        "iso8859-14" |
123        "iso885914" =>
124            Some(all::ISO_8859_14 as EncodingRef),
125        "csisolatin9" |
126        "iso-8859-15" |
127        "iso8859-15" |
128        "iso885915" |
129        "iso_8859-15" |
130        "l9" =>
131            Some(all::ISO_8859_15 as EncodingRef),
132        "iso-8859-16" =>
133            Some(all::ISO_8859_16 as EncodingRef),
134        "cskoi8r" |
135        "koi" |
136        "koi8" |
137        "koi8-r" |
138        "koi8_r" =>
139            Some(all::KOI8_R as EncodingRef),
140        "koi8-u" =>
141            Some(all::KOI8_U as EncodingRef),
142        "csmacintosh" |
143        "mac" |
144        "macintosh" |
145        "x-mac-roman" =>
146            Some(all::MAC_ROMAN as EncodingRef),
147        "dos-874" |
148        "iso-8859-11" |
149        "iso8859-11" |
150        "iso885911" |
151        "tis-620" |
152        "windows-874" =>
153            Some(all::WINDOWS_874 as EncodingRef),
154        "cp1250" |
155        "windows-1250" |
156        "x-cp1250" =>
157            Some(all::WINDOWS_1250 as EncodingRef),
158        "cp1251" |
159        "windows-1251" |
160        "x-cp1251" =>
161            Some(all::WINDOWS_1251 as EncodingRef),
162        "ansi_x3.4-1968" |
163        "ascii" |
164        "cp1252" |
165        "cp819" |
166        "csisolatin1" |
167        "ibm819" |
168        "iso-8859-1" |
169        "iso-ir-100" |
170        "iso8859-1" |
171        "iso88591" |
172        "iso_8859-1" |
173        "iso_8859-1:1987" |
174        "l1" |
175        "latin1" |
176        "us-ascii" |
177        "windows-1252" |
178        "x-cp1252" =>
179            Some(all::WINDOWS_1252 as EncodingRef),
180        "cp1253" |
181        "windows-1253" |
182        "x-cp1253" =>
183            Some(all::WINDOWS_1253 as EncodingRef),
184        "cp1254" |
185        "csisolatin5" |
186        "iso-8859-9" |
187        "iso-ir-148" |
188        "iso8859-9" |
189        "iso88599" |
190        "iso_8859-9" |
191        "iso_8859-9:1989" |
192        "l5" |
193        "latin5" |
194        "windows-1254" |
195        "x-cp1254" =>
196            Some(all::WINDOWS_1254 as EncodingRef),
197        "cp1255" |
198        "windows-1255" |
199        "x-cp1255" =>
200            Some(all::WINDOWS_1255 as EncodingRef),
201        "cp1256" |
202        "windows-1256" |
203        "x-cp1256" =>
204            Some(all::WINDOWS_1256 as EncodingRef),
205        "cp1257" |
206        "windows-1257" |
207        "x-cp1257" =>
208            Some(all::WINDOWS_1257 as EncodingRef),
209        "cp1258" |
210        "windows-1258" |
211        "x-cp1258" =>
212            Some(all::WINDOWS_1258 as EncodingRef),
213        "x-mac-cyrillic" |
214        "x-mac-ukrainian" =>
215            Some(all::MAC_CYRILLIC as EncodingRef),
216        "chinese" |
217        "csgb2312" |
218        "csiso58gb231280" |
219        "gb2312" |
220        "gb_2312" |
221        "gb_2312-80" |
222        "gbk" |
223        "iso-ir-58" |
224        "x-gbk" =>
225            Some(all::GBK as EncodingRef),
226        "gb18030" =>
227            Some(all::GB18030 as EncodingRef),
228        "big5" |
229        "big5-hkscs" |
230        "cn-big5" |
231        "csbig5" |
232        "x-x-big5" =>
233            Some(all::BIG5_2003 as EncodingRef),
234        "cseucpkdfmtjapanese" |
235        "euc-jp" |
236        "x-euc-jp" =>
237            Some(all::EUC_JP as EncodingRef),
238        "csiso2022jp" |
239        "iso-2022-jp" =>
240            Some(all::ISO_2022_JP as EncodingRef),
241        "csshiftjis" |
242        "ms_kanji" |
243        "shift-jis" |
244        "shift_jis" |
245        "sjis" |
246        "windows-31j" |
247        "x-sjis" =>
248            Some(all::WINDOWS_31J as EncodingRef),
249        "cseuckr" |
250        "csksc56011987" |
251        "euc-kr" |
252        "iso-ir-149" |
253        "korean" |
254        "ks_c_5601-1987" |
255        "ks_c_5601-1989" |
256        "ksc5601" |
257        "ksc_5601" |
258        "windows-949" =>
259            Some(all::WINDOWS_949 as EncodingRef),
260        "csiso2022kr" |
261        "hz-gb-2312" |
262        "iso-2022-kr" |
263        "iso-2022-cn" |
264        "iso-2022-cn-ext" =>
265            Some(all::whatwg::REPLACEMENT as EncodingRef),
266        "utf-16be" =>
267            Some(all::UTF_16BE as EncodingRef),
268        "utf-16" |
269        "utf-16le" =>
270            Some(all::UTF_16LE as EncodingRef),
271        "x-user-defined" =>
272            Some(all::whatwg::X_USER_DEFINED as EncodingRef),
273        _ => None
274    }
275}
276
277/// Returns an encoding from Windows code page number.
278/// http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx
279/// Sometimes it can return a *superset* of the requested encoding, e.g. for several CJK encodings.
280pub fn encoding_from_windows_code_page(cp: usize) -> Option<EncodingRef> {
281    match cp {
282        65001 => Some(all::UTF_8 as EncodingRef),
283        866 => Some(all::IBM866 as EncodingRef),
284        28591 => Some(all::ISO_8859_1 as EncodingRef),
285        28592 => Some(all::ISO_8859_2 as EncodingRef),
286        28593 => Some(all::ISO_8859_3 as EncodingRef),
287        28594 => Some(all::ISO_8859_4 as EncodingRef),
288        28595 => Some(all::ISO_8859_5 as EncodingRef),
289        28596 => Some(all::ISO_8859_6 as EncodingRef),
290        28597 => Some(all::ISO_8859_7 as EncodingRef),
291        28598 => Some(all::ISO_8859_8 as EncodingRef),
292        38598 => Some(all::whatwg::ISO_8859_8_I as EncodingRef),
293        28603 => Some(all::ISO_8859_13 as EncodingRef),
294        28605 => Some(all::ISO_8859_15 as EncodingRef),
295        20866 => Some(all::KOI8_R as EncodingRef),
296        21866 => Some(all::KOI8_U as EncodingRef),
297        10000 => Some(all::MAC_ROMAN as EncodingRef),
298        874 => Some(all::WINDOWS_874 as EncodingRef),
299        1250 => Some(all::WINDOWS_1250 as EncodingRef),
300        1251 => Some(all::WINDOWS_1251 as EncodingRef),
301        1252 => Some(all::WINDOWS_1252 as EncodingRef),
302        1253 => Some(all::WINDOWS_1253 as EncodingRef),
303        1254 => Some(all::WINDOWS_1254 as EncodingRef),
304        1255 => Some(all::WINDOWS_1255 as EncodingRef),
305        1256 => Some(all::WINDOWS_1256 as EncodingRef),
306        1257 => Some(all::WINDOWS_1257 as EncodingRef),
307        1258 => Some(all::WINDOWS_1258 as EncodingRef),
308        1259 => Some(all::MAC_CYRILLIC as EncodingRef),
309        936 | 54936 => Some(all::GB18030 as EncodingRef), // XXX technically wrong
310        52936 => Some(all::HZ as EncodingRef),
311        950 => Some(all::BIG5_2003 as EncodingRef),
312        20932 => Some(all::EUC_JP as EncodingRef),
313        50220 => Some(all::ISO_2022_JP as EncodingRef),
314        932 => Some(all::WINDOWS_31J as EncodingRef),
315        949 => Some(all::WINDOWS_949 as EncodingRef),
316        1201 => Some(all::UTF_16BE as EncodingRef),
317        1200 => Some(all::UTF_16LE as EncodingRef),
318        _ => None
319    }
320}
321
#[cfg(test)]
mod tests {
    extern crate test;
    use all;
    use super::encoding_from_whatwg_label;

    /// Exercises label normalization (trimming, case folding) and the name round trip.
    #[test]
    fn test_encoding_from_whatwg_label() {
        // True when the label resolves to some encoding.
        let resolves = |label: &str| encoding_from_whatwg_label(label).is_some();

        assert!(resolves("utf-8"));
        assert!(resolves("UTF-8"));
        assert!(resolves("\t\n\x0C\r utf-8\t\n\x0C\r "));
        assert!(!resolves("\u{A0}utf-8"),
                "Non-ASCII whitespace should not be trimmed");
        assert!(resolves("greek"));
        assert!(!resolves("gree\u{212A}"),
                "Case-insensitive matching should be ASCII only. Kelvin sign does not match k.");

        // Every encoding that reports a WHATWG name must be reachable through that very name;
        // the "replacement" name is skipped.
        for candidate in all::encodings() {
            let name = match candidate.whatwg_name() {
                Some(name) if name != "replacement" => name,
                _ => continue,
            };
            let round_trip = encoding_from_whatwg_label(name).and_then(|found| found.whatwg_name());
            assert_eq!(round_trip, Some(name));
        }
    }

    /// Measures the lookup cost for a label that matches no arm.
    #[bench]
    fn bench_encoding_from_whatwg_label(bencher: &mut test::Bencher) {
        bencher.iter(|| {
            let outcome = encoding_from_whatwg_label("iso-8859-bazinga");
            test::black_box(outcome)
        })
    }
}
356