encoding/lib.rs
1// This is a part of rust-encoding.
2// Copyright (c) 2013-2015, Kang Seonghoon.
3// See README.md and LICENSE.txt for details.
4
5//! # Encoding 0.2.33
6//!
7//! Character encoding support for Rust. (also known as `rust-encoding`)
8//! It is based on [WHATWG Encoding Standard](http://encoding.spec.whatwg.org/),
9//! and also provides an advanced interface for error detection and recovery.
10//!
11//! ## Usage
12//!
13//! Put this in your `Cargo.toml`:
14//!
15//! ```toml
16//! [dependencies]
17//! encoding = "0.2"
18//! ```
19//!
20//! Then put this in your crate root:
21//!
22//! ```rust
23//! extern crate encoding;
24//! ```
25//!
26//! ## Overview
27//!
28//! To encode a string:
29//!
30//! ~~~~ {.rust}
31//! use encoding::{Encoding, EncoderTrap};
32//! use encoding::all::ISO_8859_1;
33//!
34//! assert_eq!(ISO_8859_1.encode("caf\u{e9}", EncoderTrap::Strict),
35//! Ok(vec![99,97,102,233]));
36//! ~~~~
37//!
38//! To encode a string with unrepresentable characters:
39//!
40//! ~~~~ {.rust}
41//! use encoding::{Encoding, EncoderTrap};
42//! use encoding::all::ISO_8859_2;
43//!
44//! assert!(ISO_8859_2.encode("Acme\u{a9}", EncoderTrap::Strict).is_err());
45//! assert_eq!(ISO_8859_2.encode("Acme\u{a9}", EncoderTrap::Replace),
46//! Ok(vec![65,99,109,101,63]));
47//! assert_eq!(ISO_8859_2.encode("Acme\u{a9}", EncoderTrap::Ignore),
48//! Ok(vec![65,99,109,101]));
49//! assert_eq!(ISO_8859_2.encode("Acme\u{a9}", EncoderTrap::NcrEscape),
50//! Ok(vec![65,99,109,101,38,35,49,54,57,59]));
51//! ~~~~
52//!
53//! To decode a byte sequence:
54//!
55//! ~~~~ {.rust}
56//! use encoding::{Encoding, DecoderTrap};
57//! use encoding::all::ISO_8859_1;
58//!
59//! assert_eq!(ISO_8859_1.decode(&[99,97,102,233], DecoderTrap::Strict),
60//! Ok("caf\u{e9}".to_string()));
61//! ~~~~
62//!
63//! To decode a byte sequence with invalid sequences:
64//!
65//! ~~~~ {.rust}
66//! use encoding::{Encoding, DecoderTrap};
67//! use encoding::all::ISO_8859_6;
68//!
69//! assert!(ISO_8859_6.decode(&[65,99,109,101,169], DecoderTrap::Strict).is_err());
70//! assert_eq!(ISO_8859_6.decode(&[65,99,109,101,169], DecoderTrap::Replace),
71//! Ok("Acme\u{fffd}".to_string()));
72//! assert_eq!(ISO_8859_6.decode(&[65,99,109,101,169], DecoderTrap::Ignore),
73//! Ok("Acme".to_string()));
74//! ~~~~
75//!
76//! To encode or decode the input into the already allocated buffer:
77//!
78//! ~~~~ {.rust}
79//! use encoding::{Encoding, EncoderTrap, DecoderTrap};
80//! use encoding::all::{ISO_8859_2, ISO_8859_6};
81//!
82//! let mut bytes = Vec::new();
83//! let mut chars = String::new();
84//!
85//! assert!(ISO_8859_2.encode_to("Acme\u{a9}", EncoderTrap::Ignore, &mut bytes).is_ok());
86//! assert!(ISO_8859_6.decode_to(&[65,99,109,101,169], DecoderTrap::Replace, &mut chars).is_ok());
87//!
88//! assert_eq!(bytes, [65,99,109,101]);
89//! assert_eq!(chars, "Acme\u{fffd}");
90//! ~~~~
91//!
92//! A practical example of custom encoder traps:
93//!
94//! ~~~~ {.rust}
95//! use encoding::{Encoding, ByteWriter, EncoderTrap, DecoderTrap};
96//! use encoding::types::RawEncoder;
97//! use encoding::all::ASCII;
98//!
99//! // hexadecimal numeric character reference replacement
100//! fn hex_ncr_escape(_encoder: &mut RawEncoder, input: &str, output: &mut ByteWriter) -> bool {
101//! let escapes: Vec<String> =
102//! input.chars().map(|ch| format!("&#x{:x};", ch as isize)).collect();
103//! let escapes = escapes.concat();
104//! output.write_bytes(escapes.as_bytes());
105//! true
106//! }
107//! static HEX_NCR_ESCAPE: EncoderTrap = EncoderTrap::Call(hex_ncr_escape);
108//!
109//! let orig = "Hello, 世界!".to_string();
110//! let encoded = ASCII.encode(&orig, HEX_NCR_ESCAPE).unwrap();
111//! assert_eq!(ASCII.decode(&encoded, DecoderTrap::Strict),
112//! Ok("Hello, 世界!".to_string()));
113//! ~~~~
114//!
115//! Getting the encoding from the string label, as specified in WHATWG Encoding standard:
116//!
117//! ~~~~ {.rust}
118//! use encoding::{Encoding, DecoderTrap};
119//! use encoding::label::encoding_from_whatwg_label;
120//! use encoding::all::WINDOWS_949;
121//!
122//! let euckr = encoding_from_whatwg_label("euc-kr").unwrap();
123//! assert_eq!(euckr.name(), "windows-949");
124//! assert_eq!(euckr.whatwg_name(), Some("euc-kr")); // for the sake of compatibility
125//! let broken = &[0xbf, 0xec, 0xbf, 0xcd, 0xff, 0xbe, 0xd3];
126//! assert_eq!(euckr.decode(broken, DecoderTrap::Replace),
127//! Ok("\u{c6b0}\u{c640}\u{fffd}\u{c559}".to_string()));
128//!
129//! // corresponding Encoding native API:
130//! assert_eq!(WINDOWS_949.decode(broken, DecoderTrap::Replace),
131//! Ok("\u{c6b0}\u{c640}\u{fffd}\u{c559}".to_string()));
132//! ~~~~
133//!
134//! ## Types and Stuffs
135//!
136//! There are three main entry points to Encoding.
137//!
138//! **`Encoding`** is a single character encoding.
139//! It contains `encode` and `decode` methods for converting `String` to `Vec<u8>` and vice versa.
140//! For the error handling, they receive **traps** (`EncoderTrap` and `DecoderTrap` respectively)
141//! which replace any error with some string (e.g. `U+FFFD`) or sequence (e.g. `?`).
142//! You can also use `EncoderTrap::Strict` and `DecoderTrap::Strict` traps to stop on an error.
143//!
144//! There are two ways to get `Encoding`:
145//!
146//! * `encoding::all` has static items for every supported encoding.
147//! You should use them when the encoding would not change or only a handful of them are required.
148//! Combined with link-time optimization, any unused encoding would be discarded from the binary.
149//!
150//! * `encoding::label` has functions to dynamically get an encoding from given string ("label").
151//! They will return a static reference to the encoding,
152//! whose type is also known as `EncodingRef`.
153//! It is useful when a list of required encodings is not available in advance,
154//! but it will result in a larger binary and missed optimization opportunities.
155//!
156//! **`RawEncoder`** is an experimental incremental encoder.
157//! At each step of `raw_feed`, it receives a slice of string
158//! and emits any encoded bytes to a generic `ByteWriter` (normally `Vec<u8>`).
159//! It will stop at the first error if any, and would return a `CodecError` struct in that case.
160//! The caller is responsible for calling `raw_finish` at the end of encoding process.
161//!
162//! **`RawDecoder`** is an experimental incremental decoder.
163//! At each step of `raw_feed`, it receives a slice of byte sequence
164//! and emits any decoded characters to a generic `StringWriter` (normally `String`).
165//! Otherwise it is identical to `RawEncoder`.
166//!
167//! One should prefer `Encoding::{encode,decode}` as a primary interface.
168//! `RawEncoder` and `RawDecoder` are experimental and can change substantially.
169//! See the additional documents on `encoding::types` module for more information on them.
170//!
171//! ## Supported Encodings
172//!
173//! Encoding covers all encodings specified by WHATWG Encoding Standard and some more:
174//!
175//! * 7-bit strict ASCII (`ascii`)
176//! * UTF-8 (`utf-8`)
177//! * UTF-16 in little endian (`utf-16` or `utf-16le`) and big endian (`utf-16be`)
178//! * All single byte encodings in WHATWG Encoding Standard:
179//! * IBM code page 866
180//! * ISO 8859-{2,3,4,5,6,7,8,10,13,14,15,16}
181//! * KOI8-R, KOI8-U
182//! * MacRoman (`macintosh`), Macintosh Cyrillic encoding (`x-mac-cyrillic`)
183//! * Windows code pages 874, 1250, 1251, 1252 (instead of ISO 8859-1), 1253,
184//! 1254 (instead of ISO 8859-9), 1255, 1256, 1257, 1258
185//! * All multi byte encodings in WHATWG Encoding Standard:
186//! * Windows code page 949 (`euc-kr`, since the strict EUC-KR is hardly used)
187//! * EUC-JP and Windows code page 932 (`shift_jis`,
188//! since it's the most widespread extension to Shift_JIS)
189//! * ISO-2022-JP with asymmetric JIS X 0212 support
190//! (Note: this is not yet up to date to the current standard)
191//! * GBK
192//! * GB 18030
193//! * Big5-2003 with HKSCS-2008 extensions
194//! * Encodings that were originally specified by WHATWG Encoding Standard:
195//! * HZ
196//! * ISO 8859-1 (distinct from Windows code page 1252)
197//!
198//! Parenthesized names refer to the encoding's primary name assigned by WHATWG Encoding Standard.
199//!
200//! Many legacy character encodings lack the proper specification,
201//! and even those that have a specification are highly dependent on the actual implementation.
202//! Consequently one should be careful when picking a desired character encoding.
203//! The only standards reliable in this regard are WHATWG Encoding Standard and
204//! [vendor-provided mappings from the Unicode consortium](http://www.unicode.org/Public/MAPPINGS/).
205//! Whenever in doubt, look at the source code and specifications for detailed explanations.
206
207#![cfg_attr(test, feature(test))] // lib stability features as per RFC #507
208
209extern crate encoding_index_singlebyte as index_singlebyte;
210extern crate encoding_index_korean as index_korean;
211extern crate encoding_index_japanese as index_japanese;
212extern crate encoding_index_simpchinese as index_simpchinese;
213extern crate encoding_index_tradchinese as index_tradchinese;
214
215#[cfg(test)] extern crate test;
216
217pub use self::types::{CodecError, ByteWriter, StringWriter,
218 RawEncoder, RawDecoder, EncodingRef, Encoding,
219 EncoderTrapFunc, DecoderTrapFunc, DecoderTrap,
220 EncoderTrap, decode}; // reexport
221
222#[macro_use] mod util;
223#[cfg(test)] #[macro_use] mod testutils;
224
225pub mod types;
226
227/// Codec implementations.
228pub mod codec {
229 pub mod error;
230 pub mod ascii;
231 pub mod singlebyte;
232 pub mod utf_8;
233 pub mod utf_16;
234 pub mod korean;
235 pub mod japanese;
236 pub mod simpchinese;
237 pub mod tradchinese;
238 pub mod whatwg;
239}
240
241pub mod all;
242pub mod label;
243
#[cfg(test)]
mod tests {
    use super::*;

    /// Exercises the re-exported `decode` function with ISO 8859-1 as the
    /// fallback encoding and a strict decoder trap.
    ///
    /// The cases come in pairs: the same payload with and without a leading
    /// byte order mark. With the mark, the test expects the matching Unicode
    /// encoding to be reported and the mark to be absent from the output;
    /// without it, the test expects the fallback encoding to be reported and
    /// every byte to be decoded as ISO 8859-1.
    #[test]
    fn test_decode() {
        // (input bytes, expected decoded text, expected name of the encoding used)
        let cases: [(&[u8], &str, &str); 6] = [
            // UTF-8 byte order mark vs. the bare payload.
            (&[0xEF, 0xBB, 0xBF, 0xC3, 0xA9], "é", "utf-8"),
            (&[0xC3, 0xA9], "é", "iso-8859-1"),
            // UTF-16 big endian byte order mark vs. the bare payload.
            (&[0xFE, 0xFF, 0x00, 0xE9], "é", "utf-16be"),
            (&[0x00, 0xE9], "\x00é", "iso-8859-1"),
            // UTF-16 little endian byte order mark vs. the bare payload.
            (&[0xFF, 0xFE, 0xE9, 0x00], "é", "utf-16le"),
            (&[0xE9, 0x00], "é\x00", "iso-8859-1"),
        ];

        for &(input, expected_result, expected_encoding) in cases.iter() {
            let fallback = all::ISO_8859_1 as EncodingRef;
            let (decoded, used_encoding) = decode(input, DecoderTrap::Strict, fallback);
            // Strict decoding must succeed for every case in the table.
            let text = decoded.unwrap();
            assert_eq!(used_encoding.name(), expected_encoding);
            assert_eq!(&text[..], expected_result);
        }
    }
}
267}