encoding/codec/
simpchinese.rs

// This is a part of rust-encoding.
// Copyright (c) 2013-2015, Kang Seonghoon.
// See README.md and LICENSE.txt for details.

//! Legacy simplified Chinese encodings based on GB 2312 and GB 18030.
6
7use std::convert::Into;
8use std::marker::PhantomData;
9use std::default::Default;
10use util::StrCharIndex;
11use index_simpchinese as index;
12use types::*;
13
/// An implementation type for GBK.
///
/// This is a zero-sized marker type that
/// can be used as a type parameter to `GBEncoding` and `GBEncoder`.
/// (`GB18030Decoder` is shared by both.)
#[derive(Clone, Copy)]
pub struct GBK;
20
/// An implementation type for GB18030.
///
/// This is a zero-sized marker type that
/// can be used as a type parameter to `GBEncoding` and `GBEncoder`.
/// (`GB18030Decoder` is shared by both.)
#[derive(Clone, Copy)]
pub struct GB18030;
27
/// An internal trait used to customize GBK and GB18030 implementations.
///
/// It is implemented by the marker types `GBK` and `GB18030`; all methods are
/// associated functions, so no value of the implementing type is ever needed.
#[doc(hidden)] // XXX never intended to be used publicly, should be gone later
pub trait GBType: Clone + 'static {
    /// The name that `GBEncoding<Self>` reports from `Encoding::name`.
    fn name() -> &'static str;
    /// The name that `GBEncoding<Self>` reports from `Encoding::whatwg_name`, if any.
    fn whatwg_name() -> Option<&'static str>;
    /// Whether `GBEncoder<Self>` runs in GBK mode: when true, U+20AC is written as
    /// the single byte 0x80 and characters without a two-byte mapping are rejected
    /// instead of being written as four-byte sequences.
    fn initial_gbk_flag() -> bool;
}
35
36impl GBType for GBK {
37    fn name() -> &'static str { "gbk" }
38    fn whatwg_name() -> Option<&'static str> { Some("gbk") }
39    fn initial_gbk_flag() -> bool { true }
40}
41
42impl GBType for GB18030 {
43    fn name() -> &'static str { "gb18030" }
44    fn whatwg_name() -> Option<&'static str> { Some("gb18030") }
45    fn initial_gbk_flag() -> bool { false }
46}
47
/**
 * GBK and GB 18030-2005.
 *
 * The original GBK 1.0 region spans `[81-FE] [40-7E 80-FE]`, and is derived from
 * several different revisions of a family of encodings named "GBK":
 *
 * - GBK as specified in the normative annex of GB 13000.1-93,
 *   the domestic standard equivalent to Unicode 1.1,
 *   consisted of characters included in Unicode 1.1 and not in GB 2312-80.
 * - Windows code page 936 is the widespread extension to GBK.
 * - Due to the popularity of Windows code page 936,
 *   a formal encoding based on Windows code page 936 (while adding new characters)
 *   was standardized into GBK 1.0.
 * - Finally, GB 18030 added four-byte sequences to GBK to become a pan-Unicode encoding,
 *   while adding new characters to the (former) GBK region again.
 *
 * GB 18030-2005 is a simplified Chinese encoding which extends GBK 1.0 to a pan-Unicode encoding.
 * It assigns four-byte sequences to every Unicode codepoint missing from the GBK area,
 * lexicographically ordered with occasional "gaps" for codepoints in the GBK area.
 * Due to this compatibility decision,
 * there is no simple relationship between these four-byte sequences and Unicode codepoints,
 * though there *exists* a relatively simple mapping algorithm with a small lookup table.
 *
 * ## Specialization
 *
 * This type is specialized with GBType `T`,
 * which should be either `GBK` or `GB18030`.
 */
#[derive(Clone, Copy)]
pub struct GBEncoding<T> {
    // Zero-sized marker that ties the encoding to its `GBType`; no `T` value is stored.
    _marker: PhantomData<T>
}
80
/// A type for GBK, i.e. `GBEncoding` specialized with the `GBK` marker.
pub type GBKEncoding = GBEncoding<GBK>;
/// A type for GB18030, i.e. `GBEncoding` specialized with the `GB18030` marker.
pub type GB18030Encoding = GBEncoding<GB18030>;
85
/// An instance for GBK. The value carries no data besides the type-level marker.
pub const GBK_ENCODING: GBKEncoding = GBEncoding { _marker: PhantomData };
/// An instance for GB18030. The value carries no data besides the type-level marker.
pub const GB18030_ENCODING: GB18030Encoding = GBEncoding { _marker: PhantomData };
90
91impl<T: GBType> Encoding for GBEncoding<T> {
92    fn name(&self) -> &'static str { <T as GBType>::name() }
93    fn whatwg_name(&self) -> Option<&'static str> { <T as GBType>::whatwg_name() }
94    fn raw_encoder(&self) -> Box<RawEncoder> { GBEncoder::<T>::new() }
95    fn raw_decoder(&self) -> Box<RawDecoder> { GB18030Decoder::new() }
96}
97
/**
 * An encoder for GBK and GB18030.
 *
 * The encoder keeps no state between calls; the only field is a type-level marker.
 *
 * ## Specialization
 *
 * This type is specialized with GBType `T`,
 * which should be either `GBK` or `GB18030`.
 */
#[derive(Clone, Copy)]
pub struct GBEncoder<T> {
    // Zero-sized marker selecting the `GBType`; no `T` value is stored.
    _marker: PhantomData<T>
}
110
111impl<T: GBType> GBEncoder<T> {
112    pub fn new() -> Box<RawEncoder> {
113        Box::new(GBEncoder::<T> { _marker: PhantomData })
114    }
115}
116
117impl<T: GBType> RawEncoder for GBEncoder<T> {
118    fn from_self(&self) -> Box<RawEncoder> { GBEncoder::<T>::new() }
119    fn is_ascii_compatible(&self) -> bool { true }
120
121    fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>) {
122        output.writer_hint(input.len());
123
124        let gbk_flag = <T as GBType>::initial_gbk_flag();
125        for ((i, j), ch) in input.index_iter() {
126            if ch < '\u{80}' {
127                output.write_byte(ch as u8);
128            } else if gbk_flag && ch == '\u{20AC}' {
129                output.write_byte('\u{80}' as u8)
130            } else {
131                let ptr = index::gb18030::backward(ch as u32);
132                if ptr == 0xffff {
133                    if gbk_flag {
134                        return (i, Some(CodecError {
135                            upto: j as isize,
136                            cause: "gbk doesn't support gb18030 extensions".into()
137                        }));
138                    }
139                    let ptr = index::gb18030_ranges::backward(ch as u32);
140                    assert!(ptr != 0xffffffff);
141                    let (ptr, byte4) = (ptr / 10, ptr % 10);
142                    let (ptr, byte3) = (ptr / 126, ptr % 126);
143                    let (byte1, byte2) = (ptr / 10, ptr % 10);
144                    output.write_byte((byte1 + 0x81) as u8);
145                    output.write_byte((byte2 + 0x30) as u8);
146                    output.write_byte((byte3 + 0x81) as u8);
147                    output.write_byte((byte4 + 0x30) as u8);
148                } else {
149                    let lead = ptr / 190 + 0x81;
150                    let trail = ptr % 190;
151                    let trailoffset = if trail < 0x3f {0x40} else {0x41};
152                    output.write_byte(lead as u8);
153                    output.write_byte((trail + trailoffset) as u8);
154                }
155            }
156        }
157        (input.len(), None)
158    }
159
160    fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> {
161        None
162    }
163}
164
/// A decoder for GB 18030 (also used by GBK).
///
/// The decoding logic lives in the `gb18030` state machine module;
/// this type only stores the machine's state between calls.
#[derive(Clone, Copy)]
struct GB18030Decoder {
    // State of the `gb18030` state machine, carried from one call to the next.
    st: gb18030::State,
}
170
171impl GB18030Decoder {
172    pub fn new() -> Box<RawDecoder> {
173        Box::new(GB18030Decoder { st: Default::default() })
174    }
175}
176
177impl RawDecoder for GB18030Decoder {
178    fn from_self(&self) -> Box<RawDecoder> { GB18030Decoder::new() }
179    fn is_ascii_compatible(&self) -> bool { true }
180
181    fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) {
182        let (st, processed, err) = gb18030::raw_feed(self.st, input, output, &());
183        self.st = st;
184        (processed, err)
185    }
186
187    fn raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError> {
188        let (st, err) = gb18030::raw_finish(self.st, output, &());
189        self.st = st;
190        err
191    }
192}
193
// Byte-level state machine of the GB 18030 decoder (shared with GBK).
// The `stateful_decoder!` macro is defined elsewhere; the code in this file uses the
// `State` type and the `raw_feed`/`raw_finish` functions of the resulting `gb18030` module.
// Only plain `//` comments are used inside the macro invocation.
stateful_decoder! {
    module gb18030;

    // Looks up a two-byte sequence. Sequences inside `[81-FE] [40-7E 80-FE]` are
    // turned into a table position with 190 trail positions per lead byte
    // (0x7f is skipped among the trail bytes); everything else is looked up as 0xffff.
    // The states below treat a result of 0xffff as "no such character".
    internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 {
        use index_simpchinese as index;

        let lead = lead as u16;
        let trail = trail as u16;
        let index = match (lead, trail) {
            (0x81...0xfe, 0x40...0x7e) | (0x81...0xfe, 0x80...0xfe) => {
                let trailoffset = if trail < 0x7f {0x40} else {0x41};
                (lead - 0x81) * 190 + trail - trailoffset
            }
            _ => 0xffff,
        };
        index::gb18030::forward(index)
    }

    // Looks up a four-byte sequence by its linear position: the four bytes are
    // mixed-radix digits with 10 * 126 * 10 combinations per first byte.
    // The states below treat a result of 0xffffffff as "no such character".
    internal pub fn map_four_bytes(b1: u8, b2: u8, b3: u8, b4: u8) -> u32 {
        use index_simpchinese as index;

        // no range check here, caller should have done all checks
        let index = (b1 as u32 - 0x81) * 12600 + (b2 as u32 - 0x30) * 1260 +
                    (b3 as u32 - 0x81) * 10 + (b4 as u32 - 0x30);
        index::gb18030_ranges::forward(index)
    }

initial:
    // gb18030 first = 0x00, gb18030 second = 0x00, gb18030 third = 0x00
    // ASCII is emitted as is, the lone byte 0x80 decodes to U+20AC,
    // and 0x81-0xfe starts a multi-byte sequence.
    state S0(ctx: Context) {
        case b @ 0x00...0x7f => ctx.emit(b as u32);
        case 0x80 => ctx.emit(0x20ac);
        case b @ 0x81...0xfe => S1(ctx, b);
        case _ => ctx.err("invalid sequence");
    }

transient:
    // gb18030 first != 0x00, gb18030 second = 0x00, gb18030 third = 0x00
    // A digit byte (0x30-0x39) can only continue a four-byte sequence, since
    // two-byte trail bytes start at 0x40; any other byte is tried as a two-byte trail.
    state S1(ctx: Context, first: u8) {
        case b @ 0x30...0x39 => S2(ctx, first, b);
        case b => match map_two_bytes(first, b) {
            0xffff => ctx.backup_and_err(1, "invalid sequence"), // unconditional
            ch => ctx.emit(ch)
        };
    }

    // gb18030 first != 0x00, gb18030 second != 0x00, gb18030 third = 0x00
    // The third byte of a four-byte sequence must be in 0x81-0xfe.
    state S2(ctx: Context, first: u8, second: u8) {
        case b @ 0x81...0xfe => S3(ctx, first, second, b);
        case _ => ctx.backup_and_err(2, "invalid sequence");
    }

    // gb18030 first != 0x00, gb18030 second != 0x00, gb18030 third != 0x00
    // The fourth byte must be a digit byte again, and the whole sequence must be mapped.
    state S3(ctx: Context, first: u8, second: u8, third: u8) {
        case b @ 0x30...0x39 => match map_four_bytes(first, second, third, b) {
            0xffffffff => ctx.backup_and_err(3, "invalid sequence"), // unconditional
            ch => ctx.emit(ch)
        };
        case _ => ctx.backup_and_err(3, "invalid sequence");
    }
}
255
// Tests and benchmarks for the GB 18030 encoder and the shared decoder.
#[cfg(test)]
mod gb18030_tests {
    extern crate test;
    use super::GB18030_ENCODING;
    use testutils;
    use types::*;

    // ASCII, two-byte and four-byte outputs of the encoder.
    #[test]
    fn test_encoder() {
        let mut e = GB18030_ENCODING.raw_encoder();
        assert_feed_ok!(e, "A", "", [0x41]);
        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(e, "", "", []);
        assert_feed_ok!(e, "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}", "",
                        [0xd6, 0xd0, 0xbb, 0xaa, 0xc8, 0xcb, 0xc3, 0xf1,
                         0xb9, 0xb2, 0xba, 0xcd, 0xb9, 0xfa]);
        // unlike GBK, the euro sign is written in its two-byte form
        assert_feed_ok!(e, "1\u{20ac}/m", "", [0x31, 0xa2, 0xe3, 0x2f, 0x6d]);
        assert_feed_ok!(e, "\u{ff21}\u{ff22}\u{ff23}", "", [0xa3, 0xc1, 0xa3, 0xc2, 0xa3, 0xc3]);
        assert_feed_ok!(e, "\u{80}", "", [0x81, 0x30, 0x81, 0x30]);
        assert_feed_ok!(e, "\u{81}", "", [0x81, 0x30, 0x81, 0x31]);
        assert_feed_ok!(e, "\u{a3}", "", [0x81, 0x30, 0x84, 0x35]);
        assert_feed_ok!(e, "\u{a4}", "", [0xa1, 0xe8]);
        assert_feed_ok!(e, "\u{a5}", "", [0x81, 0x30, 0x84, 0x36]);
        assert_feed_ok!(e, "\u{10ffff}", "", [0xe3, 0x32, 0x9a, 0x35]);
        assert_feed_ok!(e, "\u{2a6a5}\u{3007}", "", [0x98, 0x35, 0xee, 0x37, 0xa9, 0x96]);
        assert_finish_ok!(e, []);
    }

    // Complete sequences of every length, each fed in one piece.
    #[test]
    fn test_decoder_valid() {
        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [0x41], [], "A");
        assert_feed_ok!(d, [0x42, 0x43], [], "BC");
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(d, [0xd6, 0xd0, 0xbb, 0xaa, 0xc8, 0xcb, 0xc3, 0xf1,
                            0xb9, 0xb2, 0xba, 0xcd, 0xb9, 0xfa], [],
                        "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}");
        // the lone byte 0x80 decodes to the euro sign
        assert_feed_ok!(d, [0x31, 0x80, 0x2f, 0x6d], [], "1\u{20ac}/m");
        assert_feed_ok!(d, [0xa3, 0xc1, 0xa3, 0xc2, 0xa3, 0xc3], [], "\u{ff21}\u{ff22}\u{ff23}");
        assert_feed_ok!(d, [0x81, 0x30, 0x81, 0x30], [], "\u{80}");
        assert_feed_ok!(d, [0x81, 0x30, 0x81, 0x31], [], "\u{81}");
        assert_feed_ok!(d, [0x81, 0x30, 0x84, 0x35], [], "\u{a3}");
        assert_feed_ok!(d, [0xa1, 0xe8], [], "\u{a4}" );
        assert_feed_ok!(d, [0x81, 0x30, 0x84, 0x36], [], "\u{a5}");
        assert_feed_ok!(d, [0xe3, 0x32, 0x9a, 0x35], [], "\u{10ffff}");
        assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37, 0xa9, 0x96], [], "\u{2a6a5}\u{3007}");
        assert_finish_ok!(d, "");
    }

    // Sequences split across feeds at every possible position.
    #[test]
    fn test_decoder_valid_partial() {
        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [], [0xa1], "");
        assert_feed_ok!(d, [0xa1], [], "\u{3000}");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [], [0x30], "");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [0x30], [], "\u{80}");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [], [0x30], "");
        assert_feed_ok!(d, [0x81, 0x31], [], "\u{81}");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [0x30, 0x81, 0x32], [], "\u{82}");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [], [0x30, 0x81], "");
        assert_feed_ok!(d, [0x33], [], "\u{83}");
        assert_feed_ok!(d, [], [0x81, 0x30], "");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [0x34], [], "\u{84}");
        assert_feed_ok!(d, [], [0x81, 0x30], "");
        assert_feed_ok!(d, [0x81, 0x35], [], "\u{85}");
        assert_feed_ok!(d, [], [0x81, 0x30, 0x81], "");
        assert_feed_ok!(d, [0x36], [], "\u{86}");
        assert_finish_ok!(d, "");
    }

    // Finishing in the middle of a sequence of any length is an error.
    #[test]
    fn test_decoder_invalid_partial() {
        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [], [0xa1], "");
        assert_finish_err!(d, "");

        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [], [0x81], "");
        assert_finish_err!(d, "");

        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [], [0x81, 0x30], "");
        assert_finish_err!(d, "");

        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [], [0x81, 0x30, 0x81], "");
        assert_finish_err!(d, "");
    }

    // Bytes outside the valid range at each position: only the lead byte is reported
    // as the problem, the bytes after it are left unprocessed.
    #[test]
    fn test_decoder_invalid_out_of_range() {
        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_err!(d, [], [0xff], [], "");
        assert_feed_err!(d, [], [0x81], [0x00], "");
        assert_feed_err!(d, [], [0x81], [0x7f], "");
        assert_feed_err!(d, [], [0x81], [0xff], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x00], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x80], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0xff], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0x00], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0x2f], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0x3a], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0xff], "");
        assert_finish_ok!(d, "");
    }

    #[test]
    fn test_decoder_invalid_boundary() {
        // U+10FFFF (E3 32 9A 35) is the last Unicode codepoint, E3 32 9A 36 is invalid.
        // note that since the 2nd to 4th bytes may coincide with ASCII, bytes 32 9A 36 is
        // not considered to be in the problem. this is compatible to WHATWG Encoding standard.
        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [], [0xe3], "");
        assert_feed_err!(d, [], [], [0x32, 0x9a, 0x36], "");
        assert_finish_ok!(d, "");

        // same sequence, but with the first three bytes already consumed by earlier feeds
        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [], [0xe3], "");
        assert_feed_ok!(d, [], [0x32, 0x9a], "");
        assert_feed_err!(d, -2, [], [], [0x32, 0x9a, 0x36], "");
        assert_finish_ok!(d, "");
    }

    // A failed finish must leave the decoder usable for a fresh input.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [0xd2, 0xbb], [0xd2], "\u{4e00}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0xd2, 0xbb], [], "\u{4e00}");
        assert_finish_ok!(d, "");

        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [0x98, 0x35, 0xee], "\u{2a6a5}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [0x98, 0x35], "\u{2a6a5}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [0x98], "\u{2a6a5}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [], "\u{2a6a5}");
        assert_finish_ok!(d, "");
    }

    // Throughput of encoding the shared simplified Chinese sample text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::SIMPLIFIED_CHINESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            GB18030_ENCODING.encode(&s, EncoderTrap::Strict)
        }))
    }

    // Throughput of decoding the encoded form of the same sample text.
    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let s = GB18030_ENCODING.encode(testutils::SIMPLIFIED_CHINESE_TEXT,
                                       EncoderTrap::Strict).ok().unwrap();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            GB18030_ENCODING.decode(&s, DecoderTrap::Strict)
        }))
    }
}
423
// Tests and benchmarks for the GBK encoder.
#[cfg(test)]
mod gbk_tests {
    extern crate test;
    use super::GBK_ENCODING;
    use testutils;
    use types::*;

    // GBK and GB 18030 share the same decoder logic.

    // Same inputs as the GB 18030 encoder test; the differences are the euro sign
    // (single byte 0x80) and the characters that would need four bytes (rejected).
    #[test]
    fn test_encoder() {
        let mut e = GBK_ENCODING.raw_encoder();
        assert_feed_ok!(e, "A", "", [0x41]);
        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(e, "", "", []);
        assert_feed_ok!(e, "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}", "",
                        [0xd6, 0xd0, 0xbb, 0xaa, 0xc8, 0xcb, 0xc3, 0xf1,
                         0xb9, 0xb2, 0xba, 0xcd, 0xb9, 0xfa]);
        assert_feed_ok!(e, "1\u{20ac}/m", "", [0x31, 0x80, 0x2f, 0x6d]);
        assert_feed_ok!(e, "\u{ff21}\u{ff22}\u{ff23}", "", [0xa3, 0xc1, 0xa3, 0xc2, 0xa3, 0xc3]);
        assert_feed_err!(e, "", "\u{80}", "", []);
        assert_feed_err!(e, "", "\u{81}", "", []);
        assert_feed_err!(e, "", "\u{a3}", "", []);
        assert_feed_ok!(e, "\u{a4}", "", [0xa1, 0xe8]);
        assert_feed_err!(e, "", "\u{a5}", "", []);
        assert_feed_err!(e, "", "\u{10ffff}", "", []);
        // output written before the failing character is kept
        assert_feed_err!(e, "", "\u{2a6a5}", "\u{3007}", []);
        assert_feed_err!(e, "\u{3007}", "\u{2a6a5}", "", [0xa9, 0x96]);
        assert_finish_ok!(e, []);
    }

    // Throughput of encoding the shared simplified Chinese sample text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::SIMPLIFIED_CHINESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            GBK_ENCODING.encode(&s, EncoderTrap::Strict)
        }))
    }
}
464
/**
 * HZ. (RFC 1843)
 *
 * This is a simplified Chinese encoding based on GB 2312.
 * It resembles ISO 2022 encodings in that the printable escape sequences `~{`
 * and `~}` are used to delimit a sequence of 7-bit-safe GB 2312 sequences. For comparison,
 * they are equivalent to ISO-2022-CN escape sequences `ESC $ ) A` and `ESC ( B`.
 * Additional escape sequences `~~` (for a literal `~`) and `~\n` (ignored) are also supported.
 */
#[derive(Clone, Copy)]
pub struct HZEncoding;
476
477impl Encoding for HZEncoding {
478    fn name(&self) -> &'static str { "hz" }
479    fn whatwg_name(&self) -> Option<&'static str> { None }
480    fn raw_encoder(&self) -> Box<RawEncoder> { HZEncoder::new() }
481    fn raw_decoder(&self) -> Box<RawDecoder> { HZDecoder::new() }
482}
483
/// An encoder for HZ.
#[derive(Clone, Copy)]
pub struct HZEncoder {
    // True while the output is inside a `~{` ... `~}` section, i.e. `~{` has been
    // written and the matching `~}` has not been written yet.
    escaped: bool,
}
489
490impl HZEncoder {
491    pub fn new() -> Box<RawEncoder> { Box::new(HZEncoder { escaped: false }) }
492}
493
494impl RawEncoder for HZEncoder {
495    fn from_self(&self) -> Box<RawEncoder> { HZEncoder::new() }
496    fn is_ascii_compatible(&self) -> bool { false }
497
498    fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>) {
499        output.writer_hint(input.len());
500
501        let mut escaped = self.escaped;
502        macro_rules! ensure_escaped(
503            () => (if !escaped { output.write_bytes(b"~{"); escaped = true; })
504        );
505        macro_rules! ensure_unescaped(
506            () => (if escaped { output.write_bytes(b"~}"); escaped = false; })
507        );
508
509        for ((i,j), ch) in input.index_iter() {
510            if ch < '\u{80}' {
511                ensure_unescaped!();
512                output.write_byte(ch as u8);
513                if ch == '~' { output.write_byte('~' as u8); }
514            } else {
515                let ptr = index::gb18030::backward(ch as u32);
516                if ptr == 0xffff {
517                    self.escaped = escaped; // do NOT reset the state!
518                    return (i, Some(CodecError {
519                        upto: j as isize, cause: "unrepresentable character".into()
520                    }));
521                } else {
522                    let lead = ptr / 190;
523                    let trail = ptr % 190;
524                    if lead < 0x21 - 1 || trail < 0x21 + 0x3f { // GBK extension, ignored
525                        self.escaped = escaped; // do NOT reset the state!
526                        return (i, Some(CodecError {
527                            upto: j as isize, cause: "unrepresentable character".into()
528                        }));
529                    } else {
530                        ensure_escaped!();
531                        output.write_byte((lead + 1) as u8);
532                        output.write_byte((trail - 0x3f) as u8);
533                    }
534                }
535            }
536        }
537
538        self.escaped = escaped;
539        (input.len(), None)
540    }
541
542    fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> {
543        None
544    }
545}
546
/// A decoder for HZ.
///
/// The decoding logic lives in the `hz` state machine module;
/// this type only stores the machine's state between calls.
#[derive(Clone, Copy)]
struct HZDecoder {
    // State of the `hz` state machine, carried from one call to the next.
    st: hz::State,
}
552
553impl HZDecoder {
554    pub fn new() -> Box<RawDecoder> {
555        Box::new(HZDecoder { st: Default::default() })
556    }
557}
558
559impl RawDecoder for HZDecoder {
560    fn from_self(&self) -> Box<RawDecoder> { HZDecoder::new() }
561    fn is_ascii_compatible(&self) -> bool { true }
562
563    fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) {
564        let (st, processed, err) = hz::raw_feed(self.st, input, output, &());
565        self.st = st;
566        (processed, err)
567    }
568
569    fn raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError> {
570        let (st, err) = hz::raw_finish(self.st, output, &());
571        self.st = st;
572        err
573    }
574}
575
576stateful_decoder! {
577    module hz;
578
579    internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 {
580        use index_simpchinese as index;
581
582        let lead = lead as u16;
583        let trail = trail as u16;
584        let index = match (lead, trail) {
585            (0x20...0x7f, 0x21...0x7e) => (lead - 1) * 190 + (trail + 0x3f),
586            _ => 0xffff,
587        };
588        index::gb18030::forward(index)
589    }
590
591initial:
592    // hz-gb-2312 flag = unset, hz-gb-2312 lead = 0x00
593    state A0(ctx: Context) {
594        case 0x7e => A1(ctx);
595        case b @ 0x00...0x7f => ctx.emit(b as u32);
596        case _ => ctx.err("invalid sequence");
597        final => ctx.reset();
598    }
599
600checkpoint:
601    // hz-gb-2312 flag = set, hz-gb-2312 lead = 0x00
602    state B0(ctx: Context) {
603        case 0x7e => B1(ctx);
604        case b @ 0x20...0x7f => B2(ctx, b);
605        case 0x0a => ctx.err("invalid sequence"); // error *and* reset
606        case _ => ctx.err("invalid sequence"), B0(ctx);
607        final => ctx.reset();
608    }
609
610transient:
611    // hz-gb-2312 flag = unset, hz-gb-2312 lead = 0x7e
612    state A1(ctx: Context) {
613        case 0x7b => B0(ctx);
614        case 0x7d => A0(ctx);
615        case 0x7e => ctx.emit(0x7e), A0(ctx);
616        case 0x0a => A0(ctx);
617        case _ => ctx.backup_and_err(1, "invalid sequence");
618        final => ctx.err("incomplete sequence");
619    }
620
621    // hz-gb-2312 flag = set, hz-gb-2312 lead = 0x7e
622    state B1(ctx: Context) {
623        case 0x7b => B0(ctx);
624        case 0x7d => A0(ctx);
625        case 0x7e => ctx.emit(0x7e), B0(ctx);
626        case 0x0a => A0(ctx);
627        case _ => ctx.backup_and_err(1, "invalid sequence"), B0(ctx);
628        final => ctx.err("incomplete sequence");
629    }
630
631    // hz-gb-2312 flag = set, hz-gb-2312 lead != 0 & != 0x7e
632    state B2(ctx: Context, lead: u8) {
633        case 0x0a => ctx.err("invalid sequence"); // should reset the state!
634        case b =>
635            match map_two_bytes(lead, b) {
636                0xffff => ctx.err("invalid sequence"),
637                ch => ctx.emit(ch)
638            },
639            B0(ctx);
640        final => ctx.err("incomplete sequence");
641    }
642}
643
// Tests and benchmarks for the HZ encoder and decoder.
#[cfg(test)]
mod hz_tests {
    extern crate test;
    use super::HZEncoding;
    use testutils;
    use types::*;

    // Mode switches are written only when the mode actually changes,
    // and a literal `~` is doubled.
    #[test]
    fn test_encoder_valid() {
        let mut e = HZEncoding.raw_encoder();
        assert_feed_ok!(e, "A", "", *b"A");
        assert_feed_ok!(e, "BC", "", *b"BC");
        assert_feed_ok!(e, "", "", *b"");
        assert_feed_ok!(e, "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}", "",
                        *b"~{VP;*HKCq92:M9z");
        // still in GB mode from the previous feed, so no `~{` is written here
        assert_feed_ok!(e, "\u{ff21}\u{ff22}\u{ff23}", "", *b"#A#B#C");
        assert_feed_ok!(e, "1\u{20ac}/m", "", *b"~}1~{\"c~}/m");
        assert_feed_ok!(e, "~<\u{a4}~\u{0a4}>~", "", *b"~~<~{!h~}~~~{!h~}>~~");
        assert_finish_ok!(e, []);
    }

    // Unmapped characters and GBK-only characters are rejected.
    #[test]
    fn test_encoder_invalid() {
        let mut e = HZEncoding.raw_encoder();
        assert_feed_err!(e, "", "\u{ffff}", "", []);
        assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]);
        // no support for GBK extension
        assert_feed_err!(e, "", "\u{3007}", "", []);
        assert_finish_ok!(e, []);
    }

    // Escape sequences split across feeds, in both modes.
    #[test]
    fn test_decoder_valid() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"A", *b"", "A");
        assert_feed_ok!(d, *b"BC", *b"", "BC");
        assert_feed_ok!(d, *b"D~~E", *b"~", "D~E");
        assert_feed_ok!(d, *b"~F~\nG", *b"~", "~FG");
        assert_feed_ok!(d, *b"", *b"", "");
        assert_feed_ok!(d, *b"\nH", *b"~", "H");
        assert_feed_ok!(d, *b"{VP~}~{;*~{HKCq92:M9z", *b"",
                        "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}");
        assert_feed_ok!(d, *b"", *b"#", "");
        assert_feed_ok!(d, *b"A", *b"~", "\u{ff21}");
        assert_feed_ok!(d, *b"~#B~~#C", *b"~", "~\u{ff22}~\u{ff23}");
        assert_feed_ok!(d, *b"", *b"", "");
        assert_feed_ok!(d, *b"\n#D~{#E~\n#F~{#G", *b"~", "#D\u{ff25}#F\u{ff27}");
        assert_feed_ok!(d, *b"}X~}YZ", *b"", "XYZ");
        assert_finish_ok!(d, "");
    }

    // Byte pairs at the edges of the valid 0x21-0x7e range.
    #[test]
    fn test_decoder_invalid_out_of_range() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"~{", *b"", "");
        assert_feed_err!(d, *b"", *b"\x20\x20", *b"", "");
        assert_feed_err!(d, *b"", *b"\x20\x7f", *b"", ""); // do not reset the state (except for LF)
        assert_feed_err!(d, *b"", *b"\x21\x7f", *b"", "");
        assert_feed_err!(d, *b"", *b"\x7f\x20", *b"", "");
        assert_feed_err!(d, *b"", *b"\x7f\x21", *b"", "");
        assert_feed_err!(d, *b"", *b"\x7f\x7f", *b"", "");
        assert_finish_ok!(d, "");
    }

    #[test]
    fn test_decoder_invalid_line_feed() {
        // LF (`\n`) in the multibyte mode is invalid but *also* resets the state
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"~{#A", *b"", "\u{ff21}");
        assert_feed_err!(d, *b"", *b"\n", *b"", "");
        assert_feed_ok!(d, *b"#B~{#C", *b"", "#B\u{ff23}");
        assert_feed_err!(d, *b"", *b"#\n", *b"", "");
        assert_feed_ok!(d, *b"#D", *b"", "#D");
        assert_finish_ok!(d, "");
    }

    // Finishing in the middle of an escape sequence or a character is an error.
    #[test]
    fn test_decoder_invalid_partial() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"", *b"~", "");
        assert_finish_err!(d, "");

        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"~{", *b"#", "");
        assert_finish_err!(d, "");

        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"~{#A", *b"~", "\u{ff21}");
        assert_finish_err!(d, "");
    }

    // An unknown escape reports only the `~` and keeps the current mode.
    #[test]
    fn test_decoder_invalid_escape() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"#A", *b"", "#A");
        assert_feed_err!(d, *b"", *b"~", *b"xy", "");
        assert_feed_ok!(d, *b"#B", *b"", "#B");
        assert_feed_ok!(d, *b"", *b"~", "");
        assert_feed_err!(d, *b"", *b"", *b"xy", "");
        assert_feed_ok!(d, *b"#C~{#D", *b"", "#C\u{ff24}");
        assert_feed_err!(d, *b"", *b"~", *b"xy", "");
        assert_feed_ok!(d, *b"#E", *b"", "\u{ff25}"); // does not reset to ASCII
        assert_feed_ok!(d, *b"", *b"~", "");
        assert_feed_err!(d, *b"", *b"", *b"xy", "");
        assert_feed_ok!(d, *b"#F~}#G", *b"", "\u{ff26}#G");
        assert_finish_ok!(d, "");
    }

    // A failed finish must leave the decoder usable for a fresh input.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"R;~{R;", *b"R", "R;\u{4e00}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, *b"R;~{R;", *b"", "R;\u{4e00}");
        assert_finish_ok!(d, "");
    }

    // Throughput of encoding the shared simplified Chinese sample text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::SIMPLIFIED_CHINESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            HZEncoding.encode(&s, EncoderTrap::Strict)
        }))
    }

    // Throughput of decoding the encoded form of the same sample text.
    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let s = HZEncoding.encode(testutils::SIMPLIFIED_CHINESE_TEXT,
                                  EncoderTrap::Strict).ok().unwrap();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            HZEncoding.decode(&s, DecoderTrap::Strict)
        }))
    }
}
780