encoding/codec/
japanese.rs

1// This is a part of rust-encoding.
2// Copyright (c) 2013-2015, Kang Seonghoon.
3// See README.md and LICENSE.txt for details.
4
5//! Legacy Japanese encodings based on JIS X 0208 and JIS X 0212.
6
7use std::convert::Into;
8use std::default::Default;
9use util::StrCharIndex;
10use index_japanese as index;
11use types::*;
12use self::ISO2022JPState::{ASCII,Katakana,Lead};
13
14/**
15 * EUC-JP. (XXX with asymmetric JIS X 0212 support)
16 *
17 * This is a Japanese encoding created from three JIS character sets:
18 *
19 * - JIS X 0201, which lower half is ISO/IEC 646:JP (US-ASCII with yen sign and overline)
20 *   and upper half contains legacy half-width Katakanas.
21 * - JIS X 0208, a primary graphic character set (94x94).
22 * - JIS X 0212, a supplementary graphic character set (94x94).
23 *
24 * EUC-JP contains the lower half of JIS X 0201 in G0 (`[21-7E]`),
25 * JIS X 0208 in G1 (`[A1-FE] [A1-FE]`),
26 * the upper half of JIS X 0212 in G2 (`8E [A1-DF]`), and
27 * JIS X 0212 in G3 (`8F [A1-FE] [A1-FE]`).
28 */
29#[derive(Clone, Copy)]
30pub struct EUCJPEncoding;
31
32impl Encoding for EUCJPEncoding {
33    fn name(&self) -> &'static str { "euc-jp" }
34    fn whatwg_name(&self) -> Option<&'static str> { Some("euc-jp") }
35    fn raw_encoder(&self) -> Box<RawEncoder> { EUCJPEncoder::new() }
36    fn raw_decoder(&self) -> Box<RawDecoder> { EUCJP0212Decoder::new() }
37}
38
39/// An encoder for EUC-JP with unused G3 character set.
40#[derive(Clone, Copy)]
41pub struct EUCJPEncoder;
42
43impl EUCJPEncoder {
44    pub fn new() -> Box<RawEncoder> { Box::new(EUCJPEncoder) }
45}
46
47impl RawEncoder for EUCJPEncoder {
48    fn from_self(&self) -> Box<RawEncoder> { EUCJPEncoder::new() }
49    fn is_ascii_compatible(&self) -> bool { true }
50
51    fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>) {
52        output.writer_hint(input.len());
53
54        for ((i,j), ch) in input.index_iter() {
55            match ch {
56                '\u{0}'...'\u{7f}' => { output.write_byte(ch as u8); }
57                '\u{a5}' => { output.write_byte(0x5c); }
58                '\u{203e}' => { output.write_byte(0x7e); }
59                '\u{ff61}'...'\u{ff9f}' => {
60                    output.write_byte(0x8e);
61                    output.write_byte((ch as usize - 0xff61 + 0xa1) as u8);
62                }
63                _ => {
64                    let ptr = index::jis0208::backward(ch as u32);
65                    if ptr == 0xffff {
66                        return (i, Some(CodecError {
67                            upto: j as isize, cause: "unrepresentable character".into()
68                        }));
69                    } else {
70                        let lead = ptr / 94 + 0xa1;
71                        let trail = ptr % 94 + 0xa1;
72                        output.write_byte(lead as u8);
73                        output.write_byte(trail as u8);
74                    }
75                }
76            }
77        }
78        (input.len(), None)
79    }
80
81    fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> {
82        None
83    }
84}
85
86/// A decoder for EUC-JP with JIS X 0212 in G3.
87#[derive(Clone, Copy)]
88struct EUCJP0212Decoder {
89    st: eucjp::State,
90}
91
92impl EUCJP0212Decoder {
93    pub fn new() -> Box<RawDecoder> {
94        Box::new(EUCJP0212Decoder { st: Default::default() })
95    }
96}
97
98impl RawDecoder for EUCJP0212Decoder {
99    fn from_self(&self) -> Box<RawDecoder> { EUCJP0212Decoder::new() }
100    fn is_ascii_compatible(&self) -> bool { true }
101
102    fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) {
103        let (st, processed, err) = eucjp::raw_feed(self.st, input, output, &());
104        self.st = st;
105        (processed, err)
106    }
107
108    fn raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError> {
109        let (st, err) = eucjp::raw_finish(self.st, output, &());
110        self.st = st;
111        err
112    }
113}
114
// State machine for the EUC-JP decoder, generated into the `eucjp` module.
// S0 is the initial state; S1..S4 are entered after a lead byte has been seen.
115stateful_decoder! {
116    module eucjp;
117
    // Converts a two-byte sequence into a 94x94 index and looks it up with
    // `index::jis0208::forward`. Both bytes must be in A1..FE; otherwise the
    // lookup is made with the index 0xffff.
118    internal pub fn map_two_0208_bytes(lead: u8, trail: u8) -> u32 {
119        use index_japanese as index;
120
121        let lead = lead as u16;
122        let trail = trail as u16;
123        let index = match (lead, trail) {
124            (0xa1...0xfe, 0xa1...0xfe) => (lead - 0xa1) * 94 + trail - 0xa1,
125            _ => 0xffff,
126        };
127        index::jis0208::forward(index)
128    }
129
    // Same index computation as `map_two_0208_bytes`, but looked up with
    // `index::jis0212::forward`.
130    internal pub fn map_two_0212_bytes(lead: u8, trail: u8) -> u32 {
131        use index_japanese as index;
132
133        let lead = lead as u16;
134        let trail = trail as u16;
135        let index = match (lead, trail) {
136            (0xa1...0xfe, 0xa1...0xfe) => (lead - 0xa1) * 94 + trail - 0xa1,
137            _ => 0xffff,
138        };
139        index::jis0212::forward(index)
140    }
141
142initial:
143    // euc-jp lead = 0x00
    // ASCII bytes are emitted as-is; 8E, 8F and A1..FE start a multi-byte sequence.
144    state S0(ctx: Context) {
145        case b @ 0x00...0x7f => ctx.emit(b as u32);
146        case 0x8e => S1(ctx);
147        case 0x8f => S2(ctx);
148        case b @ 0xa1...0xfe => S3(ctx, b);
149        case _ => ctx.err("invalid sequence");
150    }
151
152transient:
153    // euc-jp lead = 0x8e
    // JIS X 0201 half-width katakana: A1..DF maps to U+FF61..U+FF9F.
    // The second case only sees E0..FE since A1..DF is matched first.
    // NOTE(review): `backup_and_err(1, ..)` presumably leaves the offending byte
    // unconsumed so that it gets reprocessed; confirm against the macro definition.
154    state S1(ctx: Context) {
155        case b @ 0xa1...0xdf => ctx.emit(0xff61 + b as u32 - 0xa1);
156        case 0xa1...0xfe => ctx.err("invalid sequence");
157        case _ => ctx.backup_and_err(1, "invalid sequence");
158    }
159
160    // euc-jp lead = 0x8f
161    // JIS X 0212 prefix seen; the next byte is the lead of a JIS X 0212 pair
162    state S2(ctx: Context) {
163        case b @ 0xa1...0xfe => S4(ctx, b);
164        case _ => ctx.backup_and_err(1, "invalid sequence");
165    }
166
167    // euc-jp lead != 0x00, euc-jp jis0212 flag = unset
168    // JIS X 0208 two-byte sequence
169    state S3(ctx: Context, lead: u8) {
170        case b @ 0xa1...0xfe => match map_two_0208_bytes(lead, b) {
171            // do NOT backup, we only backup for out-of-range trails.
172            0xffff => ctx.err("invalid sequence"),
173            ch => ctx.emit(ch as u32)
174        };
175        case _ => ctx.backup_and_err(1, "invalid sequence");
176    }
177
178    // euc-jp lead != 0x00, euc-jp jis0212 flag = set
179    // JIS X 0212 three-byte sequence
180    state S4(ctx: Context, lead: u8) {
181        case b @ 0xa1...0xfe => match map_two_0212_bytes(lead, b) {
182            // do NOT backup, we only backup for out-of-range trails.
183            0xffff => ctx.err("invalid sequence"),
184            ch => ctx.emit(ch as u32)
185        };
186        case _ => ctx.backup_and_err(1, "invalid sequence");
187    }
188}
189
// Tests and benchmarks for the EUC-JP encoder and decoder.
// NOTE(review): the `assert_feed_*!` / `assert_finish_*!` macros are defined elsewhere;
// their arguments appear to be (coder, processed, [erroneous,] unprocessed, output) -- confirm.
190#[cfg(test)]
191mod eucjp_tests {
192    extern crate test;
193    use super::EUCJPEncoding;
194    use testutils;
195    use types::*;
196
    // ASCII, yen sign, overline, kana, half-width katakana and kanji all encode.
197    #[test]
198    fn test_encoder_valid() {
199        let mut e = EUCJPEncoding.raw_encoder();
200        assert_feed_ok!(e, "A", "", [0x41]);
201        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
202        assert_feed_ok!(e, "", "", []);
203        assert_feed_ok!(e, "\u{a5}", "", [0x5c]);
204        assert_feed_ok!(e, "\u{203e}", "", [0x7e]);
205        assert_feed_ok!(e, "\u{306b}\u{307b}\u{3093}", "", [0xa4, 0xcb, 0xa4, 0xdb, 0xa4, 0xf3]);
206        assert_feed_ok!(e, "\u{ff86}\u{ff8e}\u{ff9d}", "", [0x8e, 0xc6, 0x8e, 0xce, 0x8e, 0xdd]);
207        assert_feed_ok!(e, "\u{65e5}\u{672c}", "", [0xc6, 0xfc, 0xcb, 0xdc]);
208        assert_finish_ok!(e, []);
209    }
210
211    #[test]
212    fn test_encoder_double_mapped() {
213        // these characters are double-mapped to both EUDC area and Shift_JIS extension area
214        // but only the former should be used. (note that U+FFE2 is triple-mapped!)
215        let mut e = EUCJPEncoding.raw_encoder();
216        assert_feed_ok!(e, "\u{9ed1}\u{2170}\u{ffe2}", "", [0xfc, 0xee, 0xfc, 0xf1, 0xa2, 0xcc]);
217        assert_finish_ok!(e, []);
218    }
219
    // Unmappable characters stop the encoder with an error.
220    #[test]
221    fn test_encoder_invalid() {
222        let mut e = EUCJPEncoding.raw_encoder();
223        assert_feed_err!(e, "", "\u{ffff}", "", []);
224        assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]);
225        // JIS X 0212 is not supported in the encoder
226        assert_feed_err!(e, "", "\u{736c}", "\u{8c78}", []);
227        assert_finish_ok!(e, []);
228    }
229
    // Counterpart of `test_encoder_valid`, plus a JIS X 0212 sequence (8F prefix).
    // Note that 5C and 7E decode to ASCII backslash and tilde.
230    #[test]
231    fn test_decoder_valid() {
232        let mut d = EUCJPEncoding.raw_decoder();
233        assert_feed_ok!(d, [0x41], [], "A");
234        assert_feed_ok!(d, [0x42, 0x43], [], "BC");
235        assert_feed_ok!(d, [], [], "");
236        assert_feed_ok!(d, [0x5c], [], "\\");
237        assert_feed_ok!(d, [0x7e], [], "~");
238        assert_feed_ok!(d, [0xa4, 0xcb, 0xa4, 0xdb, 0xa4, 0xf3], [], "\u{306b}\u{307b}\u{3093}");
239        assert_feed_ok!(d, [0x8e, 0xc6, 0x8e, 0xce, 0x8e, 0xdd], [], "\u{ff86}\u{ff8e}\u{ff9d}");
240        assert_feed_ok!(d, [0xc6, 0xfc, 0xcb, 0xdc], [], "\u{65e5}\u{672c}");
241        assert_feed_ok!(d, [0x8f, 0xcb, 0xc6, 0xec, 0xb8], [], "\u{736c}\u{8c78}");
242        assert_finish_ok!(d, "");
243    }
244
    // Multi-byte sequences split across feed calls are resumed correctly.
245    #[test]
246    fn test_decoder_valid_partial() {
247        let mut d = EUCJPEncoding.raw_decoder();
248        assert_feed_ok!(d, [], [0xa4], "");
249        assert_feed_ok!(d, [0xcb], [0xa4], "\u{306b}");
250        assert_feed_ok!(d, [0xdb], [0xa4], "\u{307b}");
251        assert_feed_ok!(d, [0xf3], [], "\u{3093}");
252        assert_feed_ok!(d, [], [0x8e], "");
253        assert_feed_ok!(d, [0xc6], [0x8e], "\u{ff86}");
254        assert_feed_ok!(d, [0xce], [0x8e], "\u{ff8e}");
255        assert_feed_ok!(d, [0xdd], [], "\u{ff9d}");
256        assert_feed_ok!(d, [], [0xc6], "");
257        assert_feed_ok!(d, [0xfc], [0xcb], "\u{65e5}");
258        assert_feed_ok!(d, [0xdc], [], "\u{672c}");
259        assert_feed_ok!(d, [], [0x8f], "");
260        assert_feed_ok!(d, [], [0xcb], "");
261        assert_feed_ok!(d, [0xc6], [0xec], "\u{736c}");
262        assert_feed_ok!(d, [0xb8], [], "\u{8c78}");
263        assert_feed_ok!(d, [], [0x8f, 0xcb], "");
264        assert_feed_ok!(d, [0xc6, 0xec, 0xb8], [], "\u{736c}\u{8c78}");
265        assert_finish_ok!(d, "");
266    }
267
    // A lone lead byte is only an error once the input ends; bytes that can never
    // start a sequence (80..8D, 90..A0, FF) fail immediately.
268    #[test]
269    fn test_decoder_invalid_lone_lead_immediate_test_finish() {
270        for i in 0x8e..0x90 {
271            let mut d = EUCJPEncoding.raw_decoder();
272            assert_feed_ok!(d, [], [i], ""); // wait for a trail
273            assert_finish_err!(d, "");
274        }
275
276        for i in 0xa1..0xff {
277            let mut d = EUCJPEncoding.raw_decoder();
278            assert_feed_ok!(d, [], [i], ""); // wait for a trail
279            assert_finish_err!(d, "");
280        }
281
282        // immediate failures
283        let mut d = EUCJPEncoding.raw_decoder();
284        for i in 0x80..0x8e {
285            assert_feed_err!(d, [], [i], [], "");
286        }
287        for i in 0x90..0xa1 {
288            assert_feed_err!(d, [], [i], [], "");
289        }
290        assert_feed_err!(d, [], [0xff], [], "");
291        assert_finish_ok!(d, "");
292    }
293
    // Any non-ASCII byte followed by a space is an error that leaves the space unprocessed.
294    #[test]
295    fn test_decoder_invalid_lone_lead_followed_by_space() {
296        for i in 0x80..0x100 {
297            let i = i as u8;
298            let mut d = EUCJPEncoding.raw_decoder();
299            assert_feed_err!(d, [], [i], [0x20], "");
300            assert_finish_ok!(d, "");
301        }
302    }
303
    // An out-of-range trail byte (80 or FF) is not consumed together with the lead.
304    #[test]
305    fn test_decoder_invalid_lead_followed_by_invalid_trail() {
306        for i in 0x80..0x100 {
307            let i = i as u8;
308            let mut d = EUCJPEncoding.raw_decoder();
309            assert_feed_err!(d, [], [i], [0x80], "");
310            assert_feed_err!(d, [], [i], [0xff], "");
311            assert_finish_ok!(d, "");
312        }
313    }
314
    // `8F xx` without the final byte is an error only at the end of input.
315    #[test]
316    fn test_decoder_invalid_lone_lead_for_0212_immediate_test_finish() {
317        for i in 0xa1..0xff {
318            let mut d = EUCJPEncoding.raw_decoder();
319            assert_feed_ok!(d, [], [0x8f, i], ""); // wait for a trail
320            assert_finish_err!(d, "");
321        }
322    }
323
    // Same as above, with the two bytes fed separately.
324    #[test]
325    fn test_decoder_invalid_lone_lead_for_0212_immediate_test_finish_partial() {
326        for i in 0xa1..0xff {
327            let mut d = EUCJPEncoding.raw_decoder();
328            assert_feed_ok!(d, [], [0x8f], "");
329            assert_feed_ok!(d, [], [i], ""); // wait for a trail
330            assert_finish_err!(d, "");
331        }
332    }
333
    // After 8E, bytes below A1 are left unprocessed while E0..FE are consumed with the error.
334    #[test]
335    fn test_decoder_invalid_trail_for_0201() {
336        for i in 0..0xa1 {
337            let mut d = EUCJPEncoding.raw_decoder();
338            assert_feed_err!(d, [], [0x8e], [i], "");
339            assert_finish_ok!(d, "");
340        }
341
342        for i in 0xe0..0xff {
343            let mut d = EUCJPEncoding.raw_decoder();
344            assert_feed_err!(d, [], [0x8e, i], [], "");
345            assert_finish_ok!(d, "");
346        }
347    }
348
    // Same as above, with 8E fed in a separate call.
349    #[test]
350    fn test_decoder_invalid_trail_for_0201_partial() {
351        for i in 0..0xa1 {
352            let mut d = EUCJPEncoding.raw_decoder();
353            assert_feed_ok!(d, [], [0x8e], "");
354            assert_feed_err!(d, [], [], [i], "");
355            assert_finish_ok!(d, "");
356        }
357
358        for i in 0xe0..0xff {
359            let mut d = EUCJPEncoding.raw_decoder();
360            assert_feed_ok!(d, [], [0x8e], "");
361            assert_feed_err!(d, [], [i], [], "");
362            assert_finish_ok!(d, "");
363        }
364    }
365
    // After 8F, a byte below A1 is an error and is left unprocessed.
366    #[test]
367    fn test_decoder_invalid_middle_for_0212() {
368        for i in 0..0xa1 {
369            let mut d = EUCJPEncoding.raw_decoder();
370            assert_feed_err!(d, [], [0x8f], [i], "");
371            assert_finish_ok!(d, "");
372        }
373    }
374
    // Same as above, with 8F fed in a separate call.
375    #[test]
376    fn test_decoder_invalid_middle_for_0212_partial() {
377        for i in 0..0xa1 {
378            let mut d = EUCJPEncoding.raw_decoder();
379            assert_feed_ok!(d, [], [0x8f], "");
380            assert_feed_err!(d, [], [], [i], "");
381            assert_finish_ok!(d, "");
382        }
383    }
384
    // After `8F A1`, a byte below A1 is an error and is left unprocessed.
385    #[test]
386    fn test_decoder_invalid_trail_for_0212() {
387        for i in 0..0xa1 {
388            let mut d = EUCJPEncoding.raw_decoder();
389            assert_feed_err!(d, [], [0x8f, 0xa1], [i], "");
390            assert_finish_ok!(d, "");
391        }
392    }
393
    // Same as above, with every byte fed in a separate call.
394    #[test]
395    fn test_decoder_invalid_trail_for_0212_partial() {
396        for i in 0..0xa1 {
397            let mut d = EUCJPEncoding.raw_decoder();
398            assert_feed_ok!(d, [], [0x8f], "");
399            assert_feed_ok!(d, [], [0xa1], "");
400            assert_feed_err!(d, [], [], [i], "");
401            assert_finish_ok!(d, "");
402        }
403    }
404
    // A failed finish leaves the decoder usable for fresh input.
405    #[test]
406    fn test_decoder_feed_after_finish() {
407        let mut d = EUCJPEncoding.raw_decoder();
408        assert_feed_ok!(d, [0xa4, 0xa2], [0xa4], "\u{3042}");
409        assert_finish_err!(d, "");
410        assert_feed_ok!(d, [0xa4, 0xa2], [], "\u{3042}");
411        assert_finish_ok!(d, "");
412    }
413
    // Benchmarks strict encoding of the sample Japanese text.
414    #[bench]
415    fn bench_encode_short_text(bencher: &mut test::Bencher) {
416        let s = testutils::JAPANESE_TEXT;
417        bencher.bytes = s.len() as u64;
418        bencher.iter(|| test::black_box({
419            EUCJPEncoding.encode(&s, EncoderTrap::Strict)
420        }))
421    }
422
    // Benchmarks strict decoding of the sample text, encoded once up front.
423    #[bench]
424    fn bench_decode_short_text(bencher: &mut test::Bencher) {
425        let s = EUCJPEncoding.encode(testutils::JAPANESE_TEXT,
426                                     EncoderTrap::Strict).ok().unwrap();
427        bencher.bytes = s.len() as u64;
428        bencher.iter(|| test::black_box({
429            EUCJPEncoding.decode(&s, DecoderTrap::Strict)
430        }))
431    }
432}
433
434/**
435 * Windows code page 932, i.e. Shift_JIS with IBM/NEC extensions.
436 *
437 * This is a Japanese encoding for JIS X 0208
438 * compatible to the original assignments of JIS X 0201 (`[21-7E A1-DF]`).
439 * The 94 by 94 region of JIS X 0208 is sliced, or rather "shifted" into
440 * the odd half (odd row number) and even half (even row number),
441 * and merged into the 188 by 47 region mapped to `[81-9F E0-EF] [40-7E 80-FC]`.
442 * The remaining area, `[80 A0 F0-FF] [40-7E 80-FC]`, has been subjected to
443 * numerous extensions incompatible to each other.
444 * This particular implementation uses IBM/NEC extensions
445 * which assigns more characters to `[F0-FC 80-FC]` and also to the Private Use Area (PUA).
446 * It requires some cares to handle
447 * since the second byte of JIS X 0208 can have its MSB unset.
448 */
449#[derive(Clone, Copy)]
450pub struct Windows31JEncoding;
451
452impl Encoding for Windows31JEncoding {
453    fn name(&self) -> &'static str { "windows-31j" }
454    fn whatwg_name(&self) -> Option<&'static str> { Some("shift_jis") } // WHATWG compatibility
455    fn raw_encoder(&self) -> Box<RawEncoder> { Windows31JEncoder::new() }
456    fn raw_decoder(&self) -> Box<RawDecoder> { Windows31JDecoder::new() }
457}
458
459/// An encoder for Shift_JIS with IBM/NEC extensions.
460#[derive(Clone, Copy)]
461pub struct Windows31JEncoder;
462
463impl Windows31JEncoder {
464    pub fn new() -> Box<RawEncoder> { Box::new(Windows31JEncoder) }
465}
466
467impl RawEncoder for Windows31JEncoder {
468    fn from_self(&self) -> Box<RawEncoder> { Windows31JEncoder::new() }
469    fn is_ascii_compatible(&self) -> bool { true }
470
471    fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>) {
472        output.writer_hint(input.len());
473
474        for ((i,j), ch) in input.index_iter() {
475            match ch {
476                '\u{0}'...'\u{80}' => { output.write_byte(ch as u8); }
477                '\u{a5}' => { output.write_byte(0x5c); }
478                '\u{203e}' => { output.write_byte(0x7e); }
479                '\u{ff61}'...'\u{ff9f}' => {
480                    output.write_byte((ch as usize - 0xff61 + 0xa1) as u8);
481                }
482                _ => {
483                    // corresponds to the "index shift_jis pointer" in the WHATWG spec
484                    let ptr = index::jis0208::backward_remapped(ch as u32);
485                    if ptr == 0xffff {
486                        return (i, Some(CodecError {
487                            upto: j as isize, cause: "unrepresentable character".into(),
488                        }));
489                    } else {
490                        let lead = ptr / 188;
491                        let leadoffset = if lead < 0x1f {0x81} else {0xc1};
492                        let trail = ptr % 188;
493                        let trailoffset = if trail < 0x3f {0x40} else {0x41};
494                        output.write_byte((lead + leadoffset) as u8);
495                        output.write_byte((trail + trailoffset) as u8);
496                    }
497                }
498            }
499        }
500        (input.len(), None)
501    }
502
503    fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> {
504        None
505    }
506}
507
508/// A decoder for Shift_JIS with IBM/NEC extensions.
509#[derive(Clone, Copy)]
510struct Windows31JDecoder {
511    st: windows31j::State,
512}
513
514impl Windows31JDecoder {
515    pub fn new() -> Box<RawDecoder> {
516        Box::new(Windows31JDecoder { st: Default::default() })
517    }
518}
519
520impl RawDecoder for Windows31JDecoder {
521    fn from_self(&self) -> Box<RawDecoder> { Windows31JDecoder::new() }
522    fn is_ascii_compatible(&self) -> bool { true }
523
524    fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) {
525        let (st, processed, err) = windows31j::raw_feed(self.st, input, output, &());
526        self.st = st;
527        (processed, err)
528    }
529
530    fn raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError> {
531        let (st, err) = windows31j::raw_finish(self.st, output, &());
532        self.st = st;
533        err
534    }
535}
536
// State machine for the Windows-31J decoder, generated into the `windows31j` module.
537stateful_decoder! {
538    module windows31j;
539
    // Maps a lead/trail byte pair to a code point.
    // - Leads F0..F9 with a valid trail return a code point counted from U+E000
    //   directly, 188 per lead byte, without consulting the index.
    // - Other valid pairs are converted to a 188-per-lead index and looked up with
    //   `index::jis0208::forward`.
    // - Anything else is looked up with the index 0xffff.
    // The offsets compensate for the gaps in the lead (A0..DF) and trail (7F) ranges.
540    internal pub fn map_two_0208_bytes(lead: u8, trail: u8) -> u32 {
541        use index_japanese as index;
542
543        let lead = lead as u16;
544        let trail = trail as u16;
545        let leadoffset = if lead < 0xa0 {0x81} else {0xc1};
546        let trailoffset = if trail < 0x7f {0x40} else {0x41};
547        let index = match (lead, trail) {
548            (0xf0...0xf9, 0x40...0x7e) | (0xf0...0xf9, 0x80...0xfc) =>
549                return (0xe000 + (lead - 0xf0) * 188 + trail - trailoffset) as u32,
550            (0x81...0x9f, 0x40...0x7e) | (0x81...0x9f, 0x80...0xfc) |
551            (0xe0...0xfc, 0x40...0x7e) | (0xe0...0xfc, 0x80...0xfc) =>
552                (lead - leadoffset) * 188 + trail - trailoffset,
553            _ => 0xffff,
554        };
555        index::jis0208::forward(index)
556    }
557
558initial:
559    // shift_jis lead = 0x00
    // 00..80 are emitted as-is, A1..DF map to U+FF61..U+FF9F (half-width katakana),
    // and 81..9F / E0..FC start a two-byte sequence.
560    state S0(ctx: Context) {
561        case b @ 0x00...0x80 => ctx.emit(b as u32);
562        case b @ 0xa1...0xdf => ctx.emit(0xff61 + b as u32 - 0xa1);
563        case b @ 0x81...0x9f, b @ 0xe0...0xfc => S1(ctx, b);
564        case _ => ctx.err("invalid sequence");
565    }
566
567transient:
568    // shift_jis lead != 0x00
    // Every failed lookup backs up by one byte, whatever the trail byte was.
569    state S1(ctx: Context, lead: u8) {
570        case b => match map_two_0208_bytes(lead, b) {
571            0xffff => ctx.backup_and_err(1, "invalid sequence"), // unconditional
572            ch => ctx.emit(ch)
573        };
574    }
575}
576
// Tests and benchmarks for the Windows-31J encoder and decoder.
// NOTE(review): the `assert_feed_*!` / `assert_finish_*!` macros are defined elsewhere;
// their arguments appear to be (coder, processed, [erroneous,] unprocessed, output) -- confirm.
577#[cfg(test)]
578mod windows31j_tests {
579    extern crate test;
580    use super::Windows31JEncoding;
581    use testutils;
582    use types::*;
583
    // ASCII, yen sign, overline, kana, half-width katakana and kanji all encode.
584    #[test]
585    fn test_encoder_valid() {
586        let mut e = Windows31JEncoding.raw_encoder();
587        assert_feed_ok!(e, "A", "", [0x41]);
588        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
589        assert_feed_ok!(e, "", "", []);
590        assert_feed_ok!(e, "\u{a5}", "", [0x5c]);
591        assert_feed_ok!(e, "\u{203e}", "", [0x7e]);
592        assert_feed_ok!(e, "\u{306b}\u{307b}\u{3093}", "", [0x82, 0xc9, 0x82, 0xd9, 0x82, 0xf1]);
593        assert_feed_ok!(e, "\u{ff86}\u{ff8e}\u{ff9d}", "", [0xc6, 0xce, 0xdd]);
594        assert_feed_ok!(e, "\u{65e5}\u{672c}", "", [0x93, 0xfa, 0x96, 0x7b]);
595        assert_finish_ok!(e, []);
596    }
597
    // Private-use code points that the decoder produces for F0..F9 leads are not encodable.
598    #[test]
599    fn test_encoder_no_eudc() {
600        let mut e = Windows31JEncoding.raw_encoder();
601        assert_feed_err!(e, "", "\u{e000}", "", []);
602        assert_feed_err!(e, "", "\u{e757}", "", []);
603        assert_feed_err!(e, "", "\u{e758}", "", []);
604        assert_finish_ok!(e, []);
605    }
606
607    #[test]
608    fn test_encoder_double_mapped() {
609        // these characters are double-mapped to both EUDC area and Shift_JIS extension area
610        // but only the latter should be used. (note that U+FFE2 is triple-mapped!)
611        let mut e = Windows31JEncoding.raw_encoder();
612        assert_feed_ok!(e, "\u{9ed1}\u{2170}\u{ffe2}", "", [0xfc, 0x4b, 0xfa, 0x40, 0x81, 0xca]);
613        assert_finish_ok!(e, []);
614    }
615
    // Unmappable characters stop the encoder with an error.
616    #[test]
617    fn test_encoder_invalid() {
618        let mut e = Windows31JEncoding.raw_encoder();
619        assert_feed_err!(e, "", "\u{ffff}", "", []);
620        assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]);
621        assert_feed_err!(e, "", "\u{736c}", "\u{8c78}", []);
622        assert_finish_ok!(e, []);
623    }
624
    // Counterpart of `test_encoder_valid`; 5C and 7E decode to ASCII backslash and tilde.
625    #[test]
626    fn test_decoder_valid() {
627        let mut d = Windows31JEncoding.raw_decoder();
628        assert_feed_ok!(d, [0x41], [], "A");
629        assert_feed_ok!(d, [0x42, 0x43], [], "BC");
630        assert_feed_ok!(d, [], [], "");
631        assert_feed_ok!(d, [0x5c], [], "\\");
632        assert_feed_ok!(d, [0x7e], [], "~");
633        assert_feed_ok!(d, [0x80], [], "\u{80}"); // compatibility
634        assert_feed_ok!(d, [0x82, 0xc9, 0x82, 0xd9, 0x82, 0xf1], [], "\u{306b}\u{307b}\u{3093}");
635        assert_feed_ok!(d, [0xc6, 0xce, 0xdd], [], "\u{ff86}\u{ff8e}\u{ff9d}");
636        assert_feed_ok!(d, [0x93, 0xfa, 0x96, 0x7b], [], "\u{65e5}\u{672c}");
637        assert_finish_ok!(d, "");
638    }
639
    // Leads F0..F9 decode to U+E000..U+E757; invalid trails are left unprocessed.
640    #[test]
641    fn test_decoder_eudc() {
642        let mut d = Windows31JEncoding.raw_decoder();
643        assert_feed_ok!(d, [], [0xf0], "");
644        assert_feed_ok!(d, [0x40], [], "\u{e000}");
645        assert_feed_ok!(d, [0xf9, 0xfc], [], "\u{e757}");
646        assert_feed_err!(d, [], [0xf0], [0x00], "");
647        assert_feed_err!(d, [], [0xf0], [0xff], "");
648        assert_finish_ok!(d, "");
649    }
650
    // A lone lead byte is only an error once the input ends.
651    #[test]
652    fn test_decoder_invalid_lone_lead_immediate_test_finish() {
653        for i in 0x81..0xa0 {
654            let mut d = Windows31JEncoding.raw_decoder();
655            assert_feed_ok!(d, [], [i], ""); // wait for a trail
656            assert_finish_err!(d, "");
657        }
658
659        for i in 0xe0..0xfd {
660            let mut d = Windows31JEncoding.raw_decoder();
661            assert_feed_ok!(d, [], [i], ""); // wait for a trail
662            assert_finish_err!(d, "");
663        }
664
665        // A0/FD/FE/FF: immediate failure
666        let mut d = Windows31JEncoding.raw_decoder();
667        assert_feed_err!(d, [], [0xa0], [], "");
668        assert_feed_err!(d, [], [0xfd], [], "");
669        assert_feed_err!(d, [], [0xfe], [], "");
670        assert_feed_err!(d, [], [0xff], [], "");
671        assert_finish_ok!(d, "");
672    }
673
    // A lead byte followed by a space is an error that leaves the space unprocessed.
674    #[test]
675    fn test_decoder_invalid_lone_lead_followed_by_space() {
676        for i in 0x81..0xa0 {
677            let mut d = Windows31JEncoding.raw_decoder();
678            assert_feed_err!(d, [], [i], [0x20], "");
679            assert_finish_ok!(d, "");
680        }
681
682        for i in 0xe0..0xfd {
683            let mut d = Windows31JEncoding.raw_decoder();
684            assert_feed_err!(d, [], [i], [0x20], "");
685            assert_finish_ok!(d, "");
686        }
687    }
688
    // Trail bytes outside 40..7E / 80..FC are never consumed together with the lead.
689    #[test]
690    fn test_decoder_invalid_lead_followed_by_invalid_trail() {
691        for i in 0x81..0xa0 {
692            let mut d = Windows31JEncoding.raw_decoder();
693            assert_feed_err!(d, [], [i], [0x3f], "");
694            assert_feed_err!(d, [], [i], [0x7f], "");
695            assert_feed_err!(d, [], [i], [0xfd], "");
696            assert_feed_err!(d, [], [i], [0xfe], "");
697            assert_feed_err!(d, [], [i], [0xff], "");
698            assert_finish_ok!(d, "");
699        }
700
701        for i in 0xe0..0xfd {
702            let mut d = Windows31JEncoding.raw_decoder();
703            assert_feed_err!(d, [], [i], [0x3f], "");
704            assert_feed_err!(d, [], [i], [0x7f], "");
705            assert_feed_err!(d, [], [i], [0xfd], "");
706            assert_feed_err!(d, [], [i], [0xfe], "");
707            assert_feed_err!(d, [], [i], [0xff], "");
708            assert_finish_ok!(d, "");
709        }
710    }
711
    // Same as above, with the lead byte fed in a separate call.
712    #[test]
713    fn test_decoder_invalid_lead_followed_by_invalid_trail_partial() {
714        for i in 0x81..0xa0 {
715            let mut d = Windows31JEncoding.raw_decoder();
716            assert_feed_ok!(d, [], [i], "");
717            assert_feed_err!(d, [], [], [0xff], "");
718            assert_finish_ok!(d, "");
719        }
720
721        for i in 0xe0..0xfd {
722            let mut d = Windows31JEncoding.raw_decoder();
723            assert_feed_ok!(d, [], [i], "");
724            assert_feed_err!(d, [], [], [0xff], "");
725            assert_finish_ok!(d, "");
726        }
727    }
728
    // A failed finish leaves the decoder usable for fresh input.
729    #[test]
730    fn test_decoder_feed_after_finish() {
731        let mut d = Windows31JEncoding.raw_decoder();
732        assert_feed_ok!(d, [0x82, 0xa0], [0x82], "\u{3042}");
733        assert_finish_err!(d, "");
734        assert_feed_ok!(d, [0x82, 0xa0], [], "\u{3042}");
735        assert_finish_ok!(d, "");
736    }
737
    // Benchmarks strict encoding of the sample Japanese text.
738    #[bench]
739    fn bench_encode_short_text(bencher: &mut test::Bencher) {
740        let s = testutils::JAPANESE_TEXT;
741        bencher.bytes = s.len() as u64;
742        bencher.iter(|| test::black_box({
743            Windows31JEncoding.encode(&s, EncoderTrap::Strict)
744        }))
745    }
746
    // Benchmarks strict decoding of the sample text, encoded once up front.
747    #[bench]
748    fn bench_decode_short_text(bencher: &mut test::Bencher) {
749        let s = Windows31JEncoding.encode(testutils::JAPANESE_TEXT,
750                                          EncoderTrap::Strict).ok().unwrap();
751        bencher.bytes = s.len() as u64;
752        bencher.iter(|| test::black_box({
753            Windows31JEncoding.decode(&s, DecoderTrap::Strict)
754        }))
755    }
756}
757
758/**
759 * ISO-2022-JP.
760 *
761 * This version of ISO-2022-JP does not correspond to any standardized repertoire of character sets
762 * due to the widespread implementation differences. The following character sets are supported:
763 *
764 * - JIS X 0201-1976 roman (`ESC ( J` or `ESC ( B`; the latter is originally allocated to ASCII
765 *   but willfully violated)
766 * - JIS X 0201-1976 kana (`ESC ( I`)
767 * - JIS X 0208-1983 (`ESC $ B` or `ESC $ @`; the latter is originally allocated to JIS X 0208-1978
768 *   but willfully violated)
769 * - JIS X 0212-1990 (`ESC $ ( D`, XXX asymmetric support)
770 */
771#[derive(Clone, Copy)]
772pub struct ISO2022JPEncoding;
773
774impl Encoding for ISO2022JPEncoding {
775    fn name(&self) -> &'static str { "iso-2022-jp" }
776    fn whatwg_name(&self) -> Option<&'static str> { Some("iso-2022-jp") }
777    fn raw_encoder(&self) -> Box<RawEncoder> { ISO2022JPEncoder::new() }
778    fn raw_decoder(&self) -> Box<RawDecoder> { ISO2022JPDecoder::new() }
779}
780
/// Character set currently selected in the ISO-2022-JP encoder's output.
#[derive(Clone, Copy, PartialEq)]
enum ISO2022JPState {
    /// U+0000..007F, U+00A5, U+203E
    ASCII,
    /// JIS X 0201: U+FF61..FF9F
    Katakana,
    /// JIS X 0208
    Lead,
}
787
788/// An encoder for ISO-2022-JP without JIS X 0212/0213 support.
789#[derive(Clone, Copy)]
790pub struct ISO2022JPEncoder {
791    st: ISO2022JPState
792}
793
794impl ISO2022JPEncoder {
795    pub fn new() -> Box<RawEncoder> { Box::new(ISO2022JPEncoder { st: ASCII }) }
796}
797
798impl RawEncoder for ISO2022JPEncoder {
799    fn from_self(&self) -> Box<RawEncoder> { ISO2022JPEncoder::new() }
800    fn is_ascii_compatible(&self) -> bool { true }
801
802    fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>) {
803        output.writer_hint(input.len());
804
805        let mut st = self.st;
806        macro_rules! ensure_ASCII(
807            () => (if st != ASCII { output.write_bytes(b"\x1b(B"); st = ASCII; })
808        );
809        macro_rules! ensure_Katakana(
810            () => (if st != Katakana { output.write_bytes(b"\x1b(I"); st = Katakana; })
811        );
812        macro_rules! ensure_Lead(
813            () => (if st != Lead { output.write_bytes(b"\x1b$B"); st = Lead; })
814        );
815
816        for ((i,j), ch) in input.index_iter() {
817            match ch {
818                '\u{0}'...'\u{7f}' => { ensure_ASCII!(); output.write_byte(ch as u8); }
819                '\u{a5}' => { ensure_ASCII!(); output.write_byte(0x5c); }
820                '\u{203e}' => { ensure_ASCII!(); output.write_byte(0x7e); }
821                '\u{ff61}'...'\u{ff9f}' => {
822                    ensure_Katakana!();
823                    output.write_byte((ch as usize - 0xff61 + 0x21) as u8);
824                }
825                _ => {
826                    let ptr = index::jis0208::backward(ch as u32);
827                    if ptr == 0xffff {
828                        self.st = st; // do NOT reset the state!
829                        return (i, Some(CodecError {
830                            upto: j as isize, cause: "unrepresentable character".into()
831                        }));
832                    } else {
833                        ensure_Lead!();
834                        let lead = ptr / 94 + 0x21;
835                        let trail = ptr % 94 + 0x21;
836                        output.write_byte(lead as u8);
837                        output.write_byte(trail as u8);
838                    }
839                }
840            }
841        }
842
843        self.st = st;
844        (input.len(), None)
845    }
846
847    fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> {
848        None
849    }
850}
851
/// A decoder for ISO-2022-JP with JIS X 0212 support.
///
/// This is a thin wrapper that stores the state of the `iso2022jp`
/// state machine between calls.
#[derive(Clone, Copy)]
struct ISO2022JPDecoder {
    // Current state of the `iso2022jp` state machine.
    st: iso2022jp::State,
}
857
858impl ISO2022JPDecoder {
859    pub fn new() -> Box<RawDecoder> {
860        Box::new(ISO2022JPDecoder { st: Default::default() })
861    }
862}
863
864impl RawDecoder for ISO2022JPDecoder {
865    fn from_self(&self) -> Box<RawDecoder> { ISO2022JPDecoder::new() }
866    fn is_ascii_compatible(&self) -> bool { false }
867
868    fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) {
869        let (st, processed, err) = iso2022jp::raw_feed(self.st, input, output, &());
870        self.st = st;
871        (processed, err)
872    }
873
874    fn raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError> {
875        let (st, err) = iso2022jp::raw_finish(self.st, output, &());
876        self.st = st;
877        err
878    }
879}
880
881stateful_decoder! {
882    module iso2022jp;
883
884    internal pub fn map_two_0208_bytes(lead: u8, trail: u8) -> u32 {
885        use index_japanese as index;
886
887        let lead = lead as u16;
888        let trail = trail as u16;
889        let index = match (lead, trail) {
890            (0x21...0x7e, 0x21...0x7e) => (lead - 0x21) * 94 + trail - 0x21,
891            _ => 0xffff,
892        };
893        index::jis0208::forward(index)
894    }
895
896    internal pub fn map_two_0212_bytes(lead: u8, trail: u8) -> u32 {
897        use index_japanese as index;
898
899        let lead = lead as u16;
900        let trail = trail as u16;
901        let index = match (lead, trail) {
902            (0x21...0x7e, 0x21...0x7e) => (lead - 0x21) * 94 + trail - 0x21,
903            _ => 0xffff,
904        };
905        index::jis0212::forward(index)
906    }
907
908initial:
909    // iso-2022-jp state = ASCII, iso-2022-jp jis0212 flag = unset, iso-2022-jp lead = 0x00
910    state ASCII(ctx: Context) {
911        case 0x1b => EscapeStart(ctx);
912        case b @ 0x00...0x7f => ctx.emit(b as u32), ASCII(ctx);
913        case _ => ctx.err("invalid sequence"), ASCII(ctx);
914        final => ctx.reset();
915    }
916
917checkpoint:
918    // iso-2022-jp state = Lead, iso-2022-jp jis0212 flag = unset
919    state Lead0208(ctx: Context) {
920        case 0x0a => ctx.emit(0x000a); // return to ASCII
921        case 0x1b => EscapeStart(ctx);
922        case b => Trail0208(ctx, b);
923        final => ctx.reset();
924    }
925
926    // iso-2022-jp state = Lead, iso-2022-jp jis0212 flag = set
927    state Lead0212(ctx: Context) {
928        case 0x0a => ctx.emit(0x000a); // return to ASCII
929        case 0x1b => EscapeStart(ctx);
930        case b => Trail0212(ctx, b);
931        final => ctx.reset();
932    }
933
934    // iso-2022-jp state = Katakana
935    state Katakana(ctx: Context) {
936        case 0x1b => EscapeStart(ctx);
937        case b @ 0x21...0x5f => ctx.emit(0xff61 + b as u32 - 0x21), Katakana(ctx);
938        case _ => ctx.err("invalid sequence"), Katakana(ctx);
939        final => ctx.reset();
940    }
941
942transient:
943    // iso-2022-jp state = EscapeStart
944    // ESC
945    state EscapeStart(ctx: Context) {
946        case 0x24 => EscapeMiddle24(ctx); // ESC $
947        case 0x28 => EscapeMiddle28(ctx); // ESC (
948        case _ => ctx.backup_and_err(1, "invalid sequence");
949        final => ctx.err("incomplete sequence");
950    }
951
952    // iso-2022-jp state = EscapeMiddle, iso-2022-jp lead = 0x24
953    // ESC $
954    state EscapeMiddle24(ctx: Context) {
955        case 0x40, 0x42 => Lead0208(ctx); // ESC $ @ (JIS X 0208-1978) or ESC $ B (-1983)
956        case 0x28 => EscapeFinal(ctx); // ESC $ (
957        case _ => ctx.backup_and_err(2, "invalid sequence");
958        final => ctx.err("incomplete sequence");
959    }
960
961    // iso-2022-jp state = EscapeMiddle, iso-2022-jp lead = 0x28
962    // ESC (
963    state EscapeMiddle28(ctx: Context) {
964        case 0x42, 0x4a => ctx.reset(); // ESC ( B (ASCII) or ESC ( J (JIS X 0201-1976 roman)
965        case 0x49 => Katakana(ctx); // ESC ( I (JIS X 0201-1976 kana)
966        case _ => ctx.backup_and_err(2, "invalid sequence");
967        final => ctx.err("incomplete sequence");
968    }
969
970    // iso-2022-jp state = EscapeFinal
971    // ESC $ (
972    state EscapeFinal(ctx: Context) {
973        case 0x44 => Lead0212(ctx); // ESC $ ( D (JIS X 0212-1990)
974        case _ => ctx.backup_and_err(3, "invalid sequence");
975        final => ctx.backup_and_err(1, "incomplete sequence");
976    }
977
978    // iso-2022-jp state = Trail, iso-2022-jp jis0212 flag = unset
979    state Trail0208(ctx: Context, lead: u8) {
980        case b =>
981            match map_two_0208_bytes(lead, b) {
982                0xffff => ctx.err("invalid sequence"),
983                ch => ctx.emit(ch as u32)
984            },
985            Lead0208(ctx);
986        final => ctx.err("incomplete sequence");
987    }
988
989    // iso-2022-jp state = Trail, iso-2022-jp jis0212 flag = set
990    state Trail0212(ctx: Context, lead: u8) {
991        case b =>
992            match map_two_0212_bytes(lead, b) {
993                0xffff => ctx.err("invalid sequence"),
994                ch => ctx.emit(ch as u32)
995            },
996            Lead0212(ctx);
997        final => ctx.err("incomplete sequence");
998    }
999}
1000
#[cfg(test)]
mod iso2022jp_tests {
    extern crate test;
    use super::ISO2022JPEncoding;
    use testutils;
    use types::*;

    /// Representable input is encoded, and an escape sequence is written only
    /// when the character set changes from the previous character or feed.
    #[test]
    fn test_encoder_valid() {
        let mut e = ISO2022JPEncoding.raw_encoder();
        assert_feed_ok!(e, "A", "", [0x41]);
        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(e, "\x1b\x24\x42", "", [0x1b, 0x24, 0x42]); // no round-trip guarantee
        assert_feed_ok!(e, "", "", []);
        assert_feed_ok!(e, "\u{a5}", "", [0x5c]);
        assert_feed_ok!(e, "\u{203e}", "", [0x7e]);
        assert_feed_ok!(e, "\u{306b}\u{307b}\u{3093}", "", [0x1b, 0x24, 0x42,
                                                            0x24, 0x4b, 0x24, 0x5b, 0x24, 0x73]);
        // still in the JIS X 0208 state from the previous feed: no escape sequence
        assert_feed_ok!(e, "\u{65e5}\u{672c}", "", [0x46, 0x7c, 0x4b, 0x5c]);
        assert_feed_ok!(e, "\u{ff86}\u{ff8e}\u{ff9d}", "", [0x1b, 0x28, 0x49,
                                                            0x46, 0x4e, 0x5d]);
        assert_feed_ok!(e, "XYZ", "", [0x1b, 0x28, 0x42,
                                       0x58, 0x59, 0x5a]);
        assert_finish_ok!(e, []);

        // one ASCII character and two similarly looking characters:
        // - A: U+0020 SPACE (requires ASCII state)
        // - B: U+30CD KATAKANA LETTER NE (requires JIS X 0208 Lead state)
        // - C: U+FF88 HALFWIDTH KATAKANA LETTER NE (requires Katakana state)
        // - D is omitted as the encoder does not support JIS X 0212.
        // a (3,2) De Bruijn near-sequence "ABCACBA" is used to test all possible cases.
        const AD: &'static str = "\x20";
        const BD: &'static str = "\u{30cd}";
        const CD: &'static str = "\u{ff88}";
        const AE: &'static [u8] = &[0x1b, 0x28, 0x42, 0x20];
        const BE: &'static [u8] = &[0x1b, 0x24, 0x42, 0x25, 0x4d];
        const CE: &'static [u8] = &[0x1b, 0x28, 0x49, 0x48];
        let mut e = ISO2022JPEncoding.raw_encoder();
        let decoded: String = ["\x20",      BD, CD, AD, CD, BD, AD].concat();
        let encoded: Vec<_> = [&[0x20][..], BE, CE, AE, CE, BE, AE].concat();
        assert_feed_ok!(e, decoded, "", encoded);
        assert_finish_ok!(e, []);
    }

    /// Characters without a JIS X 0208 mapping are reported as errors.
    #[test]
    fn test_encoder_invalid() {
        let mut e = ISO2022JPEncoding.raw_encoder();
        assert_feed_err!(e, "", "\u{ffff}", "", []);
        assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]);
        // JIS X 0212 is not supported in the encoder
        assert_feed_err!(e, "", "\u{736c}", "\u{8c78}", []);
        assert_finish_ok!(e, []);
    }

    /// Well-formed input in every supported character set is decoded,
    /// including redundant escape sequences and all state transitions.
    #[test]
    fn test_decoder_valid() {
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x41], [], "A");
        assert_feed_ok!(d, [0x42, 0x43], [], "BC");
        assert_feed_ok!(d, [0x1b, 0x28, 0x4a,
                            0x44, 0x45, 0x46], [], "DEF");
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(d, [0x5c], [], "\\");
        assert_feed_ok!(d, [0x7e], [], "~");
        assert_feed_ok!(d, [0x1b, 0x24, 0x42,
                            0x24, 0x4b,
                            0x1b, 0x24, 0x42,
                            0x24, 0x5b, 0x24, 0x73], [], "\u{306b}\u{307b}\u{3093}");
        assert_feed_ok!(d, [0x46, 0x7c, 0x4b, 0x5c], [], "\u{65e5}\u{672c}");
        assert_feed_ok!(d, [0x1b, 0x28, 0x49,
                            0x46, 0x4e, 0x5d], [], "\u{ff86}\u{ff8e}\u{ff9d}");
        assert_feed_ok!(d, [0x1b, 0x24, 0x28, 0x44,
                            0x4b, 0x46,
                            0x1b, 0x24, 0x40,
                            0x6c, 0x38], [], "\u{736c}\u{8c78}");
        assert_feed_ok!(d, [0x1b, 0x28, 0x42,
                            0x58, 0x59, 0x5a], [], "XYZ");
        assert_finish_ok!(d, "");

        // finishing right after a complete character is fine in any state
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x1b, 0x24, 0x42,
                            0x24, 0x4b, 0x24, 0x5b, 0x24, 0x73], [], "\u{306b}\u{307b}\u{3093}");
        assert_finish_ok!(d, "");

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x1b, 0x28, 0x49,
                            0x46, 0x4e, 0x5d], [], "\u{ff86}\u{ff8e}\u{ff9d}");
        assert_finish_ok!(d, "");

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x1b, 0x24, 0x28, 0x44,
                            0x4b, 0x46], [], "\u{736c}");
        assert_finish_ok!(d, "");

        // one ASCII character and three similarly looking characters:
        // - A: U+0020 SPACE (requires ASCII state)
        // - B: U+30CD KATAKANA LETTER NE (requires JIS X 0208 Lead state)
        // - C: U+FF88 HALFWIDTH KATAKANA LETTER NE (requires Katakana state)
        // - D: U+793B CJK UNIFIED IDEOGRAPH-793B (requires JIS X 0212 Lead state)
        // a (4,2) De Bruijn sequence "AABBCCACBADDBDCDA" is used to test all possible cases.
        const AD: &'static str = "\x20";
        const BD: &'static str = "\u{30cd}";
        const CD: &'static str = "\u{ff88}";
        const DD: &'static str = "\u{793b}";
        const AE: &'static [u8] = &[0x1b, 0x28, 0x42,       0x20];
        const BE: &'static [u8] = &[0x1b, 0x24, 0x42,       0x25, 0x4d];
        const CE: &'static [u8] = &[0x1b, 0x28, 0x49,       0x48];
        const DE: &'static [u8] = &[0x1b, 0x24, 0x28, 0x44, 0x50, 0x4b];
        let mut d = ISO2022JPEncoding.raw_decoder();
        let dec: String = ["\x20",     AD,BD,BD,CD,CD,AD,CD,BD,AD,DD,DD,BD,DD,CD,DD,AD].concat();
        let enc: Vec<_> = [&[0x20][..],AE,BE,BE,CE,CE,AE,CE,BE,AE,DE,DE,BE,DE,CE,DE,AE].concat();
        assert_feed_ok!(d, enc, [], dec);
        assert_finish_ok!(d, "");
    }

    /// Escape sequences and two-byte characters split across feeds are
    /// buffered and completed by the following feed.
    #[test]
    fn test_decoder_valid_partial() {
        let mut d = ISO2022JPEncoding.raw_decoder();

        assert_feed_ok!(d, [], [0x1b], "");
        assert_feed_ok!(d, [], [0x28], "");
        assert_feed_ok!(d, [0x4a, 0x41], [], "A");
        assert_feed_ok!(d, [], [0x1b, 0x28], "");
        assert_feed_ok!(d, [0x4a, 0x42], [0x1b], "B");
        assert_feed_ok!(d, [0x28, 0x4a, 0x43], [], "C");

        assert_feed_ok!(d, [], [0x1b], "");
        assert_feed_ok!(d, [], [0x24], "");
        assert_feed_ok!(d, [0x42], [0x24], "");
        assert_feed_ok!(d, [0x4b], [0x1b, 0x24], "\u{306b}");
        assert_feed_ok!(d, [0x42, 0x24, 0x5b], [], "\u{307b}");
        assert_feed_ok!(d, [], [0x1b], "");
        assert_feed_ok!(d, [0x24, 0x42, 0x24, 0x73], [], "\u{3093}");

        assert_feed_ok!(d, [], [0x1b], "");
        assert_feed_ok!(d, [], [0x28], "");
        assert_feed_ok!(d, [0x49, 0x46], [], "\u{ff86}");
        assert_feed_ok!(d, [], [0x1b, 0x28], "");
        assert_feed_ok!(d, [0x49, 0x4e], [0x1b], "\u{ff8e}");
        assert_feed_ok!(d, [0x28, 0x49, 0x5d], [], "\u{ff9d}");

        assert_feed_ok!(d, [], [0x1b, 0x24], "");
        assert_feed_ok!(d, [], [0x28], "");
        assert_feed_ok!(d, [0x44], [0x4b], "");
        assert_feed_ok!(d, [0x46], [0x1b, 0x24, 0x28], "\u{736c}");
        assert_feed_ok!(d, [0x44, 0x4b, 0x46], [], "\u{736c}");

        assert_finish_ok!(d, "");
    }

    /// A line feed (0x0a) seen while waiting for a lead byte is emitted as
    /// U+000A and switches the decoder back to ASCII.
    #[test]
    fn test_decoder_line_feed() {
        // LF in Lead state "resets to ASCII"
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x1b, 0x24, 0x42,
                            0x25, 0x4d,
                            0x0a,
                            0x25, 0x4d], [], "\u{30cd}\n\x25\x4d");
        assert_feed_ok!(d, [0x1b, 0x24, 0x28, 0x44,
                            0x50, 0x4b,
                            0x0a,
                            0x50, 0x4b], [], "\u{793b}\n\x50\x4b");
        assert_finish_ok!(d, "");

        // other states don't allow LF
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_err!(d, [0x1b, 0x28, 0x49, 0x48], [0x0a], [], "\u{ff88}"); // Katakana
        assert_feed_err!(d, [0x1b, 0x24, 0x42], [0x25, 0x0a], [], ""); // Trail
        assert_finish_ok!(d, "");
    }

    /// Finishing with a lead byte still pending is an error.
    #[test]
    fn test_decoder_invalid_partial() {
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x1b, 0x24, 0x42, 0x24, 0x4b], [0x24], "\u{306b}");
        assert_finish_err!(d, "");

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x1b, 0x24, 0x28, 0x44, 0x4b, 0x46], [0x50], "\u{736c}");
        assert_finish_err!(d, "");
    }

    /// Finishing in the middle of an escape sequence is an error; only the
    /// `ESC $ (` prefix backs up (by one byte) before reporting it.
    #[test]
    fn test_decoder_invalid_partial_escape() {
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [], [0x1b], "");
        assert_finish_err!(d, "");

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [], [0x1b, 0x24], "");
        assert_finish_err!(d, ""); // no backup

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [], [0x1b, 0x24, 0x28], "");
        assert_finish_err!(d, -1, ""); // backup of -1, not -2

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [], [0x1b, 0x28], "");
        assert_finish_err!(d, ""); // no backup

        assert_eq!(ISO2022JPEncoding.decode(&[0x1b], DecoderTrap::Replace),
                   Ok("\u{fffd}".to_string()));
        assert_eq!(ISO2022JPEncoding.decode(&[0x1b, 0x24], DecoderTrap::Replace),
                   Ok("\u{fffd}".to_string()));
        assert_eq!(ISO2022JPEncoding.decode(&[0x1b, 0x24, 0x28], DecoderTrap::Replace),
                   Ok("\u{fffd}\x28".to_string()));
        assert_eq!(ISO2022JPEncoding.decode(&[0x1b, 0x28], DecoderTrap::Replace),
                   Ok("\u{fffd}".to_string()));
    }

    /// Escape sequences outside the supported set are rejected, and the bytes
    /// following ESC are left unprocessed.
    #[test]
    fn test_decoder_invalid_escape() {
        // also tests allowed but never used escape codes in ISO 2022
        let mut d = ISO2022JPEncoding.raw_decoder();
        // puts the decoder into the JIS X 0208 state before each case
        macro_rules! reset(() => (
            assert_feed_ok!(d, [0x41, 0x42, 0x43, 0x1b, 0x24, 0x42, 0x21, 0x21], [],
                            "ABC\u{3000}")
        ));

        reset!();
        assert_feed_ok!(d, [], [0x1b], "");
        assert_feed_err!(d, [], [], [0x00], "");
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x0a], "");
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x20], "");
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x21, 0x5a], ""); // ESC ! Z (CZD)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x22, 0x5a], ""); // ESC " Z (C1D)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x24, 0x5a], ""); // ESC $ Z (GZDM4)
        reset!();
        assert_feed_ok!(d, [], [0x1b, 0x24], "");
        assert_feed_err!(d, -1, [], [], [0x24, 0x5a], "");
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x24, 0x28, 0x5a], ""); // ESC $ ( Z (GZDM4)
        reset!();
        assert_feed_ok!(d, [], [0x1b, 0x24, 0x28], "");
        assert_feed_err!(d, -2, [], [], [0x24, 0x28, 0x5a], "");
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x24, 0x29, 0x5a], ""); // ESC $ ) Z (G1DM4)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x24, 0x2a, 0x5a], ""); // ESC $ * Z (G2DM4)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x24, 0x2b, 0x5a], ""); // ESC $ + Z (G3DM4)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x24, 0x2d, 0x5a], ""); // ESC $ - Z (G1DM6)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x24, 0x2e, 0x5a], ""); // ESC $ . Z (G2DM6)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x24, 0x2f, 0x5a], ""); // ESC $ / Z (G3DM6)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x25, 0x5a], ""); // ESC % Z (DOCS)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x25, 0x2f, 0x5a], ""); // ESC % / Z (DOCS)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x28, 0x5a], ""); // ESC ( Z (GZD4)
        reset!();
        assert_feed_ok!(d, [], [0x1b, 0x28], "");
        assert_feed_err!(d, -1, [], [], [0x28, 0x5a], "");
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x29, 0x5a], ""); // ESC ) Z (G1D4)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x2a, 0x5a], ""); // ESC * Z (G2D4)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x2b, 0x5a], ""); // ESC + Z (G3D4)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x2d, 0x5a], ""); // ESC - Z (G1D6)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x2e, 0x5a], ""); // ESC . Z (G2D6)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x2f, 0x5a], ""); // ESC / Z (G3D6)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x4e], ""); // ESC N (SS2)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x4f], ""); // ESC O (SS3)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x6e], ""); // ESC n (LS2)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x6f], ""); // ESC o (LS3)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x7c], ""); // ESC | (LS3R)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x7d], ""); // ESC } (LS2R)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x7e], ""); // ESC ~ (LS1R)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0xff], "");
        reset!();
        assert_finish_ok!(d, "");
    }

    /// Bytes outside the valid range of the current character set are errors.
    #[test]
    fn test_decoder_invalid_out_of_range() {
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_err!(d, [], [0x80], [], "");
        assert_feed_err!(d, [], [0xff], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x42], [0x80, 0x21], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x42], [0x21, 0x80], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x42], [0x20, 0x21], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x42], [0x21, 0x20], [], "");
        assert_feed_err!(d, [0x1b, 0x28, 0x49], [0x20], [], "");
        assert_feed_err!(d, [0x1b, 0x28, 0x49], [0x60], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x28, 0x44], [0x80, 0x21], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x28, 0x44], [0x21, 0x80], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x28, 0x44], [0x20, 0x21], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x28, 0x44], [0x21, 0x20], [], "");
        assert_finish_ok!(d, "");
    }

    /// After a failed finish the decoder is back in its initial ASCII state,
    /// so the same input decodes the same way again.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x24, 0x22,
                            0x1b, 0x24, 0x42,
                            0x24, 0x22], [0x24], "\x24\x22\u{3042}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0x24, 0x22,
                            0x1b, 0x24, 0x42,
                            0x24, 0x22], [], "\x24\x22\u{3042}");
        assert_finish_ok!(d, "");
    }

    /// Benchmarks strict encoding of the shared Japanese sample text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::JAPANESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            ISO2022JPEncoding.encode(&s, EncoderTrap::Strict)
        }))
    }

    /// Benchmarks strict decoding of the encoded Japanese sample text.
    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let s = ISO2022JPEncoding.encode(testutils::JAPANESE_TEXT,
                                         EncoderTrap::Strict).ok().unwrap();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            ISO2022JPEncoding.decode(&s, DecoderTrap::Strict)
        }))
    }
}