encoding/codec/
korean.rs

1// This is a part of rust-encoding.
2// Copyright (c) 2013-2015, Kang Seonghoon.
3// See README.md and LICENSE.txt for details.
4
5//! Legacy Korean encodings based on KS X 1001.
6
7use std::convert::Into;
8use std::default::Default;
9use util::StrCharIndex;
10use index_korean as index;
11use types::*;
12
13/**
14 * Windows code page 949.
15 *
16 * This is a Korean encoding derived from EUC-KR,
17 * which is so widespread that most occurrences of EUC-KR actually mean this encoding.
18 * Unlike KS X 1001 (and EUC-KR) which only contains a set of 2,350 common Hangul syllables,
19 * it assigns remaining 8,822 Hangul syllables to the two-byte sequence
20 * which second byte have its MSB unset (i.e. `[81-C6] [41-5A 61-7A 81-FE]`).
21 * Its design strongly resembles that of Shift_JIS but less prone to errors
22 * since the set of MSB-unset second bytes is much limited compared to Shift_JIS.
23 */
24#[derive(Clone, Copy)]
25pub struct Windows949Encoding;
26
27impl Encoding for Windows949Encoding {
28    fn name(&self) -> &'static str { "windows-949" }
29    fn whatwg_name(&self) -> Option<&'static str> { Some("euc-kr") } // WHATWG compatibility
30    fn raw_encoder(&self) -> Box<RawEncoder> { Windows949Encoder::new() }
31    fn raw_decoder(&self) -> Box<RawDecoder> { Windows949Decoder::new() }
32}
33
34/// An encoder for Windows code page 949.
35#[derive(Clone, Copy)]
36pub struct Windows949Encoder;
37
38impl Windows949Encoder {
39    pub fn new() -> Box<RawEncoder> { Box::new(Windows949Encoder) }
40}
41
42impl RawEncoder for Windows949Encoder {
43    fn from_self(&self) -> Box<RawEncoder> { Windows949Encoder::new() }
44    fn is_ascii_compatible(&self) -> bool { true }
45
46    fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>) {
47        output.writer_hint(input.len());
48
49        for ((i,j), ch) in input.index_iter() {
50            if ch <= '\u{7f}' {
51                output.write_byte(ch as u8);
52            } else {
53                let ptr = index::euc_kr::backward(ch as u32);
54                if ptr == 0xffff {
55                    return (i, Some(CodecError {
56                        upto: j as isize, cause: "unrepresentable character".into()
57                    }));
58                } else {
59                    output.write_byte((ptr / 190 + 0x81) as u8);
60                    output.write_byte((ptr % 190 + 0x41) as u8);
61                }
62            }
63        }
64        (input.len(), None)
65    }
66
67    fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> {
68        None
69    }
70}
71
72/// A decoder for Windows code page 949.
73#[derive(Clone, Copy)]
74struct Windows949Decoder {
75    st: windows949::State,
76}
77
78impl Windows949Decoder {
79    pub fn new() -> Box<RawDecoder> {
80        Box::new(Windows949Decoder { st: Default::default() })
81    }
82}
83
84impl RawDecoder for Windows949Decoder {
85    fn from_self(&self) -> Box<RawDecoder> { Windows949Decoder::new() }
86    fn is_ascii_compatible(&self) -> bool { true }
87
88    fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) {
89        let (st, processed, err) = windows949::raw_feed(self.st, input, output, &());
90        self.st = st;
91        (processed, err)
92    }
93
94    fn raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError> {
95        let (st, err) = windows949::raw_finish(self.st, output, &());
96        self.st = st;
97        err
98    }
99}
100
101stateful_decoder! {
102    module windows949;
103
104    internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 {
105        use index_korean as index;
106
107        let lead = lead as u16;
108        let trail = trail as u16;
109        let index = match (lead, trail) {
110            (0x81...0xfe, 0x41...0xfe) => (lead - 0x81) * 190 + (trail - 0x41),
111            (_, _) => 0xffff,
112        };
113        index::euc_kr::forward(index)
114    }
115
116initial:
117    // euc-kr lead = 0x00
118    state S0(ctx: Context) {
119        case b @ 0x00...0x7f => ctx.emit(b as u32);
120        case b @ 0x81...0xfe => S1(ctx, b);
121        case _ => ctx.err("invalid sequence");
122    }
123
124transient:
125    // euc-kr lead != 0x00
126    state S1(ctx: Context, lead: u8) {
127        case b => match map_two_bytes(lead, b) {
128            0xffff => {
129                let backup = if b < 0x80 {1} else {0};
130                ctx.backup_and_err(backup, "invalid sequence")
131            },
132            ch => ctx.emit(ch as u32)
133        };
134    }
135}
136
#[cfg(test)]
mod windows949_tests {
    extern crate test;
    use super::Windows949Encoding;
    use testutils;
    use types::*;

    // NOTE(review): the `assert_feed_*!` / `assert_finish_*!` macros are defined
    // outside this file. Judging by their use here, the feed macros take the
    // codec, the processed input, (for `_err`) the problematic input, the
    // remaining input and the expected output -- confirm against `testutils`.

    /// ASCII and Hangul text encodes to the expected byte sequences.
    #[test]
    fn test_encoder_valid() {
        let mut encoder = Windows949Encoding.raw_encoder();
        assert_feed_ok!(encoder, "A", "", [0x41]);
        assert_feed_ok!(encoder, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(encoder, "", "", []);
        assert_feed_ok!(encoder, "\u{ac00}", "", [0xb0, 0xa1]);
        assert_feed_ok!(encoder, "\u{b098}\u{b2e4}", "", [0xb3, 0xaa, 0xb4, 0xd9]);
        assert_feed_ok!(encoder, "\u{bdc1}\u{314b}\u{d7a3}", "",
                        [0x94, 0xee, 0xa4, 0xbb, 0xc6, 0x52]);
        assert_finish_ok!(encoder, []);
    }

    /// Characters without a mapping are reported as errors.
    #[test]
    fn test_encoder_invalid() {
        let mut encoder = Windows949Encoding.raw_encoder();
        assert_feed_err!(encoder, "", "\u{ffff}", "", []);
        assert_feed_err!(encoder, "?", "\u{ffff}", "!", [0x3f]);
        // U+FFFD stands for invalid table entries and must not be encodable.
        assert_feed_err!(encoder, "?", "\u{fffd}", "!", [0x3f]);
        assert_finish_ok!(encoder, []);
    }

    /// Complete one- and two-byte sequences decode to the expected text.
    #[test]
    fn test_decoder_valid() {
        let mut decoder = Windows949Encoding.raw_decoder();
        assert_feed_ok!(decoder, [0x41], [], "A");
        assert_feed_ok!(decoder, [0x42, 0x43], [], "BC");
        assert_feed_ok!(decoder, [], [], "");
        assert_feed_ok!(decoder, [0xb0, 0xa1], [], "\u{ac00}");
        assert_feed_ok!(decoder, [0xb3, 0xaa, 0xb4, 0xd9], [], "\u{b098}\u{b2e4}");
        assert_feed_ok!(decoder, [0x94, 0xee, 0xa4, 0xbb, 0xc6, 0x52, 0xc1, 0x64], [],
                        "\u{bdc1}\u{314b}\u{d7a3}\u{d58f}");
        assert_finish_ok!(decoder, "");
    }

    /// A two-byte sequence may be split across consecutive feeds.
    #[test]
    fn test_decoder_valid_partial() {
        let mut decoder = Windows949Encoding.raw_decoder();
        assert_feed_ok!(decoder, [], [0xb0], "");
        assert_feed_ok!(decoder, [0xa1], [], "\u{ac00}");
        assert_feed_ok!(decoder, [0xb3, 0xaa], [0xb4], "\u{b098}");
        assert_feed_ok!(decoder, [0xd9], [0x94], "\u{b2e4}");
        assert_feed_ok!(decoder, [0xee, 0xa4, 0xbb], [0xc6], "\u{bdc1}\u{314b}");
        assert_feed_ok!(decoder, [0x52, 0xc1, 0x64], [], "\u{d7a3}\u{d58f}");
        assert_finish_ok!(decoder, "");
    }

    /// A lone lead byte is only an error once the input is finished,
    /// whereas 0x80 and 0xFF are rejected as soon as they are fed.
    #[test]
    fn test_decoder_invalid_lone_lead_immediate_test_finish() {
        for lead in 0x81..0xff {
            let mut decoder = Windows949Encoding.raw_decoder();
            // The decoder keeps waiting for a trail byte...
            assert_feed_ok!(decoder, [], [lead], "");
            // ...and fails when none arrives before the end of input.
            assert_finish_err!(decoder, "");
        }

        // 80/FF: immediate failure
        let mut decoder = Windows949Encoding.raw_decoder();
        assert_feed_err!(decoder, [], [0x80], [], "");
        assert_feed_err!(decoder, [], [0xff], [], "");
        assert_finish_ok!(decoder, "");
    }

    /// Any byte from 0x80 to 0xFF followed by a space is an error,
    /// and the space is left unconsumed.
    #[test]
    fn test_decoder_invalid_lone_lead_followed_by_space() {
        for code in 0x80..0x100 {
            let lead = code as u8;
            let mut decoder = Windows949Encoding.raw_decoder();
            assert_feed_err!(decoder, [], [lead], [0x20], "");
            assert_finish_ok!(decoder, "");
        }
    }

    /// A lead byte followed by an invalid non-ASCII trail byte (0x80 or 0xFF)
    /// is reported with both bytes, whether they arrive together or apart.
    #[test]
    fn test_decoder_invalid_lead_followed_by_invalid_trail() {
        // should behave similarly to Big5.
        // https://www.w3.org/Bugs/Public/show_bug.cgi?id=16691
        for lead in 0x81..0xff {
            // Lead and trail delivered in the same feed.
            let mut whole = Windows949Encoding.raw_decoder();
            assert_feed_err!(whole, [], [lead, 0x80], [0x20], "");
            assert_feed_err!(whole, [], [lead, 0xff], [0x20], "");
            assert_finish_ok!(whole, "");

            // Lead and trail delivered in separate feeds.
            let mut split = Windows949Encoding.raw_decoder();
            assert_feed_ok!(split, [], [lead], "");
            assert_feed_err!(split, [], [0x80], [0x20], "");
            assert_feed_ok!(split, [], [lead], "");
            assert_feed_err!(split, [], [0xff], [0x20], "");
            assert_finish_ok!(split, "");
        }

        // 0x80 and 0xFF are not lead bytes, so each of them fails on its own.
        let mut decoder = Windows949Encoding.raw_decoder();
        assert_feed_err!(decoder, [], [0x80], [0x80], "");
        assert_feed_err!(decoder, [], [0x80], [0xff], "");
        assert_feed_err!(decoder, [], [0xff], [0x80], "");
        assert_feed_err!(decoder, [], [0xff], [0xff], "");
        assert_finish_ok!(decoder, "");
    }

    /// The pair just past the last assigned Hangul syllable is invalid.
    #[test]
    fn test_decoder_invalid_boundary() {
        // U+D7A3 (C6 52) is the last Hangul syllable not in KS X 1001, so C6 53 is invalid.
        // Since a trail byte may coincide with ASCII, the trail byte 53 is not
        // counted as part of the problem. This is compatible with the WHATWG
        // Encoding Standard.
        let mut decoder = Windows949Encoding.raw_decoder();
        assert_feed_ok!(decoder, [], [0xc6], "");
        assert_feed_err!(decoder, [], [], [0x53], "");
        assert_finish_ok!(decoder, "");
    }

    /// The decoder can be used again after a failed finish.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut decoder = Windows949Encoding.raw_decoder();
        assert_feed_ok!(decoder, [0xb0, 0xa1], [0xb0], "\u{ac00}");
        assert_finish_err!(decoder, "");
        assert_feed_ok!(decoder, [0xb0, 0xa1], [], "\u{ac00}");
        assert_finish_ok!(decoder, "");
    }

    /// Measures strict encoding of the sample Korean text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::KOREAN_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(Windows949Encoding.encode(&s, EncoderTrap::Strict)))
    }

    /// Measures strict decoding of the encoded sample Korean text.
    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let encoded = Windows949Encoding.encode(testutils::KOREAN_TEXT, EncoderTrap::Strict);
        let s = encoded.ok().unwrap();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(Windows949Encoding.decode(&s, DecoderTrap::Strict)))
    }
}
280