1use std::convert::Into;
8use std::default::Default;
9use util::StrCharIndex;
10use index_tradchinese as index;
11use types::*;
12
13#[derive(Clone, Copy)]
28pub struct BigFive2003Encoding;
29
30impl Encoding for BigFive2003Encoding {
31 fn name(&self) -> &'static str { "big5-2003" }
32 fn whatwg_name(&self) -> Option<&'static str> { Some("big5") } fn raw_encoder(&self) -> Box<RawEncoder> { BigFive2003Encoder::new() }
34 fn raw_decoder(&self) -> Box<RawDecoder> { BigFive2003HKSCS2008Decoder::new() }
35}
36
37#[derive(Clone, Copy)]
39pub struct BigFive2003Encoder;
40
41impl BigFive2003Encoder {
42 pub fn new() -> Box<RawEncoder> { Box::new(BigFive2003Encoder) }
43}
44
45impl RawEncoder for BigFive2003Encoder {
46 fn from_self(&self) -> Box<RawEncoder> { BigFive2003Encoder::new() }
47 fn is_ascii_compatible(&self) -> bool { true }
48
49 fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>) {
50 output.writer_hint(input.len());
51
52 for ((i,j), ch) in input.index_iter() {
53 if ch < '\u{80}' {
54 output.write_byte(ch as u8);
55 } else {
56 let ptr = index::big5::backward(ch as u32);
57 if ptr == 0xffff || ptr < (0xa1 - 0x81) * 157 {
58 return (i, Some(CodecError {
60 upto: j as isize, cause: "unrepresentable character".into()
61 }));
62 }
63 let lead = ptr / 157 + 0x81;
64 let trail = ptr % 157;
65 let trailoffset = if trail < 0x3f {0x40} else {0x62};
66 output.write_byte(lead as u8);
67 output.write_byte((trail + trailoffset) as u8);
68 }
69 }
70 (input.len(), None)
71 }
72
73 fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> {
74 None
75 }
76}
77
78#[derive(Clone, Copy)]
80struct BigFive2003HKSCS2008Decoder {
81 st: bigfive2003::State,
82}
83
84impl BigFive2003HKSCS2008Decoder {
85 pub fn new() -> Box<RawDecoder> {
86 Box::new(BigFive2003HKSCS2008Decoder { st: Default::default() })
87 }
88}
89
90impl RawDecoder for BigFive2003HKSCS2008Decoder {
91 fn from_self(&self) -> Box<RawDecoder> { BigFive2003HKSCS2008Decoder::new() }
92 fn is_ascii_compatible(&self) -> bool { true }
93
94 fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) {
95 let (st, processed, err) = bigfive2003::raw_feed(self.st, input, output, &());
96 self.st = st;
97 (processed, err)
98 }
99
100 fn raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError> {
101 let (st, err) = bigfive2003::raw_finish(self.st, output, &());
102 self.st = st;
103 err
104 }
105}
106
107stateful_decoder! {
108 module bigfive2003;
109
110 internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 {
111 use index_tradchinese as index;
112
113 let lead = lead as u16;
114 let trail = trail as u16;
115 let index = match (lead, trail) {
116 (0x81...0xfe, 0x40...0x7e) | (0x81...0xfe, 0xa1...0xfe) => {
117 let trailoffset = if trail < 0x7f {0x40} else {0x62};
118 (lead - 0x81) * 157 + trail - trailoffset
119 }
120 _ => 0xffff,
121 };
122 index::big5::forward(index) }
124
125initial:
126 state S0(ctx: Context) {
128 case b @ 0x00...0x7f => ctx.emit(b as u32);
129 case b @ 0x81...0xfe => S1(ctx, b);
130 case _ => ctx.err("invalid sequence");
131 }
132
133transient:
134 state S1(ctx: Context, lead: u8) {
136 case b => match map_two_bytes(lead, b) {
137 0xffff => {
138 let backup = if b < 0x80 {1} else {0};
139 ctx.backup_and_err(backup, "invalid sequence")
140 },
141 0 => ctx.emit_str("\u{ca}\u{304}"),
142 1 => ctx.emit_str("\u{ca}\u{30c}"),
143 2 => ctx.emit_str("\u{ea}\u{304}"),
144 3 => ctx.emit_str("\u{ea}\u{30c}"),
145 ch => ctx.emit(ch),
146 };
147 }
148}
149
#[cfg(test)]
mod bigfive2003_tests {
    extern crate test;
    use super::BigFive2003Encoding;
    use testutils;
    use types::*;

    // The assertion macros are defined elsewhere. Judging from their use here:
    //   assert_feed_ok!(codec, processed, unprocessed, output)
    //   assert_feed_err!(codec, processed, problem, remaining, output)
    //   assert_finish_ok!(codec, output) / assert_finish_err!(codec, output)

    #[test]
    fn test_encoder_valid() {
        let mut enc = BigFive2003Encoding.raw_encoder();
        assert_feed_ok!(enc, "A", "", [0x41]);
        assert_feed_ok!(enc, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(enc, "", "", []);
        assert_feed_ok!(enc, "\u{4e2d}\u{83ef}\u{6c11}\u{570b}", "",
                        [0xa4, 0xa4, 0xb5, 0xd8, 0xa5, 0xc1, 0xb0, 0xea]);
        assert_feed_ok!(enc, "1\u{20ac}/m", "", [0x31, 0xa3, 0xe1, 0x2f, 0x6d]);
        assert_feed_ok!(enc, "\u{ffed}", "", [0xf9, 0xfe]);
        assert_finish_ok!(enc, []);
    }

    #[test]
    fn test_encoder_invalid() {
        let mut enc = BigFive2003Encoding.raw_encoder();
        assert_feed_err!(enc, "", "\u{ffff}", "", []);
        assert_feed_err!(enc, "?", "\u{ffff}", "!", [0x3f]);
        // U+3EEC can be decoded (see test_decoder_valid) but is not encodable.
        assert_feed_err!(enc, "", "\u{3eec}", "\u{4e00}", []);
        assert_finish_ok!(enc, []);
    }

    #[test]
    fn test_decoder_valid() {
        let mut dec = BigFive2003Encoding.raw_decoder();
        assert_feed_ok!(dec, [0x41], [], "A");
        assert_feed_ok!(dec, [0x42, 0x43], [], "BC");
        assert_feed_ok!(dec, [], [], "");
        assert_feed_ok!(dec, [0xa4, 0xa4, 0xb5, 0xd8, 0xa5, 0xc1, 0xb0, 0xea], [],
                        "\u{4e2d}\u{83ef}\u{6c11}\u{570b}");
        // A pair split across feeds: the lead byte stays pending until its trail arrives.
        assert_feed_ok!(dec, [], [0xa4], "");
        assert_feed_ok!(dec, [0xa4, 0xb5, 0xd8], [0xa5], "\u{4e2d}\u{83ef}");
        assert_feed_ok!(dec, [0xc1, 0xb0, 0xea], [], "\u{6c11}\u{570b}");
        assert_feed_ok!(dec, [0x31, 0xa3, 0xe1, 0x2f, 0x6d], [], "1\u{20ac}/m");
        assert_feed_ok!(dec, [0xf9, 0xfe], [], "\u{ffed}");
        // Lead byte below 0xa1: accepted by the decoder only.
        assert_feed_ok!(dec, [0x87, 0x7e], [], "\u{3eec}");
        // Byte pairs that decode to two code points each.
        assert_feed_ok!(dec, [0x88, 0x62, 0x88, 0x64, 0x88, 0xa3, 0x88, 0xa5], [],
                        "\u{ca}\u{304}\u{00ca}\u{30c}\u{ea}\u{304}\u{ea}\u{30c}");
        assert_finish_ok!(dec, "");
    }

    #[test]
    fn test_decoder_invalid_lone_lead_immediate_test_finish() {
        // Every valid lead byte waits for a trail, so finishing right after it is an error.
        for lead in 0x81..0xff {
            let mut dec = BigFive2003Encoding.raw_decoder();
            assert_feed_ok!(dec, [], [lead], "");
            assert_finish_err!(dec, "");
        }

        // 0x80 and 0xff are never valid lead bytes and fail immediately.
        let mut dec = BigFive2003Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0x80], [], "");
        assert_feed_err!(dec, [], [0xff], [], "");
        assert_finish_ok!(dec, "");
    }

    #[test]
    fn test_decoder_invalid_lone_lead_followed_by_space() {
        // The range needs to reach 0xff inclusive, so iterate wide and narrow afterwards.
        for code in 0x80..0x100 {
            let byte = code as u8;
            let mut dec = BigFive2003Encoding.raw_decoder();
            assert_feed_err!(dec, [], [byte], [0x20], "");
            assert_finish_ok!(dec, "");
        }
    }

    #[test]
    fn test_decoder_invalid_lead_followed_by_invalid_trail() {
        // A valid lead followed by an invalid trail with the high bit set is
        // reported as one two-byte problem; the following space is left over.
        for lead in 0x81..0xff {
            let mut dec = BigFive2003Encoding.raw_decoder();
            assert_feed_err!(dec, [], [lead, 0x80], [0x20], "");
            assert_feed_err!(dec, [], [lead, 0xff], [0x20], "");
            assert_finish_ok!(dec, "");

            // Same scenario with the lead byte delivered in a separate feed.
            let mut dec = BigFive2003Encoding.raw_decoder();
            assert_feed_ok!(dec, [], [lead], "");
            assert_feed_err!(dec, [], [0x80], [0x20], "");
            assert_feed_ok!(dec, [], [lead], "");
            assert_feed_err!(dec, [], [0xff], [0x20], "");
            assert_finish_ok!(dec, "");
        }

        // 0x80 and 0xff are not lead bytes: each fails on its own and the next byte remains.
        let mut dec = BigFive2003Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0x80], [0x80], "");
        assert_feed_err!(dec, [], [0x80], [0xff], "");
        assert_feed_err!(dec, [], [0xff], [0x80], "");
        assert_feed_err!(dec, [], [0xff], [0xff], "");
        assert_finish_ok!(dec, "");
    }

    #[test]
    fn test_decoder_feed_after_finish() {
        let mut dec = BigFive2003Encoding.raw_decoder();
        assert_feed_ok!(dec, [0xa4, 0x40], [0xa4], "\u{4e00}");
        // Finishing with a pending lead byte fails; the decoder is usable again afterwards.
        assert_finish_err!(dec, "");
        assert_feed_ok!(dec, [0xa4, 0x40], [], "\u{4e00}");
        assert_finish_ok!(dec, "");
    }

    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let text = testutils::TRADITIONAL_CHINESE_TEXT;
        bencher.bytes = text.len() as u64;
        bencher.iter(|| test::black_box({
            BigFive2003Encoding.encode(&text, EncoderTrap::Strict)
        }))
    }

    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        // Encode once up front so only decoding is measured.
        let encoded = BigFive2003Encoding.encode(testutils::TRADITIONAL_CHINESE_TEXT,
                                                 EncoderTrap::Strict).ok().unwrap();
        bencher.bytes = encoded.len() as u64;
        bencher.iter(|| test::black_box({
            BigFive2003Encoding.decode(&encoded, DecoderTrap::Strict)
        }))
    }
}