encoding_rs/
big5.rs

1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use super::*;
11use crate::data::*;
12use crate::handles::*;
13use crate::variant::*;
14// Rust 1.14.0 requires the following despite the asterisk above.
15use super::in_inclusive_range32;
16
17pub struct Big5Decoder {
18    lead: Option<u8>,
19}
20
21impl Big5Decoder {
22    pub fn new() -> VariantDecoder {
23        VariantDecoder::Big5(Big5Decoder { lead: None })
24    }
25
26    pub fn in_neutral_state(&self) -> bool {
27        self.lead.is_none()
28    }
29
30    fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
31        byte_length.checked_add(match self.lead {
32            None => 0,
33            Some(_) => 1,
34        })
35    }
36
37    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
38        // If there is a lead but the next byte isn't a valid trail, an
39        // error is generated for the lead (+1). Then another iteration checks
40        // space, which needs +1 to account for the possibility of astral
41        // output or combining pair.
42        checked_add(1, self.plus_one_if_lead(byte_length))
43    }
44
45    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
46        // No need to account for REPLACEMENT CHARACTERS.
47        // Cases:
48        // ASCII: 1 to 1
49        // Valid pair: 2 to 2, 2 to 3 or 2 to 4, i.e. worst case 2 to 4
50        // lead set and first byte is trail: 1 to 4 worst case
51        //
52        // When checking for space for the last byte:
53        // no lead: the last byte must be ASCII (or fatal error): 1 to 1
54        // lead set: space for 4 bytes was already checked when reading the
55        // lead, hence the last lead and the last trail together are worst
56        // case 2 to 4.
57        //
58        // If lead set and the input is a single trail byte, the worst-case
59        // output is 4, so we need to add one before multiplying if lead is
60        // set.
61        //
62        // Finally, add two so that if input is non-zero, the output is at
63        // least 4.
64        checked_add(2, checked_mul(2, self.plus_one_if_lead(byte_length)))
65    }
66
67    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
68        // If there is a lead but the next byte isn't a valid trail, an
69        // error is generated for the lead (+(1*3)). Then another iteration
70        // checks space, which needs +3 to account for the possibility of astral
71        // output or combining pair. In between start and end, the worst case
72        // is that every byte is bad: *3.
73        checked_add(3, checked_mul(3, self.plus_one_if_lead(byte_length)))
74    }
75
76    ascii_compatible_two_byte_decoder_functions!(
77        {
78            // If lead is between 0x81 and 0xFE, inclusive,
79            // subtract offset 0x81.
80            let non_ascii_minus_offset =
81                non_ascii.wrapping_sub(0x81);
82            if non_ascii_minus_offset > (0xFE - 0x81) {
83                return (DecoderResult::Malformed(1, 0),
84                        source.consumed(),
85                        handle.written());
86            }
87            non_ascii_minus_offset
88        },
89        {
90            // If trail is between 0x40 and 0x7E, inclusive,
91            // subtract offset 0x40. Else if trail is
92            // between 0xA1 and 0xFE, inclusive, subtract
93            // offset 0x62.
94            // TODO: Find out which range is more probable.
95            let mut trail_minus_offset =
96                byte.wrapping_sub(0x40);
97            if trail_minus_offset > (0x7E - 0x40) {
98                let trail_minus_range_start =
99                    byte.wrapping_sub(0xA1);
100                if trail_minus_range_start >
101                   (0xFE - 0xA1) {
102                    if byte < 0x80 {
103                        return (DecoderResult::Malformed(1, 0),
104                                unread_handle_trail.unread(),
105                                handle.written());
106                    }
107                    return (DecoderResult::Malformed(2, 0),
108                            unread_handle_trail.consumed(),
109                            handle.written());
110                }
111                trail_minus_offset = byte - 0x62;
112            }
113            let pointer = lead_minus_offset as usize *
114                          157usize +
115                          trail_minus_offset as usize;
116            let rebased_pointer = pointer.wrapping_sub(942);
117            let low_bits = big5_low_bits(rebased_pointer);
118            if low_bits == 0 {
119                match pointer {
120                    1133 => {
121                        handle.write_big5_combination(0x00CAu16,
122                                                      0x0304u16)
123                    }
124                    1135 => {
125                        handle.write_big5_combination(0x00CAu16,
126                                                      0x030Cu16)
127                    }
128                    1164 => {
129                        handle.write_big5_combination(0x00EAu16,
130                                                      0x0304u16)
131                    }
132                    1166 => {
133                        handle.write_big5_combination(0x00EAu16,
134                                                      0x030Cu16)
135                    }
136                    _ => {
137                        if byte < 0x80 {
138                            return (DecoderResult::Malformed(1, 0),
139                                    unread_handle_trail.unread(),
140                                    handle.written());
141                        }
142                        return (DecoderResult::Malformed(2, 0),
143                                unread_handle_trail.consumed(),
144                                handle.written());
145                    }
146                }
147            } else if big5_is_astral(rebased_pointer) {
148                handle.write_astral(u32::from(low_bits) |
149                                    0x20000u32)
150            } else {
151                handle.write_bmp_excl_ascii(low_bits)
152            }
153        },
154        self,
155        non_ascii,
156        byte,
157        lead_minus_offset,
158        unread_handle_trail,
159        source,
160        handle,
161        'outermost,
162        copy_ascii_from_check_space_astral,
163        check_space_astral,
164        false);
165}
166
167pub struct Big5Encoder;
168
169impl Big5Encoder {
170    pub fn new(encoding: &'static Encoding) -> Encoder {
171        Encoder::new(encoding, VariantEncoder::Big5(Big5Encoder))
172    }
173
174    pub fn max_buffer_length_from_utf16_without_replacement(
175        &self,
176        u16_length: usize,
177    ) -> Option<usize> {
178        // Astral: 2 to 2
179        // ASCII: 1 to 1
180        // Other: 1 to 2
181        u16_length.checked_mul(2)
182    }
183
184    pub fn max_buffer_length_from_utf8_without_replacement(
185        &self,
186        byte_length: usize,
187    ) -> Option<usize> {
188        // Astral: 4 to 2
189        // Upper BMP: 3 to 2
190        // Lower BMP: 2 to 2
191        // ASCII: 1 to 1
192        byte_length.checked_add(1)
193    }
194
195    ascii_compatible_encoder_functions!(
196        {
197            // For simplicity, unified ideographs
198            // in the pointer range 11206...11212 are handled
199            // as Level 1 Hanzi.
200            if let Some((lead, trail)) = big5_level1_hanzi_encode(bmp) {
201                handle.write_two(lead, trail)
202            } else {
203                let pointer = if let Some(pointer) = big5_box_encode(bmp) {
204                    pointer
205                } else if let Some(pointer) = big5_other_encode(bmp) {
206                    pointer
207                } else {
208                    return (
209                        EncoderResult::unmappable_from_bmp(bmp),
210                        source.consumed(),
211                        handle.written(),
212                    );
213                };
214                let lead = pointer / 157 + 0x81;
215                let remainder = pointer % 157;
216                let trail = if remainder < 0x3F {
217                    remainder + 0x40
218                } else {
219                    remainder + 0x62
220                };
221                handle.write_two(lead as u8, trail as u8)
222            }
223        },
224        {
225            if in_inclusive_range32(astral as u32, 0x2008A, 0x2F8A6) {
226                if let Some(rebased_pointer) = big5_astral_encode(astral as u16) {
227                    // big5_astral_encode returns rebased pointer,
228                    // so adding 0x87 instead of 0x81.
229                    let lead = rebased_pointer / 157 + 0x87;
230                    let remainder = rebased_pointer % 157;
231                    let trail = if remainder < 0x3F {
232                        remainder + 0x40
233                    } else {
234                        remainder + 0x62
235                    };
236                    handle.write_two(lead as u8, trail as u8)
237                } else {
238                    return (
239                        EncoderResult::Unmappable(astral),
240                        source.consumed(),
241                        handle.written(),
242                    );
243                }
244            } else {
245                return (
246                    EncoderResult::Unmappable(astral),
247                    source.consumed(),
248                    handle.written(),
249                );
250            }
251        },
252        bmp,
253        astral,
254        self,
255        source,
256        handle,
257        copy_ascii_to_check_space_two,
258        check_space_two,
259        false
260    );
261}
262
263// Any copyright to the test code below this comment is dedicated to the
264// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
265
#[cfg(test)]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Runs the shared `decode` test helper with the `BIG5` encoding.
    fn decode_big5(bytes: &[u8], expect: &str) {
        decode(BIG5, bytes, expect);
    }

    /// Runs the shared `encode` test helper with the `BIG5` encoding.
    fn encode_big5(string: &str, expect: &[u8]) {
        encode(BIG5, string, expect);
    }

    #[test]
    fn test_big5_decode() {
        // (input bytes, expected text), checked in order.
        let cases: &[(&[u8], &str)] = &[
            // Empty
            (b"", ""),
            // ASCII
            (&[0x61, 0x62], "\u{0061}\u{0062}"),
            // Edge cases
            (&[0x87, 0x40], "\u{43F0}"),
            (&[0xFE, 0xFE], "\u{79D4}"),
            (&[0xFE, 0xFD], "\u{2910D}"),
            (&[0x88, 0x62], "\u{00CA}\u{0304}"),
            (&[0x88, 0x64], "\u{00CA}\u{030C}"),
            (&[0x88, 0x66], "\u{00CA}"),
            (&[0x88, 0xA3], "\u{00EA}\u{0304}"),
            (&[0x88, 0xA5], "\u{00EA}\u{030C}"),
            (&[0x88, 0xA7], "\u{00EA}"),
            (&[0x99, 0xD4], "\u{8991}"),
            (&[0x99, 0xD5], "\u{27967}"),
            (&[0x99, 0xD6], "\u{8A29}"),
            // Edge cases surrounded with ASCII
            (&[0x61, 0x87, 0x40, 0x62], "\u{0061}\u{43F0}\u{0062}"),
            (&[0x61, 0xFE, 0xFE, 0x62], "\u{0061}\u{79D4}\u{0062}"),
            (&[0x61, 0xFE, 0xFD, 0x62], "\u{0061}\u{2910D}\u{0062}"),
            (&[0x61, 0x88, 0x62, 0x62], "\u{0061}\u{00CA}\u{0304}\u{0062}"),
            (&[0x61, 0x88, 0x64, 0x62], "\u{0061}\u{00CA}\u{030C}\u{0062}"),
            (&[0x61, 0x88, 0x66, 0x62], "\u{0061}\u{00CA}\u{0062}"),
            (&[0x61, 0x88, 0xA3, 0x62], "\u{0061}\u{00EA}\u{0304}\u{0062}"),
            (&[0x61, 0x88, 0xA5, 0x62], "\u{0061}\u{00EA}\u{030C}\u{0062}"),
            (&[0x61, 0x88, 0xA7, 0x62], "\u{0061}\u{00EA}\u{0062}"),
            (&[0x61, 0x99, 0xD4, 0x62], "\u{0061}\u{8991}\u{0062}"),
            (&[0x61, 0x99, 0xD5, 0x62], "\u{0061}\u{27967}\u{0062}"),
            (&[0x61, 0x99, 0xD6, 0x62], "\u{0061}\u{8A29}\u{0062}"),
            // Bad sequences
            (&[0x80, 0x61], "\u{FFFD}\u{0061}"),
            (&[0xFF, 0x61], "\u{FFFD}\u{0061}"),
            (&[0xFE, 0x39], "\u{FFFD}\u{0039}"),
            (&[0x87, 0x66], "\u{FFFD}\u{0066}"),
            (&[0x81, 0x40], "\u{FFFD}\u{0040}"),
            (&[0x61, 0x81], "\u{0061}\u{FFFD}"),
        ];
        for &(bytes, expect) in cases {
            decode_big5(bytes, expect);
        }
    }

    #[test]
    fn test_big5_encode() {
        // Empty
        encode_big5("", b"");

        // ASCII
        encode_big5("\u{0061}\u{0062}", b"\x61\x62");

        if !cfg!(miri) {
            // Miri is too slow
            let slow_cases: &[(&str, &[u8])] = &[
                // Edge cases
                ("\u{9EA6}\u{0061}", b"&#40614;\x61"),
                ("\u{2626B}\u{0061}", b"&#156267;\x61"),
                ("\u{3000}", b"\xA1\x40"),
                ("\u{20AC}", b"\xA3\xE1"),
                ("\u{4E00}", b"\xA4\x40"),
                ("\u{27607}", b"\xC8\xA4"),
                ("\u{FFE2}", b"\xC8\xCD"),
                ("\u{79D4}", b"\xFE\xFE"),
                // Not in index
                ("\u{2603}\u{0061}", b"&#9731;\x61"),
            ];
            for &(string, expect) in slow_cases {
                encode_big5(string, expect);
            }
        }

        let cases: &[(&str, &[u8])] = &[
            // duplicate low bits
            ("\u{203B5}", b"\xFD\x6A"),
            ("\u{25605}", b"\xFE\x46"),
            // prefer last
            ("\u{2550}", b"\xF9\xF9"),
        ];
        for &(string, expect) in cases {
            encode_big5(string, expect);
        }
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_decode_all() {
        // Decode the full fixture and compare against the reference text;
        // the fixture is expected to contain errors.
        let encoded = include_bytes!("test_data/big5_in.txt");
        let reference = include_str!("test_data/big5_in_ref.txt");
        let (decoded, had_errors) = BIG5.decode_without_bom_handling(encoded);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&decoded[..], reference);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_encode_all() {
        // Encode the full fixture and compare against the reference bytes;
        // everything in the fixture is expected to be mappable.
        let text = include_str!("test_data/big5_out.txt");
        let reference = include_bytes!("test_data/big5_out_ref.txt");
        let (encoded, used_encoding, had_errors) = BIG5.encode(text);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(used_encoding, BIG5);
        assert_eq!(&encoded[..], &reference[..]);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_encode_from_two_low_surrogates() {
        // Two low surrogates in a row: the expected output is two numeric
        // character references for U+FFFD.
        let expected = b"&#65533;&#65533;";
        let unpaired = [0xDC00u16, 0xDEDEu16];
        let mut buffer = [0u8; 40];
        let mut encoder = BIG5.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&unpaired, &mut buffer[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expected.len());
        assert!(had_errors);
        assert_eq!(&buffer[..written], expected);
    }
}