// encoding_rs/gb18030.rs

// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
9
10use super::*;
11use crate::data::*;
12use crate::handles::*;
13use crate::variant::*;
// Rust 1.14.0 requires the following despite the asterisk above.
15use super::in_inclusive_range16;
16use super::in_range16;
17
/// Byte values that the decoder has queued for reprocessing.
///
/// Each variant carries the queued values; the variant name says how many
/// there are. The values are stored in whatever form the decoder loop expects
/// (the visible use in this file stores a byte with its `0x81` offset already
/// subtracted).
enum Gb18030Pending {
    /// Nothing is queued.
    None,
    /// One value is queued.
    One(u8),
    /// Two values are queued.
    Two(u8, u8),
    /// Three values are queued.
    Three(u8, u8, u8),
}
24
25impl Gb18030Pending {
26    fn is_none(&self) -> bool {
27        match *self {
28            Gb18030Pending::None => true,
29            _ => false,
30        }
31    }
32
33    fn count(&self) -> usize {
34        match *self {
35            Gb18030Pending::None => 0,
36            Gb18030Pending::One(_) => 1,
37            Gb18030Pending::Two(_, _) => 2,
38            Gb18030Pending::Three(_, _, _) => 3,
39        }
40    }
41}
42
43pub struct Gb18030Decoder {
44    first: Option<u8>,
45    second: Option<u8>,
46    third: Option<u8>,
47    pending: Gb18030Pending,
48    pending_ascii: Option<u8>,
49}
50
51impl Gb18030Decoder {
52    pub fn new() -> VariantDecoder {
53        VariantDecoder::Gb18030(Gb18030Decoder {
54            first: None,
55            second: None,
56            third: None,
57            pending: Gb18030Pending::None,
58            pending_ascii: None,
59        })
60    }
61
62    pub fn in_neutral_state(&self) -> bool {
63        self.first.is_none()
64            && self.second.is_none()
65            && self.third.is_none()
66            && self.pending.is_none()
67            && self.pending_ascii.is_none()
68    }
69
70    fn extra_from_state(&self, byte_length: usize) -> Option<usize> {
71        byte_length.checked_add(
72            self.pending.count()
73                + match self.first {
74                    None => 0,
75                    Some(_) => 1,
76                }
77                + match self.second {
78                    None => 0,
79                    Some(_) => 1,
80                }
81                + match self.third {
82                    None => 0,
83                    Some(_) => 1,
84                }
85                + match self.pending_ascii {
86                    None => 0,
87                    Some(_) => 1,
88                },
89        )
90    }
91
92    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
93        // ASCII: 1 to 1 (worst case)
94        // gbk: 2 to 1
95        // ranges: 4 to 1 or 4 to 2
96        checked_add(1, self.extra_from_state(byte_length))
97    }
98
99    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
100        // ASCII: 1 to 1
101        // gbk: 2 to 2 or 2 to 3
102        // ranges: 4 to 2, 4 to 3 or 4 to 4
103        // 0x80: 1 to 3 (worst case)
104        self.max_utf8_buffer_length(byte_length)
105    }
106
107    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
108        checked_add(1, checked_mul(3, self.extra_from_state(byte_length)))
109    }
110
111    gb18030_decoder_functions!(
112        {
113            // If first is between 0x81 and 0xFE, inclusive,
114            // subtract offset 0x81.
115            let non_ascii_minus_offset = non_ascii.wrapping_sub(0x81);
116            if non_ascii_minus_offset > (0xFE - 0x81) {
117                if non_ascii == 0x80 {
118                    handle.write_upper_bmp(0x20ACu16);
119                    continue 'outermost;
120                }
121                return (DecoderResult::Malformed(1, 0),
122                        source.consumed(),
123                        handle.written());
124            }
125            non_ascii_minus_offset
126        },
127        {
128            // Two-byte (or error)
129            if first_minus_offset >= 0x20 {
130                // Not the gbk ideograph range above GB2312
131                let trail_minus_offset = second.wrapping_sub(0xA1);
132                if trail_minus_offset <= (0xFE - 0xA1) {
133                    // GB2312
134                    let hanzi_lead = first_minus_offset.wrapping_sub(0x2F);
135                    if hanzi_lead < (0x77 - 0x2F) {
136                        // Level 1 Hanzi, Level 2 Hanzi
137                        // or one of the 5 PUA code
138                        // points in between.
139                        let hanzi_pointer = mul_94(hanzi_lead) + trail_minus_offset as usize;
140                        let upper_bmp = GB2312_HANZI[hanzi_pointer];
141                        handle.write_upper_bmp(upper_bmp)
142                    } else if first_minus_offset == 0x20 {
143                        // Symbols (starting with ideographic space)
144                        let bmp = GB2312_SYMBOLS[trail_minus_offset as usize];
145                        handle.write_bmp_excl_ascii(bmp)
146                    } else if first_minus_offset == 0x25 && ((trail_minus_offset.wrapping_sub(63) as usize) < GB2312_SYMBOLS_AFTER_GREEK.len()) {
147                        handle.write_bmp_excl_ascii(GB2312_SYMBOLS_AFTER_GREEK[trail_minus_offset.wrapping_sub(63) as usize])
148                    } else if first_minus_offset == 0x27 && (trail_minus_offset as usize) < GB2312_PINYIN.len() {
149                        handle.write_bmp_excl_ascii(GB2312_PINYIN[trail_minus_offset as usize])
150                    } else if first_minus_offset > 0x76 {
151                        // Bottom PUA
152                        let pua = (0xE234 + mul_94(first_minus_offset - 0x77) + trail_minus_offset as usize) as u16;
153                        handle.write_upper_bmp(pua)
154                    } else {
155                        let bmp = gb2312_other_decode((mul_94(first_minus_offset - 0x21) + (trail_minus_offset as usize)) as u16);
156                        handle.write_bmp_excl_ascii(bmp)
157                    }
158                } else {
159                    // gbk range on the left
160                    let mut trail_minus_offset = second.wrapping_sub(0x40);
161                    if trail_minus_offset > (0x7E - 0x40) {
162                        let trail_minus_range_start = second.wrapping_sub(0x80);
163                        if trail_minus_range_start > (0xA0 - 0x80) {
164                            if second < 0x80 {
165                                return (DecoderResult::Malformed(1, 0),
166                                        unread_handle_second.unread(),
167                                        handle.written());
168                            }
169                            return (DecoderResult::Malformed(2, 0),
170                                    unread_handle_second.consumed(),
171                                    handle.written());
172                        }
173                        trail_minus_offset = second - 0x41;
174                    }
175                    // Zero-base lead
176                    let left_lead = first_minus_offset - 0x20;
177                    let left_pointer = left_lead as usize * (190 - 94) +
178                                       trail_minus_offset as usize;
179                    let gbk_left_ideograph_pointer = left_pointer.wrapping_sub((0x29 - 0x20) * (190 - 94));
180                    if gbk_left_ideograph_pointer < (((0x7D - 0x29) * (190 - 94)) - 5) {
181                        let upper_bmp = gbk_left_ideograph_decode(gbk_left_ideograph_pointer as u16);
182                        handle.write_upper_bmp(upper_bmp)
183                    } else if left_pointer < ((0x29 - 0x20) * (190 - 94)) {
184                        let bmp = gbk_other_decode(left_pointer as u16);
185                        handle.write_bmp_excl_ascii(bmp)
186                    } else {
187                        let bottom_pointer = left_pointer - (((0x7D - 0x20) * (190 - 94)) - 5);
188                        let upper_bmp = GBK_BOTTOM[bottom_pointer];
189                        handle.write_upper_bmp(upper_bmp)
190                    }
191                }
192            } else {
193                // gbk ideograph range above GB2312
194                let mut trail_minus_offset = second.wrapping_sub(0x40);
195                if trail_minus_offset > (0x7E - 0x40) {
196                    let trail_minus_range_start = second.wrapping_sub(0x80);
197                    if trail_minus_range_start > (0xFE - 0x80) {
198                        if second < 0x80 {
199                            return (DecoderResult::Malformed(1, 0),
200                                    unread_handle_second.unread(),
201                                    handle.written());
202                        }
203                        return (DecoderResult::Malformed(2, 0),
204                                unread_handle_second.consumed(),
205                                handle.written());
206                    }
207                    trail_minus_offset = second - 0x41;
208                }
209                let pointer = first_minus_offset as usize * 190usize +
210                              trail_minus_offset as usize;
211                let upper_bmp = gbk_top_ideograph_decode(pointer as u16);
212                handle.write_upper_bmp(upper_bmp)
213            }
214        },
215        {
216            // If third is between 0x81 and 0xFE, inclusive,
217            // subtract offset 0x81.
218            let third_minus_offset = third.wrapping_sub(0x81);
219            if third_minus_offset > (0xFE - 0x81) {
220                // We have an error. Let's inline what's going
221                // to happen when `second` is
222                // reprocessed. (`third` gets unread.)
223                // `second` is guaranteed ASCII, so let's
224                // put it in `pending_ascii`. Recompute
225                // `second` from `second_minus_offset`.
226                self.pending_ascii = Some(second_minus_offset + 0x30);
227                // Now unread `third` and designate the previous
228                // `first` as being in error.
229                return (DecoderResult::Malformed(1, 1),
230                        unread_handle_third.unread(),
231                        handle.written());
232            }
233            third_minus_offset
234        },
235        {
236            // If fourth is between 0x30 and 0x39, inclusive,
237            // subtract offset 0x30.
238            //
239            // If we have an error, we'll inline what's going
240            // to happen when `second` and `third` are
241            // reprocessed. (`fourth` gets unread.)
242            // `second` is guaranteed ASCII, so let's
243            // put it in `pending_ascii`. Recompute
244            // `second` from `second_minus_offset` to
245            // make this block reusable when `second`
246            // is not in scope.
247            //
248            // `third` is guaranteed to be in the range
249            // that makes it become the new `self.first`.
250            //
251            // `fourth` gets unread and the previous
252            // `first` gets designates as being in error.
253            let fourth_minus_offset = fourth.wrapping_sub(0x30);
254            if fourth_minus_offset > (0x39 - 0x30) {
255                self.pending_ascii = Some(second_minus_offset + 0x30);
256                self.pending = Gb18030Pending::One(third_minus_offset);
257                return (DecoderResult::Malformed(1, 2),
258                        unread_handle_fourth.unread(),
259                        handle.written());
260            }
261            let pointer = (first_minus_offset as usize * (10 * 126 * 10)) +
262                          (second_minus_offset as usize * (10 * 126)) +
263                          (third_minus_offset as usize * 10) +
264                          fourth_minus_offset as usize;
265            if pointer <= 39419 {
266                // BMP
267                if pointer == 7457 {
268                    handle.write_upper_bmp(0xE7C7)
269                } else {
270                    handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16))
271                }
272            } else if pointer >= 189_000 && pointer <= 1_237_575 {
273                // Astral
274                handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32)
275            } else {
276                return (DecoderResult::Malformed(4, 0),
277                        unread_handle_fourth.consumed(),
278                        handle.written());
279            }
280        },
281        self,
282        non_ascii,
283        first_minus_offset,
284        second,
285        second_minus_offset,
286        unread_handle_second,
287        third,
288        third_minus_offset,
289        unread_handle_third,
290        fourth,
291        fourth_minus_offset,
292        unread_handle_fourth,
293        source,
294        handle,
295        'outermost);
296}
297
298// XXX Experiment with inline directives
299fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> {
300    // Try ideographic punctuation first as it's the most likely case.
301    // Throwing in the check for full-width currencies and tilde is probably
302    // more size-efficient here than elsewhere.
303    if in_inclusive_range16(bmp, 0x2014, 0x3017) || in_inclusive_range16(bmp, 0xFF04, 0xFFE1) {
304        if let Some(pos) = position(&GB2312_SYMBOLS[..], bmp) {
305            return Some((0xA1, pos + 0xA1));
306        }
307    }
308    // Ext A
309    if in_range16(bmp, 0x3400, 0x4E00) {
310        return position(&GBK_BOTTOM[21..100], bmp).map(|pos| {
311            (
312                0xFE,
313                pos + if pos < (0x3F - 16) {
314                    0x40 + 16
315                } else {
316                    0x41 + 16
317                },
318            )
319        });
320    }
321    // Compatibility ideographs
322    if in_range16(bmp, 0xF900, 0xFB00) {
323        return position(&GBK_BOTTOM[0..21], bmp).map(|pos| {
324            if pos < 5 {
325                // end of second to last row
326                (0xFD, pos + (190 - 94 - 5 + 0x41))
327            } else {
328                // last row
329                (0xFE, pos + (0x40 - 5))
330            }
331        });
332    }
333    // Handle everything below U+02CA, which is in GBK_OTHER.
334    if bmp < 0x02CA {
335        if in_range16(bmp, 0x00E0, 0x0262) && bmp != 0x00F7 {
336            // Pinyin except U+1E3F
337            if let Some(pos) = position(&GB2312_PINYIN[..], bmp) {
338                return Some((0xA8, pos + 0xA1));
339            }
340        } else if in_inclusive_range16(bmp, 0x00A4, 0x00F7)
341            || in_inclusive_range16(bmp, 0x02C7, 0x02C9)
342        {
343            // Diacritics and Latin 1 symbols
344            if let Some(pos) = position(&GB2312_SYMBOLS[3..(0xAC - 0x60)], bmp) {
345                return Some((0xA1, pos + 0xA1 + 3));
346            }
347        }
348        return None;
349    }
350    if bmp >= 0xE794 {
351        // Various brackets, all in PUA or full-width regions
352        if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) {
353            return Some((0xA6, pos + (0x9F - 0x60 + 0xA1)));
354        }
355    } else if bmp == 0x1E3F {
356        // The one Pinyin placed elsewhere on the BMP
357        return Some((0xA8, 0x7B - 0x60 + 0xA1));
358    } else if in_range16(bmp, 0xA000, 0xD800) {
359        // Since Korean has usage in China, let's spend a branch to fast-track
360        // Hangul.
361        return None;
362    }
363    // GB2312 other (except bottom PUA and PUA between Hanzi levels).
364    if let Some(other_pointer) = gb2312_other_encode(bmp) {
365        let other_lead = other_pointer as usize / 94;
366        let other_trail = other_pointer as usize % 94;
367        return Some((0xA2 + other_lead, 0xA1 + other_trail));
368    }
369    // At this point, we've handled all mappable characters above U+02D9 but
370    // below U+2010. Let's check for that range in order to let lower BMP
371    // characters used for minority languages in China avoid the subsequent
372    // search that deals mainly with various symbols.
373    if in_range16(bmp, 0x02DA, 0x2010) {
374        return None;
375    }
376    // GBK other (except radicals and PUA in GBK_BOTTOM).
377    if let Some(other_pointer) = gbk_other_encode(bmp) {
378        let other_lead = other_pointer as usize / (190 - 94);
379        let other_trail = other_pointer as usize % (190 - 94);
380        let offset = if other_trail < 0x3F { 0x40 } else { 0x41 };
381        return Some((other_lead + (0x81 + 0x20), other_trail + offset));
382    }
383    // CJK Radicals Supplement or PUA in GBK_BOTTOM
384    if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) || in_inclusive_range16(bmp, 0xE816, 0xE864) {
385        if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) {
386            let trail = pos + 16;
387            let offset = if trail < 0x3F { 0x40 } else { 0x41 };
388            return Some((0xFE, trail + offset));
389        }
390    }
391    // GB2312 bottom PUA
392    let bmp_minus_gb2312_bottom_pua = bmp.wrapping_sub(0xE234);
393    if bmp_minus_gb2312_bottom_pua <= (0xE4C5 - 0xE234) {
394        let pua_lead = bmp_minus_gb2312_bottom_pua as usize / 94;
395        let pua_trail = bmp_minus_gb2312_bottom_pua as usize % 94;
396        return Some((0x81 + 0x77 + pua_lead, 0xA1 + pua_trail));
397    }
398    // PUA between Hanzi Levels
399    let bmp_minus_pua_between_hanzi = bmp.wrapping_sub(0xE810);
400    if bmp_minus_pua_between_hanzi < 5 {
401        return Some((0x81 + 0x56, 0xFF - 5 + bmp_minus_pua_between_hanzi as usize));
402    }
403    None
404}
405
406#[cfg(not(feature = "fast-gb-hanzi-encode"))]
407#[inline(always)]
408fn encode_hanzi(bmp: u16, _: u16) -> (u8, u8) {
409    if let Some((lead, trail)) = gb2312_level1_hanzi_encode(bmp) {
410        (lead, trail)
411    } else if let Some(hanzi_pointer) = gb2312_level2_hanzi_encode(bmp) {
412        let hanzi_lead = (hanzi_pointer / 94) + (0xD8);
413        let hanzi_trail = (hanzi_pointer % 94) + 0xA1;
414        (hanzi_lead as u8, hanzi_trail as u8)
415    } else {
416        let (lead, gbk_trail) = if bmp < 0x72DC {
417            // Above GB2312
418            let pointer = gbk_top_ideograph_encode(bmp) as usize;
419            let lead = (pointer / 190) + 0x81;
420            let gbk_trail = pointer % 190;
421            (lead, gbk_trail)
422        } else {
423            // To the left of GB2312
424            let gbk_left_ideograph_pointer = gbk_left_ideograph_encode(bmp) as usize;
425            let lead = (gbk_left_ideograph_pointer / (190 - 94)) + (0x81 + 0x29);
426            let gbk_trail = gbk_left_ideograph_pointer % (190 - 94);
427            (lead, gbk_trail)
428        };
429        let offset = if gbk_trail < 0x3F { 0x40 } else { 0x41 };
430        (lead as u8, (gbk_trail + offset) as u8)
431    }
432}
433
/// Encodes a CJK Unified Ideograph as a two-byte `(lead, trail)` pair.
///
/// This is the variant used when the `fast-gb-hanzi-encode` feature is on.
/// It passes the code point's offset from U+4E00 straight to
/// `gbk_hanzi_encode`. The first parameter is unused here; it exists so both
/// variants share one signature.
#[cfg(feature = "fast-gb-hanzi-encode")]
#[inline(always)]
fn encode_hanzi(_: u16, bmp_minus_unified_start: u16) -> (u8, u8) {
    gbk_hanzi_encode(bmp_minus_unified_start)
}
439
/// Encoder shared by gb18030 and GBK.
pub struct Gb18030Encoder {
    /// `true` enables four-byte sequences (gb18030 behavior). When `false`,
    /// code points without a one- or two-byte mapping are reported as
    /// unmappable and U+20AC is written as the single byte `0x80`.
    extended: bool,
}
443
444impl Gb18030Encoder {
445    pub fn new(encoding: &'static Encoding, extended_range: bool) -> Encoder {
446        Encoder::new(
447            encoding,
448            VariantEncoder::Gb18030(Gb18030Encoder {
449                extended: extended_range,
450            }),
451        )
452    }
453
454    pub fn max_buffer_length_from_utf16_without_replacement(
455        &self,
456        u16_length: usize,
457    ) -> Option<usize> {
458        if self.extended {
459            u16_length.checked_mul(4)
460        } else {
461            // Need to add, because space check is done with the four-byte
462            // assumption.
463            checked_add(2, u16_length.checked_mul(2))
464        }
465    }
466
467    pub fn max_buffer_length_from_utf8_without_replacement(
468        &self,
469        byte_length: usize,
470    ) -> Option<usize> {
471        if self.extended {
472            // 1 to 1
473            // 2 to 2
474            // 3 to 2
475            // 2 to 4 (worst)
476            // 3 to 4
477            // 4 to 4
478            checked_add(2, byte_length.checked_mul(2))
479        } else {
480            // 1 to 1
481            // 2 to 2
482            // 3 to 2
483            // Need to add, because space check is done with the four-byte
484            // assumption.
485            byte_length.checked_add(3)
486        }
487    }
488
489    ascii_compatible_encoder_functions!(
490        {
491            let bmp_minus_unified_start = bmp.wrapping_sub(0x4E00);
492            if bmp_minus_unified_start < (0x9FA6 - 0x4E00) {
493                // CJK Unified Ideographs
494                // Can't fail now, since all are
495                // mapped.
496                let (lead, trail) = encode_hanzi(bmp, bmp_minus_unified_start);
497                handle.write_two(lead, trail)
498            } else if bmp == 0xE5E5 {
499                // It's not optimal to check for the unmappable
500                // and for euro at this stage, but getting
501                // the out of the way makes the rest of the
502                // code less messy.
503                return (
504                    EncoderResult::unmappable_from_bmp(bmp),
505                    source.consumed(),
506                    handle.written(),
507                );
508            } else if bmp == 0x20AC && !self.extended {
509                handle.write_one(0x80u8)
510            } else {
511                match gbk_encode_non_unified(bmp) {
512                    Some((lead, trail)) => handle.write_two(lead as u8, trail as u8),
513                    None => {
514                        if !self.extended {
515                            return (
516                                EncoderResult::unmappable_from_bmp(bmp),
517                                source.consumed(),
518                                handle.written(),
519                            );
520                        }
521                        let range_pointer = gb18030_range_encode(bmp);
522                        let first = range_pointer / (10 * 126 * 10);
523                        let rem_first = range_pointer % (10 * 126 * 10);
524                        let second = rem_first / (10 * 126);
525                        let rem_second = rem_first % (10 * 126);
526                        let third = rem_second / 10;
527                        let fourth = rem_second % 10;
528                        handle.write_four(
529                            (first + 0x81) as u8,
530                            (second + 0x30) as u8,
531                            (third + 0x81) as u8,
532                            (fourth + 0x30) as u8,
533                        )
534                    }
535                }
536            }
537        },
538        {
539            if !self.extended {
540                return (
541                    EncoderResult::Unmappable(astral),
542                    source.consumed(),
543                    handle.written(),
544                );
545            }
546            let range_pointer = astral as usize + (189_000usize - 0x1_0000usize);
547            let first = range_pointer / (10 * 126 * 10);
548            let rem_first = range_pointer % (10 * 126 * 10);
549            let second = rem_first / (10 * 126);
550            let rem_second = rem_first % (10 * 126);
551            let third = rem_second / 10;
552            let fourth = rem_second % 10;
553            handle.write_four(
554                (first + 0x81) as u8,
555                (second + 0x30) as u8,
556                (third + 0x81) as u8,
557                (fourth + 0x30) as u8,
558            )
559        },
560        bmp,
561        astral,
562        self,
563        source,
564        handle,
565        copy_ascii_to_check_space_four,
566        check_space_four,
567        false
568    );
569}
570
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
573
#[cfg(test)]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as gb18030 and checks the result against `expect`.
    fn decode_gb18030(bytes: &[u8], expect: &str) {
        decode(GB18030, bytes, expect);
    }

    /// Encodes `string` as gb18030 and checks the result against `expect`.
    fn encode_gb18030(string: &str, expect: &[u8]) {
        encode(GB18030, string, expect);
    }

    /// Encodes `string` as GBK and checks the result against `expect`.
    fn encode_gbk(string: &str, expect: &[u8]) {
        encode(GBK, string, expect);
    }

    #[test]
    fn test_gb18030_decode() {
        // Empty
        decode_gb18030(b"", "");

        // ASCII
        decode_gb18030(b"\x61\x62", "\u{0061}\u{0062}");

        // euro
        decode_gb18030(b"\x80", "\u{20AC}");
        decode_gb18030(b"\xA2\xE3", "\u{20AC}");

        // two bytes
        decode_gb18030(b"\x81\x40", "\u{4E02}");
        decode_gb18030(b"\x81\x7E", "\u{4E8A}");
        decode_gb18030(b"\x81\x7F", "\u{FFFD}\u{007F}");
        decode_gb18030(b"\x81\x80", "\u{4E90}");
        decode_gb18030(b"\x81\xFE", "\u{4FA2}");
        decode_gb18030(b"\xFE\x40", "\u{FA0C}");
        decode_gb18030(b"\xFE\x7E", "\u{E843}");
        decode_gb18030(b"\xFE\x7F", "\u{FFFD}\u{007F}");
        decode_gb18030(b"\xFE\x80", "\u{4723}");
        decode_gb18030(b"\xFE\xFE", "\u{E4C5}");

        // The difference from the original GB18030
        decode_gb18030(b"\xA3\xA0", "\u{3000}");
        decode_gb18030(b"\xA1\xA1", "\u{3000}");

        // 0xFF
        decode_gb18030(b"\xFF\x40", "\u{FFFD}\u{0040}");
        decode_gb18030(b"\xE3\xFF\x9A\x33", "\u{FFFD}\u{FFFD}"); // not \u{FFFD}\u{FFFD}\u{0033} !
        decode_gb18030(b"\xFF\x32\x9A\x33", "\u{FFFD}\u{0032}\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD}\u{0033} !
        decode_gb18030(b"\xFF\x40\x00", "\u{FFFD}\u{0040}\u{0000}");
        decode_gb18030(b"\xE3\xFF\x9A\x33\x00", "\u{FFFD}\u{FFFD}\u{0033}\u{0000}");
        decode_gb18030(
            b"\xFF\x32\x9A\x33\x00",
            "\u{FFFD}\u{0032}\u{FFFD}\u{0033}\u{0000}",
        );

        // Four bytes
        decode_gb18030(b"\x81\x30\x81\x30", "\u{0080}");
        decode_gb18030(b"\x81\x35\xF4\x37", "\u{E7C7}");
        decode_gb18030(b"\x81\x37\xA3\x30", "\u{2603}");
        decode_gb18030(b"\x94\x39\xDA\x33", "\u{1F4A9}");
        decode_gb18030(b"\xE3\x32\x9A\x35", "\u{10FFFF}");
        decode_gb18030(b"\xE3\x32\x9A\x36\x81\x30", "\u{FFFD}\u{FFFD}");
        decode_gb18030(b"\xE3\x32\x9A\x36\x81\x40", "\u{FFFD}\u{4E02}");
        decode_gb18030(b"\xE3\x32\x9A", "\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD} !
        decode_gb18030(b"\xE3\x32\x9A\x00", "\u{FFFD}\u{0032}\u{FFFD}\u{0000}");
    }

    #[test]
    fn test_gb18030_encode() {
        // Empty
        encode_gb18030("", b"");

        // ASCII
        encode_gb18030("\u{0061}\u{0062}", b"\x61\x62");

        // euro
        encode_gb18030("\u{20AC}", b"\xA2\xE3");

        // two bytes
        encode_gb18030("\u{4E02}", b"\x81\x40");
        encode_gb18030("\u{4E8A}", b"\x81\x7E");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gb18030("\u{4E90}", b"\x81\x80");
            encode_gb18030("\u{4FA2}", b"\x81\xFE");
            encode_gb18030("\u{FA0C}", b"\xFE\x40");
            encode_gb18030("\u{E843}", b"\xFE\x7E");
            encode_gb18030("\u{4723}", b"\xFE\x80");
            encode_gb18030("\u{E4C5}", b"\xFE\xFE");
        }

        // The difference from the original GB18030
        encode_gb18030("\u{E5E5}", b"&#58853;");
        encode_gb18030("\u{3000}", b"\xA1\xA1");

        // Four bytes
        encode_gb18030("\u{0080}", b"\x81\x30\x81\x30");
        encode_gb18030("\u{E7C7}", b"\x81\x35\xF4\x37");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gb18030("\u{2603}", b"\x81\x37\xA3\x30");
            encode_gb18030("\u{1F4A9}", b"\x94\x39\xDA\x33");
            encode_gb18030("\u{10FFFF}", b"\xE3\x32\x9A\x35");
        }

        // Edge cases
        encode_gb18030("\u{00F7}", b"\xA1\xC2");
    }

    #[test]
    fn test_gbk_encode() {
        // Empty
        encode_gbk("", b"");

        // ASCII
        encode_gbk("\u{0061}\u{0062}", b"\x61\x62");

        // euro
        encode_gbk("\u{20AC}", b"\x80");

        // two bytes
        encode_gbk("\u{4E02}", b"\x81\x40");
        encode_gbk("\u{4E8A}", b"\x81\x7E");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gbk("\u{4E90}", b"\x81\x80");
            encode_gbk("\u{4FA2}", b"\x81\xFE");
            encode_gbk("\u{FA0C}", b"\xFE\x40");
            encode_gbk("\u{E843}", b"\xFE\x7E");
            encode_gbk("\u{4723}", b"\xFE\x80");
            encode_gbk("\u{E4C5}", b"\xFE\xFE");
        }

        // The difference from the original gb18030
        encode_gbk("\u{E5E5}", b"&#58853;");
        encode_gbk("\u{3000}", b"\xA1\xA1");

        // Four bytes
        encode_gbk("\u{0080}", b"&#128;");
        encode_gbk("\u{E7C7}", b"&#59335;");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gbk("\u{2603}", b"&#9731;");
            encode_gbk("\u{1F4A9}", b"&#128169;");
            encode_gbk("\u{10FFFF}", b"&#1114111;");
        }

        // Edge cases
        encode_gbk("\u{00F7}", b"\xA1\xC2");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_gb18030_decode_all() {
        let input = include_bytes!("test_data/gb18030_in.txt");
        let expectation = include_str!("test_data/gb18030_in_ref.txt");
        let (cow, had_errors) = GB18030.decode_without_bom_handling(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_gb18030_encode_all() {
        let input = include_str!("test_data/gb18030_out.txt");
        let expectation = include_bytes!("test_data/gb18030_out_ref.txt");
        let (cow, encoding, had_errors) = GB18030.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, GB18030);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    fn test_gb18030_encode_from_utf16_max_length() {
        // The buffer is sized by the encoder's own worst-case estimate for
        // one code unit; encoding U+3000 must fit and produce two bytes.
        let mut output = [0u8; 20];
        let mut encoder = GB18030.new_encoder();
        {
            let needed = encoder
                .max_buffer_length_from_utf16_without_replacement(1)
                .unwrap();
            let (result, read, written) = encoder.encode_from_utf16_without_replacement(
                &[0x3000],
                &mut output[..needed],
                true,
            );
            assert_eq!(result, EncoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 2);
            assert_eq!(output[0], 0xA1);
            assert_eq!(output[1], 0xA1);
        }
    }
}
765        }
766    }
767}