encoding_rs/
iso_2022_jp.rs

1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use super::*;
11use crate::data::*;
12use crate::handles::*;
13use crate::variant::*;
14// Rust 1.14.0 requires the following despite the asterisk above.
15use super::in_inclusive_range16;
16
// State of the ISO-2022-JP decoder's byte-level state machine.
// The first four variants double as "output states", i.e. the character set
// most recently selected by a complete escape sequence.
17#[derive(Copy, Clone, PartialEq)]
18enum Iso2022JpDecoderState {
    // Initial state; also selected by 0x1B 0x28 0x42. Bytes pass through as ASCII.
19    Ascii,
    // Selected by 0x1B 0x28 0x4A. Like Ascii, except 0x5C decodes to U+00A5
    // and 0x7E decodes to U+203E.
20    Roman,
    // Selected by 0x1B 0x28 0x49. Bytes 0x21..=0x5F decode to U+FF61 onwards.
21    Katakana,
    // Selected by 0x1B 0x24 0x40 or 0x1B 0x24 0x42. The next byte is the
    // first byte of a two-byte sequence.
22    LeadByte,
    // A lead byte has been stored in `lead`; the next byte completes the pair.
23    TrailByte,
    // 0x1B has been seen; the next byte must be 0x24 or 0x28.
24    EscapeStart,
    // 0x1B followed by 0x24 or 0x28 has been seen; the next byte decides
    // which state (if any) the escape sequence selects.
25    Escape,
26}
27
// Streaming decoder state for ISO-2022-JP.
28pub struct Iso2022JpDecoder {
    // Current state of the byte-level state machine.
29    decoder_state: Iso2022JpDecoderState,
    // State selected by the last complete escape sequence; the decoder falls
    // back to it when an escape sequence or two-byte sequence is abandoned.
30    output_state: Iso2022JpDecoderState, // only takes 1 of first 4 values
    // Either the lead byte of a two-byte sequence (TrailByte state) or the
    // second byte of an escape sequence (0x24 or 0x28); zero when unused.
31    lead: u8,
    // Set when an escape sequence has just completed and cleared when any
    // other byte is processed; used to flag back-to-back escape sequences.
32    output_flag: bool,
    // True when `lead` holds a byte from an abandoned escape sequence that
    // must be processed before any further input.
33    pending_prepended: bool,
34}
35
36impl Iso2022JpDecoder {
37    pub fn new() -> VariantDecoder {
38        VariantDecoder::Iso2022Jp(Iso2022JpDecoder {
39            decoder_state: Iso2022JpDecoderState::Ascii,
40            output_state: Iso2022JpDecoderState::Ascii,
41            lead: 0u8,
42            output_flag: false,
43            pending_prepended: false,
44        })
45    }
46
47    pub fn in_neutral_state(&self) -> bool {
48        self.decoder_state == Iso2022JpDecoderState::Ascii
49            && self.output_state == Iso2022JpDecoderState::Ascii
50            && self.lead == 0u8
51            && !self.output_flag
52            && !self.pending_prepended
53    }
54
55    fn extra_to_input_from_state(&self, byte_length: usize) -> Option<usize> {
56        byte_length.checked_add(
57            if self.lead == 0 || self.pending_prepended {
58                0
59            } else {
60                1
61            } + match self.decoder_state {
62                Iso2022JpDecoderState::Escape | Iso2022JpDecoderState::EscapeStart => 1,
63                _ => 0,
64            },
65        )
66    }
67
68    fn extra_to_output_from_state(&self) -> usize {
69        if self.lead != 0 && self.pending_prepended {
70            1 + self.output_flag as usize
71        } else {
72            self.output_flag as usize
73        }
74    }
75
76    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
77        checked_add(
78            self.extra_to_output_from_state(),
79            self.extra_to_input_from_state(byte_length),
80        )
81    }
82
83    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
84        // worst case: 1 to 3 (half-width katakana)
85        self.max_utf8_buffer_length(byte_length)
86    }
87
88    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
89        checked_mul(
90            3,
91            checked_add(
92                self.extra_to_output_from_state(),
93                self.extra_to_input_from_state(byte_length),
94            ),
95        )
96    }
97
    // Macro invocation that supplies this decoder's decoding logic.
    // NOTE(review): `decoder_functions!` is defined elsewhere in the crate.
    // The roles attributed to the four code blocks below are inferred from
    // their contents and from the existing comments; confirm against the
    // macro definition.
98    decoder_functions!(
        // Block 1: runs at the start of a call. If an escape sequence was
        // abandoned after its second byte, that byte (kept in `self.lead`)
        // is processed here before any new input.
99        {
100            if self.pending_prepended {
101                // lead was set in EscapeStart and "prepended"
102                // in Escape.
103                debug_assert!(self.lead == 0x24u8 || self.lead == 0x28u8);
104                match dest.check_space_bmp() {
105                    Space::Full(_) => {
                        // No room for the pending byte; the pending state is
                        // left untouched.
106                        return (DecoderResult::OutputFull, 0, 0);
107                    }
108                    Space::Available(destination_handle) => {
109                        self.pending_prepended = false;
110                        self.output_flag = false;
                        // `decoder_state` was reset to `output_state` when the
                        // byte became pending, so only the four output states
                        // are expected here.
111                        match self.decoder_state {
112                            Iso2022JpDecoderState::Ascii | Iso2022JpDecoderState::Roman => {
113                                destination_handle.write_ascii(self.lead);
114                                self.lead = 0x0u8;
115                            }
116                            Iso2022JpDecoderState::Katakana => {
117                                destination_handle
118                                    .write_upper_bmp(u16::from(self.lead) - 0x21u16 + 0xFF61u16);
119                                self.lead = 0x0u8;
120                            }
121                            Iso2022JpDecoderState::LeadByte => {
                                // The pending byte stays in `lead` and acts as
                                // the lead byte of a two-byte sequence.
122                                self.decoder_state = Iso2022JpDecoderState::TrailByte;
123                            }
124                            _ => unreachable!(),
125                        }
126                    }
127                }
128            }
129        },
        // Block 2: intentionally empty.
130        {},
        // Block 3: handles input that stops in the middle of a sequence
        // (presumably the macro's end-of-input hook -- TODO confirm). The
        // decoder falls back to the last selected output state and reports
        // the truncated sequence as malformed.
131        {
132            match self.decoder_state {
133                Iso2022JpDecoderState::TrailByte | Iso2022JpDecoderState::EscapeStart => {
134                    self.decoder_state = self.output_state;
135                    return (DecoderResult::Malformed(1, 0), src_consumed, dest.written());
136                }
137                Iso2022JpDecoderState::Escape => {
                    // The byte after 0x1B is kept in `lead` and will be
                    // processed by block 1 on the next call.
138                    self.pending_prepended = true;
139                    self.decoder_state = self.output_state;
140                    return (DecoderResult::Malformed(1, 1), src_consumed, dest.written());
141                }
142                _ => {}
143            }
144        },
        // Block 4: the state machine proper, executed for each input byte `b`.
145        {
146            match self.decoder_state {
147                Iso2022JpDecoderState::Ascii => {
148                    if b == 0x1Bu8 {
149                        self.decoder_state = Iso2022JpDecoderState::EscapeStart;
150                        continue;
151                    }
152                    self.output_flag = false;
                    // Non-ASCII bytes and 0x0E / 0x0F are rejected.
153                    if b > 0x7Fu8 || b == 0x0Eu8 || b == 0x0Fu8 {
154                        return (
155                            DecoderResult::Malformed(1, 0),
156                            unread_handle.consumed(),
157                            destination_handle.written(),
158                        );
159                    }
160                    destination_handle.write_ascii(b);
161                    continue;
162                }
163                Iso2022JpDecoderState::Roman => {
164                    if b == 0x1Bu8 {
165                        self.decoder_state = Iso2022JpDecoderState::EscapeStart;
166                        continue;
167                    }
168                    self.output_flag = false;
                    // The two positions that differ from ASCII:
                    // 0x5C -> U+00A5 and 0x7E -> U+203E.
169                    if b == 0x5Cu8 {
170                        destination_handle.write_mid_bmp(0x00A5u16);
171                        continue;
172                    }
173                    if b == 0x7Eu8 {
174                        destination_handle.write_upper_bmp(0x203Eu16);
175                        continue;
176                    }
177                    if b > 0x7Fu8 || b == 0x0Eu8 || b == 0x0Fu8 {
178                        return (
179                            DecoderResult::Malformed(1, 0),
180                            unread_handle.consumed(),
181                            destination_handle.written(),
182                        );
183                    }
184                    destination_handle.write_ascii(b);
185                    continue;
186                }
187                Iso2022JpDecoderState::Katakana => {
188                    if b == 0x1Bu8 {
189                        self.decoder_state = Iso2022JpDecoderState::EscapeStart;
190                        continue;
191                    }
192                    self.output_flag = false;
                    // 0x21..=0x5F map linearly onto U+FF61..=U+FF9F.
193                    if b >= 0x21u8 && b <= 0x5Fu8 {
194                        destination_handle.write_upper_bmp(u16::from(b) - 0x21u16 + 0xFF61u16);
195                        continue;
196                    }
197                    return (
198                        DecoderResult::Malformed(1, 0),
199                        unread_handle.consumed(),
200                        destination_handle.written(),
201                    );
202                }
203                Iso2022JpDecoderState::LeadByte => {
204                    if b == 0x1Bu8 {
205                        self.decoder_state = Iso2022JpDecoderState::EscapeStart;
206                        continue;
207                    }
208                    self.output_flag = false;
                    // A valid lead byte is remembered until the trail arrives.
209                    if b >= 0x21u8 && b <= 0x7Eu8 {
210                        self.lead = b;
211                        self.decoder_state = Iso2022JpDecoderState::TrailByte;
212                        continue;
213                    }
214                    return (
215                        DecoderResult::Malformed(1, 0),
216                        unread_handle.consumed(),
217                        destination_handle.written(),
218                    );
219                }
220                Iso2022JpDecoderState::TrailByte => {
221                    if b == 0x1Bu8 {
222                        self.decoder_state = Iso2022JpDecoderState::EscapeStart;
223                        // The byte in error is the previous
224                        // lead byte.
225                        return (
226                            DecoderResult::Malformed(1, 1),
227                            unread_handle.consumed(),
228                            destination_handle.written(),
229                        );
230                    }
                    // Whatever the outcome, the next byte is a lead byte again.
231                    self.decoder_state = Iso2022JpDecoderState::LeadByte;
232                    let jis0208_lead_minus_offset = self.lead - 0x21;
233                    let byte = b;
234                    let handle = destination_handle;
235                    // The code below uses else after continue in
236                    // order to retain the structure seen in EUC-JP.
                    // wrapping_sub: trail bytes below 0x21 wrap to large
                    // values and are caught by the range check further down.
237                    let trail_minus_offset = byte.wrapping_sub(0x21);
238                    // Fast-track Hiragana (60% according to Lunde)
239                    // and Katakana (10% according to Lunde).
240                    if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
241                        // Hiragana
242                        handle.write_upper_bmp(0x3041 + u16::from(trail_minus_offset));
243                        continue;
244                    } else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
245                        // Katakana
246                        handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset));
247                        continue;
248                    } else if trail_minus_offset > (0xFE - 0xA1) {
                        // Trail byte outside the 94-cell row.
249                        return (
250                            DecoderResult::Malformed(2, 0),
251                            unread_handle.consumed(),
252                            handle.written(),
253                        );
254                    } else {
                        // General case: compute a pointer from lead and trail
                        // and try each table in turn. wrapping_sub makes
                        // pointers below a table's start fail the `< len`
                        // test.
255                        let pointer =
256                            mul_94(jis0208_lead_minus_offset) + trail_minus_offset as usize;
257                        let level1_pointer = pointer.wrapping_sub(1410);
258                        if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
259                            handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer]);
260                            continue;
261                        } else {
262                            let level2_pointer = pointer.wrapping_sub(4418);
263                            if level2_pointer < JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
264                                handle.write_upper_bmp(
265                                    JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer],
266                                );
267                                continue;
268                            } else {
269                                let ibm_pointer = pointer.wrapping_sub(8272);
270                                if ibm_pointer < IBM_KANJI.len() {
271                                    handle.write_upper_bmp(IBM_KANJI[ibm_pointer]);
272                                    continue;
273                                } else if let Some(bmp) = jis0208_symbol_decode(pointer) {
274                                    handle.write_bmp_excl_ascii(bmp);
275                                    continue;
276                                } else if let Some(bmp) = jis0208_range_decode(pointer) {
277                                    handle.write_bmp_excl_ascii(bmp);
278                                    continue;
279                                } else {
                                    // Unassigned pointer: both bytes are
                                    // reported as malformed.
280                                    return (
281                                        DecoderResult::Malformed(2, 0),
282                                        unread_handle.consumed(),
283                                        handle.written(),
284                                    );
285                                }
286                            }
287                        }
288                    }
289                }
290                Iso2022JpDecoderState::EscapeStart => {
                    // Only 0x24 and 0x28 can continue an escape sequence.
291                    if b == 0x24u8 || b == 0x28u8 {
292                        self.lead = b;
293                        self.decoder_state = Iso2022JpDecoderState::Escape;
294                        continue;
295                    }
296                    self.output_flag = false;
297                    self.decoder_state = self.output_state;
                    // Note that this returns `unread()` rather than
                    // `consumed()`, unlike the other error paths.
298                    return (
299                        DecoderResult::Malformed(1, 0),
300                        unread_handle.unread(),
301                        destination_handle.written(),
302                    );
303                }
304                Iso2022JpDecoderState::Escape => {
                    // Map the second byte (`lead`) and third byte (`b`) of
                    // the escape sequence to the state they select.
305                    let mut state: Option<Iso2022JpDecoderState> = None;
306                    if self.lead == 0x28u8 && b == 0x42u8 {
307                        state = Some(Iso2022JpDecoderState::Ascii);
308                    } else if self.lead == 0x28u8 && b == 0x4Au8 {
309                        state = Some(Iso2022JpDecoderState::Roman);
310                    } else if self.lead == 0x28u8 && b == 0x49u8 {
311                        state = Some(Iso2022JpDecoderState::Katakana);
312                    } else if self.lead == 0x24u8 && (b == 0x40u8 || b == 0x42u8) {
313                        state = Some(Iso2022JpDecoderState::LeadByte);
314                    }
315                    match state {
316                        Some(s) => {
317                            self.lead = 0x0u8;
318                            self.decoder_state = s;
319                            self.output_state = s;
320                            let flag = self.output_flag;
321                            self.output_flag = true;
322                            if flag {
323                                // We had an escape sequence
324                                // immediately following another
325                                // escape sequence. Therefore,
326                                // the first one of these was
327                                // useless.
328                                return (
329                                    DecoderResult::Malformed(3, 3),
330                                    unread_handle.consumed(),
331                                    destination_handle.written(),
332                                );
333                            }
334                            continue;
335                        }
336                        None => {
337                            // self.lead is still the previous
338                            // byte. It will be processed in
339                            // the preamble upon next call.
340                            self.pending_prepended = true;
341                            self.output_flag = false;
342                            self.decoder_state = self.output_state;
343                            // The byte in error is not the
344                            // current or the previous byte but
345                            // the one before those (lone 0x1B).
346                            return (
347                                DecoderResult::Malformed(1, 1),
348                                unread_handle.unread(),
349                                destination_handle.written(),
350                            );
351                        }
352                    }
353                }
354            }
355        },
        // Identifiers used by the blocks above, followed by the destination
        // space-check method.
356        self,
357        src_consumed,
358        dest,
359        source,
360        b,
361        destination_handle,
362        unread_handle,
363        check_space_bmp
364    );
365}
366
/// Reports whether `bmp` has a JIS X 0208 kanji mapping
/// (variant used when the `fast-kanji-encode` feature is enabled).
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn is_kanji_mapped(bmp: u16) -> bool {
    // Only presence matters here, not the byte values, so the Shift_JIS
    // flavour of the lookup is as good as any.
    let encoded = jis0208_kanji_shift_jis_encode(bmp);
    encoded.is_some()
}
374
375#[cfg(not(feature = "fast-kanji-encode"))]
376#[cfg_attr(
377    feature = "cargo-clippy",
378    allow(if_let_redundant_pattern_matching, if_same_then_else)
379)]
380#[inline(always)]
381fn is_kanji_mapped(bmp: u16) -> bool {
382    if 0x4EDD == bmp {
383        true
384    } else if let Some(_) = jis0208_level1_kanji_shift_jis_encode(bmp) {
385        // Use the shift_jis variant, because we don't care about the
386        // byte values here.
387        true
388    } else if let Some(_) = jis0208_level2_and_additional_kanji_encode(bmp) {
389        true
390    } else if let Some(_) = position(&IBM_KANJI[..], bmp) {
391        true
392    } else {
393        false
394    }
395}
396
397#[cfg_attr(
398    feature = "cargo-clippy",
399    allow(if_let_redundant_pattern_matching, if_same_then_else)
400)]
401fn is_mapped_for_two_byte_encode(bmp: u16) -> bool {
402    // The code below uses else after return to
403    // keep the same structure as in EUC-JP.
404    // Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
405    let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
406    if bmp_minus_hiragana < 0x53 {
407        true
408    } else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
409        is_kanji_mapped(bmp)
410    } else {
411        let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
412        if bmp_minus_katakana < 0x56 {
413            true
414        } else {
415            let bmp_minus_space = bmp.wrapping_sub(0x3000);
416            if bmp_minus_space < 3 {
417                // fast-track common punctuation
418                true
419            } else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
420                true
421            } else if bmp == 0x2212 {
422                true
423            } else if let Some(_) = jis0208_range_encode(bmp) {
424                true
425            } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) || bmp == 0xF929 || bmp == 0xF9DC {
426                true
427            } else if let Some(_) = ibm_symbol_encode(bmp) {
428                true
429            } else if let Some(_) = jis0208_symbol_encode(bmp) {
430                true
431            } else {
432                false
433            }
434        }
435    }
436}
437
/// Encodes a kanji as an ISO-2022-JP (lead, trail) byte pair, or `None` if it
/// is unmapped (variant used when the `fast-kanji-encode` feature is enabled).
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    let lead_and_trail = jis0208_kanji_iso_2022_jp_encode(bmp);
    lead_and_trail
}
443
444#[cfg(not(feature = "fast-kanji-encode"))]
445#[inline(always)]
446fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
447    if 0x4EDD == bmp {
448        // Ideograph on the symbol row!
449        Some((0x21, 0xB8 - 0x80))
450    } else if let Some((lead, trail)) = jis0208_level1_kanji_iso_2022_jp_encode(bmp) {
451        Some((lead, trail))
452    } else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
453        let lead = (pos / 94) + (0xD0 - 0x80);
454        let trail = (pos % 94) + 0x21;
455        Some((lead as u8, trail as u8))
456    } else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
457        let lead = (pos / 94) + (0xF9 - 0x80);
458        let trail = (pos % 94) + 0x21;
459        Some((lead as u8, trail as u8))
460    } else {
461        None
462    }
463}
464
// Character set the encoder has most recently switched its output to.
465enum Iso2022JpEncoderState {
    // Initial state; re-entered by writing 0x1B 0x28 0x42.
466    Ascii,
    // Entered by writing 0x1B 0x28 0x4A; U+00A5 and U+203E are written as
    // 0x5C and 0x7E.
467    Roman,
    // Entered by writing 0x1B 0x24 0x42; characters are written as two bytes.
468    Jis0208,
469}
470
// Streaming encoder state for ISO-2022-JP.
471pub struct Iso2022JpEncoder {
    // Character set currently selected in the output stream.
472    state: Iso2022JpEncoderState,
473}
474
475impl Iso2022JpEncoder {
476    pub fn new(encoding: &'static Encoding) -> Encoder {
477        Encoder::new(
478            encoding,
479            VariantEncoder::Iso2022Jp(Iso2022JpEncoder {
480                state: Iso2022JpEncoderState::Ascii,
481            }),
482        )
483    }
484
485    pub fn has_pending_state(&self) -> bool {
486        match self.state {
487            Iso2022JpEncoderState::Ascii => false,
488            _ => true,
489        }
490    }
491
492    pub fn max_buffer_length_from_utf16_without_replacement(
493        &self,
494        u16_length: usize,
495    ) -> Option<usize> {
496        // Worst case: every other character is ASCII/Roman and every other
497        // JIS0208.
498        // Two UTF-16 input units:
499        // Transition to Roman: 3
500        // Roman/ASCII: 1
501        // Transition to JIS0208: 3
502        // JIS0208: 2
503        // End transition: 3
504        checked_add_opt(
505            checked_add(3, u16_length.checked_mul(4)),
506            checked_div(u16_length.checked_add(1), 2),
507        )
508    }
509
510    pub fn max_buffer_length_from_utf8_without_replacement(
511        &self,
512        byte_length: usize,
513    ) -> Option<usize> {
514        // Worst case: every other character is ASCII/Roman and every other
515        // JIS0208.
516        // Three UTF-8 input units: 1 ASCII, 2 JIS0208
517        // Transition to ASCII: 3
518        // Roman/ASCII: 1
519        // Transition to JIS0208: 3
520        // JIS0208: 2
521        // End transition: 3
522        checked_add(3, byte_length.checked_mul(3))
523    }
524
    // Macro invocation that supplies this encoder's encoding logic.
    // NOTE(review): `encoder_functions!` is defined elsewhere in the crate.
    // The roles attributed to the two code blocks below are inferred from
    // their contents; confirm against the macro definition.
525    encoder_functions!(
        // Block 1: if the encoder is not in the ASCII state, switch the output
        // back to ASCII by writing 0x1B 0x28 0x42 (presumably run when the
        // input is finished -- TODO confirm).
526        {
527            match self.state {
528                Iso2022JpEncoderState::Ascii => {}
529                _ => match dest.check_space_three() {
530                    Space::Full(dst_written) => {
531                        return (EncoderResult::OutputFull, src_consumed, dst_written);
532                    }
533                    Space::Available(destination_handle) => {
534                        self.state = Iso2022JpEncoderState::Ascii;
535                        destination_handle.write_three(0x1Bu8, 0x28u8, 0x42u8);
536                    }
537                },
538            }
539        },
        // Block 2: executed for each input character `c`. State changes write
        // a three-byte escape sequence and then un-read `c` so that it is
        // processed again in the new state.
540        {
541            match self.state {
542                Iso2022JpEncoderState::Ascii => {
                    // U+000E, U+000F and U+001B are never written; they are
                    // reported as unmappable U+FFFD rather than as themselves.
543                    if c == '\u{0E}' || c == '\u{0F}' || c == '\u{1B}' {
544                        return (
545                            EncoderResult::Unmappable('\u{FFFD}'),
546                            unread_handle.consumed(),
547                            destination_handle.written(),
548                        );
549                    }
550                    if c <= '\u{7F}' {
551                        destination_handle.write_one(c as u8);
552                        continue;
553                    }
                    // These two characters exist only in the Roman set.
554                    if c == '\u{A5}' || c == '\u{203E}' {
555                        self.state = Iso2022JpEncoderState::Roman;
556                        destination_handle.write_three(0x1Bu8, 0x28u8, 0x4Au8);
557                        unread_handle.unread();
558                        continue;
559                    }
                    // Characters above U+FFFF are always unmappable.
560                    if c > '\u{FFFF}' {
561                        return (
562                            EncoderResult::Unmappable(c),
563                            unread_handle.consumed(),
564                            destination_handle.written(),
565                        );
566                    }
567                    // Yes, if c is in index, we'll search
568                    // again in the Jis0208 state, but this
569                    // encoder is not worth optimizing.
570                    if is_mapped_for_two_byte_encode(c as u16) {
571                        self.state = Iso2022JpEncoderState::Jis0208;
572                        destination_handle.write_three(0x1Bu8, 0x24u8, 0x42u8);
573                        unread_handle.unread();
574                        continue;
575                    }
576                    return (
577                        EncoderResult::Unmappable(c),
578                        unread_handle.consumed(),
579                        destination_handle.written(),
580                    );
581                }
582                Iso2022JpEncoderState::Roman => {
583                    if c == '\u{0E}' || c == '\u{0F}' || c == '\u{1B}' {
584                        return (
585                            EncoderResult::Unmappable('\u{FFFD}'),
586                            unread_handle.consumed(),
587                            destination_handle.written(),
588                        );
589                    }
                    // U+005C and U+007E cannot be written in the Roman state
                    // (their byte values are used for U+00A5 / U+203E below),
                    // so switch back to ASCII first.
590                    if c == '\u{5C}' || c == '\u{7E}' {
591                        self.state = Iso2022JpEncoderState::Ascii;
592                        destination_handle.write_three(0x1Bu8, 0x28u8, 0x42u8);
593                        unread_handle.unread();
594                        continue;
595                    }
596                    if c <= '\u{7F}' {
597                        destination_handle.write_one(c as u8);
598                        continue;
599                    }
600                    if c == '\u{A5}' {
601                        destination_handle.write_one(0x5Cu8);
602                        continue;
603                    }
604                    if c == '\u{203E}' {
605                        destination_handle.write_one(0x7Eu8);
606                        continue;
607                    }
608                    if c > '\u{FFFF}' {
609                        return (
610                            EncoderResult::Unmappable(c),
611                            unread_handle.consumed(),
612                            destination_handle.written(),
613                        );
614                    }
615                    // Yes, if c is in index, we'll search
616                    // again in the Jis0208 state, but this
617                    // encoder is not worth optimizing.
618                    if is_mapped_for_two_byte_encode(c as u16) {
619                        self.state = Iso2022JpEncoderState::Jis0208;
620                        destination_handle.write_three(0x1Bu8, 0x24u8, 0x42u8);
621                        unread_handle.unread();
622                        continue;
623                    }
624                    return (
625                        EncoderResult::Unmappable(c),
626                        unread_handle.consumed(),
627                        destination_handle.written(),
628                    );
629                }
630                Iso2022JpEncoderState::Jis0208 => {
                    // Anything up to U+007F (including U+000E, U+000F and
                    // U+001B) switches back to ASCII and is handled there.
631                    if c <= '\u{7F}' {
632                        self.state = Iso2022JpEncoderState::Ascii;
633                        destination_handle.write_three(0x1Bu8, 0x28u8, 0x42u8);
634                        unread_handle.unread();
635                        continue;
636                    }
637                    if c == '\u{A5}' || c == '\u{203E}' {
638                        self.state = Iso2022JpEncoderState::Roman;
639                        destination_handle.write_three(0x1Bu8, 0x28u8, 0x4Au8);
640                        unread_handle.unread();
641                        continue;
642                    }
643                    if c > '\u{FFFF}' {
644                        // Transition to ASCII here in order
645                        // not to make it the responsibility
646                        // of the caller.
647                        self.state = Iso2022JpEncoderState::Ascii;
648                        return (
649                            EncoderResult::Unmappable(c),
650                            unread_handle.consumed(),
651                            destination_handle.write_three_return_written(0x1Bu8, 0x28u8, 0x42u8),
652                        );
653                    }
654                    let bmp = c as u16;
655                    let handle = destination_handle;
656                    // The code below uses else after continue to
657                    // keep the same structure as in EUC-JP.
658                    // Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
659                    let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
660                    if bmp_minus_hiragana < 0x53 {
                        // Hiragana: lead 0x24, trail offset from 0x21.
661                        handle.write_two(0x24, 0x21 + bmp_minus_hiragana as u8);
662                        continue;
663                    } else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
664                        if let Some((lead, trail)) = encode_kanji(bmp) {
665                            handle.write_two(lead, trail);
666                            continue;
667                        } else {
                            // Unmapped ideograph: return to ASCII as part of
                            // reporting the error.
668                            self.state = Iso2022JpEncoderState::Ascii;
669                            return (
670                                EncoderResult::Unmappable(c),
671                                unread_handle.consumed(),
672                                handle.write_three_return_written(0x1Bu8, 0x28u8, 0x42u8),
673                            );
674                        }
675                    } else {
676                        let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
677                        if bmp_minus_katakana < 0x56 {
                            // Katakana: lead 0x25, trail offset from 0x21.
678                            handle.write_two(0x25, 0x21 + bmp_minus_katakana as u8);
679                            continue;
680                        } else {
681                            let bmp_minus_space = bmp.wrapping_sub(0x3000);
682                            if bmp_minus_space < 3 {
683                                // fast-track common punctuation
684                                handle.write_two(0x21, 0x21 + bmp_minus_space as u8);
685                                continue;
686                            }
687                            let bmp_minus_half_width = bmp.wrapping_sub(0xFF61);
688                            if bmp_minus_half_width <= (0xFF9F - 0xFF61) {
689                                // We have half-width katakana. The lead is either
690                                // row 1 or 5 of JIS X 0208, so the lookup table
691                                // only stores the trail.
692                                let lead =
693                                    if bmp != 0xFF70 && in_inclusive_range16(bmp, 0xFF66, 0xFF9D) {
694                                        0x25u8
695                                    } else {
696                                        0x21u8
697                                    };
698                                let trail =
699                                    ISO_2022_JP_HALF_WIDTH_TRAIL[bmp_minus_half_width as usize];
700                                handle.write_two(lead, trail);
701                                continue;
702                            } else if bmp == 0x2212 {
703                                handle.write_two(0x21, 0x5D);
704                                continue;
705                            } else if let Some(pointer) = jis0208_range_encode(bmp) {
                                // Pointer -> row (94 cells per row) and cell,
                                // both offset by 0x21.
706                                let lead = (pointer / 94) + 0x21;
707                                let trail = (pointer % 94) + 0x21;
708                                handle.write_two(lead as u8, trail as u8);
709                                continue;
710                            } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
711                                || bmp == 0xF929
712                                || bmp == 0xF9DC
713                            {
714                                // Guaranteed to be found in IBM_KANJI
715                                let pos = position(&IBM_KANJI[..], bmp).unwrap();
716                                let lead = (pos / 94) + (0xF9 - 0x80);
717                                let trail = (pos % 94) + 0x21;
718                                handle.write_two(lead as u8, trail as u8);
719                                continue;
720                            } else if let Some(pointer) = ibm_symbol_encode(bmp) {
721                                let lead = (pointer / 94) + 0x21;
722                                let trail = (pointer % 94) + 0x21;
723                                handle.write_two(lead as u8, trail as u8);
724                                continue;
725                            } else if let Some(pointer) = jis0208_symbol_encode(bmp) {
726                                let lead = (pointer / 94) + 0x21;
727                                let trail = (pointer % 94) + 0x21;
728                                handle.write_two(lead as u8, trail as u8);
729                                continue;
730                            } else {
                                // Nothing matched: return to ASCII as part of
                                // reporting the error.
731                                self.state = Iso2022JpEncoderState::Ascii;
732                                return (
733                                    EncoderResult::Unmappable(c),
734                                    unread_handle.consumed(),
735                                    handle.write_three_return_written(0x1Bu8, 0x28u8, 0x42u8),
736                                );
737                            }
738                        }
739                    }
740                }
741            }
742        },
        // Identifiers used by the blocks above, followed by the destination
        // space-check method.
743        self,
744        src_consumed,
745        source,
746        dest,
747        c,
748        destination_handle,
749        unread_handle,
750        check_space_three
751    );
752}
753
754// Any copyright to the test code below this comment is dedicated to the
755// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
756
#[cfg(test)]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Forwards to the `decode` test helper with the `ISO_2022_JP` encoding.
    fn decode_iso_2022_jp(bytes: &[u8], expect: &str) {
        decode(ISO_2022_JP, bytes, expect);
    }

    /// Forwards to the `encode` test helper with the `ISO_2022_JP` encoding.
    fn encode_iso_2022_jp(string: &str, expect: &[u8]) {
        encode(ISO_2022_JP, string, expect);
    }

    /// Checks every `(input bytes, expected text)` pair, in table order.
    fn decode_each(cases: &[(&[u8], &str)]) {
        for &(bytes, expect) in cases {
            decode_iso_2022_jp(bytes, expect);
        }
    }

    /// Checks every `(input text, expected bytes)` pair, in table order.
    fn encode_each(cases: &[(&str, &[u8])]) {
        for &(string, expect) in cases {
            encode_iso_2022_jp(string, expect);
        }
    }

    #[test]
    fn test_iso_2022_jp_decode() {
        decode_each(&[
            // Empty
            (b"", ""),
            // ASCII
            (b"\x61\x62", "\u{0061}\u{0062}"),
            (b"\x7F\x0E\x0F", "\u{007F}\u{FFFD}\u{FFFD}"),
            // Partial escapes
            (b"\x1B", "\u{FFFD}"),
            (b"\x1B$", "\u{FFFD}$"),
            (b"\x1B(", "\u{FFFD}("),
            (b"\x1B.", "\u{FFFD}."),
            // ISO escapes
            (b"\x1B(B", ""),               // ASCII
            (b"\x1B(J", ""),               // Roman
            (b"\x1B$@", ""),               // JIS X 0208
            (b"\x1B$B", ""),               // JIS X 0208
            (b"\x1B$(D", "\u{FFFD}$(D"),   // JIS X 0212
            (b"\x1B$A", "\u{FFFD}$A"),     // GB2312
            (b"\x1B$(C", "\u{FFFD}$(C"),   // KR
            (b"\x1B.A", "\u{FFFD}.A"),     // Latin-1
            (b"\x1B.F", "\u{FFFD}.F"),     // Greek
            (b"\x1B(I", ""),               // Half-width Katakana
            (b"\x1B$(O", "\u{FFFD}$(O"),   // JIS X 0213
            (b"\x1B$(P", "\u{FFFD}$(P"),   // JIS X 0213
            (b"\x1B$(Q", "\u{FFFD}$(Q"),   // JIS X 0213
            (b"\x1B$)C", "\u{FFFD}$)C"),   // KR
            (b"\x1B$)A", "\u{FFFD}$)A"),   // GB2312
            (b"\x1B$)G", "\u{FFFD}$)G"),   // CNS
            (b"\x1B$*H", "\u{FFFD}$*H"),   // CNS
            (b"\x1B$)E", "\u{FFFD}$)E"),   // IR
            (b"\x1B$+I", "\u{FFFD}$+I"),   // CNS
            (b"\x1B$+J", "\u{FFFD}$+J"),   // CNS
            (b"\x1B$+K", "\u{FFFD}$+K"),   // CNS
            (b"\x1B$+L", "\u{FFFD}$+L"),   // CNS
            (b"\x1B$+M", "\u{FFFD}$+M"),   // CNS
            (b"\x1B$(@", "\u{FFFD}$(@"),   // JIS X 0208
            (b"\x1B$(A", "\u{FFFD}$(A"),   // GB2312
            (b"\x1B$(B", "\u{FFFD}$(B"),   // JIS X 0208
            (b"\x1B%G", "\u{FFFD}%G"),     // UTF-8
            // ASCII, both in the initial state and after an explicit escape
            (b"\x5B", "\u{005B}"),
            (b"\x5C", "\u{005C}"),
            (b"\x7E", "\u{007E}"),
            (b"\x0E", "\u{FFFD}"),
            (b"\x0F", "\u{FFFD}"),
            (b"\x80", "\u{FFFD}"),
            (b"\xFF", "\u{FFFD}"),
            (b"\x1B(B\x5B", "\u{005B}"),
            (b"\x1B(B\x5C", "\u{005C}"),
            (b"\x1B(B\x7E", "\u{007E}"),
            (b"\x1B(B\x0E", "\u{FFFD}"),
            (b"\x1B(B\x0F", "\u{FFFD}"),
            (b"\x1B(B\x80", "\u{FFFD}"),
            (b"\x1B(B\xFF", "\u{FFFD}"),
            // Roman
            (b"\x1B(J\x5B", "\u{005B}"),
            (b"\x1B(J\x5C", "\u{00A5}"),
            (b"\x1B(J\x7E", "\u{203E}"),
            (b"\x1B(J\x0E", "\u{FFFD}"),
            (b"\x1B(J\x0F", "\u{FFFD}"),
            (b"\x1B(J\x80", "\u{FFFD}"),
            (b"\x1B(J\xFF", "\u{FFFD}"),
            // Katakana
            (b"\x1B(I\x20", "\u{FFFD}"),
            (b"\x1B(I\x21", "\u{FF61}"),
            (b"\x1B(I\x5F", "\u{FF9F}"),
            (b"\x1B(I\x60", "\u{FFFD}"),
            (b"\x1B(I\x0E", "\u{FFFD}"),
            (b"\x1B(I\x0F", "\u{FFFD}"),
            (b"\x1B(I\x80", "\u{FFFD}"),
            (b"\x1B(I\xFF", "\u{FFFD}"),
            // 0208 differences from 1978 to 1983
            (b"\x1B$@\x54\x64", "\u{58FA}"),
            (b"\x1B$@\x44\x5B", "\u{58F7}"),
            (b"\x1B$@\x74\x21", "\u{582F}"),
            (b"\x1B$@\x36\x46", "\u{5C2D}"),
            (b"\x1B$@\x28\x2E", "\u{250F}"),
            (b"\x1B$B\x54\x64", "\u{58FA}"),
            (b"\x1B$B\x44\x5B", "\u{58F7}"),
            (b"\x1B$B\x74\x21", "\u{582F}"),
            (b"\x1B$B\x36\x46", "\u{5C2D}"),
            (b"\x1B$B\x28\x2E", "\u{250F}"),
            // Broken 0208
            (b"\x1B$B\x28\x41", "\u{FFFD}"),
            (b"\x1B$@\x80\x54\x64", "\u{FFFD}\u{58FA}"),
            (b"\x1B$B\x28\x80", "\u{FFFD}"),
        ]);

        if cfg!(miri) {
            // Miri is too slow
            return;
        }

        decode_each(&[
            // Transitions out of ASCII
            (b"\x1B(B\x5C\x1B(J\x5C", "\u{005C}\u{00A5}"),
            (b"\x1B(B\x5C\x1B(I\x21", "\u{005C}\u{FF61}"),
            (b"\x1B(B\x5C\x1B$@\x54\x64", "\u{005C}\u{58FA}"),
            (b"\x1B(B\x5C\x1B$B\x54\x64", "\u{005C}\u{58FA}"),
            // Transitions out of Roman
            (b"\x1B(J\x5C\x1B(B\x5C", "\u{00A5}\u{005C}"),
            (b"\x1B(J\x5C\x1B(I\x21", "\u{00A5}\u{FF61}"),
            (b"\x1B(J\x5C\x1B$@\x54\x64", "\u{00A5}\u{58FA}"),
            (b"\x1B(J\x5C\x1B$B\x54\x64", "\u{00A5}\u{58FA}"),
            // Transitions out of Katakana
            (b"\x1B(I\x21\x1B(J\x5C", "\u{FF61}\u{00A5}"),
            (b"\x1B(I\x21\x1B(B\x5C", "\u{FF61}\u{005C}"),
            (b"\x1B(I\x21\x1B$@\x54\x64", "\u{FF61}\u{58FA}"),
            (b"\x1B(I\x21\x1B$B\x54\x64", "\u{FF61}\u{58FA}"),
            // Transitions out of 0208 entered with ESC $ @
            (b"\x1B$@\x54\x64\x1B(J\x5C", "\u{58FA}\u{00A5}"),
            (b"\x1B$@\x54\x64\x1B(I\x21", "\u{58FA}\u{FF61}"),
            (b"\x1B$@\x54\x64\x1B(B\x5C", "\u{58FA}\u{005C}"),
            (b"\x1B$@\x54\x64\x1B$B\x54\x64", "\u{58FA}\u{58FA}"),
            // Transitions out of 0208 entered with ESC $ B
            (b"\x1B$B\x54\x64\x1B(J\x5C", "\u{58FA}\u{00A5}"),
            (b"\x1B$B\x54\x64\x1B(I\x21", "\u{58FA}\u{FF61}"),
            (b"\x1B$B\x54\x64\x1B$@\x54\x64", "\u{58FA}\u{58FA}"),
            (b"\x1B$B\x54\x64\x1B(B\x5C", "\u{58FA}\u{005C}"),
            // Empty transitions: two escapes with nothing between them
            (b"\x1B(B\x1B(J", "\u{FFFD}"),
            (b"\x1B(B\x1B(I", "\u{FFFD}"),
            (b"\x1B(B\x1B$@", "\u{FFFD}"),
            (b"\x1B(B\x1B$B", "\u{FFFD}"),
            (b"\x1B(J\x1B(B", "\u{FFFD}"),
            (b"\x1B(J\x1B(I", "\u{FFFD}"),
            (b"\x1B(J\x1B$@", "\u{FFFD}"),
            (b"\x1B(J\x1B$B", "\u{FFFD}"),
            (b"\x1B(I\x1B(J", "\u{FFFD}"),
            (b"\x1B(I\x1B(B", "\u{FFFD}"),
            (b"\x1B(I\x1B$@", "\u{FFFD}"),
            (b"\x1B(I\x1B$B", "\u{FFFD}"),
            (b"\x1B$@\x1B(J", "\u{FFFD}"),
            (b"\x1B$@\x1B(I", "\u{FFFD}"),
            (b"\x1B$@\x1B(B", "\u{FFFD}"),
            (b"\x1B$@\x1B$B", "\u{FFFD}"),
            (b"\x1B$B\x1B(J", "\u{FFFD}"),
            (b"\x1B$B\x1B(I", "\u{FFFD}"),
            (b"\x1B$B\x1B$@", "\u{FFFD}"),
            (b"\x1B$B\x1B(B", "\u{FFFD}"),
            // Transitions to self
            (b"\x1B(B\x5C\x1B(B\x5C", "\u{005C}\u{005C}"),
            (b"\x1B(J\x5C\x1B(J\x5C", "\u{00A5}\u{00A5}"),
            (b"\x1B(I\x21\x1B(I\x21", "\u{FF61}\u{FF61}"),
            (b"\x1B$@\x54\x64\x1B$@\x54\x64", "\u{58FA}\u{58FA}"),
            (b"\x1B$B\x54\x64\x1B$B\x54\x64", "\u{58FA}\u{58FA}"),
        ]);
    }

    #[test]
    fn test_iso_2022_jp_encode() {
        // Miri is too slow for the longer tables, so those are gated.
        let run_slow_cases = !cfg!(miri);

        encode_each(&[
            // Empty
            ("", b""),
            // ASCII
            ("ab", b"ab"),
            ("\u{1F4A9}", b"&#128169;"),
            ("\x1B", b"&#65533;"),
            ("\x0E", b"&#65533;"),
            ("\x0F", b"&#65533;"),
            // Roman
            ("a\u{00A5}b", b"a\x1B(J\x5Cb\x1B(B"),
            ("a\u{203E}b", b"a\x1B(J\x7Eb\x1B(B"),
        ]);
        if run_slow_cases {
            encode_each(&[
                ("a\u{00A5}b\x5C", b"a\x1B(J\x5Cb\x1B(B\x5C"),
                ("a\u{203E}b\x7E", b"a\x1B(J\x7Eb\x1B(B\x7E"),
                ("\u{00A5}\u{1F4A9}", b"\x1B(J\x5C&#128169;\x1B(B"),
                ("\u{00A5}\x1B", b"\x1B(J\x5C&#65533;\x1B(B"),
                ("\u{00A5}\x0E", b"\x1B(J\x5C&#65533;\x1B(B"),
                ("\u{00A5}\x0F", b"\x1B(J\x5C&#65533;\x1B(B"),
                ("\u{00A5}\u{58FA}", b"\x1B(J\x5C\x1B$B\x54\x64\x1B(B"),
            ]);
        }

        // Half-width Katakana
        encode_each(&[
            ("\u{FF61}", b"\x1B$B\x21\x23\x1B(B"),
            ("\u{FF65}", b"\x1B$B\x21\x26\x1B(B"),
        ]);
        if run_slow_cases {
            encode_each(&[
                ("\u{FF66}", b"\x1B$B\x25\x72\x1B(B"),
                ("\u{FF70}", b"\x1B$B\x21\x3C\x1B(B"),
                ("\u{FF9D}", b"\x1B$B\x25\x73\x1B(B"),
                ("\u{FF9E}", b"\x1B$B\x21\x2B\x1B(B"),
                ("\u{FF9F}", b"\x1B$B\x21\x2C\x1B(B"),
            ]);
        }

        // 0208
        encode_each(&[
            ("\u{58FA}", b"\x1B$B\x54\x64\x1B(B"),
            ("\u{58FA}\u{250F}", b"\x1B$B\x54\x64\x28\x2E\x1B(B"),
        ]);
        if run_slow_cases {
            encode_each(&[
                ("\u{58FA}\u{1F4A9}", b"\x1B$B\x54\x64\x1B(B&#128169;"),
                ("\u{58FA}\x1B", b"\x1B$B\x54\x64\x1B(B&#65533;"),
                ("\u{58FA}\x0E", b"\x1B$B\x54\x64\x1B(B&#65533;"),
                ("\u{58FA}\x0F", b"\x1B$B\x54\x64\x1B(B&#65533;"),
                ("\u{58FA}\u{00A5}", b"\x1B$B\x54\x64\x1B(J\x5C\x1B(B"),
                ("\u{58FA}a", b"\x1B$B\x54\x64\x1B(Ba"),
            ]);
        }
    }

    /// Decodes the bundled input fixture in one shot and compares it with the
    /// bundled reference text; the fixture is expected to contain errors.
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_iso_2022_jp_decode_all() {
        let encoded = include_bytes!("test_data/iso_2022_jp_in.txt");
        let reference = include_str!("test_data/iso_2022_jp_in_ref.txt");
        let (decoded, had_errors) = ISO_2022_JP.decode_without_bom_handling(encoded);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&decoded[..], reference);
    }

    /// Encodes the bundled text fixture in one shot and compares it with the
    /// bundled reference bytes; no unmappable characters are expected and the
    /// reported output encoding must still be ISO-2022-JP.
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_iso_2022_jp_encode_all() {
        let text = include_str!("test_data/iso_2022_jp_out.txt");
        let reference = include_bytes!("test_data/iso_2022_jp_out_ref.txt");
        let (encoded, used_encoding, had_errors) = ISO_2022_JP.encode(text);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(used_encoding, ISO_2022_JP);
        assert_eq!(&encoded[..], &reference[..]);
    }

    /// After the half-width katakana escape has been consumed in one call, a
    /// destination sized by `max_utf8_buffer_length_without_replacement(1)`
    /// must be able to hold the three UTF-8 bytes produced by the next byte.
    #[test]
    fn test_iso_2022_jp_half_width_katakana_length() {
        let mut output = [0u8; 20];
        let mut decoder = ISO_2022_JP.new_decoder();

        // First call: only the escape sequence, nothing is written yet.
        let (escape_result, escape_read, escape_written) =
            decoder.decode_to_utf8_without_replacement(b"\x1B\x28\x49", &mut output, false);
        assert_eq!(escape_result, DecoderResult::InputEmpty);
        assert_eq!(escape_read, 3);
        assert_eq!(escape_written, 0);

        // Second call: one katakana byte into a buffer of exactly the
        // advertised size.
        let needed = decoder
            .max_utf8_buffer_length_without_replacement(1)
            .unwrap();
        let (kana_result, kana_read, kana_written) =
            decoder.decode_to_utf8_without_replacement(b"\x21", &mut output[..needed], true);
        assert_eq!(kana_result, DecoderResult::InputEmpty);
        assert_eq!(kana_read, 1);
        assert_eq!(kana_written, 3);
        assert_eq!(&output[..3], &[0xEFu8, 0xBD, 0xA1][..]);
    }

    /// With a lone ESC pending from a previous call, a destination sized by
    /// `max_utf16_buffer_length(1)` must fit both the U+FFFD for the broken
    /// escape and the following character.
    #[test]
    fn test_iso_2022_jp_length_after_escape() {
        let mut output = [0u16; 20];
        let mut decoder = ISO_2022_JP.new_decoder();

        // First call: just the ESC byte; no output and no error yet.
        let (esc_result, esc_read, esc_written, esc_had_errors) =
            decoder.decode_to_utf16(b"\x1B", &mut output, false);
        assert_eq!(esc_result, CoderResult::InputEmpty);
        assert_eq!(esc_read, 1);
        assert_eq!(esc_written, 0);
        assert!(!esc_had_errors);

        // Second call: a byte that does not continue the escape.
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (tail_result, tail_read, tail_written, tail_had_errors) =
            decoder.decode_to_utf16(b"A", &mut output[..needed], true);
        assert_eq!(tail_result, CoderResult::InputEmpty);
        assert_eq!(tail_read, 1);
        assert_eq!(tail_written, 2);
        assert!(tail_had_errors);
        assert_eq!(&output[..2], &[0xFFFDu16, 0x0041][..]);
    }

    /// Two unpaired low surrogates in UTF-16 input each come out as the
    /// numeric character reference for U+FFFD.
    #[test]
    fn test_iso_2022_jp_encode_from_two_low_surrogates() {
        let expectation = b"&#65533;&#65533;";
        let source = [0xDC00u16, 0xDEDEu16];
        let mut output = [0u8; 40];
        let mut encoder = ISO_2022_JP.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&source, &mut output[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&output[..written], expectation);
    }
}