encoding_rs/
shift_jis.rs

1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use super::*;
11use crate::data::*;
12use crate::handles::*;
13use crate::variant::*;
14// Rust 1.14.0 requires the following despite the asterisk above.
15use super::in_inclusive_range;
16use super::in_inclusive_range16;
17
/// Decoder state for Shift_JIS.
pub struct ShiftJisDecoder {
    /// Stored lead byte, if any. `None` is what `in_neutral_state` reports
    /// as the neutral state.
    /// NOTE(review): presumably the first byte of a two-byte sequence whose
    /// trail byte has not been seen yet -- confirm against the decoder macro.
    lead: Option<u8>,
}
21
22impl ShiftJisDecoder {
23    pub fn new() -> VariantDecoder {
24        VariantDecoder::ShiftJis(ShiftJisDecoder { lead: None })
25    }
26
27    pub fn in_neutral_state(&self) -> bool {
28        self.lead.is_none()
29    }
30
31    fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
32        byte_length.checked_add(match self.lead {
33            None => 0,
34            Some(_) => 1,
35        })
36    }
37
    /// Upper bound on the number of UTF-16 code units needed to decode
    /// `byte_length` input bytes: the input length, plus one if a lead byte
    /// is stored. Returns `None` on `usize` overflow.
    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
        self.plus_one_if_lead(byte_length)
    }
41
    /// Upper bound on the number of UTF-8 bytes needed to decode
    /// `byte_length` input bytes without replacement. Uses the same bound as
    /// `max_utf8_buffer_length`. Returns `None` on `usize` overflow.
    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
        // Worst case: one input byte becomes three UTF-8 bytes
        // (half-width katakana).
        self.max_utf8_buffer_length(byte_length)
    }
46
47    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
48        checked_mul(3, self.plus_one_if_lead(byte_length))
49    }
50
    // Invokes `ascii_compatible_two_byte_decoder_functions!` (defined
    // elsewhere) with two code blocks followed by the identifiers those
    // blocks use. The first block classifies a non-ASCII byte; the second
    // combines the resulting lead value with the byte that follows it.
    // NOTE(review): which functions the macro generates, and the meaning of
    // the trailing arguments, are not visible in this file -- see the macro
    // definition.
    ascii_compatible_two_byte_decoder_functions!(
        {
            // If lead is between 0x81 and 0x9F, inclusive,
            // subtract offset 0x81. Else if lead is
            // between 0xE0 and 0xFC, inclusive, subtract
            // offset 0xC1. Else if lead is between
            // 0xA1 and 0xDF, inclusive, map to half-width
            // Katakana. Else if lead is 0x80, pass through.
            //
            // Each range test below is a wrapping subtraction of the range
            // start followed by one unsigned comparison against the range
            // width.
            let mut non_ascii_minus_offset =
                non_ascii.wrapping_sub(0x81);
            if non_ascii_minus_offset > (0x9F - 0x81) {
                let non_ascii_minus_range_start = non_ascii.wrapping_sub(0xE0);
                if non_ascii_minus_range_start > (0xFC - 0xE0) {
                    let non_ascii_minus_half_with_katakana_start = non_ascii.wrapping_sub(0xA1);
                    if non_ascii_minus_half_with_katakana_start > (0xDF - 0xA1) {
                        if non_ascii == 0x80 {
                            // 0x80 is written out as U+0080.
                            handle.write_mid_bmp(0x80);
                            // Not caring about optimizing subsequent non-ASCII
                            continue 'outermost;
                        }
                        // Not a lead byte, not half-width Katakana and not
                        // 0x80 (e.g. 0xA0): report a malformed sequence.
                        return (DecoderResult::Malformed(1, 0),
                                source.consumed(),
                                handle.written());
                    }
                    // Single-byte half-width Katakana: 0xA1..=0xDF maps
                    // linearly onto U+FF61..=U+FF9F.
                    handle.write_upper_bmp(0xFF61 + u16::from(non_ascii_minus_half_with_katakana_start));
                    // Not caring about optimizing subsequent non-ASCII
                    continue 'outermost;
                }
                // Here the byte is in 0xE0..=0xFC, so the plain subtraction
                // cannot underflow.
                non_ascii_minus_offset = non_ascii - 0xC1;
            }
            // Value of this block: the lead byte minus its range offset.
            non_ascii_minus_offset
        },
        {
            // If trail is between 0x40 and 0x7E, inclusive,
            // subtract offset 0x40. Else if trail is
            // between 0x80 and 0xFC, inclusive, subtract
            // offset 0x41.
            // Fast-track Hiragana (60% according to Lunde)
            // and Katakana (10% according to Lunde).
            // Hiragana doesn't cross 0x7F, but Katakana does.
            // We can check for Hiragana before normalizing
            // trail.
            let trail_minus_hiragana = byte.wrapping_sub(0x9F);
            if lead_minus_offset == 0x01 && trail_minus_hiragana < 0x53 {
                // Hiragana
                handle.write_upper_bmp(0x3041 + u16::from(trail_minus_hiragana))
            } else {
                let mut trail_minus_offset =
                    byte.wrapping_sub(0x40);
                if trail_minus_offset > (0x7E - 0x40) {
                    let trail_minus_range_start =
                        byte.wrapping_sub(0x80);
                    if trail_minus_range_start > (0xFC - 0x80) {
                        // The byte is outside both trail ranges. When it is
                        // below 0x80 the trail is unread and a length of 1 is
                        // reported; otherwise the trail counts as consumed
                        // and a length of 2 is reported.
                        if byte < 0x80 {
                            return (DecoderResult::Malformed(1, 0),
                                    unread_handle_trail.unread(),
                                    handle.written());
                        }
                        return (DecoderResult::Malformed(2, 0),
                                unread_handle_trail.consumed(),
                                handle.written());
                    }
                    // Here the byte is in 0x80..=0xFC, so the plain
                    // subtraction cannot underflow.
                    trail_minus_offset = byte - 0x41;
                }
                if lead_minus_offset == 0x02 &&
                   trail_minus_offset < 0x56 {
                    // Katakana
                    handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
                } else {
                    // Combine lead and trail into one pointer; each lead
                    // value spans 188 trail values.
                    let pointer = lead_minus_offset as usize *
                                  188usize +
                                  trail_minus_offset as usize;
                    // Try each table in turn. A wrapping subtraction of the
                    // table's first pointer followed by a length comparison
                    // checks both bounds at once.
                    let level1_pointer = pointer.wrapping_sub(1410);
                    if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
                        handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
                    } else {
                        let level2_pointer = pointer.wrapping_sub(4418);
                        if level2_pointer <
                           JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
                            handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer])
                        } else {
                            // IBM_KANJI is reachable from two pointer ranges:
                            // one starting at 10744 and one starting at 8272.
                            let upper_ibm_pointer = pointer.wrapping_sub(10744);
                            if upper_ibm_pointer < IBM_KANJI.len() {
                                handle.write_upper_bmp(IBM_KANJI[upper_ibm_pointer])
                            } else {
                                let lower_ibm_pointer = pointer.wrapping_sub(8272);
                                if lower_ibm_pointer < IBM_KANJI.len() {
                                    handle.write_upper_bmp(IBM_KANJI[lower_ibm_pointer])
                                } else if in_inclusive_range(pointer, 8836, 10715) {
                                    // Pointers 8836..=10715 map linearly onto
                                    // U+E000..=U+E757.
                                    handle.write_upper_bmp((0xE000 - 8836 + pointer) as u16)
                                } else if let Some(bmp) = jis0208_symbol_decode(pointer) {
                                    handle.write_bmp_excl_ascii(bmp)
                                } else if let Some(bmp) = jis0208_range_decode(pointer) {
                                    handle.write_bmp_excl_ascii(bmp)
                                } else {
                                    // No table maps this pointer. Same error
                                    // reporting as for an out-of-range trail
                                    // byte above.
                                    if byte < 0x80 {
                                        return (DecoderResult::Malformed(1, 0),
                                                unread_handle_trail.unread(),
                                                handle.written());
                                    }
                                    return (DecoderResult::Malformed(2, 0),
                                            unread_handle_trail.consumed(),
                                            handle.written());
                                }
                            }
                        }
                    }
                }
            }
        },
        self,
        non_ascii,
        byte,
        lead_minus_offset,
        unread_handle_trail,
        source,
        handle,
        'outermost,
        copy_ascii_from_check_space_bmp,
        check_space_bmp,
        false);
172}
173
/// Encodes the code point `bmp` as a Shift_JIS `(lead, trail)` byte pair, or
/// returns `None` when there is no mapping.
///
/// This variant is compiled when the `fast-kanji-encode` feature is enabled
/// and delegates entirely to `jis0208_kanji_shift_jis_encode`.
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    jis0208_kanji_shift_jis_encode(bmp)
}
179
180#[cfg(not(feature = "fast-kanji-encode"))]
181#[inline(always)]
182fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
183    if let Some((lead, trail)) = jis0208_level1_kanji_shift_jis_encode(bmp) {
184        return Some((lead, trail));
185    }
186    let pointer = if 0x4EDD == bmp {
187        // Ideograph on the symbol row!
188        23
189    } else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
190        4418 + pos
191    } else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
192        10744 + pos
193    } else {
194        return None;
195    };
196    let lead = pointer / 188;
197    let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize };
198    let trail = pointer % 188;
199    let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize };
200    Some(((lead + lead_offset) as u8, (trail + trail_offset) as u8))
201}
202
/// Encoder for Shift_JIS. A unit struct: it has no fields.
pub struct ShiftJisEncoder;
204
205impl ShiftJisEncoder {
206    pub fn new(encoding: &'static Encoding) -> Encoder {
207        Encoder::new(encoding, VariantEncoder::ShiftJis(ShiftJisEncoder))
208    }
209
210    pub fn max_buffer_length_from_utf16_without_replacement(
211        &self,
212        u16_length: usize,
213    ) -> Option<usize> {
214        u16_length.checked_mul(2)
215    }
216
    /// Upper bound on the output size, in bytes, for encoding `byte_length`
    /// bytes of UTF-8 input without replacement: the input length plus one.
    /// Returns `None` on `usize` overflow.
    ///
    /// NOTE(review): the reason for the one extra byte is not visible in
    /// this file -- confirm against the encoder macro and its space checks.
    pub fn max_buffer_length_from_utf8_without_replacement(
        &self,
        byte_length: usize,
    ) -> Option<usize> {
        byte_length.checked_add(1)
    }
223
    // Invokes `ascii_compatible_bmp_encoder_functions!` (defined elsewhere)
    // with one code block that encodes a single code point `bmp`, followed
    // by the identifiers that block uses.
    // NOTE(review): which functions the macro generates, and the meaning of
    // the trailing arguments, are not visible in this file -- see the macro
    // definition.
    ascii_compatible_bmp_encoder_functions!(
        {
            // Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
            // so the checks below are ordered by that expected frequency.
            let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
            if bmp_minus_hiragana < 0x53 {
                // Hiragana: lead 0x82, trail counted up from 0x9F.
                handle.write_two(0x82, 0x9F + bmp_minus_hiragana as u8)
            } else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
                // Kanji range: handled by `encode_kanji`.
                if let Some((lead, trail)) = encode_kanji(bmp) {
                    handle.write_two(lead, trail)
                } else {
                    return (
                        EncoderResult::unmappable_from_bmp(bmp),
                        source.consumed(),
                        handle.written(),
                    );
                }
            } else {
                let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
                if bmp_minus_katakana < 0x56 {
                    // Katakana: lead 0x83. The trail offset switches from
                    // 0x40 to 0x41 at index 0x3F so that trail byte 0x7F is
                    // skipped.
                    let trail_offset = if bmp_minus_katakana < 0x3F {
                        0x40
                    } else {
                        0x41
                    };
                    handle.write_two(0x83, (trail_offset + bmp_minus_katakana) as u8)
                } else {
                    let bmp_minus_space = bmp.wrapping_sub(0x3000);
                    if bmp_minus_space < 3 {
                        // fast-track common punctuation
                        // (U+3000..=U+3002 -> 0x81 0x40..=0x42)
                        handle.write_two(0x81, 0x40 + bmp_minus_space as u8)
                    } else if bmp == 0xA5 {
                        // U+00A5 is written as the single byte 0x5C.
                        handle.write_one(0x5Cu8)
                    } else if bmp == 0x80 {
                        // U+0080 is written as the single byte 0x80.
                        handle.write_one(0x80u8)
                    } else if bmp == 0x203E {
                        // U+203E is written as the single byte 0x7E.
                        handle.write_one(0x7Eu8)
                    } else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
                        // Half-width range: U+FF61..=U+FF9F maps linearly
                        // onto the single bytes 0xA1..=0xDF.
                        handle.write_one((bmp - (0xFF61 - 0xA1)) as u8)
                    } else if bmp == 0x2212 {
                        // U+2212 is written as 0x81 0x7C.
                        handle.write_two(0x81u8, 0x7Cu8)
                    } else {
                        // Everything else is resolved to a pointer first and
                        // converted to lead/trail bytes at the end.
                        let bmp_minus_roman = bmp.wrapping_sub(0x2170);
                        let pointer = if bmp_minus_roman <= (0x2179 - 0x2170) {
                            // U+2170..=U+2179 map to pointers starting at 10716.
                            10716 + bmp_minus_roman as usize
                        } else if let Some(pointer) = jis0208_range_encode(bmp) {
                            pointer
                        } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
                            || bmp == 0xF929
                            || bmp == 0xF9DC
                        {
                            // Guaranteed to be found in IBM_KANJI
                            // (so the `unwrap` below is not expected to
                            // panic).
                            let pos = position(&IBM_KANJI[..], bmp).unwrap();
                            10744 + pos
                        } else if let Some(pointer) = jis0208_symbol_encode(bmp) {
                            pointer
                        } else {
                            return (
                                EncoderResult::unmappable_from_bmp(bmp),
                                source.consumed(),
                                handle.written(),
                            );
                        };
                        // Split the pointer into lead and trail indices (188
                        // trails per lead). Lead indices below 0x1F land in
                        // 0x81..=0x9F and the rest continue from 0xE0. Trail
                        // indices below 0x3F land in 0x40..=0x7E and the rest
                        // continue from 0x80.
                        let lead = pointer / 188;
                        let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize };
                        let trail = pointer % 188;
                        let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize };
                        handle.write_two((lead + lead_offset) as u8, (trail + trail_offset) as u8)
                    }
                }
            }
        },
        bmp,
        self,
        source,
        handle,
        copy_ascii_to_check_space_two,
        check_space_two,
        false
    );
303}
304
305// Any copyright to the test code below this comment is dedicated to the
306// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
307
308#[cfg(test)]
309mod tests {
310    use super::super::testing::*;
311    use super::super::*;
312
    /// Runs the shared `decode` test helper with the `SHIFT_JIS` encoding,
    /// passing `bytes` as input and `expect` as the expected text.
    fn decode_shift_jis(bytes: &[u8], expect: &str) {
        decode(SHIFT_JIS, bytes, expect);
    }
316
    /// Runs the shared `encode` test helper with the `SHIFT_JIS` encoding,
    /// passing `string` as input and `expect` as the expected bytes.
    fn encode_shift_jis(string: &str, expect: &[u8]) {
        encode(SHIFT_JIS, string, expect);
    }
320
321    #[test]
322    fn test_shift_jis_decode() {
323        // Empty
324        decode_shift_jis(b"", &"");
325
326        // ASCII
327        decode_shift_jis(b"\x61\x62", "\u{0061}\u{0062}");
328
329        // Half-width
330        decode_shift_jis(b"\xA1", "\u{FF61}");
331        decode_shift_jis(b"\xDF", "\u{FF9F}");
332        decode_shift_jis(b"\xA0", "\u{FFFD}");
333        decode_shift_jis(b"\xE0", "\u{FFFD}");
334        decode_shift_jis(b"\xA0+", "\u{FFFD}+");
335        decode_shift_jis(b"\xE0+", "\u{FFFD}+");
336
337        // EUDC
338        decode_shift_jis(b"\xF0\x40", "\u{E000}");
339        decode_shift_jis(b"\xF9\xFC", "\u{E757}");
340        decode_shift_jis(b"\xEF\xFC", "\u{FFFD}");
341        decode_shift_jis(b"\xFA\x40", "\u{2170}");
342
343        // JIS 0208
344        decode_shift_jis(b"\x81\x40", "\u{3000}");
345        decode_shift_jis(b"\x81\x3F", "\u{FFFD}?");
346        decode_shift_jis(b"\xEE\xFC", "\u{FF02}");
347        decode_shift_jis(b"\xEE\xFD", "\u{FFFD}");
348        decode_shift_jis(b"\xFA\x40", "\u{2170}");
349        decode_shift_jis(b"\xFA\x3F", "\u{FFFD}?");
350        decode_shift_jis(b"\xFC\x4B", "\u{9ED1}");
351        decode_shift_jis(b"\xFC\x4C", "\u{FFFD}L");
352        //
353    }
354
355    #[test]
356    fn test_shift_jis_encode() {
357        // Empty
358        encode_shift_jis("", b"");
359
360        // ASCII
361        encode_shift_jis("\u{0061}\u{0062}", b"\x61\x62");
362
363        // Exceptional code points
364        encode_shift_jis("\u{0080}", b"\x80");
365        encode_shift_jis("\u{00A5}", b"\x5C");
366        encode_shift_jis("\u{203E}", b"\x7E");
367        encode_shift_jis("\u{2212}", b"\x81\x7C");
368
369        // Half-width
370        encode_shift_jis("\u{FF61}", b"\xA1");
371        encode_shift_jis("\u{FF9F}", b"\xDF");
372
373        // EUDC
374        encode_shift_jis("\u{E000}", b"&#57344;");
375        encode_shift_jis("\u{E757}", b"&#59223;");
376
377        // JIS 0212
378        encode_shift_jis("\u{02D8}", b"&#728;");
379
380        // JIS 0208
381        encode_shift_jis("\u{3000}", b"\x81\x40");
382        encode_shift_jis("\u{FF02}", b"\xFA\x57");
383        encode_shift_jis("\u{2170}", b"\xFA\x40");
384        encode_shift_jis("\u{9ED1}", b"\xFC\x4B");
385    }
386
387    #[test]
388    #[cfg_attr(miri, ignore)] // Miri is too slow
389    fn test_shift_jis_decode_all() {
390        let input = include_bytes!("test_data/shift_jis_in.txt");
391        let expectation = include_str!("test_data/shift_jis_in_ref.txt");
392        let (cow, had_errors) = SHIFT_JIS.decode_without_bom_handling(input);
393        assert!(had_errors, "Should have had errors.");
394        assert_eq!(&cow[..], expectation);
395    }
396
397    #[test]
398    #[cfg_attr(miri, ignore)] // Miri is too slow
399    fn test_shift_jis_encode_all() {
400        let input = include_str!("test_data/shift_jis_out.txt");
401        let expectation = include_bytes!("test_data/shift_jis_out_ref.txt");
402        let (cow, encoding, had_errors) = SHIFT_JIS.encode(input);
403        assert!(!had_errors, "Should not have had errors.");
404        assert_eq!(encoding, SHIFT_JIS);
405        assert_eq!(&cow[..], &expectation[..]);
406    }
407
408    #[test]
409    fn test_shift_jis_half_width_katakana_length() {
410        let mut output = [0u8; 20];
411        let mut decoder = SHIFT_JIS.new_decoder();
412        {
413            let needed = decoder
414                .max_utf8_buffer_length_without_replacement(1)
415                .unwrap();
416            let (result, read, written) =
417                decoder.decode_to_utf8_without_replacement(b"\xA1", &mut output[..needed], true);
418            assert_eq!(result, DecoderResult::InputEmpty);
419            assert_eq!(read, 1);
420            assert_eq!(written, 3);
421            assert_eq!(output[0], 0xEF);
422            assert_eq!(output[1], 0xBD);
423            assert_eq!(output[2], 0xA1);
424        }
425    }
426}