encoding_rs/
euc_kr.rs

// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
9
10use super::*;
11use crate::data::*;
12use crate::handles::*;
13use crate::variant::*;
14// Rust 1.14.0 requires the following despite the asterisk above.
15use super::in_inclusive_range16;
16use super::in_range16;
17
/// Decoder state for EUC-KR.
///
/// The only state is `lead`; the decoder is in its neutral state exactly
/// when `lead` is `None` (see `in_neutral_state`).
pub struct EucKrDecoder {
    // Pending lead value, if any. While it is `Some`, the buffer-length
    // estimates count one extra input byte.
    // NOTE(review): presumably this holds the lead of a two-byte sequence that
    // was split across input buffers; the code that sets it lives in the
    // decoder macro, which is not visible here -- confirm there.
    lead: Option<u8>,
}
21
impl EucKrDecoder {
    /// Creates a decoder with no pending lead byte, wrapped in the
    /// `VariantDecoder::EucKr` variant.
    pub fn new() -> VariantDecoder {
        VariantDecoder::EucKr(EucKrDecoder { lead: None })
    }

    /// Returns `true` when no lead byte is pending (`self.lead` is `None`).
    pub fn in_neutral_state(&self) -> bool {
        self.lead.is_none()
    }

    /// Returns `byte_length` plus one if a lead byte is pending, otherwise
    /// `byte_length` unchanged. Yields `None` if the addition overflows
    /// `usize`.
    fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
        byte_length.checked_add(match self.lead {
            None => 0,
            Some(_) => 1,
        })
    }

    /// Upper bound for the number of UTF-16 code units needed to decode
    /// `byte_length` more input bytes: the input length, plus one if a lead
    /// byte is pending. `None` on overflow.
    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
        self.plus_one_if_lead(byte_length)
    }

    /// Upper bound for the number of UTF-8 bytes needed to decode
    /// `byte_length` more input bytes when malformed input is not replaced.
    ///
    /// With `len` being the input length adjusted for a pending lead, the
    /// result is `len + (len + 1) / 2 + 2`.
    // NOTE(review): `checked_add`, `checked_add_opt` and `checked_div` are
    // helpers defined elsewhere; they are assumed to be overflow-checked
    // arithmetic that propagates `None` -- confirm against their definitions.
    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
        // worst case: 2 to 3 (two input bytes become three UTF-8 bytes),
        // hence the 1.5x factor in the formula below
        let len = self.plus_one_if_lead(byte_length);
        checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
    }

    /// Upper bound for the number of UTF-8 bytes needed to decode
    /// `byte_length` more input bytes: three times the input length adjusted
    /// for a pending lead.
    // NOTE(review): `checked_mul` is a helper defined elsewhere, assumed to
    // propagate `None` on overflow -- confirm.
    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
        checked_mul(3, self.plus_one_if_lead(byte_length))
    }

    // The macro below is defined elsewhere. It receives two code blocks (lead
    // handling and trail handling) followed by the identifiers those blocks
    // use.
    // NOTE(review): how the macro expands these blocks is not visible in this
    // file; the comments below describe only what the blocks themselves do.
    ascii_compatible_two_byte_decoder_functions!(
        {
            // Lead handling.
            // If lead is between 0x81 and 0xFE, inclusive,
            // subtract offset 0x81.
            // The wrapping subtraction turns values below 0x81 into large
            // numbers, so the single comparison below rejects everything
            // outside 0x81..=0xFE (assuming `non_ascii` is a `u8`).
            let non_ascii_minus_offset =
                non_ascii.wrapping_sub(0x81);
            if non_ascii_minus_offset > (0xFE - 0x81) {
                return (DecoderResult::Malformed(1, 0),
                        source.consumed(),
                        handle.written());
            }
            // The block evaluates to the lead minus 0x81.
            non_ascii_minus_offset
        },
        {
            // Trail handling. `lead_minus_offset` is the lead byte minus 0x81
            // and `byte` is the trail byte.
            if lead_minus_offset >= 0x20 {
                // Not the extension range above KS X 1001
                // (i.e. the lead byte is 0xA1 or greater).
                let trail_minus_offset =
                    byte.wrapping_sub(0xA1);
                if trail_minus_offset <= (0xFE - 0xA1) {
                    // KS X 1001
                    // Trail is in 0xA1..=0xFE. `ksx_pointer` is the row-major
                    // index into a grid with 94 columns per lead, counted from
                    // lead 0xA1.
                    // NOTE(review): `mul_94` is assumed to multiply by 94 and
                    // yield `usize` -- confirm.
                    let ksx_pointer = mul_94(lead_minus_offset - 0x20) + trail_minus_offset as usize;
                    // Index relative to the first Hangul row (lead 0xB0).
                    // For earlier rows the wrapping subtraction produces a
                    // huge value that fails the length check.
                    let hangul_pointer = ksx_pointer.wrapping_sub((0x2F - 0x20) * 94);
                    if hangul_pointer < KSX1001_HANGUL.len() {
                        let upper_bmp = KSX1001_HANGUL[hangul_pointer];
                        handle.write_upper_bmp(upper_bmp)
                    } else if ksx_pointer < KSX1001_SYMBOLS.len() {
                        // The symbol table covers the start of the grid.
                        let bmp = KSX1001_SYMBOLS[ksx_pointer];
                        handle.write_bmp_excl_ascii(bmp)
                    } else {
                        // Index relative to the first Hanja row (lead 0xCA).
                        let hanja_pointer = ksx_pointer.wrapping_sub((0x49 - 0x20) * 94);
                        if hanja_pointer < KSX1001_HANJA.len() {
                            let upper_bmp = KSX1001_HANJA[hanja_pointer];
                            handle.write_upper_bmp(upper_bmp)
                        } else if (lead_minus_offset == 0x27) && ((trail_minus_offset as usize) < KSX1001_UPPERCASE.len()) {
                            // Lead 0xA8: looked up in the uppercase table,
                            // indexed directly by the trail offset.
                            let mid_bmp = KSX1001_UPPERCASE[trail_minus_offset as usize];
                            // A zero entry is treated as unassigned.
                            if mid_bmp == 0 {
                                return (DecoderResult::Malformed(2, 0),
                                        unread_handle_trail.consumed(),
                                        handle.written());
                            }
                            handle.write_mid_bmp(mid_bmp)
                        } else if (lead_minus_offset == 0x28) && ((trail_minus_offset as usize) < KSX1001_LOWERCASE.len()) {
                            // Lead 0xA9: looked up in the lowercase table.
                            let mid_bmp = KSX1001_LOWERCASE[trail_minus_offset as usize];
                            handle.write_mid_bmp(mid_bmp)
                        } else if (lead_minus_offset == 0x25) && ((trail_minus_offset as usize) < KSX1001_BOX.len()) {
                            // Lead 0xA6: looked up in the box-drawing table.
                            let upper_bmp = KSX1001_BOX[trail_minus_offset as usize];
                            handle.write_upper_bmp(upper_bmp)
                        } else {
                            // Everything else goes through
                            // `ksx1001_other_decode`, with the pointer taken
                            // relative to lead 0xA3 (two rows into the grid).
                            let other_pointer = ksx_pointer.wrapping_sub(2 * 94);
                            if other_pointer < 0x039F {
                                let bmp = ksx1001_other_decode(other_pointer as u16);
                                // ASCII range means unassigned
                                if bmp < 0x80 {
                                    return (DecoderResult::Malformed(2, 0),
                                            unread_handle_trail.consumed(),
                                            handle.written());
                                }
                                handle.write_bmp_excl_ascii(bmp)
                            } else {
                                return (DecoderResult::Malformed(2, 0),
                                        unread_handle_trail.consumed(),
                                        handle.written());
                            }
                        }
                    }
                } else {
                    // Extension range to the left of
                    // KS X 1001
                    let left_lead = lead_minus_offset - 0x20;
                    // Valid trails come in three runs that are packed into a
                    // contiguous 0..84 index:
                    //   0x81..=0xA0 -> 52..=83
                    //   0x61..=0x7A -> 26..=51
                    //   0x41..=0x5A ->  0..=25
                    let left_trail = if byte.wrapping_sub(0x40 + 0x41) < (0x60 - 0x40) {
                        byte - (12 + 0x41)
                    } else if byte.wrapping_sub(0x20 + 0x41) < (0x3A - 0x20) {
                        byte - (6 + 0x41)
                    } else if byte.wrapping_sub(0x41) < 0x1A {
                        byte - 0x41
                    } else {
                        // Invalid trail. An ASCII trail is unread and only
                        // the lead is reported as malformed; otherwise both
                        // bytes are reported. (The unit tests in this file
                        // expect an ASCII trail after a bad sequence to be
                        // decoded as itself.)
                        if byte < 0x80 {
                            return (DecoderResult::Malformed(1, 0),
                                    unread_handle_trail.unread(),
                                    handle.written());
                        }
                        return (DecoderResult::Malformed(2, 0),
                                unread_handle_trail.consumed(),
                                handle.written());
                    };
                    // 84 (= 190 - 94 - 12) trail slots per lead in this range.
                    let left_pointer = ((left_lead as usize) * (190 - 94 - 12)) + left_trail as usize;
                    // Pointers at or beyond this limit are not assigned.
                    if left_pointer < (0x45 - 0x20) * (190 - 94 - 12) + 0x12 {
                        let upper_bmp = cp949_left_hangul_decode(left_pointer as u16);
                        handle.write_upper_bmp(upper_bmp)
                    } else {
                        // Same malformed-sequence handling as for an invalid
                        // trail above.
                        if byte < 0x80 {
                            return (DecoderResult::Malformed(1, 0),
                                    unread_handle_trail.unread(),
                                    handle.written());
                        }
                        return (DecoderResult::Malformed(2, 0),
                                unread_handle_trail.consumed(),
                                handle.written());
                    }
                }
            } else {
                // Extension range above KS X 1001
                // (lead bytes 0x81..=0xA0). Valid trails are packed into a
                // contiguous 0..178 index:
                //   0x81..=0xFE -> 52..=177
                //   0x61..=0x7A -> 26..=51
                //   0x41..=0x5A ->  0..=25
                let top_trail = if byte.wrapping_sub(0x40 + 0x41) < (0xBE - 0x40) {
                    byte - (12 + 0x41)
                } else if byte.wrapping_sub(0x20 + 0x41) < (0x3A - 0x20) {
                    byte - (6 + 0x41)
                } else if byte.wrapping_sub(0x41) < 0x1A {
                    byte - 0x41
                } else {
                    // Invalid trail: an ASCII trail is unread and only the
                    // lead is reported; otherwise both bytes are reported.
                    if byte < 0x80 {
                        return (DecoderResult::Malformed(1, 0),
                                unread_handle_trail.unread(),
                                handle.written());
                    }
                    return (DecoderResult::Malformed(2, 0),
                            unread_handle_trail.consumed(),
                            handle.written());
                };
                // 178 (= 190 - 12) trail slots per lead in this range.
                let top_pointer = ((lead_minus_offset as usize) * (190 - 12)) + top_trail as usize;
                let upper_bmp = cp949_top_hangul_decode(top_pointer as u16);
                handle.write_upper_bmp(upper_bmp)
            }
        },
        // Identifiers used by the two blocks above, handed to the macro.
        // NOTE(review): the meaning of the last four arguments (the loop
        // label, the two destination method names and the trailing `true`)
        // is defined by the macro -- confirm against its definition.
        self,
        non_ascii,
        byte,
        lead_minus_offset,
        unread_handle_trail,
        source,
        handle,
        'outermost,
        copy_ascii_from_check_space_bmp,
        check_space_bmp,
        true);
}
187
188fn ksx1001_encode_misc(bmp: u16) -> Option<(usize, usize)> {
189    if in_inclusive_range16(bmp, 0x3000, 0x3015) {
190        if let Some(pos) = position(&KSX1001_SYMBOLS[..(0xAB - 0x60)], bmp) {
191            return Some((0xA1, pos + 0xA1));
192        }
193    }
194    if let Some(other_pointer) = ksx1001_other_encode(bmp) {
195        let other_lead = ((other_pointer as usize) / 94) + (0x81 + 0x22);
196        let other_trail = ((other_pointer as usize) % 94) + 0xA1;
197        return Some((other_lead, other_trail));
198    }
199    if in_range16(bmp, 0x00AA, 0x0168) {
200        // Latin
201        if let Some(pos) = position(&KSX1001_LOWERCASE[..], bmp) {
202            return Some((0x81 + 0x28, 0xA1 + pos));
203        }
204        if let Some(pos) = position(&KSX1001_UPPERCASE[..], bmp) {
205            return Some((0x81 + 0x27, 0xA1 + pos));
206        }
207    } else if in_range16(bmp, 0x2500, 0x254C) {
208        if let Some(pos) = position(&KSX1001_BOX[..], bmp) {
209            return Some((0x81 + 0x25, 0xA1 + pos));
210        }
211    }
212    if in_inclusive_range16(bmp, 0x2015, 0x266D)
213        || in_inclusive_range16(bmp, 0x321C, 0x33D8)
214        || in_inclusive_range16(bmp, 0xFF3C, 0xFFE5)
215        || in_inclusive_range16(bmp, 0x00A1, 0x00F7)
216        || in_inclusive_range16(bmp, 0x02C7, 0x02DD)
217    {
218        if let Some(pos) = position(&KSX1001_SYMBOLS[3..], bmp) {
219            if pos < (94 - 3) {
220                return Some((0xA1, pos + 0xA1 + 3));
221            }
222            return Some((0xA2, pos - (94 - 3) + 0xA1));
223        }
224    }
225    None
226}
227
228#[cfg(not(feature = "fast-hangul-encode"))]
229#[inline(always)]
230fn ksx1001_encode_hangul(bmp: u16, _: u16) -> (u8, u8) {
231    match KSX1001_HANGUL.binary_search(&bmp) {
232        Ok(ksx_hangul_pointer) => {
233            let ksx_hangul_lead = (ksx_hangul_pointer / 94) + (0x81 + 0x2F);
234            let ksx_hangul_trail = (ksx_hangul_pointer % 94) + 0xA1;
235            (ksx_hangul_lead as u8, ksx_hangul_trail as u8)
236        }
237        Err(_) => {
238            let (lead, cp949_trail) = if bmp < 0xC8A5 {
239                // Above KS X 1001
240                let top_pointer = cp949_top_hangul_encode(bmp) as usize;
241                let top_lead = (top_pointer / (190 - 12)) + 0x81;
242                let top_trail = top_pointer % (190 - 12);
243                (top_lead as u8, top_trail as u8)
244            } else {
245                // To the left of KS X 1001
246                let left_pointer = cp949_left_hangul_encode(bmp) as usize;
247                let left_lead = (left_pointer / (190 - 94 - 12)) + (0x81 + 0x20);
248                let left_trail = left_pointer % (190 - 94 - 12);
249                (left_lead as u8, left_trail as u8)
250            };
251            let offset = if cp949_trail >= (0x40 - 12) {
252                0x41 + 12
253            } else if cp949_trail >= (0x20 - 6) {
254                0x41 + 6
255            } else {
256                0x41
257            };
258            (lead as u8, (cp949_trail + offset) as u8)
259        }
260    }
261}
262
/// Encodes a Hangul syllable as an EUC-KR `(lead, trail)` byte pair (variant
/// used when the `fast-hangul-encode` feature is on).
///
/// The code point itself is ignored; only its offset from U+AC00 is used,
/// and it is passed straight to `cp949_hangul_encode`.
#[cfg(feature = "fast-hangul-encode")]
#[inline(always)]
fn ksx1001_encode_hangul(_bmp: u16, hangul_offset: u16) -> (u8, u8) {
    cp949_hangul_encode(hangul_offset)
}
268
269#[cfg(not(feature = "fast-hanja-encode"))]
270#[inline(always)]
271fn ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)> {
272    if let Some(hanja_pointer) = position(&KSX1001_HANJA[..], bmp) {
273        let hanja_lead = (hanja_pointer / 94) + (0x81 + 0x49);
274        let hanja_trail = (hanja_pointer % 94) + 0xA1;
275        Some((hanja_lead as u8, hanja_trail as u8))
276    } else {
277        None
278    }
279}
280
/// Encodes a Hanja code point as an EUC-KR `(lead, trail)` byte pair (variant
/// used when the `fast-hanja-encode` feature is on).
///
/// Code points at or above U+F900 go to the compatibility encoder, which
/// always yields a pair; everything below goes to the unified encoder, which
/// may yield `None`.
#[cfg(feature = "fast-hanja-encode")]
#[inline(always)]
fn ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)> {
    let is_compatibility = bmp >= 0xF900;
    if is_compatibility {
        Some(ksx1001_compatibility_hangul_encode(bmp))
    } else {
        ksx1001_unified_hangul_encode(bmp)
    }
}
290
/// EUC-KR encoder. A unit struct: it has no fields and therefore carries no
/// state of its own.
pub struct EucKrEncoder;
292
293impl EucKrEncoder {
294    pub fn new(encoding: &'static Encoding) -> Encoder {
295        Encoder::new(encoding, VariantEncoder::EucKr(EucKrEncoder))
296    }
297
298    pub fn max_buffer_length_from_utf16_without_replacement(
299        &self,
300        u16_length: usize,
301    ) -> Option<usize> {
302        u16_length.checked_mul(2)
303    }
304
305    pub fn max_buffer_length_from_utf8_without_replacement(
306        &self,
307        byte_length: usize,
308    ) -> Option<usize> {
309        byte_length.checked_add(1)
310    }
311
312    ascii_compatible_bmp_encoder_functions!(
313        {
314            let bmp_minus_hangul_start = bmp.wrapping_sub(0xAC00);
315            let (lead, trail) = if bmp_minus_hangul_start < (0xD7A4 - 0xAC00) {
316                // Hangul
317                ksx1001_encode_hangul(bmp, bmp_minus_hangul_start)
318            } else if in_range16(bmp, 0x33DE, 0xFF01) {
319                // Vast range that includes no other
320                // mappables except Hangul (already
321                // processed) and Hanja.
322                // Narrow the range further to Unified and
323                // Compatibility ranges of Hanja.
324                if in_range16(bmp, 0x4E00, 0x9F9D) || in_range16(bmp, 0xF900, 0xFA0C) {
325                    if let Some((hanja_lead, hanja_trail)) = ksx1001_encode_hanja(bmp) {
326                        (hanja_lead, hanja_trail)
327                    } else {
328                        return (
329                            EncoderResult::unmappable_from_bmp(bmp),
330                            source.consumed(),
331                            handle.written(),
332                        );
333                    }
334                } else {
335                    return (
336                        EncoderResult::unmappable_from_bmp(bmp),
337                        source.consumed(),
338                        handle.written(),
339                    );
340                }
341            } else if let Some((lead, trail)) = ksx1001_encode_misc(bmp) {
342                (lead as u8, trail as u8)
343            } else {
344                return (
345                    EncoderResult::unmappable_from_bmp(bmp),
346                    source.consumed(),
347                    handle.written(),
348                );
349            };
350            handle.write_two(lead, trail)
351        },
352        bmp,
353        self,
354        source,
355        handle,
356        copy_ascii_to_check_space_two,
357        check_space_two,
358        true
359    );
360}
361
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
364
#[cfg(test)]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Asserts that decoding `bytes` as EUC-KR yields `expect`.
    fn decode_euc_kr(bytes: &[u8], expect: &str) {
        decode(EUC_KR, bytes, expect);
    }

    /// Asserts that encoding `string` as EUC-KR yields `expect`.
    fn encode_euc_kr(string: &str, expect: &[u8]) {
        encode(EUC_KR, string, expect);
    }

    #[test]
    fn test_euc_kr_decode() {
        let cases: [(&[u8], &str); 10] = [
            // Empty
            (b"", ""),
            // ASCII
            (b"\x61\x62", "\u{0061}\u{0062}"),
            // Two-byte sequences, valid and malformed
            (b"\x81\x41", "\u{AC02}"),
            (b"\x81\x5B", "\u{FFFD}\x5B"),
            (b"\xFD\xFE", "\u{8A70}"),
            (b"\xFE\x41", "\u{FFFD}\x41"),
            (b"\xFF\x41", "\u{FFFD}\x41"),
            (b"\x80\x41", "\u{FFFD}\x41"),
            (b"\xA1\xFF", "\u{FFFD}"),
            (b"\x81\xFF", "\u{FFFD}"),
        ];
        for &(bytes, expect) in cases.iter() {
            decode_euc_kr(bytes, expect);
        }
    }

    #[test]
    fn test_euc_kr_encode() {
        let cases: [(&str, &[u8]); 4] = [
            // Empty
            ("", b""),
            // ASCII
            ("\u{0061}\u{0062}", b"\x61\x62"),
            // Hangul from the extension range and a Hanja
            ("\u{AC02}", b"\x81\x41"),
            ("\u{8A70}", b"\xFD\xFE"),
        ];
        for &(string, expect) in cases.iter() {
            encode_euc_kr(string, expect);
        }
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_euc_kr_decode_all() {
        let encoded = include_bytes!("test_data/euc_kr_in.txt");
        let reference = include_str!("test_data/euc_kr_in_ref.txt");
        let (decoded, had_errors) = EUC_KR.decode_without_bom_handling(encoded);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&decoded[..], reference);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_euc_kr_encode_all() {
        let text = include_str!("test_data/euc_kr_out.txt");
        let reference = include_bytes!("test_data/euc_kr_out_ref.txt");
        let (encoded, output_encoding, had_errors) = EUC_KR.encode(text);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(output_encoding, EUC_KR);
        assert_eq!(&encoded[..], &reference[..]);
    }

    #[test]
    fn test_euc_kr_encode_from_two_low_surrogates() {
        // Each unpaired low surrogate is expected to come out as a numeric
        // character reference for U+FFFD.
        let expected = b"&#65533;&#65533;";
        let unpaired = [0xDC00u16, 0xDEDEu16];
        let mut buffer = [0u8; 40];
        let mut encoder = EUC_KR.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&unpaired, &mut buffer[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expected.len());
        assert!(had_errors);
        assert_eq!(&buffer[..written], expected);
    }
}