encoding_rs/
utf_16.rs

// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
9
10use super::*;
11use crate::handles::*;
12use crate::variant::*;
13
/// State carried by the UTF-16 decoder between calls, so that a code unit
/// or a surrogate pair split across input buffers can be completed later.
pub struct Utf16Decoder {
    /// Byte order: `true` for big-endian, `false` for little-endian.
    be: bool,
    /// First byte of a code unit whose second byte has not been seen yet.
    lead_byte: Option<u8>,
    /// If non-zero and `pending_bmp` is `false`, a pending lead surrogate.
    lead_surrogate: u16,
    /// If `true`, `lead_surrogate` actually holds a pending BMP code unit.
    pending_bmp: bool,
}
20
21impl Utf16Decoder {
22    pub fn new(big_endian: bool) -> VariantDecoder {
23        VariantDecoder::Utf16(Utf16Decoder {
24            lead_surrogate: 0,
25            lead_byte: None,
26            be: big_endian,
27            pending_bmp: false,
28        })
29    }
30
31    pub fn additional_from_state(&self) -> usize {
32        1 + if self.lead_byte.is_some() { 1 } else { 0 }
33            + if self.lead_surrogate == 0 { 0 } else { 2 }
34    }
35
36    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
37        checked_add(
38            1,
39            checked_div(byte_length.checked_add(self.additional_from_state()), 2),
40        )
41    }
42
43    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
44        checked_add(
45            1,
46            checked_mul(
47                3,
48                checked_div(byte_length.checked_add(self.additional_from_state()), 2),
49            ),
50        )
51    }
52
53    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
54        checked_add(
55            1,
56            checked_mul(
57                3,
58                checked_div(byte_length.checked_add(self.additional_from_state()), 2),
59            ),
60        )
61    }
62
63    decoder_functions!(
64        {
65            if self.pending_bmp {
66                match dest.check_space_bmp() {
67                    Space::Full(_) => {
68                        return (DecoderResult::OutputFull, 0, 0);
69                    }
70                    Space::Available(destination_handle) => {
71                        destination_handle.write_bmp(self.lead_surrogate);
72                        self.pending_bmp = false;
73                        self.lead_surrogate = 0;
74                    }
75                }
76            }
77        },
78        {
79            // This is the fast path. The rest runs only at the
80            // start and end for partial sequences.
81            if self.lead_byte.is_none() && self.lead_surrogate == 0 {
82                if let Some((read, written)) = if self.be {
83                    dest.copy_utf16_from::<BigEndian>(&mut source)
84                } else {
85                    dest.copy_utf16_from::<LittleEndian>(&mut source)
86                } {
87                    return (DecoderResult::Malformed(2, 0), read, written);
88                }
89            }
90        },
91        {
92            debug_assert!(!self.pending_bmp);
93            if self.lead_surrogate != 0 || self.lead_byte.is_some() {
94                // We need to check space without intent to write in order to
95                // make sure that there is space for the replacement character.
96                match dest.check_space_bmp() {
97                    Space::Full(_) => {
98                        return (DecoderResult::OutputFull, 0, 0);
99                    }
100                    Space::Available(_) => {
101                        if self.lead_surrogate != 0 {
102                            self.lead_surrogate = 0;
103                            match self.lead_byte {
104                                None => {
105                                    return (
106                                        DecoderResult::Malformed(2, 0),
107                                        src_consumed,
108                                        dest.written(),
109                                    );
110                                }
111                                Some(_) => {
112                                    self.lead_byte = None;
113                                    return (
114                                        DecoderResult::Malformed(3, 0),
115                                        src_consumed,
116                                        dest.written(),
117                                    );
118                                }
119                            }
120                        }
121                        debug_assert!(self.lead_byte.is_some());
122                        self.lead_byte = None;
123                        return (DecoderResult::Malformed(1, 0), src_consumed, dest.written());
124                    }
125                }
126            }
127        },
128        {
129            match self.lead_byte {
130                None => {
131                    self.lead_byte = Some(b);
132                    continue;
133                }
134                Some(lead) => {
135                    self.lead_byte = None;
136                    let code_unit = if self.be {
137                        u16::from(lead) << 8 | u16::from(b)
138                    } else {
139                        u16::from(b) << 8 | u16::from(lead)
140                    };
141                    let high_bits = code_unit & 0xFC00u16;
142                    if high_bits == 0xD800u16 {
143                        // high surrogate
144                        if self.lead_surrogate != 0 {
145                            // The previous high surrogate was in
146                            // error and this one becomes the new
147                            // pending one.
148                            self.lead_surrogate = code_unit as u16;
149                            return (
150                                DecoderResult::Malformed(2, 2),
151                                unread_handle.consumed(),
152                                destination_handle.written(),
153                            );
154                        }
155                        self.lead_surrogate = code_unit;
156                        continue;
157                    }
158                    if high_bits == 0xDC00u16 {
159                        // low surrogate
160                        if self.lead_surrogate == 0 {
161                            return (
162                                DecoderResult::Malformed(2, 0),
163                                unread_handle.consumed(),
164                                destination_handle.written(),
165                            );
166                        }
167                        destination_handle.write_surrogate_pair(self.lead_surrogate, code_unit);
168                        self.lead_surrogate = 0;
169                        continue;
170                    }
171                    // bmp
172                    if self.lead_surrogate != 0 {
173                        // The previous high surrogate was in
174                        // error and this code unit becomes a
175                        // pending BMP character.
176                        self.lead_surrogate = code_unit;
177                        self.pending_bmp = true;
178                        return (
179                            DecoderResult::Malformed(2, 2),
180                            unread_handle.consumed(),
181                            destination_handle.written(),
182                        );
183                    }
184                    destination_handle.write_bmp(code_unit);
185                    continue;
186                }
187            }
188        },
189        self,
190        src_consumed,
191        dest,
192        source,
193        b,
194        destination_handle,
195        unread_handle,
196        check_space_astral
197    );
198}
199
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
202
#[cfg(test)]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Runs the shared `decode_without_padding` check with `UTF_16LE`.
    fn decode_utf_16le(bytes: &[u8], expect: &str) {
        let encoding = UTF_16LE;
        decode_without_padding(encoding, bytes, expect);
    }

    /// Runs the shared `decode_without_padding` check with `UTF_16BE`.
    fn decode_utf_16be(bytes: &[u8], expect: &str) {
        let encoding = UTF_16BE;
        decode_without_padding(encoding, bytes, expect);
    }

    /// Runs the shared `encode` check with `UTF_16LE`.
    fn encode_utf_16le(string: &str, expect: &[u8]) {
        let encoding = UTF_16LE;
        encode(encoding, string, expect);
    }

    /// Runs the shared `encode` check with `UTF_16BE`.
    fn encode_utf_16be(string: &str, expect: &[u8]) {
        let encoding = UTF_16BE;
        encode(encoding, string, expect);
    }

    #[test]
    fn test_utf_16_decode() {
        // Checks a little-endian input and then its big-endian twin against
        // the same expected text.
        fn check_pair(le: &[u8], be: &[u8], expect: &str) {
            decode_utf_16le(le, expect);
            decode_utf_16be(be, expect);
        }

        // Empty input
        check_pair(b"", b"", "");

        // Plain BMP text
        check_pair(b"\x61\x00\x62\x00", b"\x00\x61\x00\x62", "\u{0061}\u{0062}");

        // Input starting with the other byte order's BOM; the expected text
        // is the payload read in the byte order the BOM announces.
        check_pair(
            b"\xFE\xFF\x00\x61\x00\x62",
            b"\xFF\xFE\x61\x00\x62\x00",
            "\u{0061}\u{0062}",
        );

        // Dangling final byte
        check_pair(b"\x61\x00\x62", b"\x00\x61\x00", "\u{0061}\u{FFFD}");

        // Lead surrogate followed by a dangling byte
        check_pair(b"\x3D\xD8\xA9", b"\xD8\x3D\xDC", "\u{FFFD}");

        // Valid surrogate pair followed by a BMP character
        check_pair(
            b"\x3D\xD8\xA9\xDC\x03\x26",
            b"\xD8\x3D\xDC\xA9\x26\x03",
            "\u{1F4A9}\u{2603}",
        );

        // Unpaired low surrogate followed by a BMP character
        check_pair(b"\xA9\xDC\x03\x26", b"\xDC\xA9\x26\x03", "\u{FFFD}\u{2603}");

        // Unpaired high surrogate followed by a BMP character
        check_pair(b"\x3D\xD8\x03\x26", b"\xD8\x3D\x26\x03", "\u{FFFD}\u{2603}");

        // The lone \xFF in the middle ensures that the part before it and
        // the part after it have different alignment.
        let long_le = b"\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xFF\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8";
        let long_be = b"\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xFF\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D";
        let long_expect = "\x00\x00\x00\x00\u{1F4A9}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\x00\x00\x00\x00\u{FFFD}";

        // Split each stream at its midpoint and drop the byte that sits
        // exactly there, leaving the part before it and the part after it.
        let (le_front, le_rest) = long_le.split_at(long_le.len() / 2);
        let (be_front, be_rest) = long_be.split_at(long_be.len() / 2);
        check_pair(le_front, be_front, long_expect);
        check_pair(&le_rest[1..], &be_rest[1..], long_expect);
    }

    #[test]
    fn test_utf_16_encode() {
        // Empty
        encode_utf_16be("", b"");
        encode_utf_16le("", b"");

        // Encodes as UTF-8
        assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
        assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
        let text = "\u{1F4A9}\u{2603}";
        encode_utf_16le(text, text.as_bytes());
        encode_utf_16be(text, text.as_bytes());
    }

    #[test]
    fn test_utf_16be_decode_one_by_one() {
        let mut decoder = UTF_16BE.new_decoder();
        let mut sink = [0u16; 20];
        let bytes = b"\x00\x61\x00\xE4\x26\x03\xD8\x3D\xDC\xA9";
        // Feed one byte per call; each call must consume it without error.
        for piece in bytes.chunks(1) {
            assert_eq!(piece.len(), 1);
            let capacity = decoder.max_utf16_buffer_length(piece.len()).unwrap();
            let (status, consumed, _, saw_errors) =
                decoder.decode_to_utf16(piece, &mut sink[..capacity], false);
            assert_eq!(status, CoderResult::InputEmpty);
            assert_eq!(consumed, 1);
            assert!(!saw_errors);
        }
    }

    #[test]
    fn test_utf_16le_decode_one_by_one() {
        let mut decoder = UTF_16LE.new_decoder();
        let mut sink = [0u16; 20];
        let bytes = b"\x61\x00\xE4\x00\x03\x26\x3D\xD8\xA9\xDC";
        // Feed one byte per call; each call must consume it without error.
        for piece in bytes.chunks(1) {
            assert_eq!(piece.len(), 1);
            let capacity = decoder.max_utf16_buffer_length(piece.len()).unwrap();
            let (status, consumed, _, saw_errors) =
                decoder.decode_to_utf16(piece, &mut sink[..capacity], false);
            assert_eq!(status, CoderResult::InputEmpty);
            assert_eq!(consumed, 1);
            assert!(!saw_errors);
        }
    }

    #[test]
    fn test_utf_16be_decode_three_at_a_time() {
        let mut decoder = UTF_16BE.new_decoder();
        let mut sink = [0u16; 20];
        let bytes = b"\x00\xE4\x26\x03\xD8\x3D\xDC\xA9\x00\x61\x00\xE4";
        // Three-byte pieces, so every call starts or ends mid code unit.
        for piece in bytes.chunks(3) {
            assert_eq!(piece.len(), 3);
            let capacity = decoder.max_utf16_buffer_length(piece.len()).unwrap();
            let (status, consumed, _, saw_errors) =
                decoder.decode_to_utf16(piece, &mut sink[..capacity], false);
            assert_eq!(status, CoderResult::InputEmpty);
            assert_eq!(consumed, piece.len());
            assert!(!saw_errors);
        }
    }

    #[test]
    fn test_utf_16le_decode_three_at_a_time() {
        let mut decoder = UTF_16LE.new_decoder();
        let mut sink = [0u16; 20];
        let bytes = b"\xE4\x00\x03\x26\x3D\xD8\xA9\xDC\x61\x00\xE4\x00";
        // Three-byte pieces, so every call starts or ends mid code unit.
        for piece in bytes.chunks(3) {
            assert_eq!(piece.len(), 3);
            let capacity = decoder.max_utf16_buffer_length(piece.len()).unwrap();
            let (status, consumed, _, saw_errors) =
                decoder.decode_to_utf16(piece, &mut sink[..capacity], false);
            assert_eq!(status, CoderResult::InputEmpty);
            assert_eq!(consumed, piece.len());
            assert!(!saw_errors);
        }
    }

    #[test]
    fn test_utf_16le_decode_bom_prefixed_split_byte_pair() {
        let mut decoder = UTF_16LE.new_decoder();
        let mut output = [0u16; 20];

        // First byte on its own: consumed, nothing written yet.
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, read, written, had_errors) =
            decoder.decode_to_utf16(b"\xFF", &mut output[..needed], false);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert_eq!(written, 0);
        assert!(!had_errors);

        // Second byte as the final input: the pair comes out as one
        // ordinary code unit.
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, read, written, had_errors) =
            decoder.decode_to_utf16(b"\xFD", &mut output[..needed], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert_eq!(written, 1);
        assert!(!had_errors);
        assert_eq!(output[0], 0xFDFF);
    }

    #[test]
    fn test_utf_16be_decode_bom_prefixed_split_byte_pair() {
        let mut decoder = UTF_16BE.new_decoder();
        let mut output = [0u16; 20];

        // First byte on its own: consumed, nothing written yet.
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, read, written, had_errors) =
            decoder.decode_to_utf16(b"\xFE", &mut output[..needed], false);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert_eq!(written, 0);
        assert!(!had_errors);

        // Second byte as the final input: the pair comes out as one
        // ordinary code unit.
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, read, written, had_errors) =
            decoder.decode_to_utf16(b"\xFD", &mut output[..needed], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert_eq!(written, 1);
        assert!(!had_errors);
        assert_eq!(output[0], 0xFEFD);
    }

    #[test]
    fn test_utf_16le_decode_bom_prefix() {
        let mut decoder = UTF_16LE.new_decoder();
        let mut output = [0u16; 20];

        // A single byte as the whole input yields one replacement character.
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, read, written, had_errors) =
            decoder.decode_to_utf16(b"\xFF", &mut output[..needed], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert_eq!(written, 1);
        assert!(had_errors);
        assert_eq!(output[0], 0xFFFD);
    }

    #[test]
    fn test_utf_16be_decode_bom_prefix() {
        let mut decoder = UTF_16BE.new_decoder();
        let mut output = [0u16; 20];

        // A single byte as the whole input yields one replacement character.
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, read, written, had_errors) =
            decoder.decode_to_utf16(b"\xFE", &mut output[..needed], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert_eq!(written, 1);
        assert!(had_errors);
        assert_eq!(output[0], 0xFFFD);
    }

    #[test]
    fn test_utf_16le_decode_near_end() {
        let mut decoder = UTF_16LE.new_decoder();
        let mut output = [0u8; 4];

        // Half a code unit: consumed and buffered, nothing written.
        let (result, read, written, had_errors) =
            decoder.decode_to_utf8(&[0x03], &mut output[..], false);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert_eq!(written, 0);
        assert!(!had_errors);
        assert_eq!(output[0], 0x0);

        // Only the byte completing U+2603 is consumed; its three UTF-8 bytes
        // are written and the decoder then reports the output as full.
        let (result, read, written, had_errors) =
            decoder.decode_to_utf8(&[0x26, 0x03, 0x26], &mut output[..], false);
        assert_eq!(result, CoderResult::OutputFull);
        assert_eq!(read, 1);
        assert_eq!(written, 3);
        assert!(!had_errors);
        assert_eq!(output, [0xE2, 0x98, 0x83, 0x00]);
    }

    #[test]
    fn test_utf_16be_decode_near_end() {
        let mut decoder = UTF_16BE.new_decoder();
        let mut output = [0u8; 4];

        // Half a code unit: consumed and buffered, nothing written.
        let (result, read, written, had_errors) =
            decoder.decode_to_utf8(&[0x26], &mut output[..], false);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert_eq!(written, 0);
        assert!(!had_errors);
        assert_eq!(output[0], 0x0);

        // Only the byte completing U+2603 is consumed; its three UTF-8 bytes
        // are written and the decoder then reports the output as full.
        let (result, read, written, had_errors) =
            decoder.decode_to_utf8(&[0x03, 0x26, 0x03], &mut output[..], false);
        assert_eq!(result, CoderResult::OutputFull);
        assert_eq!(read, 1);
        assert_eq!(written, 3);
        assert!(!had_errors);
        assert_eq!(output, [0xE2, 0x98, 0x83, 0x00]);
    }
}