encoding/codec/
utf_8.rs

1// This is a part of rust-encoding.
2// Copyright (c) 2013-2015, Kang Seonghoon.
3// See README.md and LICENSE.txt for details.
4//
5// Portions Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
6//
7// Permission is hereby granted, free of charge, to any person obtaining a copy
8// of this software and associated documentation files (the "Software"), to deal
9// in the Software without restriction, including without limitation the rights
10// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11// copies of the Software, and to permit persons to whom the Software is
12// furnished to do so, subject to the following conditions:
13//
14// The above copyright notice and this permission notice shall be included in
15// all copies or substantial portions of the Software.
16//
17// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23// SOFTWARE.
24
25//! UTF-8, the universal encoding.
26
27use std::{str, mem};
28use std::convert::Into;
29use types::*;
30
/**
 * UTF-8 (UCS Transformation Format, 8-bit).
 *
 * This is a Unicode encoding compatible with ASCII (ISO/IEC 646:US)
 * and able to represent all Unicode codepoints uniquely and unambiguously.
 * It has a variable-length design,
 * where one codepoint may use 1 (up to U+007F), 2 (up to U+07FF), 3 (up to U+FFFF)
 * or 4 bytes (up to U+10FFFF) depending on its value.
 * The first byte of a sequence is distinct from the other "continuation" bytes of the sequence,
 * making UTF-8 self-synchronizing and easy to handle.
 * It has a fixed endianness, and can be lexicographically sorted by codepoints.
 *
 * The UTF-8 scanner used by this module is heavily based on Bjoern Hoehrmann's
 * [Flexible and Economical UTF-8 Decoder](http://bjoern.hoehrmann.de/utf-8/decoder/dfa/).
 *
 * The type is a stateless unit struct; it implements `Debug` so that values holding it
 * can be printed in diagnostics.
 */
#[derive(Clone, Copy, Debug)]
pub struct UTF8Encoding;
48
49impl Encoding for UTF8Encoding {
50    fn name(&self) -> &'static str { "utf-8" }
51    fn whatwg_name(&self) -> Option<&'static str> { Some("utf-8") }
52    fn raw_encoder(&self) -> Box<RawEncoder> { UTF8Encoder::new() }
53    fn raw_decoder(&self) -> Box<RawDecoder> { UTF8Decoder::new() }
54}
55
/// An encoder for UTF-8.
///
/// The encoder carries no state. It implements `Debug` so that it can be printed
/// in diagnostics.
#[derive(Clone, Copy, Debug)]
pub struct UTF8Encoder;
59
60impl UTF8Encoder {
61    pub fn new() -> Box<RawEncoder> { Box::new(UTF8Encoder) }
62}
63
64impl RawEncoder for UTF8Encoder {
65    fn from_self(&self) -> Box<RawEncoder> { UTF8Encoder::new() }
66    fn is_ascii_compatible(&self) -> bool { true }
67
68    fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>) {
69        let input: &[u8] = input.as_bytes();
70        assert!(str::from_utf8(input).is_ok());
71        output.write_bytes(input);
72        (input.len(), None)
73    }
74
75    fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> {
76        None
77    }
78}
79
/// A decoder for UTF-8.
///
/// The decoder is incremental: a sequence that is split across two `raw_feed` calls is
/// kept in `queue` until the rest of it arrives.
#[derive(Clone, Copy, Debug)]
pub struct UTF8Decoder {
    /// Number of bytes of `queue` that are currently in use.
    queuelen: usize,
    /// Bytes of a not-yet-finished sequence carried over from earlier input.
    queue: [u8; 4],
    /// Current state of the scanner (an offset into the transition table).
    state: u8,
}
87
88impl UTF8Decoder {
89    pub fn new() -> Box<RawDecoder> {
90        Box::new(UTF8Decoder { queuelen: 0, queue: [0; 4], state: INITIAL_STATE })
91    }
92}
93
/// Maps every byte value to one of twelve categories used as the column index of
/// `STATE_TRANSITIONS`:
///
/// |  # | bytes         | meaning                                                          |
/// |---:|---------------|------------------------------------------------------------------|
/// |  0 | 00-7F         | one byte sequence                                                |
/// |  1 | 80-8F         | continuation byte                                                |
/// |  2 | C2-DF         | start of two byte sequence                                       |
/// |  3 | E1-EC, EE-EF  | start of three byte sequence, next byte unrestricted             |
/// |  4 | ED            | start of three byte sequence, next byte restricted to 80-9F (no surrogates) |
/// |  5 | F4            | start of four byte sequence, next byte restricted to 80-8F (up to U+10FFFF) |
/// |  6 | F1-F3         | start of four byte sequence, next byte unrestricted              |
/// |  7 | A0-BF         | continuation byte                                                |
/// |  8 | C0-C1, F5-FF  | invalid (overlong or out-of-range) start of multi byte sequence  |
/// |  9 | 90-9F         | continuation byte                                                |
/// | 10 | E0            | start of three byte sequence, next byte restricted to A0-BF (non-overlong) |
/// | 11 | F0            | start of four byte sequence, next byte restricted to 90-BF (non-overlong)  |
static CHAR_CATEGORY: [u8; 256] = [
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00-0F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10-1F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20-2F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30-3F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40-4F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50-5F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60-6F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70-7F
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80-8F
     9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 90-9F
     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // A0-AF
     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // B0-BF
     8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0-CF
     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0-DF
    10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, // E0-EF
    11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // F0-FF
];
117
/// Transition table of the UTF-8 scanner.
///
/// A state is the offset of its row, and the column within a row is the category of the
/// next byte (see `CHAR_CATEGORY`), so the next state is found at `state + category`.
/// In the row descriptions, `'` marks the current position, `cc` a continuation byte
/// and `xx` a rejected byte.
static STATE_TRANSITIONS: [u8; 110] = [
    //  0: '??  -- at a character boundary
     0, 98, 12, 24, 48, 84, 72, 98, 98, 98, 36, 60,
    // 12: .. 'cc  -- one continuation byte to go
    86,  0, 86, 86, 86, 86, 86,  0, 86,  0, 86, 86,
    // 24: .. 'cc cc  -- two continuation bytes to go
    86, 12, 86, 86, 86, 86, 86, 12, 86, 12, 86, 86,
    // 36: .. 'cc(A0-BF) cc
    86, 86, 86, 86, 86, 86, 86, 12, 86, 86, 86, 86,
    // 48: .. 'cc(80-9F) cc
    86, 12, 86, 86, 86, 86, 86, 86, 86, 12, 86, 86,
    // 60: .. 'cc(90-BF) cc cc
    86, 86, 86, 86, 86, 86, 86, 24, 86, 24, 86, 86,
    // 72: .. 'cc cc cc  -- three continuation bytes to go
    86, 24, 86, 86, 86, 86, 86, 24, 86, 24, 86, 86,
    // 84: .. 'cc(80-8F) cc cc
    // Only the entries for categories 0 and 1 are stored for this row; categories 2-11
    // fall into the row of state 86 right below, which holds the same value (86) that
    // this row would have there.
    86, 24,
    // 86: .. xx '..  -- rejected; the offending byte is not part of the bad sequence
    86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86,
    // 98: xx '..  -- rejected; the offending byte is part of the bad sequence
    98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,
];
130
/// State in which a decoder starts out.
const INITIAL_STATE: u8 = 0;
/// State reached whenever a complete, valid sequence has just been read.
const ACCEPT_STATE: u8 = 0;
/// Rejecting state in which the offending byte belongs to the invalid sequence.
const REJECT_STATE: u8 = 98;
/// Rejecting state in which the offending byte does not belong to the invalid sequence
/// and has to be read again.
const REJECT_STATE_WITH_BACKUP: u8 = 86;

/// Evaluates to `true` when `$state` is one of the two rejecting states
/// (both are numbered at or above `REJECT_STATE_WITH_BACKUP`).
macro_rules! is_reject_state {
    ($state:expr) => {
        $state >= REJECT_STATE_WITH_BACKUP
    };
}

/// Evaluates to the state that follows `$state` after reading the byte `$ch`.
macro_rules! next_state {
    ($state:expr, $ch:expr) => {
        STATE_TRANSITIONS[($state + CHAR_CATEGORY[$ch as usize]) as usize]
    };
}
140
141impl RawDecoder for UTF8Decoder {
142    fn from_self(&self) -> Box<RawDecoder> { UTF8Decoder::new() }
143    fn is_ascii_compatible(&self) -> bool { true }
144
145    fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) {
146        output.writer_hint(input.len());
147
148        fn write_bytes(output: &mut StringWriter, bytes: &[u8]) {
149            output.write_str(unsafe {mem::transmute(bytes)});
150        }
151
152        let mut state = self.state;
153        let mut processed = 0;
154        let mut offset = 0;
155
156        // optimization: if we are in the initial state, quickly skip to the first non-MSB-set byte.
157        if state == INITIAL_STATE {
158            let first_msb = input.iter().position(|&ch| ch >= 0x80).unwrap_or(input.len());
159            offset += first_msb;
160            processed += first_msb;
161        }
162
163        for (i, &ch) in input[offset..].iter().enumerate() {
164            state = next_state!(state, ch);
165            if state == ACCEPT_STATE {
166                processed = i + offset + 1;
167            } else if is_reject_state!(state) {
168                let upto = if state == REJECT_STATE {i + offset + 1} else {i + offset};
169                self.state = INITIAL_STATE;
170                if processed > 0 && self.queuelen > 0 { // flush `queue` outside the problem
171                    write_bytes(output, &self.queue[0..self.queuelen]);
172                }
173                self.queuelen = 0;
174                write_bytes(output, &input[0..processed]);
175                return (processed, Some(CodecError {
176                    upto: upto as isize, cause: "invalid sequence".into()
177                }));
178            }
179        }
180
181        self.state = state;
182        if processed > 0 && self.queuelen > 0 { // flush `queue`
183            write_bytes(output, &self.queue[0..self.queuelen]);
184            self.queuelen = 0;
185        }
186        write_bytes(output, &input[0..processed]);
187        if processed < input.len() {
188            let morequeuelen = input.len() - processed;
189            for i in 0..morequeuelen {
190                self.queue[self.queuelen + i] = input[processed + i];
191            }
192            self.queuelen += morequeuelen;
193        }
194        (processed, None)
195    }
196
197    fn raw_finish(&mut self, _output: &mut StringWriter) -> Option<CodecError> {
198        let state = self.state;
199        let queuelen = self.queuelen;
200        self.state = INITIAL_STATE;
201        self.queuelen = 0;
202        if state != ACCEPT_STATE {
203            Some(CodecError { upto: 0, cause: "incomplete sequence".into() })
204        } else {
205            assert!(queuelen == 0);
206            None
207        }
208    }
209}
210
211/// Almost equivalent to `std::str::from_utf8`.
212/// This function is provided for the fair benchmark against the stdlib's UTF-8 conversion
213/// functions, as rust-encoding always allocates a new string.
214pub fn from_utf8<'a>(input: &'a [u8]) -> Option<&'a str> {
215    let mut iter = input.iter();
216    let mut state;
217
218    macro_rules! return_as_whole(() => (return Some(unsafe {mem::transmute(input)})));
219
220    // optimization: if we are in the initial state, quickly skip to the first non-MSB-set byte.
221    loop {
222        match iter.next() {
223            Some(&ch) if ch < 0x80 => {}
224            Some(&ch) => {
225                state = next_state!(INITIAL_STATE, ch);
226                break;
227            }
228            None => { return_as_whole!(); }
229        }
230    }
231
232    for &ch in iter {
233        state = next_state!(state, ch);
234        if is_reject_state!(state) { return None; }
235    }
236    if state != ACCEPT_STATE { return None; }
237    return_as_whole!();
238}
239
#[cfg(test)]
mod tests {
    // Portions of these tests are adopted from Markus Kuhn's UTF-8 decoder capability and
    // stress test: <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>.
    //
    // NOTE(review): the `assert_feed_*!` and `assert_finish_*!` macros are defined elsewhere
    // in the crate. Judging by their use here the arguments are the decoder, the consumed
    // bytes, (for `assert_feed_err!`) the offending bytes, the remaining bytes and the
    // expected output -- confirm against their definition.

    use super::{UTF8Encoding, from_utf8};
    use std::str;
    use testutils;
    use types::*;

    /// Well-formed sequences of each length are decoded.
    #[test]
    fn test_valid() {
        // one byte
        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [0x41], [], "A");
        assert_feed_ok!(dec, [0x42, 0x43], [], "BC");
        assert_feed_ok!(dec, [], [], "");
        assert_feed_ok!(dec, [0x44, 0x45, 0x46], [], "DEF");
        assert_finish_ok!(dec, "");

        // two bytes
        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [0xc2, 0xa2], [], "\u{a2}");
        assert_feed_ok!(dec, [0xc2, 0xac, 0xc2, 0xa9], [], "\u{ac}\u{0a9}");
        assert_feed_ok!(dec, [], [], "");
        assert_feed_ok!(dec, [0xd5, 0xa1, 0xd5, 0xb5, 0xd5, 0xa2, 0xd5, 0xb8, 0xd6, 0x82,
                              0xd5, 0xa2, 0xd5, 0xa5, 0xd5, 0xb6], [],
                        "\u{561}\u{0575}\u{562}\u{578}\u{582}\u{562}\u{565}\u{576}");
        assert_finish_ok!(dec, "");

        // three bytes
        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [0xed, 0x92, 0x89], [], "\u{d489}");
        assert_feed_ok!(dec, [0xe6, 0xbc, 0xa2, 0xe5, 0xad, 0x97], [], "\u{6f22}\u{5b57}");
        assert_feed_ok!(dec, [], [], "");
        assert_feed_ok!(dec, [0xc9, 0x99, 0xc9, 0x94, 0xc9, 0x90], [], "\u{259}\u{0254}\u{250}");
        assert_finish_ok!(dec, "");

        // four bytes
        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [0xf0, 0x90, 0x82, 0x82], [], "\u{10082}");
        assert_feed_ok!(dec, [], [], "");
        assert_finish_ok!(dec, "");

        // encoders are not tested since encoding is largely a no-op.
    }

    /// The first and last codepoint of each sequence length (and around the surrogate
    /// gap) are decoded.
    #[test]
    fn test_valid_boundary() {
        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [0x00], [], "\x00");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [0x7f], [], "\x7f");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [0xc2, 0x80], [], "\u{80}");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [0xdf, 0xbf], [], "\u{7ff}");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [0xe0, 0xa0, 0x80], [], "\u{800}");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [0xed, 0x9f, 0xbf], [], "\u{d7ff}");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [0xee, 0x80, 0x80], [], "\u{e000}");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [0xef, 0xbf, 0xbf], [], "\u{ffff}");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [0xf0, 0x90, 0x80, 0x80], [], "\u{10000}");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [0xf4, 0x8f, 0xbf, 0xbf], [], "\u{10ffff}");
        assert_finish_ok!(dec, "");
    }

    /// Sequences split across several feeds are decoded once they are complete.
    #[test]
    fn test_valid_partial() {
        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [], [0xf0], "");
        assert_feed_ok!(dec, [], [0x90], "");
        assert_feed_ok!(dec, [], [0x82], "");
        assert_feed_ok!(dec, [0x82], [0xed], "\u{10082}");
        assert_feed_ok!(dec, [0x92, 0x89], [], "\u{d489}");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [], [0xc2], "");
        assert_feed_ok!(dec, [0xa9, 0x20], [], "\u{a9}\u{020}");
        assert_finish_ok!(dec, "");
    }

    /// A continuation byte without a start byte is an error.
    #[test]
    fn test_invalid_continuation() {
        for c in 0x80..0xc0 {
            let mut dec = UTF8Encoding.raw_decoder();
            assert_feed_err!(dec, [], [c], [], "");
            assert_finish_ok!(dec, "");

            let mut dec = UTF8Encoding.raw_decoder();
            assert_feed_err!(dec, [], [c], [c], "");
            assert_finish_ok!(dec, "");

            let mut dec = UTF8Encoding.raw_decoder();
            assert_feed_err!(dec, [], [c], [c, c], "");
            assert_finish_ok!(dec, "");
        }
    }

    /// Encoded surrogates are rejected.
    #[test]
    fn test_invalid_surrogate() {
        // surrogates should fail at the second byte.

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xed], [0xa0, 0x80], "");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xed], [0xad, 0xbf], "");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xed], [0xae, 0x80], "");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xed], [0xaf, 0xbf], "");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xed], [0xb0, 0x80], "");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xed], [0xbe, 0x80], "");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xed], [0xbf, 0xbf], "");
        assert_finish_ok!(dec, "");
    }

    /// The first codepoint past U+10FFFF is rejected.
    #[test]
    fn test_invalid_boundary() {
        // as with surrogates, should fail at the second byte.
        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xf4], [0x90, 0x90, 0x90], ""); // U+110000
        assert_finish_ok!(dec, "");
    }

    /// Start bytes F5-FF are rejected right away, leaving nothing pending at finish.
    #[test]
    fn test_invalid_start_immediate_test_finish() {
        for c in 0xf5..0x100 {
            let c = c as u8;
            let mut dec = UTF8Encoding.raw_decoder();
            assert_feed_err!(dec, [], [c], [], "");
            assert_finish_ok!(dec, "");
        }
    }

    /// Start bytes F5-FF are rejected without consuming the space that follows.
    #[test]
    fn test_invalid_start_followed_by_space() {
        for c in 0xf5..0x100 {
            let c = c as u8;

            let mut dec = UTF8Encoding.raw_decoder();
            assert_feed_err!(dec, [], [c], [0x20], "");
            assert_finish_ok!(dec, "");

            let mut dec = UTF8Encoding.raw_decoder();
            assert_feed_err!(dec, [], [c], [], "");
            assert_feed_ok!(dec, [0x20], [], "\x20");
            assert_finish_ok!(dec, "");
        }
    }

    /// A lone valid start byte is only reported when the input is finished.
    #[test]
    fn test_invalid_lone_start_immediate_test_finish() {
        for c in 0xc2..0xf5 {
            let mut dec = UTF8Encoding.raw_decoder();
            assert_feed_ok!(dec, [], [c], ""); // wait for cont. bytes
            assert_finish_err!(dec, "");
        }
    }

    /// A lone valid start byte followed by a space is an error at the space.
    #[test]
    fn test_invalid_lone_start_followed_by_space() {
        for c in 0xc2..0xf5 {
            let mut dec = UTF8Encoding.raw_decoder();
            assert_feed_err!(dec, [], [c], [0x20], "");
            assert_finish_ok!(dec, "");

            let mut dec = UTF8Encoding.raw_decoder();
            assert_feed_ok!(dec, [], [c], ""); // wait for cont. bytes
            assert_feed_err!(dec, [], [], [0x20], "");
            assert_finish_ok!(dec, "");
        }
    }

    /// A start byte plus one continuation byte followed by a space is an error,
    /// however the bytes are split across feeds.
    #[test]
    fn test_invalid_incomplete_three_byte_seq_followed_by_space() {
        for b in 0xe0..0xf5 {
            let c = if b == 0xe0 || b == 0xf0 {0xa0} else {0x80};

            let mut dec = UTF8Encoding.raw_decoder();
            assert_feed_err!(dec, [], [b, c], [0x20], "");
            assert_finish_ok!(dec, "");

            let mut dec = UTF8Encoding.raw_decoder();
            assert_feed_ok!(dec, [], [b, c], ""); // wait for cont. bytes
            assert_feed_err!(dec, [], [], [0x20], "");
            assert_finish_ok!(dec, "");

            let mut dec = UTF8Encoding.raw_decoder();
            assert_feed_ok!(dec, [], [b], ""); // wait for cont. bytes
            assert_feed_err!(dec, [], [c], [0x20], "");
            assert_finish_ok!(dec, "");

            let mut dec = UTF8Encoding.raw_decoder();
            assert_feed_ok!(dec, [], [b], ""); // wait for cont. bytes
            assert_feed_ok!(dec, [], [c], ""); // wait for cont. bytes
            assert_feed_err!(dec, [], [], [0x20], "");
            assert_finish_ok!(dec, "");
        }
    }

    /// A four-byte start plus two continuation bytes followed by a space is an error,
    /// however the bytes are split across feeds.
    #[test]
    fn test_invalid_incomplete_four_byte_seq_followed_by_space() {
        for a in 0xf0..0xf5 {
            let b = if a == 0xf0 {0xa0} else {0x80};
            let c = 0x80;

            let mut dec = UTF8Encoding.raw_decoder();
            assert_feed_err!(dec, [], [a, b, c], [0x20], "");
            assert_finish_ok!(dec, "");

            let mut dec = UTF8Encoding.raw_decoder();
            assert_feed_ok!(dec, [], [a], ""); // wait for cont. bytes
            assert_feed_ok!(dec, [], [b], ""); // wait for cont. bytes
            assert_feed_ok!(dec, [], [c], ""); // wait for cont. bytes
            assert_feed_err!(dec, [], [], [0x20], "");
            assert_finish_ok!(dec, "");

            let mut dec = UTF8Encoding.raw_decoder();
            assert_feed_ok!(dec, [], [a, b], ""); // wait for cont. bytes
            assert_feed_err!(dec, [], [c], [0x20], "");
            assert_finish_ok!(dec, "");

            let mut dec = UTF8Encoding.raw_decoder();
            assert_feed_ok!(dec, [], [a, b, c], ""); // wait for cont. bytes
            assert_feed_err!(dec, [], [], [0x20], "");
            assert_finish_ok!(dec, "");
        }
    }

    /// A surplus continuation byte after a complete sequence is an error.
    #[test]
    fn test_invalid_too_many_cont_bytes() {
        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [0xc2, 0x80], [0x80], [], "\u{80}");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [0xe0, 0xa0, 0x80], [0x80], [], "\u{800}");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [0xf0, 0x90, 0x80, 0x80], [0x80], [], "\u{10000}");
        assert_finish_ok!(dec, "");

        // no continuation byte is consumed after 5/6-byte sequence starters and FE/FF
        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xf8], [0x88, 0x80, 0x80, 0x80, 0x80], "");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xfc], [0x84, 0x80, 0x80, 0x80, 0x80, 0x80], "");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xfe], [0x80], "");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xff], [0x80], "");
        assert_finish_ok!(dec, "");
    }

    /// Same as `test_invalid_too_many_cont_bytes`, with the input split across feeds.
    #[test]
    fn test_invalid_too_many_cont_bytes_partial() {
        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [], [0xc2], "");
        assert_feed_err!(dec, [0x80], [0x80], [], "\u{80}");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [], [0xe0, 0xa0], "");
        assert_feed_err!(dec, [0x80], [0x80], [], "\u{800}");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [], [0xf0, 0x90, 0x80], "");
        assert_feed_err!(dec, [0x80], [0x80], [], "\u{10000}");
        assert_finish_ok!(dec, "");

        // no continuation byte is consumed after 5/6-byte sequence starters and FE/FF
        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xf8], [], "");
        assert_feed_err!(dec, [], [0x88], [0x80, 0x80, 0x80, 0x80], "");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xfc], [], "");
        assert_feed_err!(dec, [], [0x84], [0x80, 0x80, 0x80, 0x80, 0x80], "");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xfe], [], "");
        assert_feed_err!(dec, [], [0x80], [], "");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xff], [], "");
        assert_feed_err!(dec, [], [0x80], [], "");
        assert_finish_ok!(dec, "");
    }

    /// The smallest overlong encoding of each length is rejected.
    #[test]
    fn test_invalid_overlong_minimal() {
        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xc0], [0x80], "");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xe0], [0x80, 0x80], "");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xf0], [0x80, 0x80, 0x80], "");
        assert_finish_ok!(dec, "");
    }

    /// The largest overlong encoding of each length is rejected.
    #[test]
    fn test_invalid_overlong_maximal() {
        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xc1], [0xbf], "");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xe0], [0x9f, 0xbf], "");
        assert_finish_ok!(dec, "");

        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0xf0], [0x8f, 0xbf, 0xbf], "");
        assert_finish_ok!(dec, "");
    }

    /// A decoder can be fed again after a failed finish.
    #[test]
    fn test_feed_after_finish() {
        let mut dec = UTF8Encoding.raw_decoder();
        assert_feed_ok!(dec, [0xc2, 0x80], [0xc2], "\u{80}");
        assert_finish_err!(dec, "");
        assert_feed_ok!(dec, [0xc2, 0x80], [], "\u{80}");
        assert_finish_ok!(dec, "");
    }

    /// `from_utf8` agrees with the standard library on the sample texts.
    #[test]
    fn test_correct_from_utf8() {
        let bytes = testutils::ASCII_TEXT.as_bytes();
        assert_eq!(from_utf8(bytes), str::from_utf8(bytes).ok());

        let bytes = testutils::KOREAN_TEXT.as_bytes();
        assert_eq!(from_utf8(bytes), str::from_utf8(bytes).ok());

        let bytes = testutils::INVALID_UTF8_TEXT;
        assert_eq!(from_utf8(bytes), str::from_utf8(bytes).ok());
    }

    // Benchmarks over `testutils::ASCII_TEXT`.
    mod bench_ascii {
        extern crate test;
        use super::super::{UTF8Encoding, from_utf8};
        use std::str;
        use testutils;
        use types::*;

        #[bench]
        fn bench_encode(b: &mut test::Bencher) {
            let text = testutils::ASCII_TEXT;
            b.bytes = text.len() as u64;
            b.iter(|| test::black_box(UTF8Encoding.encode(text, EncoderTrap::Strict)))
        }

        #[bench]
        fn bench_decode(b: &mut test::Bencher) {
            let data = testutils::ASCII_TEXT.as_bytes();
            b.bytes = data.len() as u64;
            b.iter(|| test::black_box(UTF8Encoding.decode(data, DecoderTrap::Strict)))
        }

        #[bench]
        fn bench_from_utf8(b: &mut test::Bencher) {
            let data = testutils::ASCII_TEXT.as_bytes();
            b.bytes = data.len() as u64;
            b.iter(|| test::black_box(from_utf8(data)))
        }

        #[bench] // for the comparison
        fn bench_stdlib_from_utf8(b: &mut test::Bencher) {
            let data = testutils::ASCII_TEXT.as_bytes();
            b.bytes = data.len() as u64;
            b.iter(|| test::black_box(str::from_utf8(data)))
        }

        #[bench] // for the comparison
        fn bench_stdlib_from_utf8_lossy(b: &mut test::Bencher) {
            let data = testutils::ASCII_TEXT.as_bytes();
            b.bytes = data.len() as u64;
            b.iter(|| test::black_box(String::from_utf8_lossy(data)))
        }
    }

    // why Korean? it has an excellent mix of multibyte sequences and ASCII sequences
    // unlike other CJK scripts, so it reflects a practical use case a bit better.
    mod bench_korean {
        extern crate test;
        use super::super::{UTF8Encoding, from_utf8};
        use std::str;
        use testutils;
        use types::*;

        #[bench]
        fn bench_encode(b: &mut test::Bencher) {
            let text = testutils::KOREAN_TEXT;
            b.bytes = text.len() as u64;
            b.iter(|| test::black_box(UTF8Encoding.encode(text, EncoderTrap::Strict)))
        }

        #[bench]
        fn bench_decode(b: &mut test::Bencher) {
            let data = testutils::KOREAN_TEXT.as_bytes();
            b.bytes = data.len() as u64;
            b.iter(|| test::black_box(UTF8Encoding.decode(data, DecoderTrap::Strict)))
        }

        #[bench]
        fn bench_from_utf8(b: &mut test::Bencher) {
            let data = testutils::KOREAN_TEXT.as_bytes();
            b.bytes = data.len() as u64;
            b.iter(|| test::black_box(from_utf8(data)))
        }

        #[bench] // for the comparison
        fn bench_stdlib_from_utf8(b: &mut test::Bencher) {
            let data = testutils::KOREAN_TEXT.as_bytes();
            b.bytes = data.len() as u64;
            b.iter(|| test::black_box(str::from_utf8(data)))
        }

        #[bench] // for the comparison
        fn bench_stdlib_from_utf8_lossy(b: &mut test::Bencher) {
            let data = testutils::KOREAN_TEXT.as_bytes();
            b.bytes = data.len() as u64;
            b.iter(|| test::black_box(String::from_utf8_lossy(data)))
        }
    }

    // Benchmarks over `testutils::INVALID_UTF8_TEXT`.
    mod bench_lossy_invalid {
        extern crate test;
        use super::super::{UTF8Encoding, from_utf8};
        use std::str;
        use testutils;
        use types::*;
        use types::DecoderTrap::Replace as DecodeReplace;

        #[bench]
        fn bench_decode_replace(b: &mut test::Bencher) {
            let data = testutils::INVALID_UTF8_TEXT;
            b.bytes = data.len() as u64;
            b.iter(|| test::black_box(UTF8Encoding.decode(data, DecodeReplace)))
        }

        #[bench] // for the comparison
        fn bench_from_utf8_failing(b: &mut test::Bencher) {
            let data = testutils::INVALID_UTF8_TEXT;
            b.bytes = data.len() as u64;
            b.iter(|| test::black_box(from_utf8(data)))
        }

        #[bench] // for the comparison
        fn bench_stdlib_from_utf8_failing(b: &mut test::Bencher) {
            let data = testutils::INVALID_UTF8_TEXT;
            b.bytes = data.len() as u64;
            b.iter(|| test::black_box(str::from_utf8(data)))
        }

        #[bench] // for the comparison
        fn bench_stdlib_from_utf8_lossy(b: &mut test::Bencher) {
            let data = testutils::INVALID_UTF8_TEXT;
            b.bytes = data.len() as u64;
            b.iter(|| test::black_box(String::from_utf8_lossy(data)))
        }
    }

    // Benchmarks over the data returned by `testutils::get_external_bench_data`.
    mod bench_lossy_external {
        extern crate test;
        use super::super::{UTF8Encoding, from_utf8};
        use std::str;
        use testutils;
        use types::*;
        use types::DecoderTrap::Replace as DecodeReplace;

        #[bench]
        fn bench_decode_replace(b: &mut test::Bencher) {
            let data = testutils::get_external_bench_data();
            b.bytes = data.len() as u64;
            b.iter(|| test::black_box(UTF8Encoding.decode(&data, DecodeReplace)))
        }

        #[bench] // for the comparison
        fn bench_from_utf8_failing(b: &mut test::Bencher) {
            let data = testutils::get_external_bench_data();
            b.bytes = data.len() as u64;
            b.iter(|| test::black_box(from_utf8(&data)))
        }

        #[bench] // for the comparison
        fn bench_stdlib_from_utf8_failing(b: &mut test::Bencher) {
            let data = testutils::get_external_bench_data();
            b.bytes = data.len() as u64;
            b.iter(|| test::black_box(str::from_utf8(&data)))
        }

        #[bench] // for the comparison
        fn bench_stdlib_from_utf8_lossy(b: &mut test::Bencher) {
            let data = testutils::get_external_bench_data();
            b.bytes = data.len() as u64;
            b.iter(|| test::black_box(String::from_utf8_lossy(&data)))
        }
    }
}