encoding_rs/
lib.rs

Help
1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10#![cfg_attr(
11    feature = "cargo-clippy",
12    allow(doc_markdown, inline_always, new_ret_no_self)
13)]
14
15//! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
16//! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
17//! Gecko-oriented means that converting to and from UTF-16 is supported in
18//! addition to converting to and from UTF-8, that the performance and
19//! streamability goals are browser-oriented, and that FFI-friendliness is a
20//! goal.
21//!
22//! Additionally, the `mem` module provides functions that are useful for
23//! applications that need to be able to deal with legacy in-memory
24//! representations of Unicode.
25//!
26//! For expectation setting, please be sure to read the sections
27//! [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes),
28//! [_ISO-8859-1_](#iso-8859-1) and [_Web / Browser Focus_](#web--browser-focus) below.
29//!
30//! There is a [long-form write-up](https://hsivonen.fi/encoding_rs/) about the
31//! design and internals of the crate.
32//!
33//! # Availability
34//!
35//! The code is available under the
36//! [Apache license, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
37//! or the [MIT license](https://opensource.org/licenses/MIT), at your option.
38//! See the
39//! [`COPYRIGHT`](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT)
40//! file for details.
41//! The [repository is on GitHub](https://github.com/hsivonen/encoding_rs). The
42//! [crate is available on crates.io](https://crates.io/crates/encoding_rs).
43//!
44//! # Integration with `std::io`
45//!
46//! This crate doesn't implement traits from `std::io`. However, the case of
47//! wrapping a `std::io::Read` in a decoder that implements `std::io::Read` and
48//! presents the data from the wrapped `std::io::Read` as UTF-8 is addressed by
49//! the [`encoding_rs_io`](https://docs.rs/encoding_rs_io/) crate.
50//!
51//! # Examples
52//!
53//! Example programs:
54//!
55//! * [Rust](https://github.com/hsivonen/recode_rs)
56//! * [C](https://github.com/hsivonen/recode_c)
57//! * [C++](https://github.com/hsivonen/recode_cpp)
58//!
59//! Decode using the non-streaming API:
60//!
61//! ```
62//! use encoding_rs::*;
63//!
64//! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
65//! let bytes = b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h";
66//!
67//! let (cow, encoding_used, had_errors) = SHIFT_JIS.decode(bytes);
68//! assert_eq!(&cow[..], expectation);
69//! assert_eq!(encoding_used, SHIFT_JIS);
70//! assert!(!had_errors);
71//! ```
72//!
73//! Decode using the streaming API with minimal `unsafe`:
74//!
75//! ```
76//! use encoding_rs::*;
77//!
78//! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
79//!
80//! // Use an array of byte slices to demonstrate content arriving piece by
81//! // piece from the network.
82//! let bytes: [&'static [u8]; 4] = [b"\x83",
83//!                                  b"n\x83\x8D\x81",
84//!                                  b"[\x81E\x83\x8F\x81[\x83",
85//!                                  b"\x8B\x83h"];
86//!
87//! // Very short output buffer to demonstrate the output buffer getting full.
88//! // Normally, you'd use something like `[0u8; 2048]`.
89//! let mut buffer_bytes = [0u8; 8];
90//! let mut buffer: &mut str = std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap();
91//!
92//! // How many bytes in the buffer currently hold significant data.
93//! let mut bytes_in_buffer = 0usize;
94//!
95//! // Collect the output to a string for demonstration purposes.
96//! let mut output = String::new();
97//!
98//! // The `Decoder`
99//! let mut decoder = SHIFT_JIS.new_decoder();
100//!
101//! // Track whether we see errors.
102//! let mut total_had_errors = false;
103//!
104//! // Decode using a fixed-size intermediate buffer (for demonstrating the
105//! // use of a fixed-size buffer; normally when the output of an incremental
106//! // decode goes to a `String` one would use `Decoder.decode_to_string()` to
107//! // avoid the intermediate buffer).
108//! for input in &bytes[..] {
109//!     // The number of bytes already read from current `input` in total.
110//!     let mut total_read_from_current_input = 0usize;
111//!
112//!     loop {
113//!         let (result, read, written, had_errors) =
114//!             decoder.decode_to_str(&input[total_read_from_current_input..],
115//!                                   &mut buffer[bytes_in_buffer..],
116//!                                   false);
117//!         total_read_from_current_input += read;
118//!         bytes_in_buffer += written;
119//!         total_had_errors |= had_errors;
120//!         match result {
121//!             CoderResult::InputEmpty => {
122//!                 // We have consumed the current input buffer. Break out of
123//!                 // the inner loop to get the next input buffer from the
124//!                 // outer loop.
125//!                 break;
126//!             },
127//!             CoderResult::OutputFull => {
128//!                 // Write the current buffer out and consider the buffer
129//!                 // empty.
130//!                 output.push_str(&buffer[..bytes_in_buffer]);
131//!                 bytes_in_buffer = 0usize;
132//!                 continue;
133//!             }
134//!         }
135//!     }
136//! }
137//!
138//! // Process EOF
139//! loop {
140//!     let (result, _, written, had_errors) =
141//!         decoder.decode_to_str(b"",
142//!                               &mut buffer[bytes_in_buffer..],
143//!                               true);
144//!     bytes_in_buffer += written;
145//!     total_had_errors |= had_errors;
146//!     // Write the current buffer out and consider the buffer empty.
147//!     // Need to do this here for both `match` arms, because we exit the
148//!     // loop on `CoderResult::InputEmpty`.
149//!     output.push_str(&buffer[..bytes_in_buffer]);
150//!     bytes_in_buffer = 0usize;
151//!     match result {
152//!         CoderResult::InputEmpty => {
153//!             // Done!
154//!             break;
155//!         },
156//!         CoderResult::OutputFull => {
157//!             continue;
158//!         }
159//!     }
160//! }
161//!
162//! assert_eq!(&output[..], expectation);
163//! assert!(!total_had_errors);
164//! ```
165//!
166//! ## UTF-16LE, UTF-16BE and Unicode Encoding Schemes
167//!
168//! The Encoding Standard doesn't specify encoders for UTF-16LE and UTF-16BE,
169//! __so this crate does not provide encoders for those encodings__!
170//! Along with the replacement encoding, their _output encoding_ is UTF-8,
171//! so you get a UTF-8 encoder if you request an encoder for them.
172//!
173//! Additionally, the Encoding Standard factors BOM handling into wrapper
174//! algorithms so that BOM handling isn't part of the definition of the
175//! encodings themselves. The Unicode _encoding schemes_ in the Unicode
176//! Standard define BOM handling or lack thereof as part of the encoding
177//! scheme.
178//!
179//! When used with the `_without_bom_handling` entry points, the UTF-16LE
180//! and UTF-16BE _encodings_ match the same-named _encoding schemes_ from
181//! the Unicode Standard.
182//!
183//! When used with the `_with_bom_removal` entry points, the UTF-8
184//! _encoding_ matches the UTF-8 _encoding scheme_ from the Unicode
185//! Standard.
186//!
187//! This crate does not provide a mode that matches the UTF-16 _encoding
188//! scheme_ from the Unicode Standard. The UTF-16BE encoding used with
189//! the entry points without `_bom_` qualifiers is the closest match,
190//! but in that case, the UTF-8 BOM triggers UTF-8 decoding, which is
191//! not part of the behavior of the UTF-16 _encoding scheme_ per the
192//! Unicode Standard.
193//!
194//! The UTF-32 family of Unicode encoding schemes is not supported
195//! by this crate. The Encoding Standard doesn't define any UTF-32
196//! family encodings, since they aren't necessary for consuming Web
197//! content.
198//!
199//! ## ISO-8859-1
200//!
201//! ISO-8859-1 does not exist as a distinct encoding from windows-1252 in
202//! the Encoding Standard. Therefore, an encoding that maps the unsigned
203//! byte value to the same Unicode scalar value is not available via
204//! `Encoding` in this crate.
205//!
206//! However, the functions whose name starts with `convert` and contains
207//! `latin1` in the `mem` module support such conversions, which are known as
208//! [_isomorphic decode_](https://infra.spec.whatwg.org/#isomorphic-decode)
209//! and [_isomorphic encode_](https://infra.spec.whatwg.org/#isomorphic-encode)
210//! in the [Infra Standard](https://infra.spec.whatwg.org/).
211//!
212//! ## Web / Browser Focus
213//!
214//! Both in terms of scope and performance, the focus is on the Web. For scope,
215//! this means that encoding_rs implements the Encoding Standard fully and
216//! doesn't implement encodings that are not specified in the Encoding
217//! Standard. For performance, this means that decoding performance is
218//! important as well as performance for encoding into UTF-8 or encoding the
219//! Basic Latin range (ASCII) into legacy encodings. Non-Basic Latin needs to
220//! be encoded into legacy encodings in only two places in the Web platform: in
221//! the query part of URLs, in which case it's a matter of relatively rare
222//! error handling, and in form submission, in which case the user action and
223//! networking tend to hide the performance of the encoder.
224//!
225//! Deemphasizing performance of encoding non-Basic Latin text into legacy
226//! encodings enables smaller code size thanks to the encoder side using the
227//! decode-optimized data tables without having encode-optimized data tables at
228//! all. Even in decoders, smaller lookup table size is preferred over avoiding
229//! multiplication operations.
230//!
231//! Additionally, performance is a non-goal for the ASCII-incompatible
232//! ISO-2022-JP encoding, which is rarely used on the Web. Instead of
233//! performance, the decoder for ISO-2022-JP optimizes for ease/clarity
234//! of implementation.
235//!
236//! Despite the browser focus, the hope is that non-browser applications
237//! that wish to consume Web content or submit Web forms in a Web-compatible
238//! way will find encoding_rs useful. While encoding_rs does not try to match
239//! Windows behavior, many of the encodings are close enough to legacy
240//! encodings implemented by Windows that applications that need to consume
241//! data in legacy Windows encodings may find encoding_rs useful. The
242//! [codepage](https://crates.io/crates/codepage) crate maps from Windows
243//! code page identifiers onto encoding_rs `Encoding`s and vice versa.
244//!
245//! For decoding email, UTF-7 support is needed (unfortunately) in addition
246//! to the encodings defined in the Encoding Standard. The
247//! [charset](https://crates.io/crates/charset) crate wraps encoding_rs and adds
248//! UTF-7 decoding for email purposes.
249//!
250//! For single-byte DOS encodings beyond the ones supported by the Encoding
251//! Standard, there is the [`oem_cp`](https://crates.io/crates/oem_cp) crate.
252//!
253//! # Preparing Text for the Encoders
254//!
255//! Normalizing text into Unicode Normalization Form C prior to encoding text
256//! into a legacy encoding minimizes unmappable characters. Text can be
257//! normalized to Unicode Normalization Form C using the
258//! [`unic-normal`](https://crates.io/crates/unic-normal) crate.
259//!
260//! The exception is windows-1258, which after normalizing to Unicode
261//! Normalization Form C requires tone marks to be decomposed in order to
262//! minimize unmappable characters. Vietnamese tone marks can be decomposed
263//! using the [`detone`](https://crates.io/crates/detone) crate.
264//!
265//! # Streaming & Non-Streaming; Rust & C/C++
266//!
267//! The API in Rust has two modes of operation: streaming and non-streaming.
268//! The streaming API is the foundation of the implementation and should be
269//! used when processing data that arrives piecemeal from an i/o stream. The
270//! streaming API has an FFI wrapper (as a [separate crate][1]) that exposes it
271//! to C callers. The non-streaming part of the API is for Rust callers only and
272//! is smart about borrowing instead of copying when possible. When
273//! streamability is not needed, the non-streaming API should be preferred in
274//! order to avoid copying data when a borrow suffices.
275//!
276//! There is no analogous C API exposed via FFI, mainly because C doesn't have
277//! standard types for growable byte buffers and Unicode strings that know
278//! their length.
279//!
280//! The C API (header file generated at `target/include/encoding_rs.h` when
281//! building encoding_rs) can, in turn, be wrapped for use from C++. Such a
282//! C++ wrapper can re-create the non-streaming API in C++ for C++ callers.
283//! The C binding comes with a [C++14 wrapper][2] that uses standard library +
284//! [GSL][3] types and that recreates the non-streaming API in C++ on top of
285//! the streaming API. A C++ wrapper with XPCOM/MFBT types is being developed
286//! as part of Mozilla [bug 1261841][4].
287//!
288//! The `Encoding` type is common to both the streaming and non-streaming
289//! modes. In the streaming mode, decoding operations are performed with a
290//! `Decoder` and encoding operations with an `Encoder` object obtained via
291//! `Encoding`. In the non-streaming mode, decoding and encoding operations are
292//! performed using methods on `Encoding` objects themselves, so the `Decoder`
293//! and `Encoder` objects are not used at all.
294//!
295//! [1]: https://github.com/hsivonen/encoding_c
296//! [2]: https://github.com/hsivonen/encoding_c/blob/master/include/encoding_rs_cpp.h
297//! [3]: https://github.com/Microsoft/GSL/
298//! [4]: https://bugzilla.mozilla.org/show_bug.cgi?id=encoding_rs
299//!
300//! # Memory management
301//!
302//! The streaming mode never performs heap allocations (even the methods
303//! that write into a `Vec<u8>` or a `String` by taking them as arguments do
304//! not reallocate the backing buffer of the `Vec<u8>` or the `String`). That
305//! is, the streaming mode uses caller-allocated buffers exclusively.
306//!
307//! The methods of the non-streaming mode that return a `Vec<u8>` or a `String`
308//! perform heap allocations but only to allocate the backing buffer of the
309//! `Vec<u8>` or the `String`.
310//!
311//! `Encoding` is always statically allocated. `Decoder` and `Encoder` need no
312//! `Drop` cleanup.
313//!
314//! # Buffer reading and writing behavior
315//!
316//! Based on experience gained with the `java.nio.charset` encoding converter
317//! API and with the Gecko uconv encoding converter API, the buffer reading
318//! and writing behaviors of encoding_rs are asymmetric: input buffers are
319//! fully drained but output buffers are not always fully filled.
320//!
321//! When reading from an input buffer, encoding_rs always consumes all input
322//! up to the next error or to the end of the buffer. In particular, when
323//! decoding, even if the input buffer ends in the middle of a byte sequence
324//! for a character, the decoder consumes all input. This has the benefit that
325//! the caller of the API can always fill the next buffer from the start from
326//! whatever source the bytes come from and never has to first copy the last
327//! bytes of the previous buffer to the start of the next buffer. However, when
328//! encoding, the UTF-8 input buffers have to end at a character boundary, which
329//! is a requirement for the Rust `str` type anyway, and UTF-16 input buffer
330//! boundaries falling in the middle of a surrogate pair result in both
331//! surrogates being treated individually as unpaired surrogates.
332//!
333//! Additionally, decoders guarantee that they can be fed even one byte at a
334//! time and encoders guarantee that they can be fed even one code point at a
335//! time. This has the benefit of not placing restrictions on the size of
336//! chunks in which the content arrives, e.g. from the network.
337//!
338//! When writing into an output buffer, encoding_rs makes sure that the code
339//! unit sequence for a character is never split across output buffer
340//! boundaries. This may result in wasted space at the end of an output buffer,
341//! but the advantages are that the output side of both decoders and encoders
342//! is greatly simplified compared to designs that attempt to fill output
343//! buffers exactly even when that entails splitting a code unit sequence and
344//! when encoding_rs methods return to the caller, the output produced thus
345//! far is always valid taken as a whole. (In the case of encoding to ISO-2022-JP,
346//! the output needs to be considered as a whole, because the latest output
347//! buffer taken alone might not be valid if the transition away
348//! from the ASCII state occurred in an earlier output buffer. However, since
349//! the ISO-2022-JP decoder doesn't treat streams that don't end in the ASCII
350//! state as being in error despite the encoder generating a transition to the
351//! ASCII state at the end, the claim about the partial output taken as a whole
352//! being valid is true even for ISO-2022-JP.)
353//!
354//! # Error Reporting
355//!
356//! Based on experience gained with the `java.nio.charset` encoding converter
357//! API and with the Gecko uconv encoding converter API, the error reporting
358//! behaviors of encoding_rs are asymmetric: decoder errors include offsets
359//! that leave it up to the caller to extract the erroneous bytes from the
360//! input stream if the caller wishes to do so but encoder errors provide the
361//! code point associated with the error without requiring the caller to
362//! extract it from the input on its own.
363//!
364//! On the encoder side, an error is always triggered by the most recently
365//! pushed Unicode scalar, which makes it simple to pass the `char` to the
366//! caller. Also, it's very typical for the caller to wish to do something with
367//! this data: generate a numeric escape for the character. Additionally, the
368//! ISO-2022-JP encoder reports U+FFFD instead of the actual input character in
369//! certain cases, so requiring the caller to extract the character from the
370//! input buffer would require the caller to handle ISO-2022-JP details.
371//! Furthermore, requiring the caller to extract the character from the input
372//! buffer would require the caller to implement UTF-8 or UTF-16 math, which is
373//! the job of an encoding conversion library.
374//!
375//! On the decoder side, errors are triggered in more complex ways. For
376//! example, when decoding the sequence ESC, '$', _buffer boundary_, 'A' as
377//! ISO-2022-JP, the ESC byte is in error, but this is discovered only after
378//! the buffer boundary when processing 'A'. Thus, the bytes in error might not
379//! be the ones most recently pushed to the decoder and the error might not even
380//! be in the current buffer.
381//!
382//! Some encoding conversion APIs address the problem by not acknowledging
383//! trailing bytes of an input buffer as consumed if it's still possible for
384//! future bytes to cause the trailing bytes to be in error. This way, error
385//! reporting can always refer to the most recently pushed buffer. This has the
386//! problem that the caller of the API has to copy the unconsumed trailing
387//! bytes to the start of the next buffer before being able to fill the rest
388//! of the next buffer. This is annoying, error-prone and inefficient.
389//!
390//! A possible solution would be making the decoder remember recently consumed
391//! bytes in order to be able to include a copy of the erroneous bytes when
392//! reporting an error. This has two problems: First, callers are rarely
393//! interested in the erroneous bytes, so attempts to identify them are most
394//! often just overhead anyway. Second, the rare applications that are
395//! interested typically care about the location of the error in the input
396//! stream.
397//!
398//! To keep the API convenient for common uses and the overhead low while making
399//! it possible to develop applications, such as HTML validators, that care
400//! about which bytes were in error, encoding_rs reports the length of the
401//! erroneous sequence and the number of bytes consumed after the erroneous
402//! sequence. As long as the caller doesn't discard the 6 most recent bytes,
403//! this makes it possible for callers that care about the erroneous bytes to
404//! locate them.
405//!
406//! # No Convenience API for Custom Replacements
407//!
408//! The Web Platform and, therefore, the Encoding Standard supports only one
409//! error recovery mode for decoders and only one error recovery mode for
410//! encoders. The supported error recovery mode for decoders is emitting the
411//! REPLACEMENT CHARACTER on error. The supported error recovery mode for
412//! encoders is emitting an HTML decimal numeric character reference for
413//! unmappable characters.
414//!
415//! Since encoding_rs is Web-focused, these are the only error recovery modes
416//! for which convenient support is provided. Moreover, on the decoder side,
417//! there aren't really good alternatives for emitting the REPLACEMENT CHARACTER
418//! on error (other than treating errors as fatal). In particular, simply
419//! ignoring errors is a
420//! [security problem](http://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences),
421//! so it would be a bad idea for encoding_rs to provide a mode that encouraged
422//! callers to ignore errors.
423//!
424//! On the encoder side, there are plausible alternatives for HTML decimal
425//! numeric character references. For example, when outputting CSS, CSS-style
426//! escapes would seem to make sense. However, instead of facilitating the
427//! output of CSS, JS, etc. in non-UTF-8 encodings, encoding_rs takes the design
428//! position that you shouldn't generate output in encodings other than UTF-8,
429//! except where backward compatibility with interacting with the legacy Web
430//! requires it. The legacy Web requires it only when parsing the query strings
431//! of URLs and when submitting forms, and those two both use HTML decimal
432//! numeric character references.
433//!
434//! While encoding_rs doesn't make encoder replacements other than HTML decimal
435//! numeric character references easy, it does make them _possible_.
436//! `encode_from_utf8()`, which emits HTML decimal numeric character references
437//! for unmappable characters, is implemented on top of
438//! `encode_from_utf8_without_replacement()`. Applications that really, really
439//! want other replacement schemes for unmappable characters can likewise
440//! implement them on top of `encode_from_utf8_without_replacement()`.
441//!
442//! # No Extensibility by Design
443//!
444//! The set of encodings supported by encoding_rs is not extensible by design.
445//! That is, `Encoding`, `Decoder` and `Encoder` are intentionally `struct`s
446//! rather than `trait`s. encoding_rs takes the design position that all future
447//! text interchange should be done using UTF-8, which can represent all of
448//! Unicode. (It is, in fact, the only encoding supported by the Encoding
449//! Standard and encoding_rs that can represent all of Unicode and that has
450//! encoder support. UTF-16LE and UTF-16BE don't have encoder support, and
451//! gb18030 cannot encode U+E5E5.) The other encodings are supported merely for
452//! legacy compatibility and not due to non-UTF-8 encodings having benefits
453//! other than being able to consume legacy content.
454//!
455//! Considering that UTF-8 can represent all of Unicode and is already supported
456//! by all Web browsers, introducing a new encoding wouldn't add to the
457//! expressiveness but would add to compatibility problems. In that sense,
458//! adding new encodings to the Web Platform doesn't make sense, and, in fact,
459//! post-UTF-8 attempts at encodings, such as BOCU-1, have been rejected from
460//! the Web Platform. On the other hand, the set of legacy encodings that must
461//! be supported for a Web browser to be able to be successful is not going to
462//! expand. Empirically, the set of encodings specified in the Encoding Standard
463//! is already sufficient and the set of legacy encodings won't grow
464//! retroactively.
465//!
466//! Since extensibility doesn't make sense considering the Web focus of
467//! encoding_rs and adding encodings to Web clients would be actively harmful,
468//! it makes sense to make the set of encodings that encoding_rs supports
469//! non-extensible and to take the (admittedly small) benefits arising from
470//! that, such as the size of `Decoder` and `Encoder` objects being known ahead
471//! of time, which enables stack allocation thereof.
472//!
473//! This does have downsides for applications that might want to put encoding_rs
474//! to non-Web uses if those non-Web uses involve legacy encodings that aren't
475//! needed for Web uses. The needs of such applications should not complicate
476//! encoding_rs itself, though. It is up to those applications to provide a
477//! framework that delegates the operations with encodings that encoding_rs
478//! supports to encoding_rs and operations with other encodings to something
479//! else (as opposed to encoding_rs itself providing an extensibility
480//! framework).
481//!
482//! # Panics
483//!
484//! Methods in encoding_rs can panic if the API is used against the requirements
485//! stated in the documentation, if a state that's supposed to be impossible
486//! is reached due to an internal bug or on integer overflow. When used
487//! according to documentation with buffer sizes that stay below integer
488//! overflow, in the absence of internal bugs, encoding_rs does not panic.
489//!
490//! Panics arising from API misuse aren't documented beyond this on individual
491//! methods.
492//!
493//! # At-Risk Parts of the API
494//!
495//! The foreseeable source of partially backward-incompatible API change is the
496//! way the instances of `Encoding` are made available.
497//!
498//! If Rust changes to allow the entries of `[&'static Encoding; N]` to be
499//! initialized with `static`s of type `&'static Encoding`, the non-reference
500//! `FOO_INIT` public `Encoding` instances will be removed from the public API.
501//!
502//! If Rust changes to make the referent of `pub const FOO: &'static Encoding`
503//! unique when the constant is used in different crates, the reference-typed
504//! `static`s for the encoding instances will be changed from `static` to
505//! `const` and the non-reference-typed `_INIT` instances will be removed.
506//!
507//! # Mapping Spec Concepts onto the API
508//!
509//! <table>
510//! <thead>
511//! <tr><th>Spec Concept</th><th>Streaming</th><th>Non-Streaming</th></tr>
512//! </thead>
513//! <tbody>
514//! <tr><td><a href="https://encoding.spec.whatwg.org/#encoding">encoding</a></td><td><code>&amp;'static Encoding</code></td><td><code>&amp;'static Encoding</code></td></tr>
515//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8">UTF-8 encoding</a></td><td><code>UTF_8</code></td><td><code>UTF_8</code></td></tr>
516//! <tr><td><a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an encoding</a></td><td><code>Encoding::for_label(<var>label</var>)</code></td><td><code>Encoding::for_label(<var>label</var>)</code></td></tr>
517//! <tr><td><a href="https://encoding.spec.whatwg.org/#name">name</a></td><td><code><var>encoding</var>.name()</code></td><td><code><var>encoding</var>.name()</code></td></tr>
518//! <tr><td><a href="https://encoding.spec.whatwg.org/#get-an-output-encoding">get an output encoding</a></td><td><code><var>encoding</var>.output_encoding()</code></td><td><code><var>encoding</var>.output_encoding()</code></td></tr>
519//! <tr><td><a href="https://encoding.spec.whatwg.org/#decode">decode</a></td><td><code>let d = <var>encoding</var>.new_decoder();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.decode(<var>src</var>)</code></td></tr>
520//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode">UTF-8 decode</a></td><td><code>let d = UTF_8.new_decoder_with_bom_removal();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_with_bom_removal(<var>src</var>)</code></td></tr>
521//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom">UTF-8 decode without BOM</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_without_bom_handling(<var>src</var>)</code></td></tr>
522//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail">UTF-8 decode without BOM or fail</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, false);<br>// &hellip; (fail if malformed)<br>let last_res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, true);<br>// (fail if malformed)</code></td><td><code>UTF_8.decode_without_bom_handling_and_without_replacement(<var>src</var>)</code></td></tr>
523//! <tr><td><a href="https://encoding.spec.whatwg.org/#encode">encode</a></td><td><code>let e = <var>encoding</var>.new_encoder();<br>let res = e.encode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = e.encode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.encode(<var>src</var>)</code></td></tr>
524//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-encode">UTF-8 encode</a></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// &hellip;</code></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>src</var>.as_bytes()</code></td></tr>
525//! </tbody>
526//! </table>
527//!
528//! # Compatibility with the rust-encoding API
529//!
530//! The crate
531//! [encoding_rs_compat](https://github.com/hsivonen/encoding_rs_compat/)
532//! is a drop-in replacement for rust-encoding 0.2.32 that implements (most of)
533//! the API of rust-encoding 0.2.32 on top of encoding_rs.
534//!
535//! # Mapping rust-encoding concepts to encoding_rs concepts
536//!
537//! The following table provides a mapping from rust-encoding constructs to
538//! encoding_rs ones.
539//!
540//! <table>
541//! <thead>
542//! <tr><th>rust-encoding</th><th>encoding_rs</th></tr>
543//! </thead>
544//! <tbody>
545//! <tr><td><code>encoding::EncodingRef</code></td><td><code>&amp;'static encoding_rs::Encoding</code></td></tr>
546//! <tr><td><code>encoding::all::<var>WINDOWS_31J</var></code> (not based on the WHATWG name for some encodings)</td><td><code>encoding_rs::<var>SHIFT_JIS</var></code> (always the WHATWG name uppercased and hyphens replaced with underscores)</td></tr>
547//! <tr><td><code>encoding::all::ERROR</code></td><td>Not available because not in the Encoding Standard</td></tr>
548//! <tr><td><code>encoding::all::ASCII</code></td><td>Not available because not in the Encoding Standard</td></tr>
549//! <tr><td><code>encoding::all::ISO_8859_1</code></td><td>Not available because not in the Encoding Standard</td></tr>
550//! <tr><td><code>encoding::all::HZ</code></td><td>Not available because not in the Encoding Standard</td></tr>
551//! <tr><td><code>encoding::label::encoding_from_whatwg_label(<var>string</var>)</code></td><td><code>encoding_rs::Encoding::for_label(<var>string</var>)</code></td></tr>
552//! <tr><td><code><var>enc</var>.whatwg_name()</code> (always lower case)</td><td><code><var>enc</var>.name()</code> (potentially mixed case)</td></tr>
553//! <tr><td><code><var>enc</var>.name()</code></td><td>Not available because not in the Encoding Standard</td></tr>
554//! <tr><td><code>encoding::decode(<var>bytes</var>, encoding::DecoderTrap::Replace, <var>enc</var>)</code></td><td><code><var>enc</var>.decode(<var>bytes</var>)</code></td></tr>
555//! <tr><td><code><var>enc</var>.decode(<var>bytes</var>, encoding::DecoderTrap::Replace)</code></td><td><code><var>enc</var>.decode_without_bom_handling(<var>bytes</var>)</code></td></tr>
556//! <tr><td><code><var>enc</var>.encode(<var>string</var>, encoding::EncoderTrap::NcrEscape)</code></td><td><code><var>enc</var>.encode(<var>string</var>)</code></td></tr>
557//! <tr><td><code><var>enc</var>.raw_decoder()</code></td><td><code><var>enc</var>.new_decoder_without_bom_handling()</code></td></tr>
558//! <tr><td><code><var>enc</var>.raw_encoder()</code></td><td><code><var>enc</var>.new_encoder()</code></td></tr>
559//! <tr><td><code>encoding::RawDecoder</code></td><td><code>encoding_rs::Decoder</code></td></tr>
560//! <tr><td><code>encoding::RawEncoder</code></td><td><code>encoding_rs::Encoder</code></td></tr>
561//! <tr><td><code><var>raw_decoder</var>.raw_feed(<var>src</var>, <var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(<var>src</var>.len()));<br><var>decoder</var>.decode_to_string_without_replacement(<var>src</var>, <var>dst_string</var>, false)</code></td></tr>
562//! <tr><td><code><var>raw_encoder</var>.raw_feed(<var>src</var>, <var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(<var>src</var>.len()));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement(<var>src</var>, <var>dst_vec</var>, false)</code></td></tr>
//! <tr><td><code><var>raw_decoder</var>.raw_finish(<var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(0));<br><var>decoder</var>.decode_to_string_without_replacement(b"", <var>dst_string</var>, true)</code></td></tr>
//! <tr><td><code><var>raw_encoder</var>.raw_finish(<var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(0));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement("", <var>dst_vec</var>, true)</code></td></tr>
//! <tr><td><code>encoding::DecoderTrap::Strict</code></td><td><code>decode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Malformed</code> result as fatal).</td></tr>
566//! <tr><td><code>encoding::DecoderTrap::Replace</code></td><td><code>decode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
567//! <tr><td><code>encoding::DecoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
568//! <tr><td><code>encoding::DecoderTrap::Call(DecoderTrapFunc)</code></td><td>Can be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
//! <tr><td><code>encoding::EncoderTrap::Strict</code></td><td><code>encode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Unmappable</code> result as fatal).</td></tr>
570//! <tr><td><code>encoding::EncoderTrap::Replace</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
571//! <tr><td><code>encoding::EncoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
572//! <tr><td><code>encoding::EncoderTrap::NcrEscape</code></td><td><code>encode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
573//! <tr><td><code>encoding::EncoderTrap::Call(EncoderTrapFunc)</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
574//! </tbody>
575//! </table>
576//!
577//! # Relationship with Windows Code Pages
578//!
//! Despite the Web and browser focus, the encodings defined by the Encoding
//! Standard and implemented by this crate may be useful for decoding legacy
//! data that uses Windows code pages. The following table names the
//! encodings that have a closely related Windows code page, the number of the
//! closest code page, a column indicating whether Windows maps unassigned
//! code points to the Unicode Private Use Area instead of U+FFFD and a remark
//! number indicating remarks in the list after the table.
587//!
588//! <table>
589//! <thead>
590//! <tr><th>Encoding</th><th>Code Page</th><th>PUA</th><th>Remarks</th></tr>
591//! </thead>
592//! <tbody>
593//! <tr><td>Shift_JIS</td><td>932</td><td></td><td></td></tr>
594//! <tr><td>GBK</td><td>936</td><td></td><td></td></tr>
595//! <tr><td>EUC-KR</td><td>949</td><td></td><td></td></tr>
596//! <tr><td>Big5</td><td>950</td><td></td><td></td></tr>
597//! <tr><td>IBM866</td><td>866</td><td></td><td></td></tr>
598//! <tr><td>windows-874</td><td>874</td><td>&bullet;</td><td></td></tr>
599//! <tr><td>UTF-16LE</td><td>1200</td><td></td><td></td></tr>
600//! <tr><td>UTF-16BE</td><td>1201</td><td></td><td></td></tr>
601//! <tr><td>windows-1250</td><td>1250</td><td></td><td></td></tr>
602//! <tr><td>windows-1251</td><td>1251</td><td></td><td></td></tr>
603//! <tr><td>windows-1252</td><td>1252</td><td></td><td></td></tr>
604//! <tr><td>windows-1253</td><td>1253</td><td>&bullet;</td><td></td></tr>
605//! <tr><td>windows-1254</td><td>1254</td><td></td><td></td></tr>
606//! <tr><td>windows-1255</td><td>1255</td><td>&bullet;</td><td></td></tr>
607//! <tr><td>windows-1256</td><td>1256</td><td></td><td></td></tr>
608//! <tr><td>windows-1257</td><td>1257</td><td>&bullet;</td><td></td></tr>
609//! <tr><td>windows-1258</td><td>1258</td><td></td><td></td></tr>
610//! <tr><td>macintosh</td><td>10000</td><td></td><td>1</td></tr>
611//! <tr><td>x-mac-cyrillic</td><td>10017</td><td></td><td>2</td></tr>
612//! <tr><td>KOI8-R</td><td>20866</td><td></td><td></td></tr>
613//! <tr><td>EUC-JP</td><td>20932</td><td></td><td></td></tr>
614//! <tr><td>KOI8-U</td><td>21866</td><td></td><td></td></tr>
615//! <tr><td>ISO-8859-2</td><td>28592</td><td></td><td></td></tr>
616//! <tr><td>ISO-8859-3</td><td>28593</td><td></td><td></td></tr>
617//! <tr><td>ISO-8859-4</td><td>28594</td><td></td><td></td></tr>
618//! <tr><td>ISO-8859-5</td><td>28595</td><td></td><td></td></tr>
619//! <tr><td>ISO-8859-6</td><td>28596</td><td>&bullet;</td><td></td></tr>
620//! <tr><td>ISO-8859-7</td><td>28597</td><td>&bullet;</td><td>3</td></tr>
621//! <tr><td>ISO-8859-8</td><td>28598</td><td>&bullet;</td><td>4</td></tr>
622//! <tr><td>ISO-8859-13</td><td>28603</td><td>&bullet;</td><td></td></tr>
623//! <tr><td>ISO-8859-15</td><td>28605</td><td></td><td></td></tr>
624//! <tr><td>ISO-8859-8-I</td><td>38598</td><td></td><td>5</td></tr>
625//! <tr><td>ISO-2022-JP</td><td>50220</td><td></td><td></td></tr>
626//! <tr><td>gb18030</td><td>54936</td><td></td><td></td></tr>
627//! <tr><td>UTF-8</td><td>65001</td><td></td><td></td></tr>
628//! </tbody>
629//! </table>
630//!
631//! 1. Windows decodes 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
632//! 2. Windows decodes 0xFF to U+00A4 CURRENCY SIGN instead of U+20AC EURO SIGN.
633//! 3. Windows decodes the currency signs at 0xA4 and 0xA5 as well as 0xAA,
634//!    which should be U+037A GREEK YPOGEGRAMMENI, to PUA code points. Windows
635//!    decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA instead of U+2018
636//!    LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER LETTER APOSTROPHE
637//!    instead of U+2019 RIGHT SINGLE QUOTATION MARK.
638//! 4. Windows decodes 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to PUA instead
639//!    of LRM and RLM.
640//! 5. Remarks from the previous item apply.
641//!
642//! The differences between this crate and Windows in the case of multibyte encodings
643//! are not yet fully documented here. The lack of remarks above should not be taken
644//! as indication of lack of differences.
645//!
646//! # Notable Differences from IANA Naming
647//!
648//! In some cases, the Encoding Standard specifies the popular unextended encoding
649//! name where in IANA terms one of the other labels would be more precise considering
650//! the extensions that the Encoding Standard has unified into the encoding.
651//!
652//! <table>
653//! <thead>
654//! <tr><th>Encoding</th><th>IANA</th></tr>
655//! </thead>
656//! <tbody>
657//! <tr><td>Big5</td><td>Big5-HKSCS</td></tr>
658//! <tr><td>EUC-KR</td><td>windows-949</td></tr>
659//! <tr><td>Shift_JIS</td><td>windows-31j</td></tr>
660//! <tr><td>x-mac-cyrillic</td><td>x-mac-ukrainian</td></tr>
661//! </tbody>
662//! </table>
663//!
664//! In other cases where the Encoding Standard unifies unextended and extended
665//! variants of an encoding, the encoding gets the name of the extended
666//! variant.
667//!
668//! <table>
669//! <thead>
670//! <tr><th>IANA</th><th>Unified into Encoding</th></tr>
671//! </thead>
672//! <tbody>
673//! <tr><td>ISO-8859-1</td><td>windows-1252</td></tr>
674//! <tr><td>ISO-8859-9</td><td>windows-1254</td></tr>
675//! <tr><td>TIS-620</td><td>windows-874</td></tr>
676//! </tbody>
677//! </table>
678//!
679//! See the section [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes)
680//! for discussion about the UTF-16 family.
681
682#![cfg_attr(feature = "simd-accel", feature(stdsimd, core_intrinsics))]
683
684#[macro_use]
685extern crate cfg_if;
686
687#[cfg(all(
688    feature = "simd-accel",
689    any(
690        target_feature = "sse2",
691        all(target_endian = "little", target_arch = "aarch64"),
692        all(target_endian = "little", target_feature = "neon")
693    )
694))]
695#[macro_use(shuffle)]
696extern crate packed_simd;
697
698#[cfg(feature = "serde")]
699extern crate serde;
700
701#[cfg(all(test, feature = "serde"))]
702extern crate bincode;
703#[cfg(all(test, feature = "serde"))]
704#[macro_use]
705extern crate serde_derive;
706#[cfg(all(test, feature = "serde"))]
707extern crate serde_json;
708
709#[macro_use]
710mod macros;
711
712#[cfg(all(
713    feature = "simd-accel",
714    any(
715        target_feature = "sse2",
716        all(target_endian = "little", target_arch = "aarch64"),
717        all(target_endian = "little", target_feature = "neon")
718    )
719))]
720mod simd_funcs;
721
722#[cfg(test)]
723mod testing;
724
725mod big5;
726mod euc_jp;
727mod euc_kr;
728mod gb18030;
729mod iso_2022_jp;
730mod replacement;
731mod shift_jis;
732mod single_byte;
733mod utf_16;
734mod utf_8;
735mod x_user_defined;
736
737mod ascii;
738mod data;
739mod handles;
740mod variant;
741
742pub mod mem;
743
744use crate::ascii::ascii_valid_up_to;
745use crate::ascii::iso_2022_jp_ascii_valid_up_to;
746use crate::utf_8::utf8_valid_up_to;
747use crate::variant::*;
748
749use std::borrow::Cow;
750use std::cmp::Ordering;
751use std::hash::Hash;
752use std::hash::Hasher;
753
754#[cfg(feature = "serde")]
755use serde::de::Visitor;
756#[cfg(feature = "serde")]
757use serde::{Deserialize, Deserializer, Serialize, Serializer};
758
/// The maximum length of a numeric character reference (NCR), i.e. the
/// length of `&#1114111;`.
///
/// This has to be the max length of an NCR instead of max
/// minus one, because we can't rely on getting the minus
/// one from the space reserved for the current unmappable,
/// because the ISO-2022-JP encoder can fill up that space
/// with a state transition escape.
const NCR_EXTRA: usize = 10; // &#1114111;
765
766// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
767// Instead, please regenerate using generate-encoding-data.py
768
/// Length of the longest label, `cseucpkdfmtjapanese`.
const LONGEST_LABEL_LENGTH: usize = 19; // cseucpkdfmtjapanese
770
771/// The initializer for the [Big5](static.BIG5.html) encoding.
772///
773/// For use only for taking the address of this form when
774/// Rust prohibits the use of the non-`_INIT` form directly,
775/// such as in initializers of other `static`s. If in doubt,
776/// use the corresponding non-`_INIT` reference-typed `static`.
777///
778/// This part of the public API will go away if Rust changes
779/// to make the referent of `pub const FOO: &'static Encoding`
780/// unique cross-crate or if Rust starts allowing static arrays
781/// to be initialized with `pub static FOO: &'static Encoding`
782/// items.
783pub static BIG5_INIT: Encoding = Encoding {
784    name: "Big5",
785    variant: VariantEncoding::Big5,
786};
787
788/// The Big5 encoding.
789///
790/// This is Big5 with HKSCS with mappings to more recent Unicode assignments
791/// instead of the Private Use Area code points that have been used historically.
792/// It is believed to be able to decode existing Web content in a way that makes
793/// sense.
794///
795/// To avoid form submissions generating data that Web servers don't understand,
796/// the encoder doesn't use the HKSCS byte sequences that precede the unextended
797/// Big5 in the lexical order.
798///
799/// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
800/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
801///
802/// This encoding is designed to be suited for decoding the Windows code page 950
803/// and its HKSCS patched "951" variant such that the text makes sense, given
804/// assignments that Unicode has made after those encodings used Private Use
805/// Area characters.
806///
807/// This will change from `static` to `const` if Rust changes
808/// to make the referent of `pub const FOO: &'static Encoding`
809/// unique cross-crate, so don't take the address of this
810/// `static`.
811pub static BIG5: &'static Encoding = &BIG5_INIT;
812
813/// The initializer for the [EUC-JP](static.EUC_JP.html) encoding.
814///
815/// For use only for taking the address of this form when
816/// Rust prohibits the use of the non-`_INIT` form directly,
817/// such as in initializers of other `static`s. If in doubt,
818/// use the corresponding non-`_INIT` reference-typed `static`.
819///
820/// This part of the public API will go away if Rust changes
821/// to make the referent of `pub const FOO: &'static Encoding`
822/// unique cross-crate or if Rust starts allowing static arrays
823/// to be initialized with `pub static FOO: &'static Encoding`
824/// items.
825pub static EUC_JP_INIT: Encoding = Encoding {
826    name: "EUC-JP",
827    variant: VariantEncoding::EucJp,
828};
829
830/// The EUC-JP encoding.
831///
832/// This is the legacy Unix encoding for Japanese.
833///
834/// For compatibility with Web servers that don't expect three-byte sequences
835/// in form submissions, the encoder doesn't generate three-byte sequences.
836/// That is, the JIS X 0212 support is decode-only.
837///
838/// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
839/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
840///
841/// This encoding roughly matches the Windows code page 20932. There are error
842/// handling differences and a handful of 2-byte sequences that decode differently.
843/// Additionall, Windows doesn't support 3-byte sequences.
844///
845/// This will change from `static` to `const` if Rust changes
846/// to make the referent of `pub const FOO: &'static Encoding`
847/// unique cross-crate, so don't take the address of this
848/// `static`.
849pub static EUC_JP: &'static Encoding = &EUC_JP_INIT;
850
851/// The initializer for the [EUC-KR](static.EUC_KR.html) encoding.
852///
853/// For use only for taking the address of this form when
854/// Rust prohibits the use of the non-`_INIT` form directly,
855/// such as in initializers of other `static`s. If in doubt,
856/// use the corresponding non-`_INIT` reference-typed `static`.
857///
858/// This part of the public API will go away if Rust changes
859/// to make the referent of `pub const FOO: &'static Encoding`
860/// unique cross-crate or if Rust starts allowing static arrays
861/// to be initialized with `pub static FOO: &'static Encoding`
862/// items.
863pub static EUC_KR_INIT: Encoding = Encoding {
864    name: "EUC-KR",
865    variant: VariantEncoding::EucKr,
866};
867
868/// The EUC-KR encoding.
869///
870/// This is the Korean encoding for Windows. It extends the Unix legacy encoding
871/// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
872/// Classic), with all the characters from the Hangul Syllables block of Unicode.
873///
874/// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
875/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
876///
877/// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
878/// to U+0080 and some byte sequences that are error per the Encoding Standard to
879/// the question mark or the Private Use Area.
880///
881/// This will change from `static` to `const` if Rust changes
882/// to make the referent of `pub const FOO: &'static Encoding`
883/// unique cross-crate, so don't take the address of this
884/// `static`.
885pub static EUC_KR: &'static Encoding = &EUC_KR_INIT;
886
887/// The initializer for the [GBK](static.GBK.html) encoding.
888///
889/// For use only for taking the address of this form when
890/// Rust prohibits the use of the non-`_INIT` form directly,
891/// such as in initializers of other `static`s. If in doubt,
892/// use the corresponding non-`_INIT` reference-typed `static`.
893///
894/// This part of the public API will go away if Rust changes
895/// to make the referent of `pub const FOO: &'static Encoding`
896/// unique cross-crate or if Rust starts allowing static arrays
897/// to be initialized with `pub static FOO: &'static Encoding`
898/// items.
899pub static GBK_INIT: Encoding = Encoding {
900    name: "GBK",
901    variant: VariantEncoding::Gbk,
902};
903
904/// The GBK encoding.
905///
906/// The decoder for this encoding is the same as the decoder for gb18030.
907/// The encoder side of this encoding is GBK with Windows code page 936 euro
908/// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs
909/// Unicode block as well as a handful of ideographs from the CJK Unified
910/// Ideographs Extension A and CJK Compatibility Ideographs blocks.
911///
912/// Unlike e.g. in the case of ISO-8859-1 and windows-1252, GBK encoder wasn't
913/// unified with the gb18030 encoder in the Encoding Standard out of concern
914/// that servers that expect GBK form submissions might not be able to handle
915/// the four-byte sequences.
916///
917/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
918/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
919///
920/// The encoder of this encoding roughly matches the Windows code page 936.
921/// The decoder side is a superset.
922///
923/// This will change from `static` to `const` if Rust changes
924/// to make the referent of `pub const FOO: &'static Encoding`
925/// unique cross-crate, so don't take the address of this
926/// `static`.
927pub static GBK: &'static Encoding = &GBK_INIT;
928
929/// The initializer for the [IBM866](static.IBM866.html) encoding.
930///
931/// For use only for taking the address of this form when
932/// Rust prohibits the use of the non-`_INIT` form directly,
933/// such as in initializers of other `static`s. If in doubt,
934/// use the corresponding non-`_INIT` reference-typed `static`.
935///
936/// This part of the public API will go away if Rust changes
937/// to make the referent of `pub const FOO: &'static Encoding`
938/// unique cross-crate or if Rust starts allowing static arrays
939/// to be initialized with `pub static FOO: &'static Encoding`
940/// items.
941pub static IBM866_INIT: Encoding = Encoding {
942    name: "IBM866",
943    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.ibm866, 0x0440, 96, 16),
944};
945
946/// The IBM866 encoding.
947///
948/// This the most notable one of the DOS Cyrillic code pages. It has the same
949/// box drawing characters as code page 437, so it can be used for decoding
950/// DOS-era ASCII + box drawing data.
951///
952/// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
953/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
954///
955/// This encoding matches the Windows code page 866.
956///
957/// This will change from `static` to `const` if Rust changes
958/// to make the referent of `pub const FOO: &'static Encoding`
959/// unique cross-crate, so don't take the address of this
960/// `static`.
961pub static IBM866: &'static Encoding = &IBM866_INIT;
962
963/// The initializer for the [ISO-2022-JP](static.ISO_2022_JP.html) encoding.
964///
965/// For use only for taking the address of this form when
966/// Rust prohibits the use of the non-`_INIT` form directly,
967/// such as in initializers of other `static`s. If in doubt,
968/// use the corresponding non-`_INIT` reference-typed `static`.
969///
970/// This part of the public API will go away if Rust changes
971/// to make the referent of `pub const FOO: &'static Encoding`
972/// unique cross-crate or if Rust starts allowing static arrays
973/// to be initialized with `pub static FOO: &'static Encoding`
974/// items.
975pub static ISO_2022_JP_INIT: Encoding = Encoding {
976    name: "ISO-2022-JP",
977    variant: VariantEncoding::Iso2022Jp,
978};
979
980/// The ISO-2022-JP encoding.
981///
982/// This the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
983/// byte range to encode non-Basic Latin characters. It's the only encoding
984/// supported by this crate whose encoder is stateful.
985///
986/// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
987/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
988///
989/// This encoding roughly matches the Windows code page 50220. Notably, Windows
990/// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
991/// error handling.
992///
993/// This will change from `static` to `const` if Rust changes
994/// to make the referent of `pub const FOO: &'static Encoding`
995/// unique cross-crate, so don't take the address of this
996/// `static`.
997pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT;
998
999/// The initializer for the [ISO-8859-10](static.ISO_8859_10.html) encoding.
1000///
1001/// For use only for taking the address of this form when
1002/// Rust prohibits the use of the non-`_INIT` form directly,
1003/// such as in initializers of other `static`s. If in doubt,
1004/// use the corresponding non-`_INIT` reference-typed `static`.
1005///
1006/// This part of the public API will go away if Rust changes
1007/// to make the referent of `pub const FOO: &'static Encoding`
1008/// unique cross-crate or if Rust starts allowing static arrays
1009/// to be initialized with `pub static FOO: &'static Encoding`
1010/// items.
1011pub static ISO_8859_10_INIT: Encoding = Encoding {
1012    name: "ISO-8859-10",
1013    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_10, 0x00DA, 90, 6),
1014};
1015
1016/// The ISO-8859-10 encoding.
1017///
1018/// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
1019/// is also known as Latin 6.
1020///
1021/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
1022/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
1023///
1024/// The Windows code page number for this encoding is 28600, but kernel32.dll
1025/// does not support this encoding.
1026///
1027/// This will change from `static` to `const` if Rust changes
1028/// to make the referent of `pub const FOO: &'static Encoding`
1029/// unique cross-crate, so don't take the address of this
1030/// `static`.
1031pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT;
1032
1033/// The initializer for the [ISO-8859-13](static.ISO_8859_13.html) encoding.
1034///
1035/// For use only for taking the address of this form when
1036/// Rust prohibits the use of the non-`_INIT` form directly,
1037/// such as in initializers of other `static`s. If in doubt,
1038/// use the corresponding non-`_INIT` reference-typed `static`.
1039///
1040/// This part of the public API will go away if Rust changes
1041/// to make the referent of `pub const FOO: &'static Encoding`
1042/// unique cross-crate or if Rust starts allowing static arrays
1043/// to be initialized with `pub static FOO: &'static Encoding`
1044/// items.
1045pub static ISO_8859_13_INIT: Encoding = Encoding {
1046    name: "ISO-8859-13",
1047    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_13, 0x00DF, 95, 1),
1048};
1049
1050/// The ISO-8859-13 encoding.
1051///
1052/// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
1053/// is also known as Latin 7.
1054///
1055/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
1056/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
1057///
1058/// This encoding matches the Windows code page 28603, except Windows decodes
1059/// unassigned code points to the Private Use Area of Unicode.
1060///
1061/// This will change from `static` to `const` if Rust changes
1062/// to make the referent of `pub const FOO: &'static Encoding`
1063/// unique cross-crate, so don't take the address of this
1064/// `static`.
1065pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT;
1066
1067/// The initializer for the [ISO-8859-14](static.ISO_8859_14.html) encoding.
1068///
1069/// For use only for taking the address of this form when
1070/// Rust prohibits the use of the non-`_INIT` form directly,
1071/// such as in initializers of other `static`s. If in doubt,
1072/// use the corresponding non-`_INIT` reference-typed `static`.
1073///
1074/// This part of the public API will go away if Rust changes
1075/// to make the referent of `pub const FOO: &'static Encoding`
1076/// unique cross-crate or if Rust starts allowing static arrays
1077/// to be initialized with `pub static FOO: &'static Encoding`
1078/// items.
1079pub static ISO_8859_14_INIT: Encoding = Encoding {
1080    name: "ISO-8859-14",
1081    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_14, 0x00DF, 95, 17),
1082};
1083
1084/// The ISO-8859-14 encoding.
1085///
1086/// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
1087/// is also known as Latin 8.
1088///
1089/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
1090/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
1091///
1092/// The Windows code page number for this encoding is 28604, but kernel32.dll
1093/// does not support this encoding.
1094///
1095/// This will change from `static` to `const` if Rust changes
1096/// to make the referent of `pub const FOO: &'static Encoding`
1097/// unique cross-crate, so don't take the address of this
1098/// `static`.
1099pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT;
1100
1101/// The initializer for the [ISO-8859-15](static.ISO_8859_15.html) encoding.
1102///
1103/// For use only for taking the address of this form when
1104/// Rust prohibits the use of the non-`_INIT` form directly,
1105/// such as in initializers of other `static`s. If in doubt,
1106/// use the corresponding non-`_INIT` reference-typed `static`.
1107///
1108/// This part of the public API will go away if Rust changes
1109/// to make the referent of `pub const FOO: &'static Encoding`
1110/// unique cross-crate or if Rust starts allowing static arrays
1111/// to be initialized with `pub static FOO: &'static Encoding`
1112/// items.
1113pub static ISO_8859_15_INIT: Encoding = Encoding {
1114    name: "ISO-8859-15",
1115    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_15, 0x00BF, 63, 65),
1116};
1117
1118/// The ISO-8859-15 encoding.
1119///
1120/// This is the revised Western European part of the ISO/IEC 8859 encoding
1121/// family. This encoding is also known as Latin 9.
1122///
1123/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
1124/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
1125///
1126/// This encoding matches the Windows code page 28605.
1127///
1128/// This will change from `static` to `const` if Rust changes
1129/// to make the referent of `pub const FOO: &'static Encoding`
1130/// unique cross-crate, so don't take the address of this
1131/// `static`.
1132pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT;
1133
1134/// The initializer for the [ISO-8859-16](static.ISO_8859_16.html) encoding.
1135///
1136/// For use only for taking the address of this form when
1137/// Rust prohibits the use of the non-`_INIT` form directly,
1138/// such as in initializers of other `static`s. If in doubt,
1139/// use the corresponding non-`_INIT` reference-typed `static`.
1140///
1141/// This part of the public API will go away if Rust changes
1142/// to make the referent of `pub const FOO: &'static Encoding`
1143/// unique cross-crate or if Rust starts allowing static arrays
1144/// to be initialized with `pub static FOO: &'static Encoding`
1145/// items.
1146pub static ISO_8859_16_INIT: Encoding = Encoding {
1147    name: "ISO-8859-16",
1148    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_16, 0x00DF, 95, 4),
1149};
1150
1151/// The ISO-8859-16 encoding.
1152///
1153/// This is the South-Eastern European part of the ISO/IEC 8859 encoding
1154/// family. This encoding is also known as Latin 10.
1155///
1156/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
1157/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
1158///
1159/// The Windows code page number for this encoding is 28606, but kernel32.dll
1160/// does not support this encoding.
1161///
1162/// This will change from `static` to `const` if Rust changes
1163/// to make the referent of `pub const FOO: &'static Encoding`
1164/// unique cross-crate, so don't take the address of this
1165/// `static`.
1166pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT;
1167
1168/// The initializer for the [ISO-8859-2](static.ISO_8859_2.html) encoding.
1169///
1170/// Use this only to take the address of the `Encoding` itself when
1171/// Rust prohibits the use of the non-`_INIT` form directly,
1172/// such as in initializers of other `static`s. If in doubt,
1173/// use the reference-typed `static` `ISO_8859_2` instead.
1174///
1175/// This part of the public API will go away if Rust changes
1176/// to make the referent of `pub const FOO: &'static Encoding`
1177/// unique cross-crate or if Rust starts allowing static arrays
1178/// to be initialized with `pub static FOO: &'static Encoding`
1179/// items.
1180pub static ISO_8859_2_INIT: Encoding = Encoding {
1181    name: "ISO-8859-2",
    // NOTE(review): the trailing arguments look like (first code point, table
    // index, length) of a contiguous run; confirm in `VariantEncoding::SingleByte`.
1182    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_2, 0x00DF, 95, 1),
1183};
1184
1185/// The ISO-8859-2 encoding.
1186///
1187/// This is the Central European part of the ISO/IEC 8859 encoding
/// family. This encoding is also known as Latin 2.
1188///
1189/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
1190/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
1191///
1192/// This encoding matches the Windows code page 28592.
1193///
1194/// This will change from `static` to `const` if Rust changes
1195/// to make the referent of `pub const FOO: &'static Encoding`
1196/// unique cross-crate, so don't take the address of this
1197/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`ISO_8859_2_INIT`](static.ISO_8859_2_INIT.html) instead.
1198pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT;
1199
1200/// The initializer for the [ISO-8859-3](static.ISO_8859_3.html) encoding.
1201///
1202/// Use this only to take the address of the `Encoding` itself when
1203/// Rust prohibits the use of the non-`_INIT` form directly,
1204/// such as in initializers of other `static`s. If in doubt,
1205/// use the reference-typed `static` `ISO_8859_3` instead.
1206///
1207/// This part of the public API will go away if Rust changes
1208/// to make the referent of `pub const FOO: &'static Encoding`
1209/// unique cross-crate or if Rust starts allowing static arrays
1210/// to be initialized with `pub static FOO: &'static Encoding`
1211/// items.
1212pub static ISO_8859_3_INIT: Encoding = Encoding {
1213    name: "ISO-8859-3",
    // NOTE(review): the trailing arguments look like (first code point, table
    // index, length) of a contiguous run; confirm in `VariantEncoding::SingleByte`.
1214    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_3, 0x00DF, 95, 4),
1215};
1216
1217/// The ISO-8859-3 encoding.
1218///
1219/// This is the South European part of the ISO/IEC 8859 encoding
/// family. This encoding is also known as Latin 3.
1220///
1221/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
1222/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
1223///
1224/// This encoding matches the Windows code page 28593.
1225///
1226/// This will change from `static` to `const` if Rust changes
1227/// to make the referent of `pub const FOO: &'static Encoding`
1228/// unique cross-crate, so don't take the address of this
1229/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`ISO_8859_3_INIT`](static.ISO_8859_3_INIT.html) instead.
1230pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT;
1231
1232/// The initializer for the [ISO-8859-4](static.ISO_8859_4.html) encoding.
1233///
1234/// Use this only to take the address of the `Encoding` itself when
1235/// Rust prohibits the use of the non-`_INIT` form directly,
1236/// such as in initializers of other `static`s. If in doubt,
1237/// use the reference-typed `static` `ISO_8859_4` instead.
1238///
1239/// This part of the public API will go away if Rust changes
1240/// to make the referent of `pub const FOO: &'static Encoding`
1241/// unique cross-crate or if Rust starts allowing static arrays
1242/// to be initialized with `pub static FOO: &'static Encoding`
1243/// items.
1244pub static ISO_8859_4_INIT: Encoding = Encoding {
1245    name: "ISO-8859-4",
    // NOTE(review): the trailing arguments look like (first code point, table
    // index, length) of a contiguous run; confirm in `VariantEncoding::SingleByte`.
1246    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_4, 0x00DF, 95, 1),
1247};
1248
1249/// The ISO-8859-4 encoding.
1250///
1251/// This is the North European part of the ISO/IEC 8859 encoding
/// family. This encoding is also known as Latin 4.
1252///
1253/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
1254/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
1255///
1256/// This encoding matches the Windows code page 28594.
1257///
1258/// This will change from `static` to `const` if Rust changes
1259/// to make the referent of `pub const FOO: &'static Encoding`
1260/// unique cross-crate, so don't take the address of this
1261/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`ISO_8859_4_INIT`](static.ISO_8859_4_INIT.html) instead.
1262pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT;
1263
1264/// The initializer for the [ISO-8859-5](static.ISO_8859_5.html) encoding.
1265///
1266/// Use this only to take the address of the `Encoding` itself when
1267/// Rust prohibits the use of the non-`_INIT` form directly,
1268/// such as in initializers of other `static`s. If in doubt,
1269/// use the reference-typed `static` `ISO_8859_5` instead.
1270///
1271/// This part of the public API will go away if Rust changes
1272/// to make the referent of `pub const FOO: &'static Encoding`
1273/// unique cross-crate or if Rust starts allowing static arrays
1274/// to be initialized with `pub static FOO: &'static Encoding`
1275/// items.
1276pub static ISO_8859_5_INIT: Encoding = Encoding {
1277    name: "ISO-8859-5",
    // NOTE(review): the trailing arguments look like (first code point, table
    // index, length) of a contiguous run; confirm in `VariantEncoding::SingleByte`.
1278    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_5, 0x040E, 46, 66),
1279};
1280
1281/// The ISO-8859-5 encoding.
1282///
1283/// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
1284///
1285/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
1286/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
1287///
1288/// This encoding matches the Windows code page 28595.
1289///
1290/// This will change from `static` to `const` if Rust changes
1291/// to make the referent of `pub const FOO: &'static Encoding`
1292/// unique cross-crate, so don't take the address of this
1293/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`ISO_8859_5_INIT`](static.ISO_8859_5_INIT.html) instead.
1294pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT;
1295
1296/// The initializer for the [ISO-8859-6](static.ISO_8859_6.html) encoding.
1297///
1298/// Use this only to take the address of the `Encoding` itself when
1299/// Rust prohibits the use of the non-`_INIT` form directly,
1300/// such as in initializers of other `static`s. If in doubt,
1301/// use the reference-typed `static` `ISO_8859_6` instead.
1302///
1303/// This part of the public API will go away if Rust changes
1304/// to make the referent of `pub const FOO: &'static Encoding`
1305/// unique cross-crate or if Rust starts allowing static arrays
1306/// to be initialized with `pub static FOO: &'static Encoding`
1307/// items.
1308pub static ISO_8859_6_INIT: Encoding = Encoding {
1309    name: "ISO-8859-6",
    // NOTE(review): the trailing arguments look like (first code point, table
    // index, length) of a contiguous run; confirm in `VariantEncoding::SingleByte`.
1310    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_6, 0x0621, 65, 26),
1311};
1312
1313/// The ISO-8859-6 encoding.
1314///
1315/// This is the Arabic part of the ISO/IEC 8859 encoding family.
1316///
1317/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
1318/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
1319///
1320/// This encoding matches the Windows code page 28596, except Windows decodes
1321/// unassigned code points to the Private Use Area of Unicode.
1322///
1323/// This will change from `static` to `const` if Rust changes
1324/// to make the referent of `pub const FOO: &'static Encoding`
1325/// unique cross-crate, so don't take the address of this
1326/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`ISO_8859_6_INIT`](static.ISO_8859_6_INIT.html) instead.
1327pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT;
1328
1329/// The initializer for the [ISO-8859-7](static.ISO_8859_7.html) encoding.
1330///
1331/// Use this only to take the address of the `Encoding` itself when
1332/// Rust prohibits the use of the non-`_INIT` form directly,
1333/// such as in initializers of other `static`s. If in doubt,
1334/// use the reference-typed `static` `ISO_8859_7` instead.
1335///
1336/// This part of the public API will go away if Rust changes
1337/// to make the referent of `pub const FOO: &'static Encoding`
1338/// unique cross-crate or if Rust starts allowing static arrays
1339/// to be initialized with `pub static FOO: &'static Encoding`
1340/// items.
1341pub static ISO_8859_7_INIT: Encoding = Encoding {
1342    name: "ISO-8859-7",
    // NOTE(review): the trailing arguments look like (first code point, table
    // index, length) of a contiguous run; confirm in `VariantEncoding::SingleByte`.
1343    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_7, 0x03A3, 83, 44),
1344};
1345
1346/// The ISO-8859-7 encoding.
1347///
1348/// This is the Greek part of the ISO/IEC 8859 encoding family.
1349///
1350/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
1351/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
1352///
1353/// This encoding roughly matches the Windows code page 28597. Windows decodes
1354/// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
1355/// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
1356/// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
1357/// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
1358/// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.
1359///
1360/// This will change from `static` to `const` if Rust changes
1361/// to make the referent of `pub const FOO: &'static Encoding`
1362/// unique cross-crate, so don't take the address of this
1363/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`ISO_8859_7_INIT`](static.ISO_8859_7_INIT.html) instead.
1364pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT;
1365
1366/// The initializer for the [ISO-8859-8](static.ISO_8859_8.html) encoding.
1367///
1368/// Use this only to take the address of the `Encoding` itself when
1369/// Rust prohibits the use of the non-`_INIT` form directly,
1370/// such as in initializers of other `static`s. If in doubt,
1371/// use the reference-typed `static` `ISO_8859_8` instead.
1372///
1373/// This part of the public API will go away if Rust changes
1374/// to make the referent of `pub const FOO: &'static Encoding`
1375/// unique cross-crate or if Rust starts allowing static arrays
1376/// to be initialized with `pub static FOO: &'static Encoding`
1377/// items.
1378pub static ISO_8859_8_INIT: Encoding = Encoding {
1379    name: "ISO-8859-8",
    // NOTE(review): the trailing arguments look like (first code point, table
    // index, length) of a contiguous run; confirm in `VariantEncoding::SingleByte`.
1380    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
1381};
1382
1383/// The ISO-8859-8 encoding.
1384///
1385/// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
1386///
1387/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1388/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1389///
1390/// This encoding roughly matches the Windows code page 28598. Windows decodes
1391/// 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to the Private Use
1392/// Area instead of LRM and RLM. Windows decodes unassigned code points to
1393/// the Private Use Area.
1394///
1395/// This will change from `static` to `const` if Rust changes
1396/// to make the referent of `pub const FOO: &'static Encoding`
1397/// unique cross-crate, so don't take the address of this
1398/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`ISO_8859_8_INIT`](static.ISO_8859_8_INIT.html) instead.
1399pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT;
1400
1401/// The initializer for the [ISO-8859-8-I](static.ISO_8859_8_I.html) encoding.
1402///
1403/// Use this only to take the address of the `Encoding` itself when
1404/// Rust prohibits the use of the non-`_INIT` form directly,
1405/// such as in initializers of other `static`s. If in doubt,
1406/// use the reference-typed `static` `ISO_8859_8_I` instead.
1407///
1408/// This part of the public API will go away if Rust changes
1409/// to make the referent of `pub const FOO: &'static Encoding`
1410/// unique cross-crate or if Rust starts allowing static arrays
1411/// to be initialized with `pub static FOO: &'static Encoding`
1412/// items.
1413pub static ISO_8859_8_I_INIT: Encoding = Encoding {
1414    name: "ISO-8859-8-I",
    // Uses the same `iso_8859_8` table and trailing arguments as
    // `ISO_8859_8_INIT`; only `name` differs between the two initializers.
    // NOTE(review): the trailing arguments look like (first code point, table
    // index, length) of a contiguous run; confirm in `VariantEncoding::SingleByte`.
1415    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
1416};
1417
1418/// The ISO-8859-8-I encoding.
1419///
1420/// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
1421///
1422/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1423/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1424///
1425/// This encoding roughly matches the Windows code page 38598. Windows decodes
1426/// 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to the Private Use
1427/// Area instead of LRM and RLM. Windows decodes unassigned code points to
1428/// the Private Use Area.
1429///
1430/// This will change from `static` to `const` if Rust changes
1431/// to make the referent of `pub const FOO: &'static Encoding`
1432/// unique cross-crate, so don't take the address of this
1433/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`ISO_8859_8_I_INIT`](static.ISO_8859_8_I_INIT.html) instead.
1434pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT;
1435
1436/// The initializer for the [KOI8-R](static.KOI8_R.html) encoding.
1437///
1438/// Use this only to take the address of the `Encoding` itself when
1439/// Rust prohibits the use of the non-`_INIT` form directly,
1440/// such as in initializers of other `static`s. If in doubt,
1441/// use the reference-typed `static` `KOI8_R` instead.
1442///
1443/// This part of the public API will go away if Rust changes
1444/// to make the referent of `pub const FOO: &'static Encoding`
1445/// unique cross-crate or if Rust starts allowing static arrays
1446/// to be initialized with `pub static FOO: &'static Encoding`
1447/// items.
1448pub static KOI8_R_INIT: Encoding = Encoding {
1449    name: "KOI8-R",
    // NOTE(review): the trailing arguments look like (first code point, table
    // index, length) of a contiguous run; confirm in `VariantEncoding::SingleByte`.
1450    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_r, 0x044E, 64, 1),
1451};
1452
1453/// The KOI8-R encoding.
1454///
1455/// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
1456///
1457/// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
1458/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
1459///
1460/// This encoding matches the Windows code page 20866.
1461///
1462/// This will change from `static` to `const` if Rust changes
1463/// to make the referent of `pub const FOO: &'static Encoding`
1464/// unique cross-crate, so don't take the address of this
1465/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`KOI8_R_INIT`](static.KOI8_R_INIT.html) instead.
1466pub static KOI8_R: &'static Encoding = &KOI8_R_INIT;
1467
1468/// The initializer for the [KOI8-U](static.KOI8_U.html) encoding.
1469///
1470/// Use this only to take the address of the `Encoding` itself when
1471/// Rust prohibits the use of the non-`_INIT` form directly,
1472/// such as in initializers of other `static`s. If in doubt,
1473/// use the reference-typed `static` `KOI8_U` instead.
1474///
1475/// This part of the public API will go away if Rust changes
1476/// to make the referent of `pub const FOO: &'static Encoding`
1477/// unique cross-crate or if Rust starts allowing static arrays
1478/// to be initialized with `pub static FOO: &'static Encoding`
1479/// items.
1480pub static KOI8_U_INIT: Encoding = Encoding {
1481    name: "KOI8-U",
    // NOTE(review): the trailing arguments look like (first code point, table
    // index, length) of a contiguous run; confirm in `VariantEncoding::SingleByte`.
1482    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_u, 0x044E, 64, 1),
1483};
1484
1485/// The KOI8-U encoding.
1486///
1487/// This is an encoding for Ukrainian adapted from KOI8-R.
1488///
1489/// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
1490/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
1491///
1492/// This encoding matches the Windows code page 21866.
1493///
1494/// This will change from `static` to `const` if Rust changes
1495/// to make the referent of `pub const FOO: &'static Encoding`
1496/// unique cross-crate, so don't take the address of this
1497/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`KOI8_U_INIT`](static.KOI8_U_INIT.html) instead.
1498pub static KOI8_U: &'static Encoding = &KOI8_U_INIT;
1499
1500/// The initializer for the [Shift_JIS](static.SHIFT_JIS.html) encoding.
1501///
1502/// Use this only to take the address of the `Encoding` itself when
1503/// Rust prohibits the use of the non-`_INIT` form directly,
1504/// such as in initializers of other `static`s. If in doubt,
1505/// use the reference-typed `static` `SHIFT_JIS` instead.
1506///
1507/// This part of the public API will go away if Rust changes
1508/// to make the referent of `pub const FOO: &'static Encoding`
1509/// unique cross-crate or if Rust starts allowing static arrays
1510/// to be initialized with `pub static FOO: &'static Encoding`
1511/// items.
1512pub static SHIFT_JIS_INIT: Encoding = Encoding {
1513    name: "Shift_JIS",
1514    variant: VariantEncoding::ShiftJis,
1515};
1516
1517/// The Shift_JIS encoding.
1518///
1519/// This is the Japanese encoding for Windows.
1520///
1521/// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
1522/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
1523///
1524/// This encoding matches the Windows code page 932, except Windows decodes some byte
1525/// sequences that are errors per the Encoding Standard to the question mark or the
1526/// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER.
1527///
1528/// This will change from `static` to `const` if Rust changes
1529/// to make the referent of `pub const FOO: &'static Encoding`
1530/// unique cross-crate, so don't take the address of this
1531/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`SHIFT_JIS_INIT`](static.SHIFT_JIS_INIT.html) instead.
1532pub static SHIFT_JIS: &'static Encoding = &SHIFT_JIS_INIT;
1533
1534/// The initializer for the [UTF-16BE](static.UTF_16BE.html) encoding.
1535///
1536/// Use this only to take the address of the `Encoding` itself when
1537/// Rust prohibits the use of the non-`_INIT` form directly,
1538/// such as in initializers of other `static`s. If in doubt,
1539/// use the reference-typed `static` `UTF_16BE` instead.
1540///
1541/// This part of the public API will go away if Rust changes
1542/// to make the referent of `pub const FOO: &'static Encoding`
1543/// unique cross-crate or if Rust starts allowing static arrays
1544/// to be initialized with `pub static FOO: &'static Encoding`
1545/// items.
1546pub static UTF_16BE_INIT: Encoding = Encoding {
1547    name: "UTF-16BE",
1548    variant: VariantEncoding::Utf16Be,
1549};
1550
1551/// The UTF-16BE encoding.
1552///
1553/// This decode-only encoding uses 16-bit code units due to Unicode originally
1554/// having been designed as a 16-bit repertoire. In the absence of a byte order
1555/// mark, the big endian byte order is assumed.
1556///
1557/// There is no corresponding encoder in this crate or in the Encoding
1558/// Standard. The output encoding of this encoding is UTF-8.
1559///
1560/// This encoding matches the Windows code page 1201.
1561///
1562/// This will change from `static` to `const` if Rust changes
1563/// to make the referent of `pub const FOO: &'static Encoding`
1564/// unique cross-crate, so don't take the address of this
1565/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`UTF_16BE_INIT`](static.UTF_16BE_INIT.html) instead.
1566pub static UTF_16BE: &'static Encoding = &UTF_16BE_INIT;
1567
1568/// The initializer for the [UTF-16LE](static.UTF_16LE.html) encoding.
1569///
1570/// Use this only to take the address of the `Encoding` itself when
1571/// Rust prohibits the use of the non-`_INIT` form directly,
1572/// such as in initializers of other `static`s. If in doubt,
1573/// use the reference-typed `static` `UTF_16LE` instead.
1574///
1575/// This part of the public API will go away if Rust changes
1576/// to make the referent of `pub const FOO: &'static Encoding`
1577/// unique cross-crate or if Rust starts allowing static arrays
1578/// to be initialized with `pub static FOO: &'static Encoding`
1579/// items.
1580pub static UTF_16LE_INIT: Encoding = Encoding {
1581    name: "UTF-16LE",
1582    variant: VariantEncoding::Utf16Le,
1583};
1584
1585/// The UTF-16LE encoding.
1586///
1587/// This decode-only encoding uses 16-bit code units due to Unicode originally
1588/// having been designed as a 16-bit repertoire. In the absence of a byte order
1589/// mark, the little endian byte order is assumed.
1590///
1591/// There is no corresponding encoder in this crate or in the Encoding
1592/// Standard. The output encoding of this encoding is UTF-8.
1593///
1594/// This encoding matches the Windows code page 1200.
1595///
1596/// This will change from `static` to `const` if Rust changes
1597/// to make the referent of `pub const FOO: &'static Encoding`
1598/// unique cross-crate, so don't take the address of this
1599/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`UTF_16LE_INIT`](static.UTF_16LE_INIT.html) instead.
1600pub static UTF_16LE: &'static Encoding = &UTF_16LE_INIT;
1601
1602/// The initializer for the [UTF-8](static.UTF_8.html) encoding.
1603///
1604/// Use this only to take the address of the `Encoding` itself when
1605/// Rust prohibits the use of the non-`_INIT` form directly,
1606/// such as in initializers of other `static`s. If in doubt,
1607/// use the reference-typed `static` `UTF_8` instead.
1608///
1609/// This part of the public API will go away if Rust changes
1610/// to make the referent of `pub const FOO: &'static Encoding`
1611/// unique cross-crate or if Rust starts allowing static arrays
1612/// to be initialized with `pub static FOO: &'static Encoding`
1613/// items.
1614pub static UTF_8_INIT: Encoding = Encoding {
1615    name: "UTF-8",
1616    variant: VariantEncoding::Utf8,
1617};
1618
1619/// The UTF-8 encoding.
1620///
1621/// This is the encoding that should be used for all new development; it can
1622/// represent all of Unicode.
1623///
1624/// This encoding matches the Windows code page 65001, except Windows differs
1625/// in the number of errors generated for some erroneous byte sequences.
1626///
1627/// This will change from `static` to `const` if Rust changes
1628/// to make the referent of `pub const FOO: &'static Encoding`
1629/// unique cross-crate, so don't take the address of this
1630/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`UTF_8_INIT`](static.UTF_8_INIT.html) instead.
1631pub static UTF_8: &'static Encoding = &UTF_8_INIT;
1632
1633/// The initializer for the [gb18030](static.GB18030.html) encoding.
1634///
1635/// Use this only to take the address of the `Encoding` itself when
1636/// Rust prohibits the use of the non-`_INIT` form directly,
1637/// such as in initializers of other `static`s. If in doubt,
1638/// use the reference-typed `static` `GB18030` instead.
1639///
1640/// This part of the public API will go away if Rust changes
1641/// to make the referent of `pub const FOO: &'static Encoding`
1642/// unique cross-crate or if Rust starts allowing static arrays
1643/// to be initialized with `pub static FOO: &'static Encoding`
1644/// items.
1645pub static GB18030_INIT: Encoding = Encoding {
1646    name: "gb18030",
1647    variant: VariantEncoding::Gb18030,
1648};
1649
1650/// The gb18030 encoding.
1651///
1652/// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0
1653/// maps to U+3000 for compatibility with existing Web content. As a result,
1654/// this encoding can represent all of Unicode except for the private-use
1655/// character U+E5E5.
1656///
1657/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
1658/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
1659///
1660/// This encoding matches the Windows code page 54936.
1661///
1662/// This will change from `static` to `const` if Rust changes
1663/// to make the referent of `pub const FOO: &'static Encoding`
1664/// unique cross-crate, so don't take the address of this
1665/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`GB18030_INIT`](static.GB18030_INIT.html) instead.
1666pub static GB18030: &'static Encoding = &GB18030_INIT;
1667
1668/// The initializer for the [macintosh](static.MACINTOSH.html) encoding.
1669///
1670/// Use this only to take the address of the `Encoding` itself when
1671/// Rust prohibits the use of the non-`_INIT` form directly,
1672/// such as in initializers of other `static`s. If in doubt,
1673/// use the reference-typed `static` `MACINTOSH` instead.
1674///
1675/// This part of the public API will go away if Rust changes
1676/// to make the referent of `pub const FOO: &'static Encoding`
1677/// unique cross-crate or if Rust starts allowing static arrays
1678/// to be initialized with `pub static FOO: &'static Encoding`
1679/// items.
1680pub static MACINTOSH_INIT: Encoding = Encoding {
1681    name: "macintosh",
    // NOTE(review): the trailing arguments look like (first code point, table
    // index, length) of a contiguous run; confirm in `VariantEncoding::SingleByte`.
1682    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.macintosh, 0x00CD, 106, 3),
1683};
1684
1685/// The macintosh encoding.
1686///
1687/// This is the MacRoman encoding from Mac OS Classic.
1688///
1689/// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
1690/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
1691///
1692/// This encoding matches the Windows code page 10000, except Windows decodes
1693/// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
1694///
1695/// This will change from `static` to `const` if Rust changes
1696/// to make the referent of `pub const FOO: &'static Encoding`
1697/// unique cross-crate, so don't take the address of this
1698/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`MACINTOSH_INIT`](static.MACINTOSH_INIT.html) instead.
1699pub static MACINTOSH: &'static Encoding = &MACINTOSH_INIT;
1700
1701/// The initializer for the [replacement](static.REPLACEMENT.html) encoding.
1702///
1703/// Use this only to take the address of the `Encoding` itself when
1704/// Rust prohibits the use of the non-`_INIT` form directly,
1705/// such as in initializers of other `static`s. If in doubt,
1706/// use the reference-typed `static` `REPLACEMENT` instead.
1707///
1708/// This part of the public API will go away if Rust changes
1709/// to make the referent of `pub const FOO: &'static Encoding`
1710/// unique cross-crate or if Rust starts allowing static arrays
1711/// to be initialized with `pub static FOO: &'static Encoding`
1712/// items.
1713pub static REPLACEMENT_INIT: Encoding = Encoding {
1714    name: "replacement",
1715    variant: VariantEncoding::Replacement,
1716};
1717
1718/// The replacement encoding.
1719///
1720/// This decode-only encoding decodes all non-zero-length streams to a single
1721/// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
1722/// ASCII-compatible fallback encoding (typically windows-1252) for some
1723/// encodings that are no longer supported by the Web Platform and that
1724/// would be dangerous to treat as ASCII-compatible.
1725///
1726/// There is no corresponding encoder. The output encoding of this encoding
1727/// is UTF-8.
1728///
1729/// This encoding does not have a Windows code page number.
1730///
1731/// This will change from `static` to `const` if Rust changes
1732/// to make the referent of `pub const FOO: &'static Encoding`
1733/// unique cross-crate, so don't take the address of this
1734/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`REPLACEMENT_INIT`](static.REPLACEMENT_INIT.html) instead.
1735pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT;
1736
1737/// The initializer for the [windows-1250](static.WINDOWS_1250.html) encoding.
1738///
1739/// Use this only to take the address of the `Encoding` itself when
1740/// Rust prohibits the use of the non-`_INIT` form directly,
1741/// such as in initializers of other `static`s. If in doubt,
1742/// use the reference-typed `static` `WINDOWS_1250` instead.
1743///
1744/// This part of the public API will go away if Rust changes
1745/// to make the referent of `pub const FOO: &'static Encoding`
1746/// unique cross-crate or if Rust starts allowing static arrays
1747/// to be initialized with `pub static FOO: &'static Encoding`
1748/// items.
1749pub static WINDOWS_1250_INIT: Encoding = Encoding {
1750    name: "windows-1250",
    // NOTE(review): the trailing arguments look like (first code point, table
    // index, length) of a contiguous run; confirm in `VariantEncoding::SingleByte`.
1751    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1250, 0x00DC, 92, 2),
1752};
1753
1754/// The windows-1250 encoding.
1755///
1756/// This is the Central European encoding for Windows.
1757///
1758/// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
1759/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
1760///
1761/// This encoding matches the Windows code page 1250.
1762///
1763/// This will change from `static` to `const` if Rust changes
1764/// to make the referent of `pub const FOO: &'static Encoding`
1765/// unique cross-crate, so don't take the address of this
1766/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`WINDOWS_1250_INIT`](static.WINDOWS_1250_INIT.html) instead.
1767pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT;
1768
1769/// The initializer for the [windows-1251](static.WINDOWS_1251.html) encoding.
1770///
1771/// Use this only to take the address of the `Encoding` itself when
1772/// Rust prohibits the use of the non-`_INIT` form directly,
1773/// such as in initializers of other `static`s. If in doubt,
1774/// use the reference-typed `static` `WINDOWS_1251` instead.
1775///
1776/// This part of the public API will go away if Rust changes
1777/// to make the referent of `pub const FOO: &'static Encoding`
1778/// unique cross-crate or if Rust starts allowing static arrays
1779/// to be initialized with `pub static FOO: &'static Encoding`
1780/// items.
1781pub static WINDOWS_1251_INIT: Encoding = Encoding {
1782    name: "windows-1251",
    // NOTE(review): the trailing arguments look like (first code point, table
    // index, length) of a contiguous run; confirm in `VariantEncoding::SingleByte`.
1783    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1251, 0x0410, 64, 64),
1784};
1785
1786/// The windows-1251 encoding.
1787///
1788/// This is the Cyrillic encoding for Windows.
1789///
1790/// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
1791/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
1792///
1793/// This encoding matches the Windows code page 1251.
1794///
1795/// This will change from `static` to `const` if Rust changes
1796/// to make the referent of `pub const FOO: &'static Encoding`
1797/// unique cross-crate, so don't take the address of this
1798/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`WINDOWS_1251_INIT`](static.WINDOWS_1251_INIT.html) instead.
1799pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT;
1800
1801/// The initializer for the [windows-1252](static.WINDOWS_1252.html) encoding.
1802///
1803/// Use this only to take the address of the `Encoding` itself when
1804/// Rust prohibits the use of the non-`_INIT` form directly,
1805/// such as in initializers of other `static`s. If in doubt,
1806/// use the reference-typed `static` `WINDOWS_1252` instead.
1807///
1808/// This part of the public API will go away if Rust changes
1809/// to make the referent of `pub const FOO: &'static Encoding`
1810/// unique cross-crate or if Rust starts allowing static arrays
1811/// to be initialized with `pub static FOO: &'static Encoding`
1812/// items.
1813pub static WINDOWS_1252_INIT: Encoding = Encoding {
1814    name: "windows-1252",
    // NOTE(review): the trailing arguments look like (first code point, table
    // index, length) of a contiguous run; confirm in `VariantEncoding::SingleByte`.
1815    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1252, 0x00A0, 32, 96),
1816};
1817
1818/// The windows-1252 encoding.
1819///
1820/// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
1821/// which is known as Latin 1.
1822///
1823/// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
1824/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
1825///
1826/// This encoding matches the Windows code page 1252.
1827///
1828/// This will change from `static` to `const` if Rust changes
1829/// to make the referent of `pub const FOO: &'static Encoding`
1830/// unique cross-crate, so don't take the address of this
1831/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`WINDOWS_1252_INIT`](static.WINDOWS_1252_INIT.html) instead.
1832pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT;
1833
1834/// The initializer for the [windows-1253](static.WINDOWS_1253.html) encoding.
1835///
1836/// Use this only to take the address of the `Encoding` itself when
1837/// Rust prohibits the use of the non-`_INIT` form directly,
1838/// such as in initializers of other `static`s. If in doubt,
1839/// use the reference-typed `static` `WINDOWS_1253` instead.
1840///
1841/// This part of the public API will go away if Rust changes
1842/// to make the referent of `pub const FOO: &'static Encoding`
1843/// unique cross-crate or if Rust starts allowing static arrays
1844/// to be initialized with `pub static FOO: &'static Encoding`
1845/// items.
1846pub static WINDOWS_1253_INIT: Encoding = Encoding {
1847    name: "windows-1253",
    // NOTE(review): the trailing arguments look like (first code point, table
    // index, length) of a contiguous run; confirm in `VariantEncoding::SingleByte`.
1848    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1253, 0x03A3, 83, 44),
1849};
1850
1851/// The windows-1253 encoding.
1852///
1853/// This is the Greek encoding for Windows. It is mostly an extension of
1854/// ISO-8859-7, but U+0386 is mapped to a different byte.
1855///
1856/// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
1857/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
1858///
1859/// This encoding matches the Windows code page 1253, except Windows decodes
1860/// unassigned code points to the Private Use Area of Unicode.
1861///
1862/// This will change from `static` to `const` if Rust changes
1863/// to make the referent of `pub const FOO: &'static Encoding`
1864/// unique cross-crate, so don't take the address of this
1865/// `static`. Where Rust prohibits using this `static` directly, such as
/// in initializers of other `static`s, take the address of
/// [`WINDOWS_1253_INIT`](static.WINDOWS_1253_INIT.html) instead.
1866pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT;
1867
/// The initializer for the [windows-1254](static.WINDOWS_1254.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1254_INIT: Encoding = Encoding {
    name: "windows-1254",
    // NOTE(review): the three arguments after the table reference look like
    // the parameters of a contiguous run within the table (first code point,
    // index offset, length) — confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1254, 0x00DF, 95, 17),
};
1884
1885/// The windows-1254 encoding.
1886///
1887/// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
1888/// which is known as Latin 5.
1889///
1890/// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
1891/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
1892///
1893/// This encoding matches the Windows code page 1254.
1894///
1895/// This will change from `static` to `const` if Rust changes
1896/// to make the referent of `pub const FOO: &'static Encoding`
1897/// unique cross-crate, so don't take the address of this
1898/// `static`.
1899pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT;
1900
/// The initializer for the [windows-1255](static.WINDOWS_1255.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1255_INIT: Encoding = Encoding {
    name: "windows-1255",
    // NOTE(review): the three arguments after the table reference look like
    // the parameters of a contiguous run within the table (first code point,
    // index offset, length) — confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1255, 0x05D0, 96, 27),
};
1917
1918/// The windows-1255 encoding.
1919///
1920/// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
1921/// except for a currency sign swap.
1922///
1923/// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
1924/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
1925///
1926/// This encoding matches the Windows code page 1255, except Windows decodes
1927/// unassigned code points to the Private Use Area of Unicode.
1928///
1929/// This will change from `static` to `const` if Rust changes
1930/// to make the referent of `pub const FOO: &'static Encoding`
1931/// unique cross-crate, so don't take the address of this
1932/// `static`.
1933pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT;
1934
/// The initializer for the [windows-1256](static.WINDOWS_1256.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1256_INIT: Encoding = Encoding {
    name: "windows-1256",
    // NOTE(review): the three arguments after the table reference look like
    // the parameters of a contiguous run within the table (first code point,
    // index offset, length) — confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1256, 0x0621, 65, 22),
};
1951
1952/// The windows-1256 encoding.
1953///
1954/// This is the Arabic encoding for Windows.
1955///
1956/// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
1957/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
1958///
1959/// This encoding matches the Windows code page 1256.
1960///
1961/// This will change from `static` to `const` if Rust changes
1962/// to make the referent of `pub const FOO: &'static Encoding`
1963/// unique cross-crate, so don't take the address of this
1964/// `static`.
1965pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT;
1966
/// The initializer for the [windows-1257](static.WINDOWS_1257.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1257_INIT: Encoding = Encoding {
    name: "windows-1257",
    // NOTE(review): the three arguments after the table reference look like
    // the parameters of a contiguous run within the table (first code point,
    // index offset, length) — confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1257, 0x00DF, 95, 1),
};
1983
1984/// The windows-1257 encoding.
1985///
1986/// This is the Baltic encoding for Windows.
1987///
1988/// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
1989/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
1990///
1991/// This encoding matches the Windows code page 1257, except Windows decodes
1992/// unassigned code points to the Private Use Area of Unicode.
1993///
1994/// This will change from `static` to `const` if Rust changes
1995/// to make the referent of `pub const FOO: &'static Encoding`
1996/// unique cross-crate, so don't take the address of this
1997/// `static`.
1998pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT;
1999
/// The initializer for the [windows-1258](static.WINDOWS_1258.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1258_INIT: Encoding = Encoding {
    name: "windows-1258",
    // NOTE(review): the three arguments after the table reference look like
    // the parameters of a contiguous run within the table (first code point,
    // index offset, length) — confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1258, 0x00DF, 95, 4),
};
2016
2017/// The windows-1258 encoding.
2018///
2019/// This is the Vietnamese encoding for Windows.
2020///
2021/// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
2022/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
2023///
2024/// This encoding matches the Windows code page 1258 when used in the
2025/// non-normalizing mode. Unlike with the other single-byte encodings, the
2026/// result of decoding is not necessarily in Normalization Form C. On the
2027/// other hand, input in the Normalization Form C is not encoded without
2028/// replacement. In general, it's a bad idea to encode to encodings other
2029/// than UTF-8, but this encoding is especially hazardous to encode to.
2030///
2031/// This will change from `static` to `const` if Rust changes
2032/// to make the referent of `pub const FOO: &'static Encoding`
2033/// unique cross-crate, so don't take the address of this
2034/// `static`.
2035pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT;
2036
/// The initializer for the [windows-874](static.WINDOWS_874.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_874_INIT: Encoding = Encoding {
    name: "windows-874",
    // NOTE(review): the three arguments after the table reference look like
    // the parameters of a contiguous run within the table (first code point,
    // index offset, length) — confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_874, 0x0E01, 33, 58),
};
2053
2054/// The windows-874 encoding.
2055///
2056/// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
2057///
2058/// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
2059/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
2060///
2061/// This encoding matches the Windows code page 874, except Windows decodes
2062/// unassigned code points to the Private Use Area of Unicode.
2063///
2064/// This will change from `static` to `const` if Rust changes
2065/// to make the referent of `pub const FOO: &'static Encoding`
2066/// unique cross-crate, so don't take the address of this
2067/// `static`.
2068pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT;
2069
/// The initializer for the [x-mac-cyrillic](static.X_MAC_CYRILLIC.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding {
    name: "x-mac-cyrillic",
    // NOTE(review): the three arguments after the table reference look like
    // the parameters of a contiguous run within the table (first code point,
    // index offset, length) — confirm against `VariantEncoding::SingleByte`.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.x_mac_cyrillic, 0x0430, 96, 31),
};
2086
2087/// The x-mac-cyrillic encoding.
2088///
2089/// This is the MacUkrainian encoding from Mac OS Classic.
2090///
2091/// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
2092/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
2093///
2094/// This encoding matches the Windows code page 10017.
2095///
2096/// This will change from `static` to `const` if Rust changes
2097/// to make the referent of `pub const FOO: &'static Encoding`
2098/// unique cross-crate, so don't take the address of this
2099/// `static`.
2100pub static X_MAC_CYRILLIC: &'static Encoding = &X_MAC_CYRILLIC_INIT;
2101
/// The initializer for the [x-user-defined](static.X_USER_DEFINED.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static X_USER_DEFINED_INIT: Encoding = Encoding {
    name: "x-user-defined",
    // Unlike the `SingleByte` initializers, this variant carries no table
    // reference or numeric parameters.
    variant: VariantEncoding::UserDefined,
};
2118
2119/// The x-user-defined encoding.
2120///
2121/// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding
2122/// them to the Private Use Area of Unicode. It was used for loading binary
2123/// data into a JavaScript string using `XMLHttpRequest` before XHR supported
2124/// the `"arraybuffer"` response type.
2125///
2126/// This encoding does not have a Windows code page number.
2127///
2128/// This will change from `static` to `const` if Rust changes
2129/// to make the referent of `pub const FOO: &'static Encoding`
2130/// unique cross-crate, so don't take the address of this
2131/// `static`.
2132pub static X_USER_DEFINED: &'static Encoding = &X_USER_DEFINED_INIT;
2133
/// The encoding labels searched by `Encoding::for_label()`.
///
/// The entries are kept in the order that the binary search in
/// `Encoding::for_label()` compares by: shorter labels first, and labels of
/// equal length ordered by comparing their bytes from the last byte to the
/// first. The element at index `i` is the label for
/// `ENCODINGS_IN_LABEL_SORT[i]`, so the two arrays must be changed together.
static LABELS_SORTED: [&'static str; 219] = [
    "l1",
    "l2",
    "l3",
    "l4",
    "l5",
    "l6",
    "l9",
    "866",
    "mac",
    "koi",
    "gbk",
    "big5",
    "utf8",
    "koi8",
    "sjis",
    "ms932",
    "cp866",
    "utf-8",
    "cp819",
    "ascii",
    "x-gbk",
    "greek",
    "cp1250",
    "cp1251",
    "latin1",
    "gb2312",
    "cp1252",
    "latin2",
    "cp1253",
    "latin3",
    "cp1254",
    "latin4",
    "cp1255",
    "csbig5",
    "latin5",
    "utf-16",
    "cp1256",
    "ibm866",
    "latin6",
    "cp1257",
    "cp1258",
    "greek8",
    "ibm819",
    "arabic",
    "visual",
    "korean",
    "euc-jp",
    "koi8-r",
    "koi8_r",
    "euc-kr",
    "x-sjis",
    "koi8-u",
    "hebrew",
    "tis-620",
    "gb18030",
    "ksc5601",
    "gb_2312",
    "dos-874",
    "cn-big5",
    "chinese",
    "logical",
    "cskoi8r",
    "cseuckr",
    "koi8-ru",
    "x-cp1250",
    "ksc_5601",
    "x-cp1251",
    "iso88591",
    "csgb2312",
    "x-cp1252",
    "iso88592",
    "x-cp1253",
    "iso88593",
    "ecma-114",
    "x-cp1254",
    "iso88594",
    "x-cp1255",
    "iso88595",
    "x-x-big5",
    "x-cp1256",
    "csibm866",
    "iso88596",
    "x-cp1257",
    "iso88597",
    "asmo-708",
    "ecma-118",
    "elot_928",
    "x-cp1258",
    "iso88598",
    "iso88599",
    "cyrillic",
    "utf-16be",
    "utf-16le",
    "us-ascii",
    "ms_kanji",
    "x-euc-jp",
    "iso885910",
    "iso8859-1",
    "iso885911",
    "iso8859-2",
    "iso8859-3",
    "iso885913",
    "iso8859-4",
    "iso885914",
    "iso8859-5",
    "iso885915",
    "iso8859-6",
    "iso8859-7",
    "iso8859-8",
    "iso-ir-58",
    "iso8859-9",
    "macintosh",
    "shift-jis",
    "shift_jis",
    "iso-ir-100",
    "iso8859-10",
    "iso-ir-110",
    "gb_2312-80",
    "iso-8859-1",
    "iso_8859-1",
    "iso-ir-101",
    "iso8859-11",
    "iso-8859-2",
    "iso_8859-2",
    "hz-gb-2312",
    "iso-8859-3",
    "iso_8859-3",
    "iso8859-13",
    "iso-8859-4",
    "iso_8859-4",
    "iso8859-14",
    "iso-ir-144",
    "iso-8859-5",
    "iso_8859-5",
    "iso8859-15",
    "iso-8859-6",
    "iso_8859-6",
    "iso-ir-126",
    "iso-8859-7",
    "iso_8859-7",
    "iso-ir-127",
    "iso-ir-157",
    "iso-8859-8",
    "iso_8859-8",
    "iso-ir-138",
    "iso-ir-148",
    "iso-8859-9",
    "iso_8859-9",
    "iso-ir-109",
    "iso-ir-149",
    "big5-hkscs",
    "csshiftjis",
    "iso-8859-10",
    "iso-8859-11",
    "csisolatin1",
    "csisolatin2",
    "iso-8859-13",
    "csisolatin3",
    "iso-8859-14",
    "windows-874",
    "csisolatin4",
    "iso-8859-15",
    "iso_8859-15",
    "csisolatin5",
    "iso-8859-16",
    "csisolatin6",
    "windows-949",
    "csisolatin9",
    "csiso88596e",
    "csiso88598e",
    "csmacintosh",
    "csiso88596i",
    "csiso88598i",
    "windows-31j",
    "x-mac-roman",
    "iso-2022-cn",
    "iso-2022-jp",
    "csiso2022jp",
    "iso-2022-kr",
    "csiso2022kr",
    "replacement",
    "windows-1250",
    "windows-1251",
    "windows-1252",
    "windows-1253",
    "windows-1254",
    "windows-1255",
    "windows-1256",
    "windows-1257",
    "windows-1258",
    "iso-8859-6-e",
    "iso-8859-8-e",
    "iso-8859-6-i",
    "iso-8859-8-i",
    "sun_eu_greek",
    "csksc56011987",
    "ks_c_5601-1987",
    "ansi_x3.4-1968",
    "ks_c_5601-1989",
    "x-mac-cyrillic",
    "x-user-defined",
    "csiso58gb231280",
    "iso_8859-1:1987",
    "iso_8859-2:1987",
    "iso_8859-6:1987",
    "iso_8859-7:1987",
    "iso_8859-3:1988",
    "iso_8859-4:1988",
    "iso_8859-5:1988",
    "iso_8859-8:1988",
    "iso_8859-9:1989",
    "csisolatingreek",
    "x-mac-ukrainian",
    "iso-2022-cn-ext",
    "csisolatinarabic",
    "csisolatinhebrew",
    "unicode-1-1-utf-8",
    "csisolatincyrillic",
    "cseucpkdfmtjapanese",
];
2355
/// The encoding for each label in `LABELS_SORTED`, stored at the same index.
///
/// `Encoding::for_label()` indexes this array with the position at which its
/// binary search found the label in `LABELS_SORTED`, so the order of the
/// entries here must stay in step with that array. The `_INIT` forms are
/// referenced because the reference-typed `static`s cannot be used to
/// initialize array items.
static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; 219] = [
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_15_INIT,
    &IBM866_INIT,
    &MACINTOSH_INIT,
    &KOI8_R_INIT,
    &GBK_INIT,
    &BIG5_INIT,
    &UTF_8_INIT,
    &KOI8_R_INIT,
    &SHIFT_JIS_INIT,
    &SHIFT_JIS_INIT,
    &IBM866_INIT,
    &UTF_8_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &BIG5_INIT,
    &WINDOWS_1254_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &EUC_KR_INIT,
    &EUC_JP_INIT,
    &KOI8_R_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &SHIFT_JIS_INIT,
    &KOI8_U_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_874_INIT,
    &GB18030_INIT,
    &EUC_KR_INIT,
    &GBK_INIT,
    &WINDOWS_874_INIT,
    &BIG5_INIT,
    &GBK_INIT,
    &ISO_8859_8_I_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &KOI8_U_INIT,
    &WINDOWS_1250_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &ISO_8859_5_INIT,
    &BIG5_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1257_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_5_INIT,
    &UTF_16BE_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1252_INIT,
    &SHIFT_JIS_INIT,
    &EUC_JP_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_8_INIT,
    &GBK_INIT,
    &WINDOWS_1254_INIT,
    &MACINTOSH_INIT,
    &SHIFT_JIS_INIT,
    &SHIFT_JIS_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_4_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_2_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_3_INIT,
    &EUC_KR_INIT,
    &BIG5_INIT,
    &SHIFT_JIS_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_874_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_14_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_15_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_16_INIT,
    &ISO_8859_10_INIT,
    &EUC_KR_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &MACINTOSH_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &SHIFT_JIS_INIT,
    &MACINTOSH_INIT,
    &REPLACEMENT_INIT,
    &ISO_2022_JP_INIT,
    &ISO_2022_JP_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1253_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1255_INIT,
    &WINDOWS_1256_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &ISO_8859_7_INIT,
    &EUC_KR_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1252_INIT,
    &EUC_KR_INIT,
    &X_MAC_CYRILLIC_INIT,
    &X_USER_DEFINED_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_7_INIT,
    &X_MAC_CYRILLIC_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &UTF_8_INIT,
    &ISO_8859_5_INIT,
    &EUC_JP_INIT,
];
2577
2578// END GENERATED CODE
2579
/// An encoding as defined in the [Encoding Standard][1].
///
/// An _encoding_ defines a mapping from a `u8` sequence to a `char` sequence
/// and, in most cases, vice versa. Each encoding has a name, an output
/// encoding, and one or more labels.
///
/// _Labels_ are ASCII-case-insensitive strings that are used to identify an
/// encoding in formats and protocols. The _name_ of the encoding is the
/// preferred label in the case appropriate for returning from the
/// [`characterSet`][2] property of the `Document` DOM interface.
///
/// The _output encoding_ is the encoding used for form submission and URL
/// parsing on Web pages in the encoding. This is UTF-8 for the replacement,
/// UTF-16LE and UTF-16BE encodings and the encoding itself for other
/// encodings.
///
/// [1]: https://encoding.spec.whatwg.org/
/// [2]: https://dom.spec.whatwg.org/#dom-document-characterset
///
/// # Streaming vs. Non-Streaming
///
/// When you have the entire input in a single buffer, you can use the
/// methods [`decode()`][3], [`decode_with_bom_removal()`][4],
/// [`decode_without_bom_handling()`][5],
/// [`decode_without_bom_handling_and_without_replacement()`][6] and
/// [`encode()`][7]. (These methods are available to Rust callers only and are
/// not available in the C API.) Unlike the rest of the API available to Rust,
/// these methods perform heap allocations. You should use the `Decoder` and
/// `Encoder` objects when your input is split into multiple buffers or when
/// you want to control the allocation of the output buffers.
///
/// [3]: #method.decode
/// [4]: #method.decode_with_bom_removal
/// [5]: #method.decode_without_bom_handling
/// [6]: #method.decode_without_bom_handling_and_without_replacement
/// [7]: #method.encode
///
/// # Instances
///
/// All instances of `Encoding` are statically allocated and have the `'static`
/// lifetime. There is precisely one unique `Encoding` instance for each
/// encoding defined in the Encoding Standard.
///
/// To obtain a reference to a particular encoding whose identity you know at
/// compile time, use a `static` that refers to the encoding. There is a
/// `static` for each encoding. The `static`s are named in all caps with hyphens
/// replaced with underscores (and in C/C++ have `_ENCODING` appended to the
/// name). For example, if you know at compile time that you will want to
/// decode using the UTF-8 encoding, use the `UTF_8` `static` (`UTF_8_ENCODING`
/// in C/C++).
///
/// Additionally, there are non-reference-typed forms ending with `_INIT` to
/// work around the problem that `static`s of the type `&'static Encoding`
/// cannot be used to initialize items of an array whose type is
/// `[&'static Encoding; N]`.
///
/// If you don't know what encoding you need at compile time and need to
/// dynamically get an encoding by label, use
/// <code>Encoding::<a href="#method.for_label">for_label</a>(<var>label</var>)</code>.
///
/// Instances of `Encoding` can be compared with `==` (in both Rust and in
/// C/C++).
pub struct Encoding {
    // The name of the encoding, e.g. `"windows-1252"` as set by the static
    // `_INIT` initializers.
    name: &'static str,
    // The encoding-specific payload, e.g. `VariantEncoding::SingleByte(..)`
    // or `VariantEncoding::UserDefined` as set by the static `_INIT`
    // initializers.
    variant: VariantEncoding,
}
2646
2647impl Encoding {
2648    /// Implements the
2649    /// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
2650    /// algorithm.
2651    ///
2652    /// If, after ASCII-lowercasing and removing leading and trailing
2653    /// whitespace, the argument matches a label defined in the Encoding
2654    /// Standard, `Some(&'static Encoding)` representing the corresponding
2655    /// encoding is returned. If there is no match, `None` is returned.
2656    ///
2657    /// This is the right method to use if the action upon the method returning
2658    /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) instead.
2659    /// When the action upon the method returning `None` is not to proceed with
2660    /// a fallback but to refuse processing, `for_label_no_replacement()` is more
2661    /// appropriate.
2662    ///
2663    /// The argument is of type `&[u8]` instead of `&str` to save callers
2664    /// that are extracting the label from a non-UTF-8 protocol the trouble
2665    /// of conversion to UTF-8. (If you have a `&str`, just call `.as_bytes()`
2666    /// on it.)
2667    ///
2668    /// Available via the C wrapper.
2669    pub fn for_label(label: &[u8]) -> Option<&'static Encoding> {
2670        let mut trimmed = [0u8; LONGEST_LABEL_LENGTH];
2671        let mut trimmed_pos = 0usize;
2672        let mut iter = label.into_iter();
2673        // before
2674        loop {
2675            match iter.next() {
2676                None => {
2677                    return None;
2678                }
2679                Some(byte) => {
2680                    // The characters used in labels are:
2681                    // a-z (except q, but excluding it below seems excessive)
2682                    // 0-9
2683                    // . _ - :
2684                    match *byte {
2685                        0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2686                            continue;
2687                        }
2688                        b'A'..=b'Z' => {
2689                            trimmed[trimmed_pos] = *byte + 0x20u8;
2690                            trimmed_pos = 1usize;
2691                            break;
2692                        }
2693                        b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2694                            trimmed[trimmed_pos] = *byte;
2695                            trimmed_pos = 1usize;
2696                            break;
2697                        }
2698                        _ => {
2699                            return None;
2700                        }
2701                    }
2702                }
2703            }
2704        }
2705        // inside
2706        loop {
2707            match iter.next() {
2708                None => {
2709                    break;
2710                }
2711                Some(byte) => {
2712                    match *byte {
2713                        0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2714                            break;
2715                        }
2716                        b'A'..=b'Z' => {
2717                            if trimmed_pos == LONGEST_LABEL_LENGTH {
2718                                // There's no encoding with a label this long
2719                                return None;
2720                            }
2721                            trimmed[trimmed_pos] = *byte + 0x20u8;
2722                            trimmed_pos += 1usize;
2723                            continue;
2724                        }
2725                        b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2726                            if trimmed_pos == LONGEST_LABEL_LENGTH {
2727                                // There's no encoding with a label this long
2728                                return None;
2729                            }
2730                            trimmed[trimmed_pos] = *byte;
2731                            trimmed_pos += 1usize;
2732                            continue;
2733                        }
2734                        _ => {
2735                            return None;
2736                        }
2737                    }
2738                }
2739            }
2740        }
2741        // after
2742        loop {
2743            match iter.next() {
2744                None => {
2745                    break;
2746                }
2747                Some(byte) => {
2748                    match *byte {
2749                        0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2750                            continue;
2751                        }
2752                        _ => {
2753                            // There's no label with space in the middle
2754                            return None;
2755                        }
2756                    }
2757                }
2758            }
2759        }
2760        let candidate = &trimmed[..trimmed_pos];
2761        match LABELS_SORTED.binary_search_by(|probe| {
2762            let bytes = probe.as_bytes();
2763            let c = bytes.len().cmp(&candidate.len());
2764            if c != Ordering::Equal {
2765                return c;
2766            }
2767            let probe_iter = bytes.iter().rev();
2768            let candidate_iter = candidate.iter().rev();
2769            probe_iter.cmp(candidate_iter)
2770        }) {
2771            Ok(i) => Some(ENCODINGS_IN_LABEL_SORT[i]),
2772            Err(_) => None,
2773        }
2774    }
2775
2776    /// This method behaves the same as `for_label()`, except when `for_label()`
2777    /// would return `Some(REPLACEMENT)`, this method returns `None` instead.
2778    ///
2779    /// This method is useful in scenarios where a fatal error is required
2780    /// upon invalid label, because in those cases the caller typically wishes
2781    /// to treat the labels that map to the replacement encoding as fatal
2782    /// errors, too.
2783    ///
2784    /// It is not OK to use this method when the action upon the method returning
2785    /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`). In such a
2786    /// case, the `for_label()` method should be used instead in order to avoid
2787    /// unsafe fallback for labels that `for_label()` maps to `Some(REPLACEMENT)`.
2788    ///
2789    /// Available via the C wrapper.
2790    #[inline]
2791    pub fn for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding> {
2792        match Encoding::for_label(label) {
2793            None => None,
2794            Some(encoding) => {
2795                if encoding == REPLACEMENT {
2796                    None
2797                } else {
2798                    Some(encoding)
2799                }
2800            }
2801        }
2802    }
2803
2804    /// Performs non-incremental BOM sniffing.
2805    ///
2806    /// The argument must either be a buffer representing the entire input
2807    /// stream (non-streaming case) or a buffer representing at least the first
2808    /// three bytes of the input stream (streaming case).
2809    ///
2810    /// Returns `Some((UTF_8, 3))`, `Some((UTF_16LE, 2))` or
2811    /// `Some((UTF_16BE, 2))` if the argument starts with the UTF-8, UTF-16LE
2812    /// or UTF-16BE BOM or `None` otherwise.
2813    ///
2814    /// Available via the C wrapper.
2815    #[inline]
2816    pub fn for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)> {
2817        if buffer.starts_with(b"\xEF\xBB\xBF") {
2818            Some((UTF_8, 3))
2819        } else if buffer.starts_with(b"\xFF\xFE") {
2820            Some((UTF_16LE, 2))
2821        } else if buffer.starts_with(b"\xFE\xFF") {
2822            Some((UTF_16BE, 2))
2823        } else {
2824            None
2825        }
2826    }
2827
2828    /// Returns the name of this encoding.
2829    ///
2830    /// This name is appropriate to return as-is from the DOM
2831    /// `document.characterSet` property.
2832    ///
2833    /// Available via the C wrapper.
2834    #[inline]
2835    pub fn name(&'static self) -> &'static str {
2836        self.name
2837    }
2838
2839    /// Checks whether the _output encoding_ of this encoding can encode every
2840    /// `char`. (Only true if the output encoding is UTF-8.)
2841    ///
2842    /// Available via the C wrapper.
2843    #[inline]
2844    pub fn can_encode_everything(&'static self) -> bool {
2845        self.output_encoding() == UTF_8
2846    }
2847
2848    /// Checks whether the bytes 0x00...0x7F map exclusively to the characters
2849    /// U+0000...U+007F and vice versa.
2850    ///
2851    /// Available via the C wrapper.
2852    #[inline]
2853    pub fn is_ascii_compatible(&'static self) -> bool {
2854        !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE || self == ISO_2022_JP)
2855    }
2856
2857    /// Checks whether this encoding maps one byte to one Basic Multilingual
2858    /// Plane code point (i.e. byte length equals decoded UTF-16 length) and
2859    /// vice versa (for mappable characters).
2860    ///
2861    /// `true` iff this encoding is on the list of [Legacy single-byte
2862    /// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
2863    /// in the spec or x-user-defined.
2864    ///
2865    /// Available via the C wrapper.
2866    #[inline]
2867    pub fn is_single_byte(&'static self) -> bool {
2868        self.variant.is_single_byte()
2869    }
2870
2871    /// Checks whether the bytes 0x00...0x7F map mostly to the characters
2872    /// U+0000...U+007F and vice versa.
2873    #[inline]
2874    fn is_potentially_borrowable(&'static self) -> bool {
2875        !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE)
2876    }
2877
2878    /// Returns the _output encoding_ of this encoding. This is UTF-8 for
2879    /// UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
2880    ///
2881    /// Available via the C wrapper.
2882    #[inline]
2883    pub fn output_encoding(&'static self) -> &'static Encoding {
2884        if self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE {
2885            UTF_8
2886        } else {
2887            self
2888        }
2889    }
2890
2891    /// Decode complete input to `Cow<'a, str>` _with BOM sniffing_ and with
2892    /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2893    /// entire input is available as a single buffer (i.e. the end of the
2894    /// buffer marks the end of the stream).
2895    ///
2896    /// This method implements the (non-streaming version of) the
2897    /// [_decode_](https://encoding.spec.whatwg.org/#decode) spec concept.
2898    ///
2899    /// The second item in the returned tuple is the encoding that was actually
2900    /// used (which may differ from this encoding thanks to BOM sniffing).
2901    ///
2902    /// The third item in the returned tuple indicates whether there were
2903    /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2904    ///
2905    /// _Note:_ It is wrong to use this when the input buffer represents only
2906    /// a segment of the input instead of the whole input. Use `new_decoder()`
2907    /// when decoding segmented input.
2908    ///
2909    /// This method performs a one or two heap allocations for the backing
2910    /// buffer of the `String` when unable to borrow. (One allocation if not
2911    /// errors and potentially another one in the presence of errors.) The
2912    /// first allocation assumes jemalloc and may not be optimal with
2913    /// allocators that do not use power-of-two buckets. A borrow is performed
2914    /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2915    /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2916    /// ISO-2022-JP and the input is entirely in the ASCII state without state
2917    /// transitions.
2918    ///
2919    /// # Panics
2920    ///
2921    /// If the size calculation for a heap-allocated backing buffer overflows
2922    /// `usize`.
2923    ///
2924    /// Available to Rust only.
2925    #[inline]
2926    pub fn decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool) {
2927        let (encoding, without_bom) = match Encoding::for_bom(bytes) {
2928            Some((encoding, bom_length)) => (encoding, &bytes[bom_length..]),
2929            None => (self, bytes),
2930        };
2931        let (cow, had_errors) = encoding.decode_without_bom_handling(without_bom);
2932        (cow, encoding, had_errors)
2933    }
2934
2935    /// Decode complete input to `Cow<'a, str>` _with BOM removal_ and with
2936    /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2937    /// entire input is available as a single buffer (i.e. the end of the
2938    /// buffer marks the end of the stream).
2939    ///
2940    /// When invoked on `UTF_8`, this method implements the (non-streaming
2941    /// version of) the
2942    /// [_UTF-8 decode_](https://encoding.spec.whatwg.org/#utf-8-decode) spec
2943    /// concept.
2944    ///
2945    /// The second item in the returned pair indicates whether there were
2946    /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2947    ///
2948    /// _Note:_ It is wrong to use this when the input buffer represents only
2949    /// a segment of the input instead of the whole input. Use
2950    /// `new_decoder_with_bom_removal()` when decoding segmented input.
2951    ///
2952    /// This method performs a one or two heap allocations for the backing
2953    /// buffer of the `String` when unable to borrow. (One allocation if not
2954    /// errors and potentially another one in the presence of errors.) The
2955    /// first allocation assumes jemalloc and may not be optimal with
2956    /// allocators that do not use power-of-two buckets. A borrow is performed
2957    /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2958    /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2959    /// ISO-2022-JP and the input is entirely in the ASCII state without state
2960    /// transitions.
2961    ///
2962    /// # Panics
2963    ///
2964    /// If the size calculation for a heap-allocated backing buffer overflows
2965    /// `usize`.
2966    ///
2967    /// Available to Rust only.
2968    #[inline]
2969    pub fn decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
2970        let without_bom = if self == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
2971            &bytes[3..]
2972        } else if (self == UTF_16LE && bytes.starts_with(b"\xFF\xFE"))
2973            || (self == UTF_16BE && bytes.starts_with(b"\xFE\xFF"))
2974        {
2975            &bytes[2..]
2976        } else {
2977            bytes
2978        };
2979        self.decode_without_bom_handling(without_bom)
2980    }
2981
2982    /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
2983    /// with malformed sequences replaced with the REPLACEMENT CHARACTER when
2984    /// the entire input is available as a single buffer (i.e. the end of the
2985    /// buffer marks the end of the stream).
2986    ///
2987    /// When invoked on `UTF_8`, this method implements the (non-streaming
2988    /// version of) the
2989    /// [_UTF-8 decode without BOM_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)
2990    /// spec concept.
2991    ///
2992    /// The second item in the returned pair indicates whether there were
2993    /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2994    ///
2995    /// _Note:_ It is wrong to use this when the input buffer represents only
2996    /// a segment of the input instead of the whole input. Use
2997    /// `new_decoder_without_bom_handling()` when decoding segmented input.
2998    ///
2999    /// This method performs a one or two heap allocations for the backing
3000    /// buffer of the `String` when unable to borrow. (One allocation if not
3001    /// errors and potentially another one in the presence of errors.) The
3002    /// first allocation assumes jemalloc and may not be optimal with
3003    /// allocators that do not use power-of-two buckets. A borrow is performed
3004    /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
3005    /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3006    /// ISO-2022-JP and the input is entirely in the ASCII state without state
3007    /// transitions.
3008    ///
3009    /// # Panics
3010    ///
3011    /// If the size calculation for a heap-allocated backing buffer overflows
3012    /// `usize`.
3013    ///
3014    /// Available to Rust only.
3015    pub fn decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
3016        let (mut decoder, mut string, mut total_read) = if self.is_potentially_borrowable() {
3017            let valid_up_to = if self == UTF_8 {
3018                utf8_valid_up_to(bytes)
3019            } else if self == ISO_2022_JP {
3020                iso_2022_jp_ascii_valid_up_to(bytes)
3021            } else {
3022                ascii_valid_up_to(bytes)
3023            };
3024            if valid_up_to == bytes.len() {
3025                let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
3026                return (Cow::Borrowed(str), false);
3027            }
3028            let decoder = self.new_decoder_without_bom_handling();
3029
3030            let rounded_without_replacement = checked_next_power_of_two(checked_add(
3031                valid_up_to,
3032                decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3033            ));
3034            let with_replacement = checked_add(
3035                valid_up_to,
3036                decoder.max_utf8_buffer_length(bytes.len() - valid_up_to),
3037            );
3038            let mut string = String::with_capacity(
3039                checked_min(rounded_without_replacement, with_replacement).unwrap(),
3040            );
3041            unsafe {
3042                let vec = string.as_mut_vec();
3043                vec.set_len(valid_up_to);
3044                std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3045            }
3046            (decoder, string, valid_up_to)
3047        } else {
3048            let decoder = self.new_decoder_without_bom_handling();
3049            let rounded_without_replacement = checked_next_power_of_two(
3050                decoder.max_utf8_buffer_length_without_replacement(bytes.len()),
3051            );
3052            let with_replacement = decoder.max_utf8_buffer_length(bytes.len());
3053            let string = String::with_capacity(
3054                checked_min(rounded_without_replacement, with_replacement).unwrap(),
3055            );
3056            (decoder, string, 0)
3057        };
3058
3059        let mut total_had_errors = false;
3060        loop {
3061            let (result, read, had_errors) =
3062                decoder.decode_to_string(&bytes[total_read..], &mut string, true);
3063            total_read += read;
3064            total_had_errors |= had_errors;
3065            match result {
3066                CoderResult::InputEmpty => {
3067                    debug_assert_eq!(total_read, bytes.len());
3068                    return (Cow::Owned(string), total_had_errors);
3069                }
3070                CoderResult::OutputFull => {
3071                    // Allocate for the worst case. That is, we should come
3072                    // here at most once per invocation of this method.
3073                    let needed = decoder.max_utf8_buffer_length(bytes.len() - total_read);
3074                    string.reserve(needed.unwrap());
3075                }
3076            }
3077        }
3078    }
3079
3080    /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
3081    /// _with malformed sequences treated as fatal_ when the entire input is
3082    /// available as a single buffer (i.e. the end of the buffer marks the end
3083    /// of the stream).
3084    ///
3085    /// When invoked on `UTF_8`, this method implements the (non-streaming
3086    /// version of) the
3087    /// [_UTF-8 decode without BOM or fail_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
3088    /// spec concept.
3089    ///
3090    /// Returns `None` if a malformed sequence was encountered and the result
3091    /// of the decode as `Some(String)` otherwise.
3092    ///
3093    /// _Note:_ It is wrong to use this when the input buffer represents only
3094    /// a segment of the input instead of the whole input. Use
3095    /// `new_decoder_without_bom_handling()` when decoding segmented input.
3096    ///
3097    /// This method performs a single heap allocation for the backing
3098    /// buffer of the `String` when unable to borrow. A borrow is performed if
3099    /// decoding UTF-8 and the input is valid UTF-8, if decoding an
3100    /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3101    /// ISO-2022-JP and the input is entirely in the ASCII state without state
3102    /// transitions.
3103    ///
3104    /// # Panics
3105    ///
3106    /// If the size calculation for a heap-allocated backing buffer overflows
3107    /// `usize`.
3108    ///
3109    /// Available to Rust only.
3110    pub fn decode_without_bom_handling_and_without_replacement<'a>(
3111        &'static self,
3112        bytes: &'a [u8],
3113    ) -> Option<Cow<'a, str>> {
3114        if self == UTF_8 {
3115            let valid_up_to = utf8_valid_up_to(bytes);
3116            if valid_up_to == bytes.len() {
3117                let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
3118                return Some(Cow::Borrowed(str));
3119            }
3120            return None;
3121        }
3122        let (mut decoder, mut string, input) = if self.is_potentially_borrowable() {
3123            let valid_up_to = if self == ISO_2022_JP {
3124                iso_2022_jp_ascii_valid_up_to(bytes)
3125            } else {
3126                ascii_valid_up_to(bytes)
3127            };
3128            if valid_up_to == bytes.len() {
3129                let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
3130                return Some(Cow::Borrowed(str));
3131            }
3132            let decoder = self.new_decoder_without_bom_handling();
3133            let mut string = String::with_capacity(
3134                checked_add(
3135                    valid_up_to,
3136                    decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3137                )
3138                .unwrap(),
3139            );
3140            unsafe {
3141                let vec = string.as_mut_vec();
3142                vec.set_len(valid_up_to);
3143                std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3144            }
3145            (decoder, string, &bytes[valid_up_to..])
3146        } else {
3147            let decoder = self.new_decoder_without_bom_handling();
3148            let string = String::with_capacity(
3149                decoder
3150                    .max_utf8_buffer_length_without_replacement(bytes.len())
3151                    .unwrap(),
3152            );
3153            (decoder, string, bytes)
3154        };
3155        let (result, read) = decoder.decode_to_string_without_replacement(input, &mut string, true);
3156        match result {
3157            DecoderResult::InputEmpty => {
3158                debug_assert_eq!(read, input.len());
3159                Some(Cow::Owned(string))
3160            }
3161            DecoderResult::Malformed(_, _) => None,
3162            DecoderResult::OutputFull => unreachable!(),
3163        }
3164    }
3165
3166    /// Encode complete input to `Cow<'a, [u8]>` with unmappable characters
3167    /// replaced with decimal numeric character references when the entire input
3168    /// is available as a single buffer (i.e. the end of the buffer marks the
3169    /// end of the stream).
3170    ///
3171    /// This method implements the (non-streaming version of) the
3172    /// [_encode_](https://encoding.spec.whatwg.org/#encode) spec concept. For
3173    /// the [_UTF-8 encode_](https://encoding.spec.whatwg.org/#utf-8-encode)
3174    /// spec concept, it is slightly more efficient to use
3175    /// <code><var>string</var>.as_bytes()</code> instead of invoking this
3176    /// method on `UTF_8`.
3177    ///
3178    /// The second item in the returned tuple is the encoding that was actually
3179    /// used (which may differ from this encoding thanks to some encodings
3180    /// having UTF-8 as their output encoding).
3181    ///
3182    /// The third item in the returned tuple indicates whether there were
3183    /// unmappable characters (that were replaced with HTML numeric character
3184    /// references).
3185    ///
3186    /// _Note:_ It is wrong to use this when the input buffer represents only
3187    /// a segment of the input instead of the whole input. Use `new_encoder()`
3188    /// when encoding segmented output.
3189    ///
3190    /// When encoding to UTF-8 or when encoding an ASCII-only input to a
3191    /// ASCII-compatible encoding, this method returns a borrow of the input
3192    /// without a heap allocation. Otherwise, this method performs a single
3193    /// heap allocation for the backing buffer of the `Vec<u8>` if there are no
3194    /// unmappable characters and potentially multiple heap allocations if
3195    /// there are. These allocations are tuned for jemalloc and may not be
3196    /// optimal when using a different allocator that doesn't use power-of-two
3197    /// buckets.
3198    ///
3199    /// # Panics
3200    ///
3201    /// If the size calculation for a heap-allocated backing buffer overflows
3202    /// `usize`.
3203    ///
3204    /// Available to Rust only.
3205    pub fn encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool) {
3206        let output_encoding = self.output_encoding();
3207        if output_encoding == UTF_8 {
3208            return (Cow::Borrowed(string.as_bytes()), output_encoding, false);
3209        }
3210        debug_assert!(output_encoding.is_potentially_borrowable());
3211        let bytes = string.as_bytes();
3212        let valid_up_to = if output_encoding == ISO_2022_JP {
3213            iso_2022_jp_ascii_valid_up_to(bytes)
3214        } else {
3215            ascii_valid_up_to(bytes)
3216        };
3217        if valid_up_to == bytes.len() {
3218            return (Cow::Borrowed(bytes), output_encoding, false);
3219        }
3220        let mut encoder = output_encoding.new_encoder();
3221        let mut vec: Vec<u8> = Vec::with_capacity(
3222            (checked_add(
3223                valid_up_to,
3224                encoder.max_buffer_length_from_utf8_if_no_unmappables(string.len() - valid_up_to),
3225            ))
3226            .unwrap()
3227            .next_power_of_two(),
3228        );
3229        unsafe {
3230            vec.set_len(valid_up_to);
3231            std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3232        }
3233        let mut total_read = valid_up_to;
3234        let mut total_had_errors = false;
3235        loop {
3236            let (result, read, had_errors) =
3237                encoder.encode_from_utf8_to_vec(&string[total_read..], &mut vec, true);
3238            total_read += read;
3239            total_had_errors |= had_errors;
3240            match result {
3241                CoderResult::InputEmpty => {
3242                    debug_assert_eq!(total_read, string.len());
3243                    return (Cow::Owned(vec), output_encoding, total_had_errors);
3244                }
3245                CoderResult::OutputFull => {
3246                    // reserve_exact wants to know how much more on top of current
3247                    // length--not current capacity.
3248                    let needed = encoder
3249                        .max_buffer_length_from_utf8_if_no_unmappables(string.len() - total_read);
3250                    let rounded = (checked_add(vec.capacity(), needed))
3251                        .unwrap()
3252                        .next_power_of_two();
3253                    let additional = rounded - vec.len();
3254                    vec.reserve_exact(additional);
3255                }
3256            }
3257        }
3258    }
3259
3260    fn new_variant_decoder(&'static self) -> VariantDecoder {
3261        self.variant.new_variant_decoder()
3262    }
3263
3264    /// Instantiates a new decoder for this encoding with BOM sniffing enabled.
3265    ///
3266    /// BOM sniffing may cause the returned decoder to morph into a decoder
3267    /// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
3268    ///
3269    /// Available via the C wrapper.
3270    #[inline]
3271    pub fn new_decoder(&'static self) -> Decoder {
3272        Decoder::new(self, self.new_variant_decoder(), BomHandling::Sniff)
3273    }
3274
3275    /// Instantiates a new decoder for this encoding with BOM removal.
3276    ///
3277    /// If the input starts with bytes that are the BOM for this encoding,
3278    /// those bytes are removed. However, the decoder never morphs into a
3279    /// decoder for another encoding: A BOM for another encoding is treated as
3280    /// (potentially malformed) input to the decoding algorithm for this
3281    /// encoding.
3282    ///
3283    /// Available via the C wrapper.
3284    #[inline]
3285    pub fn new_decoder_with_bom_removal(&'static self) -> Decoder {
3286        Decoder::new(self, self.new_variant_decoder(), BomHandling::Remove)
3287    }
3288
3289    /// Instantiates a new decoder for this encoding with BOM handling disabled.
3290    ///
3291    /// If the input starts with bytes that look like a BOM, those bytes are
3292    /// not treated as a BOM. (Hence, the decoder never morphs into a decoder
3293    /// for another encoding.)
3294    ///
3295    /// _Note:_ If the caller has performed BOM sniffing on its own but has not
3296    /// removed the BOM, the caller should use `new_decoder_with_bom_removal()`
3297    /// instead of this method to cause the BOM to be removed.
3298    ///
3299    /// Available via the C wrapper.
3300    #[inline]
3301    pub fn new_decoder_without_bom_handling(&'static self) -> Decoder {
3302        Decoder::new(self, self.new_variant_decoder(), BomHandling::Off)
3303    }
3304
3305    /// Instantiates a new encoder for the output encoding of this encoding.
3306    ///
3307    /// Available via the C wrapper.
3308    #[inline]
3309    pub fn new_encoder(&'static self) -> Encoder {
3310        let enc = self.output_encoding();
3311        enc.variant.new_encoder(enc)
3312    }
3313
3314    /// Validates UTF-8.
3315    ///
3316    /// Returns the index of the first byte that makes the input malformed as
3317    /// UTF-8 or the length of the slice if the slice is entirely valid.
3318    ///
3319    /// This is currently faster than the corresponding standard library
3320    /// functionality. If this implementation gets upstreamed to the standard
3321    /// library, this method may be removed in the future.
3322    ///
3323    /// Available via the C wrapper.
3324    pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
3325        utf8_valid_up_to(bytes)
3326    }
3327
3328    /// Validates ASCII.
3329    ///
3330    /// Returns the index of the first byte that makes the input malformed as
3331    /// ASCII or the length of the slice if the slice is entirely valid.
3332    ///
3333    /// Available via the C wrapper.
3334    pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
3335        ascii_valid_up_to(bytes)
3336    }
3337
3338    /// Validates ISO-2022-JP ASCII-state data.
3339    ///
3340    /// Returns the index of the first byte that makes the input not
3341    /// representable in the ASCII state of ISO-2022-JP or the length of the
3342    /// slice if the slice is entirely representable in the ASCII state of
3343    /// ISO-2022-JP.
3344    ///
3345    /// Available via the C wrapper.
3346    pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
3347        iso_2022_jp_ascii_valid_up_to(bytes)
3348    }
3349}
3350
3351impl PartialEq for Encoding {
3352    #[inline]
3353    fn eq(&self, other: &Encoding) -> bool {
3354        (self as *const Encoding) == (other as *const Encoding)
3355    }
3356}
3357
3358impl Eq for Encoding {}
3359
3360impl Hash for Encoding {
3361    #[inline]
3362    fn hash<H: Hasher>(&self, state: &mut H) {
3363        (self as *const Encoding).hash(state);
3364    }
3365}
3366
3367impl std::fmt::Debug for Encoding {
3368    #[inline]
3369    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
3370        write!(f, "Encoding {{ {} }}", self.name)
3371    }
3372}
3373
/// Serializes an encoding as a string holding the encoding's name.
#[cfg(feature = "serde")]
impl Serialize for Encoding {
    #[inline]
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let name: &'static str = self.name;
        serializer.serialize_str(name)
    }
}
3384
/// Serde visitor that turns a string holding an encoding label into a
/// `&'static Encoding`.
#[cfg(feature = "serde")]
struct EncodingVisitor;

#[cfg(feature = "serde")]
impl<'de> Visitor<'de> for EncodingVisitor {
    type Value = &'static Encoding;

    /// Describes the expected input for serde's error messages.
    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
        formatter.write_str("a valid encoding label")
    }

    /// Resolves `value` via `Encoding::for_label()`; a label that does not
    /// resolve is reported as a custom deserialization error.
    fn visit_str<E>(self, value: &str) -> Result<&'static Encoding, E>
    where
        E: serde::de::Error,
    {
        match Encoding::for_label(value.as_bytes()) {
            Some(enc) => Ok(enc),
            None => Err(E::custom(format!("invalid encoding label: {}", value))),
        }
    }
}
3407
/// Deserializes a `&'static Encoding` from a string by handing an
/// `EncodingVisitor` to the deserializer.
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for &'static Encoding {
    fn deserialize<D>(deserializer: D) -> Result<&'static Encoding, D::Error>
    where
        D: Deserializer<'de>,
    {
        let visitor = EncodingVisitor;
        deserializer.deserialize_str(visitor)
    }
}
3417
/// The life cycle states of a decoder, from BOM sniffing through conversion
/// to the end of the stream.
#[derive(Clone, Copy, Debug, PartialEq)]
enum DecoderLifeCycle {
    /// No input has been seen yet.
    AtStart,
    /// No input has been seen yet, but UTF-8 is expected.
    AtUtf8Start,
    /// No input has been seen yet, but UTF-16BE is expected.
    AtUtf16BeStart,
    /// No input has been seen yet, but UTF-16LE is expected.
    AtUtf16LeStart,
    /// The byte EF has been seen.
    SeenUtf8First,
    /// The bytes EF, BB have been seen.
    SeenUtf8Second,
    /// The byte FE has been seen.
    SeenUtf16BeFirst,
    /// The byte FF has been seen.
    SeenUtf16LeFirst,
    /// EF, BB were seen but BF was not, a buffer boundary followed BB and
    /// the underlying decoder reported EF as an error, so BB must still be
    /// pushed before the next buffer.
    ConvertingWithPendingBB,
    /// BOM detection is over and EOF has not been seen yet.
    Converting,
    /// EOF has been seen.
    Finished,
}
3446
/// Selects how a decoder treats a byte order mark at the start of the input.
#[derive(Clone, Copy, Debug)]
enum BomHandling {
    /// Perform no BOM handling at all.
    Off,
    /// Sniff for a UTF-8, UTF-16BE or UTF-16LE BOM.
    Sniff,
    /// Remove the BOM, but only if it is the BOM for this encoding.
    Remove,
}
3457
/// Outcome of a (potentially partial) decode or encode operation with
/// replacement.
#[derive(Debug, Eq, PartialEq)]
#[must_use]
pub enum CoderResult {
    /// All of the input was consumed.
    ///
    /// When returned from a call where `last` was `true`, the conversion
    /// process has completed. Otherwise, the caller should call a decode or
    /// encode method again with more input.
    InputEmpty,

    /// The output buffer has too little space left for the converter to
    /// produce another unit of output.
    ///
    /// Upon the next call, the caller must provide more output space and
    /// re-push the remaining input to the converter.
    OutputFull,
}
3477
/// Outcome of a (potentially partial) decode operation without replacement.
#[derive(Debug, Eq, PartialEq)]
#[must_use]
pub enum DecoderResult {
    /// All of the input was consumed.
    ///
    /// When returned from a call where `last` was `true`, the decoding
    /// process has completed. Otherwise, the caller should call a decode
    /// method again with more input.
    InputEmpty,

    /// The output buffer has too little space left for the decoder to
    /// produce another unit of output.
    ///
    /// Upon the next call, the caller must provide more output space and
    /// re-push the remaining input to the decoder.
    OutputFull,

    /// A malformed byte sequence was encountered.
    ///
    /// The caller must either treat this as a fatal error or append one
    /// REPLACEMENT CHARACTER (U+FFFD) to the output and then re-push the
    /// remaining input to the decoder.
    ///
    /// The first wrapped integer is the length of the malformed byte
    /// sequence. The second wrapped integer is the number of bytes that were
    /// consumed after the malformed sequence. If the second integer is zero,
    /// the last byte that was consumed is the last byte of the malformed
    /// sequence. Note that the malformed bytes may have been part of an
    /// earlier input buffer.
    ///
    /// The first wrapped integer can have values 1, 2, 3 or 4. The second
    /// wrapped integer can have values 0, 1, 2 or 3. The worst-case sum
    /// of the two is 6, which happens with ISO-2022-JP.
    Malformed(u8, u8), // u8 instead of usize to avoid useless bloat
}
3514
3515/// A converter that decodes a byte stream into Unicode according to a
3516/// character encoding in a streaming (incremental) manner.
3517///
3518/// The various `decode_*` methods take an input buffer (`src`) and an output
3519/// buffer `dst` both of which are caller-allocated. There are variants for
3520/// both UTF-8 and UTF-16 output buffers.
3521///
3522/// A `decode_*` method decodes bytes from `src` into Unicode characters stored
3523/// into `dst` until one of the following three things happens:
3524///
3525/// 1. A malformed byte sequence is encountered (`*_without_replacement`
3526///    variants only).
3527///
3528/// 2. The output buffer has been filled so near capacity that the decoder
3529///    cannot be sure that processing an additional byte of input wouldn't
3530///    cause so much output that the output buffer would overflow.
3531///
3532/// 3. All the input bytes have been processed.
3533///
3534/// The `decode_*` method then returns tuple of a status indicating which one
3535/// of the three reasons to return happened, how many input bytes were read,
3536/// how many output code units (`u8` when decoding into UTF-8 and `u16`
3537/// when decoding to UTF-16) were written (except when decoding into `String`,
3538/// whose length change indicates this), and in the case of the
3539/// variants performing replacement, a boolean indicating whether an error was
3540/// replaced with the REPLACEMENT CHARACTER during the call.
3541///
3542/// The number of bytes "written" is what's logically written. Garbage may be
3543/// written in the output buffer beyond the point logically written to.
3544/// Therefore, if you wish to decode into an `&mut str`, you should use the
3545/// methods that take an `&mut str` argument instead of the ones that take an
3546/// `&mut [u8]` argument. The former take care of overwriting the trailing
3547/// garbage to ensure the UTF-8 validity of the `&mut str` as a whole, but the
3548/// latter don't.
3549///
3550/// In the case of the `*_without_replacement` variants, the status is a
3551/// [`DecoderResult`][1] enumeration (possibilities `Malformed`, `OutputFull` and
3552/// `InputEmpty` corresponding to the three cases listed above).
3553///
3554/// In the case of methods whose name does not end with
3555/// `*_without_replacement`, malformed sequences are automatically replaced
3556/// with the REPLACEMENT CHARACTER and errors do not cause the methods to
3557/// return early.
3558///
3559/// When decoding to UTF-8, the output buffer must have at least 4 bytes of
3560/// space. When decoding to UTF-16, the output buffer must have at least two
3561/// UTF-16 code units (`u16`) of space.
3562///
3563/// When decoding to UTF-8 without replacement, the methods are guaranteed
3564/// not to return indicating that more output space is needed if the length
3565/// of the output buffer is at least the length returned by
3566/// [`max_utf8_buffer_length_without_replacement()`][2]. When decoding to UTF-8
3567/// with replacement, the length of the output buffer that guarantees the
3568/// methods not to return indicating that more output space is needed is given
3569/// by [`max_utf8_buffer_length()`][3]. When decoding to UTF-16 with
3570/// or without replacement, the length of the output buffer that guarantees
3571/// the methods not to return indicating that more output space is needed is
3572/// given by [`max_utf16_buffer_length()`][4].
3573///
3574/// The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
3575/// and the output after each `decode_*` call is guaranteed to consist of
3576/// complete characters. (I.e. the code unit sequence for the last character is
3577/// guaranteed not to be split across output buffers.)
3578///
3579/// The boolean argument `last` indicates that the end of the stream is reached
3580/// when all the bytes in `src` have been consumed.
3581///
3582/// A `Decoder` object can be used to incrementally decode a byte stream.
3583///
3584/// During the processing of a single stream, the caller must call `decode_*`
3585/// zero or more times with `last` set to `false` and then call `decode_*` at
3586/// least once with `last` set to `true`. If `decode_*` returns `InputEmpty`,
3587/// the processing of the stream has ended. Otherwise, the caller must call
3588/// `decode_*` again with `last` set to `true` (or treat a `Malformed` result as
3589///  a fatal error).
3590///
3591/// Once the stream has ended, the `Decoder` object must not be used anymore.
3592/// That is, you need to create another one to process another stream.
3593///
3594/// When the decoder returns `OutputFull` or the decoder returns `Malformed` and
3595/// the caller does not wish to treat it as a fatal error, the input buffer
3596/// `src` may not have been completely consumed. In that case, the caller must
3597/// pass the unconsumed contents of `src` to `decode_*` again upon the next
3598/// call.
3599///
3600/// [1]: enum.DecoderResult.html
3601/// [2]: #method.max_utf8_buffer_length_without_replacement
3602/// [3]: #method.max_utf8_buffer_length
3603/// [4]: #method.max_utf16_buffer_length
3604///
3605/// # Infinite loops
3606///
3607/// When converting with a fixed-size output buffer whose size is too small to
3608/// accommodate one character or (when applicable) one numeric character
3609/// reference of output, an infinite loop ensues. When converting with a
3610/// fixed-size output buffer, it generally makes sense to make the buffer
3611/// fairly large (e.g. couple of kilobytes).
3612pub struct Decoder {
3613    encoding: &'static Encoding,
3614    variant: VariantDecoder,
3615    life_cycle: DecoderLifeCycle,
3616}
3617
3618impl Decoder {
3619    fn new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder {
3620        Decoder {
3621            encoding: enc,
3622            variant: decoder,
3623            life_cycle: match sniffing {
3624                BomHandling::Off => DecoderLifeCycle::Converting,
3625                BomHandling::Sniff => DecoderLifeCycle::AtStart,
3626                BomHandling::Remove => {
3627                    if enc == UTF_8 {
3628                        DecoderLifeCycle::AtUtf8Start
3629                    } else if enc == UTF_16BE {
3630                        DecoderLifeCycle::AtUtf16BeStart
3631                    } else if enc == UTF_16LE {
3632                        DecoderLifeCycle::AtUtf16LeStart
3633                    } else {
3634                        DecoderLifeCycle::Converting
3635                    }
3636                }
3637            },
3638        }
3639    }
3640
3641    /// The `Encoding` this `Decoder` is for.
3642    ///
3643    /// BOM sniffing can change the return value of this method during the life
3644    /// of the decoder.
3645    ///
3646    /// Available via the C wrapper.
3647    #[inline]
3648    pub fn encoding(&self) -> &'static Encoding {
3649        self.encoding
3650    }
3651
3652    /// Query the worst-case UTF-8 output size _with replacement_.
3653    ///
3654    /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3655    /// that will not overflow given the current state of the decoder and
3656    /// `byte_length` number of additional input bytes when decoding with
3657    /// errors handled by outputting a REPLACEMENT CHARACTER for each malformed
3658    /// sequence or `None` if `usize` would overflow.
3659    ///
3660    /// Available via the C wrapper.
3661    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
3662        // Need to consider a) the decoder morphing due to the BOM and b) a partial
3663        // BOM getting pushed to the underlying decoder.
3664        match self.life_cycle {
3665            DecoderLifeCycle::Converting
3666            | DecoderLifeCycle::AtUtf8Start
3667            | DecoderLifeCycle::AtUtf16LeStart
3668            | DecoderLifeCycle::AtUtf16BeStart => {
3669                return self.variant.max_utf8_buffer_length(byte_length);
3670            }
3671            DecoderLifeCycle::AtStart => {
3672                if let Some(utf8_bom) = checked_add(3, byte_length.checked_mul(3)) {
3673                    if let Some(utf16_bom) = checked_add(
3674                        1,
3675                        checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3676                    ) {
3677                        let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
3678                        let encoding = self.encoding();
3679                        if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3680                            // No need to consider the internal state of the underlying decoder,
3681                            // because it is at start, because no data has reached it yet.
3682                            return Some(utf_bom);
3683                        } else if let Some(non_bom) =
3684                            self.variant.max_utf8_buffer_length(byte_length)
3685                        {
3686                            return Some(std::cmp::max(utf_bom, non_bom));
3687                        }
3688                    }
3689                }
3690            }
3691            DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3692                // Add two bytes even when only one byte has been seen,
3693                // because the one byte can become a lead byte in multibyte
3694                // decoders, but only after the decoder has been queried
3695                // for max length, so the decoder's own logic for adding
3696                // one for a pending lead cannot work.
3697                if let Some(sum) = byte_length.checked_add(2) {
3698                    if let Some(utf8_bom) = checked_add(3, sum.checked_mul(3)) {
3699                        if self.encoding() == UTF_8 {
3700                            // No need to consider the internal state of the underlying decoder,
3701                            // because it is at start, because no data has reached it yet.
3702                            return Some(utf8_bom);
3703                        } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3704                            return Some(std::cmp::max(utf8_bom, non_bom));
3705                        }
3706                    }
3707                }
3708            }
3709            DecoderLifeCycle::ConvertingWithPendingBB => {
3710                if let Some(sum) = byte_length.checked_add(2) {
3711                    return self.variant.max_utf8_buffer_length(sum);
3712                }
3713            }
3714            DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3715                // Add two bytes even when only one byte has been seen,
3716                // because the one byte can become a lead byte in multibyte
3717                // decoders, but only after the decoder has been queried
3718                // for max length, so the decoder's own logic for adding
3719                // one for a pending lead cannot work.
3720                if let Some(sum) = byte_length.checked_add(2) {
3721                    if let Some(utf16_bom) =
3722                        checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3723                    {
3724                        let encoding = self.encoding();
3725                        if encoding == UTF_16LE || encoding == UTF_16BE {
3726                            // No need to consider the internal state of the underlying decoder,
3727                            // because it is at start, because no data has reached it yet.
3728                            return Some(utf16_bom);
3729                        } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3730                            return Some(std::cmp::max(utf16_bom, non_bom));
3731                        }
3732                    }
3733                }
3734            }
3735            DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3736        }
3737        None
3738    }
3739
3740    /// Query the worst-case UTF-8 output size _without replacement_.
3741    ///
3742    /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3743    /// that will not overflow given the current state of the decoder and
3744    /// `byte_length` number of additional input bytes when decoding without
3745    /// replacement error handling or `None` if `usize` would overflow.
3746    ///
3747    /// Note that this value may be too small for the `_with_replacement` case.
3748    /// Use `max_utf8_buffer_length()` for that case.
3749    ///
3750    /// Available via the C wrapper.
3751    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
3752        // Need to consider a) the decoder morphing due to the BOM and b) a partial
3753        // BOM getting pushed to the underlying decoder.
3754        match self.life_cycle {
3755            DecoderLifeCycle::Converting
3756            | DecoderLifeCycle::AtUtf8Start
3757            | DecoderLifeCycle::AtUtf16LeStart
3758            | DecoderLifeCycle::AtUtf16BeStart => {
3759                return self
3760                    .variant
3761                    .max_utf8_buffer_length_without_replacement(byte_length);
3762            }
3763            DecoderLifeCycle::AtStart => {
3764                if let Some(utf8_bom) = byte_length.checked_add(3) {
3765                    if let Some(utf16_bom) = checked_add(
3766                        1,
3767                        checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3768                    ) {
3769                        let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
3770                        let encoding = self.encoding();
3771                        if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3772                            // No need to consider the internal state of the underlying decoder,
3773                            // because it is at start, because no data has reached it yet.
3774                            return Some(utf_bom);
3775                        } else if let Some(non_bom) = self
3776                            .variant
3777                            .max_utf8_buffer_length_without_replacement(byte_length)
3778                        {
3779                            return Some(std::cmp::max(utf_bom, non_bom));
3780                        }
3781                    }
3782                }
3783            }
3784            DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3785                // Add two bytes even when only one byte has been seen,
3786                // because the one byte can become a lead byte in multibyte
3787                // decoders, but only after the decoder has been queried
3788                // for max length, so the decoder's own logic for adding
3789                // one for a pending lead cannot work.
3790                if let Some(sum) = byte_length.checked_add(2) {
3791                    if let Some(utf8_bom) = sum.checked_add(3) {
3792                        if self.encoding() == UTF_8 {
3793                            // No need to consider the internal state of the underlying decoder,
3794                            // because it is at start, because no data has reached it yet.
3795                            return Some(utf8_bom);
3796                        } else if let Some(non_bom) =
3797                            self.variant.max_utf8_buffer_length_without_replacement(sum)
3798                        {
3799                            return Some(std::cmp::max(utf8_bom, non_bom));
3800                        }
3801                    }
3802                }
3803            }
3804            DecoderLifeCycle::ConvertingWithPendingBB => {
3805                if let Some(sum) = byte_length.checked_add(2) {
3806                    return self.variant.max_utf8_buffer_length_without_replacement(sum);
3807                }
3808            }
3809            DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3810                // Add two bytes even when only one byte has been seen,
3811                // because the one byte can become a lead byte in multibyte
3812                // decoders, but only after the decoder has been queried
3813                // for max length, so the decoder's own logic for adding
3814                // one for a pending lead cannot work.
3815                if let Some(sum) = byte_length.checked_add(2) {
3816                    if let Some(utf16_bom) =
3817                        checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3818                    {
3819                        let encoding = self.encoding();
3820                        if encoding == UTF_16LE || encoding == UTF_16BE {
3821                            // No need to consider the internal state of the underlying decoder,
3822                            // because it is at start, because no data has reached it yet.
3823                            return Some(utf16_bom);
3824                        } else if let Some(non_bom) =
3825                            self.variant.max_utf8_buffer_length_without_replacement(sum)
3826                        {
3827                            return Some(std::cmp::max(utf16_bom, non_bom));
3828                        }
3829                    }
3830                }
3831            }
3832            DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3833        }
3834        None
3835    }
3836
3837    /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3838    /// replaced with the REPLACEMENT CHARACTER.
3839    ///
3840    /// See the documentation of the struct for documentation for `decode_*`
3841    /// methods collectively.
3842    ///
3843    /// Available via the C wrapper.
3844    pub fn decode_to_utf8(
3845        &mut self,
3846        src: &[u8],
3847        dst: &mut [u8],
3848        last: bool,
3849    ) -> (CoderResult, usize, usize, bool) {
3850        let mut had_errors = false;
3851        let mut total_read = 0usize;
3852        let mut total_written = 0usize;
3853        loop {
3854            let (result, read, written) = self.decode_to_utf8_without_replacement(
3855                &src[total_read..],
3856                &mut dst[total_written..],
3857                last,
3858            );
3859            total_read += read;
3860            total_written += written;
3861            match result {
3862                DecoderResult::InputEmpty => {
3863                    return (
3864                        CoderResult::InputEmpty,
3865                        total_read,
3866                        total_written,
3867                        had_errors,
3868                    );
3869                }
3870                DecoderResult::OutputFull => {
3871                    return (
3872                        CoderResult::OutputFull,
3873                        total_read,
3874                        total_written,
3875                        had_errors,
3876                    );
3877                }
3878                DecoderResult::Malformed(_, _) => {
3879                    had_errors = true;
3880                    // There should always be space for the U+FFFD, because
3881                    // otherwise we'd have gotten OutputFull already.
3882                    // XXX: is the above comment actually true for UTF-8 itself?
3883                    // TODO: Consider having fewer bound checks here.
3884                    dst[total_written] = 0xEFu8;
3885                    total_written += 1;
3886                    dst[total_written] = 0xBFu8;
3887                    total_written += 1;
3888                    dst[total_written] = 0xBDu8;
3889                    total_written += 1;
3890                }
3891            }
3892        }
3893    }
3894
3895    /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3896    /// replaced with the REPLACEMENT CHARACTER with type system signaling
3897    /// of UTF-8 validity.
3898    ///
3899    /// This methods calls `decode_to_utf8` and then zeroes
3900    /// out up to three bytes that aren't logically part of the write in order
3901    /// to retain the UTF-8 validity even for the unwritten part of the buffer.
3902    ///
3903    /// See the documentation of the struct for documentation for `decode_*`
3904    /// methods collectively.
3905    ///
3906    /// Available to Rust only.
3907    pub fn decode_to_str(
3908        &mut self,
3909        src: &[u8],
3910        dst: &mut str,
3911        last: bool,
3912    ) -> (CoderResult, usize, usize, bool) {
3913        let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
3914        let (result, read, written, replaced) = self.decode_to_utf8(src, bytes, last);
3915        let len = bytes.len();
3916        let mut trail = written;
3917        // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
3918        // bytes of trailing garbage. No need to optimize non-ASCII-compatible
3919        // encodings to avoid overwriting here.
3920        if self.encoding != UTF_8 {
3921            let max = std::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
3922            while trail < max {
3923                bytes[trail] = 0;
3924                trail += 1;
3925            }
3926        }
3927        while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
3928            bytes[trail] = 0;
3929            trail += 1;
3930        }
3931        (result, read, written, replaced)
3932    }
3933
3934    /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3935    /// replaced with the REPLACEMENT CHARACTER using a `String` receiver.
3936    ///
3937    /// Like the others, this method follows the logic that the output buffer is
3938    /// caller-allocated. This method treats the capacity of the `String` as
3939    /// the output limit. That is, this method guarantees not to cause a
3940    /// reallocation of the backing buffer of `String`.
3941    ///
3942    /// The return value is a tuple that contains the `DecoderResult`, the
3943    /// number of bytes read and a boolean indicating whether replacements
3944    /// were done. The number of bytes written is signaled via the length of
3945    /// the `String` changing.
3946    ///
3947    /// See the documentation of the struct for documentation for `decode_*`
3948    /// methods collectively.
3949    ///
3950    /// Available to Rust only.
3951    pub fn decode_to_string(
3952        &mut self,
3953        src: &[u8],
3954        dst: &mut String,
3955        last: bool,
3956    ) -> (CoderResult, usize, bool) {
3957        unsafe {
3958            let vec = dst.as_mut_vec();
3959            let old_len = vec.len();
3960            let capacity = vec.capacity();
3961            vec.set_len(capacity);
3962            let (result, read, written, replaced) =
3963                self.decode_to_utf8(src, &mut vec[old_len..], last);
3964            vec.set_len(old_len + written);
3965            (result, read, replaced)
3966        }
3967    }
3968
3969    public_decode_function!(/// Incrementally decode a byte stream into UTF-8
3970                            /// _without replacement_.
3971                            ///
3972                            /// See the documentation of the struct for
3973                            /// documentation for `decode_*` methods
3974                            /// collectively.
3975                            ///
3976                            /// Available via the C wrapper.
3977                            ,
3978                            decode_to_utf8_without_replacement,
3979                            decode_to_utf8_raw,
3980                            decode_to_utf8_checking_end,
3981                            decode_to_utf8_after_one_potential_bom_byte,
3982                            decode_to_utf8_after_two_potential_bom_bytes,
3983                            decode_to_utf8_checking_end_with_offset,
3984                            u8);
3985
3986    /// Incrementally decode a byte stream into UTF-8 with type system signaling
3987    /// of UTF-8 validity.
3988    ///
3989    /// This methods calls `decode_to_utf8` and then zeroes out up to three
3990    /// bytes that aren't logically part of the write in order to retain the
3991    /// UTF-8 validity even for the unwritten part of the buffer.
3992    ///
3993    /// See the documentation of the struct for documentation for `decode_*`
3994    /// methods collectively.
3995    ///
3996    /// Available to Rust only.
3997    pub fn decode_to_str_without_replacement(
3998        &mut self,
3999        src: &[u8],
4000        dst: &mut str,
4001        last: bool,
4002    ) -> (DecoderResult, usize, usize) {
4003        let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4004        let (result, read, written) = self.decode_to_utf8_without_replacement(src, bytes, last);
4005        let len = bytes.len();
4006        let mut trail = written;
4007        // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4008        // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4009        // encodings to avoid overwriting here.
4010        if self.encoding != UTF_8 {
4011            let max = std::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4012            while trail < max {
4013                bytes[trail] = 0;
4014                trail += 1;
4015            }
4016        }
4017        while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4018            bytes[trail] = 0;
4019            trail += 1;
4020        }
4021        (result, read, written)
4022    }
4023
4024    /// Incrementally decode a byte stream into UTF-8 using a `String` receiver.
4025    ///
4026    /// Like the others, this method follows the logic that the output buffer is
4027    /// caller-allocated. This method treats the capacity of the `String` as
4028    /// the output limit. That is, this method guarantees not to cause a
4029    /// reallocation of the backing buffer of `String`.
4030    ///
4031    /// The return value is a pair that contains the `DecoderResult` and the
4032    /// number of bytes read. The number of bytes written is signaled via
4033    /// the length of the `String` changing.
4034    ///
4035    /// See the documentation of the struct for documentation for `decode_*`
4036    /// methods collectively.
4037    ///
4038    /// Available to Rust only.
4039    pub fn decode_to_string_without_replacement(
4040        &mut self,
4041        src: &[u8],
4042        dst: &mut String,
4043        last: bool,
4044    ) -> (DecoderResult, usize) {
4045        unsafe {
4046            let vec = dst.as_mut_vec();
4047            let old_len = vec.len();
4048            let capacity = vec.capacity();
4049            vec.set_len(capacity);
4050            let (result, read, written) =
4051                self.decode_to_utf8_without_replacement(src, &mut vec[old_len..], last);
4052            vec.set_len(old_len + written);
4053            (result, read)
4054        }
4055    }
4056
4057    /// Query the worst-case UTF-16 output size (with or without replacement).
4058    ///
4059    /// Returns the size of the output buffer in UTF-16 code units (`u16`)
4060    /// that will not overflow given the current state of the decoder and
4061    /// `byte_length` number of additional input bytes or `None` if `usize`
4062    /// would overflow.
4063    ///
4064    /// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
4065    /// return value of this method applies also in the
4066    /// `_without_replacement` case.
4067    ///
4068    /// Available via the C wrapper.
4069    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
4070        // Need to consider a) the decoder morphing due to the BOM and b) a partial
4071        // BOM getting pushed to the underlying decoder.
4072        match self.life_cycle {
4073            DecoderLifeCycle::Converting
4074            | DecoderLifeCycle::AtUtf8Start
4075            | DecoderLifeCycle::AtUtf16LeStart
4076            | DecoderLifeCycle::AtUtf16BeStart => {
4077                return self.variant.max_utf16_buffer_length(byte_length);
4078            }
4079            DecoderLifeCycle::AtStart => {
4080                if let Some(utf8_bom) = byte_length.checked_add(1) {
4081                    if let Some(utf16_bom) =
4082                        checked_add(1, checked_div(byte_length.checked_add(1), 2))
4083                    {
4084                        let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
4085                        let encoding = self.encoding();
4086                        if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
4087                            // No need to consider the internal state of the underlying decoder,
4088                            // because it is at start, because no data has reached it yet.
4089                            return Some(utf_bom);
4090                        } else if let Some(non_bom) =
4091                            self.variant.max_utf16_buffer_length(byte_length)
4092                        {
4093                            return Some(std::cmp::max(utf_bom, non_bom));
4094                        }
4095                    }
4096                }
4097            }
4098            DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
4099                // Add two bytes even when only one byte has been seen,
4100                // because the one byte can become a lead byte in multibyte
4101                // decoders, but only after the decoder has been queried
4102                // for max length, so the decoder's own logic for adding
4103                // one for a pending lead cannot work.
4104                if let Some(sum) = byte_length.checked_add(2) {
4105                    if let Some(utf8_bom) = sum.checked_add(1) {
4106                        if self.encoding() == UTF_8 {
4107                            // No need to consider the internal state of the underlying decoder,
4108                            // because it is at start, because no data has reached it yet.
4109                            return Some(utf8_bom);
4110                        } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4111                            return Some(std::cmp::max(utf8_bom, non_bom));
4112                        }
4113                    }
4114                }
4115            }
4116            DecoderLifeCycle::ConvertingWithPendingBB => {
4117                if let Some(sum) = byte_length.checked_add(2) {
4118                    return self.variant.max_utf16_buffer_length(sum);
4119                }
4120            }
4121            DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
4122                // Add two bytes even when only one byte has been seen,
4123                // because the one byte can become a lead byte in multibyte
4124                // decoders, but only after the decoder has been queried
4125                // for max length, so the decoder's own logic for adding
4126                // one for a pending lead cannot work.
4127                if let Some(sum) = byte_length.checked_add(2) {
4128                    if let Some(utf16_bom) = checked_add(1, checked_div(sum.checked_add(1), 2)) {
4129                        let encoding = self.encoding();
4130                        if encoding == UTF_16LE || encoding == UTF_16BE {
4131                            // No need to consider the internal state of the underlying decoder,
4132                            // because it is at start, because no data has reached it yet.
4133                            return Some(utf16_bom);
4134                        } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4135                            return Some(std::cmp::max(utf16_bom, non_bom));
4136                        }
4137                    }
4138                }
4139            }
4140            DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4141        }
4142        None
4143    }
4144
4145    /// Incrementally decode a byte stream into UTF-16 with malformed sequences
4146    /// replaced with the REPLACEMENT CHARACTER.
4147    ///
4148    /// See the documentation of the struct for documentation for `decode_*`
4149    /// methods collectively.
4150    ///
4151    /// Available via the C wrapper.
4152    pub fn decode_to_utf16(
4153        &mut self,
4154        src: &[u8],
4155        dst: &mut [u16],
4156        last: bool,
4157    ) -> (CoderResult, usize, usize, bool) {
4158        let mut had_errors = false;
4159        let mut total_read = 0usize;
4160        let mut total_written = 0usize;
4161        loop {
4162            let (result, read, written) = self.decode_to_utf16_without_replacement(
4163                &src[total_read..],
4164                &mut dst[total_written..],
4165                last,
4166            );
4167            total_read += read;
4168            total_written += written;
4169            match result {
4170                DecoderResult::InputEmpty => {
4171                    return (
4172                        CoderResult::InputEmpty,
4173                        total_read,
4174                        total_written,
4175                        had_errors,
4176                    );
4177                }
4178                DecoderResult::OutputFull => {
4179                    return (
4180                        CoderResult::OutputFull,
4181                        total_read,
4182                        total_written,
4183                        had_errors,
4184                    );
4185                }
4186                DecoderResult::Malformed(_, _) => {
4187                    had_errors = true;
4188                    // There should always be space for the U+FFFD, because
4189                    // otherwise we'd have gotten OutputFull already.
4190                    dst[total_written] = 0xFFFD;
4191                    total_written += 1;
4192                }
4193            }
4194        }
4195    }
4196
    // Instantiates the `public_decode_function!` macro (defined outside this
    // part of the file) for UTF-16 output. The arguments are, in order: the
    // doc comment for the generated item, the public method name
    // (`decode_to_utf16_without_replacement`, which `decode_to_utf16` calls),
    // five further method names, and the output code unit type (`u16`).
    // NOTE(review): the roles of the five helper names are only inferred from
    // their identifiers (raw decode, end checking, BOM-byte handling) --
    // confirm against the macro definition.
    public_decode_function!(/// Incrementally decode a byte stream into UTF-16
                            /// _without replacement_.
                            ///
                            /// See the documentation of the struct for
                            /// documentation for `decode_*` methods
                            /// collectively.
                            ///
                            /// Available via the C wrapper.
                            ,
                            decode_to_utf16_without_replacement,
                            decode_to_utf16_raw,
                            decode_to_utf16_checking_end,
                            decode_to_utf16_after_one_potential_bom_byte,
                            decode_to_utf16_after_two_potential_bom_bytes,
                            decode_to_utf16_checking_end_with_offset,
                            u16);
4213
4214    /// Checks for compatibility with storing Unicode scalar values as unsigned
4215    /// bytes taking into account the state of the decoder.
4216    ///
4217    /// Returns `None` if the decoder is not in a neutral state, including waiting
4218    /// for the BOM, or if the encoding is never Latin1-byte-compatible.
4219    ///
4220    /// Otherwise returns the index of the first byte whose unsigned value doesn't
4221    /// directly correspond to the decoded Unicode scalar value, or the length
4222    /// of the input if all bytes in the input decode directly to scalar values
4223    /// corresponding to the unsigned byte values.
4224    ///
4225    /// Does not change the state of the decoder.
4226    ///
4227    /// Do not use this unless you are supporting SpiderMonkey/V8-style string
4228    /// storage optimizations.
4229    ///
4230    /// Available via the C wrapper.
4231    pub fn latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize> {
4232        match self.life_cycle {
4233            DecoderLifeCycle::Converting => {
4234                return self.variant.latin1_byte_compatible_up_to(bytes);
4235            }
4236            DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4237            _ => None,
4238        }
4239    }
4240}
4241
/// Outcome of a (potentially partial) encode operation performed without
/// replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum EncoderResult {
    /// All of the input was consumed.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// encoding process has completed. Otherwise, the caller should call an
    /// encode method again with more input.
    InputEmpty,

    /// The output buffer does not have enough space left for the encoder to
    /// produce another unit of output.
    ///
    /// The caller must provide more output space upon the next call and
    /// re-push the remaining input to the encoder.
    OutputFull,

    /// The encoder came across a character that the target encoding cannot
    /// represent; the character is carried in the variant.
    ///
    /// The caller must either treat this as a fatal error or must append
    /// a placeholder to the output and then re-push the remaining input to
    /// the encoder.
    Unmappable(char),
}

impl EncoderResult {
    /// Wraps the BMP code unit `bmp` in an `Unmappable` result.
    ///
    /// Panics if `bmp` is not a Unicode scalar value, i.e. if it is a
    /// surrogate.
    fn unmappable_from_bmp(bmp: u16) -> EncoderResult {
        let scalar = ::std::char::from_u32(u32::from(bmp)).unwrap();
        EncoderResult::Unmappable(scalar)
    }
}
4273
/// A converter that encodes a Unicode stream into bytes according to a
/// character encoding in a streaming (incremental) manner.
///
/// The various `encode_*` methods take an input buffer (`src`) and an output
/// buffer `dst` both of which are caller-allocated. There are variants for
/// both UTF-8 and UTF-16 input buffers.
///
/// An `encode_*` method encodes characters from `src` into bytes stored
/// into `dst` until one of the following three things happens:
///
/// 1. An unmappable character is encountered (`*_without_replacement` variants
///    only).
///
/// 2. The output buffer has been filled so near capacity that the encoder
///    cannot be sure that processing an additional character of input wouldn't
///    cause so much output that the output buffer would overflow.
///
/// 3. All the input characters have been processed.
///
/// The `encode_*` method then returns a tuple of a status indicating which one
/// of the three reasons to return happened, how many input code units (`u8`
/// when encoding from UTF-8 and `u16` when encoding from UTF-16) were read,
/// how many output bytes were written (except when encoding into `Vec<u8>`,
/// whose length change indicates this), and in the case of the variants that
/// perform replacement, a boolean indicating whether an unmappable
/// character was replaced with a numeric character reference during the call.
///
/// The number of bytes "written" is what's logically written. Garbage may be
/// written in the output buffer beyond the point logically written to.
///
/// In the case of the methods whose name ends with
/// `*_without_replacement`, the status is an [`EncoderResult`][1] enumeration
/// (possibilities `Unmappable`, `OutputFull` and `InputEmpty` corresponding to
/// the three cases listed above).
///
/// In the case of methods whose name does not end with
/// `*_without_replacement`, unmappable characters are automatically replaced
/// with the corresponding numeric character references and unmappable
/// characters do not cause the methods to return early.
///
/// When encoding from UTF-8 without replacement, the methods are guaranteed
/// not to return indicating that more output space is needed if the length
/// of the output buffer is at least the length returned by
/// [`max_buffer_length_from_utf8_without_replacement()`][2]. When encoding from
/// UTF-8 with replacement, the length of the output buffer that guarantees the
/// methods not to return indicating that more output space is needed in the
/// absence of unmappable characters is given by
/// [`max_buffer_length_from_utf8_if_no_unmappables()`][3]. When encoding from
/// UTF-16 without replacement, the methods are guaranteed not to return
/// indicating that more output space is needed if the length of the output
/// buffer is at least the length returned by
/// [`max_buffer_length_from_utf16_without_replacement()`][4]. When encoding
/// from UTF-16 with replacement, the length of the output buffer that
/// guarantees the methods not to return indicating that more output space is
/// needed in the absence of unmappable characters is given by
/// [`max_buffer_length_from_utf16_if_no_unmappables()`][5].
/// When encoding with replacement, applications are not expected to size the
/// buffer for the worst case ahead of time but to resize the buffer if there
/// are unmappable characters. This is why max length queries are only available
/// for the case where there are no unmappable characters.
///
/// When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. (When
/// calling from Rust, the type system takes care of this.) When encoding from
/// UTF-16, unpaired surrogates in the input are treated as U+FFFD REPLACEMENT
/// CHARACTERS. Therefore, in order for astral characters not to turn into a
/// pair of REPLACEMENT CHARACTERS, the caller must ensure that surrogate pairs
/// are not split across input buffer boundaries.
///
/// After an `encode_*` call returns, the output produced so far, taken as a
/// whole from the start of the stream, is guaranteed to consist of a valid
/// byte sequence in the target encoding. (I.e. the code unit sequence for a
/// character is guaranteed not to be split across output buffers. However, due
/// to the stateful nature of ISO-2022-JP, the stream needs to be considered
/// from the start for it to be valid. For other encodings, the validity holds
/// on a per-output buffer basis.)
///
/// The boolean argument `last` indicates that the end of the stream is reached
/// when all the characters in `src` have been consumed. This argument is needed
/// for ISO-2022-JP and is ignored for other encodings.
///
/// An `Encoder` object can be used to incrementally encode a Unicode stream
/// into a byte stream.
///
/// During the processing of a single stream, the caller must call `encode_*`
/// zero or more times with `last` set to `false` and then call `encode_*` at
/// least once with `last` set to `true`. If `encode_*` returns `InputEmpty`,
/// the processing of the stream has ended. Otherwise, the caller must call
/// `encode_*` again with `last` set to `true` (or treat an `Unmappable` result
/// as a fatal error).
///
/// Once the stream has ended, the `Encoder` object must not be used anymore.
/// That is, you need to create another one to process another stream.
///
/// When the encoder returns `OutputFull` or the encoder returns `Unmappable`
/// and the caller does not wish to treat it as a fatal error, the input buffer
/// `src` may not have been completely consumed. In that case, the caller must
/// pass the unconsumed contents of `src` to `encode_*` again upon the next
/// call.
///
/// [1]: enum.EncoderResult.html
/// [2]: #method.max_buffer_length_from_utf8_without_replacement
/// [3]: #method.max_buffer_length_from_utf8_if_no_unmappables
/// [4]: #method.max_buffer_length_from_utf16_without_replacement
/// [5]: #method.max_buffer_length_from_utf16_if_no_unmappables
///
/// # Infinite loops
///
/// When converting with a fixed-size output buffer whose size is too small to
/// accommodate one character of output, an infinite loop ensues. When
/// converting with a fixed-size output buffer, it generally makes sense to
/// make the buffer fairly large (e.g. a couple of kilobytes).
pub struct Encoder {
    // The encoding this encoder was created for; returned by `encoding()`.
    encoding: &'static Encoding,
    // The encoding-specific implementation that every public method
    // delegates the actual work to.
    variant: VariantEncoder,
}
4388
4389impl Encoder {
    /// Creates an `Encoder` that pairs the `Encoding` it is for (`enc`) with
    /// the variant encoder (`encoder`) that the public methods delegate to.
    fn new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder {
        Encoder {
            encoding: enc,
            variant: encoder,
        }
    }
4396
    /// The `Encoding` this `Encoder` is for.
    ///
    /// This is the reference that was stored when the encoder was created;
    /// calling this method does not change the state of the encoder.
    #[inline]
    pub fn encoding(&self) -> &'static Encoding {
        self.encoding
    }
4402
    /// Returns `true` if this is an ISO-2022-JP encoder that's not in the
    /// ASCII state and `false` otherwise.
    ///
    /// The answer comes from the encoding-specific variant encoder. The
    /// replacing `encode_*` methods consult it when `last` is `true` to
    /// decide whether an exhausted input really means `InputEmpty`.
    #[inline]
    pub fn has_pending_state(&self) -> bool {
        self.variant.has_pending_state()
    }
4409
4410    /// Query the worst-case output size when encoding from UTF-8 with
4411    /// replacement.
4412    ///
4413    /// Returns the size of the output buffer in bytes that will not overflow
4414    /// given the current state of the encoder and `byte_length` number of
4415    /// additional input code units if there are no unmappable characters in
4416    /// the input or `None` if `usize` would overflow.
4417    ///
4418    /// Available via the C wrapper.
4419    pub fn max_buffer_length_from_utf8_if_no_unmappables(
4420        &self,
4421        byte_length: usize,
4422    ) -> Option<usize> {
4423        checked_add(
4424            if self.encoding().can_encode_everything() {
4425                0
4426            } else {
4427                NCR_EXTRA
4428            },
4429            self.max_buffer_length_from_utf8_without_replacement(byte_length),
4430        )
4431    }
4432
    /// Query the worst-case output size when encoding from UTF-8 without
    /// replacement.
    ///
    /// Returns the size of the output buffer in bytes that will not overflow
    /// given the current state of the encoder and `byte_length` number of
    /// additional input code units or `None` if `usize` would overflow.
    ///
    /// The computation is delegated to the encoding-specific variant encoder.
    ///
    /// Available via the C wrapper.
    pub fn max_buffer_length_from_utf8_without_replacement(
        &self,
        byte_length: usize,
    ) -> Option<usize> {
        self.variant
            .max_buffer_length_from_utf8_without_replacement(byte_length)
    }
4448
4449    /// Incrementally encode into byte stream from UTF-8 with unmappable
4450    /// characters replaced with HTML (decimal) numeric character references.
4451    ///
4452    /// See the documentation of the struct for documentation for `encode_*`
4453    /// methods collectively.
4454    ///
4455    /// Available via the C wrapper.
4456    pub fn encode_from_utf8(
4457        &mut self,
4458        src: &str,
4459        dst: &mut [u8],
4460        last: bool,
4461    ) -> (CoderResult, usize, usize, bool) {
4462        let dst_len = dst.len();
4463        let effective_dst_len = if self.encoding().can_encode_everything() {
4464            dst_len
4465        } else {
4466            if dst_len < NCR_EXTRA {
4467                if src.is_empty() && !(last && self.has_pending_state()) {
4468                    return (CoderResult::InputEmpty, 0, 0, false);
4469                }
4470                return (CoderResult::OutputFull, 0, 0, false);
4471            }
4472            dst_len - NCR_EXTRA
4473        };
4474        let mut had_unmappables = false;
4475        let mut total_read = 0usize;
4476        let mut total_written = 0usize;
4477        loop {
4478            let (result, read, written) = self.encode_from_utf8_without_replacement(
4479                &src[total_read..],
4480                &mut dst[total_written..effective_dst_len],
4481                last,
4482            );
4483            total_read += read;
4484            total_written += written;
4485            match result {
4486                EncoderResult::InputEmpty => {
4487                    return (
4488                        CoderResult::InputEmpty,
4489                        total_read,
4490                        total_written,
4491                        had_unmappables,
4492                    );
4493                }
4494                EncoderResult::OutputFull => {
4495                    return (
4496                        CoderResult::OutputFull,
4497                        total_read,
4498                        total_written,
4499                        had_unmappables,
4500                    );
4501                }
4502                EncoderResult::Unmappable(unmappable) => {
4503                    had_unmappables = true;
4504                    debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4505                    debug_assert_ne!(self.encoding(), UTF_16BE);
4506                    debug_assert_ne!(self.encoding(), UTF_16LE);
4507                    // Additionally, Iso2022JpEncoder is responsible for
4508                    // transitioning to ASCII when returning with Unmappable.
4509                    total_written += write_ncr(unmappable, &mut dst[total_written..]);
4510                    if total_written >= effective_dst_len {
4511                        if total_read == src.len() && !(last && self.has_pending_state()) {
4512                            return (
4513                                CoderResult::InputEmpty,
4514                                total_read,
4515                                total_written,
4516                                had_unmappables,
4517                            );
4518                        }
4519                        return (
4520                            CoderResult::OutputFull,
4521                            total_read,
4522                            total_written,
4523                            had_unmappables,
4524                        );
4525                    }
4526                }
4527            }
4528        }
4529    }
4530
4531    /// Incrementally encode into byte stream from UTF-8 with unmappable
4532    /// characters replaced with HTML (decimal) numeric character references.
4533    ///
4534    /// See the documentation of the struct for documentation for `encode_*`
4535    /// methods collectively.
4536    ///
4537    /// Available to Rust only.
4538    pub fn encode_from_utf8_to_vec(
4539        &mut self,
4540        src: &str,
4541        dst: &mut Vec<u8>,
4542        last: bool,
4543    ) -> (CoderResult, usize, bool) {
4544        unsafe {
4545            let old_len = dst.len();
4546            let capacity = dst.capacity();
4547            dst.set_len(capacity);
4548            let (result, read, written, replaced) =
4549                self.encode_from_utf8(src, &mut dst[old_len..], last);
4550            dst.set_len(old_len + written);
4551            (result, read, replaced)
4552        }
4553    }
4554
    /// Incrementally encode into byte stream from UTF-8 _without replacement_.
    ///
    /// Returns a tuple of the status, the number of bytes read from `src` and
    /// the number of bytes written to `dst`. The work is delegated to the
    /// encoding-specific variant encoder.
    ///
    /// See the documentation of the struct for documentation for `encode_*`
    /// methods collectively.
    ///
    /// Available via the C wrapper.
    pub fn encode_from_utf8_without_replacement(
        &mut self,
        src: &str,
        dst: &mut [u8],
        last: bool,
    ) -> (EncoderResult, usize, usize) {
        self.variant.encode_from_utf8_raw(src, dst, last)
    }
4569
4570    /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4571    ///
4572    /// See the documentation of the struct for documentation for `encode_*`
4573    /// methods collectively.
4574    ///
4575    /// Available to Rust only.
4576    pub fn encode_from_utf8_to_vec_without_replacement(
4577        &mut self,
4578        src: &str,
4579        dst: &mut Vec<u8>,
4580        last: bool,
4581    ) -> (EncoderResult, usize) {
4582        unsafe {
4583            let old_len = dst.len();
4584            let capacity = dst.capacity();
4585            dst.set_len(capacity);
4586            let (result, read, written) =
4587                self.encode_from_utf8_without_replacement(src, &mut dst[old_len..], last);
4588            dst.set_len(old_len + written);
4589            (result, read)
4590        }
4591    }
4592
4593    /// Query the worst-case output size when encoding from UTF-16 with
4594    /// replacement.
4595    ///
4596    /// Returns the size of the output buffer in bytes that will not overflow
4597    /// given the current state of the encoder and `u16_length` number of
4598    /// additional input code units if there are no unmappable characters in
4599    /// the input or `None` if `usize` would overflow.
4600    ///
4601    /// Available via the C wrapper.
4602    pub fn max_buffer_length_from_utf16_if_no_unmappables(
4603        &self,
4604        u16_length: usize,
4605    ) -> Option<usize> {
4606        checked_add(
4607            if self.encoding().can_encode_everything() {
4608                0
4609            } else {
4610                NCR_EXTRA
4611            },
4612            self.max_buffer_length_from_utf16_without_replacement(u16_length),
4613        )
4614    }
4615
    /// Query the worst-case output size when encoding from UTF-16 without
    /// replacement.
    ///
    /// Returns the size of the output buffer in bytes that will not overflow
    /// given the current state of the encoder and `u16_length` number of
    /// additional input code units or `None` if `usize` would overflow.
    ///
    /// The computation is delegated to the encoding-specific variant encoder.
    ///
    /// Available via the C wrapper.
    pub fn max_buffer_length_from_utf16_without_replacement(
        &self,
        u16_length: usize,
    ) -> Option<usize> {
        self.variant
            .max_buffer_length_from_utf16_without_replacement(u16_length)
    }
4631
4632    /// Incrementally encode into byte stream from UTF-16 with unmappable
4633    /// characters replaced with HTML (decimal) numeric character references.
4634    ///
4635    /// See the documentation of the struct for documentation for `encode_*`
4636    /// methods collectively.
4637    ///
4638    /// Available via the C wrapper.
4639    pub fn encode_from_utf16(
4640        &mut self,
4641        src: &[u16],
4642        dst: &mut [u8],
4643        last: bool,
4644    ) -> (CoderResult, usize, usize, bool) {
4645        let dst_len = dst.len();
4646        let effective_dst_len = if self.encoding().can_encode_everything() {
4647            dst_len
4648        } else {
4649            if dst_len < NCR_EXTRA {
4650                if src.is_empty() && !(last && self.has_pending_state()) {
4651                    return (CoderResult::InputEmpty, 0, 0, false);
4652                }
4653                return (CoderResult::OutputFull, 0, 0, false);
4654            }
4655            dst_len - NCR_EXTRA
4656        };
4657        let mut had_unmappables = false;
4658        let mut total_read = 0usize;
4659        let mut total_written = 0usize;
4660        loop {
4661            let (result, read, written) = self.encode_from_utf16_without_replacement(
4662                &src[total_read..],
4663                &mut dst[total_written..effective_dst_len],
4664                last,
4665            );
4666            total_read += read;
4667            total_written += written;
4668            match result {
4669                EncoderResult::InputEmpty => {
4670                    return (
4671                        CoderResult::InputEmpty,
4672                        total_read,
4673                        total_written,
4674                        had_unmappables,
4675                    );
4676                }
4677                EncoderResult::OutputFull => {
4678                    return (
4679                        CoderResult::OutputFull,
4680                        total_read,
4681                        total_written,
4682                        had_unmappables,
4683                    );
4684                }
4685                EncoderResult::Unmappable(unmappable) => {
4686                    had_unmappables = true;
4687                    debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4688                    // There are no UTF-16 encoders and even if there were,
4689                    // they'd never have unmappables.
4690                    debug_assert_ne!(self.encoding(), UTF_16BE);
4691                    debug_assert_ne!(self.encoding(), UTF_16LE);
4692                    // Additionally, Iso2022JpEncoder is responsible for
4693                    // transitioning to ASCII when returning with Unmappable
4694                    // from the jis0208 state. That is, when we encode
4695                    // ISO-2022-JP and come here, the encoder is in either the
4696                    // ASCII or the Roman state. We are allowed to generate any
4697                    // printable ASCII excluding \ and ~.
4698                    total_written += write_ncr(unmappable, &mut dst[total_written..]);
4699                    if total_written >= effective_dst_len {
4700                        if total_read == src.len() && !(last && self.has_pending_state()) {
4701                            return (
4702                                CoderResult::InputEmpty,
4703                                total_read,
4704                                total_written,
4705                                had_unmappables,
4706                            );
4707                        }
4708                        return (
4709                            CoderResult::OutputFull,
4710                            total_read,
4711                            total_written,
4712                            had_unmappables,
4713                        );
4714                    }
4715                }
4716            }
4717        }
4718    }
4719
    /// Incrementally encode into byte stream from UTF-16 _without replacement_.
    ///
    /// Returns a tuple of the status, the number of `u16` code units read
    /// from `src` and the number of bytes written to `dst`. The work is
    /// delegated to the encoding-specific variant encoder.
    ///
    /// See the documentation of the struct for documentation for `encode_*`
    /// methods collectively.
    ///
    /// Available via the C wrapper.
    pub fn encode_from_utf16_without_replacement(
        &mut self,
        src: &[u16],
        dst: &mut [u8],
        last: bool,
    ) -> (EncoderResult, usize, usize) {
        self.variant.encode_from_utf16_raw(src, dst, last)
    }
4734}
4735
/// Format an unmappable as NCR without heap allocation.
///
/// Writes the decimal numeric character reference for `unmappable` (`&#`,
/// the decimal digits of the scalar value, `;`) to the start of `dst` and
/// returns the number of bytes written, which is the digit count plus three
/// (at most ten, for U+10FFFF).
///
/// Every scalar value is handled, including those below ten whose reference
/// has a single digit.
///
/// # Panics
///
/// Panics if `dst` is shorter than the reference to be written.
fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize {
    let mut number = unmappable as u32;
    // Count the decimal digits; even zero has one.
    let mut digits = 1usize;
    let mut rest = number / 10;
    while rest != 0 {
        digits += 1;
        rest /= 10;
    }
    // 3 is the combined length of "&#" and ";".
    let len = digits + 3;
    debug_assert!(len <= dst.len());
    // Narrow to the exact output window up front so that a too-short `dst`
    // fails before anything has been written.
    let ncr = &mut dst[..len];
    ncr[0] = b'&';
    ncr[1] = b'#';
    ncr[len - 1] = b';';
    // Fill the digit slots from the least significant digit backwards.
    for slot in ncr[2..len - 1].iter_mut().rev() {
        *slot = (number % 10) as u8 + b'0';
        number /= 10;
    }
    len
}
4774
/// Tests `start <= i < end` with a single unsigned comparison.
///
/// Values of `i` below `start` wrap around to large offsets and are
/// therefore rejected. Expects `start <= end`; `end - start` overflows
/// otherwise.
#[inline(always)]
fn in_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset < width
}
4779
/// Tests `start <= i < end` with a single unsigned comparison.
///
/// Values of `i` below `start` wrap around to large offsets and are
/// therefore rejected. Expects `start <= end`; `end - start` overflows
/// otherwise.
#[inline(always)]
fn in_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset < width
}
4784
/// Tests `start <= i <= end` with a single unsigned comparison.
///
/// Values of `i` below `start` wrap around to large offsets and are
/// therefore rejected. Expects `start <= end`; `end - start` overflows
/// otherwise.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4789
/// Tests `start <= i <= end` with a single unsigned comparison.
///
/// Values of `i` below `start` wrap around to large offsets and are
/// therefore rejected. Expects `start <= end`; `end - start` overflows
/// otherwise.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4794
/// Tests `start <= i <= end` with a single unsigned comparison.
///
/// Values of `i` below `start` wrap around to large offsets and are
/// therefore rejected. Expects `start <= end`; `end - start` overflows
/// otherwise.
#[inline(always)]
fn in_inclusive_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4799
/// Tests `start <= i <= end` with a single unsigned comparison.
///
/// Values of `i` below `start` wrap around to large offsets and are
/// therefore rejected. Expects `start <= end`; `end - start` overflows
/// otherwise.
#[inline(always)]
fn in_inclusive_range(i: usize, start: usize, end: usize) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4804
/// Adds `num` to the value in `opt`.
///
/// Yields `None` if `opt` is `None` or if the sum would overflow `usize`.
#[inline(always)]
fn checked_add(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_add(num))
}
4813
4814#[inline(always)]
4815fn checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize> {
4816    if let Some(n) = one {
4817        checked_add(n, other)
4818    } else {
4819        None
4820    }
4821}
4822
/// Multiplies the value in `opt` by `num`.
///
/// Yields `None` if `opt` is `None` or if the product would overflow `usize`.
#[inline(always)]
fn checked_mul(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_mul(num))
}
4831
/// Divides the value in `opt` by `num`.
///
/// Yields `None` if `opt` is `None` or if `num` is zero.
#[inline(always)]
fn checked_div(opt: Option<usize>, num: usize) -> Option<usize> {
    opt.and_then(|n| n.checked_div(num))
}
4840
/// Rounds the value in `opt` up to the next power of two.
///
/// Yields `None` if `opt` is `None` or if the next power of two does not
/// fit in `usize`. The unchecked `next_power_of_two` is deliberately avoided:
/// on overflow it panics in debug builds and returns zero in release builds.
#[inline(always)]
fn checked_next_power_of_two(opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_next_power_of_two())
}
4845
/// Returns the smaller of two optional values, treating `None` as "no
/// value" rather than as a failure.
///
/// If only one operand is `Some`, it is returned; the result is `None` only
/// when both operands are `None`.
#[inline(always)]
fn checked_min(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => Some(::std::cmp::min(a, b)),
        (Some(a), None) => Some(a),
        (None, rest) => rest,
    }
}
4858
4859// ############## TESTS ###############
4860
// Fixture used by `test_serde`: a struct that holds an `&'static Encoding`
// next to ordinary fields so the test can round-trip the encoding reference
// through serde-based formats. Only compiled for test builds that have the
// `serde` feature enabled.
#[cfg(all(test, feature = "serde"))]
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct Demo {
    num: u32,
    name: String,
    enc: &'static Encoding,
}
4868
4869#[cfg(test)]
4870mod test_labels_names;
4871
4872#[cfg(test)]
4873mod tests {
4874    use super::*;
4875    use std::borrow::Cow;
4876
4877    fn sniff_to_utf16(
4878        initial_encoding: &'static Encoding,
4879        expected_encoding: &'static Encoding,
4880        bytes: &[u8],
4881        expect: &[u16],
4882        breaks: &[usize],
4883    ) {
4884        let mut decoder = initial_encoding.new_decoder();
4885
4886        let mut dest: Vec<u16> =
4887            Vec::with_capacity(decoder.max_utf16_buffer_length(bytes.len()).unwrap());
4888        let capacity = dest.capacity();
4889        dest.resize(capacity, 0u16);
4890
4891        let mut total_written = 0usize;
4892        let mut start = 0usize;
4893        for br in breaks {
4894            let (result, read, written, _) =
4895                decoder.decode_to_utf16(&bytes[start..*br], &mut dest[total_written..], false);
4896            total_written += written;
4897            assert_eq!(read, *br - start);
4898            match result {
4899                CoderResult::InputEmpty => {}
4900                CoderResult::OutputFull => {
4901                    unreachable!();
4902                }
4903            }
4904            start = *br;
4905        }
4906        let (result, read, written, _) =
4907            decoder.decode_to_utf16(&bytes[start..], &mut dest[total_written..], true);
4908        total_written += written;
4909        match result {
4910            CoderResult::InputEmpty => {}
4911            CoderResult::OutputFull => {
4912                unreachable!();
4913            }
4914        }
4915        assert_eq!(read, bytes.len() - start);
4916        assert_eq!(total_written, expect.len());
4917        assert_eq!(&dest[..total_written], expect);
4918        assert_eq!(decoder.encoding(), expected_encoding);
4919    }
4920
4921    // Any copyright to the test code below this comment is dedicated to the
4922    // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
4923
4924    #[test]
4925    fn test_bom_sniffing() {
4926        // ASCII
4927        sniff_to_utf16(
4928            WINDOWS_1252,
4929            WINDOWS_1252,
4930            b"\x61\x62",
4931            &[0x0061u16, 0x0062u16],
4932            &[],
4933        );
4934        // UTF-8
4935        sniff_to_utf16(
4936            WINDOWS_1252,
4937            UTF_8,
4938            b"\xEF\xBB\xBF\x61\x62",
4939            &[0x0061u16, 0x0062u16],
4940            &[],
4941        );
4942        sniff_to_utf16(
4943            WINDOWS_1252,
4944            UTF_8,
4945            b"\xEF\xBB\xBF\x61\x62",
4946            &[0x0061u16, 0x0062u16],
4947            &[1],
4948        );
4949        sniff_to_utf16(
4950            WINDOWS_1252,
4951            UTF_8,
4952            b"\xEF\xBB\xBF\x61\x62",
4953            &[0x0061u16, 0x0062u16],
4954            &[2],
4955        );
4956        sniff_to_utf16(
4957            WINDOWS_1252,
4958            UTF_8,
4959            b"\xEF\xBB\xBF\x61\x62",
4960            &[0x0061u16, 0x0062u16],
4961            &[3],
4962        );
4963        sniff_to_utf16(
4964            WINDOWS_1252,
4965            UTF_8,
4966            b"\xEF\xBB\xBF\x61\x62",
4967            &[0x0061u16, 0x0062u16],
4968            &[4],
4969        );
4970        sniff_to_utf16(
4971            WINDOWS_1252,
4972            UTF_8,
4973            b"\xEF\xBB\xBF\x61\x62",
4974            &[0x0061u16, 0x0062u16],
4975            &[2, 3],
4976        );
4977        sniff_to_utf16(
4978            WINDOWS_1252,
4979            UTF_8,
4980            b"\xEF\xBB\xBF\x61\x62",
4981            &[0x0061u16, 0x0062u16],
4982            &[1, 2],
4983        );
4984        sniff_to_utf16(
4985            WINDOWS_1252,
4986            UTF_8,
4987            b"\xEF\xBB\xBF\x61\x62",
4988            &[0x0061u16, 0x0062u16],
4989            &[1, 3],
4990        );
4991        sniff_to_utf16(
4992            WINDOWS_1252,
4993            UTF_8,
4994            b"\xEF\xBB\xBF\x61\x62",
4995            &[0x0061u16, 0x0062u16],
4996            &[1, 2, 3, 4],
4997        );
4998        sniff_to_utf16(WINDOWS_1252, UTF_8, b"\xEF\xBB\xBF", &[], &[]);
4999        // Not UTF-8
5000        sniff_to_utf16(
5001            WINDOWS_1252,
5002            WINDOWS_1252,
5003            b"\xEF\xBB\x61\x62",
5004            &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5005            &[],
5006        );
5007        sniff_to_utf16(
5008            WINDOWS_1252,
5009            WINDOWS_1252,
5010            b"\xEF\xBB\x61\x62",
5011            &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5012            &[1],
5013        );
5014        sniff_to_utf16(
5015            WINDOWS_1252,
5016            WINDOWS_1252,
5017            b"\xEF\x61\x62",
5018            &[0x00EFu16, 0x0061u16, 0x0062u16],
5019            &[],
5020        );
5021        sniff_to_utf16(
5022            WINDOWS_1252,
5023            WINDOWS_1252,
5024            b"\xEF\x61\x62",
5025            &[0x00EFu16, 0x0061u16, 0x0062u16],
5026            &[1],
5027        );
5028        sniff_to_utf16(
5029            WINDOWS_1252,
5030            WINDOWS_1252,
5031            b"\xEF\xBB",
5032            &[0x00EFu16, 0x00BBu16],
5033            &[],
5034        );
5035        sniff_to_utf16(
5036            WINDOWS_1252,
5037            WINDOWS_1252,
5038            b"\xEF\xBB",
5039            &[0x00EFu16, 0x00BBu16],
5040            &[1],
5041        );
5042        sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xEF", &[0x00EFu16], &[]);
5043        // Not UTF-16
5044        sniff_to_utf16(
5045            WINDOWS_1252,
5046            WINDOWS_1252,
5047            b"\xFE\x61\x62",
5048            &[0x00FEu16, 0x0061u16, 0x0062u16],
5049            &[],
5050        );
5051        sniff_to_utf16(
5052            WINDOWS_1252,
5053            WINDOWS_1252,
5054            b"\xFE\x61\x62",
5055            &[0x00FEu16, 0x0061u16, 0x0062u16],
5056            &[1],
5057        );
5058        sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFE", &[0x00FEu16], &[]);
5059        sniff_to_utf16(
5060            WINDOWS_1252,
5061            WINDOWS_1252,
5062            b"\xFF\x61\x62",
5063            &[0x00FFu16, 0x0061u16, 0x0062u16],
5064            &[],
5065        );
5066        sniff_to_utf16(
5067            WINDOWS_1252,
5068            WINDOWS_1252,
5069            b"\xFF\x61\x62",
5070            &[0x00FFu16, 0x0061u16, 0x0062u16],
5071            &[1],
5072        );
5073        sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFF", &[0x00FFu16], &[]);
5074        // UTF-16
5075        sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[]);
5076        sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[1]);
5077        sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[]);
5078        sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[1]);
5079    }
5080
5081    #[test]
5082    fn test_output_encoding() {
5083        assert_eq!(REPLACEMENT.output_encoding(), UTF_8);
5084        assert_eq!(UTF_16BE.output_encoding(), UTF_8);
5085        assert_eq!(UTF_16LE.output_encoding(), UTF_8);
5086        assert_eq!(UTF_8.output_encoding(), UTF_8);
5087        assert_eq!(WINDOWS_1252.output_encoding(), WINDOWS_1252);
5088        assert_eq!(REPLACEMENT.new_encoder().encoding(), UTF_8);
5089        assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
5090        assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
5091        assert_eq!(UTF_8.new_encoder().encoding(), UTF_8);
5092        assert_eq!(WINDOWS_1252.new_encoder().encoding(), WINDOWS_1252);
5093    }
5094
5095    #[test]
5096    fn test_label_resolution() {
5097        assert_eq!(Encoding::for_label(b"utf-8"), Some(UTF_8));
5098        assert_eq!(Encoding::for_label(b"UTF-8"), Some(UTF_8));
5099        assert_eq!(
5100            Encoding::for_label(b" \t \n \x0C \n utf-8 \r \n \t \x0C "),
5101            Some(UTF_8)
5102        );
5103        assert_eq!(Encoding::for_label(b"utf-8 _"), None);
5104        assert_eq!(Encoding::for_label(b"bogus"), None);
5105        assert_eq!(Encoding::for_label(b"bogusbogusbogusbogus"), None);
5106    }
5107
5108    #[test]
5109    fn test_decode_valid_windows_1257_to_cow() {
5110        let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xE4");
5111        match cow {
5112            Cow::Borrowed(_) => unreachable!(),
5113            Cow::Owned(s) => {
5114                assert_eq!(s, "abc\u{20AC}\u{00E4}");
5115            }
5116        }
5117        assert_eq!(encoding, WINDOWS_1257);
5118        assert!(!had_errors);
5119    }
5120
5121    #[test]
5122    fn test_decode_invalid_windows_1257_to_cow() {
5123        let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xA1\xE4");
5124        match cow {
5125            Cow::Borrowed(_) => unreachable!(),
5126            Cow::Owned(s) => {
5127                assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5128            }
5129        }
5130        assert_eq!(encoding, WINDOWS_1257);
5131        assert!(had_errors);
5132    }
5133
5134    #[test]
5135    fn test_decode_ascii_only_windows_1257_to_cow() {
5136        let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc");
5137        match cow {
5138            Cow::Borrowed(s) => {
5139                assert_eq!(s, "abc");
5140            }
5141            Cow::Owned(_) => unreachable!(),
5142        }
5143        assert_eq!(encoding, WINDOWS_1257);
5144        assert!(!had_errors);
5145    }
5146
5147    #[test]
5148    fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow() {
5149        let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5150        match cow {
5151            Cow::Borrowed(s) => {
5152                assert_eq!(s, "\u{20AC}\u{00E4}");
5153            }
5154            Cow::Owned(_) => unreachable!(),
5155        }
5156        assert_eq!(encoding, UTF_8);
5157        assert!(!had_errors);
5158    }
5159
5160    #[test]
5161    fn test_decode_bomful_invalid_utf8_as_windows_1257_to_cow() {
5162        let (cow, encoding, had_errors) =
5163            WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5164        match cow {
5165            Cow::Borrowed(_) => unreachable!(),
5166            Cow::Owned(s) => {
5167                assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5168            }
5169        }
5170        assert_eq!(encoding, UTF_8);
5171        assert!(had_errors);
5172    }
5173
5174    #[test]
5175    fn test_decode_bomful_valid_utf8_as_utf_8_to_cow() {
5176        let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5177        match cow {
5178            Cow::Borrowed(s) => {
5179                assert_eq!(s, "\u{20AC}\u{00E4}");
5180            }
5181            Cow::Owned(_) => unreachable!(),
5182        }
5183        assert_eq!(encoding, UTF_8);
5184        assert!(!had_errors);
5185    }
5186
5187    #[test]
5188    fn test_decode_bomful_invalid_utf8_as_utf_8_to_cow() {
5189        let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5190        match cow {
5191            Cow::Borrowed(_) => unreachable!(),
5192            Cow::Owned(s) => {
5193                assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5194            }
5195        }
5196        assert_eq!(encoding, UTF_8);
5197        assert!(had_errors);
5198    }
5199
5200    #[test]
5201    fn test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal() {
5202        let (cow, had_errors) = UTF_8.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5203        match cow {
5204            Cow::Borrowed(s) => {
5205                assert_eq!(s, "\u{20AC}\u{00E4}");
5206            }
5207            Cow::Owned(_) => unreachable!(),
5208        }
5209        assert!(!had_errors);
5210    }
5211
5212    #[test]
5213    fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal() {
5214        let (cow, had_errors) =
5215            WINDOWS_1257.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5216        match cow {
5217            Cow::Borrowed(_) => unreachable!(),
5218            Cow::Owned(s) => {
5219                assert_eq!(
5220                    s,
5221                    "\u{013C}\u{00BB}\u{00E6}\u{0101}\u{201A}\u{00AC}\u{0106}\u{00A4}"
5222                );
5223            }
5224        }
5225        assert!(!had_errors);
5226    }
5227
5228    #[test]
5229    fn test_decode_valid_windows_1257_to_cow_with_bom_removal() {
5230        let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xE4");
5231        match cow {
5232            Cow::Borrowed(_) => unreachable!(),
5233            Cow::Owned(s) => {
5234                assert_eq!(s, "abc\u{20AC}\u{00E4}");
5235            }
5236        }
5237        assert!(!had_errors);
5238    }
5239
5240    #[test]
5241    fn test_decode_invalid_windows_1257_to_cow_with_bom_removal() {
5242        let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xA1\xE4");
5243        match cow {
5244            Cow::Borrowed(_) => unreachable!(),
5245            Cow::Owned(s) => {
5246                assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5247            }
5248        }
5249        assert!(had_errors);
5250    }
5251
5252    #[test]
5253    fn test_decode_ascii_only_windows_1257_to_cow_with_bom_removal() {
5254        let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc");
5255        match cow {
5256            Cow::Borrowed(s) => {
5257                assert_eq!(s, "abc");
5258            }
5259            Cow::Owned(_) => unreachable!(),
5260        }
5261        assert!(!had_errors);
5262    }
5263
5264    #[test]
5265    fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling() {
5266        let (cow, had_errors) =
5267            UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5268        match cow {
5269            Cow::Borrowed(s) => {
5270                assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
5271            }
5272            Cow::Owned(_) => unreachable!(),
5273        }
5274        assert!(!had_errors);
5275    }
5276
5277    #[test]
5278    fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling() {
5279        let (cow, had_errors) =
5280            UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5281        match cow {
5282            Cow::Borrowed(_) => unreachable!(),
5283            Cow::Owned(s) => {
5284                assert_eq!(s, "\u{FEFF}\u{20AC}\u{FFFD}\u{00E4}");
5285            }
5286        }
5287        assert!(had_errors);
5288    }
5289
5290    #[test]
5291    fn test_decode_valid_windows_1257_to_cow_without_bom_handling() {
5292        let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xE4");
5293        match cow {
5294            Cow::Borrowed(_) => unreachable!(),
5295            Cow::Owned(s) => {
5296                assert_eq!(s, "abc\u{20AC}\u{00E4}");
5297            }
5298        }
5299        assert!(!had_errors);
5300    }
5301
5302    #[test]
5303    fn test_decode_invalid_windows_1257_to_cow_without_bom_handling() {
5304        let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xA1\xE4");
5305        match cow {
5306            Cow::Borrowed(_) => unreachable!(),
5307            Cow::Owned(s) => {
5308                assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5309            }
5310        }
5311        assert!(had_errors);
5312    }
5313
5314    #[test]
5315    fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling() {
5316        let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc");
5317        match cow {
5318            Cow::Borrowed(s) => {
5319                assert_eq!(s, "abc");
5320            }
5321            Cow::Owned(_) => unreachable!(),
5322        }
5323        assert!(!had_errors);
5324    }
5325
5326    #[test]
5327    fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement() {
5328        match UTF_8.decode_without_bom_handling_and_without_replacement(
5329            b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4",
5330        ) {
5331            Some(cow) => match cow {
5332                Cow::Borrowed(s) => {
5333                    assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
5334                }
5335                Cow::Owned(_) => unreachable!(),
5336            },
5337            None => unreachable!(),
5338        }
5339    }
5340
5341    #[test]
5342    fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement() {
5343        assert!(UTF_8
5344            .decode_without_bom_handling_and_without_replacement(
5345                b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4"
5346            )
5347            .is_none());
5348    }
5349
5350    #[test]
5351    fn test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5352        match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc\x80\xE4") {
5353            Some(cow) => match cow {
5354                Cow::Borrowed(_) => unreachable!(),
5355                Cow::Owned(s) => {
5356                    assert_eq!(s, "abc\u{20AC}\u{00E4}");
5357                }
5358            },
5359            None => unreachable!(),
5360        }
5361    }
5362
5363    #[test]
5364    fn test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5365        assert!(WINDOWS_1257
5366            .decode_without_bom_handling_and_without_replacement(b"abc\x80\xA1\xE4")
5367            .is_none());
5368    }
5369
5370    #[test]
5371    fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5372        match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc") {
5373            Some(cow) => match cow {
5374                Cow::Borrowed(s) => {
5375                    assert_eq!(s, "abc");
5376                }
5377                Cow::Owned(_) => unreachable!(),
5378            },
5379            None => unreachable!(),
5380        }
5381    }
5382
5383    #[test]
5384    fn test_encode_ascii_only_windows_1257_to_cow() {
5385        let (cow, encoding, had_errors) = WINDOWS_1257.encode("abc");
5386        match cow {
5387            Cow::Borrowed(s) => {
5388                assert_eq!(s, b"abc");
5389            }
5390            Cow::Owned(_) => unreachable!(),
5391        }
5392        assert_eq!(encoding, WINDOWS_1257);
5393        assert!(!had_errors);
5394    }
5395
5396    #[test]
5397    fn test_encode_valid_windows_1257_to_cow() {
5398        let (cow, encoding, had_errors) = WINDOWS_1257.encode("abc\u{20AC}\u{00E4}");
5399        match cow {
5400            Cow::Borrowed(_) => unreachable!(),
5401            Cow::Owned(s) => {
5402                assert_eq!(s, b"abc\x80\xE4");
5403            }
5404        }
5405        assert_eq!(encoding, WINDOWS_1257);
5406        assert!(!had_errors);
5407    }
5408
5409    #[test]
5410    fn test_utf16_space_with_one_bom_byte() {
5411        let mut decoder = UTF_16LE.new_decoder();
5412        let mut dst = [0u16; 12];
5413        {
5414            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5415            let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], false);
5416            assert_eq!(result, CoderResult::InputEmpty);
5417        }
5418        {
5419            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5420            let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5421            assert_eq!(result, CoderResult::InputEmpty);
5422        }
5423    }
5424
5425    #[test]
5426    fn test_utf8_space_with_one_bom_byte() {
5427        let mut decoder = UTF_8.new_decoder();
5428        let mut dst = [0u16; 12];
5429        {
5430            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5431            let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], false);
5432            assert_eq!(result, CoderResult::InputEmpty);
5433        }
5434        {
5435            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5436            let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5437            assert_eq!(result, CoderResult::InputEmpty);
5438        }
5439    }
5440
5441    #[test]
5442    fn test_utf16_space_with_two_bom_bytes() {
5443        let mut decoder = UTF_16LE.new_decoder();
5444        let mut dst = [0u16; 12];
5445        {
5446            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5447            let (result, _, _, _) = decoder.decode_to_utf16(b"\xEF", &mut dst[..needed], false);
5448            assert_eq!(result, CoderResult::InputEmpty);
5449        }
5450        {
5451            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5452            let (result, _, _, _) = decoder.decode_to_utf16(b"\xBB", &mut dst[..needed], false);
5453            assert_eq!(result, CoderResult::InputEmpty);
5454        }
5455        {
5456            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5457            let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5458            assert_eq!(result, CoderResult::InputEmpty);
5459        }
5460    }
5461
5462    #[test]
5463    fn test_utf8_space_with_two_bom_bytes() {
5464        let mut decoder = UTF_8.new_decoder();
5465        let mut dst = [0u16; 12];
5466        {
5467            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5468            let (result, _, _, _) = decoder.decode_to_utf16(b"\xEF", &mut dst[..needed], false);
5469            assert_eq!(result, CoderResult::InputEmpty);
5470        }
5471        {
5472            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5473            let (result, _, _, _) = decoder.decode_to_utf16(b"\xBB", &mut dst[..needed], false);
5474            assert_eq!(result, CoderResult::InputEmpty);
5475        }
5476        {
5477            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5478            let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5479            assert_eq!(result, CoderResult::InputEmpty);
5480        }
5481    }
5482
5483    #[test]
5484    fn test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call() {
5485        let mut decoder = UTF_16LE.new_decoder();
5486        let mut dst = [0u16; 12];
5487        {
5488            let needed = decoder.max_utf16_buffer_length(2).unwrap();
5489            let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF\xFF", &mut dst[..needed], true);
5490            assert_eq!(result, CoderResult::InputEmpty);
5491        }
5492    }
5493
5494    #[test]
5495    fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8() {
5496        let mut dst = [0u8; 8];
5497        let mut encoder = ISO_2022_JP.new_encoder();
5498        {
5499            let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], false);
5500            assert_eq!(result, CoderResult::InputEmpty);
5501        }
5502        {
5503            let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], true);
5504            assert_eq!(result, CoderResult::InputEmpty);
5505        }
5506    }
5507
5508    #[test]
5509    fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf8() {
5510        let mut dst = [0u8; 16];
5511        let mut encoder = ISO_2022_JP.new_encoder();
5512        {
5513            let (result, _, _, _) = encoder.encode_from_utf8("\u{A5}", &mut dst[..], false);
5514            assert_eq!(result, CoderResult::InputEmpty);
5515        }
5516        {
5517            let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], false);
5518            assert_eq!(result, CoderResult::InputEmpty);
5519        }
5520        {
5521            let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], true);
5522            assert_eq!(result, CoderResult::OutputFull);
5523        }
5524    }
5525
5526    #[test]
5527    fn test_buffer_end_iso_2022_jp_from_utf8() {
5528        let mut dst = [0u8; 18];
5529        {
5530            let mut encoder = ISO_2022_JP.new_encoder();
5531            let (result, _, _, _) =
5532                encoder.encode_from_utf8("\u{A5}\u{1F4A9}", &mut dst[..], false);
5533            assert_eq!(result, CoderResult::InputEmpty);
5534        }
5535        {
5536            let mut encoder = ISO_2022_JP.new_encoder();
5537            let (result, _, _, _) = encoder.encode_from_utf8("\u{A5}\u{1F4A9}", &mut dst[..], true);
5538            assert_eq!(result, CoderResult::OutputFull);
5539        }
5540        {
5541            let mut encoder = ISO_2022_JP.new_encoder();
5542            let (result, _, _, _) = encoder.encode_from_utf8("\u{1F4A9}", &mut dst[..13], false);
5543            assert_eq!(result, CoderResult::InputEmpty);
5544        }
5545        {
5546            let mut encoder = ISO_2022_JP.new_encoder();
5547            let (result, _, _, _) = encoder.encode_from_utf8("\u{1F4A9}", &mut dst[..13], true);
5548            assert_eq!(result, CoderResult::InputEmpty);
5549        }
5550    }
5551
5552    #[test]
5553    fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16() {
5554        let mut dst = [0u8; 8];
5555        let mut encoder = ISO_2022_JP.new_encoder();
5556        {
5557            let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..], false);
5558            assert_eq!(result, CoderResult::InputEmpty);
5559        }
5560        {
5561            let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..], true);
5562            assert_eq!(result, CoderResult::InputEmpty);
5563        }
5564    }
5565
5566    #[test]
5567    fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf16() {
5568        let mut dst = [0u8; 16];
5569        let mut encoder = ISO_2022_JP.new_encoder();
5570        {
5571            let (result, _, _, _) = encoder.encode_from_utf16(&[0xA5u16], &mut dst[..], false);
5572            assert_eq!(result, CoderResult::InputEmpty);
5573        }
5574        {
5575            let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..8], false);
5576            assert_eq!(result, CoderResult::InputEmpty);
5577        }
5578        {
5579            let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..8], true);
5580            assert_eq!(result, CoderResult::OutputFull);
5581        }
5582    }
5583
5584    #[test]
5585    fn test_buffer_end_iso_2022_jp_from_utf16() {
5586        let mut dst = [0u8; 18];
5587        {
5588            let mut encoder = ISO_2022_JP.new_encoder();
5589            let (result, _, _, _) =
5590                encoder.encode_from_utf16(&[0xA5u16, 0xD83Du16, 0xDCA9u16], &mut dst[..], false);
5591            assert_eq!(result, CoderResult::InputEmpty);
5592        }
5593        {
5594            let mut encoder = ISO_2022_JP.new_encoder();
5595            let (result, _, _, _) =
5596                encoder.encode_from_utf16(&[0xA5u16, 0xD83Du16, 0xDCA9u16], &mut dst[..], true);
5597            assert_eq!(result, CoderResult::OutputFull);
5598        }
5599        {
5600            let mut encoder = ISO_2022_JP.new_encoder();
5601            let (result, _, _, _) =
5602                encoder.encode_from_utf16(&[0xD83Du16, 0xDCA9u16], &mut dst[..13], false);
5603            assert_eq!(result, CoderResult::InputEmpty);
5604        }
5605        {
5606            let mut encoder = ISO_2022_JP.new_encoder();
5607            let (result, _, _, _) =
5608                encoder.encode_from_utf16(&[0xD83Du16, 0xDCA9u16], &mut dst[..13], true);
5609            assert_eq!(result, CoderResult::InputEmpty);
5610        }
5611    }
5612
5613    #[test]
5614    fn test_buffer_end_utf16be() {
5615        let mut decoder = UTF_16BE.new_decoder_without_bom_handling();
5616        let mut dest = [0u8; 4];
5617
5618        assert_eq!(
5619            decoder.decode_to_utf8(&[0xD8, 0x00], &mut dest, false),
5620            (CoderResult::InputEmpty, 2, 0, false)
5621        );
5622
5623        let _ = decoder.decode_to_utf8(&[0xD8, 0x00], &mut dest, true);
5624    }
5625
5626    #[test]
5627    fn test_hash() {
5628        let mut encodings = ::std::collections::HashSet::new();
5629        encodings.insert(UTF_8);
5630        encodings.insert(ISO_2022_JP);
5631        assert!(encodings.contains(UTF_8));
5632        assert!(encodings.contains(ISO_2022_JP));
5633        assert!(!encodings.contains(WINDOWS_1252));
5634        encodings.remove(ISO_2022_JP);
5635        assert!(!encodings.contains(ISO_2022_JP));
5636    }
5637
5638    #[test]
5639    fn test_iso_2022_jp_ncr_extra_from_utf16() {
5640        let mut dst = [0u8; 17];
5641        {
5642            let mut encoder = ISO_2022_JP.new_encoder();
5643            let (result, _, _, _) =
5644                encoder.encode_from_utf16(&[0x3041u16, 0xFFFFu16], &mut dst[..], true);
5645            assert_eq!(result, CoderResult::OutputFull);
5646        }
5647    }
5648
5649    #[test]
5650    fn test_iso_2022_jp_ncr_extra_from_utf8() {
5651        let mut dst = [0u8; 17];
5652        {
5653            let mut encoder = ISO_2022_JP.new_encoder();
5654            let (result, _, _, _) =
5655                encoder.encode_from_utf8("\u{3041}\u{FFFF}", &mut dst[..], true);
5656            assert_eq!(result, CoderResult::OutputFull);
5657        }
5658    }
5659
5660    #[test]
5661    fn test_max_length_with_bom_to_utf8() {
5662        let mut output = [0u8; 20];
5663        let mut decoder = REPLACEMENT.new_decoder();
5664        let input = b"\xEF\xBB\xBFA";
5665        {
5666            let needed = decoder
5667                .max_utf8_buffer_length_without_replacement(input.len())
5668                .unwrap();
5669            let (result, read, written) =
5670                decoder.decode_to_utf8_without_replacement(input, &mut output[..needed], true);
5671            assert_eq!(result, DecoderResult::InputEmpty);
5672            assert_eq!(read, input.len());
5673            assert_eq!(written, 1);
5674            assert_eq!(output[0], 0x41);
5675        }
5676    }
5677
5678    #[cfg(feature = "serde")]
5679    #[test]
5680    fn test_serde() {
5681        let demo = Demo {
5682            num: 42,
5683            name: "foo".into(),
5684            enc: UTF_8,
5685        };
5686
5687        let serialized = serde_json::to_string(&demo).unwrap();
5688
5689        let deserialized: Demo = serde_json::from_str(&serialized).unwrap();
5690        assert_eq!(deserialized, demo);
5691
5692        let bincoded = bincode::serialize(&demo).unwrap();
5693        let debincoded: Demo = bincode::deserialize(&bincoded[..]).unwrap();
5694        assert_eq!(debincoded, demo);
5695    }
5696
5697    #[test]
5698    fn test_is_single_byte() {
5699        assert!(!BIG5.is_single_byte());
5700        assert!(!EUC_JP.is_single_byte());
5701        assert!(!EUC_KR.is_single_byte());
5702        assert!(!GB18030.is_single_byte());
5703        assert!(!GBK.is_single_byte());
5704        assert!(!REPLACEMENT.is_single_byte());
5705        assert!(!SHIFT_JIS.is_single_byte());
5706        assert!(!UTF_8.is_single_byte());
5707        assert!(!UTF_16BE.is_single_byte());
5708        assert!(!UTF_16LE.is_single_byte());
5709        assert!(!ISO_2022_JP.is_single_byte());
5710
5711        assert!(IBM866.is_single_byte());
5712        assert!(ISO_8859_2.is_single_byte());
5713        assert!(ISO_8859_3.is_single_byte());
5714        assert!(ISO_8859_4.is_single_byte());
5715        assert!(ISO_8859_5.is_single_byte());
5716        assert!(ISO_8859_6.is_single_byte());
5717        assert!(ISO_8859_7.is_single_byte());
5718        assert!(ISO_8859_8.is_single_byte());
5719        assert!(ISO_8859_10.is_single_byte());
5720        assert!(ISO_8859_13.is_single_byte());
5721        assert!(ISO_8859_14.is_single_byte());
5722        assert!(ISO_8859_15.is_single_byte());
5723        assert!(ISO_8859_16.is_single_byte());
5724        assert!(ISO_8859_8_I.is_single_byte());
5725        assert!(KOI8_R.is_single_byte());
5726        assert!(KOI8_U.is_single_byte());
5727        assert!(MACINTOSH.is_single_byte());
5728        assert!(WINDOWS_874.is_single_byte());
5729        assert!(WINDOWS_1250.is_single_byte());
5730        assert!(WINDOWS_1251.is_single_byte());
5731        assert!(WINDOWS_1252.is_single_byte());
5732        assert!(WINDOWS_1253.is_single_byte());
5733        assert!(WINDOWS_1254.is_single_byte());
5734        assert!(WINDOWS_1255.is_single_byte());
5735        assert!(WINDOWS_1256.is_single_byte());
5736        assert!(WINDOWS_1257.is_single_byte());
5737        assert!(WINDOWS_1258.is_single_byte());
5738        assert!(X_MAC_CYRILLIC.is_single_byte());
5739        assert!(X_USER_DEFINED.is_single_byte());
5740    }
5741
5742    #[test]
5743    fn test_latin1_byte_compatible_up_to() {
5744        let buffer = b"a\x81\xB6\xF6\xF0\x82\xB4";
5745        assert_eq!(
5746            BIG5.new_decoder_without_bom_handling()
5747                .latin1_byte_compatible_up_to(buffer)
5748                .unwrap(),
5749            1
5750        );
5751        assert_eq!(
5752            EUC_JP
5753                .new_decoder_without_bom_handling()
5754                .latin1_byte_compatible_up_to(buffer)
5755                .unwrap(),
5756            1
5757        );
5758        assert_eq!(
5759            EUC_KR
5760                .new_decoder_without_bom_handling()
5761                .latin1_byte_compatible_up_to(buffer)
5762                .unwrap(),
5763            1
5764        );
5765        assert_eq!(
5766            GB18030
5767                .new_decoder_without_bom_handling()
5768                .latin1_byte_compatible_up_to(buffer)
5769                .unwrap(),
5770            1
5771        );
5772        assert_eq!(
5773            GBK.new_decoder_without_bom_handling()
5774                .latin1_byte_compatible_up_to(buffer)
5775                .unwrap(),
5776            1
5777        );
5778        assert!(REPLACEMENT
5779            .new_decoder_without_bom_handling()
5780            .latin1_byte_compatible_up_to(buffer)
5781            .is_none());
5782        assert_eq!(
5783            SHIFT_JIS
5784                .new_decoder_without_bom_handling()
5785                .latin1_byte_compatible_up_to(buffer)
5786                .unwrap(),
5787            1
5788        );
5789        assert_eq!(
5790            UTF_8
5791                .new_decoder_without_bom_handling()
5792                .latin1_byte_compatible_up_to(buffer)
5793                .unwrap(),
5794            1
5795        );
5796        assert!(UTF_16BE
5797            .new_decoder_without_bom_handling()
5798            .latin1_byte_compatible_up_to(buffer)
5799            .is_none());
5800        assert!(UTF_16LE
5801            .new_decoder_without_bom_handling()
5802            .latin1_byte_compatible_up_to(buffer)
5803            .is_none());
5804        assert_eq!(
5805            ISO_2022_JP
5806                .new_decoder_without_bom_handling()
5807                .latin1_byte_compatible_up_to(buffer)
5808                .unwrap(),
5809            1
5810        );
5811
5812        assert_eq!(
5813            IBM866
5814                .new_decoder_without_bom_handling()
5815                .latin1_byte_compatible_up_to(buffer)
5816                .unwrap(),
5817            1
5818        );
5819        assert_eq!(
5820            ISO_8859_2
5821                .new_decoder_without_bom_handling()
5822                .latin1_byte_compatible_up_to(buffer)
5823                .unwrap(),
5824            2
5825        );
5826        assert_eq!(
5827            ISO_8859_3
5828                .new_decoder_without_bom_handling()
5829                .latin1_byte_compatible_up_to(buffer)
5830                .unwrap(),
5831            2
5832        );
5833        assert_eq!(
5834            ISO_8859_4
5835                .new_decoder_without_bom_handling()
5836                .latin1_byte_compatible_up_to(buffer)
5837                .unwrap(),
5838            2
5839        );
5840        assert_eq!(
5841            ISO_8859_5
5842                .new_decoder_without_bom_handling()
5843                .latin1_byte_compatible_up_to(buffer)
5844                .unwrap(),
5845            2
5846        );
5847        assert_eq!(
5848            ISO_8859_6
5849                .new_decoder_without_bom_handling()
5850                .latin1_byte_compatible_up_to(buffer)
5851                .unwrap(),
5852            2
5853        );
5854        assert_eq!(
5855            ISO_8859_7
5856                .new_decoder_without_bom_handling()
5857                .latin1_byte_compatible_up_to(buffer)
5858                .unwrap(),
5859            2
5860        );
5861        assert_eq!(
5862            ISO_8859_8
5863                .new_decoder_without_bom_handling()
5864                .latin1_byte_compatible_up_to(buffer)
5865                .unwrap(),
5866            3
5867        );
5868        assert_eq!(
5869            ISO_8859_10
5870                .new_decoder_without_bom_handling()
5871                .latin1_byte_compatible_up_to(buffer)
5872                .unwrap(),
5873            2
5874        );
5875        assert_eq!(
5876            ISO_8859_13
5877                .new_decoder_without_bom_handling()
5878                .latin1_byte_compatible_up_to(buffer)
5879                .unwrap(),
5880            4
5881        );
5882        assert_eq!(
5883            ISO_8859_14
5884                .new_decoder_without_bom_handling()
5885                .latin1_byte_compatible_up_to(buffer)
5886                .unwrap(),
5887            4
5888        );
5889        assert_eq!(
5890            ISO_8859_15
5891                .new_decoder_without_bom_handling()
5892                .latin1_byte_compatible_up_to(buffer)
5893                .unwrap(),
5894            6
5895        );
5896        assert_eq!(
5897            ISO_8859_16
5898                .new_decoder_without_bom_handling()
5899                .latin1_byte_compatible_up_to(buffer)
5900                .unwrap(),
5901            4
5902        );
5903        assert_eq!(
5904            ISO_8859_8_I
5905                .new_decoder_without_bom_handling()
5906                .latin1_byte_compatible_up_to(buffer)
5907                .unwrap(),
5908            3
5909        );
5910        assert_eq!(
5911            KOI8_R
5912                .new_decoder_without_bom_handling()
5913                .latin1_byte_compatible_up_to(buffer)
5914                .unwrap(),
5915            1
5916        );
5917        assert_eq!(
5918            KOI8_U
5919                .new_decoder_without_bom_handling()
5920                .latin1_byte_compatible_up_to(buffer)
5921                .unwrap(),
5922            1
5923        );
5924        assert_eq!(
5925            MACINTOSH
5926                .new_decoder_without_bom_handling()
5927                .latin1_byte_compatible_up_to(buffer)
5928                .unwrap(),
5929            1
5930        );
5931        assert_eq!(
5932            WINDOWS_874
5933                .new_decoder_without_bom_handling()
5934                .latin1_byte_compatible_up_to(buffer)
5935                .unwrap(),
5936            2
5937        );
5938        assert_eq!(
5939            WINDOWS_1250
5940                .new_decoder_without_bom_handling()
5941                .latin1_byte_compatible_up_to(buffer)
5942                .unwrap(),
5943            4
5944        );
5945        assert_eq!(
5946            WINDOWS_1251
5947                .new_decoder_without_bom_handling()
5948                .latin1_byte_compatible_up_to(buffer)
5949                .unwrap(),
5950            1
5951        );
5952        assert_eq!(
5953            WINDOWS_1252
5954                .new_decoder_without_bom_handling()
5955                .latin1_byte_compatible_up_to(buffer)
5956                .unwrap(),
5957            5
5958        );
5959        assert_eq!(
5960            WINDOWS_1253
5961                .new_decoder_without_bom_handling()
5962                .latin1_byte_compatible_up_to(buffer)
5963                .unwrap(),
5964            3
5965        );
5966        assert_eq!(
5967            WINDOWS_1254
5968                .new_decoder_without_bom_handling()
5969                .latin1_byte_compatible_up_to(buffer)
5970                .unwrap(),
5971            4
5972        );
5973        assert_eq!(
5974            WINDOWS_1255
5975                .new_decoder_without_bom_handling()
5976                .latin1_byte_compatible_up_to(buffer)
5977                .unwrap(),
5978            3
5979        );
5980        assert_eq!(
5981            WINDOWS_1256
5982                .new_decoder_without_bom_handling()
5983                .latin1_byte_compatible_up_to(buffer)
5984                .unwrap(),
5985            1
5986        );
5987        assert_eq!(
5988            WINDOWS_1257
5989                .new_decoder_without_bom_handling()
5990                .latin1_byte_compatible_up_to(buffer)
5991                .unwrap(),
5992            4
5993        );
5994        assert_eq!(
5995            WINDOWS_1258
5996                .new_decoder_without_bom_handling()
5997                .latin1_byte_compatible_up_to(buffer)
5998                .unwrap(),
5999            4
6000        );
6001        assert_eq!(
6002            X_MAC_CYRILLIC
6003                .new_decoder_without_bom_handling()
6004                .latin1_byte_compatible_up_to(buffer)
6005                .unwrap(),
6006            1
6007        );
6008        assert_eq!(
6009            X_USER_DEFINED
6010                .new_decoder_without_bom_handling()
6011                .latin1_byte_compatible_up_to(buffer)
6012                .unwrap(),
6013            1
6014        );
6015
6016        assert!(UTF_8
6017            .new_decoder()
6018            .latin1_byte_compatible_up_to(buffer)
6019            .is_none());
6020
6021        let mut decoder = UTF_8.new_decoder();
6022        let mut output = [0u16; 4];
6023        let _ = decoder.decode_to_utf16(b"\xEF", &mut output, false);
6024        assert!(decoder.latin1_byte_compatible_up_to(buffer).is_none());
6025        let _ = decoder.decode_to_utf16(b"\xBB\xBF", &mut output, false);
6026        assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), Some(1));
6027        let _ = decoder.decode_to_utf16(b"\xEF", &mut output, false);
6028        assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), None);
6029    }
6030}
encoding_rs/lib.rs

encoding_rs/
lib.rs