encoding_rs/lib.rs
1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10#![cfg_attr(
11 feature = "cargo-clippy",
12 allow(doc_markdown, inline_always, new_ret_no_self)
13)]
14
15//! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
16//! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
17//! Gecko-oriented means that converting to and from UTF-16 is supported in
18//! addition to converting to and from UTF-8, that the performance and
19//! streamability goals are browser-oriented, and that FFI-friendliness is a
20//! goal.
21//!
22//! Additionally, the `mem` module provides functions that are useful for
23//! applications that need to be able to deal with legacy in-memory
24//! representations of Unicode.
25//!
26//! For expectation setting, please be sure to read the sections
27//! [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes),
28//! [_ISO-8859-1_](#iso-8859-1) and [_Web / Browser Focus_](#web--browser-focus) below.
29//!
30//! There is a [long-form write-up](https://hsivonen.fi/encoding_rs/) about the
31//! design and internals of the crate.
32//!
33//! # Availability
34//!
35//! The code is available under the
36//! [Apache license, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
37//! or the [MIT license](https://opensource.org/licenses/MIT), at your option.
38//! See the
39//! [`COPYRIGHT`](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT)
40//! file for details.
41//! The [repository is on GitHub](https://github.com/hsivonen/encoding_rs). The
42//! [crate is available on crates.io](https://crates.io/crates/encoding_rs).
43//!
44//! # Integration with `std::io`
45//!
46//! This crate doesn't implement traits from `std::io`. However, the case of
47//! wrapping a `std::io::Read` in a decoder that implements `std::io::Read` and
48//! presents the data from the wrapped `std::io::Read` as UTF-8 is addressed by
49//! the [`encoding_rs_io`](https://docs.rs/encoding_rs_io/) crate.
50//!
51//! # Examples
52//!
53//! Example programs:
54//!
55//! * [Rust](https://github.com/hsivonen/recode_rs)
56//! * [C](https://github.com/hsivonen/recode_c)
57//! * [C++](https://github.com/hsivonen/recode_cpp)
58//!
59//! Decode using the non-streaming API:
60//!
61//! ```
62//! use encoding_rs::*;
63//!
64//! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
65//! let bytes = b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h";
66//!
67//! let (cow, encoding_used, had_errors) = SHIFT_JIS.decode(bytes);
68//! assert_eq!(&cow[..], expectation);
69//! assert_eq!(encoding_used, SHIFT_JIS);
70//! assert!(!had_errors);
71//! ```
72//!
73//! Decode using the streaming API with minimal `unsafe`:
74//!
75//! ```
76//! use encoding_rs::*;
77//!
78//! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
79//!
80//! // Use an array of byte slices to demonstrate content arriving piece by
81//! // piece from the network.
82//! let bytes: [&'static [u8]; 4] = [b"\x83",
83//! b"n\x83\x8D\x81",
84//! b"[\x81E\x83\x8F\x81[\x83",
85//! b"\x8B\x83h"];
86//!
87//! // Very short output buffer to demonstrate the output buffer getting full.
88//! // Normally, you'd use something like `[0u8; 2048]`.
89//! let mut buffer_bytes = [0u8; 8];
90//! let mut buffer: &mut str = std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap();
91//!
92//! // How many bytes in the buffer currently hold significant data.
93//! let mut bytes_in_buffer = 0usize;
94//!
95//! // Collect the output to a string for demonstration purposes.
96//! let mut output = String::new();
97//!
98//! // The `Decoder`
99//! let mut decoder = SHIFT_JIS.new_decoder();
100//!
101//! // Track whether we see errors.
102//! let mut total_had_errors = false;
103//!
104//! // Decode using a fixed-size intermediate buffer (for demonstrating the
105//! // use of a fixed-size buffer; normally when the output of an incremental
106//! // decode goes to a `String` one would use `Decoder.decode_to_string()` to
107//! // avoid the intermediate buffer).
108//! for input in &bytes[..] {
109//! // The number of bytes already read from current `input` in total.
110//! let mut total_read_from_current_input = 0usize;
111//!
112//! loop {
113//! let (result, read, written, had_errors) =
114//! decoder.decode_to_str(&input[total_read_from_current_input..],
115//! &mut buffer[bytes_in_buffer..],
116//! false);
117//! total_read_from_current_input += read;
118//! bytes_in_buffer += written;
119//! total_had_errors |= had_errors;
120//! match result {
121//! CoderResult::InputEmpty => {
122//! // We have consumed the current input buffer. Break out of
123//! // the inner loop to get the next input buffer from the
124//! // outer loop.
125//! break;
126//! },
127//! CoderResult::OutputFull => {
128//! // Write the current buffer out and consider the buffer
129//! // empty.
130//! output.push_str(&buffer[..bytes_in_buffer]);
131//! bytes_in_buffer = 0usize;
132//! continue;
133//! }
134//! }
135//! }
136//! }
137//!
138//! // Process EOF
139//! loop {
140//! let (result, _, written, had_errors) =
141//! decoder.decode_to_str(b"",
142//! &mut buffer[bytes_in_buffer..],
143//! true);
144//! bytes_in_buffer += written;
145//! total_had_errors |= had_errors;
146//! // Write the current buffer out and consider the buffer empty.
147//! // Need to do this here for both `match` arms, because we exit the
148//! // loop on `CoderResult::InputEmpty`.
149//! output.push_str(&buffer[..bytes_in_buffer]);
150//! bytes_in_buffer = 0usize;
151//! match result {
152//! CoderResult::InputEmpty => {
153//! // Done!
154//! break;
155//! },
156//! CoderResult::OutputFull => {
157//! continue;
158//! }
159//! }
160//! }
161//!
162//! assert_eq!(&output[..], expectation);
163//! assert!(!total_had_errors);
164//! ```
165//!
166//! ## UTF-16LE, UTF-16BE and Unicode Encoding Schemes
167//!
168//! The Encoding Standard doesn't specify encoders for UTF-16LE and UTF-16BE,
169//! __so this crate does not provide encoders for those encodings__!
170//! Along with the replacement encoding, their _output encoding_ is UTF-8,
171//! so you get a UTF-8 encoder if you request an encoder for them.
172//!
173//! Additionally, the Encoding Standard factors BOM handling into wrapper
174//! algorithms so that BOM handling isn't part of the definition of the
175//! encodings themselves. The Unicode _encoding schemes_ in the Unicode
176//! Standard define BOM handling or lack thereof as part of the encoding
177//! scheme.
178//!
179//! When used with the `_without_bom_handling` entry points, the UTF-16LE
180//! and UTF-16BE _encodings_ match the same-named _encoding schemes_ from
181//! the Unicode Standard.
182//!
183//! When used with the `_with_bom_removal` entry points, the UTF-8
184//! _encoding_ matches the UTF-8 _encoding scheme_ from the Unicode
185//! Standard.
186//!
187//! This crate does not provide a mode that matches the UTF-16 _encoding
188//! scheme_ from the Unicode Standard. The UTF-16BE encoding used with
189//! the entry points without `_bom_` qualifiers is the closest match,
190//! but in that case, the UTF-8 BOM triggers UTF-8 decoding, which is
191//! not part of the behavior of the UTF-16 _encoding scheme_ per the
192//! Unicode Standard.
193//!
194//! The UTF-32 family of Unicode encoding schemes is not supported
195//! by this crate. The Encoding Standard doesn't define any UTF-32
196//! family encodings, since they aren't necessary for consuming Web
197//! content.
198//!
199//! ## ISO-8859-1
200//!
201//! ISO-8859-1 does not exist as a distinct encoding from windows-1252 in
202//! the Encoding Standard. Therefore, an encoding that maps the unsigned
203//! byte value to the same Unicode scalar value is not available via
204//! `Encoding` in this crate.
205//!
206//! However, the functions whose name starts with `convert` and contains
207//! `latin1` in the `mem` module support such conversions, which are known as
208//! [_isomorphic decode_](https://infra.spec.whatwg.org/#isomorphic-decode)
209//! and [_isomorphic encode_](https://infra.spec.whatwg.org/#isomorphic-encode)
210//! in the [Infra Standard](https://infra.spec.whatwg.org/).
211//!
212//! ## Web / Browser Focus
213//!
214//! Both in terms of scope and performance, the focus is on the Web. For scope,
215//! this means that encoding_rs implements the Encoding Standard fully and
216//! doesn't implement encodings that are not specified in the Encoding
217//! Standard. For performance, this means that decoding performance is
218//! important as well as performance for encoding into UTF-8 or encoding the
219//! Basic Latin range (ASCII) into legacy encodings. Non-Basic Latin needs to
220//! be encoded into legacy encodings in only two places in the Web platform: in
221//! the query part of URLs, in which case it's a matter of relatively rare
222//! error handling, and in form submission, in which case the user action and
223//! networking tend to hide the performance of the encoder.
224//!
225//! Deemphasizing performance of encoding non-Basic Latin text into legacy
226//! encodings enables smaller code size thanks to the encoder side using the
227//! decode-optimized data tables without having encode-optimized data tables at
228//! all. Even in decoders, smaller lookup table size is preferred over avoiding
229//! multiplication operations.
230//!
231//! Additionally, performance is a non-goal for the ASCII-incompatible
232//! ISO-2022-JP encoding, which is rarely used on the Web. Instead of
233//! performance, the decoder for ISO-2022-JP optimizes for ease/clarity
234//! of implementation.
235//!
236//! Despite the browser focus, the hope is that non-browser applications
237//! that wish to consume Web content or submit Web forms in a Web-compatible
238//! way will find encoding_rs useful. While encoding_rs does not try to match
239//! Windows behavior, many of the encodings are close enough to legacy
240//! encodings implemented by Windows that applications that need to consume
241//! data in legacy Windows encodings may find encoding_rs useful. The
242//! [codepage](https://crates.io/crates/codepage) crate maps from Windows
243//! code page identifiers onto encoding_rs `Encoding`s and vice versa.
244//!
245//! For decoding email, UTF-7 support is needed (unfortunately) in addition
246//! to the encodings defined in the Encoding Standard. The
247//! [charset](https://crates.io/crates/charset) crate wraps encoding_rs and adds
248//! UTF-7 decoding for email purposes.
249//!
250//! For single-byte DOS encodings beyond the ones supported by the Encoding
251//! Standard, there is the [`oem_cp`](https://crates.io/crates/oem_cp) crate.
252//!
253//! # Preparing Text for the Encoders
254//!
255//! Normalizing text into Unicode Normalization Form C prior to encoding text
256//! into a legacy encoding minimizes unmappable characters. Text can be
257//! normalized to Unicode Normalization Form C using the
258//! [`unic-normal`](https://crates.io/crates/unic-normal) crate.
259//!
260//! The exception is windows-1258, which after normalizing to Unicode
261//! Normalization Form C requires tone marks to be decomposed in order to
262//! minimize unmappable characters. Vietnamese tone marks can be decomposed
263//! using the [`detone`](https://crates.io/crates/detone) crate.
264//!
265//! # Streaming & Non-Streaming; Rust & C/C++
266//!
267//! The API in Rust has two modes of operation: streaming and non-streaming.
268//! The streaming API is the foundation of the implementation and should be
269//! used when processing data that arrives piecemeal from an i/o stream. The
270//! streaming API has an FFI wrapper (as a [separate crate][1]) that exposes it
271//! to C callers. The non-streaming part of the API is for Rust callers only and
272//! is smart about borrowing instead of copying when possible. When
273//! streamability is not needed, the non-streaming API should be preferred in
274//! order to avoid copying data when a borrow suffices.
275//!
276//! There is no analogous C API exposed via FFI, mainly because C doesn't have
277//! standard types for growable byte buffers and Unicode strings that know
278//! their length.
279//!
280//! The C API (header file generated at `target/include/encoding_rs.h` when
281//! building encoding_rs) can, in turn, be wrapped for use from C++. Such a
282//! C++ wrapper can re-create the non-streaming API in C++ for C++ callers.
283//! The C binding comes with a [C++14 wrapper][2] that uses standard library +
284//! [GSL][3] types and that recreates the non-streaming API in C++ on top of
285//! the streaming API. A C++ wrapper with XPCOM/MFBT types is being developed
286//! as part of Mozilla [bug 1261841][4].
287//!
288//! The `Encoding` type is common to both the streaming and non-streaming
289//! modes. In the streaming mode, decoding operations are performed with a
290//! `Decoder` and encoding operations with an `Encoder` object obtained via
291//! `Encoding`. In the non-streaming mode, decoding and encoding operations are
292//! performed using methods on `Encoding` objects themselves, so the `Decoder`
293//! and `Encoder` objects are not used at all.
294//!
295//! [1]: https://github.com/hsivonen/encoding_c
296//! [2]: https://github.com/hsivonen/encoding_c/blob/master/include/encoding_rs_cpp.h
297//! [3]: https://github.com/Microsoft/GSL/
298//! [4]: https://bugzilla.mozilla.org/show_bug.cgi?id=encoding_rs
299//!
300//! # Memory management
301//!
302//! The streaming mode never performs heap allocations (even the methods
303//! that write into a `Vec<u8>` or a `String` by taking them as arguments do
304//! not reallocate the backing buffer of the `Vec<u8>` or the `String`). That
305//! is, the streaming mode uses caller-allocated buffers exclusively.
306//!
307//! The methods of the non-streaming mode that return a `Vec<u8>` or a `String`
308//! perform heap allocations but only to allocate the backing buffer of the
309//! `Vec<u8>` or the `String`.
310//!
311//! `Encoding` is always statically allocated. `Decoder` and `Encoder` need no
312//! `Drop` cleanup.
313//!
314//! # Buffer reading and writing behavior
315//!
316//! Based on experience gained with the `java.nio.charset` encoding converter
317//! API and with the Gecko uconv encoding converter API, the buffer reading
318//! and writing behaviors of encoding_rs are asymmetric: input buffers are
319//! fully drained but output buffers are not always fully filled.
320//!
321//! When reading from an input buffer, encoding_rs always consumes all input
322//! up to the next error or to the end of the buffer. In particular, when
323//! decoding, even if the input buffer ends in the middle of a byte sequence
324//! for a character, the decoder consumes all input. This has the benefit that
325//! the caller of the API can always fill the next buffer from the start from
326//! whatever source the bytes come from and never has to first copy the last
327//! bytes of the previous buffer to the start of the next buffer. However, when
328//! encoding, the UTF-8 input buffers have to end at a character boundary, which
329//! is a requirement for the Rust `str` type anyway, and UTF-16 input buffer
330//! boundaries falling in the middle of a surrogate pair result in both
331//! surrogates being treated individually as unpaired surrogates.
332//!
333//! Additionally, decoders guarantee that they can be fed even one byte at a
334//! time and encoders guarantee that they can be fed even one code point at a
335//! time. This has the benefit of not placing restrictions on the size of
336//! chunks in which the content arrives, e.g. from the network.
337//!
338//! When writing into an output buffer, encoding_rs makes sure that the code
339//! unit sequence for a character is never split across output buffer
340//! boundaries. This may result in wasted space at the end of an output buffer,
341//! but the advantages are that the output side of both decoders and encoders
342//! is greatly simplified compared to designs that attempt to fill output
343//! buffers exactly even when that entails splitting a code unit sequence and
344//! when encoding_rs methods return to the caller, the output produced thus
345//! far is always valid taken as a whole. (In the case of encoding to ISO-2022-JP,
346//! the output needs to be considered as a whole, because the latest output
347//! buffer taken alone might not be valid taken alone if the transition away
348//! from the ASCII state occurred in an earlier output buffer. However, since
349//! the ISO-2022-JP decoder doesn't treat streams that don't end in the ASCII
350//! state as being in error despite the encoder generating a transition to the
351//! ASCII state at the end, the claim about the partial output taken as a whole
352//! being valid is true even for ISO-2022-JP.)
353//!
354//! # Error Reporting
355//!
356//! Based on experience gained with the `java.nio.charset` encoding converter
357//! API and with the Gecko uconv encoding converter API, the error reporting
358//! behaviors of encoding_rs are asymmetric: decoder errors include offsets
359//! that leave it up to the caller to extract the erroneous bytes from the
360//! input stream if the caller wishes to do so but encoder errors provide the
361//! code point associated with the error without requiring the caller to
362//! extract it from the input on its own.
363//!
364//! On the encoder side, an error is always triggered by the most recently
365//! pushed Unicode scalar, which makes it simple to pass the `char` to the
366//! caller. Also, it's very typical for the caller to wish to do something with
367//! this data: generate a numeric escape for the character. Additionally, the
368//! ISO-2022-JP encoder reports U+FFFD instead of the actual input character in
369//! certain cases, so requiring the caller to extract the character from the
370//! input buffer would require the caller to handle ISO-2022-JP details.
371//! Furthermore, requiring the caller to extract the character from the input
372//! buffer would require the caller to implement UTF-8 or UTF-16 math, which is
373//! the job of an encoding conversion library.
374//!
375//! On the decoder side, errors are triggered in more complex ways. For
376//! example, when decoding the sequence ESC, '$', _buffer boundary_, 'A' as
377//! ISO-2022-JP, the ESC byte is in error, but this is discovered only after
378//! the buffer boundary when processing 'A'. Thus, the bytes in error might not
379//! be the ones most recently pushed to the decoder and the error might not even
380//! be in the current buffer.
381//!
382//! Some encoding conversion APIs address the problem by not acknowledging
383//! trailing bytes of an input buffer as consumed if it's still possible for
384//! future bytes to cause the trailing bytes to be in error. This way, error
385//! reporting can always refer to the most recently pushed buffer. This has the
386//! problem that the caller of the API has to copy the unconsumed trailing
387//! bytes to the start of the next buffer before being able to fill the rest
388//! of the next buffer. This is annoying, error-prone and inefficient.
389//!
390//! A possible solution would be making the decoder remember recently consumed
391//! bytes in order to be able to include a copy of the erroneous bytes when
392//! reporting an error. This has two problems: First, callers are rarely
393//! interested in the erroneous bytes, so attempts to identify them are most
394//! often just overhead anyway. Second, the rare applications that are
395//! interested typically care about the location of the error in the input
396//! stream.
397//!
398//! To keep the API convenient for common uses and the overhead low while making
399//! it possible to develop applications, such as HTML validators, that care
400//! about which bytes were in error, encoding_rs reports the length of the
401//! erroneous sequence and the number of bytes consumed after the erroneous
402//! sequence. As long as the caller doesn't discard the 6 most recent bytes,
403//! this makes it possible for callers that care about the erroneous bytes to
404//! locate them.
405//!
406//! # No Convenience API for Custom Replacements
407//!
408//! The Web Platform and, therefore, the Encoding Standard supports only one
409//! error recovery mode for decoders and only one error recovery mode for
410//! encoders. The supported error recovery mode for decoders is emitting the
411//! REPLACEMENT CHARACTER on error. The supported error recovery mode for
412//! encoders is emitting an HTML decimal numeric character reference for
413//! unmappable characters.
414//!
415//! Since encoding_rs is Web-focused, these are the only error recovery modes
416//! for which convenient support is provided. Moreover, on the decoder side,
417//! there aren't really good alternatives for emitting the REPLACEMENT CHARACTER
418//! on error (other than treating errors as fatal). In particular, simply
419//! ignoring errors is a
420//! [security problem](http://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences),
421//! so it would be a bad idea for encoding_rs to provide a mode that encouraged
422//! callers to ignore errors.
423//!
424//! On the encoder side, there are plausible alternatives for HTML decimal
425//! numeric character references. For example, when outputting CSS, CSS-style
426//! escapes would seem to make sense. However, instead of facilitating the
427//! output of CSS, JS, etc. in non-UTF-8 encodings, encoding_rs takes the design
428//! position that you shouldn't generate output in encodings other than UTF-8,
429//! except where backward compatibility with interacting with the legacy Web
430//! requires it. The legacy Web requires it only when parsing the query strings
431//! of URLs and when submitting forms, and those two both use HTML decimal
432//! numeric character references.
433//!
434//! While encoding_rs doesn't make encoder replacements other than HTML decimal
435//! numeric character references easy, it does make them _possible_.
436//! `encode_from_utf8()`, which emits HTML decimal numeric character references
437//! for unmappable characters, is implemented on top of
438//! `encode_from_utf8_without_replacement()`. Applications that really, really
439//! want other replacement schemes for unmappable characters can likewise
440//! implement them on top of `encode_from_utf8_without_replacement()`.
441//!
442//! # No Extensibility by Design
443//!
444//! The set of encodings supported by encoding_rs is not extensible by design.
445//! That is, `Encoding`, `Decoder` and `Encoder` are intentionally `struct`s
446//! rather than `trait`s. encoding_rs takes the design position that all future
447//! text interchange should be done using UTF-8, which can represent all of
448//! Unicode. (It is, in fact, the only encoding supported by the Encoding
449//! Standard and encoding_rs that can represent all of Unicode and that has
450//! encoder support. UTF-16LE and UTF-16BE don't have encoder support, and
451//! gb18030 cannot encode U+E5E5.) The other encodings are supported merely for
452//! legacy compatibility and not due to non-UTF-8 encodings having benefits
453//! other than being able to consume legacy content.
454//!
455//! Considering that UTF-8 can represent all of Unicode and is already supported
456//! by all Web browsers, introducing a new encoding wouldn't add to the
457//! expressiveness but would add to compatibility problems. In that sense,
458//! adding new encodings to the Web Platform doesn't make sense, and, in fact,
459//! post-UTF-8 attempts at encodings, such as BOCU-1, have been rejected from
460//! the Web Platform. On the other hand, the set of legacy encodings that must
461//! be supported for a Web browser to be able to be successful is not going to
462//! expand. Empirically, the set of encodings specified in the Encoding Standard
463//! is already sufficient and the set of legacy encodings won't grow
464//! retroactively.
465//!
466//! Since extensibility doesn't make sense considering the Web focus of
467//! encoding_rs and adding encodings to Web clients would be actively harmful,
468//! it makes sense to make the set of encodings that encoding_rs supports
469//! non-extensible and to take the (admittedly small) benefits arising from
470//! that, such as the size of `Decoder` and `Encoder` objects being known ahead
471//! of time, which enables stack allocation thereof.
472//!
473//! This does have downsides for applications that might want to put encoding_rs
474//! to non-Web uses if those non-Web uses involve legacy encodings that aren't
475//! needed for Web uses. The needs of such applications should not complicate
476//! encoding_rs itself, though. It is up to those applications to provide a
477//! framework that delegates the operations with encodings that encoding_rs
478//! supports to encoding_rs and operations with other encodings to something
479//! else (as opposed to encoding_rs itself providing an extensibility
480//! framework).
481//!
482//! # Panics
483//!
484//! Methods in encoding_rs can panic if the API is used against the requirements
485//! stated in the documentation, if a state that's supposed to be impossible
486//! is reached due to an internal bug or on integer overflow. When used
487//! according to documentation with buffer sizes that stay below integer
488//! overflow, in the absence of internal bugs, encoding_rs does not panic.
489//!
490//! Panics arising from API misuse aren't documented beyond this on individual
491//! methods.
492//!
493//! # At-Risk Parts of the API
494//!
495//! The foreseeable source of partially backward-incompatible API change is the
496//! way the instances of `Encoding` are made available.
497//!
498//! If Rust changes to allow the entries of `[&'static Encoding; N]` to be
499//! initialized with `static`s of type `&'static Encoding`, the non-reference
500//! `FOO_INIT` public `Encoding` instances will be removed from the public API.
501//!
502//! If Rust changes to make the referent of `pub const FOO: &'static Encoding`
503//! unique when the constant is used in different crates, the reference-typed
504//! `static`s for the encoding instances will be changed from `static` to
505//! `const` and the non-reference-typed `_INIT` instances will be removed.
506//!
507//! # Mapping Spec Concepts onto the API
508//!
509//! <table>
510//! <thead>
511//! <tr><th>Spec Concept</th><th>Streaming</th><th>Non-Streaming</th></tr>
512//! </thead>
513//! <tbody>
514//! <tr><td><a href="https://encoding.spec.whatwg.org/#encoding">encoding</a></td><td><code>&'static Encoding</code></td><td><code>&'static Encoding</code></td></tr>
515//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8">UTF-8 encoding</a></td><td><code>UTF_8</code></td><td><code>UTF_8</code></td></tr>
516//! <tr><td><a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an encoding</a></td><td><code>Encoding::for_label(<var>label</var>)</code></td><td><code>Encoding::for_label(<var>label</var>)</code></td></tr>
517//! <tr><td><a href="https://encoding.spec.whatwg.org/#name">name</a></td><td><code><var>encoding</var>.name()</code></td><td><code><var>encoding</var>.name()</code></td></tr>
518//! <tr><td><a href="https://encoding.spec.whatwg.org/#get-an-output-encoding">get an output encoding</a></td><td><code><var>encoding</var>.output_encoding()</code></td><td><code><var>encoding</var>.output_encoding()</code></td></tr>
519//! <tr><td><a href="https://encoding.spec.whatwg.org/#decode">decode</a></td><td><code>let d = <var>encoding</var>.new_decoder();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.decode(<var>src</var>)</code></td></tr>
520//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode">UTF-8 decode</a></td><td><code>let d = UTF_8.new_decoder_with_bom_removal();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_with_bom_removal(<var>src</var>)</code></td></tr>
521//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom">UTF-8 decode without BOM</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_without_bom_handling(<var>src</var>)</code></td></tr>
522//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail">UTF-8 decode without BOM or fail</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, false);<br>// … (fail if malformed)<br>let last_res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, true);<br>// (fail if malformed)</code></td><td><code>UTF_8.decode_without_bom_handling_and_without_replacement(<var>src</var>)</code></td></tr>
523//! <tr><td><a href="https://encoding.spec.whatwg.org/#encode">encode</a></td><td><code>let e = <var>encoding</var>.new_encoder();<br>let res = e.encode_from_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = e.encode_from_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.encode(<var>src</var>)</code></td></tr>
524//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-encode">UTF-8 encode</a></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// …</code></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>src</var>.as_bytes()</code></td></tr>
525//! </tbody>
526//! </table>
527//!
528//! # Compatibility with the rust-encoding API
529//!
530//! The crate
531//! [encoding_rs_compat](https://github.com/hsivonen/encoding_rs_compat/)
532//! is a drop-in replacement for rust-encoding 0.2.32 that implements (most of)
533//! the API of rust-encoding 0.2.32 on top of encoding_rs.
534//!
535//! # Mapping rust-encoding concepts to encoding_rs concepts
536//!
537//! The following table provides a mapping from rust-encoding constructs to
538//! encoding_rs ones.
539//!
540//! <table>
541//! <thead>
542//! <tr><th>rust-encoding</th><th>encoding_rs</th></tr>
543//! </thead>
544//! <tbody>
545//! <tr><td><code>encoding::EncodingRef</code></td><td><code>&'static encoding_rs::Encoding</code></td></tr>
546//! <tr><td><code>encoding::all::<var>WINDOWS_31J</var></code> (not based on the WHATWG name for some encodings)</td><td><code>encoding_rs::<var>SHIFT_JIS</var></code> (always the WHATWG name uppercased and hyphens replaced with underscores)</td></tr>
547//! <tr><td><code>encoding::all::ERROR</code></td><td>Not available because not in the Encoding Standard</td></tr>
548//! <tr><td><code>encoding::all::ASCII</code></td><td>Not available because not in the Encoding Standard</td></tr>
549//! <tr><td><code>encoding::all::ISO_8859_1</code></td><td>Not available because not in the Encoding Standard</td></tr>
550//! <tr><td><code>encoding::all::HZ</code></td><td>Not available because not in the Encoding Standard</td></tr>
551//! <tr><td><code>encoding::label::encoding_from_whatwg_label(<var>string</var>)</code></td><td><code>encoding_rs::Encoding::for_label(<var>string</var>)</code></td></tr>
552//! <tr><td><code><var>enc</var>.whatwg_name()</code> (always lower case)</td><td><code><var>enc</var>.name()</code> (potentially mixed case)</td></tr>
553//! <tr><td><code><var>enc</var>.name()</code></td><td>Not available because not in the Encoding Standard</td></tr>
554//! <tr><td><code>encoding::decode(<var>bytes</var>, encoding::DecoderTrap::Replace, <var>enc</var>)</code></td><td><code><var>enc</var>.decode(<var>bytes</var>)</code></td></tr>
555//! <tr><td><code><var>enc</var>.decode(<var>bytes</var>, encoding::DecoderTrap::Replace)</code></td><td><code><var>enc</var>.decode_without_bom_handling(<var>bytes</var>)</code></td></tr>
556//! <tr><td><code><var>enc</var>.encode(<var>string</var>, encoding::EncoderTrap::NcrEscape)</code></td><td><code><var>enc</var>.encode(<var>string</var>)</code></td></tr>
557//! <tr><td><code><var>enc</var>.raw_decoder()</code></td><td><code><var>enc</var>.new_decoder_without_bom_handling()</code></td></tr>
558//! <tr><td><code><var>enc</var>.raw_encoder()</code></td><td><code><var>enc</var>.new_encoder()</code></td></tr>
559//! <tr><td><code>encoding::RawDecoder</code></td><td><code>encoding_rs::Decoder</code></td></tr>
560//! <tr><td><code>encoding::RawEncoder</code></td><td><code>encoding_rs::Encoder</code></td></tr>
561//! <tr><td><code><var>raw_decoder</var>.raw_feed(<var>src</var>, <var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(<var>src</var>.len()));<br><var>decoder</var>.decode_to_string_without_replacement(<var>src</var>, <var>dst_string</var>, false)</code></td></tr>
562//! <tr><td><code><var>raw_encoder</var>.raw_feed(<var>src</var>, <var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(<var>src</var>.len()));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement(<var>src</var>, <var>dst_vec</var>, false)</code></td></tr>
563//! <tr><td><code><var>raw_decoder</var>.raw_finish(<var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(0));<br><var>decoder</var>.decode_to_string_without_replacement(b"", <var>dst_string</var>, true)</code></td></tr>
564//! <tr><td><code><var>raw_encoder</var>.raw_finish(<var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(0));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement("", <var>dst_vec</var>, true)</code></td></tr>
565//! <tr><td><code>encoding::DecoderTrap::Strict</code></td><td><code>decode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Malformed</code> result as fatal).</td></tr>
566//! <tr><td><code>encoding::DecoderTrap::Replace</code></td><td><code>decode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
567//! <tr><td><code>encoding::DecoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
568//! <tr><td><code>encoding::DecoderTrap::Call(DecoderTrapFunc)</code></td><td>Can be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
569//! <tr><td><code>encoding::EncoderTrap::Strict</code></td><td><code>encode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Unmappable</code> result as fatal).</td></tr>
570//! <tr><td><code>encoding::EncoderTrap::Replace</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
571//! <tr><td><code>encoding::EncoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
572//! <tr><td><code>encoding::EncoderTrap::NcrEscape</code></td><td><code>encode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
573//! <tr><td><code>encoding::EncoderTrap::Call(EncoderTrapFunc)</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
574//! </tbody>
575//! </table>
576//!
577//! # Relationship with Windows Code Pages
578//!
579//! Despite the Web and browser focus, the encodings defined by the Encoding
580//! Standard and implemented by this crate may be useful for decoding legacy
581//! data that uses Windows code pages. The following table names the
582//! encodings, both single-byte and multibyte,
583//! that have a closely related Windows code page, the number of the closest
584//! code page, a column indicating whether Windows maps unassigned code points
585//! to the Unicode Private Use Area instead of U+FFFD and a remark number
586//! indicating remarks in the list after the table.
587//!
588//! <table>
589//! <thead>
590//! <tr><th>Encoding</th><th>Code Page</th><th>PUA</th><th>Remarks</th></tr>
591//! </thead>
592//! <tbody>
593//! <tr><td>Shift_JIS</td><td>932</td><td></td><td></td></tr>
594//! <tr><td>GBK</td><td>936</td><td></td><td></td></tr>
595//! <tr><td>EUC-KR</td><td>949</td><td></td><td></td></tr>
596//! <tr><td>Big5</td><td>950</td><td></td><td></td></tr>
597//! <tr><td>IBM866</td><td>866</td><td></td><td></td></tr>
598//! <tr><td>windows-874</td><td>874</td><td>•</td><td></td></tr>
599//! <tr><td>UTF-16LE</td><td>1200</td><td></td><td></td></tr>
600//! <tr><td>UTF-16BE</td><td>1201</td><td></td><td></td></tr>
601//! <tr><td>windows-1250</td><td>1250</td><td></td><td></td></tr>
602//! <tr><td>windows-1251</td><td>1251</td><td></td><td></td></tr>
603//! <tr><td>windows-1252</td><td>1252</td><td></td><td></td></tr>
604//! <tr><td>windows-1253</td><td>1253</td><td>•</td><td></td></tr>
605//! <tr><td>windows-1254</td><td>1254</td><td></td><td></td></tr>
606//! <tr><td>windows-1255</td><td>1255</td><td>•</td><td></td></tr>
607//! <tr><td>windows-1256</td><td>1256</td><td></td><td></td></tr>
608//! <tr><td>windows-1257</td><td>1257</td><td>•</td><td></td></tr>
609//! <tr><td>windows-1258</td><td>1258</td><td></td><td></td></tr>
610//! <tr><td>macintosh</td><td>10000</td><td></td><td>1</td></tr>
611//! <tr><td>x-mac-cyrillic</td><td>10017</td><td></td><td>2</td></tr>
612//! <tr><td>KOI8-R</td><td>20866</td><td></td><td></td></tr>
613//! <tr><td>EUC-JP</td><td>20932</td><td></td><td></td></tr>
614//! <tr><td>KOI8-U</td><td>21866</td><td></td><td></td></tr>
615//! <tr><td>ISO-8859-2</td><td>28592</td><td></td><td></td></tr>
616//! <tr><td>ISO-8859-3</td><td>28593</td><td></td><td></td></tr>
617//! <tr><td>ISO-8859-4</td><td>28594</td><td></td><td></td></tr>
618//! <tr><td>ISO-8859-5</td><td>28595</td><td></td><td></td></tr>
619//! <tr><td>ISO-8859-6</td><td>28596</td><td>•</td><td></td></tr>
620//! <tr><td>ISO-8859-7</td><td>28597</td><td>•</td><td>3</td></tr>
621//! <tr><td>ISO-8859-8</td><td>28598</td><td>•</td><td>4</td></tr>
622//! <tr><td>ISO-8859-13</td><td>28603</td><td>•</td><td></td></tr>
623//! <tr><td>ISO-8859-15</td><td>28605</td><td></td><td></td></tr>
624//! <tr><td>ISO-8859-8-I</td><td>38598</td><td></td><td>5</td></tr>
625//! <tr><td>ISO-2022-JP</td><td>50220</td><td></td><td></td></tr>
626//! <tr><td>gb18030</td><td>54936</td><td></td><td></td></tr>
627//! <tr><td>UTF-8</td><td>65001</td><td></td><td></td></tr>
628//! </tbody>
629//! </table>
630//!
631//! 1. Windows decodes 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
632//! 2. Windows decodes 0xFF to U+00A4 CURRENCY SIGN instead of U+20AC EURO SIGN.
633//! 3. Windows decodes the currency signs at 0xA4 and 0xA5 as well as 0xAA,
634//! which should be U+037A GREEK YPOGEGRAMMENI, to PUA code points. Windows
635//! decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA instead of U+2018
636//! LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER LETTER APOSTROPHE
637//! instead of U+2019 RIGHT SINGLE QUOTATION MARK.
638//! 4. Windows decodes 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to PUA instead
639//! of LRM and RLM.
640//! 5. Remarks from the previous item apply.
641//!
642//! The differences between this crate and Windows in the case of multibyte encodings
643//! are not yet fully documented here. The lack of remarks above should not be taken
644//! as an indication of a lack of differences.
645//!
646//! # Notable Differences from IANA Naming
647//!
648//! In some cases, the Encoding Standard specifies the popular unextended encoding
649//! name where in IANA terms one of the other labels would be more precise considering
650//! the extensions that the Encoding Standard has unified into the encoding.
651//!
652//! <table>
653//! <thead>
654//! <tr><th>Encoding</th><th>IANA</th></tr>
655//! </thead>
656//! <tbody>
657//! <tr><td>Big5</td><td>Big5-HKSCS</td></tr>
658//! <tr><td>EUC-KR</td><td>windows-949</td></tr>
659//! <tr><td>Shift_JIS</td><td>windows-31j</td></tr>
660//! <tr><td>x-mac-cyrillic</td><td>x-mac-ukrainian</td></tr>
661//! </tbody>
662//! </table>
663//!
664//! In other cases where the Encoding Standard unifies unextended and extended
665//! variants of an encoding, the encoding gets the name of the extended
666//! variant.
667//!
668//! <table>
669//! <thead>
670//! <tr><th>IANA</th><th>Unified into Encoding</th></tr>
671//! </thead>
672//! <tbody>
673//! <tr><td>ISO-8859-1</td><td>windows-1252</td></tr>
674//! <tr><td>ISO-8859-9</td><td>windows-1254</td></tr>
675//! <tr><td>TIS-620</td><td>windows-874</td></tr>
676//! </tbody>
677//! </table>
678//!
679//! See the section [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes)
680//! for discussion about the UTF-16 family.
681
682#![cfg_attr(feature = "simd-accel", feature(stdsimd, core_intrinsics))]
683
684#[macro_use]
685extern crate cfg_if;
686
687#[cfg(all(
688 feature = "simd-accel",
689 any(
690 target_feature = "sse2",
691 all(target_endian = "little", target_arch = "aarch64"),
692 all(target_endian = "little", target_feature = "neon")
693 )
694))]
695#[macro_use(shuffle)]
696extern crate packed_simd;
697
698#[cfg(feature = "serde")]
699extern crate serde;
700
701#[cfg(all(test, feature = "serde"))]
702extern crate bincode;
703#[cfg(all(test, feature = "serde"))]
704#[macro_use]
705extern crate serde_derive;
706#[cfg(all(test, feature = "serde"))]
707extern crate serde_json;
708
709#[macro_use]
710mod macros;
711
712#[cfg(all(
713 feature = "simd-accel",
714 any(
715 target_feature = "sse2",
716 all(target_endian = "little", target_arch = "aarch64"),
717 all(target_endian = "little", target_feature = "neon")
718 )
719))]
720mod simd_funcs;
721
722#[cfg(test)]
723mod testing;
724
725mod big5;
726mod euc_jp;
727mod euc_kr;
728mod gb18030;
729mod iso_2022_jp;
730mod replacement;
731mod shift_jis;
732mod single_byte;
733mod utf_16;
734mod utf_8;
735mod x_user_defined;
736
737mod ascii;
738mod data;
739mod handles;
740mod variant;
741
742pub mod mem;
743
744use crate::ascii::ascii_valid_up_to;
745use crate::ascii::iso_2022_jp_ascii_valid_up_to;
746use crate::utf_8::utf8_valid_up_to;
747use crate::variant::*;
748
749use std::borrow::Cow;
750use std::cmp::Ordering;
751use std::hash::Hash;
752use std::hash::Hasher;
753
754#[cfg(feature = "serde")]
755use serde::de::Visitor;
756#[cfg(feature = "serde")]
757use serde::{Deserialize, Deserializer, Serialize, Serializer};
758
759/// This has to be the max length of a numeric character reference (NCR) instead of max
760/// minus one, because we can't rely on getting the minus
761/// one from the space reserved for the current unmappable,
762/// because the ISO-2022-JP encoder can fill up that space
763/// with a state transition escape.
764const NCR_EXTRA: usize = 10; // &#1114111; (the longest decimal NCR, 10 bytes)
765
766// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
767// Instead, please regenerate using generate-encoding-data.py
768
769const LONGEST_LABEL_LENGTH: usize = 19; // length of "cseucpkdfmtjapanese"
770
771/// The initializer for the [Big5](static.BIG5.html) encoding.
772///
773/// For use only for taking the address of this form when
774/// Rust prohibits the use of the non-`_INIT` form directly,
775/// such as in initializers of other `static`s. If in doubt,
776/// use the corresponding non-`_INIT` reference-typed `static`.
777///
778/// This part of the public API will go away if Rust changes
779/// to make the referent of `pub const FOO: &'static Encoding`
780/// unique cross-crate or if Rust starts allowing static arrays
781/// to be initialized with `pub static FOO: &'static Encoding`
782/// items.
783pub static BIG5_INIT: Encoding = Encoding {
784 name: "Big5",
785 variant: VariantEncoding::Big5,
786};
787
788/// The Big5 encoding.
789///
790/// This is Big5 with HKSCS with mappings to more recent Unicode assignments
791/// instead of the Private Use Area code points that have been used historically.
792/// It is believed to be able to decode existing Web content in a way that makes
793/// sense.
794///
795/// To avoid form submissions generating data that Web servers don't understand,
796/// the encoder doesn't use the HKSCS byte sequences that precede the unextended
797/// Big5 in the lexical order.
798///
799/// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
800/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
801///
802/// This encoding is designed to be suited for decoding the Windows code page 950
803/// and its HKSCS patched "951" variant such that the text makes sense, given
804/// assignments that Unicode has made after those encodings used Private Use
805/// Area characters.
806///
807/// This will change from `static` to `const` if Rust changes
808/// to make the referent of `pub const FOO: &'static Encoding`
809/// unique cross-crate, so don't take the address of this
810/// `static`.
811pub static BIG5: &'static Encoding = &BIG5_INIT;
812
813/// The initializer for the [EUC-JP](static.EUC_JP.html) encoding.
814///
815/// For use only for taking the address of this form when
816/// Rust prohibits the use of the non-`_INIT` form directly,
817/// such as in initializers of other `static`s. If in doubt,
818/// use the corresponding non-`_INIT` reference-typed `static`.
819///
820/// This part of the public API will go away if Rust changes
821/// to make the referent of `pub const FOO: &'static Encoding`
822/// unique cross-crate or if Rust starts allowing static arrays
823/// to be initialized with `pub static FOO: &'static Encoding`
824/// items.
825pub static EUC_JP_INIT: Encoding = Encoding {
826 name: "EUC-JP",
827 variant: VariantEncoding::EucJp,
828};
829
830/// The EUC-JP encoding.
831///
832/// This is the legacy Unix encoding for Japanese.
833///
834/// For compatibility with Web servers that don't expect three-byte sequences
835/// in form submissions, the encoder doesn't generate three-byte sequences.
836/// That is, the JIS X 0212 support is decode-only.
837///
838/// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
839/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
840///
841/// This encoding roughly matches the Windows code page 20932. There are error
842/// handling differences and a handful of 2-byte sequences that decode differently.
843/// Additionally, Windows doesn't support 3-byte sequences.
844///
845/// This will change from `static` to `const` if Rust changes
846/// to make the referent of `pub const FOO: &'static Encoding`
847/// unique cross-crate, so don't take the address of this
848/// `static`.
849pub static EUC_JP: &'static Encoding = &EUC_JP_INIT;
850
851/// The initializer for the [EUC-KR](static.EUC_KR.html) encoding.
852///
853/// For use only for taking the address of this form when
854/// Rust prohibits the use of the non-`_INIT` form directly,
855/// such as in initializers of other `static`s. If in doubt,
856/// use the corresponding non-`_INIT` reference-typed `static`.
857///
858/// This part of the public API will go away if Rust changes
859/// to make the referent of `pub const FOO: &'static Encoding`
860/// unique cross-crate or if Rust starts allowing static arrays
861/// to be initialized with `pub static FOO: &'static Encoding`
862/// items.
863pub static EUC_KR_INIT: Encoding = Encoding {
864 name: "EUC-KR",
865 variant: VariantEncoding::EucKr,
866};
867
868/// The EUC-KR encoding.
869///
870/// This is the Korean encoding for Windows. It extends the Unix legacy encoding
871/// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
872/// Classic), with all the characters from the Hangul Syllables block of Unicode.
873///
874/// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
875/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
876///
877/// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
878/// to U+0080 and some byte sequences that are errors per the Encoding Standard to
879/// the question mark or the Private Use Area.
880///
881/// This will change from `static` to `const` if Rust changes
882/// to make the referent of `pub const FOO: &'static Encoding`
883/// unique cross-crate, so don't take the address of this
884/// `static`.
885pub static EUC_KR: &'static Encoding = &EUC_KR_INIT;
886
887/// The initializer for the [GBK](static.GBK.html) encoding.
888///
889/// For use only for taking the address of this form when
890/// Rust prohibits the use of the non-`_INIT` form directly,
891/// such as in initializers of other `static`s. If in doubt,
892/// use the corresponding non-`_INIT` reference-typed `static`.
893///
894/// This part of the public API will go away if Rust changes
895/// to make the referent of `pub const FOO: &'static Encoding`
896/// unique cross-crate or if Rust starts allowing static arrays
897/// to be initialized with `pub static FOO: &'static Encoding`
898/// items.
899pub static GBK_INIT: Encoding = Encoding {
900 name: "GBK",
901 variant: VariantEncoding::Gbk,
902};
903
904/// The GBK encoding.
905///
906/// The decoder for this encoding is the same as the decoder for gb18030.
907/// The encoder side of this encoding is GBK with Windows code page 936 euro
908/// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs
909/// Unicode block as well as a handful of ideographs from the CJK Unified
910/// Ideographs Extension A and CJK Compatibility Ideographs blocks.
911///
912/// Unlike e.g. in the case of ISO-8859-1 and windows-1252, the GBK encoder wasn't
913/// unified with the gb18030 encoder in the Encoding Standard out of concern
914/// that servers that expect GBK form submissions might not be able to handle
915/// the four-byte sequences.
916///
917/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
918/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
919///
920/// The encoder of this encoding roughly matches the Windows code page 936.
921/// The decoder side is a superset.
922///
923/// This will change from `static` to `const` if Rust changes
924/// to make the referent of `pub const FOO: &'static Encoding`
925/// unique cross-crate, so don't take the address of this
926/// `static`.
927pub static GBK: &'static Encoding = &GBK_INIT;
928
929/// The initializer for the [IBM866](static.IBM866.html) encoding.
930///
931/// For use only for taking the address of this form when
932/// Rust prohibits the use of the non-`_INIT` form directly,
933/// such as in initializers of other `static`s. If in doubt,
934/// use the corresponding non-`_INIT` reference-typed `static`.
935///
936/// This part of the public API will go away if Rust changes
937/// to make the referent of `pub const FOO: &'static Encoding`
938/// unique cross-crate or if Rust starts allowing static arrays
939/// to be initialized with `pub static FOO: &'static Encoding`
940/// items.
941pub static IBM866_INIT: Encoding = Encoding {
942 name: "IBM866",
943 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.ibm866, 0x0440, 96, 16),
944};
945
946/// The IBM866 encoding.
947///
948/// This is the most notable one of the DOS Cyrillic code pages. It has the same
949/// box drawing characters as code page 437, so it can be used for decoding
950/// DOS-era ASCII + box drawing data.
951///
952/// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
953/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
954///
955/// This encoding matches the Windows code page 866.
956///
957/// This will change from `static` to `const` if Rust changes
958/// to make the referent of `pub const FOO: &'static Encoding`
959/// unique cross-crate, so don't take the address of this
960/// `static`.
961pub static IBM866: &'static Encoding = &IBM866_INIT;
962
963/// The initializer for the [ISO-2022-JP](static.ISO_2022_JP.html) encoding.
964///
965/// For use only for taking the address of this form when
966/// Rust prohibits the use of the non-`_INIT` form directly,
967/// such as in initializers of other `static`s. If in doubt,
968/// use the corresponding non-`_INIT` reference-typed `static`.
969///
970/// This part of the public API will go away if Rust changes
971/// to make the referent of `pub const FOO: &'static Encoding`
972/// unique cross-crate or if Rust starts allowing static arrays
973/// to be initialized with `pub static FOO: &'static Encoding`
974/// items.
975pub static ISO_2022_JP_INIT: Encoding = Encoding {
976 name: "ISO-2022-JP",
977 variant: VariantEncoding::Iso2022Jp,
978};
979
980/// The ISO-2022-JP encoding.
981///
982/// This is the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
983/// byte range to encode non-Basic Latin characters. It's the only encoding
984/// supported by this crate whose encoder is stateful.
985///
986/// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
987/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
988///
989/// This encoding roughly matches the Windows code page 50220. Notably, Windows
990/// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
991/// error handling.
992///
993/// This will change from `static` to `const` if Rust changes
994/// to make the referent of `pub const FOO: &'static Encoding`
995/// unique cross-crate, so don't take the address of this
996/// `static`.
997pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT;
998
999/// The initializer for the [ISO-8859-10](static.ISO_8859_10.html) encoding.
1000///
1001/// For use only for taking the address of this form when
1002/// Rust prohibits the use of the non-`_INIT` form directly,
1003/// such as in initializers of other `static`s. If in doubt,
1004/// use the corresponding non-`_INIT` reference-typed `static`.
1005///
1006/// This part of the public API will go away if Rust changes
1007/// to make the referent of `pub const FOO: &'static Encoding`
1008/// unique cross-crate or if Rust starts allowing static arrays
1009/// to be initialized with `pub static FOO: &'static Encoding`
1010/// items.
1011pub static ISO_8859_10_INIT: Encoding = Encoding {
1012 name: "ISO-8859-10",
1013 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_10, 0x00DA, 90, 6),
1014};
1015
1016/// The ISO-8859-10 encoding.
1017///
1018/// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
1019/// is also known as Latin 6.
1020///
1021/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
1022/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
1023///
1024/// The Windows code page number for this encoding is 28600, but kernel32.dll
1025/// does not support this encoding.
1026///
1027/// This will change from `static` to `const` if Rust changes
1028/// to make the referent of `pub const FOO: &'static Encoding`
1029/// unique cross-crate, so don't take the address of this
1030/// `static`.
1031pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT;
1032
1033/// The initializer for the [ISO-8859-13](static.ISO_8859_13.html) encoding.
1034///
1035/// For use only for taking the address of this form when
1036/// Rust prohibits the use of the non-`_INIT` form directly,
1037/// such as in initializers of other `static`s. If in doubt,
1038/// use the corresponding non-`_INIT` reference-typed `static`.
1039///
1040/// This part of the public API will go away if Rust changes
1041/// to make the referent of `pub const FOO: &'static Encoding`
1042/// unique cross-crate or if Rust starts allowing static arrays
1043/// to be initialized with `pub static FOO: &'static Encoding`
1044/// items.
1045pub static ISO_8859_13_INIT: Encoding = Encoding {
1046 name: "ISO-8859-13",
1047 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_13, 0x00DF, 95, 1),
1048};
1049
1050/// The ISO-8859-13 encoding.
1051///
1052/// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
1053/// is also known as Latin 7.
1054///
1055/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
1056/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
1057///
1058/// This encoding matches the Windows code page 28603, except Windows decodes
1059/// unassigned code points to the Private Use Area of Unicode.
1060///
1061/// This will change from `static` to `const` if Rust changes
1062/// to make the referent of `pub const FOO: &'static Encoding`
1063/// unique cross-crate, so don't take the address of this
1064/// `static`.
1065pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT;
1066
1067/// The initializer for the [ISO-8859-14](static.ISO_8859_14.html) encoding.
1068///
1069/// For use only for taking the address of this form when
1070/// Rust prohibits the use of the non-`_INIT` form directly,
1071/// such as in initializers of other `static`s. If in doubt,
1072/// use the corresponding non-`_INIT` reference-typed `static`.
1073///
1074/// This part of the public API will go away if Rust changes
1075/// to make the referent of `pub const FOO: &'static Encoding`
1076/// unique cross-crate or if Rust starts allowing static arrays
1077/// to be initialized with `pub static FOO: &'static Encoding`
1078/// items.
1079pub static ISO_8859_14_INIT: Encoding = Encoding {
1080 name: "ISO-8859-14",
1081 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_14, 0x00DF, 95, 17),
1082};
1083
1084/// The ISO-8859-14 encoding.
1085///
1086/// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
1087/// is also known as Latin 8.
1088///
1089/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
1090/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
1091///
1092/// The Windows code page number for this encoding is 28604, but kernel32.dll
1093/// does not support this encoding.
1094///
1095/// This will change from `static` to `const` if Rust changes
1096/// to make the referent of `pub const FOO: &'static Encoding`
1097/// unique cross-crate, so don't take the address of this
1098/// `static`.
1099pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT;
1100
1101/// The initializer for the [ISO-8859-15](static.ISO_8859_15.html) encoding.
1102///
1103/// For use only for taking the address of this form when
1104/// Rust prohibits the use of the non-`_INIT` form directly,
1105/// such as in initializers of other `static`s. If in doubt,
1106/// use the corresponding non-`_INIT` reference-typed `static`.
1107///
1108/// This part of the public API will go away if Rust changes
1109/// to make the referent of `pub const FOO: &'static Encoding`
1110/// unique cross-crate or if Rust starts allowing static arrays
1111/// to be initialized with `pub static FOO: &'static Encoding`
1112/// items.
1113pub static ISO_8859_15_INIT: Encoding = Encoding {
1114 name: "ISO-8859-15",
1115 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_15, 0x00BF, 63, 65),
1116};
1117
1118/// The ISO-8859-15 encoding.
1119///
1120/// This is the revised Western European part of the ISO/IEC 8859 encoding
1121/// family. This encoding is also known as Latin 9.
1122///
1123/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
1124/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
1125///
1126/// This encoding matches the Windows code page 28605.
1127///
1128/// This will change from `static` to `const` if Rust changes
1129/// to make the referent of `pub const FOO: &'static Encoding`
1130/// unique cross-crate, so don't take the address of this
1131/// `static`.
1132pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT;
1133
1134/// The initializer for the [ISO-8859-16](static.ISO_8859_16.html) encoding.
1135///
1136/// For use only for taking the address of this form when
1137/// Rust prohibits the use of the non-`_INIT` form directly,
1138/// such as in initializers of other `static`s. If in doubt,
1139/// use the corresponding non-`_INIT` reference-typed `static`.
1140///
1141/// This part of the public API will go away if Rust changes
1142/// to make the referent of `pub const FOO: &'static Encoding`
1143/// unique cross-crate or if Rust starts allowing static arrays
1144/// to be initialized with `pub static FOO: &'static Encoding`
1145/// items.
1146pub static ISO_8859_16_INIT: Encoding = Encoding {
1147 name: "ISO-8859-16",
1148 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_16, 0x00DF, 95, 4),
1149};
1150
1151/// The ISO-8859-16 encoding.
1152///
1153/// This is the South-Eastern European part of the ISO/IEC 8859 encoding
1154/// family. This encoding is also known as Latin 10.
1155///
1156/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
1157/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
1158///
1159/// The Windows code page number for this encoding is 28606, but kernel32.dll
1160/// does not support this encoding.
1161///
1162/// This will change from `static` to `const` if Rust changes
1163/// to make the referent of `pub const FOO: &'static Encoding`
1164/// unique cross-crate, so don't take the address of this
1165/// `static`.
1166pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT;
1167
1168/// The initializer for the [ISO-8859-2](static.ISO_8859_2.html) encoding.
1169///
1170/// For use only for taking the address of this form when
1171/// Rust prohibits the use of the non-`_INIT` form directly,
1172/// such as in initializers of other `static`s. If in doubt,
1173/// use the corresponding non-`_INIT` reference-typed `static`.
1174///
1175/// This part of the public API will go away if Rust changes
1176/// to make the referent of `pub const FOO: &'static Encoding`
1177/// unique cross-crate or if Rust starts allowing static arrays
1178/// to be initialized with `pub static FOO: &'static Encoding`
1179/// items.
1180pub static ISO_8859_2_INIT: Encoding = Encoding {
1181 name: "ISO-8859-2",
 // NOTE(review): the three numeric arguments presumably describe one contiguous run in the
 // table (first code point, table index, run length); confirm against `VariantEncoding`.
1182 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_2, 0x00DF, 95, 1),
1183};
1184
1185/// The ISO-8859-2 encoding.
1186///
1187/// This is the Central European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 2.
1188///
1189/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
1190/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
1191///
1192/// This encoding matches the Windows code page 28592.
1193///
1194/// This will change from `static` to `const` if Rust changes
1195/// to make the referent of `pub const FOO: &'static Encoding`
1196/// unique cross-crate, so don't take the address of this
1197/// `static`.
1198pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT;
1199
1200/// The initializer for the [ISO-8859-3](static.ISO_8859_3.html) encoding.
1201///
1202/// For use only for taking the address of this form when
1203/// Rust prohibits the use of the non-`_INIT` form directly,
1204/// such as in initializers of other `static`s. If in doubt,
1205/// use the corresponding non-`_INIT` reference-typed `static`.
1206///
1207/// This part of the public API will go away if Rust changes
1208/// to make the referent of `pub const FOO: &'static Encoding`
1209/// unique cross-crate or if Rust starts allowing static arrays
1210/// to be initialized with `pub static FOO: &'static Encoding`
1211/// items.
1212pub static ISO_8859_3_INIT: Encoding = Encoding {
1213 name: "ISO-8859-3",
 // NOTE(review): the three numeric arguments presumably describe one contiguous run in the
 // table (first code point, table index, run length); confirm against `VariantEncoding`.
1214 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_3, 0x00DF, 95, 4),
1215};
1216
1217/// The ISO-8859-3 encoding.
1218///
1219/// This is the South European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 3.
1220///
1221/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
1222/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
1223///
1224/// This encoding matches the Windows code page 28593.
1225///
1226/// This will change from `static` to `const` if Rust changes
1227/// to make the referent of `pub const FOO: &'static Encoding`
1228/// unique cross-crate, so don't take the address of this
1229/// `static`.
1230pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT;
1231
1232/// The initializer for the [ISO-8859-4](static.ISO_8859_4.html) encoding.
1233///
1234/// For use only for taking the address of this form when
1235/// Rust prohibits the use of the non-`_INIT` form directly,
1236/// such as in initializers of other `static`s. If in doubt,
1237/// use the corresponding non-`_INIT` reference-typed `static`.
1238///
1239/// This part of the public API will go away if Rust changes
1240/// to make the referent of `pub const FOO: &'static Encoding`
1241/// unique cross-crate or if Rust starts allowing static arrays
1242/// to be initialized with `pub static FOO: &'static Encoding`
1243/// items.
1244pub static ISO_8859_4_INIT: Encoding = Encoding {
1245 name: "ISO-8859-4",
 // NOTE(review): the three numeric arguments presumably describe one contiguous run in the
 // table (first code point, table index, run length); confirm against `VariantEncoding`.
1246 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_4, 0x00DF, 95, 1),
1247};
1248
1249/// The ISO-8859-4 encoding.
1250///
1251/// This is the North European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 4.
1252///
1253/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
1254/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
1255///
1256/// This encoding matches the Windows code page 28594.
1257///
1258/// This will change from `static` to `const` if Rust changes
1259/// to make the referent of `pub const FOO: &'static Encoding`
1260/// unique cross-crate, so don't take the address of this
1261/// `static`.
1262pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT;
1263
1264/// The initializer for the [ISO-8859-5](static.ISO_8859_5.html) encoding.
1265///
1266/// For use only for taking the address of this form when
1267/// Rust prohibits the use of the non-`_INIT` form directly,
1268/// such as in initializers of other `static`s. If in doubt,
1269/// use the corresponding non-`_INIT` reference-typed `static`.
1270///
1271/// This part of the public API will go away if Rust changes
1272/// to make the referent of `pub const FOO: &'static Encoding`
1273/// unique cross-crate or if Rust starts allowing static arrays
1274/// to be initialized with `pub static FOO: &'static Encoding`
1275/// items.
1276pub static ISO_8859_5_INIT: Encoding = Encoding {
1277 name: "ISO-8859-5",
 // NOTE(review): the three numeric arguments presumably describe one contiguous run in the
 // table (first code point, table index, run length); confirm against `VariantEncoding`.
1278 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_5, 0x040E, 46, 66),
1279};
1280
1281/// The ISO-8859-5 encoding.
1282///
1283/// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
1284///
1285/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
1286/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
1287///
1288/// This encoding matches the Windows code page 28595.
1289///
1290/// This will change from `static` to `const` if Rust changes
1291/// to make the referent of `pub const FOO: &'static Encoding`
1292/// unique cross-crate, so don't take the address of this
1293/// `static`.
1294pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT;
1295
1296/// The initializer for the [ISO-8859-6](static.ISO_8859_6.html) encoding.
1297///
1298/// For use only for taking the address of this form when
1299/// Rust prohibits the use of the non-`_INIT` form directly,
1300/// such as in initializers of other `static`s. If in doubt,
1301/// use the corresponding non-`_INIT` reference-typed `static`.
1302///
1303/// This part of the public API will go away if Rust changes
1304/// to make the referent of `pub const FOO: &'static Encoding`
1305/// unique cross-crate or if Rust starts allowing static arrays
1306/// to be initialized with `pub static FOO: &'static Encoding`
1307/// items.
1308pub static ISO_8859_6_INIT: Encoding = Encoding {
1309 name: "ISO-8859-6",
 // NOTE(review): the three numeric arguments presumably describe one contiguous run in the
 // table (first code point, table index, run length); confirm against `VariantEncoding`.
1310 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_6, 0x0621, 65, 26),
1311};
1312
1313/// The ISO-8859-6 encoding.
1314///
1315/// This is the Arabic part of the ISO/IEC 8859 encoding family.
1316///
1317/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
1318/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
1319///
1320/// This encoding matches the Windows code page 28596, except Windows decodes
1321/// unassigned code points to the Private Use Area of Unicode.
1322///
1323/// This will change from `static` to `const` if Rust changes
1324/// to make the referent of `pub const FOO: &'static Encoding`
1325/// unique cross-crate, so don't take the address of this
1326/// `static`.
1327pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT;
1328
1329/// The initializer for the [ISO-8859-7](static.ISO_8859_7.html) encoding.
1330///
1331/// For use only for taking the address of this form when
1332/// Rust prohibits the use of the non-`_INIT` form directly,
1333/// such as in initializers of other `static`s. If in doubt,
1334/// use the corresponding non-`_INIT` reference-typed `static`.
1335///
1336/// This part of the public API will go away if Rust changes
1337/// to make the referent of `pub const FOO: &'static Encoding`
1338/// unique cross-crate or if Rust starts allowing static arrays
1339/// to be initialized with `pub static FOO: &'static Encoding`
1340/// items.
1341pub static ISO_8859_7_INIT: Encoding = Encoding {
1342 name: "ISO-8859-7",
 // NOTE(review): the three numeric arguments presumably describe one contiguous run in the
 // table (first code point, table index, run length); confirm against `VariantEncoding`.
1343 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_7, 0x03A3, 83, 44),
1344};
1345
1346/// The ISO-8859-7 encoding.
1347///
1348/// This is the Greek part of the ISO/IEC 8859 encoding family.
1349///
1350/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
1351/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
1352///
1353/// This encoding roughly matches the Windows code page 28597. Windows decodes
1354/// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
1355/// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
1356/// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
1357/// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
1358/// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.
1359///
1360/// This will change from `static` to `const` if Rust changes
1361/// to make the referent of `pub const FOO: &'static Encoding`
1362/// unique cross-crate, so don't take the address of this
1363/// `static`.
1364pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT;
1365
1366/// The initializer for the [ISO-8859-8](static.ISO_8859_8.html) encoding.
1367///
1368/// For use only for taking the address of this form when
1369/// Rust prohibits the use of the non-`_INIT` form directly,
1370/// such as in initializers of other `static`s. If in doubt,
1371/// use the corresponding non-`_INIT` reference-typed `static`.
1372///
1373/// This part of the public API will go away if Rust changes
1374/// to make the referent of `pub const FOO: &'static Encoding`
1375/// unique cross-crate or if Rust starts allowing static arrays
1376/// to be initialized with `pub static FOO: &'static Encoding`
1377/// items.
1378pub static ISO_8859_8_INIT: Encoding = Encoding {
1379 name: "ISO-8859-8",
 // NOTE(review): the three numeric arguments presumably describe one contiguous run in the
 // table (first code point, table index, run length); confirm against `VariantEncoding`.
1380 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
1381};
1382
1383/// The ISO-8859-8 encoding.
1384///
1385/// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
1386///
1387/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1388/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1389///
1390/// This encoding roughly matches the Windows code page 28598. Windows decodes
1391/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
1392/// Area instead of LRM and RLM. Windows decodes unassigned code points to
1393/// the Private Use Area.
1394///
1395/// This will change from `static` to `const` if Rust changes
1396/// to make the referent of `pub const FOO: &'static Encoding`
1397/// unique cross-crate, so don't take the address of this
1398/// `static`.
1399pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT;
1400
1401/// The initializer for the [ISO-8859-8-I](static.ISO_8859_8_I.html) encoding.
1402///
1403/// For use only for taking the address of this form when
1404/// Rust prohibits the use of the non-`_INIT` form directly,
1405/// such as in initializers of other `static`s. If in doubt,
1406/// use the corresponding non-`_INIT` reference-typed `static`.
1407///
1408/// This part of the public API will go away if Rust changes
1409/// to make the referent of `pub const FOO: &'static Encoding`
1410/// unique cross-crate or if Rust starts allowing static arrays
1411/// to be initialized with `pub static FOO: &'static Encoding`
1412/// items.
1413pub static ISO_8859_8_I_INIT: Encoding = Encoding {
1414 name: "ISO-8859-8-I",
 // Reuses the `iso_8859_8` table and the same numeric arguments as `ISO_8859_8_INIT`;
 // only `name` differs between the two initializers.
1415 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
1416};
1417
1418/// The ISO-8859-8-I encoding.
1419///
1420/// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
1421///
1422/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1423/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1424///
1425/// This encoding roughly matches the Windows code page 38598. Windows decodes
1426/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
1427/// Area instead of LRM and RLM. Windows decodes unassigned code points to
1428/// the Private Use Area.
1429///
1430/// This will change from `static` to `const` if Rust changes
1431/// to make the referent of `pub const FOO: &'static Encoding`
1432/// unique cross-crate, so don't take the address of this
1433/// `static`.
1434pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT;
1435
1436/// The initializer for the [KOI8-R](static.KOI8_R.html) encoding.
1437///
1438/// For use only for taking the address of this form when
1439/// Rust prohibits the use of the non-`_INIT` form directly,
1440/// such as in initializers of other `static`s. If in doubt,
1441/// use the corresponding non-`_INIT` reference-typed `static`.
1442///
1443/// This part of the public API will go away if Rust changes
1444/// to make the referent of `pub const FOO: &'static Encoding`
1445/// unique cross-crate or if Rust starts allowing static arrays
1446/// to be initialized with `pub static FOO: &'static Encoding`
1447/// items.
1448pub static KOI8_R_INIT: Encoding = Encoding {
1449 name: "KOI8-R",
 // NOTE(review): the three numeric arguments presumably describe one contiguous run in the
 // table (first code point, table index, run length); confirm against `VariantEncoding`.
1450 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_r, 0x044E, 64, 1),
1451};
1452
1453/// The KOI8-R encoding.
1454///
1455/// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
1456///
1457/// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
1458/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
1459///
1460/// This encoding matches the Windows code page 20866.
1461///
1462/// This will change from `static` to `const` if Rust changes
1463/// to make the referent of `pub const FOO: &'static Encoding`
1464/// unique cross-crate, so don't take the address of this
1465/// `static`.
1466pub static KOI8_R: &'static Encoding = &KOI8_R_INIT;
1467
1468/// The initializer for the [KOI8-U](static.KOI8_U.html) encoding.
1469///
1470/// For use only for taking the address of this form when
1471/// Rust prohibits the use of the non-`_INIT` form directly,
1472/// such as in initializers of other `static`s. If in doubt,
1473/// use the corresponding non-`_INIT` reference-typed `static`.
1474///
1475/// This part of the public API will go away if Rust changes
1476/// to make the referent of `pub const FOO: &'static Encoding`
1477/// unique cross-crate or if Rust starts allowing static arrays
1478/// to be initialized with `pub static FOO: &'static Encoding`
1479/// items.
1480pub static KOI8_U_INIT: Encoding = Encoding {
1481 name: "KOI8-U",
 // NOTE(review): the three numeric arguments presumably describe one contiguous run in the
 // table (first code point, table index, run length); confirm against `VariantEncoding`.
1482 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_u, 0x044E, 64, 1),
1483};
1484
1485/// The KOI8-U encoding.
1486///
1487/// This is an encoding for Ukrainian adapted from KOI8-R.
1488///
1489/// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
1490/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
1491///
1492/// This encoding matches the Windows code page 21866.
1493///
1494/// This will change from `static` to `const` if Rust changes
1495/// to make the referent of `pub const FOO: &'static Encoding`
1496/// unique cross-crate, so don't take the address of this
1497/// `static`.
1498pub static KOI8_U: &'static Encoding = &KOI8_U_INIT;
1499
1500/// The initializer for the [Shift_JIS](static.SHIFT_JIS.html) encoding.
1501///
1502/// For use only for taking the address of this form when
1503/// Rust prohibits the use of the non-`_INIT` form directly,
1504/// such as in initializers of other `static`s. If in doubt,
1505/// use the corresponding non-`_INIT` reference-typed `static`.
1506///
1507/// This part of the public API will go away if Rust changes
1508/// to make the referent of `pub const FOO: &'static Encoding`
1509/// unique cross-crate or if Rust starts allowing static arrays
1510/// to be initialized with `pub static FOO: &'static Encoding`
1511/// items.
1512pub static SHIFT_JIS_INIT: Encoding = Encoding {
1513 name: "Shift_JIS",
 // Unit variant: no lookup table or numeric parameters are attached to this encoding here.
1514 variant: VariantEncoding::ShiftJis,
1515};
1516
1517/// The Shift_JIS encoding.
1518///
1519/// This is the Japanese encoding for Windows.
1520///
1521/// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
1522/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
1523///
1524/// This encoding matches the Windows code page 932, except Windows decodes some byte
1525/// sequences that are errors per the Encoding Standard to the question mark or the
1526/// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER.
1527///
1528/// This will change from `static` to `const` if Rust changes
1529/// to make the referent of `pub const FOO: &'static Encoding`
1530/// unique cross-crate, so don't take the address of this
1531/// `static`.
1532pub static SHIFT_JIS: &'static Encoding = &SHIFT_JIS_INIT;
1533
1534/// The initializer for the [UTF-16BE](static.UTF_16BE.html) encoding.
1535///
1536/// For use only for taking the address of this form when
1537/// Rust prohibits the use of the non-`_INIT` form directly,
1538/// such as in initializers of other `static`s. If in doubt,
1539/// use the corresponding non-`_INIT` reference-typed `static`.
1540///
1541/// This part of the public API will go away if Rust changes
1542/// to make the referent of `pub const FOO: &'static Encoding`
1543/// unique cross-crate or if Rust starts allowing static arrays
1544/// to be initialized with `pub static FOO: &'static Encoding`
1545/// items.
1546pub static UTF_16BE_INIT: Encoding = Encoding {
1547 name: "UTF-16BE",
 // Unit variant: no lookup table or numeric parameters are attached to this encoding here.
1548 variant: VariantEncoding::Utf16Be,
1549};
1550
1551/// The UTF-16BE encoding.
1552///
1553/// This decode-only encoding uses 16-bit code units due to Unicode originally
1554/// having been designed as a 16-bit repertoire. In the absence of a byte order
1555/// mark, the big-endian byte order is assumed.
1556///
1557/// There is no corresponding encoder in this crate or in the Encoding
1558/// Standard. The output encoding of this encoding is UTF-8.
1559///
1560/// This encoding matches the Windows code page 1201.
1561///
1562/// This will change from `static` to `const` if Rust changes
1563/// to make the referent of `pub const FOO: &'static Encoding`
1564/// unique cross-crate, so don't take the address of this
1565/// `static`.
1566pub static UTF_16BE: &'static Encoding = &UTF_16BE_INIT;
1567
1568/// The initializer for the [UTF-16LE](static.UTF_16LE.html) encoding.
1569///
1570/// For use only for taking the address of this form when
1571/// Rust prohibits the use of the non-`_INIT` form directly,
1572/// such as in initializers of other `static`s. If in doubt,
1573/// use the corresponding non-`_INIT` reference-typed `static`.
1574///
1575/// This part of the public API will go away if Rust changes
1576/// to make the referent of `pub const FOO: &'static Encoding`
1577/// unique cross-crate or if Rust starts allowing static arrays
1578/// to be initialized with `pub static FOO: &'static Encoding`
1579/// items.
1580pub static UTF_16LE_INIT: Encoding = Encoding {
1581 name: "UTF-16LE",
 // Unit variant: no lookup table or numeric parameters are attached to this encoding here.
1582 variant: VariantEncoding::Utf16Le,
1583};
1584
1585/// The UTF-16LE encoding.
1586///
1587/// This decode-only encoding uses 16-bit code units due to Unicode originally
1588/// having been designed as a 16-bit repertoire. In the absence of a byte order
1589/// mark, the little-endian byte order is assumed.
1590///
1591/// There is no corresponding encoder in this crate or in the Encoding
1592/// Standard. The output encoding of this encoding is UTF-8.
1593///
1594/// This encoding matches the Windows code page 1200.
1595///
1596/// This will change from `static` to `const` if Rust changes
1597/// to make the referent of `pub const FOO: &'static Encoding`
1598/// unique cross-crate, so don't take the address of this
1599/// `static`.
1600pub static UTF_16LE: &'static Encoding = &UTF_16LE_INIT;
1601
1602/// The initializer for the [UTF-8](static.UTF_8.html) encoding.
1603///
1604/// For use only for taking the address of this form when
1605/// Rust prohibits the use of the non-`_INIT` form directly,
1606/// such as in initializers of other `static`s. If in doubt,
1607/// use the corresponding non-`_INIT` reference-typed `static`.
1608///
1609/// This part of the public API will go away if Rust changes
1610/// to make the referent of `pub const FOO: &'static Encoding`
1611/// unique cross-crate or if Rust starts allowing static arrays
1612/// to be initialized with `pub static FOO: &'static Encoding`
1613/// items.
1614pub static UTF_8_INIT: Encoding = Encoding {
1615 name: "UTF-8",
 // Unit variant: no lookup table or numeric parameters are attached to this encoding here.
1616 variant: VariantEncoding::Utf8,
1617};
1618
1619/// The UTF-8 encoding.
1620///
1621/// This is the encoding that should be used for all new development. It can
1622/// represent all of Unicode.
1623///
1624/// This encoding matches the Windows code page 65001, except Windows differs
1625/// in the number of errors generated for some erroneous byte sequences.
1626///
1627/// This will change from `static` to `const` if Rust changes
1628/// to make the referent of `pub const FOO: &'static Encoding`
1629/// unique cross-crate, so don't take the address of this
1630/// `static`.
1631pub static UTF_8: &'static Encoding = &UTF_8_INIT;
1632
1633/// The initializer for the [gb18030](static.GB18030.html) encoding.
1634///
1635/// For use only for taking the address of this form when
1636/// Rust prohibits the use of the non-`_INIT` form directly,
1637/// such as in initializers of other `static`s. If in doubt,
1638/// use the corresponding non-`_INIT` reference-typed `static`.
1639///
1640/// This part of the public API will go away if Rust changes
1641/// to make the referent of `pub const FOO: &'static Encoding`
1642/// unique cross-crate or if Rust starts allowing static arrays
1643/// to be initialized with `pub static FOO: &'static Encoding`
1644/// items.
1645pub static GB18030_INIT: Encoding = Encoding {
1646 name: "gb18030",
 // Unit variant: no lookup table or numeric parameters are attached to this encoding here.
1647 variant: VariantEncoding::Gb18030,
1648};
1649
1650/// The gb18030 encoding.
1651///
1652/// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0
1653/// maps to U+3000 for compatibility with existing Web content. As a result,
1654/// this encoding can represent all of Unicode except for the private-use
1655/// character U+E5E5.
1656///
1657/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
1658/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
1659///
1660/// This encoding matches the Windows code page 54936.
1661///
1662/// This will change from `static` to `const` if Rust changes
1663/// to make the referent of `pub const FOO: &'static Encoding`
1664/// unique cross-crate, so don't take the address of this
1665/// `static`.
1666pub static GB18030: &'static Encoding = &GB18030_INIT;
1667
1668/// The initializer for the [macintosh](static.MACINTOSH.html) encoding.
1669///
1670/// For use only for taking the address of this form when
1671/// Rust prohibits the use of the non-`_INIT` form directly,
1672/// such as in initializers of other `static`s. If in doubt,
1673/// use the corresponding non-`_INIT` reference-typed `static`.
1674///
1675/// This part of the public API will go away if Rust changes
1676/// to make the referent of `pub const FOO: &'static Encoding`
1677/// unique cross-crate or if Rust starts allowing static arrays
1678/// to be initialized with `pub static FOO: &'static Encoding`
1679/// items.
1680pub static MACINTOSH_INIT: Encoding = Encoding {
1681 name: "macintosh",
 // NOTE(review): the three numeric arguments presumably describe one contiguous run in the
 // table (first code point, table index, run length); confirm against `VariantEncoding`.
1682 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.macintosh, 0x00CD, 106, 3),
1683};
1684
1685/// The macintosh encoding.
1686///
1687/// This is the MacRoman encoding from Mac OS Classic.
1688///
1689/// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
1690/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
1691///
1692/// This encoding matches the Windows code page 10000, except Windows decodes
1693/// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
1694///
1695/// This will change from `static` to `const` if Rust changes
1696/// to make the referent of `pub const FOO: &'static Encoding`
1697/// unique cross-crate, so don't take the address of this
1698/// `static`.
1699pub static MACINTOSH: &'static Encoding = &MACINTOSH_INIT;
1700
1701/// The initializer for the [replacement](static.REPLACEMENT.html) encoding.
1702///
1703/// For use only for taking the address of this form when
1704/// Rust prohibits the use of the non-`_INIT` form directly,
1705/// such as in initializers of other `static`s. If in doubt,
1706/// use the corresponding non-`_INIT` reference-typed `static`.
1707///
1708/// This part of the public API will go away if Rust changes
1709/// to make the referent of `pub const FOO: &'static Encoding`
1710/// unique cross-crate or if Rust starts allowing static arrays
1711/// to be initialized with `pub static FOO: &'static Encoding`
1712/// items.
1713pub static REPLACEMENT_INIT: Encoding = Encoding {
1714 name: "replacement",
 // Unit variant: no lookup table or numeric parameters are attached to this encoding here.
1715 variant: VariantEncoding::Replacement,
1716};
1717
1718/// The replacement encoding.
1719///
1720/// This decode-only encoding decodes all non-zero-length streams to a single
1721/// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
1722/// ASCII-compatible fallback encoding (typically windows-1252) for some
1723/// encodings that are no longer supported by the Web Platform and that
1724/// would be dangerous to treat as ASCII-compatible.
1725///
1726/// There is no corresponding encoder. The output encoding of this encoding
1727/// is UTF-8.
1728///
1729/// This encoding does not have a Windows code page number.
1730///
1731/// This will change from `static` to `const` if Rust changes
1732/// to make the referent of `pub const FOO: &'static Encoding`
1733/// unique cross-crate, so don't take the address of this
1734/// `static`.
1735pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT;
1736
1737/// The initializer for the [windows-1250](static.WINDOWS_1250.html) encoding.
1738///
1739/// For use only for taking the address of this form when
1740/// Rust prohibits the use of the non-`_INIT` form directly,
1741/// such as in initializers of other `static`s. If in doubt,
1742/// use the corresponding non-`_INIT` reference-typed `static`.
1743///
1744/// This part of the public API will go away if Rust changes
1745/// to make the referent of `pub const FOO: &'static Encoding`
1746/// unique cross-crate or if Rust starts allowing static arrays
1747/// to be initialized with `pub static FOO: &'static Encoding`
1748/// items.
1749pub static WINDOWS_1250_INIT: Encoding = Encoding {
1750 name: "windows-1250",
 // NOTE(review): the three numeric arguments presumably describe one contiguous run in the
 // table (first code point, table index, run length); confirm against `VariantEncoding`.
1751 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1250, 0x00DC, 92, 2),
1752};
1753
1754/// The windows-1250 encoding.
1755///
1756/// This is the Central European encoding for Windows.
1757///
1758/// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
1759/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
1760///
1761/// This encoding matches the Windows code page 1250.
1762///
1763/// This will change from `static` to `const` if Rust changes
1764/// to make the referent of `pub const FOO: &'static Encoding`
1765/// unique cross-crate, so don't take the address of this
1766/// `static`.
1767pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT;
1768
1769/// The initializer for the [windows-1251](static.WINDOWS_1251.html) encoding.
1770///
1771/// For use only for taking the address of this form when
1772/// Rust prohibits the use of the non-`_INIT` form directly,
1773/// such as in initializers of other `static`s. If in doubt,
1774/// use the corresponding non-`_INIT` reference-typed `static`.
1775///
1776/// This part of the public API will go away if Rust changes
1777/// to make the referent of `pub const FOO: &'static Encoding`
1778/// unique cross-crate or if Rust starts allowing static arrays
1779/// to be initialized with `pub static FOO: &'static Encoding`
1780/// items.
1781pub static WINDOWS_1251_INIT: Encoding = Encoding {
1782 name: "windows-1251",
 // NOTE(review): the three numeric arguments presumably describe one contiguous run in the
 // table (first code point, table index, run length); confirm against `VariantEncoding`.
1783 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1251, 0x0410, 64, 64),
1784};
1785
1786/// The windows-1251 encoding.
1787///
1788/// This is the Cyrillic encoding for Windows.
1789///
1790/// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
1791/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
1792///
1793/// This encoding matches the Windows code page 1251.
1794///
1795/// This will change from `static` to `const` if Rust changes
1796/// to make the referent of `pub const FOO: &'static Encoding`
1797/// unique cross-crate, so don't take the address of this
1798/// `static`.
1799pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT;
1800
1801/// The initializer for the [windows-1252](static.WINDOWS_1252.html) encoding.
1802///
1803/// For use only for taking the address of this form when
1804/// Rust prohibits the use of the non-`_INIT` form directly,
1805/// such as in initializers of other `static`s. If in doubt,
1806/// use the corresponding non-`_INIT` reference-typed `static`.
1807///
1808/// This part of the public API will go away if Rust changes
1809/// to make the referent of `pub const FOO: &'static Encoding`
1810/// unique cross-crate or if Rust starts allowing static arrays
1811/// to be initialized with `pub static FOO: &'static Encoding`
1812/// items.
1813pub static WINDOWS_1252_INIT: Encoding = Encoding {
1814 name: "windows-1252",
 // NOTE(review): the three numeric arguments presumably describe one contiguous run in the
 // table (first code point, table index, run length); confirm against `VariantEncoding`.
1815 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1252, 0x00A0, 32, 96),
1816};
1817
1818/// The windows-1252 encoding.
1819///
1820/// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
1821/// which is known as Latin 1.
1822///
1823/// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
1824/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
1825///
1826/// This encoding matches the Windows code page 1252.
1827///
1828/// This will change from `static` to `const` if Rust changes
1829/// to make the referent of `pub const FOO: &'static Encoding`
1830/// unique cross-crate, so don't take the address of this
1831/// `static`.
1832pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT;
1833
1834/// The initializer for the [windows-1253](static.WINDOWS_1253.html) encoding.
1835///
1836/// For use only for taking the address of this form when
1837/// Rust prohibits the use of the non-`_INIT` form directly,
1838/// such as in initializers of other `static`s. If in doubt,
1839/// use the corresponding non-`_INIT` reference-typed `static`.
1840///
1841/// This part of the public API will go away if Rust changes
1842/// to make the referent of `pub const FOO: &'static Encoding`
1843/// unique cross-crate or if Rust starts allowing static arrays
1844/// to be initialized with `pub static FOO: &'static Encoding`
1845/// items.
1846pub static WINDOWS_1253_INIT: Encoding = Encoding {
1847 name: "windows-1253",
 // NOTE(review): the three numeric arguments presumably describe one contiguous run in the
 // table (first code point, table index, run length); confirm against `VariantEncoding`.
1848 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1253, 0x03A3, 83, 44),
1849};
1850
1851/// The windows-1253 encoding.
1852///
1853/// This is the Greek encoding for Windows. It is mostly an extension of
1854/// ISO-8859-7, but U+0386 is mapped to a different byte.
1855///
1856/// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
1857/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
1858///
1859/// This encoding matches the Windows code page 1253, except Windows decodes
1860/// unassigned code points to the Private Use Area of Unicode.
1861///
1862/// This will change from `static` to `const` if Rust changes
1863/// to make the referent of `pub const FOO: &'static Encoding`
1864/// unique cross-crate, so don't take the address of this
1865/// `static`.
1866pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT;
1867
/// The initializer for the [windows-1254](static.WINDOWS_1254.html) encoding.
///
/// This is the statically allocated `Encoding` value that the
/// reference-typed [`WINDOWS_1254`](static.WINDOWS_1254.html) points to.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1254_INIT: Encoding = Encoding {
    name: "windows-1254",
    // Single-byte variant backed by the `windows_1254` table.
    // NOTE(review): the meaning of the three numeric arguments is defined by
    // `VariantEncoding::SingleByte`, which is not visible from here.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1254, 0x00DF, 95, 17),
};
1884
/// The windows-1254 encoding.
///
/// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
/// which is known as Latin 5.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
///
/// This encoding matches the Windows code page 1254.
///
/// Refers to [`WINDOWS_1254_INIT`](static.WINDOWS_1254_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT;
1900
/// The initializer for the [windows-1255](static.WINDOWS_1255.html) encoding.
///
/// This is the statically allocated `Encoding` value that the
/// reference-typed [`WINDOWS_1255`](static.WINDOWS_1255.html) points to.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1255_INIT: Encoding = Encoding {
    name: "windows-1255",
    // Single-byte variant backed by the `windows_1255` table.
    // NOTE(review): the meaning of the three numeric arguments is defined by
    // `VariantEncoding::SingleByte`, which is not visible from here.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1255, 0x05D0, 96, 27),
};
1917
/// The windows-1255 encoding.
///
/// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
/// except for a currency sign swap.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
///
/// This encoding matches the Windows code page 1255, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// Refers to [`WINDOWS_1255_INIT`](static.WINDOWS_1255_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT;
1934
/// The initializer for the [windows-1256](static.WINDOWS_1256.html) encoding.
///
/// This is the statically allocated `Encoding` value that the
/// reference-typed [`WINDOWS_1256`](static.WINDOWS_1256.html) points to.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1256_INIT: Encoding = Encoding {
    name: "windows-1256",
    // Single-byte variant backed by the `windows_1256` table.
    // NOTE(review): the meaning of the three numeric arguments is defined by
    // `VariantEncoding::SingleByte`, which is not visible from here.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1256, 0x0621, 65, 22),
};
1951
/// The windows-1256 encoding.
///
/// This is the Arabic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
///
/// This encoding matches the Windows code page 1256.
///
/// Refers to [`WINDOWS_1256_INIT`](static.WINDOWS_1256_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT;
1966
/// The initializer for the [windows-1257](static.WINDOWS_1257.html) encoding.
///
/// This is the statically allocated `Encoding` value that the
/// reference-typed [`WINDOWS_1257`](static.WINDOWS_1257.html) points to.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1257_INIT: Encoding = Encoding {
    name: "windows-1257",
    // Single-byte variant backed by the `windows_1257` table.
    // NOTE(review): the meaning of the three numeric arguments is defined by
    // `VariantEncoding::SingleByte`, which is not visible from here.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1257, 0x00DF, 95, 1),
};
1983
/// The windows-1257 encoding.
///
/// This is the Baltic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
///
/// This encoding matches the Windows code page 1257, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// Refers to [`WINDOWS_1257_INIT`](static.WINDOWS_1257_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT;
1999
/// The initializer for the [windows-1258](static.WINDOWS_1258.html) encoding.
///
/// This is the statically allocated `Encoding` value that the
/// reference-typed [`WINDOWS_1258`](static.WINDOWS_1258.html) points to.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1258_INIT: Encoding = Encoding {
    name: "windows-1258",
    // Single-byte variant backed by the `windows_1258` table.
    // NOTE(review): the meaning of the three numeric arguments is defined by
    // `VariantEncoding::SingleByte`, which is not visible from here.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1258, 0x00DF, 95, 4),
};
2016
/// The windows-1258 encoding.
///
/// This is the Vietnamese encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
///
/// This encoding matches the Windows code page 1258 when used in the
/// non-normalizing mode. Unlike with the other single-byte encodings, the
/// result of decoding is not necessarily in Normalization Form C. On the
/// other hand, input in the Normalization Form C is not encoded without
/// replacement. In general, it's a bad idea to encode to encodings other
/// than UTF-8, but this encoding is especially hazardous to encode to.
///
/// Refers to [`WINDOWS_1258_INIT`](static.WINDOWS_1258_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT;
2036
/// The initializer for the [windows-874](static.WINDOWS_874.html) encoding.
///
/// This is the statically allocated `Encoding` value that the
/// reference-typed [`WINDOWS_874`](static.WINDOWS_874.html) points to.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_874_INIT: Encoding = Encoding {
    name: "windows-874",
    // Single-byte variant backed by the `windows_874` table.
    // NOTE(review): the meaning of the three numeric arguments is defined by
    // `VariantEncoding::SingleByte`, which is not visible from here.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_874, 0x0E01, 33, 58),
};
2053
/// The windows-874 encoding.
///
/// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
///
/// This encoding matches the Windows code page 874, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// Refers to [`WINDOWS_874_INIT`](static.WINDOWS_874_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT;
2069
/// The initializer for the [x-mac-cyrillic](static.X_MAC_CYRILLIC.html) encoding.
///
/// This is the statically allocated `Encoding` value that the
/// reference-typed [`X_MAC_CYRILLIC`](static.X_MAC_CYRILLIC.html) points to.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding {
    name: "x-mac-cyrillic",
    // Single-byte variant backed by the `x_mac_cyrillic` table.
    // NOTE(review): the meaning of the three numeric arguments is defined by
    // `VariantEncoding::SingleByte`, which is not visible from here.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.x_mac_cyrillic, 0x0430, 96, 31),
};
2086
/// The x-mac-cyrillic encoding.
///
/// This is the MacUkrainian encoding from Mac OS Classic.
///
/// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
///
/// This encoding matches the Windows code page 10017.
///
/// Refers to [`X_MAC_CYRILLIC_INIT`](static.X_MAC_CYRILLIC_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static X_MAC_CYRILLIC: &'static Encoding = &X_MAC_CYRILLIC_INIT;
2101
/// The initializer for the [x-user-defined](static.X_USER_DEFINED.html) encoding.
///
/// This is the statically allocated `Encoding` value that the
/// reference-typed [`X_USER_DEFINED`](static.X_USER_DEFINED.html) points to.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static X_USER_DEFINED_INIT: Encoding = Encoding {
    name: "x-user-defined",
    // Unlike the single-byte initializers, this variant takes no table or
    // numeric arguments.
    variant: VariantEncoding::UserDefined,
};
2118
/// The x-user-defined encoding.
///
/// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding
/// them to the Private Use Area of Unicode. It was used for loading binary
/// data into a JavaScript string using `XMLHttpRequest` before XHR supported
/// the `"arraybuffer"` response type.
///
/// This encoding does not have a Windows code page number.
///
/// Refers to [`X_USER_DEFINED_INIT`](static.X_USER_DEFINED_INIT.html).
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static X_USER_DEFINED: &'static Encoding = &X_USER_DEFINED_INIT;
2133
// Every label that `Encoding::for_label()` can match, in ASCII lower case.
//
// The order is the one the binary search in `for_label()` relies on: shorter
// labels come before longer ones, and labels of equal length are ordered by
// comparing their bytes starting from the LAST byte and moving towards the
// first. Inserting an entry out of that order breaks the lookup.
//
// The entry at index `i` corresponds to `ENCODINGS_IN_LABEL_SORT[i]`, so the
// two tables must always be changed together.
//
// NOTE(review): this table sits above the `// END GENERATED CODE` marker, so
// it is presumably emitted by a generator; regenerate rather than hand-edit.
static LABELS_SORTED: [&'static str; 219] = [
    "l1",
    "l2",
    "l3",
    "l4",
    "l5",
    "l6",
    "l9",
    "866",
    "mac",
    "koi",
    "gbk",
    "big5",
    "utf8",
    "koi8",
    "sjis",
    "ms932",
    "cp866",
    "utf-8",
    "cp819",
    "ascii",
    "x-gbk",
    "greek",
    "cp1250",
    "cp1251",
    "latin1",
    "gb2312",
    "cp1252",
    "latin2",
    "cp1253",
    "latin3",
    "cp1254",
    "latin4",
    "cp1255",
    "csbig5",
    "latin5",
    "utf-16",
    "cp1256",
    "ibm866",
    "latin6",
    "cp1257",
    "cp1258",
    "greek8",
    "ibm819",
    "arabic",
    "visual",
    "korean",
    "euc-jp",
    "koi8-r",
    "koi8_r",
    "euc-kr",
    "x-sjis",
    "koi8-u",
    "hebrew",
    "tis-620",
    "gb18030",
    "ksc5601",
    "gb_2312",
    "dos-874",
    "cn-big5",
    "chinese",
    "logical",
    "cskoi8r",
    "cseuckr",
    "koi8-ru",
    "x-cp1250",
    "ksc_5601",
    "x-cp1251",
    "iso88591",
    "csgb2312",
    "x-cp1252",
    "iso88592",
    "x-cp1253",
    "iso88593",
    "ecma-114",
    "x-cp1254",
    "iso88594",
    "x-cp1255",
    "iso88595",
    "x-x-big5",
    "x-cp1256",
    "csibm866",
    "iso88596",
    "x-cp1257",
    "iso88597",
    "asmo-708",
    "ecma-118",
    "elot_928",
    "x-cp1258",
    "iso88598",
    "iso88599",
    "cyrillic",
    "utf-16be",
    "utf-16le",
    "us-ascii",
    "ms_kanji",
    "x-euc-jp",
    "iso885910",
    "iso8859-1",
    "iso885911",
    "iso8859-2",
    "iso8859-3",
    "iso885913",
    "iso8859-4",
    "iso885914",
    "iso8859-5",
    "iso885915",
    "iso8859-6",
    "iso8859-7",
    "iso8859-8",
    "iso-ir-58",
    "iso8859-9",
    "macintosh",
    "shift-jis",
    "shift_jis",
    "iso-ir-100",
    "iso8859-10",
    "iso-ir-110",
    "gb_2312-80",
    "iso-8859-1",
    "iso_8859-1",
    "iso-ir-101",
    "iso8859-11",
    "iso-8859-2",
    "iso_8859-2",
    "hz-gb-2312",
    "iso-8859-3",
    "iso_8859-3",
    "iso8859-13",
    "iso-8859-4",
    "iso_8859-4",
    "iso8859-14",
    "iso-ir-144",
    "iso-8859-5",
    "iso_8859-5",
    "iso8859-15",
    "iso-8859-6",
    "iso_8859-6",
    "iso-ir-126",
    "iso-8859-7",
    "iso_8859-7",
    "iso-ir-127",
    "iso-ir-157",
    "iso-8859-8",
    "iso_8859-8",
    "iso-ir-138",
    "iso-ir-148",
    "iso-8859-9",
    "iso_8859-9",
    "iso-ir-109",
    "iso-ir-149",
    "big5-hkscs",
    "csshiftjis",
    "iso-8859-10",
    "iso-8859-11",
    "csisolatin1",
    "csisolatin2",
    "iso-8859-13",
    "csisolatin3",
    "iso-8859-14",
    "windows-874",
    "csisolatin4",
    "iso-8859-15",
    "iso_8859-15",
    "csisolatin5",
    "iso-8859-16",
    "csisolatin6",
    "windows-949",
    "csisolatin9",
    "csiso88596e",
    "csiso88598e",
    "csmacintosh",
    "csiso88596i",
    "csiso88598i",
    "windows-31j",
    "x-mac-roman",
    "iso-2022-cn",
    "iso-2022-jp",
    "csiso2022jp",
    "iso-2022-kr",
    "csiso2022kr",
    "replacement",
    "windows-1250",
    "windows-1251",
    "windows-1252",
    "windows-1253",
    "windows-1254",
    "windows-1255",
    "windows-1256",
    "windows-1257",
    "windows-1258",
    "iso-8859-6-e",
    "iso-8859-8-e",
    "iso-8859-6-i",
    "iso-8859-8-i",
    "sun_eu_greek",
    "csksc56011987",
    "ks_c_5601-1987",
    "ansi_x3.4-1968",
    "ks_c_5601-1989",
    "x-mac-cyrillic",
    "x-user-defined",
    "csiso58gb231280",
    "iso_8859-1:1987",
    "iso_8859-2:1987",
    "iso_8859-6:1987",
    "iso_8859-7:1987",
    "iso_8859-3:1988",
    "iso_8859-4:1988",
    "iso_8859-5:1988",
    "iso_8859-8:1988",
    "iso_8859-9:1989",
    "csisolatingreek",
    "x-mac-ukrainian",
    "iso-2022-cn-ext",
    "csisolatinarabic",
    "csisolatinhebrew",
    "unicode-1-1-utf-8",
    "csisolatincyrillic",
    "cseucpkdfmtjapanese",
];
2355
// The encoding for each label, parallel to `LABELS_SORTED`: the entry at
// index `i` is the encoding that `Encoding::for_label()` returns when the
// normalized label equals `LABELS_SORTED[i]`.
//
// The entries are the `_INIT` statics because, as the `Encoding` docs
// explain, the reference-typed statics cannot be used to initialize the
// items of an array of `&'static Encoding`.
//
// Both tables hold 219 entries and must always be changed together.
static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; 219] = [
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_15_INIT,
    &IBM866_INIT,
    &MACINTOSH_INIT,
    &KOI8_R_INIT,
    &GBK_INIT,
    &BIG5_INIT,
    &UTF_8_INIT,
    &KOI8_R_INIT,
    &SHIFT_JIS_INIT,
    &SHIFT_JIS_INIT,
    &IBM866_INIT,
    &UTF_8_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &BIG5_INIT,
    &WINDOWS_1254_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &EUC_KR_INIT,
    &EUC_JP_INIT,
    &KOI8_R_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &SHIFT_JIS_INIT,
    &KOI8_U_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_874_INIT,
    &GB18030_INIT,
    &EUC_KR_INIT,
    &GBK_INIT,
    &WINDOWS_874_INIT,
    &BIG5_INIT,
    &GBK_INIT,
    &ISO_8859_8_I_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &KOI8_U_INIT,
    &WINDOWS_1250_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &ISO_8859_5_INIT,
    &BIG5_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1257_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_5_INIT,
    &UTF_16BE_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1252_INIT,
    &SHIFT_JIS_INIT,
    &EUC_JP_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_8_INIT,
    &GBK_INIT,
    &WINDOWS_1254_INIT,
    &MACINTOSH_INIT,
    &SHIFT_JIS_INIT,
    &SHIFT_JIS_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_4_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_2_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_3_INIT,
    &EUC_KR_INIT,
    &BIG5_INIT,
    &SHIFT_JIS_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_874_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_14_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_15_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_16_INIT,
    &ISO_8859_10_INIT,
    &EUC_KR_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &MACINTOSH_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &SHIFT_JIS_INIT,
    &MACINTOSH_INIT,
    &REPLACEMENT_INIT,
    &ISO_2022_JP_INIT,
    &ISO_2022_JP_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1253_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1255_INIT,
    &WINDOWS_1256_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &ISO_8859_7_INIT,
    &EUC_KR_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1252_INIT,
    &EUC_KR_INIT,
    &X_MAC_CYRILLIC_INIT,
    &X_USER_DEFINED_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_7_INIT,
    &X_MAC_CYRILLIC_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &UTF_8_INIT,
    &ISO_8859_5_INIT,
    &EUC_JP_INIT,
];
2577
2578// END GENERATED CODE
2579
/// An encoding as defined in the [Encoding Standard][1].
///
/// An _encoding_ defines a mapping from a `u8` sequence to a `char` sequence
/// and, in most cases, vice versa. Each encoding has a name, an output
/// encoding, and one or more labels.
///
/// _Labels_ are ASCII-case-insensitive strings that are used to identify an
/// encoding in formats and protocols. The _name_ of the encoding is the
/// preferred label in the case appropriate for returning from the
/// [`characterSet`][2] property of the `Document` DOM interface.
///
/// The _output encoding_ is the encoding used for form submission and URL
/// parsing on Web pages in the encoding. This is UTF-8 for the replacement,
/// UTF-16LE and UTF-16BE encodings and the encoding itself for other
/// encodings.
///
/// [1]: https://encoding.spec.whatwg.org/
/// [2]: https://dom.spec.whatwg.org/#dom-document-characterset
///
/// # Streaming vs. Non-Streaming
///
/// When you have the entire input in a single buffer, you can use the
/// methods [`decode()`][3], [`decode_with_bom_removal()`][4],
/// [`decode_without_bom_handling()`][5],
/// [`decode_without_bom_handling_and_without_replacement()`][6] and
/// [`encode()`][7]. (These methods are available to Rust callers only and are
/// not available in the C API.) Unlike the rest of the API available to Rust,
/// these methods perform heap allocations. You should use the `Decoder` and
/// `Encoder` objects when your input is split into multiple buffers or when
/// you want to control the allocation of the output buffers.
///
/// [3]: #method.decode
/// [4]: #method.decode_with_bom_removal
/// [5]: #method.decode_without_bom_handling
/// [6]: #method.decode_without_bom_handling_and_without_replacement
/// [7]: #method.encode
///
/// # Instances
///
/// All instances of `Encoding` are statically allocated and have the `'static`
/// lifetime. There is precisely one unique `Encoding` instance for each
/// encoding defined in the Encoding Standard.
///
/// To obtain a reference to a particular encoding whose identity you know at
/// compile time, use a `static` that refers to the encoding. There is a
/// `static` for each encoding. The `static`s are named in all caps with hyphens
/// replaced with underscores (and in C/C++ have `_ENCODING` appended to the
/// name). For example, if you know at compile time that you will want to
/// decode using the UTF-8 encoding, use the `UTF_8` `static` (`UTF_8_ENCODING`
/// in C/C++).
///
/// Additionally, there are non-reference-typed forms ending with `_INIT` to
/// work around the problem that `static`s of the type `&'static Encoding`
/// cannot be used to initialize items of an array whose type is
/// `[&'static Encoding; N]`.
///
/// If you don't know what encoding you need at compile time and need to
/// dynamically get an encoding by label, use
/// <code>Encoding::<a href="#method.for_label">for_label</a>(<var>label</var>)</code>.
///
/// Instances of `Encoding` can be compared with `==` (in both Rust and in
/// C/C++).
pub struct Encoding {
    // The name of the encoding; handed out as-is by `name()`.
    name: &'static str,
    // Selects the concrete encoding; `is_single_byte()` delegates to it.
    variant: VariantEncoding,
}
2646
2647impl Encoding {
2648 /// Implements the
2649 /// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
2650 /// algorithm.
2651 ///
2652 /// If, after ASCII-lowercasing and removing leading and trailing
2653 /// whitespace, the argument matches a label defined in the Encoding
2654 /// Standard, `Some(&'static Encoding)` representing the corresponding
2655 /// encoding is returned. If there is no match, `None` is returned.
2656 ///
2657 /// This is the right method to use if the action upon the method returning
2658 /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) instead.
2659 /// When the action upon the method returning `None` is not to proceed with
2660 /// a fallback but to refuse processing, `for_label_no_replacement()` is more
2661 /// appropriate.
2662 ///
2663 /// The argument is of type `&[u8]` instead of `&str` to save callers
2664 /// that are extracting the label from a non-UTF-8 protocol the trouble
2665 /// of conversion to UTF-8. (If you have a `&str`, just call `.as_bytes()`
2666 /// on it.)
2667 ///
2668 /// Available via the C wrapper.
2669 pub fn for_label(label: &[u8]) -> Option<&'static Encoding> {
2670 let mut trimmed = [0u8; LONGEST_LABEL_LENGTH];
2671 let mut trimmed_pos = 0usize;
2672 let mut iter = label.into_iter();
2673 // before
2674 loop {
2675 match iter.next() {
2676 None => {
2677 return None;
2678 }
2679 Some(byte) => {
2680 // The characters used in labels are:
2681 // a-z (except q, but excluding it below seems excessive)
2682 // 0-9
2683 // . _ - :
2684 match *byte {
2685 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2686 continue;
2687 }
2688 b'A'..=b'Z' => {
2689 trimmed[trimmed_pos] = *byte + 0x20u8;
2690 trimmed_pos = 1usize;
2691 break;
2692 }
2693 b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2694 trimmed[trimmed_pos] = *byte;
2695 trimmed_pos = 1usize;
2696 break;
2697 }
2698 _ => {
2699 return None;
2700 }
2701 }
2702 }
2703 }
2704 }
2705 // inside
2706 loop {
2707 match iter.next() {
2708 None => {
2709 break;
2710 }
2711 Some(byte) => {
2712 match *byte {
2713 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2714 break;
2715 }
2716 b'A'..=b'Z' => {
2717 if trimmed_pos == LONGEST_LABEL_LENGTH {
2718 // There's no encoding with a label this long
2719 return None;
2720 }
2721 trimmed[trimmed_pos] = *byte + 0x20u8;
2722 trimmed_pos += 1usize;
2723 continue;
2724 }
2725 b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2726 if trimmed_pos == LONGEST_LABEL_LENGTH {
2727 // There's no encoding with a label this long
2728 return None;
2729 }
2730 trimmed[trimmed_pos] = *byte;
2731 trimmed_pos += 1usize;
2732 continue;
2733 }
2734 _ => {
2735 return None;
2736 }
2737 }
2738 }
2739 }
2740 }
2741 // after
2742 loop {
2743 match iter.next() {
2744 None => {
2745 break;
2746 }
2747 Some(byte) => {
2748 match *byte {
2749 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2750 continue;
2751 }
2752 _ => {
2753 // There's no label with space in the middle
2754 return None;
2755 }
2756 }
2757 }
2758 }
2759 }
2760 let candidate = &trimmed[..trimmed_pos];
2761 match LABELS_SORTED.binary_search_by(|probe| {
2762 let bytes = probe.as_bytes();
2763 let c = bytes.len().cmp(&candidate.len());
2764 if c != Ordering::Equal {
2765 return c;
2766 }
2767 let probe_iter = bytes.iter().rev();
2768 let candidate_iter = candidate.iter().rev();
2769 probe_iter.cmp(candidate_iter)
2770 }) {
2771 Ok(i) => Some(ENCODINGS_IN_LABEL_SORT[i]),
2772 Err(_) => None,
2773 }
2774 }
2775
2776 /// This method behaves the same as `for_label()`, except when `for_label()`
2777 /// would return `Some(REPLACEMENT)`, this method returns `None` instead.
2778 ///
2779 /// This method is useful in scenarios where a fatal error is required
2780 /// upon invalid label, because in those cases the caller typically wishes
2781 /// to treat the labels that map to the replacement encoding as fatal
2782 /// errors, too.
2783 ///
2784 /// It is not OK to use this method when the action upon the method returning
2785 /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`). In such a
2786 /// case, the `for_label()` method should be used instead in order to avoid
2787 /// unsafe fallback for labels that `for_label()` maps to `Some(REPLACEMENT)`.
2788 ///
2789 /// Available via the C wrapper.
2790 #[inline]
2791 pub fn for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding> {
2792 match Encoding::for_label(label) {
2793 None => None,
2794 Some(encoding) => {
2795 if encoding == REPLACEMENT {
2796 None
2797 } else {
2798 Some(encoding)
2799 }
2800 }
2801 }
2802 }
2803
2804 /// Performs non-incremental BOM sniffing.
2805 ///
2806 /// The argument must either be a buffer representing the entire input
2807 /// stream (non-streaming case) or a buffer representing at least the first
2808 /// three bytes of the input stream (streaming case).
2809 ///
2810 /// Returns `Some((UTF_8, 3))`, `Some((UTF_16LE, 2))` or
2811 /// `Some((UTF_16BE, 2))` if the argument starts with the UTF-8, UTF-16LE
2812 /// or UTF-16BE BOM or `None` otherwise.
2813 ///
2814 /// Available via the C wrapper.
2815 #[inline]
2816 pub fn for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)> {
2817 if buffer.starts_with(b"\xEF\xBB\xBF") {
2818 Some((UTF_8, 3))
2819 } else if buffer.starts_with(b"\xFF\xFE") {
2820 Some((UTF_16LE, 2))
2821 } else if buffer.starts_with(b"\xFE\xFF") {
2822 Some((UTF_16BE, 2))
2823 } else {
2824 None
2825 }
2826 }
2827
    /// Returns the name of this encoding.
    ///
    /// This name is appropriate to return as-is from the DOM
    /// `document.characterSet` property.
    ///
    /// The receiver has to be a `'static` reference, which every `Encoding`
    /// instance handed out by this crate is.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn name(&'static self) -> &'static str {
        self.name
    }
2838
    /// Checks whether the _output encoding_ of this encoding can encode every
    /// `char`. (Only true if the output encoding is UTF-8.)
    ///
    /// Note that the check is made on the result of `output_encoding()`, not
    /// on this encoding itself.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn can_encode_everything(&'static self) -> bool {
        self.output_encoding() == UTF_8
    }
2847
2848 /// Checks whether the bytes 0x00...0x7F map exclusively to the characters
2849 /// U+0000...U+007F and vice versa.
2850 ///
2851 /// Available via the C wrapper.
2852 #[inline]
2853 pub fn is_ascii_compatible(&'static self) -> bool {
2854 !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE || self == ISO_2022_JP)
2855 }
2856
    /// Checks whether this encoding maps one byte to one Basic Multilingual
    /// Plane code point (i.e. byte length equals decoded UTF-16 length) and
    /// vice versa (for mappable characters).
    ///
    /// `true` iff this encoding is on the list of [Legacy single-byte
    /// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
    /// in the spec or x-user-defined.
    ///
    /// The answer comes from the encoding's variant, to which this method
    /// delegates.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn is_single_byte(&'static self) -> bool {
        self.variant.is_single_byte()
    }
2870
2871 /// Checks whether the bytes 0x00...0x7F map mostly to the characters
2872 /// U+0000...U+007F and vice versa.
2873 #[inline]
2874 fn is_potentially_borrowable(&'static self) -> bool {
2875 !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE)
2876 }
2877
2878 /// Returns the _output encoding_ of this encoding. This is UTF-8 for
2879 /// UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
2880 ///
2881 /// Available via the C wrapper.
2882 #[inline]
2883 pub fn output_encoding(&'static self) -> &'static Encoding {
2884 if self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE {
2885 UTF_8
2886 } else {
2887 self
2888 }
2889 }
2890
2891 /// Decode complete input to `Cow<'a, str>` _with BOM sniffing_ and with
2892 /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2893 /// entire input is available as a single buffer (i.e. the end of the
2894 /// buffer marks the end of the stream).
2895 ///
2896 /// This method implements the (non-streaming version of) the
2897 /// [_decode_](https://encoding.spec.whatwg.org/#decode) spec concept.
2898 ///
2899 /// The second item in the returned tuple is the encoding that was actually
2900 /// used (which may differ from this encoding thanks to BOM sniffing).
2901 ///
2902 /// The third item in the returned tuple indicates whether there were
2903 /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2904 ///
2905 /// _Note:_ It is wrong to use this when the input buffer represents only
2906 /// a segment of the input instead of the whole input. Use `new_decoder()`
2907 /// when decoding segmented input.
2908 ///
2909 /// This method performs a one or two heap allocations for the backing
2910 /// buffer of the `String` when unable to borrow. (One allocation if not
2911 /// errors and potentially another one in the presence of errors.) The
2912 /// first allocation assumes jemalloc and may not be optimal with
2913 /// allocators that do not use power-of-two buckets. A borrow is performed
2914 /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2915 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2916 /// ISO-2022-JP and the input is entirely in the ASCII state without state
2917 /// transitions.
2918 ///
2919 /// # Panics
2920 ///
2921 /// If the size calculation for a heap-allocated backing buffer overflows
2922 /// `usize`.
2923 ///
2924 /// Available to Rust only.
2925 #[inline]
2926 pub fn decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool) {
2927 let (encoding, without_bom) = match Encoding::for_bom(bytes) {
2928 Some((encoding, bom_length)) => (encoding, &bytes[bom_length..]),
2929 None => (self, bytes),
2930 };
2931 let (cow, had_errors) = encoding.decode_without_bom_handling(without_bom);
2932 (cow, encoding, had_errors)
2933 }
2934
2935 /// Decode complete input to `Cow<'a, str>` _with BOM removal_ and with
2936 /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2937 /// entire input is available as a single buffer (i.e. the end of the
2938 /// buffer marks the end of the stream).
2939 ///
2940 /// When invoked on `UTF_8`, this method implements the (non-streaming
2941 /// version of) the
2942 /// [_UTF-8 decode_](https://encoding.spec.whatwg.org/#utf-8-decode) spec
2943 /// concept.
2944 ///
2945 /// The second item in the returned pair indicates whether there were
2946 /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2947 ///
2948 /// _Note:_ It is wrong to use this when the input buffer represents only
2949 /// a segment of the input instead of the whole input. Use
2950 /// `new_decoder_with_bom_removal()` when decoding segmented input.
2951 ///
2952 /// This method performs a one or two heap allocations for the backing
2953 /// buffer of the `String` when unable to borrow. (One allocation if not
2954 /// errors and potentially another one in the presence of errors.) The
2955 /// first allocation assumes jemalloc and may not be optimal with
2956 /// allocators that do not use power-of-two buckets. A borrow is performed
2957 /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2958 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2959 /// ISO-2022-JP and the input is entirely in the ASCII state without state
2960 /// transitions.
2961 ///
2962 /// # Panics
2963 ///
2964 /// If the size calculation for a heap-allocated backing buffer overflows
2965 /// `usize`.
2966 ///
2967 /// Available to Rust only.
2968 #[inline]
2969 pub fn decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
2970 let without_bom = if self == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
2971 &bytes[3..]
2972 } else if (self == UTF_16LE && bytes.starts_with(b"\xFF\xFE"))
2973 || (self == UTF_16BE && bytes.starts_with(b"\xFE\xFF"))
2974 {
2975 &bytes[2..]
2976 } else {
2977 bytes
2978 };
2979 self.decode_without_bom_handling(without_bom)
2980 }
2981
2982 /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
2983 /// with malformed sequences replaced with the REPLACEMENT CHARACTER when
2984 /// the entire input is available as a single buffer (i.e. the end of the
2985 /// buffer marks the end of the stream).
2986 ///
2987 /// When invoked on `UTF_8`, this method implements the (non-streaming
2988 /// version of) the
2989 /// [_UTF-8 decode without BOM_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)
2990 /// spec concept.
2991 ///
2992 /// The second item in the returned pair indicates whether there were
2993 /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2994 ///
2995 /// _Note:_ It is wrong to use this when the input buffer represents only
2996 /// a segment of the input instead of the whole input. Use
2997 /// `new_decoder_without_bom_handling()` when decoding segmented input.
2998 ///
2999 /// This method performs a one or two heap allocations for the backing
3000 /// buffer of the `String` when unable to borrow. (One allocation if not
3001 /// errors and potentially another one in the presence of errors.) The
3002 /// first allocation assumes jemalloc and may not be optimal with
3003 /// allocators that do not use power-of-two buckets. A borrow is performed
3004 /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
3005 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3006 /// ISO-2022-JP and the input is entirely in the ASCII state without state
3007 /// transitions.
3008 ///
3009 /// # Panics
3010 ///
3011 /// If the size calculation for a heap-allocated backing buffer overflows
3012 /// `usize`.
3013 ///
3014 /// Available to Rust only.
3015 pub fn decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
3016 let (mut decoder, mut string, mut total_read) = if self.is_potentially_borrowable() {
3017 let valid_up_to = if self == UTF_8 {
3018 utf8_valid_up_to(bytes)
3019 } else if self == ISO_2022_JP {
3020 iso_2022_jp_ascii_valid_up_to(bytes)
3021 } else {
3022 ascii_valid_up_to(bytes)
3023 };
3024 if valid_up_to == bytes.len() {
3025 let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
3026 return (Cow::Borrowed(str), false);
3027 }
3028 let decoder = self.new_decoder_without_bom_handling();
3029
3030 let rounded_without_replacement = checked_next_power_of_two(checked_add(
3031 valid_up_to,
3032 decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3033 ));
3034 let with_replacement = checked_add(
3035 valid_up_to,
3036 decoder.max_utf8_buffer_length(bytes.len() - valid_up_to),
3037 );
3038 let mut string = String::with_capacity(
3039 checked_min(rounded_without_replacement, with_replacement).unwrap(),
3040 );
3041 unsafe {
3042 let vec = string.as_mut_vec();
3043 vec.set_len(valid_up_to);
3044 std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3045 }
3046 (decoder, string, valid_up_to)
3047 } else {
3048 let decoder = self.new_decoder_without_bom_handling();
3049 let rounded_without_replacement = checked_next_power_of_two(
3050 decoder.max_utf8_buffer_length_without_replacement(bytes.len()),
3051 );
3052 let with_replacement = decoder.max_utf8_buffer_length(bytes.len());
3053 let string = String::with_capacity(
3054 checked_min(rounded_without_replacement, with_replacement).unwrap(),
3055 );
3056 (decoder, string, 0)
3057 };
3058
3059 let mut total_had_errors = false;
3060 loop {
3061 let (result, read, had_errors) =
3062 decoder.decode_to_string(&bytes[total_read..], &mut string, true);
3063 total_read += read;
3064 total_had_errors |= had_errors;
3065 match result {
3066 CoderResult::InputEmpty => {
3067 debug_assert_eq!(total_read, bytes.len());
3068 return (Cow::Owned(string), total_had_errors);
3069 }
3070 CoderResult::OutputFull => {
3071 // Allocate for the worst case. That is, we should come
3072 // here at most once per invocation of this method.
3073 let needed = decoder.max_utf8_buffer_length(bytes.len() - total_read);
3074 string.reserve(needed.unwrap());
3075 }
3076 }
3077 }
3078 }
3079
3080 /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
3081 /// _with malformed sequences treated as fatal_ when the entire input is
3082 /// available as a single buffer (i.e. the end of the buffer marks the end
3083 /// of the stream).
3084 ///
3085 /// When invoked on `UTF_8`, this method implements the (non-streaming
3086 /// version of) the
3087 /// [_UTF-8 decode without BOM or fail_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
3088 /// spec concept.
3089 ///
3090 /// Returns `None` if a malformed sequence was encountered and the result
3091 /// of the decode as `Some(String)` otherwise.
3092 ///
3093 /// _Note:_ It is wrong to use this when the input buffer represents only
3094 /// a segment of the input instead of the whole input. Use
3095 /// `new_decoder_without_bom_handling()` when decoding segmented input.
3096 ///
3097 /// This method performs a single heap allocation for the backing
3098 /// buffer of the `String` when unable to borrow. A borrow is performed if
3099 /// decoding UTF-8 and the input is valid UTF-8, if decoding an
3100 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3101 /// ISO-2022-JP and the input is entirely in the ASCII state without state
3102 /// transitions.
3103 ///
3104 /// # Panics
3105 ///
3106 /// If the size calculation for a heap-allocated backing buffer overflows
3107 /// `usize`.
3108 ///
3109 /// Available to Rust only.
3110 pub fn decode_without_bom_handling_and_without_replacement<'a>(
3111 &'static self,
3112 bytes: &'a [u8],
3113 ) -> Option<Cow<'a, str>> {
3114 if self == UTF_8 {
3115 let valid_up_to = utf8_valid_up_to(bytes);
3116 if valid_up_to == bytes.len() {
3117 let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
3118 return Some(Cow::Borrowed(str));
3119 }
3120 return None;
3121 }
3122 let (mut decoder, mut string, input) = if self.is_potentially_borrowable() {
3123 let valid_up_to = if self == ISO_2022_JP {
3124 iso_2022_jp_ascii_valid_up_to(bytes)
3125 } else {
3126 ascii_valid_up_to(bytes)
3127 };
3128 if valid_up_to == bytes.len() {
3129 let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
3130 return Some(Cow::Borrowed(str));
3131 }
3132 let decoder = self.new_decoder_without_bom_handling();
3133 let mut string = String::with_capacity(
3134 checked_add(
3135 valid_up_to,
3136 decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3137 )
3138 .unwrap(),
3139 );
3140 unsafe {
3141 let vec = string.as_mut_vec();
3142 vec.set_len(valid_up_to);
3143 std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3144 }
3145 (decoder, string, &bytes[valid_up_to..])
3146 } else {
3147 let decoder = self.new_decoder_without_bom_handling();
3148 let string = String::with_capacity(
3149 decoder
3150 .max_utf8_buffer_length_without_replacement(bytes.len())
3151 .unwrap(),
3152 );
3153 (decoder, string, bytes)
3154 };
3155 let (result, read) = decoder.decode_to_string_without_replacement(input, &mut string, true);
3156 match result {
3157 DecoderResult::InputEmpty => {
3158 debug_assert_eq!(read, input.len());
3159 Some(Cow::Owned(string))
3160 }
3161 DecoderResult::Malformed(_, _) => None,
3162 DecoderResult::OutputFull => unreachable!(),
3163 }
3164 }
3165
3166 /// Encode complete input to `Cow<'a, [u8]>` with unmappable characters
3167 /// replaced with decimal numeric character references when the entire input
3168 /// is available as a single buffer (i.e. the end of the buffer marks the
3169 /// end of the stream).
3170 ///
3171 /// This method implements the (non-streaming version of) the
3172 /// [_encode_](https://encoding.spec.whatwg.org/#encode) spec concept. For
3173 /// the [_UTF-8 encode_](https://encoding.spec.whatwg.org/#utf-8-encode)
3174 /// spec concept, it is slightly more efficient to use
3175 /// <code><var>string</var>.as_bytes()</code> instead of invoking this
3176 /// method on `UTF_8`.
3177 ///
3178 /// The second item in the returned tuple is the encoding that was actually
3179 /// used (which may differ from this encoding thanks to some encodings
3180 /// having UTF-8 as their output encoding).
3181 ///
3182 /// The third item in the returned tuple indicates whether there were
3183 /// unmappable characters (that were replaced with HTML numeric character
3184 /// references).
3185 ///
3186 /// _Note:_ It is wrong to use this when the input buffer represents only
3187 /// a segment of the input instead of the whole input. Use `new_encoder()`
3188 /// when encoding segmented output.
3189 ///
3190 /// When encoding to UTF-8 or when encoding an ASCII-only input to a
3191 /// ASCII-compatible encoding, this method returns a borrow of the input
3192 /// without a heap allocation. Otherwise, this method performs a single
3193 /// heap allocation for the backing buffer of the `Vec<u8>` if there are no
3194 /// unmappable characters and potentially multiple heap allocations if
3195 /// there are. These allocations are tuned for jemalloc and may not be
3196 /// optimal when using a different allocator that doesn't use power-of-two
3197 /// buckets.
3198 ///
3199 /// # Panics
3200 ///
3201 /// If the size calculation for a heap-allocated backing buffer overflows
3202 /// `usize`.
3203 ///
3204 /// Available to Rust only.
3205 pub fn encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool) {
3206 let output_encoding = self.output_encoding();
3207 if output_encoding == UTF_8 {
3208 return (Cow::Borrowed(string.as_bytes()), output_encoding, false);
3209 }
3210 debug_assert!(output_encoding.is_potentially_borrowable());
3211 let bytes = string.as_bytes();
3212 let valid_up_to = if output_encoding == ISO_2022_JP {
3213 iso_2022_jp_ascii_valid_up_to(bytes)
3214 } else {
3215 ascii_valid_up_to(bytes)
3216 };
3217 if valid_up_to == bytes.len() {
3218 return (Cow::Borrowed(bytes), output_encoding, false);
3219 }
3220 let mut encoder = output_encoding.new_encoder();
3221 let mut vec: Vec<u8> = Vec::with_capacity(
3222 (checked_add(
3223 valid_up_to,
3224 encoder.max_buffer_length_from_utf8_if_no_unmappables(string.len() - valid_up_to),
3225 ))
3226 .unwrap()
3227 .next_power_of_two(),
3228 );
3229 unsafe {
3230 vec.set_len(valid_up_to);
3231 std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3232 }
3233 let mut total_read = valid_up_to;
3234 let mut total_had_errors = false;
3235 loop {
3236 let (result, read, had_errors) =
3237 encoder.encode_from_utf8_to_vec(&string[total_read..], &mut vec, true);
3238 total_read += read;
3239 total_had_errors |= had_errors;
3240 match result {
3241 CoderResult::InputEmpty => {
3242 debug_assert_eq!(total_read, string.len());
3243 return (Cow::Owned(vec), output_encoding, total_had_errors);
3244 }
3245 CoderResult::OutputFull => {
3246 // reserve_exact wants to know how much more on top of current
3247 // length--not current capacity.
3248 let needed = encoder
3249 .max_buffer_length_from_utf8_if_no_unmappables(string.len() - total_read);
3250 let rounded = (checked_add(vec.capacity(), needed))
3251 .unwrap()
3252 .next_power_of_two();
3253 let additional = rounded - vec.len();
3254 vec.reserve_exact(additional);
3255 }
3256 }
3257 }
3258 }
3259
3260 fn new_variant_decoder(&'static self) -> VariantDecoder {
3261 self.variant.new_variant_decoder()
3262 }
3263
3264 /// Instantiates a new decoder for this encoding with BOM sniffing enabled.
3265 ///
3266 /// BOM sniffing may cause the returned decoder to morph into a decoder
3267 /// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
3268 ///
3269 /// Available via the C wrapper.
3270 #[inline]
3271 pub fn new_decoder(&'static self) -> Decoder {
3272 Decoder::new(self, self.new_variant_decoder(), BomHandling::Sniff)
3273 }
3274
3275 /// Instantiates a new decoder for this encoding with BOM removal.
3276 ///
3277 /// If the input starts with bytes that are the BOM for this encoding,
3278 /// those bytes are removed. However, the decoder never morphs into a
3279 /// decoder for another encoding: A BOM for another encoding is treated as
3280 /// (potentially malformed) input to the decoding algorithm for this
3281 /// encoding.
3282 ///
3283 /// Available via the C wrapper.
3284 #[inline]
3285 pub fn new_decoder_with_bom_removal(&'static self) -> Decoder {
3286 Decoder::new(self, self.new_variant_decoder(), BomHandling::Remove)
3287 }
3288
3289 /// Instantiates a new decoder for this encoding with BOM handling disabled.
3290 ///
3291 /// If the input starts with bytes that look like a BOM, those bytes are
3292 /// not treated as a BOM. (Hence, the decoder never morphs into a decoder
3293 /// for another encoding.)
3294 ///
3295 /// _Note:_ If the caller has performed BOM sniffing on its own but has not
3296 /// removed the BOM, the caller should use `new_decoder_with_bom_removal()`
3297 /// instead of this method to cause the BOM to be removed.
3298 ///
3299 /// Available via the C wrapper.
3300 #[inline]
3301 pub fn new_decoder_without_bom_handling(&'static self) -> Decoder {
3302 Decoder::new(self, self.new_variant_decoder(), BomHandling::Off)
3303 }
3304
3305 /// Instantiates a new encoder for the output encoding of this encoding.
3306 ///
3307 /// Available via the C wrapper.
3308 #[inline]
3309 pub fn new_encoder(&'static self) -> Encoder {
3310 let enc = self.output_encoding();
3311 enc.variant.new_encoder(enc)
3312 }
3313
/// Checks the input for UTF-8 validity.
///
/// The return value is the index of the first byte that makes the input
/// malformed as UTF-8, or the length of the slice when the whole slice is
/// valid.
///
/// At present this is faster than the corresponding standard library
/// functionality. Should this implementation be upstreamed to the standard
/// library, this method may be removed in the future.
///
/// Available via the C wrapper.
pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
    // Delegates to the module-level function of the same name.
    let valid_len = utf8_valid_up_to(bytes);
    valid_len
}
3327
/// Checks the input for ASCII validity.
///
/// The return value is the index of the first byte that makes the input
/// malformed as ASCII, or the length of the slice when the whole slice is
/// valid.
///
/// Available via the C wrapper.
pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
    // Delegates to the module-level function of the same name.
    let valid_len = ascii_valid_up_to(bytes);
    valid_len
}
3337
/// Checks the input for being ISO-2022-JP ASCII-state data.
///
/// The return value is the index of the first byte that makes the input
/// not representable in the ASCII state of ISO-2022-JP, or the length of
/// the slice when the whole slice is representable in the ASCII state of
/// ISO-2022-JP.
///
/// Available via the C wrapper.
pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
    // Delegates to the module-level function of the same name.
    let valid_len = iso_2022_jp_ascii_valid_up_to(bytes);
    valid_len
}
3349}
3350
3351impl PartialEq for Encoding {
3352 #[inline]
3353 fn eq(&self, other: &Encoding) -> bool {
3354 (self as *const Encoding) == (other as *const Encoding)
3355 }
3356}
3357
3358impl Eq for Encoding {}
3359
3360impl Hash for Encoding {
3361 #[inline]
3362 fn hash<H: Hasher>(&self, state: &mut H) {
3363 (self as *const Encoding).hash(state);
3364 }
3365}
3366
3367impl std::fmt::Debug for Encoding {
3368 #[inline]
3369 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
3370 write!(f, "Encoding {{ {} }}", self.name)
3371 }
3372}
3373
/// Serializes an encoding as a string holding its name.
#[cfg(feature = "serde")]
impl Serialize for Encoding {
    #[inline]
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let label: &str = self.name;
        serializer.serialize_str(label)
    }
}
3384
/// Serde visitor that turns a string holding an encoding label into the
/// matching `&'static Encoding`.
#[cfg(feature = "serde")]
struct EncodingVisitor;

#[cfg(feature = "serde")]
impl<'de> Visitor<'de> for EncodingVisitor {
    type Value = &'static Encoding;

    /// Describes the expected input for error messages.
    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(formatter, "a valid encoding label")
    }

    /// Looks `value` up with `Encoding::for_label()` and reports a custom
    /// error when the label is not recognized.
    fn visit_str<E>(self, value: &str) -> Result<&'static Encoding, E>
    where
        E: serde::de::Error,
    {
        Encoding::for_label(value.as_bytes())
            .ok_or_else(|| E::custom(format!("invalid encoding label: {}", value)))
    }
}
3407
/// Deserializes an encoding from a string holding an encoding label.
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for &'static Encoding {
    fn deserialize<D>(deserializer: D) -> Result<&'static Encoding, D::Error>
    where
        D: Deserializer<'de>,
    {
        let visitor = EncodingVisitor;
        deserializer.deserialize_str(visitor)
    }
}
3417
/// The stages a decoder goes through, from BOM sniffing via conversion to
/// the end of the stream.
#[derive(Debug, Clone, Copy, PartialEq)]
enum DecoderLifeCycle {
    /// No input has been seen so far.
    AtStart,
    /// No input has been seen so far, but UTF-8 is expected.
    AtUtf8Start,
    /// No input has been seen so far, but UTF-16BE is expected.
    AtUtf16BeStart,
    /// No input has been seen so far, but UTF-16LE is expected.
    AtUtf16LeStart,
    /// The byte EF has been seen.
    SeenUtf8First,
    /// The bytes EF, BB have been seen.
    SeenUtf8Second,
    /// The byte FE has been seen.
    SeenUtf16BeFirst,
    /// The byte FF has been seen.
    SeenUtf16LeFirst,
    /// EF, BB were seen without a following BF, a buffer boundary came
    /// right after BB, and the underlying decoder reported EF as an error.
    /// BB therefore still has to be pushed before the next buffer.
    ConvertingWithPendingBB,
    /// The search for a BOM is over and EOF has not been seen yet.
    Converting,
    /// EOF has been seen.
    Finished,
}
3446
/// Selects how a decoder treats a BOM at the start of its input.
#[derive(Clone, Copy, Debug)]
enum BomHandling {
    /// The BOM gets no special treatment.
    Off,
    /// The input is sniffed for a UTF-8, UTF-16BE or UTF-16LE BOM.
    Sniff,
    /// The BOM is removed, but only when it is the BOM of this encoding.
    Remove,
}
3457
/// Outcome of a (possibly partial) decode or encode operation performed
/// with replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum CoderResult {
    /// All of the input was consumed.
    ///
    /// When the call that returned this had `last` set to `true`, the
    /// conversion process is complete. Otherwise, the caller should call a
    /// decode or encode method again with more input.
    InputEmpty,

    /// The output buffer does not have enough space left for the converter
    /// to produce another unit of output.
    ///
    /// On the next call, the caller must supply more output space and
    /// re-push the remaining input to the converter.
    OutputFull,
}
3477
/// Outcome of a (possibly partial) decode operation performed without
/// replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum DecoderResult {
    /// All of the input was consumed.
    ///
    /// When the call that returned this had `last` set to `true`, the
    /// decoding process is complete. Otherwise, the caller should call a
    /// decode method again with more input.
    InputEmpty,

    /// The output buffer does not have enough space left for the decoder
    /// to produce another unit of output.
    ///
    /// On the next call, the caller must supply more output space and
    /// re-push the remaining input to the decoder.
    OutputFull,

    /// A malformed byte sequence was encountered.
    ///
    /// The caller has two options: treat this as a fatal error, or append
    /// one REPLACEMENT CHARACTER (U+FFFD) to the output and then re-push
    /// the remaining input to the decoder.
    ///
    /// The first wrapped integer is the length of the malformed byte
    /// sequence. The second wrapped integer is the number of bytes that
    /// were consumed after the malformed sequence; when it is zero, the
    /// last consumed byte is the last byte of the malformed sequence. Note
    /// that the malformed bytes may have been part of an earlier input
    /// buffer.
    ///
    /// The first wrapped integer can have values 1, 2, 3 or 4. The second
    /// wrapped integer can have values 0, 1, 2 or 3. The worst-case sum
    /// of the two is 6, which happens with ISO-2022-JP.
    Malformed(u8, u8), // u8 instead of usize to avoid useless bloat
}
3514
3515/// A converter that decodes a byte stream into Unicode according to a
3516/// character encoding in a streaming (incremental) manner.
3517///
3518/// The various `decode_*` methods take an input buffer (`src`) and an output
3519/// buffer `dst` both of which are caller-allocated. There are variants for
3520/// both UTF-8 and UTF-16 output buffers.
3521///
3522/// A `decode_*` method decodes bytes from `src` into Unicode characters stored
3523/// into `dst` until one of the following three things happens:
3524///
3525/// 1. A malformed byte sequence is encountered (`*_without_replacement`
3526/// variants only).
3527///
3528/// 2. The output buffer has been filled so near capacity that the decoder
3529/// cannot be sure that processing an additional byte of input wouldn't
3530/// cause so much output that the output buffer would overflow.
3531///
3532/// 3. All the input bytes have been processed.
3533///
/// The `decode_*` method then returns a tuple of a status indicating which one
3535/// of the three reasons to return happened, how many input bytes were read,
3536/// how many output code units (`u8` when decoding into UTF-8 and `u16`
3537/// when decoding to UTF-16) were written (except when decoding into `String`,
3538/// whose length change indicates this), and in the case of the
3539/// variants performing replacement, a boolean indicating whether an error was
3540/// replaced with the REPLACEMENT CHARACTER during the call.
3541///
3542/// The number of bytes "written" is what's logically written. Garbage may be
3543/// written in the output buffer beyond the point logically written to.
3544/// Therefore, if you wish to decode into an `&mut str`, you should use the
3545/// methods that take an `&mut str` argument instead of the ones that take an
3546/// `&mut [u8]` argument. The former take care of overwriting the trailing
3547/// garbage to ensure the UTF-8 validity of the `&mut str` as a whole, but the
3548/// latter don't.
3549///
3550/// In the case of the `*_without_replacement` variants, the status is a
3551/// [`DecoderResult`][1] enumeration (possibilities `Malformed`, `OutputFull` and
3552/// `InputEmpty` corresponding to the three cases listed above).
3553///
3554/// In the case of methods whose name does not end with
3555/// `*_without_replacement`, malformed sequences are automatically replaced
3556/// with the REPLACEMENT CHARACTER and errors do not cause the methods to
3557/// return early.
3558///
3559/// When decoding to UTF-8, the output buffer must have at least 4 bytes of
3560/// space. When decoding to UTF-16, the output buffer must have at least two
3561/// UTF-16 code units (`u16`) of space.
3562///
3563/// When decoding to UTF-8 without replacement, the methods are guaranteed
3564/// not to return indicating that more output space is needed if the length
3565/// of the output buffer is at least the length returned by
3566/// [`max_utf8_buffer_length_without_replacement()`][2]. When decoding to UTF-8
3567/// with replacement, the length of the output buffer that guarantees the
3568/// methods not to return indicating that more output space is needed is given
3569/// by [`max_utf8_buffer_length()`][3]. When decoding to UTF-16 with
3570/// or without replacement, the length of the output buffer that guarantees
3571/// the methods not to return indicating that more output space is needed is
3572/// given by [`max_utf16_buffer_length()`][4].
3573///
3574/// The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
3575/// and the output after each `decode_*` call is guaranteed to consist of
3576/// complete characters. (I.e. the code unit sequence for the last character is
3577/// guaranteed not to be split across output buffers.)
3578///
3579/// The boolean argument `last` indicates that the end of the stream is reached
3580/// when all the bytes in `src` have been consumed.
3581///
3582/// A `Decoder` object can be used to incrementally decode a byte stream.
3583///
3584/// During the processing of a single stream, the caller must call `decode_*`
3585/// zero or more times with `last` set to `false` and then call `decode_*` at
3586/// least once with `last` set to `true`. If `decode_*` returns `InputEmpty`,
3587/// the processing of the stream has ended. Otherwise, the caller must call
3588/// `decode_*` again with `last` set to `true` (or treat a `Malformed` result as
3589/// a fatal error).
3590///
3591/// Once the stream has ended, the `Decoder` object must not be used anymore.
3592/// That is, you need to create another one to process another stream.
3593///
3594/// When the decoder returns `OutputFull` or the decoder returns `Malformed` and
3595/// the caller does not wish to treat it as a fatal error, the input buffer
3596/// `src` may not have been completely consumed. In that case, the caller must
3597/// pass the unconsumed contents of `src` to `decode_*` again upon the next
3598/// call.
3599///
3600/// [1]: enum.DecoderResult.html
3601/// [2]: #method.max_utf8_buffer_length_without_replacement
3602/// [3]: #method.max_utf8_buffer_length
3603/// [4]: #method.max_utf16_buffer_length
3604///
3605/// # Infinite loops
3606///
3607/// When converting with a fixed-size output buffer whose size is too small to
3608/// accommodate one character or (when applicable) one numeric character
3609/// reference of output, an infinite loop ensues. When converting with a
3610/// fixed-size output buffer, it generally makes sense to make the buffer
/// fairly large (e.g. a couple of kilobytes).
3612pub struct Decoder {
3613 encoding: &'static Encoding,
3614 variant: VariantDecoder,
3615 life_cycle: DecoderLifeCycle,
3616}
3617
3618impl Decoder {
3619 fn new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder {
3620 Decoder {
3621 encoding: enc,
3622 variant: decoder,
3623 life_cycle: match sniffing {
3624 BomHandling::Off => DecoderLifeCycle::Converting,
3625 BomHandling::Sniff => DecoderLifeCycle::AtStart,
3626 BomHandling::Remove => {
3627 if enc == UTF_8 {
3628 DecoderLifeCycle::AtUtf8Start
3629 } else if enc == UTF_16BE {
3630 DecoderLifeCycle::AtUtf16BeStart
3631 } else if enc == UTF_16LE {
3632 DecoderLifeCycle::AtUtf16LeStart
3633 } else {
3634 DecoderLifeCycle::Converting
3635 }
3636 }
3637 },
3638 }
3639 }
3640
3641 /// The `Encoding` this `Decoder` is for.
3642 ///
3643 /// BOM sniffing can change the return value of this method during the life
3644 /// of the decoder.
3645 ///
3646 /// Available via the C wrapper.
3647 #[inline]
3648 pub fn encoding(&self) -> &'static Encoding {
3649 self.encoding
3650 }
3651
3652 /// Query the worst-case UTF-8 output size _with replacement_.
3653 ///
3654 /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3655 /// that will not overflow given the current state of the decoder and
3656 /// `byte_length` number of additional input bytes when decoding with
3657 /// errors handled by outputting a REPLACEMENT CHARACTER for each malformed
3658 /// sequence or `None` if `usize` would overflow.
3659 ///
3660 /// Available via the C wrapper.
3661 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
3662 // Need to consider a) the decoder morphing due to the BOM and b) a partial
3663 // BOM getting pushed to the underlying decoder.
3664 match self.life_cycle {
3665 DecoderLifeCycle::Converting
3666 | DecoderLifeCycle::AtUtf8Start
3667 | DecoderLifeCycle::AtUtf16LeStart
3668 | DecoderLifeCycle::AtUtf16BeStart => {
3669 return self.variant.max_utf8_buffer_length(byte_length);
3670 }
3671 DecoderLifeCycle::AtStart => {
3672 if let Some(utf8_bom) = checked_add(3, byte_length.checked_mul(3)) {
3673 if let Some(utf16_bom) = checked_add(
3674 1,
3675 checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3676 ) {
3677 let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
3678 let encoding = self.encoding();
3679 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3680 // No need to consider the internal state of the underlying decoder,
3681 // because it is at start, because no data has reached it yet.
3682 return Some(utf_bom);
3683 } else if let Some(non_bom) =
3684 self.variant.max_utf8_buffer_length(byte_length)
3685 {
3686 return Some(std::cmp::max(utf_bom, non_bom));
3687 }
3688 }
3689 }
3690 }
3691 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3692 // Add two bytes even when only one byte has been seen,
3693 // because the one byte can become a lead byte in multibyte
3694 // decoders, but only after the decoder has been queried
3695 // for max length, so the decoder's own logic for adding
3696 // one for a pending lead cannot work.
3697 if let Some(sum) = byte_length.checked_add(2) {
3698 if let Some(utf8_bom) = checked_add(3, sum.checked_mul(3)) {
3699 if self.encoding() == UTF_8 {
3700 // No need to consider the internal state of the underlying decoder,
3701 // because it is at start, because no data has reached it yet.
3702 return Some(utf8_bom);
3703 } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3704 return Some(std::cmp::max(utf8_bom, non_bom));
3705 }
3706 }
3707 }
3708 }
3709 DecoderLifeCycle::ConvertingWithPendingBB => {
3710 if let Some(sum) = byte_length.checked_add(2) {
3711 return self.variant.max_utf8_buffer_length(sum);
3712 }
3713 }
3714 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3715 // Add two bytes even when only one byte has been seen,
3716 // because the one byte can become a lead byte in multibyte
3717 // decoders, but only after the decoder has been queried
3718 // for max length, so the decoder's own logic for adding
3719 // one for a pending lead cannot work.
3720 if let Some(sum) = byte_length.checked_add(2) {
3721 if let Some(utf16_bom) =
3722 checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3723 {
3724 let encoding = self.encoding();
3725 if encoding == UTF_16LE || encoding == UTF_16BE {
3726 // No need to consider the internal state of the underlying decoder,
3727 // because it is at start, because no data has reached it yet.
3728 return Some(utf16_bom);
3729 } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3730 return Some(std::cmp::max(utf16_bom, non_bom));
3731 }
3732 }
3733 }
3734 }
3735 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3736 }
3737 None
3738 }
3739
3740 /// Query the worst-case UTF-8 output size _without replacement_.
3741 ///
3742 /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3743 /// that will not overflow given the current state of the decoder and
3744 /// `byte_length` number of additional input bytes when decoding without
3745 /// replacement error handling or `None` if `usize` would overflow.
3746 ///
3747 /// Note that this value may be too small for the `_with_replacement` case.
3748 /// Use `max_utf8_buffer_length()` for that case.
3749 ///
3750 /// Available via the C wrapper.
3751 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
3752 // Need to consider a) the decoder morphing due to the BOM and b) a partial
3753 // BOM getting pushed to the underlying decoder.
3754 match self.life_cycle {
3755 DecoderLifeCycle::Converting
3756 | DecoderLifeCycle::AtUtf8Start
3757 | DecoderLifeCycle::AtUtf16LeStart
3758 | DecoderLifeCycle::AtUtf16BeStart => {
3759 return self
3760 .variant
3761 .max_utf8_buffer_length_without_replacement(byte_length);
3762 }
3763 DecoderLifeCycle::AtStart => {
3764 if let Some(utf8_bom) = byte_length.checked_add(3) {
3765 if let Some(utf16_bom) = checked_add(
3766 1,
3767 checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3768 ) {
3769 let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
3770 let encoding = self.encoding();
3771 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3772 // No need to consider the internal state of the underlying decoder,
3773 // because it is at start, because no data has reached it yet.
3774 return Some(utf_bom);
3775 } else if let Some(non_bom) = self
3776 .variant
3777 .max_utf8_buffer_length_without_replacement(byte_length)
3778 {
3779 return Some(std::cmp::max(utf_bom, non_bom));
3780 }
3781 }
3782 }
3783 }
3784 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3785 // Add two bytes even when only one byte has been seen,
3786 // because the one byte can become a lead byte in multibyte
3787 // decoders, but only after the decoder has been queried
3788 // for max length, so the decoder's own logic for adding
3789 // one for a pending lead cannot work.
3790 if let Some(sum) = byte_length.checked_add(2) {
3791 if let Some(utf8_bom) = sum.checked_add(3) {
3792 if self.encoding() == UTF_8 {
3793 // No need to consider the internal state of the underlying decoder,
3794 // because it is at start, because no data has reached it yet.
3795 return Some(utf8_bom);
3796 } else if let Some(non_bom) =
3797 self.variant.max_utf8_buffer_length_without_replacement(sum)
3798 {
3799 return Some(std::cmp::max(utf8_bom, non_bom));
3800 }
3801 }
3802 }
3803 }
3804 DecoderLifeCycle::ConvertingWithPendingBB => {
3805 if let Some(sum) = byte_length.checked_add(2) {
3806 return self.variant.max_utf8_buffer_length_without_replacement(sum);
3807 }
3808 }
3809 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3810 // Add two bytes even when only one byte has been seen,
3811 // because the one byte can become a lead byte in multibyte
3812 // decoders, but only after the decoder has been queried
3813 // for max length, so the decoder's own logic for adding
3814 // one for a pending lead cannot work.
3815 if let Some(sum) = byte_length.checked_add(2) {
3816 if let Some(utf16_bom) =
3817 checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3818 {
3819 let encoding = self.encoding();
3820 if encoding == UTF_16LE || encoding == UTF_16BE {
3821 // No need to consider the internal state of the underlying decoder,
3822 // because it is at start, because no data has reached it yet.
3823 return Some(utf16_bom);
3824 } else if let Some(non_bom) =
3825 self.variant.max_utf8_buffer_length_without_replacement(sum)
3826 {
3827 return Some(std::cmp::max(utf16_bom, non_bom));
3828 }
3829 }
3830 }
3831 }
3832 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3833 }
3834 None
3835 }
3836
3837 /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3838 /// replaced with the REPLACEMENT CHARACTER.
3839 ///
3840 /// See the documentation of the struct for documentation for `decode_*`
3841 /// methods collectively.
3842 ///
3843 /// Available via the C wrapper.
3844 pub fn decode_to_utf8(
3845 &mut self,
3846 src: &[u8],
3847 dst: &mut [u8],
3848 last: bool,
3849 ) -> (CoderResult, usize, usize, bool) {
3850 let mut had_errors = false;
3851 let mut total_read = 0usize;
3852 let mut total_written = 0usize;
3853 loop {
3854 let (result, read, written) = self.decode_to_utf8_without_replacement(
3855 &src[total_read..],
3856 &mut dst[total_written..],
3857 last,
3858 );
3859 total_read += read;
3860 total_written += written;
3861 match result {
3862 DecoderResult::InputEmpty => {
3863 return (
3864 CoderResult::InputEmpty,
3865 total_read,
3866 total_written,
3867 had_errors,
3868 );
3869 }
3870 DecoderResult::OutputFull => {
3871 return (
3872 CoderResult::OutputFull,
3873 total_read,
3874 total_written,
3875 had_errors,
3876 );
3877 }
3878 DecoderResult::Malformed(_, _) => {
3879 had_errors = true;
3880 // There should always be space for the U+FFFD, because
3881 // otherwise we'd have gotten OutputFull already.
3882 // XXX: is the above comment actually true for UTF-8 itself?
3883 // TODO: Consider having fewer bound checks here.
3884 dst[total_written] = 0xEFu8;
3885 total_written += 1;
3886 dst[total_written] = 0xBFu8;
3887 total_written += 1;
3888 dst[total_written] = 0xBDu8;
3889 total_written += 1;
3890 }
3891 }
3892 }
3893 }
3894
3895 /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3896 /// replaced with the REPLACEMENT CHARACTER with type system signaling
3897 /// of UTF-8 validity.
3898 ///
3899 /// This methods calls `decode_to_utf8` and then zeroes
3900 /// out up to three bytes that aren't logically part of the write in order
3901 /// to retain the UTF-8 validity even for the unwritten part of the buffer.
3902 ///
3903 /// See the documentation of the struct for documentation for `decode_*`
3904 /// methods collectively.
3905 ///
3906 /// Available to Rust only.
3907 pub fn decode_to_str(
3908 &mut self,
3909 src: &[u8],
3910 dst: &mut str,
3911 last: bool,
3912 ) -> (CoderResult, usize, usize, bool) {
3913 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
3914 let (result, read, written, replaced) = self.decode_to_utf8(src, bytes, last);
3915 let len = bytes.len();
3916 let mut trail = written;
3917 // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
3918 // bytes of trailing garbage. No need to optimize non-ASCII-compatible
3919 // encodings to avoid overwriting here.
3920 if self.encoding != UTF_8 {
3921 let max = std::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
3922 while trail < max {
3923 bytes[trail] = 0;
3924 trail += 1;
3925 }
3926 }
3927 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
3928 bytes[trail] = 0;
3929 trail += 1;
3930 }
3931 (result, read, written, replaced)
3932 }
3933
/// Incrementally decode a byte stream into UTF-8 with malformed sequences
/// replaced with the REPLACEMENT CHARACTER using a `String` receiver.
///
/// Like the others, this method follows the logic that the output buffer is
/// caller-allocated. This method treats the capacity of the `String` as
/// the output limit. That is, this method guarantees not to cause a
/// reallocation of the backing buffer of `String`.
///
/// The return value is a tuple that contains the `CoderResult`, the
/// number of bytes read and a boolean indicating whether replacements
/// were done. The number of bytes written is signaled via the length of
/// the `String` changing.
///
/// See the documentation of the struct for documentation for `decode_*`
/// methods collectively.
///
/// Available to Rust only.
pub fn decode_to_string(
    &mut self,
    src: &[u8],
    dst: &mut String,
    last: bool,
) -> (CoderResult, usize, bool) {
    unsafe {
        // Work on the raw byte vector behind the `String`.
        let vec = dst.as_mut_vec();
        let old_len = vec.len();
        let capacity = vec.capacity();
        // Temporarily expose the whole allocation (existing contents plus
        // spare capacity) so that the spare part can be handed out as the
        // output slice without reallocating.
        vec.set_len(capacity);
        // Only the region after the pre-existing contents is offered as
        // output space.
        let (result, read, written, replaced) =
            self.decode_to_utf8(src, &mut vec[old_len..], last);
        // Shrink back so that the `String` ends right after the bytes that
        // were logically written.
        // NOTE(review): if `decode_to_utf8` panics, the length stays at
        // `capacity` — confirm that this is acceptable.
        vec.set_len(old_len + written);
        (result, read, replaced)
    }
}
3968
// Generates `decode_to_utf8_without_replacement`, which `decode_to_utf8()`
// calls with `(src, dst, last)` and which returns
// `(DecoderResult, usize, usize)`.
// NOTE(review): the macro definition is not visible here; presumably the
// remaining identifiers name the helper methods used for the individual BOM
// handling stages and `u8` is the output code unit type — confirm against
// `public_decode_function!`.
public_decode_function!(/// Incrementally decode a byte stream into UTF-8
                        /// _without replacement_.
                        ///
                        /// See the documentation of the struct for
                        /// documentation for `decode_*` methods
                        /// collectively.
                        ///
                        /// Available via the C wrapper.
                        ,
                        decode_to_utf8_without_replacement,
                        decode_to_utf8_raw,
                        decode_to_utf8_checking_end,
                        decode_to_utf8_after_one_potential_bom_byte,
                        decode_to_utf8_after_two_potential_bom_bytes,
                        decode_to_utf8_checking_end_with_offset,
                        u8);
3985
3986 /// Incrementally decode a byte stream into UTF-8 with type system signaling
3987 /// of UTF-8 validity.
3988 ///
3989 /// This methods calls `decode_to_utf8` and then zeroes out up to three
3990 /// bytes that aren't logically part of the write in order to retain the
3991 /// UTF-8 validity even for the unwritten part of the buffer.
3992 ///
3993 /// See the documentation of the struct for documentation for `decode_*`
3994 /// methods collectively.
3995 ///
3996 /// Available to Rust only.
3997 pub fn decode_to_str_without_replacement(
3998 &mut self,
3999 src: &[u8],
4000 dst: &mut str,
4001 last: bool,
4002 ) -> (DecoderResult, usize, usize) {
4003 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4004 let (result, read, written) = self.decode_to_utf8_without_replacement(src, bytes, last);
4005 let len = bytes.len();
4006 let mut trail = written;
4007 // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4008 // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4009 // encodings to avoid overwriting here.
4010 if self.encoding != UTF_8 {
4011 let max = std::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4012 while trail < max {
4013 bytes[trail] = 0;
4014 trail += 1;
4015 }
4016 }
4017 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4018 bytes[trail] = 0;
4019 trail += 1;
4020 }
4021 (result, read, written)
4022 }
4023
/// Incrementally decode a byte stream into UTF-8 using a `String` receiver.
///
/// Like the others, this method follows the logic that the output buffer is
/// caller-allocated. This method treats the capacity of the `String` as
/// the output limit. That is, this method guarantees not to cause a
/// reallocation of the backing buffer of `String`.
///
/// The return value is a pair that contains the `DecoderResult` and the
/// number of bytes read. The number of bytes written is signaled via
/// the length of the `String` changing.
///
/// See the documentation of the struct for documentation for `decode_*`
/// methods collectively.
///
/// Available to Rust only.
pub fn decode_to_string_without_replacement(
    &mut self,
    src: &[u8],
    dst: &mut String,
    last: bool,
) -> (DecoderResult, usize) {
    unsafe {
        // Work on the raw byte vector behind the `String`.
        let vec = dst.as_mut_vec();
        let old_len = vec.len();
        let capacity = vec.capacity();
        // Temporarily expose the whole allocation (existing contents plus
        // spare capacity) so that the spare part can be handed out as the
        // output slice without reallocating.
        vec.set_len(capacity);
        // Only the region after the pre-existing contents is offered as
        // output space.
        let (result, read, written) =
            self.decode_to_utf8_without_replacement(src, &mut vec[old_len..], last);
        // Shrink back so that the `String` ends right after the bytes that
        // were logically written.
        // NOTE(review): if the decode call panics, the length stays at
        // `capacity` — confirm that this is acceptable.
        vec.set_len(old_len + written);
        (result, read)
    }
}
4056
4057 /// Query the worst-case UTF-16 output size (with or without replacement).
4058 ///
4059 /// Returns the size of the output buffer in UTF-16 code units (`u16`)
4060 /// that will not overflow given the current state of the decoder and
4061 /// `byte_length` number of additional input bytes or `None` if `usize`
4062 /// would overflow.
4063 ///
4064 /// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
4065 /// return value of this method applies also in the
4066 /// `_without_replacement` case.
4067 ///
4068 /// Available via the C wrapper.
4069 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
4070 // Need to consider a) the decoder morphing due to the BOM and b) a partial
4071 // BOM getting pushed to the underlying decoder.
4072 match self.life_cycle {
4073 DecoderLifeCycle::Converting
4074 | DecoderLifeCycle::AtUtf8Start
4075 | DecoderLifeCycle::AtUtf16LeStart
4076 | DecoderLifeCycle::AtUtf16BeStart => {
4077 return self.variant.max_utf16_buffer_length(byte_length);
4078 }
4079 DecoderLifeCycle::AtStart => {
4080 if let Some(utf8_bom) = byte_length.checked_add(1) {
4081 if let Some(utf16_bom) =
4082 checked_add(1, checked_div(byte_length.checked_add(1), 2))
4083 {
4084 let utf_bom = std::cmp::max(utf8_bom, utf16_bom);
4085 let encoding = self.encoding();
4086 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
4087 // No need to consider the internal state of the underlying decoder,
4088 // because it is at start, because no data has reached it yet.
4089 return Some(utf_bom);
4090 } else if let Some(non_bom) =
4091 self.variant.max_utf16_buffer_length(byte_length)
4092 {
4093 return Some(std::cmp::max(utf_bom, non_bom));
4094 }
4095 }
4096 }
4097 }
4098 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
4099 // Add two bytes even when only one byte has been seen,
4100 // because the one byte can become a lead byte in multibyte
4101 // decoders, but only after the decoder has been queried
4102 // for max length, so the decoder's own logic for adding
4103 // one for a pending lead cannot work.
4104 if let Some(sum) = byte_length.checked_add(2) {
4105 if let Some(utf8_bom) = sum.checked_add(1) {
4106 if self.encoding() == UTF_8 {
4107 // No need to consider the internal state of the underlying decoder,
4108 // because it is at start, because no data has reached it yet.
4109 return Some(utf8_bom);
4110 } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4111 return Some(std::cmp::max(utf8_bom, non_bom));
4112 }
4113 }
4114 }
4115 }
4116 DecoderLifeCycle::ConvertingWithPendingBB => {
4117 if let Some(sum) = byte_length.checked_add(2) {
4118 return self.variant.max_utf16_buffer_length(sum);
4119 }
4120 }
4121 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
4122 // Add two bytes even when only one byte has been seen,
4123 // because the one byte can become a lead byte in multibyte
4124 // decoders, but only after the decoder has been queried
4125 // for max length, so the decoder's own logic for adding
4126 // one for a pending lead cannot work.
4127 if let Some(sum) = byte_length.checked_add(2) {
4128 if let Some(utf16_bom) = checked_add(1, checked_div(sum.checked_add(1), 2)) {
4129 let encoding = self.encoding();
4130 if encoding == UTF_16LE || encoding == UTF_16BE {
4131 // No need to consider the internal state of the underlying decoder,
4132 // because it is at start, because no data has reached it yet.
4133 return Some(utf16_bom);
4134 } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4135 return Some(std::cmp::max(utf16_bom, non_bom));
4136 }
4137 }
4138 }
4139 }
4140 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4141 }
4142 None
4143 }
4144
4145 /// Incrementally decode a byte stream into UTF-16 with malformed sequences
4146 /// replaced with the REPLACEMENT CHARACTER.
4147 ///
4148 /// See the documentation of the struct for documentation for `decode_*`
4149 /// methods collectively.
4150 ///
4151 /// Available via the C wrapper.
4152 pub fn decode_to_utf16(
4153 &mut self,
4154 src: &[u8],
4155 dst: &mut [u16],
4156 last: bool,
4157 ) -> (CoderResult, usize, usize, bool) {
4158 let mut had_errors = false;
4159 let mut total_read = 0usize;
4160 let mut total_written = 0usize;
4161 loop {
4162 let (result, read, written) = self.decode_to_utf16_without_replacement(
4163 &src[total_read..],
4164 &mut dst[total_written..],
4165 last,
4166 );
4167 total_read += read;
4168 total_written += written;
4169 match result {
4170 DecoderResult::InputEmpty => {
4171 return (
4172 CoderResult::InputEmpty,
4173 total_read,
4174 total_written,
4175 had_errors,
4176 );
4177 }
4178 DecoderResult::OutputFull => {
4179 return (
4180 CoderResult::OutputFull,
4181 total_read,
4182 total_written,
4183 had_errors,
4184 );
4185 }
4186 DecoderResult::Malformed(_, _) => {
4187 had_errors = true;
4188 // There should always be space for the U+FFFD, because
4189 // otherwise we'd have gotten OutputFull already.
4190 dst[total_written] = 0xFFFD;
4191 total_written += 1;
4192 }
4193 }
4194 }
4195 }
4196
// Generates `decode_to_utf16_without_replacement`, which `decode_to_utf16()`
// calls with `(src, dst, last)` and which returns
// `(DecoderResult, usize, usize)`.
// NOTE(review): the macro definition is not visible here; presumably the
// remaining identifiers name the helper methods used for the individual BOM
// handling stages and `u16` is the output code unit type — confirm against
// `public_decode_function!`.
public_decode_function!(/// Incrementally decode a byte stream into UTF-16
                        /// _without replacement_.
                        ///
                        /// See the documentation of the struct for
                        /// documentation for `decode_*` methods
                        /// collectively.
                        ///
                        /// Available via the C wrapper.
                        ,
                        decode_to_utf16_without_replacement,
                        decode_to_utf16_raw,
                        decode_to_utf16_checking_end,
                        decode_to_utf16_after_one_potential_bom_byte,
                        decode_to_utf16_after_two_potential_bom_bytes,
                        decode_to_utf16_checking_end_with_offset,
                        u16);
4213
4214 /// Checks for compatibility with storing Unicode scalar values as unsigned
4215 /// bytes taking into account the state of the decoder.
4216 ///
4217 /// Returns `None` if the decoder is not in a neutral state, including waiting
4218 /// for the BOM, or if the encoding is never Latin1-byte-compatible.
4219 ///
4220 /// Otherwise returns the index of the first byte whose unsigned value doesn't
4221 /// directly correspond to the decoded Unicode scalar value, or the length
4222 /// of the input if all bytes in the input decode directly to scalar values
4223 /// corresponding to the unsigned byte values.
4224 ///
4225 /// Does not change the state of the decoder.
4226 ///
4227 /// Do not use this unless you are supporting SpiderMonkey/V8-style string
4228 /// storage optimizations.
4229 ///
4230 /// Available via the C wrapper.
4231 pub fn latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize> {
4232 match self.life_cycle {
4233 DecoderLifeCycle::Converting => {
4234 return self.variant.latin1_byte_compatible_up_to(bytes);
4235 }
4236 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4237 _ => None,
4238 }
4239 }
4240}
4241
/// Result of a (potentially partial) encode operation without replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum EncoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// encoding process has completed. Otherwise, the caller should call an
    /// encode method again with more input.
    InputEmpty,

    /// The encoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the encoder.
    OutputFull,

    /// The encoder encountered an unmappable character.
    ///
    /// The caller must either treat this as a fatal error or must append
    /// a placeholder to the output and then re-push the remaining input to the
    /// encoder.
    Unmappable(char),
}

impl EncoderResult {
    /// Builds an `Unmappable` result from a UTF-16 code unit denoting a
    /// Basic Multilingual Plane character.
    ///
    /// Panics if `bmp` is not a Unicode scalar value, i.e. if it is a
    /// surrogate.
    fn unmappable_from_bmp(bmp: u16) -> EncoderResult {
        let code_point = u32::from(bmp);
        let scalar = ::std::char::from_u32(code_point).unwrap();
        EncoderResult::Unmappable(scalar)
    }
}
4273
4274/// A converter that encodes a Unicode stream into bytes according to a
4275/// character encoding in a streaming (incremental) manner.
4276///
4277/// The various `encode_*` methods take an input buffer (`src`) and an output
4278/// buffer `dst` both of which are caller-allocated. There are variants for
4279/// both UTF-8 and UTF-16 input buffers.
4280///
/// An `encode_*` method encodes characters from `src` into bytes stored into
/// `dst` until one of the following three things happens:
4283///
4284/// 1. An unmappable character is encountered (`*_without_replacement` variants
4285/// only).
4286///
/// 2. The output buffer has been filled so near capacity that the encoder
4288/// cannot be sure that processing an additional character of input wouldn't
4289/// cause so much output that the output buffer would overflow.
4290///
4291/// 3. All the input characters have been processed.
4292///
/// The `encode_*` method then returns a tuple of a status indicating which one
4294/// of the three reasons to return happened, how many input code units (`u8`
4295/// when encoding from UTF-8 and `u16` when encoding from UTF-16) were read,
4296/// how many output bytes were written (except when encoding into `Vec<u8>`,
4297/// whose length change indicates this), and in the case of the variants that
4298/// perform replacement, a boolean indicating whether an unmappable
4299/// character was replaced with a numeric character reference during the call.
4300///
4301/// The number of bytes "written" is what's logically written. Garbage may be
4302/// written in the output buffer beyond the point logically written to.
4303///
4304/// In the case of the methods whose name ends with
4305/// `*_without_replacement`, the status is an [`EncoderResult`][1] enumeration
4306/// (possibilities `Unmappable`, `OutputFull` and `InputEmpty` corresponding to
4307/// the three cases listed above).
4308///
4309/// In the case of methods whose name does not end with
4310/// `*_without_replacement`, unmappable characters are automatically replaced
4311/// with the corresponding numeric character references and unmappable
4312/// characters do not cause the methods to return early.
4313///
4314/// When encoding from UTF-8 without replacement, the methods are guaranteed
4315/// not to return indicating that more output space is needed if the length
4316/// of the output buffer is at least the length returned by
4317/// [`max_buffer_length_from_utf8_without_replacement()`][2]. When encoding from
4318/// UTF-8 with replacement, the length of the output buffer that guarantees the
4319/// methods not to return indicating that more output space is needed in the
4320/// absence of unmappable characters is given by
4321/// [`max_buffer_length_from_utf8_if_no_unmappables()`][3]. When encoding from
4322/// UTF-16 without replacement, the methods are guaranteed not to return
4323/// indicating that more output space is needed if the length of the output
4324/// buffer is at least the length returned by
4325/// [`max_buffer_length_from_utf16_without_replacement()`][4]. When encoding
/// from UTF-16 with replacement, the length of the output buffer that
4327/// guarantees the methods not to return indicating that more output space is
4328/// needed in the absence of unmappable characters is given by
4329/// [`max_buffer_length_from_utf16_if_no_unmappables()`][5].
4330/// When encoding with replacement, applications are not expected to size the
4331/// buffer for the worst case ahead of time but to resize the buffer if there
4332/// are unmappable characters. This is why max length queries are only available
4333/// for the case where there are no unmappable characters.
4334///
4335/// When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. (When
4336/// calling from Rust, the type system takes care of this.) When encoding from
4337/// UTF-16, unpaired surrogates in the input are treated as U+FFFD REPLACEMENT
4338/// CHARACTERS. Therefore, in order for astral characters not to turn into a
4339/// pair of REPLACEMENT CHARACTERS, the caller must ensure that surrogate pairs
4340/// are not split across input buffer boundaries.
4341///
4342/// After an `encode_*` call returns, the output produced so far, taken as a
4343/// whole from the start of the stream, is guaranteed to consist of a valid
4344/// byte sequence in the target encoding. (I.e. the code unit sequence for a
4345/// character is guaranteed not to be split across output buffers. However, due
4346/// to the stateful nature of ISO-2022-JP, the stream needs to be considered
4347/// from the start for it to be valid. For other encodings, the validity holds
4348/// on a per-output buffer basis.)
4349///
4350/// The boolean argument `last` indicates that the end of the stream is reached
4351/// when all the characters in `src` have been consumed. This argument is needed
4352/// for ISO-2022-JP and is ignored for other encodings.
4353///
/// An `Encoder` object can be used to incrementally encode into a byte stream.
4355///
4356/// During the processing of a single stream, the caller must call `encode_*`
4357/// zero or more times with `last` set to `false` and then call `encode_*` at
4358/// least once with `last` set to `true`. If `encode_*` returns `InputEmpty`,
4359/// the processing of the stream has ended. Otherwise, the caller must call
4360/// `encode_*` again with `last` set to `true` (or treat an `Unmappable` result
4361/// as a fatal error).
4362///
4363/// Once the stream has ended, the `Encoder` object must not be used anymore.
4364/// That is, you need to create another one to process another stream.
4365///
4366/// When the encoder returns `OutputFull` or the encoder returns `Unmappable`
4367/// and the caller does not wish to treat it as a fatal error, the input buffer
4368/// `src` may not have been completely consumed. In that case, the caller must
4369/// pass the unconsumed contents of `src` to `encode_*` again upon the next
4370/// call.
4371///
4372/// [1]: enum.EncoderResult.html
4373/// [2]: #method.max_buffer_length_from_utf8_without_replacement
4374/// [3]: #method.max_buffer_length_from_utf8_if_no_unmappables
4375/// [4]: #method.max_buffer_length_from_utf16_without_replacement
4376/// [5]: #method.max_buffer_length_from_utf16_if_no_unmappables
4377///
4378/// # Infinite loops
4379///
4380/// When converting with a fixed-size output buffer whose size is too small to
4381/// accommodate one character of output, an infinite loop ensues. When
4382/// converting with a fixed-size output buffer, it generally makes sense to
/// make the buffer fairly large (e.g. a couple of kilobytes).
pub struct Encoder {
    // The encoding reported by `encoding()`.
    encoding: &'static Encoding,
    // The encoding-specific encoder that `has_pending_state()` and the
    // buffer length queries delegate to.
    variant: VariantEncoder,
}
4388
4389impl Encoder {
/// Creates an `Encoder` for `enc` that wraps the encoding-specific
/// `encoder`.
fn new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder {
    Encoder {
        encoding: enc,
        variant: encoder,
    }
}
4396
/// The `Encoding` this `Encoder` is for.
///
/// Returns the value of the `encoding` field that was set when the
/// encoder was constructed.
#[inline]
pub fn encoding(&self) -> &'static Encoding {
    self.encoding
}
4402
/// Returns `true` if this is an ISO-2022-JP encoder that's not in the
/// ASCII state and `false` otherwise.
///
/// The answer comes from the underlying encoding-specific encoder.
#[inline]
pub fn has_pending_state(&self) -> bool {
    // Pure delegation; the state lives in the variant encoder.
    self.variant.has_pending_state()
}
4409
4410 /// Query the worst-case output size when encoding from UTF-8 with
4411 /// replacement.
4412 ///
4413 /// Returns the size of the output buffer in bytes that will not overflow
4414 /// given the current state of the encoder and `byte_length` number of
4415 /// additional input code units if there are no unmappable characters in
4416 /// the input or `None` if `usize` would overflow.
4417 ///
4418 /// Available via the C wrapper.
4419 pub fn max_buffer_length_from_utf8_if_no_unmappables(
4420 &self,
4421 byte_length: usize,
4422 ) -> Option<usize> {
4423 checked_add(
4424 if self.encoding().can_encode_everything() {
4425 0
4426 } else {
4427 NCR_EXTRA
4428 },
4429 self.max_buffer_length_from_utf8_without_replacement(byte_length),
4430 )
4431 }
4432
/// Query the worst-case output size when encoding from UTF-8 without
/// replacement.
///
/// Returns the size of the output buffer in bytes that will not overflow
/// given the current state of the encoder and `byte_length` number of
/// additional input code units or `None` if `usize` would overflow.
///
/// The computation is delegated to the underlying encoding-specific
/// encoder.
///
/// Available via the C wrapper.
pub fn max_buffer_length_from_utf8_without_replacement(
    &self,
    byte_length: usize,
) -> Option<usize> {
    // Pure delegation to the variant encoder.
    self.variant
        .max_buffer_length_from_utf8_without_replacement(byte_length)
}
4448
4449 /// Incrementally encode into byte stream from UTF-8 with unmappable
4450 /// characters replaced with HTML (decimal) numeric character references.
4451 ///
4452 /// See the documentation of the struct for documentation for `encode_*`
4453 /// methods collectively.
4454 ///
4455 /// Available via the C wrapper.
4456 pub fn encode_from_utf8(
4457 &mut self,
4458 src: &str,
4459 dst: &mut [u8],
4460 last: bool,
4461 ) -> (CoderResult, usize, usize, bool) {
4462 let dst_len = dst.len();
4463 let effective_dst_len = if self.encoding().can_encode_everything() {
4464 dst_len
4465 } else {
4466 if dst_len < NCR_EXTRA {
4467 if src.is_empty() && !(last && self.has_pending_state()) {
4468 return (CoderResult::InputEmpty, 0, 0, false);
4469 }
4470 return (CoderResult::OutputFull, 0, 0, false);
4471 }
4472 dst_len - NCR_EXTRA
4473 };
4474 let mut had_unmappables = false;
4475 let mut total_read = 0usize;
4476 let mut total_written = 0usize;
4477 loop {
4478 let (result, read, written) = self.encode_from_utf8_without_replacement(
4479 &src[total_read..],
4480 &mut dst[total_written..effective_dst_len],
4481 last,
4482 );
4483 total_read += read;
4484 total_written += written;
4485 match result {
4486 EncoderResult::InputEmpty => {
4487 return (
4488 CoderResult::InputEmpty,
4489 total_read,
4490 total_written,
4491 had_unmappables,
4492 );
4493 }
4494 EncoderResult::OutputFull => {
4495 return (
4496 CoderResult::OutputFull,
4497 total_read,
4498 total_written,
4499 had_unmappables,
4500 );
4501 }
4502 EncoderResult::Unmappable(unmappable) => {
4503 had_unmappables = true;
4504 debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4505 debug_assert_ne!(self.encoding(), UTF_16BE);
4506 debug_assert_ne!(self.encoding(), UTF_16LE);
4507 // Additionally, Iso2022JpEncoder is responsible for
4508 // transitioning to ASCII when returning with Unmappable.
4509 total_written += write_ncr(unmappable, &mut dst[total_written..]);
4510 if total_written >= effective_dst_len {
4511 if total_read == src.len() && !(last && self.has_pending_state()) {
4512 return (
4513 CoderResult::InputEmpty,
4514 total_read,
4515 total_written,
4516 had_unmappables,
4517 );
4518 }
4519 return (
4520 CoderResult::OutputFull,
4521 total_read,
4522 total_written,
4523 had_unmappables,
4524 );
4525 }
4526 }
4527 }
4528 }
4529 }
4530
4531 /// Incrementally encode into byte stream from UTF-8 with unmappable
4532 /// characters replaced with HTML (decimal) numeric character references.
4533 ///
4534 /// See the documentation of the struct for documentation for `encode_*`
4535 /// methods collectively.
4536 ///
4537 /// Available to Rust only.
4538 pub fn encode_from_utf8_to_vec(
4539 &mut self,
4540 src: &str,
4541 dst: &mut Vec<u8>,
4542 last: bool,
4543 ) -> (CoderResult, usize, bool) {
4544 unsafe {
4545 let old_len = dst.len();
4546 let capacity = dst.capacity();
4547 dst.set_len(capacity);
4548 let (result, read, written, replaced) =
4549 self.encode_from_utf8(src, &mut dst[old_len..], last);
4550 dst.set_len(old_len + written);
4551 (result, read, replaced)
4552 }
4553 }
4554
    /// Incrementally encode into byte stream from UTF-8 _without replacement_.
    ///
    /// Returns the `EncoderResult` together with two `usize` counters
    /// produced by the variant-specific encoder (used by the callers in this
    /// file as the number of bytes read from `src` and the number of bytes
    /// written to `dst`, in that order).
    ///
    /// See the documentation of the struct for documentation for `encode_*`
    /// methods collectively.
    ///
    /// Available via the C wrapper.
    pub fn encode_from_utf8_without_replacement(
        &mut self,
        src: &str,
        dst: &mut [u8],
        last: bool,
    ) -> (EncoderResult, usize, usize) {
        self.variant.encode_from_utf8_raw(src, dst, last)
    }
4569
4570 /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4571 ///
4572 /// See the documentation of the struct for documentation for `encode_*`
4573 /// methods collectively.
4574 ///
4575 /// Available to Rust only.
4576 pub fn encode_from_utf8_to_vec_without_replacement(
4577 &mut self,
4578 src: &str,
4579 dst: &mut Vec<u8>,
4580 last: bool,
4581 ) -> (EncoderResult, usize) {
4582 unsafe {
4583 let old_len = dst.len();
4584 let capacity = dst.capacity();
4585 dst.set_len(capacity);
4586 let (result, read, written) =
4587 self.encode_from_utf8_without_replacement(src, &mut dst[old_len..], last);
4588 dst.set_len(old_len + written);
4589 (result, read)
4590 }
4591 }
4592
4593 /// Query the worst-case output size when encoding from UTF-16 with
4594 /// replacement.
4595 ///
4596 /// Returns the size of the output buffer in bytes that will not overflow
4597 /// given the current state of the encoder and `u16_length` number of
4598 /// additional input code units if there are no unmappable characters in
4599 /// the input or `None` if `usize` would overflow.
4600 ///
4601 /// Available via the C wrapper.
4602 pub fn max_buffer_length_from_utf16_if_no_unmappables(
4603 &self,
4604 u16_length: usize,
4605 ) -> Option<usize> {
4606 checked_add(
4607 if self.encoding().can_encode_everything() {
4608 0
4609 } else {
4610 NCR_EXTRA
4611 },
4612 self.max_buffer_length_from_utf16_without_replacement(u16_length),
4613 )
4614 }
4615
    /// Query the worst-case output size when encoding from UTF-16 without
    /// replacement.
    ///
    /// Returns the size of the output buffer in bytes that will not overflow
    /// given the current state of the encoder and `u16_length` number of
    /// additional input code units or `None` if `usize` would overflow.
    ///
    /// The computation is delegated to the variant-specific encoder.
    ///
    /// Available via the C wrapper.
    pub fn max_buffer_length_from_utf16_without_replacement(
        &self,
        u16_length: usize,
    ) -> Option<usize> {
        self.variant
            .max_buffer_length_from_utf16_without_replacement(u16_length)
    }
4631
4632 /// Incrementally encode into byte stream from UTF-16 with unmappable
4633 /// characters replaced with HTML (decimal) numeric character references.
4634 ///
4635 /// See the documentation of the struct for documentation for `encode_*`
4636 /// methods collectively.
4637 ///
4638 /// Available via the C wrapper.
4639 pub fn encode_from_utf16(
4640 &mut self,
4641 src: &[u16],
4642 dst: &mut [u8],
4643 last: bool,
4644 ) -> (CoderResult, usize, usize, bool) {
4645 let dst_len = dst.len();
4646 let effective_dst_len = if self.encoding().can_encode_everything() {
4647 dst_len
4648 } else {
4649 if dst_len < NCR_EXTRA {
4650 if src.is_empty() && !(last && self.has_pending_state()) {
4651 return (CoderResult::InputEmpty, 0, 0, false);
4652 }
4653 return (CoderResult::OutputFull, 0, 0, false);
4654 }
4655 dst_len - NCR_EXTRA
4656 };
4657 let mut had_unmappables = false;
4658 let mut total_read = 0usize;
4659 let mut total_written = 0usize;
4660 loop {
4661 let (result, read, written) = self.encode_from_utf16_without_replacement(
4662 &src[total_read..],
4663 &mut dst[total_written..effective_dst_len],
4664 last,
4665 );
4666 total_read += read;
4667 total_written += written;
4668 match result {
4669 EncoderResult::InputEmpty => {
4670 return (
4671 CoderResult::InputEmpty,
4672 total_read,
4673 total_written,
4674 had_unmappables,
4675 );
4676 }
4677 EncoderResult::OutputFull => {
4678 return (
4679 CoderResult::OutputFull,
4680 total_read,
4681 total_written,
4682 had_unmappables,
4683 );
4684 }
4685 EncoderResult::Unmappable(unmappable) => {
4686 had_unmappables = true;
4687 debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4688 // There are no UTF-16 encoders and even if there were,
4689 // they'd never have unmappables.
4690 debug_assert_ne!(self.encoding(), UTF_16BE);
4691 debug_assert_ne!(self.encoding(), UTF_16LE);
4692 // Additionally, Iso2022JpEncoder is responsible for
4693 // transitioning to ASCII when returning with Unmappable
4694 // from the jis0208 state. That is, when we encode
4695 // ISO-2022-JP and come here, the encoder is in either the
4696 // ASCII or the Roman state. We are allowed to generate any
4697 // printable ASCII excluding \ and ~.
4698 total_written += write_ncr(unmappable, &mut dst[total_written..]);
4699 if total_written >= effective_dst_len {
4700 if total_read == src.len() && !(last && self.has_pending_state()) {
4701 return (
4702 CoderResult::InputEmpty,
4703 total_read,
4704 total_written,
4705 had_unmappables,
4706 );
4707 }
4708 return (
4709 CoderResult::OutputFull,
4710 total_read,
4711 total_written,
4712 had_unmappables,
4713 );
4714 }
4715 }
4716 }
4717 }
4718 }
4719
    /// Incrementally encode into byte stream from UTF-16 _without replacement_.
    ///
    /// Returns the `EncoderResult` together with two `usize` counters
    /// produced by the variant-specific encoder (used by the callers in this
    /// file as the number of code units read from `src` and the number of
    /// bytes written to `dst`, in that order).
    ///
    /// See the documentation of the struct for documentation for `encode_*`
    /// methods collectively.
    ///
    /// Available via the C wrapper.
    pub fn encode_from_utf16_without_replacement(
        &mut self,
        src: &[u16],
        dst: &mut [u8],
        last: bool,
    ) -> (EncoderResult, usize, usize) {
        self.variant.encode_from_utf16_raw(src, dst, last)
    }
4734}
4735
/// Format an unmappable as NCR without heap allocation.
///
/// Writes `&#`, the decimal value of `unmappable` and `;` to the start of
/// `dst` and returns the number of bytes written, i.e. the number of decimal
/// digits plus 3 (between 4 and 10 bytes for any `char`).
///
/// Every code point is formatted correctly, including single-digit ones, so
/// no byte inside the returned length is ever left unwritten.
///
/// # Panics
///
/// Panics if `dst` is shorter than the reference to be written.
fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize {
    let mut number = unmappable as u32;
    // Count the decimal digits exactly instead of assuming a minimum.
    let mut digits = 1usize;
    let mut rest = number / 10;
    while rest != 0 {
        digits += 1;
        rest /= 10;
    }
    // 3 is the combined length of "&#" and ";".
    let len = digits + 3;
    debug_assert!(len <= dst.len());
    // Slicing up front fails before anything is written if `dst` is short.
    let out = &mut dst[..len];
    out[0] = b'&';
    out[1] = b'#';
    out[len - 1] = b';';
    // Fill the digits from the least significant one backwards.
    for slot in out[2..len - 1].iter_mut().rev() {
        *slot = b'0' + (number % 10) as u8;
        number /= 10;
    }
    len
}
4774
/// Tests whether `i` lies in the half-open range `start..end` using a single
/// unsigned comparison. Expects `start <= end`.
#[inline(always)]
fn in_range16(i: u16, start: u16, end: u16) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset < width
}
4779
/// Tests whether `i` lies in the half-open range `start..end` using a single
/// unsigned comparison. Expects `start <= end`.
#[inline(always)]
fn in_range32(i: u32, start: u32, end: u32) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset < width
}
4784
/// Tests whether `i` lies in the closed range `start..=end` using a single
/// unsigned comparison. Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset <= width
}
4789
/// Tests whether `i` lies in the closed range `start..=end` using a single
/// unsigned comparison. Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset <= width
}
4794
/// Tests whether `i` lies in the closed range `start..=end` using a single
/// unsigned comparison. Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range32(i: u32, start: u32, end: u32) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset <= width
}
4799
/// Tests whether `i` lies in the closed range `start..=end` using a single
/// unsigned comparison. Expects `start <= end`.
#[inline(always)]
fn in_inclusive_range(i: usize, start: usize, end: usize) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset <= width
}
4804
/// Adds `num` to the value in `opt`, yielding `None` if `opt` is `None` or
/// if the sum overflows `usize`.
#[inline(always)]
fn checked_add(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_add(num))
}
4813
4814#[inline(always)]
4815fn checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize> {
4816 if let Some(n) = one {
4817 checked_add(n, other)
4818 } else {
4819 None
4820 }
4821}
4822
/// Multiplies the value in `opt` by `num`, yielding `None` if `opt` is
/// `None` or if the product overflows `usize`.
#[inline(always)]
fn checked_mul(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_mul(num))
}
4831
/// Divides the value in `opt` by `num`, yielding `None` if `opt` is `None`
/// or if `num` is zero.
#[inline(always)]
fn checked_div(opt: Option<usize>, num: usize) -> Option<usize> {
    opt.and_then(|n| n.checked_div(num))
}
4840
/// Rounds the value in `opt` up to the next power of two.
///
/// Returns `None` if `opt` is `None` or if the next power of two does not
/// fit in `usize`, consistently with the other `checked_*` helpers, instead
/// of panicking (debug builds) or wrapping to zero (release builds).
#[inline(always)]
fn checked_next_power_of_two(opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_next_power_of_two())
}
4845
/// Returns the smaller of two optional values. A `None` operand is ignored
/// in favor of the other one; the result is `None` only if both are `None`.
#[inline(always)]
fn checked_min(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => Some(::std::cmp::min(a, b)),
        (Some(a), None) => Some(a),
        (None, remaining) => remaining,
    }
}
4858
4859// ############## TESTS ###############
4860
// Sample struct that embeds a `&'static Encoding` next to ordinary fields.
// Only compiled for test builds with the `serde` feature enabled;
// presumably used by serde round-trip tests elsewhere in this file.
#[cfg(all(test, feature = "serde"))]
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct Demo {
    // Plain numeric field.
    num: u32,
    // Plain owned string field.
    name: String,
    // Reference to a statically allocated `Encoding`.
    enc: &'static Encoding,
}
4868
4869#[cfg(test)]
4870mod test_labels_names;
4871
4872#[cfg(test)]
4873mod tests {
4874 use super::*;
4875 use std::borrow::Cow;
4876
4877 fn sniff_to_utf16(
4878 initial_encoding: &'static Encoding,
4879 expected_encoding: &'static Encoding,
4880 bytes: &[u8],
4881 expect: &[u16],
4882 breaks: &[usize],
4883 ) {
4884 let mut decoder = initial_encoding.new_decoder();
4885
4886 let mut dest: Vec<u16> =
4887 Vec::with_capacity(decoder.max_utf16_buffer_length(bytes.len()).unwrap());
4888 let capacity = dest.capacity();
4889 dest.resize(capacity, 0u16);
4890
4891 let mut total_written = 0usize;
4892 let mut start = 0usize;
4893 for br in breaks {
4894 let (result, read, written, _) =
4895 decoder.decode_to_utf16(&bytes[start..*br], &mut dest[total_written..], false);
4896 total_written += written;
4897 assert_eq!(read, *br - start);
4898 match result {
4899 CoderResult::InputEmpty => {}
4900 CoderResult::OutputFull => {
4901 unreachable!();
4902 }
4903 }
4904 start = *br;
4905 }
4906 let (result, read, written, _) =
4907 decoder.decode_to_utf16(&bytes[start..], &mut dest[total_written..], true);
4908 total_written += written;
4909 match result {
4910 CoderResult::InputEmpty => {}
4911 CoderResult::OutputFull => {
4912 unreachable!();
4913 }
4914 }
4915 assert_eq!(read, bytes.len() - start);
4916 assert_eq!(total_written, expect.len());
4917 assert_eq!(&dest[..total_written], expect);
4918 assert_eq!(decoder.encoding(), expected_encoding);
4919 }
4920
4921 // Any copyright to the test code below this comment is dedicated to the
4922 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
4923
4924 #[test]
4925 fn test_bom_sniffing() {
4926 // ASCII
4927 sniff_to_utf16(
4928 WINDOWS_1252,
4929 WINDOWS_1252,
4930 b"\x61\x62",
4931 &[0x0061u16, 0x0062u16],
4932 &[],
4933 );
4934 // UTF-8
4935 sniff_to_utf16(
4936 WINDOWS_1252,
4937 UTF_8,
4938 b"\xEF\xBB\xBF\x61\x62",
4939 &[0x0061u16, 0x0062u16],
4940 &[],
4941 );
4942 sniff_to_utf16(
4943 WINDOWS_1252,
4944 UTF_8,
4945 b"\xEF\xBB\xBF\x61\x62",
4946 &[0x0061u16, 0x0062u16],
4947 &[1],
4948 );
4949 sniff_to_utf16(
4950 WINDOWS_1252,
4951 UTF_8,
4952 b"\xEF\xBB\xBF\x61\x62",
4953 &[0x0061u16, 0x0062u16],
4954 &[2],
4955 );
4956 sniff_to_utf16(
4957 WINDOWS_1252,
4958 UTF_8,
4959 b"\xEF\xBB\xBF\x61\x62",
4960 &[0x0061u16, 0x0062u16],
4961 &[3],
4962 );
4963 sniff_to_utf16(
4964 WINDOWS_1252,
4965 UTF_8,
4966 b"\xEF\xBB\xBF\x61\x62",
4967 &[0x0061u16, 0x0062u16],
4968 &[4],
4969 );
4970 sniff_to_utf16(
4971 WINDOWS_1252,
4972 UTF_8,
4973 b"\xEF\xBB\xBF\x61\x62",
4974 &[0x0061u16, 0x0062u16],
4975 &[2, 3],
4976 );
4977 sniff_to_utf16(
4978 WINDOWS_1252,
4979 UTF_8,
4980 b"\xEF\xBB\xBF\x61\x62",
4981 &[0x0061u16, 0x0062u16],
4982 &[1, 2],
4983 );
4984 sniff_to_utf16(
4985 WINDOWS_1252,
4986 UTF_8,
4987 b"\xEF\xBB\xBF\x61\x62",
4988 &[0x0061u16, 0x0062u16],
4989 &[1, 3],
4990 );
4991 sniff_to_utf16(
4992 WINDOWS_1252,
4993 UTF_8,
4994 b"\xEF\xBB\xBF\x61\x62",
4995 &[0x0061u16, 0x0062u16],
4996 &[1, 2, 3, 4],
4997 );
4998 sniff_to_utf16(WINDOWS_1252, UTF_8, b"\xEF\xBB\xBF", &[], &[]);
4999 // Not UTF-8
5000 sniff_to_utf16(
5001 WINDOWS_1252,
5002 WINDOWS_1252,
5003 b"\xEF\xBB\x61\x62",
5004 &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5005 &[],
5006 );
5007 sniff_to_utf16(
5008 WINDOWS_1252,
5009 WINDOWS_1252,
5010 b"\xEF\xBB\x61\x62",
5011 &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5012 &[1],
5013 );
5014 sniff_to_utf16(
5015 WINDOWS_1252,
5016 WINDOWS_1252,
5017 b"\xEF\x61\x62",
5018 &[0x00EFu16, 0x0061u16, 0x0062u16],
5019 &[],
5020 );
5021 sniff_to_utf16(
5022 WINDOWS_1252,
5023 WINDOWS_1252,
5024 b"\xEF\x61\x62",
5025 &[0x00EFu16, 0x0061u16, 0x0062u16],
5026 &[1],
5027 );
5028 sniff_to_utf16(
5029 WINDOWS_1252,
5030 WINDOWS_1252,
5031 b"\xEF\xBB",
5032 &[0x00EFu16, 0x00BBu16],
5033 &[],
5034 );
5035 sniff_to_utf16(
5036 WINDOWS_1252,
5037 WINDOWS_1252,
5038 b"\xEF\xBB",
5039 &[0x00EFu16, 0x00BBu16],
5040 &[1],
5041 );
5042 sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xEF", &[0x00EFu16], &[]);
5043 // Not UTF-16
5044 sniff_to_utf16(
5045 WINDOWS_1252,
5046 WINDOWS_1252,
5047 b"\xFE\x61\x62",
5048 &[0x00FEu16, 0x0061u16, 0x0062u16],
5049 &[],
5050 );
5051 sniff_to_utf16(
5052 WINDOWS_1252,
5053 WINDOWS_1252,
5054 b"\xFE\x61\x62",
5055 &[0x00FEu16, 0x0061u16, 0x0062u16],
5056 &[1],
5057 );
5058 sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFE", &[0x00FEu16], &[]);
5059 sniff_to_utf16(
5060 WINDOWS_1252,
5061 WINDOWS_1252,
5062 b"\xFF\x61\x62",
5063 &[0x00FFu16, 0x0061u16, 0x0062u16],
5064 &[],
5065 );
5066 sniff_to_utf16(
5067 WINDOWS_1252,
5068 WINDOWS_1252,
5069 b"\xFF\x61\x62",
5070 &[0x00FFu16, 0x0061u16, 0x0062u16],
5071 &[1],
5072 );
5073 sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFF", &[0x00FFu16], &[]);
5074 // UTF-16
5075 sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[]);
5076 sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[1]);
5077 sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[]);
5078 sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[1]);
5079 }
5080
5081 #[test]
5082 fn test_output_encoding() {
5083 assert_eq!(REPLACEMENT.output_encoding(), UTF_8);
5084 assert_eq!(UTF_16BE.output_encoding(), UTF_8);
5085 assert_eq!(UTF_16LE.output_encoding(), UTF_8);
5086 assert_eq!(UTF_8.output_encoding(), UTF_8);
5087 assert_eq!(WINDOWS_1252.output_encoding(), WINDOWS_1252);
5088 assert_eq!(REPLACEMENT.new_encoder().encoding(), UTF_8);
5089 assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
5090 assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
5091 assert_eq!(UTF_8.new_encoder().encoding(), UTF_8);
5092 assert_eq!(WINDOWS_1252.new_encoder().encoding(), WINDOWS_1252);
5093 }
5094
5095 #[test]
5096 fn test_label_resolution() {
5097 assert_eq!(Encoding::for_label(b"utf-8"), Some(UTF_8));
5098 assert_eq!(Encoding::for_label(b"UTF-8"), Some(UTF_8));
5099 assert_eq!(
5100 Encoding::for_label(b" \t \n \x0C \n utf-8 \r \n \t \x0C "),
5101 Some(UTF_8)
5102 );
5103 assert_eq!(Encoding::for_label(b"utf-8 _"), None);
5104 assert_eq!(Encoding::for_label(b"bogus"), None);
5105 assert_eq!(Encoding::for_label(b"bogusbogusbogusbogus"), None);
5106 }
5107
5108 #[test]
5109 fn test_decode_valid_windows_1257_to_cow() {
5110 let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xE4");
5111 match cow {
5112 Cow::Borrowed(_) => unreachable!(),
5113 Cow::Owned(s) => {
5114 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5115 }
5116 }
5117 assert_eq!(encoding, WINDOWS_1257);
5118 assert!(!had_errors);
5119 }
5120
5121 #[test]
5122 fn test_decode_invalid_windows_1257_to_cow() {
5123 let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xA1\xE4");
5124 match cow {
5125 Cow::Borrowed(_) => unreachable!(),
5126 Cow::Owned(s) => {
5127 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5128 }
5129 }
5130 assert_eq!(encoding, WINDOWS_1257);
5131 assert!(had_errors);
5132 }
5133
5134 #[test]
5135 fn test_decode_ascii_only_windows_1257_to_cow() {
5136 let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc");
5137 match cow {
5138 Cow::Borrowed(s) => {
5139 assert_eq!(s, "abc");
5140 }
5141 Cow::Owned(_) => unreachable!(),
5142 }
5143 assert_eq!(encoding, WINDOWS_1257);
5144 assert!(!had_errors);
5145 }
5146
5147 #[test]
5148 fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow() {
5149 let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5150 match cow {
5151 Cow::Borrowed(s) => {
5152 assert_eq!(s, "\u{20AC}\u{00E4}");
5153 }
5154 Cow::Owned(_) => unreachable!(),
5155 }
5156 assert_eq!(encoding, UTF_8);
5157 assert!(!had_errors);
5158 }
5159
5160 #[test]
5161 fn test_decode_bomful_invalid_utf8_as_windows_1257_to_cow() {
5162 let (cow, encoding, had_errors) =
5163 WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5164 match cow {
5165 Cow::Borrowed(_) => unreachable!(),
5166 Cow::Owned(s) => {
5167 assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5168 }
5169 }
5170 assert_eq!(encoding, UTF_8);
5171 assert!(had_errors);
5172 }
5173
5174 #[test]
5175 fn test_decode_bomful_valid_utf8_as_utf_8_to_cow() {
5176 let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5177 match cow {
5178 Cow::Borrowed(s) => {
5179 assert_eq!(s, "\u{20AC}\u{00E4}");
5180 }
5181 Cow::Owned(_) => unreachable!(),
5182 }
5183 assert_eq!(encoding, UTF_8);
5184 assert!(!had_errors);
5185 }
5186
5187 #[test]
5188 fn test_decode_bomful_invalid_utf8_as_utf_8_to_cow() {
5189 let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5190 match cow {
5191 Cow::Borrowed(_) => unreachable!(),
5192 Cow::Owned(s) => {
5193 assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5194 }
5195 }
5196 assert_eq!(encoding, UTF_8);
5197 assert!(had_errors);
5198 }
5199
5200 #[test]
5201 fn test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal() {
5202 let (cow, had_errors) = UTF_8.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5203 match cow {
5204 Cow::Borrowed(s) => {
5205 assert_eq!(s, "\u{20AC}\u{00E4}");
5206 }
5207 Cow::Owned(_) => unreachable!(),
5208 }
5209 assert!(!had_errors);
5210 }
5211
5212 #[test]
5213 fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal() {
5214 let (cow, had_errors) =
5215 WINDOWS_1257.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5216 match cow {
5217 Cow::Borrowed(_) => unreachable!(),
5218 Cow::Owned(s) => {
5219 assert_eq!(
5220 s,
5221 "\u{013C}\u{00BB}\u{00E6}\u{0101}\u{201A}\u{00AC}\u{0106}\u{00A4}"
5222 );
5223 }
5224 }
5225 assert!(!had_errors);
5226 }
5227
5228 #[test]
5229 fn test_decode_valid_windows_1257_to_cow_with_bom_removal() {
5230 let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xE4");
5231 match cow {
5232 Cow::Borrowed(_) => unreachable!(),
5233 Cow::Owned(s) => {
5234 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5235 }
5236 }
5237 assert!(!had_errors);
5238 }
5239
5240 #[test]
5241 fn test_decode_invalid_windows_1257_to_cow_with_bom_removal() {
5242 let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xA1\xE4");
5243 match cow {
5244 Cow::Borrowed(_) => unreachable!(),
5245 Cow::Owned(s) => {
5246 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5247 }
5248 }
5249 assert!(had_errors);
5250 }
5251
5252 #[test]
5253 fn test_decode_ascii_only_windows_1257_to_cow_with_bom_removal() {
5254 let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc");
5255 match cow {
5256 Cow::Borrowed(s) => {
5257 assert_eq!(s, "abc");
5258 }
5259 Cow::Owned(_) => unreachable!(),
5260 }
5261 assert!(!had_errors);
5262 }
5263
5264 #[test]
5265 fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling() {
5266 let (cow, had_errors) =
5267 UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5268 match cow {
5269 Cow::Borrowed(s) => {
5270 assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
5271 }
5272 Cow::Owned(_) => unreachable!(),
5273 }
5274 assert!(!had_errors);
5275 }
5276
5277 #[test]
5278 fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling() {
5279 let (cow, had_errors) =
5280 UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5281 match cow {
5282 Cow::Borrowed(_) => unreachable!(),
5283 Cow::Owned(s) => {
5284 assert_eq!(s, "\u{FEFF}\u{20AC}\u{FFFD}\u{00E4}");
5285 }
5286 }
5287 assert!(had_errors);
5288 }
5289
5290 #[test]
5291 fn test_decode_valid_windows_1257_to_cow_without_bom_handling() {
5292 let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xE4");
5293 match cow {
5294 Cow::Borrowed(_) => unreachable!(),
5295 Cow::Owned(s) => {
5296 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5297 }
5298 }
5299 assert!(!had_errors);
5300 }
5301
5302 #[test]
5303 fn test_decode_invalid_windows_1257_to_cow_without_bom_handling() {
5304 let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xA1\xE4");
5305 match cow {
5306 Cow::Borrowed(_) => unreachable!(),
5307 Cow::Owned(s) => {
5308 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5309 }
5310 }
5311 assert!(had_errors);
5312 }
5313
5314 #[test]
5315 fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling() {
5316 let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc");
5317 match cow {
5318 Cow::Borrowed(s) => {
5319 assert_eq!(s, "abc");
5320 }
5321 Cow::Owned(_) => unreachable!(),
5322 }
5323 assert!(!had_errors);
5324 }
5325
5326 #[test]
5327 fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement() {
5328 match UTF_8.decode_without_bom_handling_and_without_replacement(
5329 b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4",
5330 ) {
5331 Some(cow) => match cow {
5332 Cow::Borrowed(s) => {
5333 assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
5334 }
5335 Cow::Owned(_) => unreachable!(),
5336 },
5337 None => unreachable!(),
5338 }
5339 }
5340
5341 #[test]
5342 fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement() {
5343 assert!(UTF_8
5344 .decode_without_bom_handling_and_without_replacement(
5345 b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4"
5346 )
5347 .is_none());
5348 }
5349
5350 #[test]
5351 fn test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5352 match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc\x80\xE4") {
5353 Some(cow) => match cow {
5354 Cow::Borrowed(_) => unreachable!(),
5355 Cow::Owned(s) => {
5356 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5357 }
5358 },
5359 None => unreachable!(),
5360 }
5361 }
5362
5363 #[test]
5364 fn test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5365 assert!(WINDOWS_1257
5366 .decode_without_bom_handling_and_without_replacement(b"abc\x80\xA1\xE4")
5367 .is_none());
5368 }
5369
5370 #[test]
5371 fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5372 match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc") {
5373 Some(cow) => match cow {
5374 Cow::Borrowed(s) => {
5375 assert_eq!(s, "abc");
5376 }
5377 Cow::Owned(_) => unreachable!(),
5378 },
5379 None => unreachable!(),
5380 }
5381 }
5382
5383 #[test]
5384 fn test_encode_ascii_only_windows_1257_to_cow() {
5385 let (cow, encoding, had_errors) = WINDOWS_1257.encode("abc");
5386 match cow {
5387 Cow::Borrowed(s) => {
5388 assert_eq!(s, b"abc");
5389 }
5390 Cow::Owned(_) => unreachable!(),
5391 }
5392 assert_eq!(encoding, WINDOWS_1257);
5393 assert!(!had_errors);
5394 }
5395
5396 #[test]
5397 fn test_encode_valid_windows_1257_to_cow() {
5398 let (cow, encoding, had_errors) = WINDOWS_1257.encode("abc\u{20AC}\u{00E4}");
5399 match cow {
5400 Cow::Borrowed(_) => unreachable!(),
5401 Cow::Owned(s) => {
5402 assert_eq!(s, b"abc\x80\xE4");
5403 }
5404 }
5405 assert_eq!(encoding, WINDOWS_1257);
5406 assert!(!had_errors);
5407 }
5408
5409 #[test]
5410 fn test_utf16_space_with_one_bom_byte() {
5411 let mut decoder = UTF_16LE.new_decoder();
5412 let mut dst = [0u16; 12];
5413 {
5414 let needed = decoder.max_utf16_buffer_length(1).unwrap();
5415 let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], false);
5416 assert_eq!(result, CoderResult::InputEmpty);
5417 }
5418 {
5419 let needed = decoder.max_utf16_buffer_length(1).unwrap();
5420 let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5421 assert_eq!(result, CoderResult::InputEmpty);
5422 }
5423 }
5424
5425 #[test]
5426 fn test_utf8_space_with_one_bom_byte() {
5427 let mut decoder = UTF_8.new_decoder();
5428 let mut dst = [0u16; 12];
5429 {
5430 let needed = decoder.max_utf16_buffer_length(1).unwrap();
5431 let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], false);
5432 assert_eq!(result, CoderResult::InputEmpty);
5433 }
5434 {
5435 let needed = decoder.max_utf16_buffer_length(1).unwrap();
5436 let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5437 assert_eq!(result, CoderResult::InputEmpty);
5438 }
5439 }
5440
5441 #[test]
5442 fn test_utf16_space_with_two_bom_bytes() {
5443 let mut decoder = UTF_16LE.new_decoder();
5444 let mut dst = [0u16; 12];
5445 {
5446 let needed = decoder.max_utf16_buffer_length(1).unwrap();
5447 let (result, _, _, _) = decoder.decode_to_utf16(b"\xEF", &mut dst[..needed], false);
5448 assert_eq!(result, CoderResult::InputEmpty);
5449 }
5450 {
5451 let needed = decoder.max_utf16_buffer_length(1).unwrap();
5452 let (result, _, _, _) = decoder.decode_to_utf16(b"\xBB", &mut dst[..needed], false);
5453 assert_eq!(result, CoderResult::InputEmpty);
5454 }
5455 {
5456 let needed = decoder.max_utf16_buffer_length(1).unwrap();
5457 let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5458 assert_eq!(result, CoderResult::InputEmpty);
5459 }
5460 }
5461
5462 #[test]
5463 fn test_utf8_space_with_two_bom_bytes() {
5464 let mut decoder = UTF_8.new_decoder();
5465 let mut dst = [0u16; 12];
5466 {
5467 let needed = decoder.max_utf16_buffer_length(1).unwrap();
5468 let (result, _, _, _) = decoder.decode_to_utf16(b"\xEF", &mut dst[..needed], false);
5469 assert_eq!(result, CoderResult::InputEmpty);
5470 }
5471 {
5472 let needed = decoder.max_utf16_buffer_length(1).unwrap();
5473 let (result, _, _, _) = decoder.decode_to_utf16(b"\xBB", &mut dst[..needed], false);
5474 assert_eq!(result, CoderResult::InputEmpty);
5475 }
5476 {
5477 let needed = decoder.max_utf16_buffer_length(1).unwrap();
5478 let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5479 assert_eq!(result, CoderResult::InputEmpty);
5480 }
5481 }
5482
5483 #[test]
5484 fn test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call() {
5485 let mut decoder = UTF_16LE.new_decoder();
5486 let mut dst = [0u16; 12];
5487 {
5488 let needed = decoder.max_utf16_buffer_length(2).unwrap();
5489 let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF\xFF", &mut dst[..needed], true);
5490 assert_eq!(result, CoderResult::InputEmpty);
5491 }
5492 }
5493
5494 #[test]
5495 fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8() {
5496 let mut dst = [0u8; 8];
5497 let mut encoder = ISO_2022_JP.new_encoder();
5498 {
5499 let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], false);
5500 assert_eq!(result, CoderResult::InputEmpty);
5501 }
5502 {
5503 let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], true);
5504 assert_eq!(result, CoderResult::InputEmpty);
5505 }
5506 }
5507
5508 #[test]
5509 fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf8() {
5510 let mut dst = [0u8; 16];
5511 let mut encoder = ISO_2022_JP.new_encoder();
5512 {
5513 let (result, _, _, _) = encoder.encode_from_utf8("\u{A5}", &mut dst[..], false);
5514 assert_eq!(result, CoderResult::InputEmpty);
5515 }
5516 {
5517 let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], false);
5518 assert_eq!(result, CoderResult::InputEmpty);
5519 }
5520 {
5521 let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], true);
5522 assert_eq!(result, CoderResult::OutputFull);
5523 }
5524 }
5525
// Checks the `CoderResult` reported by ISO-2022-JP `encode_from_utf8` for
// inputs containing U+1F4A9, with destinations of 18 and 13 bytes, for both
// non-final and final calls. Every case starts from a brand-new encoder.
#[test]
fn test_buffer_end_iso_2022_jp_from_utf8() {
    let mut dst = [0u8; 18];

    // Encodes `text` into the first `capacity` bytes of `dst` with a fresh
    // encoder and hands back only the status (the first tuple field).
    let mut encode = |text: &str, capacity: usize, last: bool| {
        let mut encoder = ISO_2022_JP.new_encoder();
        let (status, ..) = encoder.encode_from_utf8(text, &mut dst[..capacity], last);
        status
    };

    // U+00A5 followed by U+1F4A9 with the whole 18-byte buffer.
    assert_eq!(encode("\u{A5}\u{1F4A9}", 18, false), CoderResult::InputEmpty);
    assert_eq!(encode("\u{A5}\u{1F4A9}", 18, true), CoderResult::OutputFull);

    // U+1F4A9 alone with 13 bytes.
    assert_eq!(encode("\u{1F4A9}", 13, false), CoderResult::InputEmpty);
    assert_eq!(encode("\u{1F4A9}", 13, true), CoderResult::InputEmpty);
}
5551
// UTF-16 counterpart of the empty-input check: a freshly created ISO-2022-JP
// encoder given zero code units must report `InputEmpty` with an 8-byte
// destination for a non-final call and for the final call that follows.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16() {
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut buffer = [0u8; 8];
    let nothing: [u16; 0] = [];

    // Non-final call.
    let (status, ..) = encoder.encode_from_utf16(&nothing, &mut buffer[..], false);
    assert_eq!(status, CoderResult::InputEmpty);

    // Final call on the same encoder.
    let (status, ..) = encoder.encode_from_utf16(&nothing, &mut buffer[..], true);
    assert_eq!(status, CoderResult::InputEmpty);
}
5565
// A single encoder is reused for all three calls, so whatever state is left
// behind by encoding the code unit 0x00A5 is still in effect for the
// empty-input calls.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf16() {
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut buffer = [0u8; 16];
    let yen = [0xA5u16];
    let nothing: [u16; 0] = [];

    // U+00A5 with all 16 bytes of output space available.
    let (status, ..) = encoder.encode_from_utf16(&yen, &mut buffer[..], false);
    assert_eq!(status, CoderResult::InputEmpty);

    // Empty, non-final input with only 8 bytes of output space.
    let (status, ..) = encoder.encode_from_utf16(&nothing, &mut buffer[..8], false);
    assert_eq!(status, CoderResult::InputEmpty);

    // Empty, final input with the same 8 bytes is expected to overflow.
    let (status, ..) = encoder.encode_from_utf16(&nothing, &mut buffer[..8], true);
    assert_eq!(status, CoderResult::OutputFull);
}
5583
// Checks the `CoderResult` reported by ISO-2022-JP `encode_from_utf16` for
// inputs containing the surrogate pair D83D DCA9 (U+1F4A9), with
// destinations of 18 and 13 bytes, for both non-final and final calls.
// Every case starts from a brand-new encoder.
#[test]
fn test_buffer_end_iso_2022_jp_from_utf16() {
    let yen_then_astral: &[u16] = &[0xA5, 0xD83D, 0xDCA9];
    let astral_only: &[u16] = &[0xD83D, 0xDCA9];
    let mut dst = [0u8; 18];

    // Encodes `units` into the first `capacity` bytes of `dst` with a fresh
    // encoder and hands back only the status (the first tuple field).
    let mut encode = |units: &[u16], capacity: usize, last: bool| {
        let mut encoder = ISO_2022_JP.new_encoder();
        let (status, ..) = encoder.encode_from_utf16(units, &mut dst[..capacity], last);
        status
    };

    // U+00A5 followed by the surrogate pair with the whole 18-byte buffer.
    assert_eq!(encode(yen_then_astral, 18, false), CoderResult::InputEmpty);
    assert_eq!(encode(yen_then_astral, 18, true), CoderResult::OutputFull);

    // The surrogate pair alone with 13 bytes.
    assert_eq!(encode(astral_only, 13, false), CoderResult::InputEmpty);
    assert_eq!(encode(astral_only, 13, true), CoderResult::InputEmpty);
}
5612
// Feeds the bytes D8 00 (the UTF-16BE form of the surrogate code unit
// 0xD800) to a BOM-agnostic UTF-16BE decoder twice: first as non-final
// input, then again as final input.
#[test]
fn test_buffer_end_utf16be() {
    let surrogate_bytes: &[u8] = &[0xD8, 0x00];
    let mut dest = [0u8; 4];
    let mut decoder = UTF_16BE.new_decoder_without_bom_handling();

    // Non-final: both bytes are consumed, nothing is written, and no
    // replacement is reported.
    let first = decoder.decode_to_utf8(surrogate_bytes, &mut dest, false);
    assert_eq!(first, (CoderResult::InputEmpty, 2, 0, false));

    // Final: the outcome is deliberately discarded; the test only requires
    // that the call returns.
    let _ = decoder.decode_to_utf8(surrogate_bytes, &mut dest, true);
}
5625
// Exercises encodings as `HashSet` members: insertion, positive and negative
// lookup, and removal.
#[test]
fn test_hash() {
    let mut seen = ::std::collections::HashSet::new();
    for &encoding in [UTF_8, ISO_2022_JP].iter() {
        seen.insert(encoding);
    }

    // Both inserted encodings are found; one that was never inserted is not.
    assert!(seen.contains(UTF_8));
    assert!(seen.contains(ISO_2022_JP));
    assert!(!seen.contains(WINDOWS_1252));

    // After removal the encoding is no longer a member.
    seen.remove(ISO_2022_JP);
    assert!(!seen.contains(ISO_2022_JP));
}
5637
// Encoding the code units 0x3041, 0xFFFF as final input into a 17-byte
// destination with a fresh ISO-2022-JP encoder must report `OutputFull`.
#[test]
fn test_iso_2022_jp_ncr_extra_from_utf16() {
    let input = [0x3041u16, 0xFFFFu16];
    let mut dst = [0u8; 17];
    let mut encoder = ISO_2022_JP.new_encoder();

    let outcome = encoder.encode_from_utf16(&input, &mut dst[..], true);
    assert_eq!(outcome.0, CoderResult::OutputFull);
}
5648
// Encoding U+3041 followed by U+FFFF as final input into a 17-byte
// destination with a fresh ISO-2022-JP encoder must report `OutputFull`.
#[test]
fn test_iso_2022_jp_ncr_extra_from_utf8() {
    let input = "\u{3041}\u{FFFF}";
    let mut dst = [0u8; 17];
    let mut encoder = ISO_2022_JP.new_encoder();

    let outcome = encoder.encode_from_utf8(input, &mut dst[..], true);
    assert_eq!(outcome.0, CoderResult::OutputFull);
}
5659
// Decodes the bytes EF BB BF (the UTF-8 BOM) followed by `A` with a decoder
// obtained from `REPLACEMENT.new_decoder()`, using a destination sized
// exactly as `max_utf8_buffer_length_without_replacement` requests.
#[test]
fn test_max_length_with_bom_to_utf8() {
    let input = b"\xEF\xBB\xBFA";
    let mut output = [0u8; 20];
    let mut decoder = REPLACEMENT.new_decoder();

    let needed = decoder
        .max_utf8_buffer_length_without_replacement(input.len())
        .unwrap();
    let outcome = decoder.decode_to_utf8_without_replacement(input, &mut output[..needed], true);

    // All four input bytes are consumed, but only the single byte for `A`
    // is written.
    assert_eq!(outcome.0, DecoderResult::InputEmpty);
    assert_eq!(outcome.1, input.len());
    assert_eq!(outcome.2, 1);
    assert_eq!(output[0], 0x41);
}
5677
// Round-trips a `Demo` value holding an encoding through both serde_json and
// bincode and checks that each round trip reproduces the original value.
#[cfg(feature = "serde")]
#[test]
fn test_serde() {
    let original = Demo {
        num: 42,
        name: "foo".into(),
        enc: UTF_8,
    };

    // JSON round trip.
    let json = serde_json::to_string(&original).unwrap();
    let from_json: Demo = serde_json::from_str(&json).unwrap();
    assert_eq!(from_json, original);

    // bincode round trip.
    let bytes = bincode::serialize(&original).unwrap();
    let from_bincode: Demo = bincode::deserialize(&bytes[..]).unwrap();
    assert_eq!(from_bincode, original);
}
5696
// Verifies `is_single_byte()` for every encoding listed below: the first
// table must answer `false`, the second must answer `true`.
#[test]
fn test_is_single_byte() {
    let not_single_byte = [
        BIG5,
        EUC_JP,
        EUC_KR,
        GB18030,
        GBK,
        REPLACEMENT,
        SHIFT_JIS,
        UTF_8,
        UTF_16BE,
        UTF_16LE,
        ISO_2022_JP,
    ];
    for &encoding in not_single_byte.iter() {
        assert!(
            !encoding.is_single_byte(),
            "{} must not be reported as single-byte",
            encoding.name()
        );
    }

    let single_byte = [
        IBM866,
        ISO_8859_2,
        ISO_8859_3,
        ISO_8859_4,
        ISO_8859_5,
        ISO_8859_6,
        ISO_8859_7,
        ISO_8859_8,
        ISO_8859_10,
        ISO_8859_13,
        ISO_8859_14,
        ISO_8859_15,
        ISO_8859_16,
        ISO_8859_8_I,
        KOI8_R,
        KOI8_U,
        MACINTOSH,
        WINDOWS_874,
        WINDOWS_1250,
        WINDOWS_1251,
        WINDOWS_1252,
        WINDOWS_1253,
        WINDOWS_1254,
        WINDOWS_1255,
        WINDOWS_1256,
        WINDOWS_1257,
        WINDOWS_1258,
        X_MAC_CYRILLIC,
        X_USER_DEFINED,
    ];
    for &encoding in single_byte.iter() {
        assert!(
            encoding.is_single_byte(),
            "{} must be reported as single-byte",
            encoding.name()
        );
    }
}
5741
// Checks `Decoder::latin1_byte_compatible_up_to` against one fixed buffer
// (`a` followed by six non-ASCII bytes) for every encoding in the table
// below, and then for a UTF-8 decoder with BOM handling at several points
// while it is being fed input.
#[test]
fn test_latin1_byte_compatible_up_to() {
    let buffer = b"a\x81\xB6\xF6\xF0\x82\xB4";

    // (encoding, expected result) for decoders created without BOM handling.
    // `None` marks the encodings for which no answer is expected at all.
    let expectations = [
        (BIG5, Some(1usize)),
        (EUC_JP, Some(1)),
        (EUC_KR, Some(1)),
        (GB18030, Some(1)),
        (GBK, Some(1)),
        (REPLACEMENT, None),
        (SHIFT_JIS, Some(1)),
        (UTF_8, Some(1)),
        (UTF_16BE, None),
        (UTF_16LE, None),
        (ISO_2022_JP, Some(1)),
        (IBM866, Some(1)),
        (ISO_8859_2, Some(2)),
        (ISO_8859_3, Some(2)),
        (ISO_8859_4, Some(2)),
        (ISO_8859_5, Some(2)),
        (ISO_8859_6, Some(2)),
        (ISO_8859_7, Some(2)),
        (ISO_8859_8, Some(3)),
        (ISO_8859_10, Some(2)),
        (ISO_8859_13, Some(4)),
        (ISO_8859_14, Some(4)),
        (ISO_8859_15, Some(6)),
        (ISO_8859_16, Some(4)),
        (ISO_8859_8_I, Some(3)),
        (KOI8_R, Some(1)),
        (KOI8_U, Some(1)),
        (MACINTOSH, Some(1)),
        (WINDOWS_874, Some(2)),
        (WINDOWS_1250, Some(4)),
        (WINDOWS_1251, Some(1)),
        (WINDOWS_1252, Some(5)),
        (WINDOWS_1253, Some(3)),
        (WINDOWS_1254, Some(4)),
        (WINDOWS_1255, Some(3)),
        (WINDOWS_1256, Some(1)),
        (WINDOWS_1257, Some(4)),
        (WINDOWS_1258, Some(4)),
        (X_MAC_CYRILLIC, Some(1)),
        (X_USER_DEFINED, Some(1)),
    ];
    for &(encoding, expected) in expectations.iter() {
        let decoder = encoding.new_decoder_without_bom_handling();
        assert_eq!(
            decoder.latin1_byte_compatible_up_to(buffer),
            expected,
            "unexpected result for {}",
            encoding.name()
        );
    }

    // A UTF-8 decoder created with BOM handling gives no answer before it
    // has seen any input.
    assert_eq!(UTF_8.new_decoder().latin1_byte_compatible_up_to(buffer), None);

    // Feed EF BB BF (the UTF-8 BOM) in two pieces and then one more EF,
    // querying after each piece: no answer after the first BOM byte, an
    // answer of 1 once the BOM is complete, and no answer again after the
    // trailing EF.
    let mut decoder = UTF_8.new_decoder();
    let mut output = [0u16; 4];
    let steps = [
        (&b"\xEF"[..], None),
        (&b"\xBB\xBF"[..], Some(1usize)),
        (&b"\xEF"[..], None),
    ];
    for &(chunk, expected) in steps.iter() {
        let _ = decoder.decode_to_utf16(chunk, &mut output, false);
        assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), expected);
    }
}
6030}