encoding/
types.rs

1// This is a part of rust-encoding.
2// Copyright (c) 2013-2015, Kang Seonghoon.
3// See README.md and LICENSE.txt for details.
4
5/*!
6 * Interface to the character encoding.
7 *
8 * # Raw incremental interface
9 *
10 * Methods whose names start with `raw_` constitute the raw incremental interface,
11 * the lowest-available API for encoders and decoders.
12 * This interface divides the entire input into four parts:
13 *
14 * - **Processed** bytes do not affect the future result.
15 * - **Unprocessed** bytes may affect the future result
16 *   and can be a part of problematic sequence according to the future input.
17 * - **Problematic** byte is the first byte that causes an error condition.
18 * - **Remaining** bytes are not yet processed nor read,
19 *   so the caller should feed any remaining bytes again.
20 *
21 * The following figure illustrates an example of successive `raw_feed` calls:
22 *
23 * ````notrust
24 * 1st raw_feed   :2nd raw_feed   :3rd raw_feed
25 * ----------+----:---------------:--+--+---------
26 *           |    :               :  |  |
27 * ----------+----:---------------:--+--+---------
28 * processed  unprocessed             |  remaining
29 *                               problematic
30 * ````
31 *
32 * Since these parts can span the multiple input sequences to `raw_feed`,
33 * `raw_feed` returns two offsets (one optional)
34 * with which the caller can track the problematic sequence.
35 * The first offset (the first `usize` in the tuple) points to the first unprocessed bytes,
36 * or is zero when unprocessed bytes have started before the current call.
37 * (The first unprocessed byte can also be at offset 0,
38 * which doesn't make a difference for the caller.)
39 * The second offset (`upto` field in the `CodecError` struct), if any,
40 * points to the first remaining bytes.
41 *
42 * If the caller needs to recover the error via the problematic sequence,
43 * then the caller starts to save the unprocessed bytes when the first offset < the input length,
44 * appends any new unprocessed bytes while the first offset is zero,
45 * and discards unprocessed bytes when first offset becomes non-zero
46 * while saving new unprocessed bytes when the first offset < the input length.
47 * Then the caller checks for the error condition
48 * and can use the saved unprocessed bytes for error recovery.
49 * Alternatively, if the caller only wants to replace the problematic sequence
50 * with a fixed string (like U+FFFD),
51 * then it can just discard the first sequence and can emit the fixed string on an error.
52 * It still has to feed the input bytes starting at the second offset again.
53 */
54use std::borrow::Cow;
55
/// Error information from either encoder or decoder.
///
/// Values of this type are returned (wrapped in `Option`) by the `raw_feed` and `raw_finish`
/// methods of `RawEncoder` and `RawDecoder`.
/// The type derives `Debug`, `Clone`, `PartialEq` and `Eq` so that errors can be logged,
/// duplicated and compared in assertions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CodecError {
    /// The byte position of the first remaining byte, with respect to the *current* input.
    /// For the `finish` call, this should be no more than zero (since there is no input).
    /// It can be negative if the remaining byte is in the prior inputs,
    /// as long as the remaining byte is not yet processed.
    /// The caller should feed the bytes starting from this point again
    /// in order to continue encoding or decoding after an error.
    pub upto: isize,
    /// A human-readable cause of the error.
    pub cause: Cow<'static, str>,
}
68
/// Destination for the bytes produced by encoders.
/// The usual implementor is an owned `Vec<u8>`.
pub trait ByteWriter {
    /// Announces a lower bound, in bytes, on how much output is expected
    /// before `writer_hint` is called again, letting the writer pre-allocate.
    /// `RawEncoder`s are encouraged, but not obliged, to call this with a sensible estimate.
    /// The provided implementation ignores the hint.
    fn writer_hint(&mut self, _expectedlen: usize) {}

    /// Appends one byte to the output.
    fn write_byte(&mut self, byte: u8);

    /// Appends all bytes of the given slice to the output.
    fn write_bytes(&mut self, bytes: &[u8]);
}
85
86impl ByteWriter for Vec<u8> {
87    fn writer_hint(&mut self, expectedlen: usize) {
88        self.reserve(expectedlen);
89    }
90
91    fn write_byte(&mut self, b: u8) {
92        self.push(b);
93    }
94
95    fn write_bytes(&mut self, v: &[u8]) {
96        self.extend(v.iter().cloned());
97    }
98}
99
/// Destination for the text produced by decoders.
/// The usual implementor is an owned `String`.
pub trait StringWriter {
    /// Announces a lower bound, in bytes, on how much output is expected
    /// before `writer_hint` is called again, letting the writer pre-allocate.
    /// `RawDecoder`s are encouraged, but not obliged, to call this with a sensible estimate.
    /// The provided implementation ignores the hint.
    fn writer_hint(&mut self, _expectedlen: usize) {}

    /// Appends one character to the output.
    fn write_char(&mut self, ch: char);

    /// Appends a whole string slice to the output.
    fn write_str(&mut self, text: &str);
}
116
117impl StringWriter for String {
118    fn writer_hint(&mut self, expectedlen: usize) {
119        let newlen = self.len() + expectedlen;
120        self.reserve(newlen);
121    }
122
123    fn write_char(&mut self, c: char) {
124        self.push(c);
125    }
126
127    fn write_str(&mut self, s: &str) {
128        self.push_str(s);
129    }
130}
131
132/// Encoder converting a Unicode string into a byte sequence.
133/// This is a lower level interface, and normally `Encoding::encode` should be used instead.
134pub trait RawEncoder: 'static {
135    /// Creates a fresh `RawEncoder` instance which parameters are same as `self`.
136    fn from_self(&self) -> Box<RawEncoder>;
137
138    /// Returns true if this encoding is compatible to ASCII,
139    /// i.e. U+0000 through U+007F always map to bytes 00 through 7F and nothing else.
140    fn is_ascii_compatible(&self) -> bool { false }
141
142    /// Feeds given portion of string to the encoder,
143    /// pushes the an encoded byte sequence at the end of the given output,
144    /// and returns a byte offset to the first unprocessed character
145    /// (that can be zero when the first such character appeared in the prior calls to `raw_feed`)
146    /// and optional error information (None means success).
147    fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>);
148
149    /// Finishes the encoder,
150    /// pushes the an encoded byte sequence at the end of the given output,
151    /// and returns optional error information (None means success).
152    /// `remaining` value of the error information, if any, is always an empty string.
153    fn raw_finish(&mut self, output: &mut ByteWriter) -> Option<CodecError>;
154}
155
156/// Decoder converting a byte sequence into a Unicode string.
157/// This is a lower level interface, and normally `Encoding::decode` should be used instead.
158pub trait RawDecoder: 'static {
159    /// Creates a fresh `RawDecoder` instance which parameters are same as `self`.
160    fn from_self(&self) -> Box<RawDecoder>;
161
162    /// Returns true if this encoding is compatible to ASCII,
163    /// i.e. bytes 00 through 7F always map to U+0000 through U+007F and nothing else.
164    fn is_ascii_compatible(&self) -> bool { false }
165
166    /// Feeds given portion of byte sequence to the encoder,
167    /// pushes the a decoded string at the end of the given output,
168    /// and returns an offset to the first unprocessed byte
169    /// (that can be zero when the first such byte appeared in the prior calls to `raw_feed`)
170    /// and optional error information (None means success).
171    fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>);
172
173    /// Finishes the decoder,
174    /// pushes the a decoded string at the end of the given output,
175    /// and returns optional error information (None means success).
176    fn raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError>;
177}
178
/// A trait object using dynamic dispatch which is a sendable reference to the encoding,
/// for code where the encoding is not known at compile-time.
///
/// The reference has the `'static` lifetime and the referent is additionally required to be
/// `Send + Sync`, as spelled out in the alias itself.
pub type EncodingRef = &'static (Encoding + Send + Sync);
182
183/// Character encoding.
184pub trait Encoding {
185    /// Returns the canonical name of given encoding.
186    /// This name is guaranteed to be unique across built-in encodings,
187    /// but it is not normative and would be at most arbitrary.
188    fn name(&self) -> &'static str;
189
190    /// Returns a name of given encoding defined in the WHATWG Encoding standard, if any.
191    /// This name often differs from `name` due to the compatibility reason.
192    fn whatwg_name(&self) -> Option<&'static str> { None }
193
194    /// Creates a new encoder.
195    fn raw_encoder(&self) -> Box<RawEncoder>;
196
197    /// Creates a new decoder.
198    fn raw_decoder(&self) -> Box<RawDecoder>;
199
200    /// An easy-to-use interface to `RawEncoder`.
201    /// On the encoder error `trap` is called,
202    /// which may return a replacement sequence to continue processing,
203    /// or a failure to return the error.
204    fn encode(&self, input: &str, trap: EncoderTrap) -> Result<Vec<u8>, Cow<'static, str>> {
205        let mut ret = Vec::new();
206        self.encode_to(input, trap, &mut ret).map(|_| ret)
207    }
208
209    /// Encode into a `ByteWriter`.
210    fn encode_to(&self, input: &str, trap: EncoderTrap, ret: &mut ByteWriter)
211        -> Result<(), Cow<'static, str>>
212    {
213        // we don't need to keep `unprocessed` here;
214        // `raw_feed` should process as much input as possible.
215        let mut encoder = self.raw_encoder();
216        let mut remaining = 0;
217
218        loop {
219            let (offset, err) = encoder.raw_feed(&input[remaining..], ret);
220            let unprocessed = remaining + offset;
221            match err {
222                Some(err) => {
223                    remaining = (remaining as isize + err.upto) as usize;
224                    if !trap.trap(&mut *encoder, &input[unprocessed..remaining], ret) {
225                        return Err(err.cause);
226                    }
227                }
228                None => {
229                    remaining = input.len();
230                    match encoder.raw_finish(ret) {
231                        Some(err) => {
232                            remaining = (remaining as isize + err.upto) as usize;
233                            if !trap.trap(&mut *encoder, &input[unprocessed..remaining], ret) {
234                                return Err(err.cause);
235                            }
236                        }
237                        None => {}
238                    }
239                    if remaining >= input.len() { return Ok(()); }
240                }
241            }
242        }
243    }
244
245    /// An easy-to-use interface to `RawDecoder`.
246    /// On the decoder error `trap` is called,
247    /// which may return a replacement string to continue processing,
248    /// or a failure to return the error.
249    fn decode(&self, input: &[u8], trap: DecoderTrap) -> Result<String, Cow<'static, str>> {
250        let mut ret = String::new();
251        self.decode_to(input, trap, &mut ret).map(|_| ret)
252    }
253
254    /// Decode into a `StringWriter`.
255    ///
256    /// This does *not* handle partial characters at the beginning or end of `input`!
257    /// Use `RawDecoder` for incremental decoding.
258    fn decode_to(&self, input: &[u8], trap: DecoderTrap, ret: &mut StringWriter)
259        -> Result<(), Cow<'static, str>>
260    {
261        // we don't need to keep `unprocessed` here;
262        // `raw_feed` should process as much input as possible.
263        let mut decoder = self.raw_decoder();
264        let mut remaining = 0;
265
266        loop {
267            let (offset, err) = decoder.raw_feed(&input[remaining..], ret);
268            let unprocessed = remaining + offset;
269            match err {
270                Some(err) => {
271                    remaining = (remaining as isize + err.upto) as usize;
272                    if !trap.trap(&mut *decoder, &input[unprocessed..remaining], ret) {
273                        return Err(err.cause);
274                    }
275                }
276                None => {
277                    remaining = input.len();
278                    match decoder.raw_finish(ret) {
279                        Some(err) => {
280                            remaining = (remaining as isize + err.upto) as usize;
281                            if !trap.trap(&mut *decoder, &input[unprocessed..remaining], ret) {
282                                return Err(err.cause);
283                            }
284                        }
285                        None => {}
286                    }
287                    if remaining >= input.len() { return Ok(()); }
288                }
289            }
290        }
291    }
292}
293
/// A type of the bare function in `EncoderTrap` values.
///
/// Such a function is given the current encoder, the input handed to the trap
/// and the output writer, and returns `true` only when it is fine to keep going
/// (see `EncoderTrap::Call`).
pub type EncoderTrapFunc =
    extern "Rust" fn(encoder: &mut RawEncoder, input: &str, output: &mut ByteWriter) -> bool;
297
/// A type of the bare function in `DecoderTrap` values.
///
/// Such a function is given the current decoder, the input handed to the trap
/// and the output writer, and returns `true` only when it is fine to keep going
/// (see `DecoderTrap::Call`).
pub type DecoderTrapFunc =
    extern "Rust" fn(decoder: &mut RawDecoder, input: &[u8], output: &mut StringWriter) -> bool;
301
/// Trap, which handles decoder errors.
///
/// A trap is consulted through `DecoderTrap::trap` whenever a decoder reports an error;
/// the variant decides whether decoding stops or continues.
#[derive(Copy)]
pub enum DecoderTrap {
    /// Immediately fails on errors.
    /// Corresponds to WHATWG "fatal" error algorithm.
    Strict,
    /// Replaces an error with a U+FFFD (decoder).
    /// Corresponds to WHATWG "replacement" error algorithm.
    Replace,
    /// Silently ignores an error, effectively replacing it with an empty sequence.
    Ignore,
    /// Calls given function to handle decoder errors.
    /// The function is given the current decoder, input and output writer,
    /// and should return true only when it is fine to keep going.
    Call(DecoderTrapFunc),
}
318
319impl DecoderTrap {
320    /// Handles a decoder error. May write to the output writer.
321    /// Returns true only when it is fine to keep going.
322    pub fn trap(&self, decoder: &mut RawDecoder, input: &[u8], output: &mut StringWriter) -> bool {
323        match *self {
324            DecoderTrap::Strict     => false,
325            DecoderTrap::Replace    => { output.write_char('\u{fffd}'); true },
326            DecoderTrap::Ignore     => true,
327            DecoderTrap::Call(func) => func(decoder, input, output),
328        }
329    }
330}
331
332impl Clone for DecoderTrap {
333    fn clone(&self) -> DecoderTrap {
334        match *self {
335            DecoderTrap::Strict => DecoderTrap::Strict,
336            DecoderTrap::Replace => DecoderTrap::Replace,
337            DecoderTrap::Ignore => DecoderTrap::Ignore,
338            DecoderTrap::Call(f) => DecoderTrap::Call(f),
339        }
340    }
341}
342
/// Trap, which handles encoder errors.
///
/// A trap is consulted through `EncoderTrap::trap` whenever an encoder reports an error;
/// the variant decides whether encoding stops or continues.
#[derive(Copy)]
pub enum EncoderTrap {
    /// Immediately fails on errors.
    /// Corresponds to WHATWG "fatal" error algorithm.
    Strict,
    /// Replaces an error with `?` in given encoding.
    /// Note that this fails when `?` cannot be represented in given encoding.
    /// Corresponds to WHATWG "URL" error algorithms.
    Replace,
    /// Silently ignores an error, effectively replacing it with an empty sequence.
    Ignore,
    /// Replaces an error with XML numeric character references (e.g. `&#1234;`).
    /// The encoder trap fails when NCRs cannot be represented in given encoding.
    /// Corresponds to WHATWG "<form>" error algorithms.
    NcrEscape,
    /// Calls given function to handle encoder errors.
    /// The function is given the current encoder, input and output writer,
    /// and should return true only when it is fine to keep going.
    Call(EncoderTrapFunc),
}
363
364impl EncoderTrap {
365    /// Handles an encoder error. May write to the output writer.
366    /// Returns true only when it is fine to keep going.
367    pub fn trap(&self, encoder: &mut RawEncoder, input: &str, output: &mut ByteWriter) -> bool {
368        fn reencode(encoder: &mut RawEncoder, input: &str, output: &mut ByteWriter,
369                    trapname: &str) -> bool {
370            if encoder.is_ascii_compatible() { // optimization!
371                output.write_bytes(input.as_bytes());
372            } else {
373                let (_, err) = encoder.raw_feed(input, output);
374                if err.is_some() {
375                    panic!("{} cannot reencode a replacement string", trapname);
376                }
377            }
378            true
379        }
380
381        match *self {
382            EncoderTrap::Strict     => false,
383            EncoderTrap::Replace    => reencode(encoder, "?", output, "Replace"),
384            EncoderTrap::Ignore     => true,
385            EncoderTrap::NcrEscape  => {
386                let mut escapes = String::new();
387                for ch in input.chars() {
388                    escapes.push_str(&format!("&#{};", ch as isize));
389                }
390                reencode(encoder, &escapes, output, "NcrEscape")
391            },
392            EncoderTrap::Call(func) => func(encoder, input, output),
393        }
394    }
395}
396
397impl Clone for EncoderTrap {
398    fn clone(&self) -> EncoderTrap {
399        match *self {
400            EncoderTrap::Strict => EncoderTrap::Strict,
401            EncoderTrap::Replace => EncoderTrap::Replace,
402            EncoderTrap::Ignore => EncoderTrap::Ignore,
403            EncoderTrap::NcrEscape => EncoderTrap::NcrEscape,
404            EncoderTrap::Call(f) => EncoderTrap::Call(f),
405        }
406    }
407}
408
409/// Determine the encoding by looking for a Byte Order Mark (BOM)
410/// and decoded a single string in memory.
411/// Return the result and the used encoding.
412pub fn decode(input: &[u8], trap: DecoderTrap, fallback_encoding: EncodingRef)
413           -> (Result<String, Cow<'static, str>>, EncodingRef) {
414    use all::{UTF_8, UTF_16LE, UTF_16BE};
415    if input.starts_with(&[0xEF, 0xBB, 0xBF]) {
416        (UTF_8.decode(&input[3..], trap), UTF_8 as EncodingRef)
417    } else if input.starts_with(&[0xFE, 0xFF]) {
418        (UTF_16BE.decode(&input[2..], trap), UTF_16BE as EncodingRef)
419    } else if input.starts_with(&[0xFF, 0xFE]) {
420        (UTF_16LE.decode(&input[2..], trap), UTF_16LE as EncodingRef)
421    } else {
422        (fallback_encoding.decode(input, trap), fallback_encoding)
423    }
424}
425
#[cfg(test)]
mod tests {
    use super::*;
    use super::EncoderTrap::NcrEscape;
    use util::StrCharIndex;
    use std::convert::Into;

    // Text shared by all tests: plain ASCII with a single U+203D in the middle.
    const SAMPLE: &'static str = "Hello\u{203d} I'm fine.";

    // A contrived encoder: behaves like ASCII, except that it writes `prepend` before each
    // character while inside a pair of "e"s (so `widespread` turns into `wide*s*p*r*ead` and
    // `eeeeasel` into `e*ee*ease*l`, with `*` standing for `prepend`), and it refuses to
    // encode the `prohibit` character.
    struct MyEncoder { flag: bool, prohibit: char, prepend: &'static str, toggle: bool }

    impl MyEncoder {
        // Builds an encoder in its initial state, i.e. with `toggle` switched off.
        fn fresh(flag: bool, prohibit: char, prepend: &'static str) -> MyEncoder {
            MyEncoder { flag: flag, prohibit: prohibit, prepend: prepend, toggle: false }
        }
    }

    impl RawEncoder for MyEncoder {
        fn from_self(&self) -> Box<RawEncoder> {
            Box::new(MyEncoder::fresh(self.flag, self.prohibit, self.prepend))
        }

        fn is_ascii_compatible(&self) -> bool { self.flag }

        fn raw_feed(&mut self, input: &str,
                    output: &mut ByteWriter) -> (usize, Option<CodecError>) {
            for ((start, end), ch) in input.index_iter() {
                let encodable = ch <= '\u{7f}' && ch != self.prohibit;
                if !encodable {
                    // Report the offending character as the problematic sequence.
                    let err = CodecError { upto: end as isize, cause: "!!!".into() };
                    return (start, Some(err));
                }

                if self.toggle && !self.prepend.is_empty() {
                    output.write_bytes(self.prepend.as_bytes());
                }
                output.write_byte(ch as u8);
                if ch == 'e' {
                    self.toggle = !self.toggle;
                }
            }
            (input.len(), None)
        }

        fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> { None }
    }

    // Encoding that only knows how to hand out `MyEncoder`s.
    struct MyEncoding { flag: bool, prohibit: char, prepend: &'static str }

    impl Encoding for MyEncoding {
        fn name(&self) -> &'static str { "my encoding" }

        fn raw_encoder(&self) -> Box<RawEncoder> {
            Box::new(MyEncoder::fresh(self.flag, self.prohibit, self.prepend))
        }

        fn raw_decoder(&self) -> Box<RawDecoder> { panic!("not supported") }
    }

    #[test]
    fn test_reencoding_trap_with_ascii_compatible_encoding() {
        static COMPAT: &'static MyEncoding =
            &MyEncoding { flag: true, prohibit: '\u{80}', prepend: "" };
        static INCOMPAT: &'static MyEncoding =
            &MyEncoding { flag: false, prohibit: '\u{80}', prepend: "" };

        // with an empty `prepend` both code paths must produce the very same bytes.
        assert_eq!(COMPAT.encode(SAMPLE, NcrEscape),
                   Ok(b"Hello&#8253; I'm fine.".to_vec()));
        assert_eq!(INCOMPAT.encode(SAMPLE, NcrEscape),
                   Ok(b"Hello&#8253; I'm fine.".to_vec()));
    }

    #[test]
    fn test_reencoding_trap_with_ascii_incompatible_encoding() {
        static COMPAT: &'static MyEncoding =
            &MyEncoding { flag: true, prohibit: '\u{80}', prepend: "*" };
        static INCOMPAT: &'static MyEncoding =
            &MyEncoding { flag: false, prohibit: '\u{80}', prepend: "*" };

        // the "compatible" variant lies about itself, so its escape is written verbatim
        // and the result is (deliberately) wrong for this encoding.
        assert_eq!(COMPAT.encode(SAMPLE, NcrEscape),
                   Ok(b"He*l*l*o&#8253;* *I*'*m* *f*i*n*e.".to_vec()));
        assert_eq!(INCOMPAT.encode(SAMPLE, NcrEscape),
                   Ok(b"He*l*l*o*&*#*8*2*5*3*;* *I*'*m* *f*i*n*e.".to_vec()));
    }

    #[test]
    #[should_panic]
    fn test_reencoding_trap_can_fail() {
        static FAIL: &'static MyEncoding = &MyEncoding { flag: false, prohibit: '&', prepend: "" };

        // `&` is prohibited in this contrived encoding, so the escape cannot be re-encoded
        let _ = FAIL.encode(SAMPLE, NcrEscape);
    }
}