encoding/types.rs
1// This is a part of rust-encoding.
2// Copyright (c) 2013-2015, Kang Seonghoon.
3// See README.md and LICENSE.txt for details.
4
5/*!
6 * Interface to the character encoding.
7 *
8 * # Raw incremental interface
9 *
 * Methods whose names start with `raw_` constitute the raw incremental interface,
 * the lowest-level API available for encoders and decoders.
 * This interface divides the entire input into four parts:
13 *
14 * - **Processed** bytes do not affect the future result.
15 * - **Unprocessed** bytes may affect the future result
16 * and can be a part of problematic sequence according to the future input.
17 * - **Problematic** byte is the first byte that causes an error condition.
18 * - **Remaining** bytes are not yet processed nor read,
19 * so the caller should feed any remaining bytes again.
20 *
21 * The following figure illustrates an example of successive `raw_feed` calls:
22 *
 * ````notrust
 * 1st raw_feed   :2nd raw_feed   :3rd raw_feed
 * ----------+----:---------------:--+--+---------
 *           |    :               :  |  |
 * ----------+----:---------------:--+--+---------
 * processed  unprocessed             |  remaining
 *                               problematic
 * ````
31 *
 * Since these parts can span multiple input sequences passed to `raw_feed`,
 * `raw_feed` returns two offsets (one optional)
 * with which the caller can track the problematic sequence.
35 * The first offset (the first `usize` in the tuple) points to the first unprocessed bytes,
36 * or is zero when unprocessed bytes have started before the current call.
37 * (The first unprocessed byte can also be at offset 0,
38 * which doesn't make a difference for the caller.)
39 * The second offset (`upto` field in the `CodecError` struct), if any,
40 * points to the first remaining bytes.
41 *
42 * If the caller needs to recover the error via the problematic sequence,
43 * then the caller starts to save the unprocessed bytes when the first offset < the input length,
44 * appends any new unprocessed bytes while the first offset is zero,
 * and discards unprocessed bytes when the first offset becomes non-zero
46 * while saving new unprocessed bytes when the first offset < the input length.
47 * Then the caller checks for the error condition
48 * and can use the saved unprocessed bytes for error recovery.
49 * Alternatively, if the caller only wants to replace the problematic sequence
50 * with a fixed string (like U+FFFD),
51 * then it can just discard the first sequence and can emit the fixed string on an error.
52 * It still has to feed the input bytes starting at the second offset again.
53 */
54use std::borrow::Cow;
55
/// Error information from either encoder or decoder.
///
/// Values of this type are returned (wrapped in `Option`) by the `raw_feed` and `raw_finish`
/// methods of `RawEncoder` and `RawDecoder` to report where and why a conversion failed.
pub struct CodecError {
    /// The byte position of the first remaining byte, with respect to the *current* input.
    /// For the `finish` call, this should be no more than zero (since there is no input).
    /// It can be negative if the remaining byte is in the prior inputs,
    /// as long as the remaining byte is not yet processed.
    /// The caller should feed the bytes starting from this point again
    /// in order to continue encoding or decoding after an error.
    pub upto: isize,
    /// A human-readable cause of the error.
    /// Being a `Cow`, it can hold either a static string literal or an owned `String`.
    pub cause: Cow<'static, str>,
}
68
/// Sink receiving the bytes produced by encoders.
/// In most cases this will be an owned vector of `u8`.
pub trait ByteWriter {
    /// Announces an expected lower bound on the number of bytes
    /// that will be written before the next call to `writer_hint`,
    /// giving the writer a chance to reserve memory up front.
    /// `RawEncoder`s are recommended but not required to call this method
    /// with an appropriate estimate.
    /// The default implementation ignores the hint.
    fn writer_hint(&mut self, _expectedlen: usize) {}

    /// Appends one byte to the output.
    fn write_byte(&mut self, b: u8);

    /// Appends all bytes of `v`, in order, to the output.
    fn write_bytes(&mut self, v: &[u8]);
}

/// An owned byte vector is the canonical `ByteWriter`: output is appended at its end.
impl ByteWriter for Vec<u8> {
    /// Reserves room for at least `expectedlen` more bytes.
    fn writer_hint(&mut self, expectedlen: usize) {
        Vec::reserve(self, expectedlen);
    }

    /// Pushes `b` at the end of the vector.
    fn write_byte(&mut self, b: u8) {
        Vec::push(self, b);
    }

    /// Copies the whole slice to the end of the vector.
    fn write_bytes(&mut self, v: &[u8]) {
        self.extend_from_slice(v);
    }
}
99
/// String writer used by decoders. In most cases this will be an owned string.
pub trait StringWriter {
    /// Hints an expected lower bound on the length (in bytes) of the output
    /// until the next call to `writer_hint`,
    /// so that the writer can reserve the memory for writing.
    /// `RawDecoder`s are recommended but not required to call this method
    /// with an appropriate estimate.
    /// By default this method does nothing.
    fn writer_hint(&mut self, _expectedlen: usize) {}

    /// Writes a single character.
    fn write_char(&mut self, c: char);

    /// Writes a string.
    fn write_str(&mut self, s: &str);
}

/// An owned string is the canonical `StringWriter`: output is appended at its end.
impl StringWriter for String {
    /// Reserves room for at least `expectedlen` more bytes of output.
    fn writer_hint(&mut self, expectedlen: usize) {
        // `String::reserve` takes the number of *additional* bytes beyond the current length,
        // so the hint is passed through as is. Adding `self.len()` first would over-reserve
        // by the amount of text already written.
        self.reserve(expectedlen);
    }

    /// Pushes `c` at the end of the string.
    fn write_char(&mut self, c: char) {
        self.push(c);
    }

    /// Appends `s` at the end of the string.
    fn write_str(&mut self, s: &str) {
        self.push_str(s);
    }
}
131
/// Encoder converting a Unicode string into a byte sequence.
/// This is a lower level interface, and normally `Encoding::encode` should be used instead.
pub trait RawEncoder: 'static {
    /// Creates a fresh `RawEncoder` instance whose parameters are the same as those of `self`.
    fn from_self(&self) -> Box<RawEncoder>;

    /// Returns true if this encoding is compatible to ASCII,
    /// i.e. U+0000 through U+007F always map to bytes 00 through 7F and nothing else.
    /// The default is `false`.
    /// `EncoderTrap` relies on this to write ASCII replacement strings directly to the output
    /// instead of feeding them through the encoder again.
    fn is_ascii_compatible(&self) -> bool { false }

    /// Feeds given portion of string to the encoder,
    /// pushes an encoded byte sequence at the end of the given output,
    /// and returns a byte offset to the first unprocessed character
    /// (that can be zero when the first such character appeared in the prior calls to `raw_feed`)
    /// and optional error information (None means success).
    fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>);

    /// Finishes the encoder,
    /// pushes an encoded byte sequence at the end of the given output,
    /// and returns optional error information (None means success).
    /// The `upto` field of the error information, if any, should be no more than zero
    /// since this call has no input of its own.
    fn raw_finish(&mut self, output: &mut ByteWriter) -> Option<CodecError>;
}
155
/// Decoder converting a byte sequence into a Unicode string.
/// This is a lower level interface, and normally `Encoding::decode` should be used instead.
pub trait RawDecoder: 'static {
    /// Creates a fresh `RawDecoder` instance whose parameters are the same as those of `self`.
    fn from_self(&self) -> Box<RawDecoder>;

    /// Returns true if this encoding is compatible to ASCII,
    /// i.e. bytes 00 through 7F always map to U+0000 through U+007F and nothing else.
    /// The default is `false`.
    fn is_ascii_compatible(&self) -> bool { false }

    /// Feeds given portion of byte sequence to the decoder,
    /// pushes a decoded string at the end of the given output,
    /// and returns an offset to the first unprocessed byte
    /// (that can be zero when the first such byte appeared in the prior calls to `raw_feed`)
    /// and optional error information (None means success).
    fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>);

    /// Finishes the decoder,
    /// pushes a decoded string at the end of the given output,
    /// and returns optional error information (None means success).
    fn raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError>;
}
178
/// A `'static` reference to an `Encoding` trait object (dynamic dispatch)
/// that is additionally `Send` and `Sync`,
/// for code where the encoding is not known at compile-time.
pub type EncodingRef = &'static (Encoding + Send + Sync);
182
183/// Character encoding.
184pub trait Encoding {
185 /// Returns the canonical name of given encoding.
186 /// This name is guaranteed to be unique across built-in encodings,
187 /// but it is not normative and would be at most arbitrary.
188 fn name(&self) -> &'static str;
189
190 /// Returns a name of given encoding defined in the WHATWG Encoding standard, if any.
191 /// This name often differs from `name` due to the compatibility reason.
192 fn whatwg_name(&self) -> Option<&'static str> { None }
193
194 /// Creates a new encoder.
195 fn raw_encoder(&self) -> Box<RawEncoder>;
196
197 /// Creates a new decoder.
198 fn raw_decoder(&self) -> Box<RawDecoder>;
199
200 /// An easy-to-use interface to `RawEncoder`.
201 /// On the encoder error `trap` is called,
202 /// which may return a replacement sequence to continue processing,
203 /// or a failure to return the error.
204 fn encode(&self, input: &str, trap: EncoderTrap) -> Result<Vec<u8>, Cow<'static, str>> {
205 let mut ret = Vec::new();
206 self.encode_to(input, trap, &mut ret).map(|_| ret)
207 }
208
209 /// Encode into a `ByteWriter`.
210 fn encode_to(&self, input: &str, trap: EncoderTrap, ret: &mut ByteWriter)
211 -> Result<(), Cow<'static, str>>
212 {
213 // we don't need to keep `unprocessed` here;
214 // `raw_feed` should process as much input as possible.
215 let mut encoder = self.raw_encoder();
216 let mut remaining = 0;
217
218 loop {
219 let (offset, err) = encoder.raw_feed(&input[remaining..], ret);
220 let unprocessed = remaining + offset;
221 match err {
222 Some(err) => {
223 remaining = (remaining as isize + err.upto) as usize;
224 if !trap.trap(&mut *encoder, &input[unprocessed..remaining], ret) {
225 return Err(err.cause);
226 }
227 }
228 None => {
229 remaining = input.len();
230 match encoder.raw_finish(ret) {
231 Some(err) => {
232 remaining = (remaining as isize + err.upto) as usize;
233 if !trap.trap(&mut *encoder, &input[unprocessed..remaining], ret) {
234 return Err(err.cause);
235 }
236 }
237 None => {}
238 }
239 if remaining >= input.len() { return Ok(()); }
240 }
241 }
242 }
243 }
244
245 /// An easy-to-use interface to `RawDecoder`.
246 /// On the decoder error `trap` is called,
247 /// which may return a replacement string to continue processing,
248 /// or a failure to return the error.
249 fn decode(&self, input: &[u8], trap: DecoderTrap) -> Result<String, Cow<'static, str>> {
250 let mut ret = String::new();
251 self.decode_to(input, trap, &mut ret).map(|_| ret)
252 }
253
254 /// Decode into a `StringWriter`.
255 ///
256 /// This does *not* handle partial characters at the beginning or end of `input`!
257 /// Use `RawDecoder` for incremental decoding.
258 fn decode_to(&self, input: &[u8], trap: DecoderTrap, ret: &mut StringWriter)
259 -> Result<(), Cow<'static, str>>
260 {
261 // we don't need to keep `unprocessed` here;
262 // `raw_feed` should process as much input as possible.
263 let mut decoder = self.raw_decoder();
264 let mut remaining = 0;
265
266 loop {
267 let (offset, err) = decoder.raw_feed(&input[remaining..], ret);
268 let unprocessed = remaining + offset;
269 match err {
270 Some(err) => {
271 remaining = (remaining as isize + err.upto) as usize;
272 if !trap.trap(&mut *decoder, &input[unprocessed..remaining], ret) {
273 return Err(err.cause);
274 }
275 }
276 None => {
277 remaining = input.len();
278 match decoder.raw_finish(ret) {
279 Some(err) => {
280 remaining = (remaining as isize + err.upto) as usize;
281 if !trap.trap(&mut *decoder, &input[unprocessed..remaining], ret) {
282 return Err(err.cause);
283 }
284 }
285 None => {}
286 }
287 if remaining >= input.len() { return Ok(()); }
288 }
289 }
290 }
291 }
292}
293
/// A type of the bare function in `EncoderTrap` values.
///
/// The function receives the current encoder, the problematic part of the input
/// and the output writer, and returns true only when it is fine to keep going.
pub type EncoderTrapFunc =
    extern "Rust" fn(encoder: &mut RawEncoder, input: &str, output: &mut ByteWriter) -> bool;
297
/// A type of the bare function in `DecoderTrap` values.
///
/// The function receives the current decoder, the problematic part of the input
/// and the output writer, and returns true only when it is fine to keep going.
pub type DecoderTrapFunc =
    extern "Rust" fn(decoder: &mut RawDecoder, input: &[u8], output: &mut StringWriter) -> bool;
301
302/// Trap, which handles decoder errors.
303#[derive(Copy)]
304pub enum DecoderTrap {
305 /// Immediately fails on errors.
306 /// Corresponds to WHATWG "fatal" error algorithm.
307 Strict,
308 /// Replaces an error with a U+FFFD (decoder).
309 /// Corresponds to WHATWG "replacement" error algorithm.
310 Replace,
311 /// Silently ignores an error, effectively replacing it with an empty sequence.
312 Ignore,
313 /// Calls given function to handle decoder errors.
314 /// The function is given the current decoder, input and output writer,
315 /// and should return true only when it is fine to keep going.
316 Call(DecoderTrapFunc),
317}
318
319impl DecoderTrap {
320 /// Handles a decoder error. May write to the output writer.
321 /// Returns true only when it is fine to keep going.
322 pub fn trap(&self, decoder: &mut RawDecoder, input: &[u8], output: &mut StringWriter) -> bool {
323 match *self {
324 DecoderTrap::Strict => false,
325 DecoderTrap::Replace => { output.write_char('\u{fffd}'); true },
326 DecoderTrap::Ignore => true,
327 DecoderTrap::Call(func) => func(decoder, input, output),
328 }
329 }
330}
331
332impl Clone for DecoderTrap {
333 fn clone(&self) -> DecoderTrap {
334 match *self {
335 DecoderTrap::Strict => DecoderTrap::Strict,
336 DecoderTrap::Replace => DecoderTrap::Replace,
337 DecoderTrap::Ignore => DecoderTrap::Ignore,
338 DecoderTrap::Call(f) => DecoderTrap::Call(f),
339 }
340 }
341}
342
343#[derive(Copy)]
344pub enum EncoderTrap {
345 /// Immediately fails on errors.
346 /// Corresponds to WHATWG "fatal" error algorithm.
347 Strict,
348 /// Replaces an error with `?` in given encoding.
349 /// Note that this fails when `?` cannot be represented in given encoding.
350 /// Corresponds to WHATWG "URL" error algorithms.
351 Replace,
352 /// Silently ignores an error, effectively replacing it with an empty sequence.
353 Ignore,
354 /// Replaces an error with XML numeric character references (e.g. `Ӓ`).
355 /// The encoder trap fails when NCRs cannot be represented in given encoding.
356 /// Corresponds to WHATWG "<form>" error algorithms.
357 NcrEscape,
358 /// Calls given function to handle encoder errors.
359 /// The function is given the current encoder, input and output writer,
360 /// and should return true only when it is fine to keep going.
361 Call(EncoderTrapFunc),
362}
363
364impl EncoderTrap {
365 /// Handles an encoder error. May write to the output writer.
366 /// Returns true only when it is fine to keep going.
367 pub fn trap(&self, encoder: &mut RawEncoder, input: &str, output: &mut ByteWriter) -> bool {
368 fn reencode(encoder: &mut RawEncoder, input: &str, output: &mut ByteWriter,
369 trapname: &str) -> bool {
370 if encoder.is_ascii_compatible() { // optimization!
371 output.write_bytes(input.as_bytes());
372 } else {
373 let (_, err) = encoder.raw_feed(input, output);
374 if err.is_some() {
375 panic!("{} cannot reencode a replacement string", trapname);
376 }
377 }
378 true
379 }
380
381 match *self {
382 EncoderTrap::Strict => false,
383 EncoderTrap::Replace => reencode(encoder, "?", output, "Replace"),
384 EncoderTrap::Ignore => true,
385 EncoderTrap::NcrEscape => {
386 let mut escapes = String::new();
387 for ch in input.chars() {
388 escapes.push_str(&format!("&#{};", ch as isize));
389 }
390 reencode(encoder, &escapes, output, "NcrEscape")
391 },
392 EncoderTrap::Call(func) => func(encoder, input, output),
393 }
394 }
395}
396
397impl Clone for EncoderTrap {
398 fn clone(&self) -> EncoderTrap {
399 match *self {
400 EncoderTrap::Strict => EncoderTrap::Strict,
401 EncoderTrap::Replace => EncoderTrap::Replace,
402 EncoderTrap::Ignore => EncoderTrap::Ignore,
403 EncoderTrap::NcrEscape => EncoderTrap::NcrEscape,
404 EncoderTrap::Call(f) => EncoderTrap::Call(f),
405 }
406 }
407}
408
409/// Determine the encoding by looking for a Byte Order Mark (BOM)
410/// and decoded a single string in memory.
411/// Return the result and the used encoding.
412pub fn decode(input: &[u8], trap: DecoderTrap, fallback_encoding: EncodingRef)
413 -> (Result<String, Cow<'static, str>>, EncodingRef) {
414 use all::{UTF_8, UTF_16LE, UTF_16BE};
415 if input.starts_with(&[0xEF, 0xBB, 0xBF]) {
416 (UTF_8.decode(&input[3..], trap), UTF_8 as EncodingRef)
417 } else if input.starts_with(&[0xFE, 0xFF]) {
418 (UTF_16BE.decode(&input[2..], trap), UTF_16BE as EncodingRef)
419 } else if input.starts_with(&[0xFF, 0xFE]) {
420 (UTF_16LE.decode(&input[2..], trap), UTF_16LE as EncodingRef)
421 } else {
422 (fallback_encoding.decode(input, trap), fallback_encoding)
423 }
424}
425
#[cfg(test)]
mod tests {
    use super::*;
    use super::EncoderTrap::NcrEscape;
    use util::StrCharIndex;
    use std::convert::Into;

    // a contrived encoding example: same as ASCII, but inserts `prepend` between each character
    // within two "e"s (so that `widespread` becomes `wide*s*p*r*ead` and `eeeeasel` becomes
    // `e*ee*ease*l` where `*` is substituted by `prepend`) and prohibits `prohibit` character.
    struct MyEncoder { flag: bool, prohibit: char, prepend: &'static str, toggle: bool }
    impl RawEncoder for MyEncoder {
        fn from_self(&self) -> Box<RawEncoder> {
            Box::new(MyEncoder { flag: self.flag,
                                 prohibit: self.prohibit,
                                 prepend: self.prepend,
                                 toggle: false })
        }
        // deliberately configurable, so that the tests can lie about ASCII compatibility
        fn is_ascii_compatible(&self) -> bool { self.flag }
        fn raw_feed(&mut self, input: &str,
                    output: &mut ByteWriter) -> (usize, Option<CodecError>) {
            for ((i,j), ch) in input.index_iter() {
                if ch <= '\u{7f}' && ch != self.prohibit {
                    if self.toggle && !self.prepend.is_empty() {
                        output.write_bytes(self.prepend.as_bytes());
                    }
                    output.write_byte(ch as u8);
                    if ch == 'e' {
                        self.toggle = !self.toggle;
                    }
                } else {
                    // `i..j` is the byte range of the offending character
                    return (i, Some(CodecError { upto: j as isize,
                                                 cause: "!!!".into() }));
                }
            }
            (input.len(), None)
        }
        fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> { None }
    }

    struct MyEncoding { flag: bool, prohibit: char, prepend: &'static str }
    impl Encoding for MyEncoding {
        fn name(&self) -> &'static str { "my encoding" }
        fn raw_encoder(&self) -> Box<RawEncoder> {
            Box::new(MyEncoder { flag: self.flag,
                                 prohibit: self.prohibit,
                                 prepend: self.prepend,
                                 toggle: false })
        }
        fn raw_decoder(&self) -> Box<RawDecoder> { panic!("not supported") }
    }

    #[test]
    fn test_reencoding_trap_with_ascii_compatible_encoding() {
        static COMPAT: &'static MyEncoding =
            &MyEncoding { flag: true, prohibit: '\u{80}', prepend: "" };
        static INCOMPAT: &'static MyEncoding =
            &MyEncoding { flag: false, prohibit: '\u{80}', prepend: "" };

        // U+203D is not encodable, so it must come out as the NCR `&#8253;` (0x203D == 8253)
        assert_eq!(COMPAT.encode("Hello\u{203d} I'm fine.", NcrEscape),
                   Ok(b"Hello&#8253; I'm fine.".to_vec()));
        assert_eq!(INCOMPAT.encode("Hello\u{203d} I'm fine.", NcrEscape),
                   Ok(b"Hello&#8253; I'm fine.".to_vec()));
    }

    #[test]
    fn test_reencoding_trap_with_ascii_incompatible_encoding() {
        static COMPAT: &'static MyEncoding =
            &MyEncoding { flag: true, prohibit: '\u{80}', prepend: "*" };
        static INCOMPAT: &'static MyEncoding =
            &MyEncoding { flag: false, prohibit: '\u{80}', prepend: "*" };

        // this should behave incorrectly as the encoding broke the assumption.
        // (the NCR is written verbatim, without `*` between its characters)
        assert_eq!(COMPAT.encode("Hello\u{203d} I'm fine.", NcrEscape),
                   Ok(b"He*l*l*o&#8253;* *I*'*m* *f*i*n*e.".to_vec()));
        assert_eq!(INCOMPAT.encode("Hello\u{203d} I'm fine.", NcrEscape),
                   Ok(b"He*l*l*o*&*#*8*2*5*3*;* *I*'*m* *f*i*n*e.".to_vec()));
    }

    #[test]
    #[should_panic]
    fn test_reencoding_trap_can_fail() {
        static FAIL: &'static MyEncoding = &MyEncoding { flag: false, prohibit: '&', prepend: "" };

        // this should fail as this contrived encoding does not support `&` at all
        let _ = FAIL.encode("Hello\u{203d} I'm fine.", NcrEscape);
    }
}
513}