1use super::*;
11use crate::ascii::*;
12use crate::data::position;
13use crate::handles::*;
14use crate::variant::*;
15
/// Decoder for a single-byte legacy encoding.
///
/// The only data it holds is the lookup table for the upper half of the byte
/// range; the decode methods never mutate it.
pub struct SingleByteDecoder {
    /// Code unit for each byte 0x80..=0xFF, indexed by `byte - 0x80`.
    /// The decode methods treat a zero entry as "unmapped" (malformed input).
    table: &'static [u16; 128],
}
19
20impl SingleByteDecoder {
21 pub fn new(data: &'static [u16; 128]) -> VariantDecoder {
22 VariantDecoder::SingleByte(SingleByteDecoder { table: data })
23 }
24
25 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
26 Some(byte_length)
27 }
28
29 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
30 byte_length.checked_mul(3)
31 }
32
33 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
34 byte_length.checked_mul(3)
35 }
36
37 pub fn decode_to_utf8_raw(
38 &mut self,
39 src: &[u8],
40 dst: &mut [u8],
41 _last: bool,
42 ) -> (DecoderResult, usize, usize) {
43 let mut source = ByteSource::new(src);
44 let mut dest = Utf8Destination::new(dst);
45 'outermost: loop {
46 match dest.copy_ascii_from_check_space_bmp(&mut source) {
47 CopyAsciiResult::Stop(ret) => return ret,
48 CopyAsciiResult::GoOn((mut non_ascii, mut handle)) => 'middle: loop {
49 let mapped =
57 unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
58 if mapped == 0u16 {
60 return (
61 DecoderResult::Malformed(1, 0),
62 source.consumed(),
63 handle.written(),
64 );
65 }
66 let dest_again = handle.write_bmp_excl_ascii(mapped);
67 match source.check_available() {
69 Space::Full(src_consumed) => {
70 return (
71 DecoderResult::InputEmpty,
72 src_consumed,
73 dest_again.written(),
74 );
75 }
76 Space::Available(source_handle) => {
77 match dest_again.check_space_bmp() {
78 Space::Full(dst_written) => {
79 return (
80 DecoderResult::OutputFull,
81 source_handle.consumed(),
82 dst_written,
83 );
84 }
85 Space::Available(mut destination_handle) => {
86 let (mut b, unread_handle) = source_handle.read();
87 let source_again = unread_handle.commit();
88 'innermost: loop {
89 if b > 127 {
90 non_ascii = b;
91 handle = destination_handle;
92 continue 'middle;
93 }
94 let dest_again_again = destination_handle.write_ascii(b);
98 if b < 60 {
99 match source_again.check_available() {
101 Space::Full(src_consumed_again) => {
102 return (
103 DecoderResult::InputEmpty,
104 src_consumed_again,
105 dest_again_again.written(),
106 );
107 }
108 Space::Available(source_handle_again) => {
109 match dest_again_again.check_space_bmp() {
110 Space::Full(dst_written_again) => {
111 return (
112 DecoderResult::OutputFull,
113 source_handle_again.consumed(),
114 dst_written_again,
115 );
116 }
117 Space::Available(
118 destination_handle_again,
119 ) => {
120 let (b_again, _unread_handle_again) =
121 source_handle_again.read();
122 b = b_again;
123 destination_handle =
124 destination_handle_again;
125 continue 'innermost;
126 }
127 }
128 }
129 }
130 }
131 continue 'outermost;
133 }
134 }
135 }
136 }
137 }
138 },
139 }
140 }
141 }
142
143 pub fn decode_to_utf16_raw(
144 &mut self,
145 src: &[u8],
146 dst: &mut [u16],
147 _last: bool,
148 ) -> (DecoderResult, usize, usize) {
149 let (pending, length) = if dst.len() < src.len() {
150 (DecoderResult::OutputFull, dst.len())
151 } else {
152 (DecoderResult::InputEmpty, src.len())
153 };
154 let mut converted = 0usize;
155 'outermost: loop {
156 match unsafe {
157 ascii_to_basic_latin(
158 src.as_ptr().add(converted),
159 dst.as_mut_ptr().add(converted),
160 length - converted,
161 )
162 } {
163 None => {
164 return (pending, length, length);
165 }
166 Some((mut non_ascii, consumed)) => {
167 converted += consumed;
168 'middle: loop {
169 let mapped =
176 unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
177 if mapped == 0u16 {
179 return (
180 DecoderResult::Malformed(1, 0),
181 converted + 1, converted,
183 );
184 }
185 unsafe {
186 *(dst.get_unchecked_mut(converted)) = mapped;
188 }
189 converted += 1;
190 if converted == length {
199 return (pending, length, length);
200 }
201 let mut b = unsafe { *(src.get_unchecked(converted)) };
202 'innermost: loop {
203 if b > 127 {
204 non_ascii = b;
205 continue 'middle;
206 }
207 unsafe {
211 *(dst.get_unchecked_mut(converted)) = u16::from(b);
212 }
213 converted += 1;
214 if b < 60 {
215 if converted == length {
217 return (pending, length, length);
218 }
219 b = unsafe { *(src.get_unchecked(converted)) };
220 continue 'innermost;
221 }
222 continue 'outermost;
224 }
225 }
226 }
227 }
228 }
229 }
230
231 pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> usize {
232 let mut bytes = buffer;
233 let mut total = 0;
234 loop {
235 if let Some((non_ascii, offset)) = validate_ascii(bytes) {
236 total += offset;
237 let mapped = unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
238 if mapped != u16::from(non_ascii) {
239 return total;
240 }
241 total += 1;
242 bytes = &bytes[offset + 1..];
243 } else {
244 return total;
245 }
246 }
247 }
248}
249
/// Encoder for a single-byte legacy encoding.
///
/// Besides the decode table (searched in reverse direction: code unit to
/// byte), it records one run of consecutive code units that map to
/// consecutive bytes, so that `encode_u16` can resolve members of the run by
/// arithmetic instead of a table search.
pub struct SingleByteEncoder {
    /// Code unit for each byte 0x80..=0xFF, indexed by `byte - 0x80`.
    table: &'static [u16; 128],
    /// First code unit of the consecutive run.
    run_bmp_offset: usize,
    /// Table index (byte value minus 0x80) at which the run starts.
    run_byte_offset: usize,
    /// Number of code units in the run.
    run_length: usize,
}
256
257impl SingleByteEncoder {
258 pub fn new(
259 encoding: &'static Encoding,
260 data: &'static [u16; 128],
261 run_bmp_offset: u16,
262 run_byte_offset: u8,
263 run_length: u8,
264 ) -> Encoder {
265 Encoder::new(
266 encoding,
267 VariantEncoder::SingleByte(SingleByteEncoder {
268 table: data,
269 run_bmp_offset: run_bmp_offset as usize,
270 run_byte_offset: run_byte_offset as usize,
271 run_length: run_length as usize,
272 }),
273 )
274 }
275
276 pub fn max_buffer_length_from_utf16_without_replacement(
277 &self,
278 u16_length: usize,
279 ) -> Option<usize> {
280 Some(u16_length)
281 }
282
283 pub fn max_buffer_length_from_utf8_without_replacement(
284 &self,
285 byte_length: usize,
286 ) -> Option<usize> {
287 Some(byte_length)
288 }
289
290 #[inline(always)]
291 fn encode_u16(&self, code_unit: u16) -> Option<u8> {
292 let unit_as_usize = code_unit as usize;
316 let offset = unit_as_usize.wrapping_sub(self.run_bmp_offset);
317 if offset < self.run_length {
318 return Some((128 + self.run_byte_offset + offset) as u8);
319 }
320
321 let tail_start = self.run_byte_offset + self.run_length;
323 if let Some(pos) = position(&self.table[tail_start..], code_unit) {
324 return Some((128 + tail_start + pos) as u8);
325 }
326
327 if self.run_byte_offset >= 64 {
328 if let Some(pos) = position(&self.table[64..self.run_byte_offset], code_unit) {
330 return Some(((128 + 64) + pos) as u8);
331 }
332
333 if let Some(pos) = position(&self.table[32..64], code_unit) {
335 return Some(((128 + 32) + pos) as u8);
336 }
337 } else if let Some(pos) = position(&self.table[32..self.run_byte_offset], code_unit) {
338 return Some(((128 + 32) + pos) as u8);
341 }
342
343 if let Some(pos) = position(&self.table[..32], code_unit) {
345 return Some((128 + pos) as u8);
346 }
347
348 None
349 }
350
    // Instantiates the shared ASCII-compatible BMP encoder template.
    // NOTE(review): judging by the arguments, this presumably defines the
    // `encode_from_utf8_raw` method over `str` input read through
    // `Utf8Source` — confirm against the macro definition, which is not
    // visible in this file.
    //
    // The block passed as the first argument handles one non-ASCII BMP code
    // unit (`bmp`): if `encode_u16` finds a byte, it is written through
    // `handle`; otherwise the generated function returns an unmappable result
    // together with `source.consumed()` and `handle.written()`.
    ascii_compatible_bmp_encoder_function!(
        {
            match self.encode_u16(bmp) {
                Some(byte) => handle.write_one(byte),
                None => {
                    return (
                        EncoderResult::unmappable_from_bmp(bmp),
                        source.consumed(),
                        handle.written(),
                    );
                }
            }
        },
        bmp,
        self,
        source,
        handle,
        copy_ascii_to_check_space_one,
        check_space_one,
        encode_from_utf8_raw,
        str,
        Utf8Source,
        true
    );
375
376 pub fn encode_from_utf16_raw(
377 &mut self,
378 src: &[u16],
379 dst: &mut [u8],
380 _last: bool,
381 ) -> (EncoderResult, usize, usize) {
382 let (pending, length) = if dst.len() < src.len() {
383 (EncoderResult::OutputFull, dst.len())
384 } else {
385 (EncoderResult::InputEmpty, src.len())
386 };
387 let mut converted = 0usize;
388 'outermost: loop {
389 match unsafe {
390 basic_latin_to_ascii(
391 src.as_ptr().add(converted),
392 dst.as_mut_ptr().add(converted),
393 length - converted,
394 )
395 } {
396 None => {
397 return (pending, length, length);
398 }
399 Some((mut non_ascii, consumed)) => {
400 converted += consumed;
401 'middle: loop {
402 match self.encode_u16(non_ascii) {
404 Some(byte) => {
405 unsafe {
406 *(dst.get_unchecked_mut(converted)) = byte;
407 }
408 converted += 1;
409 }
410 None => {
411 let high_bits = non_ascii & 0xFC00u16;
414 if high_bits == 0xD800u16 {
415 if converted + 1 == length {
417 return (
419 EncoderResult::Unmappable('\u{FFFD}'),
420 converted + 1, converted,
422 );
423 }
424 let second =
425 u32::from(unsafe { *src.get_unchecked(converted + 1) });
426 if second & 0xFC00u32 != 0xDC00u32 {
427 return (
428 EncoderResult::Unmappable('\u{FFFD}'),
429 converted + 1, converted,
431 );
432 }
433 let astral: char = unsafe {
435 ::std::char::from_u32_unchecked(
436 (u32::from(non_ascii) << 10) + second
437 - (((0xD800u32 << 10) - 0x1_0000u32) + 0xDC00u32),
438 )
439 };
440 return (
441 EncoderResult::Unmappable(astral),
442 converted + 2, converted,
444 );
445 }
446 if high_bits == 0xDC00u16 {
447 return (
449 EncoderResult::Unmappable('\u{FFFD}'),
450 converted + 1, converted,
452 );
453 }
454 return (
455 EncoderResult::unmappable_from_bmp(non_ascii),
456 converted + 1, converted,
458 );
459 }
460 }
461 if converted == length {
470 return (pending, length, length);
471 }
472 let mut unit = unsafe { *(src.get_unchecked(converted)) };
473 'innermost: loop {
474 if unit > 127 {
475 non_ascii = unit;
476 continue 'middle;
477 }
478 unsafe {
482 *(dst.get_unchecked_mut(converted)) = unit as u8;
483 }
484 converted += 1;
485 if unit < 60 {
486 if converted == length {
488 return (pending, length, length);
489 }
490 unit = unsafe { *(src.get_unchecked(converted)) };
491 continue 'innermost;
492 }
493 continue 'outermost;
495 }
496 }
497 }
498 }
499 }
500 }
501}
502
503#[cfg(test)]
507mod tests {
508 use super::super::testing::*;
509 use super::super::*;
510
511 #[test]
512 fn test_windows_1255_ca() {
513 decode(WINDOWS_1255, b"\xCA", "\u{05BA}");
514 encode(WINDOWS_1255, "\u{05BA}", b"\xCA");
515 }
516
517 #[test]
518 fn test_ascii_punctuation() {
519 let bytes = b"\xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4. \xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4.";
520 let characters = "\u{0391}\u{03C5}\u{03C4}\u{03CC} \
521 \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
522 \u{03C4}\u{03B5}\u{03C3}\u{03C4}. \u{0391}\u{03C5}\u{03C4}\u{03CC} \
523 \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
524 \u{03C4}\u{03B5}\u{03C3}\u{03C4}.";
525 decode(WINDOWS_1253, bytes, characters);
526 encode(WINDOWS_1253, characters, bytes);
527 }
528
529 #[test]
530 fn test_decode_malformed() {
531 decode(
532 WINDOWS_1253,
533 b"\xC1\xF5\xD2\xF4\xFC",
534 "\u{0391}\u{03C5}\u{FFFD}\u{03C4}\u{03CC}",
535 );
536 }
537
538 #[test]
539 fn test_encode_unmappables() {
540 encode(
541 WINDOWS_1253,
542 "\u{0391}\u{03C5}\u{2603}\u{03C4}\u{03CC}",
543 b"\xC1\xF5☃\xF4\xFC",
544 );
545 encode(
546 WINDOWS_1253,
547 "\u{0391}\u{03C5}\u{1F4A9}\u{03C4}\u{03CC}",
548 b"\xC1\xF5💩\xF4\xFC",
549 );
550 }
551
552 #[test]
553 fn test_encode_unpaired_surrogates() {
554 encode_from_utf16(
555 WINDOWS_1253,
556 &[0x0391u16, 0x03C5u16, 0xDCA9u16, 0x03C4u16, 0x03CCu16],
557 b"\xC1\xF5�\xF4\xFC",
558 );
559 encode_from_utf16(
560 WINDOWS_1253,
561 &[0x0391u16, 0x03C5u16, 0xD83Du16, 0x03C4u16, 0x03CCu16],
562 b"\xC1\xF5�\xF4\xFC",
563 );
564 encode_from_utf16(
565 WINDOWS_1253,
566 &[0x0391u16, 0x03C5u16, 0x03C4u16, 0x03CCu16, 0xD83Du16],
567 b"\xC1\xF5\xF4\xFC�",
568 );
569 }
570
571 pub const HIGH_BYTES: &'static [u8; 128] = &[
572 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E,
573 0x8F, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D,
574 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC,
575 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB,
576 0xBC, 0xBD, 0xBE, 0xBF, 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA,
577 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9,
578 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8,
579 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
580 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
581 ];
582
583 fn decode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
584 let mut with_replacement = [0u16; 128];
585 let mut it = data.iter().enumerate();
586 loop {
587 match it.next() {
588 Some((i, code_point)) => {
589 if *code_point == 0 {
590 with_replacement[i] = 0xFFFD;
591 } else {
592 with_replacement[i] = *code_point;
593 }
594 }
595 None => {
596 break;
597 }
598 }
599 }
600
601 decode_to_utf16(encoding, HIGH_BYTES, &with_replacement[..]);
602 }
603
604 fn encode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
605 let mut with_zeros = [0u8; 128];
606 let mut it = data.iter().enumerate();
607 loop {
608 match it.next() {
609 Some((i, code_point)) => {
610 if *code_point == 0 {
611 with_zeros[i] = 0;
612 } else {
613 with_zeros[i] = HIGH_BYTES[i];
614 }
615 }
616 None => {
617 break;
618 }
619 }
620 }
621
622 encode_from_utf16(encoding, data, &with_zeros[..]);
623 }
624
625 #[test]
626 fn test_single_byte_from_two_low_surrogates() {
627 let expectation = b"��";
628 let mut output = [0u8; 40];
629 let mut encoder = WINDOWS_1253.new_encoder();
630 let (result, read, written, had_errors) =
631 encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
632 assert_eq!(result, CoderResult::InputEmpty);
633 assert_eq!(read, 2);
634 assert_eq!(written, expectation.len());
635 assert!(had_errors);
636 assert_eq!(&output[..written], expectation);
637 }
638
639 #[test]
645 fn test_single_byte_decode() {
646 decode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
647 decode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
648 if cfg!(miri) {
649 return;
651 }
652 decode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
653 decode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
654 decode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
655 decode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
656 decode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
657 decode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
658 decode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
659 decode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
660 decode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
661 decode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
662 decode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
663 decode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
664 decode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
665 decode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
666 decode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
667 decode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
668 decode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
669 decode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
670 decode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
671 decode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
672 decode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
673 decode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
674 decode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
675 decode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
676 decode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
677 }
678
679 #[test]
680 fn test_single_byte_encode() {
681 encode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
682 encode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
683 if cfg!(miri) {
684 return;
686 }
687 encode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
688 encode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
689 encode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
690 encode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
691 encode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
692 encode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
693 encode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
694 encode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
695 encode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
696 encode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
697 encode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
698 encode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
699 encode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
700 encode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
701 encode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
702 encode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
703 encode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
704 encode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
705 encode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
706 encode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
707 encode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
708 encode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
709 encode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
710 encode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
711 encode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
712 }
713 }