idna_mapping/
lib.rs

1// Copyright 2013-2014 The rust-url developers.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
//! This crate is not meant to be used directly. It is part of the unicode-rs back end
10//! for the `idna` crate providing the UTS 46 mapping data and an abstraction over
11//! JoiningType data (delegated to `unicode-joining-type`).
12//!
13//! See the [README of the latest version of the `idna_adapter` crate][1] for
14//! how to use.
15//!
16//! [1]: https://docs.rs/crate/idna_adapter/latest
17
18// The code in this file has been moved from the `rust-url` repo.
19// See https://github.com/servo/rust-url/blob/c04aca3f74eb567ec4853362ef28b7ce2f19c5d3/idna/src/uts46.rs
20// for older history.
21
22#![no_std]
23
24use self::Mapping::*;
25
26include!("uts46_mapping_table.rs");
27
/// Compact locator for a run of bytes: a 16-bit start offset (split into two
/// bytes) plus an 8-bit length.
///
/// The start offset is deliberately kept as two separate `u8` fields rather
/// than one `u16`, so that the structure has an alignment of 1 and therefore
/// packs better into the `Mapping` enum.
#[derive(Debug)]
struct StringTableSlice {
    /// Low byte of the start offset.
    byte_start_lo: u8,
    /// High byte of the start offset.
    byte_start_hi: u8,
    /// Length of the run, in bytes.
    byte_len: u8,
}
36
37fn decode_slice(slice: &StringTableSlice) -> &'static str {
38    let lo = slice.byte_start_lo as usize;
39    let hi = slice.byte_start_hi as usize;
40    let start = (hi << 8) | lo;
41    let len = slice.byte_len as usize;
42    &STRING_TABLE[start..(start + len)]
43}
44
45#[repr(u8)]
46#[derive(Debug)]
47enum Mapping {
48    Valid,
49    Ignored,
50    Mapped(StringTableSlice),
51    Disallowed,
52}
53
54fn find_char(codepoint: char) -> &'static Mapping {
55    let idx = match TABLE.binary_search_by_key(&codepoint, |&val| val.0) {
56        Ok(idx) => idx,
57        Err(idx) => idx - 1,
58    };
59
60    const SINGLE_MARKER: u16 = 1 << 15;
61
62    let (base, x) = TABLE[idx];
63    let single = (x & SINGLE_MARKER) != 0;
64    let offset = !SINGLE_MARKER & x;
65
66    if single {
67        &MAPPING_TABLE[offset as usize]
68    } else {
69        &MAPPING_TABLE[(offset + (codepoint as u16 - base as u16)) as usize]
70    }
71}
72
/// Iterator adapter that wraps an iterator of `char`s together with the
/// state needed to emit a multi-character replacement one `char` at a time.
pub struct Mapper<I: Iterator<Item = char>> {
    /// The wrapped source of input characters.
    chars: I,
    /// Replacement characters still waiting to be emitted, if any.
    slice: Option<core::str::Chars<'static>>,
    /// Whether ignored code points are reported as U+FFFD instead of
    /// being dropped.
    ignored_as_errors: bool,
}

impl<I: Iterator<Item = char>> Mapper<I> {
    /// Creates a mapper reading its input from `delegate`.
    ///
    /// `ignored_as_errors` selects whether ignored code points are turned
    /// into U+FFFD (`true`) or silently skipped (`false`).
    pub fn new(delegate: I, ignored_as_errors: bool) -> Self {
        Self {
            ignored_as_errors,
            slice: None,
            chars: delegate,
        }
    }
}
94
95impl<I> Iterator for Mapper<I>
96where
97    I: Iterator<Item = char>,
98{
99    type Item = char;
100
101    fn next(&mut self) -> Option<Self::Item> {
102        loop {
103            if let Some(s) = &mut self.slice {
104                match s.next() {
105                    Some(c) => return Some(c),
106                    None => {
107                        self.slice = None;
108                    }
109                }
110            }
111
112            let codepoint = self.chars.next()?;
113            if let '.' | '-' | 'a'..='z' | '0'..='9' = codepoint {
114                return Some(codepoint);
115            }
116
117            return Some(match *find_char(codepoint) {
118                Mapping::Valid => codepoint,
119                Mapping::Ignored => {
120                    if self.ignored_as_errors {
121                        '\u{FFFD}'
122                    } else {
123                        continue;
124                    }
125                }
126                Mapping::Mapped(ref slice) => {
127                    self.slice = Some(decode_slice(slice).chars());
128                    continue;
129                }
130                Mapping::Disallowed => '\u{FFFD}',
131            });
132        }
133    }
134}
135
136// Pushing the JoiningType functionality from `idna_adapter` to this crate
137// insulates `idna_adapter` from future semver breaks of `unicode_joining_type`.
138
139/// Turns a joining type into a mask for comparing with multiple type at once.
140const fn joining_type_to_mask(jt: unicode_joining_type::JoiningType) -> u32 {
141    1u32 << (jt as u32)
142}
143
144/// Mask for checking for both left and dual joining.
145pub const LEFT_OR_DUAL_JOINING_MASK: JoiningTypeMask = JoiningTypeMask(
146    joining_type_to_mask(unicode_joining_type::JoiningType::LeftJoining)
147        | joining_type_to_mask(unicode_joining_type::JoiningType::DualJoining),
148);
149
150/// Mask for checking for both left and dual joining.
151pub const RIGHT_OR_DUAL_JOINING_MASK: JoiningTypeMask = JoiningTypeMask(
152    joining_type_to_mask(unicode_joining_type::JoiningType::RightJoining)
153        | joining_type_to_mask(unicode_joining_type::JoiningType::DualJoining),
154);
155
156/// Value for the Joining_Type Unicode property.
157#[repr(transparent)]
158#[derive(Clone, Copy)]
159pub struct JoiningType(unicode_joining_type::JoiningType);
160
161impl JoiningType {
162    /// Returns the corresponding `JoiningTypeMask`.
163    #[inline(always)]
164    pub fn to_mask(self) -> JoiningTypeMask {
165        JoiningTypeMask(joining_type_to_mask(self.0))
166    }
167
168    // `true` iff this value is the Transparent value.
169    #[inline(always)]
170    pub fn is_transparent(self) -> bool {
171        self.0 == unicode_joining_type::JoiningType::Transparent
172    }
173}
174
/// A bit mask that can represent several `JoiningType` values at once.
#[repr(transparent)]
#[derive(Clone, Copy)]
pub struct JoiningTypeMask(u32);

impl JoiningTypeMask {
    /// `true` iff the two masks have at least one `JoiningType` in common.
    #[inline(always)]
    pub fn intersects(self, other: JoiningTypeMask) -> bool {
        let shared = self.0 & other.0;
        shared != 0
    }
}
188
189/// Returns the Joining_Type of `c`.
190#[inline(always)]
191pub fn joining_type(c: char) -> JoiningType {
192    JoiningType(unicode_joining_type::get_joining_type(c))
193}
194
#[cfg(test)]
mod tests {
    use super::{find_char, Mapping};
    use assert_matches::assert_matches;

    /// Every character that `Mapper` passes through without a table lookup
    /// (`-`, `.`, ASCII digits, ASCII lowercase letters) must be `Valid`
    /// in the table as well.
    #[test]
    fn mapping_fast_path() {
        let fast_path = "-.".chars().chain('0'..='9').chain('a'..='z');
        for c in fast_path {
            assert_matches!(find_char(c), &Mapping::Valid);
        }
    }
}