1// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
//! Unicode character composition and decomposition utilities,
//! as described in
//! [Unicode Standard Annex #15](https://www.unicode.org/reports/tr15/).
14//!
15//! ```rust
16//! extern crate unicode_normalization;
17//!
18//! use unicode_normalization::char::compose;
19//! use unicode_normalization::UnicodeNormalization;
20//!
21//! fn main() {
22//! assert_eq!(compose('A','\u{30a}'), Some('Å'));
23//!
24//! let s = "ÅΩ";
25//! let c = s.nfc().collect::<String>();
26//! assert_eq!(c, "ÅΩ");
27//! }
28//! ```
29//!
30//! # crates.io
31//!
32//! You can use this package in your project by adding the following
33//! to your `Cargo.toml`:
34//!
35//! ```toml
36//! [dependencies]
37//! unicode-normalization = "0.1.20"
38//! ```
3940#![deny(missing_docs, unsafe_code)]
41#![doc(
42 html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
43 html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
44)]
45#![cfg_attr(not(feature = "std"), no_std)]
4647#[cfg(not(feature = "std"))]
48extern crate alloc;
4950#[cfg(feature = "std")]
51extern crate core;
5253extern crate tinyvec;
5455pub use crate::decompose::Decompositions;
56pub use crate::quick_check::{
57 is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick,
58 is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick,
59 IsNormalized,
60};
61pub use crate::recompose::Recompositions;
62pub use crate::replace::Replacements;
63pub use crate::stream_safe::StreamSafe;
64pub use crate::tables::UNICODE_VERSION;
65use core::{option, str::Chars};
6667mod decompose;
68mod lookups;
69mod normalize;
70mod perfect_hash;
71mod quick_check;
72mod recompose;
73mod replace;
74mod stream_safe;
75mod tables;
7677#[doc(hidden)]
78pub mod __test_api;
79#[cfg(test)]
80mod test;
8182/// Methods for composing and decomposing characters.
83pub mod char {
84pub use crate::normalize::{
85 compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible,
86 };
8788pub use crate::lookups::{canonical_combining_class, is_combining_mark};
8990/// Return whether the given character is assigned (`General_Category` != `Unassigned`)
91 /// and not Private-Use (`General_Category` != `Private_Use`), in the supported version
92 /// of Unicode.
93pub use crate::tables::is_public_assigned;
94}
9596/// Methods for iterating over strings while applying Unicode normalizations
97/// as described in
98/// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
99pub trait UnicodeNormalization<I: Iterator<Item = char>> {
100/// Returns an iterator over the string in Unicode Normalization Form D
101 /// (canonical decomposition).
102fn nfd(self) -> Decompositions<I>;
103104/// Returns an iterator over the string in Unicode Normalization Form KD
105 /// (compatibility decomposition).
106fn nfkd(self) -> Decompositions<I>;
107108/// An Iterator over the string in Unicode Normalization Form C
109 /// (canonical decomposition followed by canonical composition).
110fn nfc(self) -> Recompositions<I>;
111112/// An Iterator over the string in Unicode Normalization Form KC
113 /// (compatibility decomposition followed by canonical composition).
114fn nfkc(self) -> Recompositions<I>;
115116/// A transformation which replaces CJK Compatibility Ideograph codepoints
117 /// with normal forms using Standardized Variation Sequences. This is not
118 /// part of the canonical or compatibility decomposition algorithms, but
119 /// performing it before those algorithms produces normalized output which
120 /// better preserves the intent of the original text.
121 ///
122 /// Note that many systems today ignore variation selectors, so these
123 /// may not immediately help text display as intended, but they at
124 /// least preserve the information in a standardized form, giving
125 /// implementations the option to recognize them.
126fn cjk_compat_variants(self) -> Replacements<I>;
127128/// An Iterator over the string with Conjoining Grapheme Joiner characters
129 /// inserted according to the Stream-Safe Text Process (UAX15-D4)
130fn stream_safe(self) -> StreamSafe<I>;
131}
132133impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
134#[inline]
135fn nfd(self) -> Decompositions<Chars<'a>> {
136 Decompositions::new_canonical(self.chars())
137 }
138139#[inline]
140fn nfkd(self) -> Decompositions<Chars<'a>> {
141 Decompositions::new_compatible(self.chars())
142 }
143144#[inline]
145fn nfc(self) -> Recompositions<Chars<'a>> {
146 Recompositions::new_canonical(self.chars())
147 }
148149#[inline]
150fn nfkc(self) -> Recompositions<Chars<'a>> {
151 Recompositions::new_compatible(self.chars())
152 }
153154#[inline]
155fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
156 replace::new_cjk_compat_variants(self.chars())
157 }
158159#[inline]
160fn stream_safe(self) -> StreamSafe<Chars<'a>> {
161 StreamSafe::new(self.chars())
162 }
163}
164165impl UnicodeNormalization<option::IntoIter<char>> for char {
166#[inline]
167fn nfd(self) -> Decompositions<option::IntoIter<char>> {
168 Decompositions::new_canonical(Some(self).into_iter())
169 }
170171#[inline]
172fn nfkd(self) -> Decompositions<option::IntoIter<char>> {
173 Decompositions::new_compatible(Some(self).into_iter())
174 }
175176#[inline]
177fn nfc(self) -> Recompositions<option::IntoIter<char>> {
178 Recompositions::new_canonical(Some(self).into_iter())
179 }
180181#[inline]
182fn nfkc(self) -> Recompositions<option::IntoIter<char>> {
183 Recompositions::new_compatible(Some(self).into_iter())
184 }
185186#[inline]
187fn cjk_compat_variants(self) -> Replacements<option::IntoIter<char>> {
188 replace::new_cjk_compat_variants(Some(self).into_iter())
189 }
190191#[inline]
192fn stream_safe(self) -> StreamSafe<option::IntoIter<char>> {
193 StreamSafe::new(Some(self).into_iter())
194 }
195}
196197impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
198#[inline]
199fn nfd(self) -> Decompositions<I> {
200 Decompositions::new_canonical(self)
201 }
202203#[inline]
204fn nfkd(self) -> Decompositions<I> {
205 Decompositions::new_compatible(self)
206 }
207208#[inline]
209fn nfc(self) -> Recompositions<I> {
210 Recompositions::new_canonical(self)
211 }
212213#[inline]
214fn nfkc(self) -> Recompositions<I> {
215 Recompositions::new_compatible(self)
216 }
217218#[inline]
219fn cjk_compat_variants(self) -> Replacements<I> {
220 replace::new_cjk_compat_variants(self)
221 }
222223#[inline]
224fn stream_safe(self) -> StreamSafe<I> {
225 StreamSafe::new(self)
226 }
227}