unicode_bidi/utf16.rs
1// Copyright 2023 The Mozilla Foundation. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use super::TextSource;
11
12use alloc::borrow::Cow;
13use alloc::vec::Vec;
14use core::char;
15use core::ops::Range;
16
17use crate::{
18 compute_bidi_info_for_para, compute_initial_info, level, para_direction, reorder_levels,
19 reorder_visual, visual_runs_for_line,
20};
21use crate::{
22 BidiClass, BidiDataSource, Direction, Level, LevelRun, ParagraphInfo, ParagraphInfoFlags,
23};
24
25#[cfg(feature = "hardcoded-data")]
26use crate::HardcodedBidiData;
27
28/// Initial bidi information of the text (UTF-16 version).
29///
30/// Contains the text paragraphs and `BidiClass` of its characters.
31#[derive(PartialEq, Debug)]
32pub struct InitialInfo<'text> {
33 /// The text
34 pub text: &'text [u16],
35
36 /// The BidiClass of the character at each code unit in the text.
37 /// If a character is multiple code units, its class will appear multiple times in the vector.
38 pub original_classes: Vec<BidiClass>,
39
40 /// The boundaries and level of each paragraph within the text.
41 pub paragraphs: Vec<ParagraphInfo>,
42}
43
44impl<'text> InitialInfo<'text> {
45 /// Find the paragraphs and BidiClasses in a string of text.
46 ///
47 /// <http://www.unicode.org/reports/tr9/#The_Paragraph_Level>
48 ///
49 /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong
50 /// character is found before the matching PDI. If no strong character is found, the class will
51 /// remain FSI, and it's up to later stages to treat these as LRI when needed.
52 ///
53 /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this.
54 #[cfg_attr(feature = "flame_it", flamer::flame)]
55 #[cfg(feature = "hardcoded-data")]
56 pub fn new(text: &[u16], default_para_level: Option<Level>) -> InitialInfo<'_> {
57 Self::new_with_data_source(&HardcodedBidiData, text, default_para_level)
58 }
59
60 /// Find the paragraphs and BidiClasses in a string of text, with a custom [`BidiDataSource`]
61 /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`InitialInfo::new()`]
62 /// instead (enabled with tbe default `hardcoded-data` Cargo feature)
63 ///
64 /// <http://www.unicode.org/reports/tr9/#The_Paragraph_Level>
65 ///
66 /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong
67 /// character is found before the matching PDI. If no strong character is found, the class will
68 /// remain FSI, and it's up to later stages to treat these as LRI when needed.
69 #[cfg_attr(feature = "flame_it", flamer::flame)]
70 pub fn new_with_data_source<'a, D: BidiDataSource>(
71 data_source: &D,
72 text: &'a [u16],
73 default_para_level: Option<Level>,
74 ) -> InitialInfo<'a> {
75 InitialInfoExt::new_with_data_source(data_source, text, default_para_level).base
76 }
77}
78
79/// Extended version of InitialInfo (not public API).
80#[derive(PartialEq, Debug)]
81struct InitialInfoExt<'text> {
82 /// The base InitialInfo for the text, recording its paragraphs and bidi classes.
83 base: InitialInfo<'text>,
84
85 /// Parallel to base.paragraphs, records whether each paragraph is "pure LTR" that
86 /// requires no further bidi processing (i.e. there are no RTL characters or bidi
87 /// control codes present).
88 flags: Vec<ParagraphInfoFlags>,
89}
90
91impl<'text> InitialInfoExt<'text> {
92 /// Find the paragraphs and BidiClasses in a string of text, with a custom [`BidiDataSource`]
93 /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`InitialInfo::new()`]
94 /// instead (enabled with tbe default `hardcoded-data` Cargo feature)
95 ///
96 /// <http://www.unicode.org/reports/tr9/#The_Paragraph_Level>
97 ///
98 /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong
99 /// character is found before the matching PDI. If no strong character is found, the class will
100 /// remain FSI, and it's up to later stages to treat these as LRI when needed.
101 #[cfg_attr(feature = "flame_it", flamer::flame)]
102 pub fn new_with_data_source<'a, D: BidiDataSource>(
103 data_source: &D,
104 text: &'a [u16],
105 default_para_level: Option<Level>,
106 ) -> InitialInfoExt<'a> {
107 let mut paragraphs = Vec::<ParagraphInfo>::new();
108 let mut flags = Vec::<ParagraphInfoFlags>::new();
109 let (original_classes, _, _, _) = compute_initial_info(
110 data_source,
111 text,
112 default_para_level,
113 Some((&mut paragraphs, &mut flags)),
114 );
115
116 InitialInfoExt {
117 base: InitialInfo {
118 text,
119 original_classes,
120 paragraphs,
121 },
122 flags,
123 }
124 }
125}
126
127/// Bidi information of the text (UTF-16 version).
128///
129/// The `original_classes` and `levels` vectors are indexed by code unit offsets into the text. If a
130/// character is multiple code units wide, then its class and level will appear multiple times in these
131/// vectors.
132// TODO: Impl `struct StringProperty<T> { values: Vec<T> }` and use instead of Vec<T>
133#[derive(Debug, PartialEq)]
134pub struct BidiInfo<'text> {
135 /// The text
136 pub text: &'text [u16],
137
138 /// The BidiClass of the character at each byte in the text.
139 pub original_classes: Vec<BidiClass>,
140
141 /// The directional embedding level of each byte in the text.
142 pub levels: Vec<Level>,
143
144 /// The boundaries and paragraph embedding level of each paragraph within the text.
145 ///
146 /// TODO: Use SmallVec or similar to avoid overhead when there are only one or two paragraphs?
147 /// Or just don't include the first paragraph, which always starts at 0?
148 pub paragraphs: Vec<ParagraphInfo>,
149}
150
151impl<'text> BidiInfo<'text> {
152 /// Split the text into paragraphs and determine the bidi embedding levels for each paragraph.
153 ///
154 ///
155 /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this.
156 ///
157 /// TODO: In early steps, check for special cases that allow later steps to be skipped. like
158 /// text that is entirely LTR. See the `nsBidi` class from Gecko for comparison.
159 ///
160 /// TODO: Support auto-RTL base direction
161 #[cfg_attr(feature = "flame_it", flamer::flame)]
162 #[cfg(feature = "hardcoded-data")]
163 #[inline]
164 pub fn new(text: &[u16], default_para_level: Option<Level>) -> BidiInfo<'_> {
165 Self::new_with_data_source(&HardcodedBidiData, text, default_para_level)
166 }
167
168 /// Split the text into paragraphs and determine the bidi embedding levels for each paragraph, with a custom [`BidiDataSource`]
169 /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`BidiInfo::new()`]
170 /// instead (enabled with tbe default `hardcoded-data` Cargo feature).
171 ///
172 /// TODO: In early steps, check for special cases that allow later steps to be skipped. like
173 /// text that is entirely LTR. See the `nsBidi` class from Gecko for comparison.
174 ///
175 /// TODO: Support auto-RTL base direction
176 #[cfg_attr(feature = "flame_it", flamer::flame)]
177 pub fn new_with_data_source<'a, D: BidiDataSource>(
178 data_source: &D,
179 text: &'a [u16],
180 default_para_level: Option<Level>,
181 ) -> BidiInfo<'a> {
182 let InitialInfoExt { base, flags, .. } =
183 InitialInfoExt::new_with_data_source(data_source, text, default_para_level);
184
185 let mut levels = Vec::<Level>::with_capacity(text.len());
186 let mut processing_classes = base.original_classes.clone();
187
188 for (para, flags) in base.paragraphs.iter().zip(flags.iter()) {
189 let text = &text[para.range.clone()];
190 let original_classes = &base.original_classes[para.range.clone()];
191
192 compute_bidi_info_for_para(
193 data_source,
194 para,
195 flags.is_pure_ltr,
196 flags.has_isolate_controls,
197 text,
198 original_classes,
199 &mut processing_classes,
200 &mut levels,
201 );
202 }
203
204 BidiInfo {
205 text,
206 original_classes: base.original_classes,
207 paragraphs: base.paragraphs,
208 levels,
209 }
210 }
211
212 /// Produce the levels for this paragraph as needed for reordering, one level per *byte*
213 /// in the paragraph. The returned vector includes bytes that are not included
214 /// in the `line`, but will not adjust them.
215 ///
216 /// This runs [Rule L1], you can run
217 /// [Rule L2] by calling [`Self::reorder_visual()`].
218 /// If doing so, you may prefer to use [`Self::reordered_levels_per_char()`] instead
219 /// to avoid non-byte indices.
220 ///
221 /// For an all-in-one reordering solution, consider using [`Self::reorder_visual()`].
222 ///
223 /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
224 /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
225 #[cfg_attr(feature = "flame_it", flamer::flame)]
226 pub fn reordered_levels(&self, para: &ParagraphInfo, line: Range<usize>) -> Vec<Level> {
227 assert!(line.start <= self.levels.len());
228 assert!(line.end <= self.levels.len());
229
230 let mut levels = self.levels.clone();
231 let line_classes = &self.original_classes[line.clone()];
232 let line_levels = &mut levels[line.clone()];
233 let line_str: &[u16] = &self.text[line.clone()];
234
235 reorder_levels(line_classes, line_levels, line_str, para.level);
236
237 levels
238 }
239
240 /// Produce the levels for this paragraph as needed for reordering, one level per *character*
241 /// in the paragraph. The returned vector includes characters that are not included
242 /// in the `line`, but will not adjust them.
243 ///
244 /// This runs [Rule L1], you can run
245 /// [Rule L2] by calling [`Self::reorder_visual()`].
246 /// If doing so, you may prefer to use [`Self::reordered_levels_per_char()`] instead
247 /// to avoid non-byte indices.
248 ///
249 /// For an all-in-one reordering solution, consider using [`Self::reorder_visual()`].
250 ///
251 /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
252 /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
253 #[cfg_attr(feature = "flame_it", flamer::flame)]
254 pub fn reordered_levels_per_char(
255 &self,
256 para: &ParagraphInfo,
257 line: Range<usize>,
258 ) -> Vec<Level> {
259 let levels = self.reordered_levels(para, line);
260 self.text.char_indices().map(|(i, _)| levels[i]).collect()
261 }
262
263 /// Re-order a line based on resolved levels and return the line in display order.
264 ///
265 /// This does not apply [Rule L3] or [Rule L4] around combining characters or mirroring.
266 ///
267 /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
268 /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
269 #[cfg_attr(feature = "flame_it", flamer::flame)]
270 pub fn reorder_line(&self, para: &ParagraphInfo, line: Range<usize>) -> Cow<'text, [u16]> {
271 if !level::has_rtl(&self.levels[line.clone()]) {
272 return self.text[line].into();
273 }
274 let (levels, runs) = self.visual_runs(para, line.clone());
275 reorder_line(self.text, line, levels, runs)
276 }
277
278 /// Reorders pre-calculated levels of a sequence of characters.
279 ///
280 /// NOTE: This is a convenience method that does not use a `Paragraph` object. It is
281 /// intended to be used when an application has determined the levels of the objects (character sequences)
282 /// and just needs to have them reordered.
283 ///
284 /// the index map will result in `indexMap[visualIndex]==logicalIndex`.
285 ///
286 /// This only runs [Rule L2](http://www.unicode.org/reports/tr9/#L2) as it does not have
287 /// information about the actual text.
288 ///
289 /// Furthermore, if `levels` is an array that is aligned with code units, bytes within a codepoint may be
290 /// reversed. You may need to fix up the map to deal with this. Alternatively, only pass in arrays where each `Level`
291 /// is for a single code point.
292 ///
293 ///
294 /// # # Example
295 /// ```
296 /// use unicode_bidi::BidiInfo;
297 /// use unicode_bidi::Level;
298 ///
299 /// let l0 = Level::from(0);
300 /// let l1 = Level::from(1);
301 /// let l2 = Level::from(2);
302 ///
303 /// let levels = vec![l0, l0, l0, l0];
304 /// let index_map = BidiInfo::reorder_visual(&levels);
305 /// assert_eq!(levels.len(), index_map.len());
306 /// assert_eq!(index_map, [0, 1, 2, 3]);
307 ///
308 /// let levels: Vec<Level> = vec![l0, l0, l0, l1, l1, l1, l2, l2];
309 /// let index_map = BidiInfo::reorder_visual(&levels);
310 /// assert_eq!(levels.len(), index_map.len());
311 /// assert_eq!(index_map, [0, 1, 2, 6, 7, 5, 4, 3]);
312 /// ```
313 #[cfg_attr(feature = "flame_it", flamer::flame)]
314 #[inline]
315 pub fn reorder_visual(levels: &[Level]) -> Vec<usize> {
316 reorder_visual(levels)
317 }
318
319 /// Find the level runs within a line and return them in visual order.
320 ///
321 /// `line` is a range of bytes indices within `levels`.
322 ///
323 /// The first return value is a vector of levels used by the reordering algorithm,
324 /// i.e. the result of [Rule L1]. The second return value is a vector of level runs,
325 /// the result of [Rule L2], showing the visual order that each level run (a run of text with the
326 /// same level) should be displayed. Within each run, the display order can be checked
327 /// against the Level vector.
328 ///
329 /// This does not handle [Rule L3] (combining characters) or [Rule L4] (mirroring),
330 /// as that should be handled by the engine using this API.
331 ///
332 /// Conceptually, this is the same as running [`Self::reordered_levels()`] followed by
333 /// [`Self::reorder_visual()`], however it returns the result as a list of level runs instead
334 /// of producing a level map, since one may wish to deal with the fact that this is operating on
335 /// byte rather than character indices.
336 ///
337 /// <http://www.unicode.org/reports/tr9/#Reordering_Resolved_Levels>
338 ///
339 /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
340 /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
341 /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
342 /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
343 #[cfg_attr(feature = "flame_it", flamer::flame)]
344 #[inline]
345 pub fn visual_runs(
346 &self,
347 para: &ParagraphInfo,
348 line: Range<usize>,
349 ) -> (Vec<Level>, Vec<LevelRun>) {
350 let levels = self.reordered_levels(para, line.clone());
351 visual_runs_for_line(levels, &line)
352 }
353
354 /// If processed text has any computed RTL levels
355 ///
356 /// This information is usually used to skip re-ordering of text when no RTL level is present
357 #[inline]
358 pub fn has_rtl(&self) -> bool {
359 level::has_rtl(&self.levels)
360 }
361}
362
363/// Bidi information of text treated as a single paragraph.
364///
365/// The `original_classes` and `levels` vectors are indexed by code unit offsets into the text. If a
366/// character is multiple code units wide, then its class and level will appear multiple times in these
367/// vectors.
368#[derive(Debug, PartialEq)]
369pub struct ParagraphBidiInfo<'text> {
370 /// The text
371 pub text: &'text [u16],
372
373 /// The BidiClass of the character at each byte in the text.
374 pub original_classes: Vec<BidiClass>,
375
376 /// The directional embedding level of each byte in the text.
377 pub levels: Vec<Level>,
378
379 /// The paragraph embedding level.
380 pub paragraph_level: Level,
381
382 /// Whether the paragraph is purely LTR.
383 pub is_pure_ltr: bool,
384}
385
386impl<'text> ParagraphBidiInfo<'text> {
387 /// Determine the bidi embedding level.
388 ///
389 ///
390 /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this.
391 ///
392 /// TODO: In early steps, check for special cases that allow later steps to be skipped. like
393 /// text that is entirely LTR. See the `nsBidi` class from Gecko for comparison.
394 ///
395 /// TODO: Support auto-RTL base direction
396 #[cfg_attr(feature = "flame_it", flamer::flame)]
397 #[cfg(feature = "hardcoded-data")]
398 #[inline]
399 pub fn new(text: &[u16], default_para_level: Option<Level>) -> ParagraphBidiInfo<'_> {
400 Self::new_with_data_source(&HardcodedBidiData, text, default_para_level)
401 }
402
403 /// Determine the bidi embedding level, with a custom [`BidiDataSource`]
404 /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`BidiInfo::new()`]
405 /// instead (enabled with tbe default `hardcoded-data` Cargo feature).
406 ///
407 /// (This is the single-paragraph equivalent of BidiInfo::new_with_data_source,
408 /// and should be kept in sync with it.
409 #[cfg_attr(feature = "flame_it", flamer::flame)]
410 pub fn new_with_data_source<'a, D: BidiDataSource>(
411 data_source: &D,
412 text: &'a [u16],
413 default_para_level: Option<Level>,
414 ) -> ParagraphBidiInfo<'a> {
415 // Here we could create a ParagraphInitialInfo struct to parallel the one
416 // used by BidiInfo, but there doesn't seem any compelling reason for it.
417 let (original_classes, paragraph_level, is_pure_ltr, has_isolate_controls) =
418 compute_initial_info(data_source, text, default_para_level, None);
419
420 let mut levels = Vec::<Level>::with_capacity(text.len());
421 let mut processing_classes = original_classes.clone();
422
423 let para_info = ParagraphInfo {
424 range: Range {
425 start: 0,
426 end: text.len(),
427 },
428 level: paragraph_level,
429 };
430
431 compute_bidi_info_for_para(
432 data_source,
433 ¶_info,
434 is_pure_ltr,
435 has_isolate_controls,
436 text,
437 &original_classes,
438 &mut processing_classes,
439 &mut levels,
440 );
441
442 ParagraphBidiInfo {
443 text,
444 original_classes,
445 levels,
446 paragraph_level,
447 is_pure_ltr,
448 }
449 }
450
451 /// Produce the levels for this paragraph as needed for reordering, one level per *code unit*
452 /// in the paragraph. The returned vector includes code units that are not included
453 /// in the `line`, but will not adjust them.
454 ///
455 /// See BidiInfo::reordered_levels for details.
456 ///
457 /// (This should be kept in sync with BidiInfo::reordered_levels.)
458 #[cfg_attr(feature = "flame_it", flamer::flame)]
459 pub fn reordered_levels(&self, line: Range<usize>) -> Vec<Level> {
460 assert!(line.start <= self.levels.len());
461 assert!(line.end <= self.levels.len());
462
463 let mut levels = self.levels.clone();
464 let line_classes = &self.original_classes[line.clone()];
465 let line_levels = &mut levels[line.clone()];
466
467 reorder_levels(
468 line_classes,
469 line_levels,
470 self.text.subrange(line),
471 self.paragraph_level,
472 );
473
474 levels
475 }
476
477 /// Produce the levels for this paragraph as needed for reordering, one level per *character*
478 /// in the paragraph. The returned vector includes characters that are not included
479 /// in the `line`, but will not adjust them.
480 ///
481 /// See BidiInfo::reordered_levels_per_char for details.
482 ///
483 /// (This should be kept in sync with BidiInfo::reordered_levels_per_char.)
484 #[cfg_attr(feature = "flame_it", flamer::flame)]
485 pub fn reordered_levels_per_char(&self, line: Range<usize>) -> Vec<Level> {
486 let levels = self.reordered_levels(line);
487 self.text.char_indices().map(|(i, _)| levels[i]).collect()
488 }
489
490 /// Re-order a line based on resolved levels and return the line in display order.
491 ///
492 /// See BidiInfo::reorder_line for details.
493 ///
494 /// (This should be kept in sync with BidiInfo::reorder_line.)
495 #[cfg_attr(feature = "flame_it", flamer::flame)]
496 pub fn reorder_line(&self, line: Range<usize>) -> Cow<'text, [u16]> {
497 if !level::has_rtl(&self.levels[line.clone()]) {
498 return self.text[line].into();
499 }
500 let (levels, runs) = self.visual_runs(line.clone());
501 reorder_line(self.text, line, levels, runs)
502 }
503
504 /// Reorders pre-calculated levels of a sequence of characters.
505 ///
506 /// See BidiInfo::reorder_visual for details.
507 #[cfg_attr(feature = "flame_it", flamer::flame)]
508 #[inline]
509 pub fn reorder_visual(levels: &[Level]) -> Vec<usize> {
510 reorder_visual(levels)
511 }
512
513 /// Find the level runs within a line and return them in visual order.
514 ///
515 /// `line` is a range of code-unit indices within `levels`.
516 ///
517 /// See `BidiInfo::visual_runs` for details.
518 ///
519 /// (This should be kept in sync with BidiInfo::visual_runs.)
520 #[cfg_attr(feature = "flame_it", flamer::flame)]
521 #[inline]
522 pub fn visual_runs(&self, line: Range<usize>) -> (Vec<Level>, Vec<LevelRun>) {
523 let levels = self.reordered_levels(line.clone());
524 visual_runs_for_line(levels, &line)
525 }
526
527 /// If processed text has any computed RTL levels
528 ///
529 /// This information is usually used to skip re-ordering of text when no RTL level is present
530 #[inline]
531 pub fn has_rtl(&self) -> bool {
532 !self.is_pure_ltr
533 }
534
535 /// Return the paragraph's Direction (Ltr, Rtl, or Mixed) based on its levels.
536 #[inline]
537 pub fn direction(&self) -> Direction {
538 para_direction(&self.levels)
539 }
540}
541
542/// Return a line of the text in display order based on resolved levels.
543///
544/// `text` the full text passed to the `BidiInfo` or `ParagraphBidiInfo` for analysis
545/// `line` a range of byte indices within `text` corresponding to one line
546/// `levels` array of `Level` values, with `line`'s levels reordered into visual order
547/// `runs` array of `LevelRun`s in visual order
548///
549/// (`levels` and `runs` are the result of calling `BidiInfo::visual_runs()` or
550/// `ParagraphBidiInfo::visual_runs()` for the line of interest.)
551///
552/// Returns: the reordered text of the line.
553///
554/// This does not apply [Rule L3] or [Rule L4] around combining characters or mirroring.
555///
556/// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
557/// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
558fn reorder_line(
559 text: &[u16],
560 line: Range<usize>,
561 levels: Vec<Level>,
562 runs: Vec<LevelRun>,
563) -> Cow<'_, [u16]> {
564 // If all isolating run sequences are LTR, no reordering is needed
565 if runs.iter().all(|run| levels[run.start].is_ltr()) {
566 return text[line].into();
567 }
568
569 let mut result = Vec::<u16>::with_capacity(line.len());
570 for run in runs {
571 if levels[run.start].is_rtl() {
572 let mut buf = [0; 2];
573 for c in text[run].chars().rev() {
574 result.extend(c.encode_utf16(&mut buf).iter());
575 }
576 } else {
577 result.extend(text[run].iter());
578 }
579 }
580 result.into()
581}
582
583/// Contains a reference of `BidiInfo` and one of its `paragraphs`.
584/// And it supports all operation in the `Paragraph` that needs also its
585/// `BidiInfo` such as `direction`.
586#[derive(Debug)]
587pub struct Paragraph<'a, 'text> {
588 pub info: &'a BidiInfo<'text>,
589 pub para: &'a ParagraphInfo,
590}
591
592impl<'a, 'text> Paragraph<'a, 'text> {
593 #[inline]
594 pub fn new(info: &'a BidiInfo<'text>, para: &'a ParagraphInfo) -> Paragraph<'a, 'text> {
595 Paragraph { info, para }
596 }
597
598 /// Returns if the paragraph is Left direction, right direction or mixed.
599 #[inline]
600 pub fn direction(&self) -> Direction {
601 para_direction(&self.info.levels[self.para.range.clone()])
602 }
603
604 /// Returns the `Level` of a certain character in the paragraph.
605 #[inline]
606 pub fn level_at(&self, pos: usize) -> Level {
607 let actual_position = self.para.range.start + pos;
608 self.info.levels[actual_position]
609 }
610}
611
// Implementation of TextSource for UTF-16 text in a [u16] array (see the `impl` below).
// Note that there could be unpaired surrogates present!
614
615// Convenience functions to check whether a UTF16 code unit is a surrogate.
/// Returns `true` if `code` is a high (leading) surrogate, i.e. in `0xD800..=0xDBFF`.
#[inline]
fn is_high_surrogate(code: u16) -> bool {
    matches!(code, 0xD800..=0xDBFF)
}
/// Returns `true` if `code` is a low (trailing) surrogate, i.e. in `0xDC00..=0xDFFF`.
#[inline]
fn is_low_surrogate(code: u16) -> bool {
    matches!(code, 0xDC00..=0xDFFF)
}
624
625impl<'text> TextSource<'text> for [u16] {
626 type CharIter = Utf16CharIter<'text>;
627 type CharIndexIter = Utf16CharIndexIter<'text>;
628 type IndexLenIter = Utf16IndexLenIter<'text>;
629
630 #[inline]
631 fn len(&self) -> usize {
632 (self as &[u16]).len()
633 }
634 fn char_at(&self, index: usize) -> Option<(char, usize)> {
635 if index >= self.len() {
636 return None;
637 }
638 // Get the indicated code unit and try simply converting it to a char;
639 // this will fail if it is half of a surrogate pair.
640 let c = self[index];
641 if let Some(ch) = char::from_u32(c.into()) {
642 return Some((ch, 1));
643 }
644 // If it's a low surrogate, and was immediately preceded by a high surrogate,
645 // then we're in the middle of a (valid) character, and should return None.
646 if is_low_surrogate(c) && index > 0 && is_high_surrogate(self[index - 1]) {
647 return None;
648 }
649 // Otherwise, try to decode, returning REPLACEMENT_CHARACTER for errors.
650 if let Some(ch) = char::decode_utf16(self[index..].iter().cloned()).next() {
651 if let Ok(ch) = ch {
652 // This must be a surrogate pair, otherwise char::from_u32() above should
653 // have succeeded!
654 debug_assert!(ch.len_utf16() == 2, "BMP should have already been handled");
655 return Some((ch, ch.len_utf16()));
656 }
657 } else {
658 debug_assert!(
659 false,
660 "Why did decode_utf16 return None when we're not at the end?"
661 );
662 return None;
663 }
664 // Failed to decode UTF-16: we must have encountered an unpaired surrogate.
665 // Return REPLACEMENT_CHARACTER (not None), to continue processing the following text
666 // and keep indexing correct.
667 Some((char::REPLACEMENT_CHARACTER, 1))
668 }
669 #[inline]
670 fn subrange(&self, range: Range<usize>) -> &Self {
671 &(self as &[u16])[range]
672 }
673 #[inline]
674 fn chars(&'text self) -> Self::CharIter {
675 Utf16CharIter::new(self)
676 }
677 #[inline]
678 fn char_indices(&'text self) -> Self::CharIndexIter {
679 Utf16CharIndexIter::new(self)
680 }
681 #[inline]
682 fn indices_lengths(&'text self) -> Self::IndexLenIter {
683 Utf16IndexLenIter::new(self)
684 }
685 #[inline]
686 fn char_len(ch: char) -> usize {
687 ch.len_utf16()
688 }
689}
690
/// Iterator over UTF-16 text in a `[u16]` slice, yielding an `(index, char_len)` tuple
/// for each character.
#[derive(Debug)]
pub struct Utf16IndexLenIter<'text> {
    /// The text being iterated.
    text: &'text [u16],
    /// Code-unit index of the next character to yield.
    cur_pos: usize,
}
697
698impl<'text> Utf16IndexLenIter<'text> {
699 #[inline]
700 pub fn new(text: &'text [u16]) -> Self {
701 Utf16IndexLenIter { text, cur_pos: 0 }
702 }
703}
704
705impl Iterator for Utf16IndexLenIter<'_> {
706 type Item = (usize, usize);
707
708 #[inline]
709 fn next(&mut self) -> Option<Self::Item> {
710 if let Some((_, char_len)) = self.text.char_at(self.cur_pos) {
711 let result = (self.cur_pos, char_len);
712 self.cur_pos += char_len;
713 return Some(result);
714 }
715 None
716 }
717}
718
/// Iterator over UTF-16 text in a `[u16]` slice, yielding an `(index, char)` tuple
/// for each character.
#[derive(Debug)]
pub struct Utf16CharIndexIter<'text> {
    /// The text being iterated.
    text: &'text [u16],
    /// Code-unit index of the next character to yield.
    cur_pos: usize,
}
725
726impl<'text> Utf16CharIndexIter<'text> {
727 pub fn new(text: &'text [u16]) -> Self {
728 Utf16CharIndexIter { text, cur_pos: 0 }
729 }
730}
731
732impl Iterator for Utf16CharIndexIter<'_> {
733 type Item = (usize, char);
734
735 fn next(&mut self) -> Option<Self::Item> {
736 if let Some((ch, char_len)) = self.text.char_at(self.cur_pos) {
737 let result = (self.cur_pos, ch);
738 self.cur_pos += char_len;
739 return Some(result);
740 }
741 None
742 }
743}
744
/// Iterator over UTF-16 text in a `[u16]` slice, yielding Unicode chars.
/// (Unlike the other UTF-16 iterators in this module, this also supports reverse
/// iteration.)
#[derive(Debug)]
pub struct Utf16CharIter<'text> {
    /// The text being iterated.
    text: &'text [u16],
    /// Code-unit index of the next character for forward iteration.
    cur_pos: usize,
    /// Code-unit index just past the next character for reverse iteration.
    end_pos: usize,
}
753
754impl<'text> Utf16CharIter<'text> {
755 pub fn new(text: &'text [u16]) -> Self {
756 Utf16CharIter {
757 text,
758 cur_pos: 0,
759 end_pos: text.len(),
760 }
761 }
762}
763
764impl Iterator for Utf16CharIter<'_> {
765 type Item = char;
766
767 fn next(&mut self) -> Option<Self::Item> {
768 if let Some((ch, char_len)) = self.text.char_at(self.cur_pos) {
769 self.cur_pos += char_len;
770 return Some(ch);
771 }
772 None
773 }
774}
775
776impl DoubleEndedIterator for Utf16CharIter<'_> {
777 fn next_back(&mut self) -> Option<Self::Item> {
778 if self.end_pos <= self.cur_pos {
779 return None;
780 }
781 self.end_pos -= 1;
782 if let Some(ch) = char::from_u32(self.text[self.end_pos] as u32) {
783 return Some(ch);
784 }
785 if self.end_pos > self.cur_pos {
786 if let Some((ch, char_len)) = self.text.char_at(self.end_pos - 1) {
787 if char_len == 2 {
788 self.end_pos -= 1;
789 return Some(ch);
790 }
791 }
792 }
793 Some(char::REPLACEMENT_CHARACTER)
794 }
795}