unicode_bidi/
implicit.rs

1// Copyright 2015 The Servo Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! 3.3.4 - 3.3.6. Resolve implicit levels and types.
11
12#[cfg(not(feature = "smallvec"))]
13use alloc::vec::Vec;
14use core::cmp::max;
15#[cfg(feature = "smallvec")]
16use smallvec::SmallVec;
17
18use super::char_data::BidiClass::{self, *};
19use super::level::Level;
20use super::prepare::{not_removed_by_x9, IsolatingRunSequence};
21use super::{BidiDataSource, TextSource};
22
23/// 3.3.4 Resolving Weak Types
24///
25/// <http://www.unicode.org/reports/tr9/#Resolving_Weak_Types>
26#[cfg_attr(feature = "flame_it", flamer::flame)]
27pub fn resolve_weak<'a, T: TextSource<'a> + ?Sized>(
28    text: &'a T,
29    sequence: &IsolatingRunSequence,
30    processing_classes: &mut [BidiClass],
31) {
32    // Note: The spec treats these steps as individual passes that are applied one after the other
33    // on the entire IsolatingRunSequence at once. We instead collapse it into a single iteration,
34    // which is straightforward for rules that are based on the state of the current character, but not
35    // for rules that care about surrounding characters. To deal with them, we retain additional state
36    // about previous character classes that may have since been changed by later rules.
37
38    // The previous class for the purposes of rule W4/W6, not tracking changes made after or during W4.
39    let mut prev_class_before_w4 = sequence.sos;
40    // The previous class for the purposes of rule W5.
41    let mut prev_class_before_w5 = sequence.sos;
42    // The previous class for the purposes of rule W1, not tracking changes from any other rules.
43    let mut prev_class_before_w1 = sequence.sos;
44    let mut last_strong_is_al = false;
45    #[cfg(feature = "smallvec")]
46    let mut et_run_indices = SmallVec::<[usize; 8]>::new(); // for W5
47    #[cfg(not(feature = "smallvec"))]
48    let mut et_run_indices = Vec::new(); // for W5
49    #[cfg(feature = "smallvec")]
50    let mut bn_run_indices = SmallVec::<[usize; 8]>::new(); // for W5 +  <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
51    #[cfg(not(feature = "smallvec"))]
52    let mut bn_run_indices = Vec::new(); // for W5 +  <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
53
54    for (run_index, level_run) in sequence.runs.iter().enumerate() {
55        for i in &mut level_run.clone() {
56            if processing_classes[i] == BN {
57                // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
58                // Keeps track of bn runs for W5 in case we see an ET.
59                bn_run_indices.push(i);
60                // BNs aren't real, skip over them.
61                continue;
62            }
63
64            // Store the processing class of all rules before W2/W1.
65            // Used to keep track of the last strong character for W2. W3 is able to insert new strong
66            // characters, so we don't want to be misled by it.
67            let mut w2_processing_class = processing_classes[i];
68
69            // <http://www.unicode.org/reports/tr9/#W1>
70            //
71
72            if processing_classes[i] == NSM {
73                processing_classes[i] = match prev_class_before_w1 {
74                    RLI | LRI | FSI | PDI => ON,
75                    _ => prev_class_before_w1,
76                };
77                // W1 occurs before W2, update this.
78                w2_processing_class = processing_classes[i];
79            }
80
81            prev_class_before_w1 = processing_classes[i];
82
83            // <http://www.unicode.org/reports/tr9/#W2>
84            // <http://www.unicode.org/reports/tr9/#W3>
85            //
86            match processing_classes[i] {
87                EN => {
88                    if last_strong_is_al {
89                        // W2. If previous strong char was AL, change EN to AN.
90                        processing_classes[i] = AN;
91                    }
92                }
93                // W3.
94                AL => processing_classes[i] = R,
95                _ => {}
96            }
97
98            // update last_strong_is_al.
99            match w2_processing_class {
100                L | R => {
101                    last_strong_is_al = false;
102                }
103                AL => {
104                    last_strong_is_al = true;
105                }
106                _ => {}
107            }
108
109            let class_before_w456 = processing_classes[i];
110
111            // <http://www.unicode.org/reports/tr9/#W4>
112            // <http://www.unicode.org/reports/tr9/#W5>
113            // <http://www.unicode.org/reports/tr9/#W6> (separators only)
114            // (see below for W6 terminator code)
115            //
116            match processing_classes[i] {
117                // <http://www.unicode.org/reports/tr9/#W6>
118                EN => {
119                    // W5. If a run of ETs is adjacent to an EN, change the ETs to EN.
120                    for j in &et_run_indices {
121                        processing_classes[*j] = EN;
122                    }
123                    et_run_indices.clear();
124                }
125
126                // <http://www.unicode.org/reports/tr9/#W4>
127                // <http://www.unicode.org/reports/tr9/#W6>
128                ES | CS => {
129                    // See https://github.com/servo/unicode-bidi/issues/86 for improving this.
130                    // We want to make sure we check the correct next character by skipping past the rest
131                    // of this one.
132                    if let Some((_, char_len)) = text.char_at(i) {
133                        let mut next_class = sequence
134                            .iter_forwards_from(i + char_len, run_index)
135                            .map(|j| processing_classes[j])
136                            // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
137                            .find(not_removed_by_x9)
138                            .unwrap_or(sequence.eos);
139                        if next_class == EN && last_strong_is_al {
140                            // Apply W2 to next_class. We know that last_strong_is_al
141                            // has no chance of changing on this character so we can still assume its value
142                            // will be the same by the time we get to it.
143                            next_class = AN;
144                        }
145                        processing_classes[i] =
146                            match (prev_class_before_w4, processing_classes[i], next_class) {
147                                // W4
148                                (EN, ES, EN) | (EN, CS, EN) => EN,
149                                // W4
150                                (AN, CS, AN) => AN,
151                                // W6 (separators only)
152                                (_, _, _) => ON,
153                            };
154
155                        // W6 + <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
156                        // We have to do this before W5 gets its grubby hands on these characters and thinks
157                        // they're part of an ET run.
158                        // We check for ON to ensure that we had hit the W6 branch above, since this `ES | CS` match
159                        // arm handles both W4 and W6.
160                        if processing_classes[i] == ON {
161                            for idx in sequence.iter_backwards_from(i, run_index) {
162                                let class = &mut processing_classes[idx];
163                                if *class != BN {
164                                    break;
165                                }
166                                *class = ON;
167                            }
168                            for idx in sequence.iter_forwards_from(i + char_len, run_index) {
169                                let class = &mut processing_classes[idx];
170                                if *class != BN {
171                                    break;
172                                }
173                                *class = ON;
174                            }
175                        }
176                    } else {
177                        // We're in the middle of a character, copy over work done for previous bytes
178                        // since it's going to be the same answer.
179                        processing_classes[i] = processing_classes[i - 1];
180                    }
181                }
182                // <http://www.unicode.org/reports/tr9/#W5>
183                ET => {
184                    match prev_class_before_w5 {
185                        EN => processing_classes[i] = EN,
186                        _ => {
187                            // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
188                            // If there was a BN run before this, that's now a part of this ET run.
189                            et_run_indices.extend(bn_run_indices.clone());
190
191                            // In case this is followed by an EN.
192                            et_run_indices.push(i);
193                        }
194                    }
195                }
196                _ => {}
197            }
198
199            // Common loop iteration code
200            //
201
202            // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
203            // BN runs would have already continued the loop, clear them before we get to the next one.
204            bn_run_indices.clear();
205
206            // W6 above only deals with separators, so it doesn't change anything W5 cares about,
207            // so we still can update this after running that part of W6.
208            prev_class_before_w5 = processing_classes[i];
209
210            // <http://www.unicode.org/reports/tr9/#W6> (terminators only)
211            // (see above for W6 separator code)
212            //
213            if prev_class_before_w5 != ET {
214                // W6. If we didn't find an adjacent EN, turn any ETs into ON instead.
215                for j in &et_run_indices {
216                    processing_classes[*j] = ON;
217                }
218                et_run_indices.clear();
219            }
220
221            // We stashed this before W4/5/6 could get their grubby hands on it, and it's not
222            // used in the W6 terminator code below so we can update it now.
223            prev_class_before_w4 = class_before_w456;
224        }
225    }
226    // Rerun this check in case we ended with a sequence of BNs (i.e., we'd never
227    // hit the end of the for loop above).
228    // W6. If we didn't find an adjacent EN, turn any ETs into ON instead.
229    for j in &et_run_indices {
230        processing_classes[*j] = ON;
231    }
232    et_run_indices.clear();
233
234    // W7. If the previous strong char was L, change EN to L.
235    let mut last_strong_is_l = sequence.sos == L;
236    for i in sequence.runs.iter().cloned().flatten() {
237        match processing_classes[i] {
238            EN if last_strong_is_l => {
239                processing_classes[i] = L;
240            }
241            L => {
242                last_strong_is_l = true;
243            }
244            R | AL => {
245                last_strong_is_l = false;
246            }
247            // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
248            // Already scanning past BN here.
249            _ => {}
250        }
251    }
252}
253
254#[cfg(feature = "smallvec")]
255type BracketPairVec = SmallVec<[BracketPair; 8]>;
256#[cfg(not(feature = "smallvec"))]
257type BracketPairVec = Vec<BracketPair>;
258
259/// 3.3.5 Resolving Neutral Types
260///
261/// <http://www.unicode.org/reports/tr9/#Resolving_Neutral_Types>
262#[cfg_attr(feature = "flame_it", flamer::flame)]
263pub fn resolve_neutral<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
264    text: &'a T,
265    data_source: &D,
266    sequence: &IsolatingRunSequence,
267    levels: &[Level],
268    original_classes: &[BidiClass],
269    processing_classes: &mut [BidiClass],
270) {
271    // e = embedding direction
272    let e: BidiClass = levels[sequence.runs[0].start].bidi_class();
273    let not_e = if e == BidiClass::L {
274        BidiClass::R
275    } else {
276        BidiClass::L
277    };
278    // N0. Process bracket pairs.
279
280    // > Identify the bracket pairs in the current isolating run sequence according to BD16.
281    // We use processing_classes, not original_classes, due to BD14/BD15
282    let mut bracket_pairs = BracketPairVec::new();
283    identify_bracket_pairs(
284        text,
285        data_source,
286        sequence,
287        processing_classes,
288        &mut bracket_pairs,
289    );
290
291    // > For each bracket-pair element in the list of pairs of text positions
292    //
293    // Note: Rust ranges are interpreted as [start..end), be careful using `pair` directly
294    // for indexing as it will include the opening bracket pair but not the closing one.
295    for pair in bracket_pairs {
296        #[cfg(feature = "std")]
297        debug_assert!(
298            pair.start < processing_classes.len(),
299            "identify_bracket_pairs returned a range that is out of bounds!"
300        );
301        #[cfg(feature = "std")]
302        debug_assert!(
303            pair.end < processing_classes.len(),
304            "identify_bracket_pairs returned a range that is out of bounds!"
305        );
306        let mut found_e = false;
307        let mut found_not_e = false;
308        let mut class_to_set = None;
309
310        let start_char_len =
311            T::char_len(text.subrange(pair.start..pair.end).chars().next().unwrap());
312        // > Inspect the bidirectional types of the characters enclosed within the bracket pair.
313        //
314        // `pair` is [start, end) so we will end up processing the opening character but not the closing one.
315        //
316        for enclosed_i in sequence.iter_forwards_from(pair.start + start_char_len, pair.start_run) {
317            if enclosed_i >= pair.end {
318                #[cfg(feature = "std")]
319                debug_assert!(
320                    enclosed_i == pair.end,
321                    "If we skipped past this, the iterator is broken"
322                );
323                break;
324            }
325            let class = processing_classes[enclosed_i];
326            if class == e {
327                found_e = true;
328            } else if class == not_e {
329                found_not_e = true;
330            } else if matches!(class, BidiClass::EN | BidiClass::AN) {
331                // > Within this scope, bidirectional types EN and AN are treated as R.
332                if e == BidiClass::L {
333                    found_not_e = true;
334                } else {
335                    found_e = true;
336                }
337            }
338
339            // If we have found a character with the class of the embedding direction
340            // we can bail early.
341            if found_e {
342                break;
343            }
344        }
345        // > If any strong type (either L or R) matching the embedding direction is found
346        if found_e {
347            // > .. set the type for both brackets in the pair to match the embedding direction
348            class_to_set = Some(e);
349        // > Otherwise, if there is a strong type it must be opposite the embedding direction
350        } else if found_not_e {
351            // > Therefore, test for an established context with a preceding strong type by
352            // > checking backwards before the opening paired bracket
353            // > until the first strong type (L, R, or sos) is found.
354            // (see note above about processing_classes and character boundaries)
355            let mut previous_strong = sequence
356                .iter_backwards_from(pair.start, pair.start_run)
357                .map(|i| processing_classes[i])
358                .find(|class| {
359                    matches!(
360                        class,
361                        BidiClass::L | BidiClass::R | BidiClass::EN | BidiClass::AN
362                    )
363                })
364                .unwrap_or(sequence.sos);
365
366            // > Within this scope, bidirectional types EN and AN are treated as R.
367            if matches!(previous_strong, BidiClass::EN | BidiClass::AN) {
368                previous_strong = BidiClass::R;
369            }
370
371            // > If the preceding strong type is also opposite the embedding direction,
372            // > context is established,
373            // > so set the type for both brackets in the pair to that direction.
374            // AND
375            // > Otherwise set the type for both brackets in the pair to the embedding direction.
376            // > Either way it gets set to previous_strong
377            //
378            // Both branches amount to setting the type to the strong type.
379            class_to_set = Some(previous_strong);
380        }
381
382        if let Some(class_to_set) = class_to_set {
383            // Update all processing classes corresponding to the start and end elements, as requested.
384            // We should include all bytes of the character, not the first one.
385            let end_char_len =
386                T::char_len(text.subrange(pair.end..text.len()).chars().next().unwrap());
387            for class in &mut processing_classes[pair.start..pair.start + start_char_len] {
388                *class = class_to_set;
389            }
390            for class in &mut processing_classes[pair.end..pair.end + end_char_len] {
391                *class = class_to_set;
392            }
393            // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
394            for idx in sequence.iter_backwards_from(pair.start, pair.start_run) {
395                let class = &mut processing_classes[idx];
396                if *class != BN {
397                    break;
398                }
399                *class = class_to_set;
400            }
401            // > Any number of characters that had original bidirectional character type NSM prior to the application of
402            // > W1 that immediately follow a paired bracket which changed to L or R under N0 should change to match the type of their preceding bracket.
403
404            // This rule deals with sequences of NSMs, so we can just update them all at once, we don't need to worry
405            // about character boundaries. We do need to be careful to skip the full set of bytes for the parentheses characters.
406            let nsm_start = pair.start + start_char_len;
407            for idx in sequence.iter_forwards_from(nsm_start, pair.start_run) {
408                let class = original_classes[idx];
409                if class == BidiClass::NSM || processing_classes[idx] == BN {
410                    processing_classes[idx] = class_to_set;
411                } else {
412                    break;
413                }
414            }
415            let nsm_end = pair.end + end_char_len;
416            for idx in sequence.iter_forwards_from(nsm_end, pair.end_run) {
417                let class = original_classes[idx];
418                if class == BidiClass::NSM || processing_classes[idx] == BN {
419                    processing_classes[idx] = class_to_set;
420                } else {
421                    break;
422                }
423            }
424        }
425        // > Otherwise, there are no strong types within the bracket pair
426        // > Therefore, do not set the type for that bracket pair
427    }
428
429    // N1 and N2.
430    // Indices of every byte in this isolating run sequence
431    let mut indices = sequence.runs.iter().flat_map(Clone::clone);
432    let mut prev_class = sequence.sos;
433    while let Some(mut i) = indices.next() {
434        // Process sequences of NI characters.
435        #[cfg(feature = "smallvec")]
436        let mut ni_run = SmallVec::<[usize; 8]>::new();
437        #[cfg(not(feature = "smallvec"))]
438        let mut ni_run = Vec::new();
439        // The BN is for <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
440        if is_NI(processing_classes[i]) || processing_classes[i] == BN {
441            // Consume a run of consecutive NI characters.
442            ni_run.push(i);
443            let mut next_class;
444            loop {
445                match indices.next() {
446                    Some(j) => {
447                        i = j;
448                        next_class = processing_classes[j];
449                        // The BN is for <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
450                        if is_NI(next_class) || next_class == BN {
451                            ni_run.push(i);
452                        } else {
453                            break;
454                        }
455                    }
456                    None => {
457                        next_class = sequence.eos;
458                        break;
459                    }
460                };
461            }
462            // N1-N2.
463            //
464            // <http://www.unicode.org/reports/tr9/#N1>
465            // <http://www.unicode.org/reports/tr9/#N2>
466            let new_class = match (prev_class, next_class) {
467                (L, L) => L,
468                (R, R)
469                | (R, AN)
470                | (R, EN)
471                | (AN, R)
472                | (AN, AN)
473                | (AN, EN)
474                | (EN, R)
475                | (EN, AN)
476                | (EN, EN) => R,
477                (_, _) => e,
478            };
479            for j in &ni_run {
480                processing_classes[*j] = new_class;
481            }
482            ni_run.clear();
483        }
484        prev_class = processing_classes[i];
485    }
486}
487
488struct BracketPair {
489    /// The text-relative index of the opening bracket.
490    start: usize,
491    /// The text-relative index of the closing bracket.
492    end: usize,
493    /// The index of the run (in the run sequence) that the opening bracket is in.
494    start_run: usize,
495    /// The index of the run (in the run sequence) that the closing bracket is in.
496    end_run: usize,
497}
498/// 3.1.3 Identifying Bracket Pairs
499///
500/// Returns all paired brackets in the source, as indices into the
501/// text source.
502///
503/// <https://www.unicode.org/reports/tr9/#BD16>
504fn identify_bracket_pairs<'a, T: TextSource<'a> + ?Sized, D: BidiDataSource>(
505    text: &'a T,
506    data_source: &D,
507    run_sequence: &IsolatingRunSequence,
508    original_classes: &[BidiClass],
509    bracket_pairs: &mut BracketPairVec,
510) {
511    #[cfg(feature = "smallvec")]
512    let mut stack = SmallVec::<[(char, usize, usize); 8]>::new();
513    #[cfg(not(feature = "smallvec"))]
514    let mut stack = Vec::new();
515
516    for (run_index, level_run) in run_sequence.runs.iter().enumerate() {
517        for (i, ch) in text.subrange(level_run.clone()).char_indices() {
518            let actual_index = level_run.start + i;
519
520            // All paren characters are ON.
521            // From BidiBrackets.txt:
522            // > The Unicode property value stability policy guarantees that characters
523            // > which have bpt=o or bpt=c also have bc=ON and Bidi_M=Y
524            if original_classes[actual_index] != BidiClass::ON {
525                continue;
526            }
527
528            if let Some(matched) = data_source.bidi_matched_opening_bracket(ch) {
529                if matched.is_open {
530                    // > If an opening paired bracket is found ...
531
532                    // > ... and there is no room in the stack,
533                    // > stop processing BD16 for the remainder of the isolating run sequence.
534                    if stack.len() >= 63 {
535                        break;
536                    }
537                    // > ... push its Bidi_Paired_Bracket property value and its text position onto the stack
538                    stack.push((matched.opening, actual_index, run_index))
539                } else {
540                    // > If a closing paired bracket is found, do the following
541
542                    // > Declare a variable that holds a reference to the current stack element
543                    // > and initialize it with the top element of the stack.
544                    // AND
545                    // > Else, if the current stack element is not at the bottom of the stack
546                    for (stack_index, element) in stack.iter().enumerate().rev() {
547                        // > Compare the closing paired bracket being inspected or its canonical
548                        // > equivalent to the bracket in the current stack element.
549                        if element.0 == matched.opening {
550                            // > If the values match, meaning the two characters form a bracket pair, then
551
552                            // > Append the text position in the current stack element together with the
553                            // > text position of the closing paired bracket to the list.
554                            let pair = BracketPair {
555                                start: element.1,
556                                end: actual_index,
557                                start_run: element.2,
558                                end_run: run_index,
559                            };
560                            bracket_pairs.push(pair);
561
562                            // > Pop the stack through the current stack element inclusively.
563                            stack.truncate(stack_index);
564                            break;
565                        }
566                    }
567                }
568            }
569        }
570    }
571    // > Sort the list of pairs of text positions in ascending order based on
572    // > the text position of the opening paired bracket.
573    bracket_pairs.sort_by_key(|r| r.start);
574}
575
576/// 3.3.6 Resolving Implicit Levels
577///
578/// Returns the maximum embedding level in the paragraph.
579///
580/// <http://www.unicode.org/reports/tr9/#Resolving_Implicit_Levels>
581#[cfg_attr(feature = "flame_it", flamer::flame)]
582pub fn resolve_levels(processing_classes: &[BidiClass], levels: &mut [Level]) -> Level {
583    let mut max_level = Level::ltr();
584    assert_eq!(processing_classes.len(), levels.len());
585    for i in 0..levels.len() {
586        match (levels[i].is_rtl(), processing_classes[i]) {
587            (false, AN) | (false, EN) => levels[i].raise(2).expect("Level number error"),
588            (false, R) | (true, L) | (true, EN) | (true, AN) => {
589                levels[i].raise(1).expect("Level number error")
590            }
591            // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters> handled here
592            (_, _) => {}
593        }
594        max_level = max(max_level, levels[i]);
595    }
596
597    max_level
598}
599
600/// Neutral or Isolate formatting character (B, S, WS, ON, FSI, LRI, RLI, PDI)
601///
602/// <http://www.unicode.org/reports/tr9/#NI>
603#[allow(non_snake_case)]
604fn is_NI(class: BidiClass) -> bool {
605    matches!(class, B | S | WS | ON | FSI | LRI | RLI | PDI)
606}