lexical_util/iterator.rs
1//! Specialized iterator traits.
2//!
3//! The traits are for iterables containing bytes, and provide optimizations
4//! which then can be used for contiguous or non-contiguous iterables,
5//! including containers or iterators of any kind.
6
7#![cfg(feature = "parse")]
8
9use core::mem;
10
11// Re-export our digit iterators.
12#[cfg(not(feature = "format"))]
13pub use crate::noskip::{AsBytes, Bytes};
14#[cfg(feature = "format")]
15pub use crate::skip::{AsBytes, Bytes};
16
17/// A trait for working with iterables of bytes.
18///
19/// These iterators can either be contiguous or not contiguous and provide
20/// methods for reading data and accessing underlying data. The readers
21/// can either be contiguous or non-contiguous, although performance and
22/// some API methods may not be available for both.
23///
24/// # Safety
25///
26/// Safe if [`set_cursor`] is set to an index <= [`buffer_length`], so no
27/// out-of-bounds reads can occur. Also, [`get_buffer`] must return a slice of
28/// initialized bytes. The caller must also ensure that any calls that increment
29/// the cursor, such as [`step_by_unchecked`], [`step_unchecked`], and
30/// [`peek_many_unchecked`] never exceed [`buffer_length`] as well.
31///
32/// [`set_cursor`]: `Iter::set_cursor`
33/// [`buffer_length`]: `Iter::buffer_length`
34/// [`get_buffer`]: `Iter::get_buffer`
35/// [`step_by_unchecked`]: `Iter::step_by_unchecked`
36/// [`step_unchecked`]: `Iter::step_unchecked`
37/// [`peek_many_unchecked`]: `Iter::peek_many_unchecked`
38#[cfg(feature = "parse")]
39pub unsafe trait Iter<'a> {
40    /// Determine if the buffer is contiguous in memory.
41    const IS_CONTIGUOUS: bool;
42
43    // CURSORS
44    // -------
45
46    /// Get a ptr to the current start of the buffer.
47    #[inline(always)]
48    fn as_ptr(&self) -> *const u8 {
49        self.as_slice().as_ptr()
50    }
51
52    /// Get a slice to the current start of the buffer.
53    #[inline(always)]
54    fn as_slice(&self) -> &'a [u8] {
55        debug_assert!(self.cursor() <= self.buffer_length());
56        // SAFETY: safe since index must be in range.
57        unsafe { self.get_buffer().get_unchecked(self.cursor()..) }
58    }
59
60    /// Get a slice to the full underlying contiguous buffer,
61    fn get_buffer(&self) -> &'a [u8];
62
63    /// Get the total number of elements in the underlying buffer.
64    #[inline(always)]
65    fn buffer_length(&self) -> usize {
66        self.get_buffer().len()
67    }
68
69    /// Get if no bytes are available in the buffer.
70    ///
71    /// This operators on the underlying buffer: that is,
72    /// it returns if [`as_slice`] would return an empty slice.
73    ///
74    /// [as_slice]: Iter::as_slice
75    #[inline(always)]
76    fn is_buffer_empty(&self) -> bool {
77        self.cursor() >= self.get_buffer().len()
78    }
79
80    /// Get the current index of the iterator in the slice.
81    fn cursor(&self) -> usize;
82
83    /// Set the current index of the iterator in the slice.
84    ///
85    /// This is **NOT** the current position of the iterator,
86    /// since iterators may skip digits: this is the cursor
87    /// in the underlying buffer. For example, if `slc[2]` is
88    /// skipped, `set_cursor(3)` would be the 3rd element in
89    /// the iterator, not the 4th.
90    ///
91    /// # Safety
92    ///
93    /// Safe if `index <= self.buffer_length()`. Although this
94    /// won't affect safety, the caller also should be careful it
95    /// does not set the cursor within skipped characters
96    /// since this could affect correctness: an iterator that
97    /// only accepts non-consecutive digit separators would
98    /// pass if the cursor was set between the two.
99    unsafe fn set_cursor(&mut self, index: usize);
100
101    /// Get the current number of digits returned by the iterator.
102    ///
103    /// For contiguous iterators, this can include the sign character, decimal
104    /// point, and the exponent sign (that is, it is always the cursor). For
105    /// non-contiguous iterators, this must always be the only the number of
106    /// digits returned.
107    ///
108    /// This is never used for indexing but will be used for API detection.
109    fn current_count(&self) -> usize;
110
111    // PROPERTIES
112
113    /// Determine if the buffer is contiguous.
114    #[inline(always)]
115    fn is_contiguous(&self) -> bool {
116        Self::IS_CONTIGUOUS
117    }
118
119    /// Get the next value available without consuming it.
120    ///
121    /// This does **NOT** skip digits, and directly fetches the item
122    /// from the underlying buffer.
123    #[inline(always)]
124    fn first(&self) -> Option<&'a u8> {
125        self.get_buffer().get(self.cursor())
126    }
127
128    /// Check if the next element is a given value.
129    #[inline(always)]
130    fn first_is_cased(&self, value: u8) -> bool {
131        Some(&value) == self.first()
132    }
133
134    /// Check if the next element is a given value without case sensitivity.
135    #[inline(always)]
136    fn first_is_uncased(&self, value: u8) -> bool {
137        if let Some(&c) = self.first() {
138            c.eq_ignore_ascii_case(&value)
139        } else {
140            false
141        }
142    }
143
144    /// Check if the next item in buffer is a given value with optional case
145    /// sensitivity.
146    #[inline(always)]
147    fn first_is(&self, value: u8, is_cased: bool) -> bool {
148        if is_cased {
149            self.first_is_cased(value)
150        } else {
151            self.first_is_uncased(value)
152        }
153    }
154
155    // STEP BY
156    // -------
157
158    /// Advance the internal slice by `N` elements.
159    ///
160    /// This does not advance the iterator by `N` elements for
161    /// non-contiguous iterators: this just advances the internal,
162    /// underlying buffer. This is useful for multi-digit optimizations
163    /// for contiguous iterators.
164    ///
165    /// This does not increment the count of items: returns: this only
166    /// increments the index, not the total digits returned. You must use
167    /// this carefully: if stepping over a digit, you must then call
168    /// [`increment_count`] afterwards or else the internal count will
169    /// be incorrect.
170    ///
171    /// [`increment_count`]: DigitsIter::increment_count
172    ///
173    /// # Panics
174    ///
175    /// This will panic if the buffer advances for non-contiguous
176    /// iterators if the current byte is a digit separator, or if the
177    /// count is more than 1.
178    ///
179    /// # Safety
180    ///
181    /// As long as the iterator is at least `N` elements, this
182    /// is safe.
183    unsafe fn step_by_unchecked(&mut self, count: usize);
184
185    /// Advance the internal slice by 1 element.
186    ///
187    ///
188    /// This does not increment the count of items: returns: this only
189    /// increments the index, not the total digits returned. You must
190    /// use this carefully: if stepping over a digit, you must then call
191    /// [`increment_count`] afterwards or else the internal count will
192    /// be incorrect.
193    ///
194    /// [`increment_count`]: DigitsIter::increment_count
195    ///
196    /// # Panics
197    ///
198    /// This will panic if the buffer advances for non-contiguous
199    /// iterators if the current byte is a digit separator.
200    ///
201    /// # Safety
202    ///
203    /// Safe as long as the iterator is not empty.
204    #[inline(always)]
205    unsafe fn step_unchecked(&mut self) {
206        debug_assert!(!self.as_slice().is_empty());
207        // SAFETY: safe if `self.index < self.buffer_length()`.
208        unsafe { self.step_by_unchecked(1) };
209    }
210
211    // READ
212    // ----
213
214    /// Read a value of a difference type from the iterator.
215    ///
216    /// This does **not** advance the internal state of the iterator.
217    /// This can only be implemented for contiguous iterators: non-
218    /// contiguous iterators **MUST** panic.
219    ///
220    /// # Panics
221    ///
222    /// If the iterator is a non-contiguous iterator.
223    ///
224    /// # Safety
225    ///
226    /// Safe as long as the number of the buffer is contains as least as
227    /// many bytes as the size of V. This must be unimplemented for
228    /// non-contiguous iterators.
229    #[inline(always)]
230    unsafe fn peek_many_unchecked<V>(&self) -> V {
231        unimplemented!();
232    }
233
234    /// Try to read a the next four bytes as a u32.
235    ///
236    /// This does not advance the internal state of the iterator.
237    #[inline(always)]
238    fn peek_u32(&self) -> Option<u32> {
239        if Self::IS_CONTIGUOUS && self.as_slice().len() >= mem::size_of::<u32>() {
240            // SAFETY: safe since we've guaranteed the buffer is greater than
241            // the number of elements read. u32 is valid for all bit patterns
242            unsafe { Some(self.peek_many_unchecked()) }
243        } else {
244            None
245        }
246    }
247
248    /// Try to read the next eight bytes as a u64.
249    ///
250    /// This does not advance the internal state of the iterator.
251    #[inline(always)]
252    fn peek_u64(&self) -> Option<u64> {
253        if Self::IS_CONTIGUOUS && self.as_slice().len() >= mem::size_of::<u64>() {
254            // SAFETY: safe since we've guaranteed the buffer is greater than
255            // the number of elements read. u64 is valid for all bit patterns
256            unsafe { Some(self.peek_many_unchecked()) }
257        } else {
258            None
259        }
260    }
261}
262
263/// Iterator over a contiguous block of bytes.
264///
265/// This allows us to convert to-and-from-slices, raw pointers, and
266/// peek/query the data from either end cheaply.
267///
268/// A default implementation is provided for slice iterators.
269/// This trait **should never** return `null` from `as_ptr`, or be
270/// implemented for non-contiguous data.
271pub trait DigitsIter<'a>: Iterator<Item = &'a u8> + Iter<'a> {
272    /// Get if the iterator cannot return any more elements.
273    ///
274    /// This may advance the internal iterator state, but not
275    /// modify the next returned value.
276    ///
277    /// If this is an iterator, this is based on the number of items
278    /// left to be returned. We do not necessarly know the length of
279    /// the buffer. If this is a non-contiguous iterator, this **MUST**
280    /// advance the state until it knows a value can be returned.
281    ///
282    /// Any incorrect implementations of this affect all safety invariants
283    /// for the rest of the trait. For contiguous iterators, this can be
284    /// as simple as checking if `self.cursor >= self.slc.len()`, but for
285    /// non-contiguous iterators you **MUST** advance to the next element
286    /// to be returned, then check to see if a value exists. The safest
287    /// implementation is always to check if `self.peek().is_none()` and
288    /// ensure [`peek`] is always safe.
289    ///
290    /// If you would like to see if the cursor is at the end of the buffer,
291    /// see [`is_buffer_empty`] instead.
292    ///
293    /// [is_buffer_empty]: Iter::is_buffer_empty
294    /// [peek]: DigitsIter::peek
295    #[inline(always)]
296    #[allow(clippy::wrong_self_convention)] // reason="required for peeking next item"
297    fn is_consumed(&mut self) -> bool {
298        self.peek().is_none()
299    }
300
301    /// Increment the number of digits that have been returned by the iterator.
302    ///
303    /// For contiguous iterators, this is a no-op. For non-contiguous iterators,
304    /// this increments the count by 1.
305    fn increment_count(&mut self);
306
307    /// Peek the next value of the iterator, without consuming it.
308    ///
309    /// Note that this can modify the internal state, by skipping digits
310    /// for iterators that find the first non-zero value, etc. We optimize
311    /// this for the case where we have contiguous iterators, since
312    /// non-contiguous iterators already have a major performance penalty.
313    fn peek(&mut self) -> Option<Self::Item>;
314
315    /// Peek the next value of the iterator, and step only if it exists.
316    #[inline(always)]
317    fn try_read(&mut self) -> Option<Self::Item> {
318        if let Some(value) = self.peek() {
319            // SAFETY: the slice cannot be empty because we peeked a value.
320            unsafe { self.step_unchecked() };
321            Some(value)
322        } else {
323            None
324        }
325    }
326
327    /// Check if the next element is a given value.
328    #[inline(always)]
329    fn peek_is_cased(&mut self, value: u8) -> bool {
330        Some(&value) == self.peek()
331    }
332
333    /// Check if the next element is a given value without case sensitivity.
334    #[inline(always)]
335    fn peek_is_uncased(&mut self, value: u8) -> bool {
336        if let Some(&c) = self.peek() {
337            c.eq_ignore_ascii_case(&value)
338        } else {
339            false
340        }
341    }
342
343    /// Check if the next element is a given value with optional case
344    /// sensitivity.
345    #[inline(always)]
346    fn peek_is(&mut self, value: u8, is_cased: bool) -> bool {
347        if is_cased {
348            self.peek_is_cased(value)
349        } else {
350            self.peek_is_uncased(value)
351        }
352    }
353
354    /// Peek the next value and consume it if the read value matches the
355    /// expected one.
356    #[inline(always)]
357    fn read_if<Pred: FnOnce(u8) -> bool>(&mut self, pred: Pred) -> Option<u8> {
358        // NOTE: This was implemented to remove usage of unsafe throughout to code
359        // base, however, performance was really not up to scratch. I'm not sure
360        // the cause of this.
361        if let Some(&peeked) = self.peek() {
362            if pred(peeked) {
363                // SAFETY: the slice cannot be empty because we peeked a value.
364                unsafe { self.step_unchecked() };
365                Some(peeked)
366            } else {
367                None
368            }
369        } else {
370            None
371        }
372    }
373
374    /// Read a value if the value matches the provided one.
375    #[inline(always)]
376    fn read_if_value_cased(&mut self, value: u8) -> Option<u8> {
377        if self.peek() == Some(&value) {
378            // SAFETY: the slice cannot be empty because we peeked a value.
379            unsafe { self.step_unchecked() };
380            Some(value)
381        } else {
382            None
383        }
384    }
385
386    /// Read a value if the value matches the provided one without case
387    /// sensitivity.
388    #[inline(always)]
389    fn read_if_value_uncased(&mut self, value: u8) -> Option<u8> {
390        self.read_if(|x| x.eq_ignore_ascii_case(&value))
391    }
392
393    /// Read a value if the value matches the provided one.
394    #[inline(always)]
395    fn read_if_value(&mut self, value: u8, is_cased: bool) -> Option<u8> {
396        if is_cased {
397            self.read_if_value_cased(value)
398        } else {
399            self.read_if_value_uncased(value)
400        }
401    }
402
403    /// Skip zeros from the start of the iterator
404    #[inline(always)]
405    fn skip_zeros(&mut self) -> usize {
406        let start = self.current_count();
407        while self.read_if_value_cased(b'0').is_some() {
408            self.increment_count();
409        }
410        self.current_count() - start
411    }
412
413    /// Determine if the character is a digit.
414    fn is_digit(&self, value: u8) -> bool;
415}