lexical_write_integer/
jeaiii.rs

1//! Optimized integer-to-string conversion routines for decimal values.
2
3//! This algorihm is described in [`Faster Integer Formatting`], which uses
4//! binary search trees for highly optimized digit writing. For large numbers,
5//! the increased branching can destroy performance, but for 32-bit or smaller
6//! integers it is always faster and can be optimized in 64-bit cases.
7//!
8//! This is based off of the work by James Anhalt (jeaiii) and Junekey Jeon
9//! (jk-jeon). This has a few advantages, one is that indexing can be done
10//! without bounds checking, without any major performance hits, which minimizes
11//! the unchecked indexing and therefore potential unsoundness.
12//!
13//! This has some additional changes for performance enhancements, most notably,
14//! it flattens out most of the comparisons and uses larger first, which
15//! paradoxically seems to improve performance, potentially due to less
16//! branching.
17//!
18//! See [Algorithm.md](/docs/Algorithm.md) for a more detailed description of
19//! the algorithm choice here. See [Benchmarks.md](/docs/Benchmarks.md) for
20//! recent benchmark data.
21//!
22//! [`Faster Integer Formatting`]: https://jk-jeon.github.io/posts/2022/02/jeaiii-algorithm/
23
24#![cfg(not(feature = "compact"))]
25#![doc(hidden)]
26
27use lexical_util::digit::digit_to_char_const;
28use lexical_util::div128::fast_u128_divrem;
29
30use crate::table::DIGIT_TO_BASE10_SQUARED;
31
32// Mask to extract the lower half.
33const LO32: u64 = u32::MAX as u64;
34
35/// Get the next 2 digits from the input.
36#[inline(always)]
37fn next2(prod: &mut u64) -> u32 {
38    *prod = (*prod & LO32) * 100;
39    (*prod >> 32) as u32
40}
41
42/// Quickly calculate `n / 1e10` and `n % 1e10`.
43#[inline(always)]
44fn u128_divrem_10_10pow10(n: u128) -> (u128, u64) {
45    fast_u128_divrem(
46        n,
47        10000000000,
48        18889465931478580854784,
49        10,
50        73075081866545145910184241635814150983,
51        31,
52    )
53}
54
55/// Quickly calculate `n / 1e10` and `n % 1e10`.
56///
57/// We use this for quickly breaking our integer into
58/// chunks of 10 digits for fast u128 formatting.
59#[inline(always)]
60fn div128_rem_1e10(n: u128) -> (u128, u64) {
61    u128_divrem_10_10pow10(n)
62}
63
64// Index a value from a buffer without bounds checking.
65macro_rules! i {
66    ($array:ident[$index:expr]) => {
67        // SAFETY: Safe if `array.len() > index`.
68        unsafe { *$array.get_unchecked($index) }
69    };
70}
71
72// Write N digits to our buffer.
73macro_rules! write_n {
74    (@1 $buffer:ident, $index:expr, $n:expr) => {{
75        let index = $index;
76        let digit = digit_to_char_const($n as u32, 10);
77        $buffer[index] = digit;
78        index + 1
79    }};
80
81    (@2 $buffer:ident, $index:expr, $r:expr) => {{
82        let index = $index;
83        let r = $r as usize;
84        // NOTE: This always should be true due to how we calculate our bounds.
85        // `r` is always a single digit, so `2 * r` must be smaller than our
86        // square table.
87        debug_assert!(r < DIGIT_TO_BASE10_SQUARED.len());
88        $buffer[index] = i!(DIGIT_TO_BASE10_SQUARED[r]);
89        $buffer[index + 1] = i!(DIGIT_TO_BASE10_SQUARED[r + 1]);
90        index + 2
91    }};
92
93    // Identical to `@2` except it's writing from the end, not front.
94    // This is for our Alexandrescu-popularized algorithm.
95    (@2sub $buffer:ident, $index:ident, $r:expr) => {{
96        $index -= 2;
97        _ = write_n!(@2 $buffer, $index, $r);
98    }};
99
100    // This writes 4 digits, using 2sub twice after getting the high and low.
101    (@4sub $buffer:ident, $index:ident, $value:ident) => {{
102        let r = $value % 10000;
103        $value /= 10000;
104        let r1 = 2 * (r / 100);
105        let r2 = 2 * (r % 100);
106        write_n!(@2sub $buffer, $index, r2);
107        write_n!(@2sub $buffer, $index, r1);
108    }};
109}
110
111// Print the next 2 digits, using `next2`.
112macro_rules! print_n {
113    (@2 $buffer:ident, $index:ident, $prod:ident) => {
114        $index = write_n!(@2 $buffer, $index, next2(&mut $prod) * 2);
115    };
116
117    (@n $buffer:ident, $index:ident, $n:ident, $magic:expr, $shift:expr, $remaining:expr) => {{
118        let mut prod = ($n as u64) * $magic;
119        prod >>= $shift;
120        let two = (prod >> 32) as u32;
121        if two < 10 {
122            $index = write_n!(@1 $buffer, $index, two);
123            for _ in 0..$remaining {
124                print_n!(@2 $buffer, $index, prod);
125            }
126        } else {
127            $index = write_n!(@2 $buffer, $index, two * 2);
128            for _ in 0..$remaining {
129                print_n!(@2 $buffer, $index, prod);
130            }
131        }
132        $index
133    }};
134}
135
136// Optimized digit writers for the number of digits for each.
137// This avoids code duplication while keeping our flat logic.
138macro_rules! write_digits {
139    (@1 $buffer:ident, $n:ident) => {
140        write_n!(@1 $buffer, 0, $n)
141    };
142
143    (@2 $buffer:ident, $n:ident) => {
144        write_n!(@2 $buffer, 0, $n * 2)
145    };
146
147    // NOTE: This is only used for u8
148    (@3 $buffer:ident, $n:ident) => {{
149        // `42949673 = ceil(2^32 / 10^2)`
150        let mut y = $n as u64 * 42949673u64;
151        _ = write_n!(@1 $buffer, 0, y >> 32);
152        write_n!(@2 $buffer, 1, next2(&mut y) * 2)
153    }};
154
155    (@3-4 $buffer:ident, $n:ident) => {{
156        // `42949673 = ceil(2^32 / 10^2)`
157        let mut index = 0;
158        print_n!(@n $buffer, index, $n, 42949673u64, 0, 1)
159    }};
160
161    (@5 $buffer:ident, $n:ident) => {{
162        // `429497 == ceil(2^32 / 10^4)`
163        let mut y = $n as u64 * 429497u64;
164        _ = write_n!(@1 $buffer, 0, y >> 32);
165        _ = write_n!(@2 $buffer, 1, next2(&mut y) * 2);
166        write_n!(@2 $buffer, 3, next2(&mut y) * 2)
167    }};
168
169    (@5-6 $buffer:ident, $n:ident) => {{
170        // `429497 == ceil(2^32 / 10^4)`
171        let mut index = 0;
172        print_n!(@n $buffer, index, $n, 429497u64, 0, 2)
173    }};
174
175    (@7-8 $buffer:ident, $n:ident) => {{
176        // `281474978 == ceil(2^48 / 10^6) + 1`
177        let mut index = 0;
178        print_n!(@n $buffer, index, $n, 281474978u64, 16, 3)
179    }};
180
181    (@9 $buffer:ident, $n:ident) => {{
182        // 1441151882 = ceil(2^57 / 10^8) + 1
183        let mut y = ($n as u64) * 1441151882u64;
184        y >>= 25;
185        _ = write_n!(@1 $buffer, 0, y >> 32);
186        _ = write_n!(@2 $buffer, 1, next2(&mut y) * 2);
187        _ = write_n!(@2 $buffer, 3, next2(&mut y) * 2);
188        _ = write_n!(@2 $buffer, 5, next2(&mut y) * 2);
189        write_n!(@2 $buffer, 7, next2(&mut y) * 2)
190    }};
191
192    (@10 $buffer:ident, $n:ident) => {{
193        // `1441151881 = ceil(2^57 / 10^8)`
194        let mut y = ($n as u64) * 1441151881u64;
195        y >>= 25;
196        _ = write_n!(@2 $buffer, 0, (y >> 32) * 2);
197        _ = write_n!(@2 $buffer, 2, next2(&mut y) * 2);
198        _ = write_n!(@2 $buffer, 4, next2(&mut y) * 2);
199        _ = write_n!(@2 $buffer, 6, next2(&mut y) * 2);
200        write_n!(@2 $buffer, 8, next2(&mut y) * 2)
201    }};
202
203    (@10u64 $buffer:ident, $n:ident) => {{
204        // Unfortunately, there is no good way without using 128 bits,
205        // since the smallest interval overflows a 64-bit integer at
206        // ~>= 5.5e9. This requires the value to be in `[1e9, 1e10)`,
207        // since there's no lower bound for the calculation and so it
208        // will not work with smaller values.
209        // D = 32, k = 8, L = 28
210        // `11529215047 = ceil(2^60 / 10^8)`
211        let prod = ($n as u128) * 11529215047u128;
212        let mut y = (prod >> 28) as u64;
213        _ = write_n!(@2 $buffer, 0, (y >> 32) * 2);
214        _ = write_n!(@2 $buffer, 2, next2(&mut y) * 2);
215        _ = write_n!(@2 $buffer, 4, next2(&mut y) * 2);
216        _ = write_n!(@2 $buffer, 6, next2(&mut y) * 2);
217        write_n!(@2 $buffer, 8, next2(&mut y) * 2)
218    }};
219
220    (@10alex $buffer:ident, $n:ident, $offset:ident) => {{
221        // This always writes 10 digits for any value `[0, 1e10)`,
222        // but it uses a slower algorithm to do so. Since we don't
223        // have to worry about
224        let mut value = $n;
225        let mut index = 10 + $offset;
226        write_n!(@4sub $buffer, index, value);
227        write_n!(@4sub $buffer, index, value);
228        write_n!(@2sub $buffer, index, value * 2);
229        10 + $offset
230    }};
231}
232
233/// Optimized jeaiii algorithm for u8.
234#[inline(always)]
235pub fn from_u8(n: u8, buffer: &mut [u8]) -> usize {
236    // NOTE: For some reason, doing the large comparisons **FIRST**
237    // seems to be faster than the inverse, for both large and small
238    // values, which seems to make little sense. But, the benchmarks
239    // tell us reality.
240    let buffer = &mut buffer[..3];
241    if n >= 100 {
242        write_digits!(@3 buffer, n)
243    } else if n >= 10 {
244        write_digits!(@2 buffer, n)
245    } else {
246        write_digits!(@1 buffer, n)
247    }
248}
249
250/// Optimized jeaiii algorithm for u16.
251#[inline(always)]
252pub fn from_u16(n: u16, buffer: &mut [u8]) -> usize {
253    // NOTE: Like before, this optimizes better for large and small
254    // values if there's a flat comparison with larger values first.
255    let buffer = &mut buffer[..5];
256    if n >= 1_0000 {
257        write_digits!(@5 buffer, n)
258    } else if n >= 100 {
259        write_digits!(@3-4 buffer, n)
260    } else if n >= 10 {
261        write_digits!(@2 buffer, n)
262    } else {
263        write_digits!(@1 buffer, n)
264    }
265}
266
267/// Optimized jeaiii algorithm for u32.
268#[inline(always)]
269#[allow(clippy::collapsible_else_if)] // reason = "branching is fine-tuned for performance"
270pub fn from_u32(n: u32, buffer: &mut [u8]) -> usize {
271    // NOTE: Like before, this optimizes better for large and small
272    // values if there's a flat comparison with larger values first.
273    let buffer = &mut buffer[..10];
274    if n < 1_0000 {
275        if n >= 100 {
276            write_digits!(@3-4 buffer, n)
277        } else if n >= 10 {
278            write_digits!(@2 buffer, n)
279        } else {
280            write_digits!(@1 buffer, n)
281        }
282    } else if n < 1_0000_0000 {
283        if n >= 100_0000 {
284            write_digits!(@7-8 buffer, n)
285        } else {
286            write_digits!(@5-6 buffer, n)
287        }
288    } else {
289        if n >= 10_0000_0000 {
290            write_digits!(@10 buffer, n)
291        } else {
292            write_digits!(@9 buffer, n)
293        }
294    }
295}
296
297/// Optimized jeaiii algorithm for u64.
298#[inline(always)]
299#[allow(clippy::collapsible_else_if)] // reason = "branching is fine-tuned for performance"
300fn from_u64_impl(n: u64, buffer: &mut [u8], is_signed: bool) -> usize {
301    // NOTE: Like before, this optimizes better for large and small
302    // values if there's a flat comparison with larger values first.
303    const FACTOR: u64 = 100_0000_0000;
304    // NOTE `i64` takes a max of 19 digits, while `u64` takes a max of 20.
305    let buffer = if is_signed {
306        &mut buffer[..19]
307    } else {
308        &mut buffer[..20]
309    };
310    if n < 1_0000 {
311        // 1 to 4 digits
312        if n >= 100 {
313            write_digits!(@3-4 buffer, n)
314        } else if n >= 10 {
315            write_digits!(@2 buffer, n)
316        } else {
317            write_digits!(@1 buffer, n)
318        }
319    } else if n < FACTOR {
320        // 5 to 10 digits
321        if n >= 10_0000_0000 {
322            // NOTE: We DO NOT know if this is >= u32::MAX,
323            // and the `write_digits!(@10)` is only accurate
324            // if `n <= 5.5e9`, which we cannot guarantee.
325            write_digits!(@10u64 buffer, n)
326        } else if n >= 1_0000_0000 {
327            write_digits!(@9 buffer, n)
328        } else if n >= 100_0000 {
329            write_digits!(@7-8 buffer, n)
330        } else {
331            write_digits!(@5-6 buffer, n)
332        }
333    } else {
334        // 11-20 digits, can do in 2 steps (11-19 if is signed).
335        // NOTE: `hi` has to be in `[0, 2^31)`, while `lo` is in `[0, 10^11)`
336        // So, we can use our `from_u64_small` for hi. For our `lo`, we always
337        // need to write 10 digits. However, the `jeaiii` algorithm is too
338        // slow, so we use a modified variant of our 2-digit unfolding for
339        // exactly 10 digits to read our values. We can optimize this in
340        // 2x 4 digits and 1x 2 digits.
341        let hi = (n / FACTOR) as u32;
342        let lo = n % FACTOR;
343        let offset = from_u32(hi, buffer);
344        write_digits!(@10alex buffer, lo, offset)
345    }
346}
347
348/// Optimized jeaiii algorithm for u64.
349#[inline(always)]
350pub fn from_u64(n: u64, buffer: &mut [u8]) -> usize {
351    from_u64_impl(n, buffer, false)
352}
353
354/// Optimized jeaiii algorithm for i64, which must be positive.
355///
356/// This value **MUST** have originally been from an `i64`, since it
357/// uses `19` for the bounds checked, so this will panic if `>= 10^19`
358/// is passed to the function.
359#[inline(always)]
360pub fn from_i64(n: u64, buffer: &mut [u8]) -> usize {
361    debug_assert!(n <= 1000_0000_0000_0000_0000u64);
362    from_u64_impl(n, buffer, true)
363}
364
365/// Optimized jeaiii algorithm for u128.
366#[inline(always)]
367#[allow(clippy::collapsible_else_if)] // reason = "branching is fine-tuned for performance"
368pub fn from_u128(n: u128, buffer: &mut [u8]) -> usize {
369    // NOTE: Like before, this optimizes better for large and small
370    // values if there's a flat comparison with larger values first.
371    let buffer = &mut buffer[..39];
372    if n < 1_0000 {
373        // 1 to 4 digits
374        if n >= 100 {
375            write_digits!(@3-4 buffer, n)
376        } else if n >= 10 {
377            write_digits!(@2 buffer, n)
378        } else {
379            write_digits!(@1 buffer, n)
380        }
381    } else if n < 100_0000_0000 {
382        // 5 to 10 digits
383        if n >= 10_0000_0000 {
384            // NOTE: We DO NOT know if this is >= u32::MAX,
385            // and the `write_digits!(@10)` is only accurate
386            // if `n <= 5.5e9`, which we cannot guarantee.
387            write_digits!(@10u64 buffer, n)
388        } else if n >= 1_0000_0000 {
389            write_digits!(@9 buffer, n)
390        } else if n >= 100_0000 {
391            write_digits!(@7-8 buffer, n)
392        } else {
393            write_digits!(@5-6 buffer, n)
394        }
395    } else {
396        // 11-39 digits, can do in 2-4 steps
397
398        // NOTE: We need to use fast division (`u128_divrem`) for this, which
399        // we can do in 2-4 steps (`2^128 - 1 == ~3.4e38`). So, we need to
400        // calculate the number of digits to avoid shifting into place, then
401        // once we do, we can write 1-3 `lo` digits and the `hi` digits (which
402        // must be in the range `[0, 2^29)`). Our `jeaiii` algorithm is too
403        // slow, so we use a modified variant of our 2-digit unfolding for
404        // exactly 10 digits to read our values. We can optimize this in
405        // 2x 4 digits and 1x 2 digits.
406        if n >= 100_0000_0000_0000_0000_0000_0000_0000 {
407            // 4 steps
408            let (mid, d) = div128_rem_1e10(n);
409            let (mid, c) = div128_rem_1e10(mid);
410            let (hi, b) = div128_rem_1e10(mid);
411            // NOTE: `2^128 == ~3.4e38`, so `a` must be in the
412            // range `[0, 2^29)`)
413            let a = hi as u32;
414            let mut offset = from_u32(a, buffer);
415            offset = write_digits!(@10alex buffer, b, offset);
416            offset = write_digits!(@10alex buffer, c, offset);
417            write_digits!(@10alex buffer, d, offset)
418        } else if n >= 1_0000_0000_0000_0000_0000 {
419            // 3 steps
420            let (mid, lo) = div128_rem_1e10(n);
421            let (hi, mid) = div128_rem_1e10(mid);
422            let hi = hi as u64;
423            let mut offset = from_u64(hi, buffer);
424            offset = write_digits!(@10alex buffer, mid, offset);
425            write_digits!(@10alex buffer, lo, offset)
426        } else {
427            // 2 steps
428            let (hi, lo) = div128_rem_1e10(n);
429            let hi = hi as u64;
430            let offset = from_u64(hi, buffer);
431            write_digits!(@10alex buffer, lo, offset)
432        }
433    }
434}