lexical_parse_float/
bigint.rs

//! A simple big-integer type for slow path algorithms.
//!
//! This includes a minimal stack vector for use in big-integer arithmetic.

#![doc(hidden)]

use core::{cmp, mem, ops, ptr, slice};

#[cfg(feature = "radix")]
use crate::float::ExtendedFloat80;
use crate::float::RawFloat;
use crate::limits::{u32_power_limit, u64_power_limit};
#[cfg(not(feature = "compact"))]
use crate::table::get_large_int_power;

/// Index an array without bounds checking.
///
/// # Safety
///
/// Safe if `index < array.len()`.
macro_rules! index_unchecked {
    ($x:ident[$i:expr]) => {
        // SAFETY: safe if `index < array.len()`.
        *$x.get_unchecked($i)
    };
}

// BIGINT
// ------

/// Number of bits in a Bigint.
///
/// This needs to be at least the number of bits required to store
/// a Bigint, which is `log2(radix**digits)`.
/// ≅ 5600 for base-36, rounded-up.
#[cfg(feature = "radix")]
const BIGINT_BITS: usize = 6000;

/// ≅ 3600 for base-10, rounded-up.
#[cfg(not(feature = "radix"))]
const BIGINT_BITS: usize = 4000;

/// The number of limbs for the bigint.
const BIGINT_LIMBS: usize = BIGINT_BITS / Limb::BITS as usize;

/// Storage for a big integer type.
///
/// This is used for algorithms when we have a finite number of digits.
/// Specifically, it stores all the significant digits scaled to the
/// proper exponent, as an integral type, and then directly compares
/// these digits.
///
/// This requires us to store the number of significant bits, plus the
/// number of exponent bits (required) since we scale everything
/// to the same exponent.
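///
/// A rough usage sketch (illustrative, not a doc-test): the parsed digits
/// are stored as an integer, then scaled by the decimal exponent.
///
/// ```rust,ignore
/// let mut x = Bigint::from_u64(15);
/// // Scale by 10^2: multiply by 5^2, then shift left by 2 bits.
/// x.pow(10, 2).unwrap();
/// assert_eq!(&*x.data, &[1500]);
/// ```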
#[derive(Clone, PartialEq, Eq)]
pub struct Bigint {
    /// Significant digits for the float, stored in a big integer in LE order.
    ///
    /// This is pretty much the same number of digits for any radix, since the
    /// significant digits balance out the zeros from the exponent:
    ///     1. Decimal is 1091 digits, 767 mantissa digits + 324 exponent zeros.
    ///     2. Base 6 is 1097 digits, or 680 mantissa digits + 417 exponent
    ///        zeros.
    ///     3. Base 36 is 1086 digits, or 877 mantissa digits + 209 exponent
    ///        zeros.
    ///
    /// However, the number of bytes required is larger for large radixes:
    /// for decimal, we need `log2(10**1091) ≅ 3600`, while for base 36
    /// we need `log2(36**1086) ≅ 5600`. Since we use uninitialized data,
    /// we avoid a major performance hit from the large buffer size.
    pub data: StackVec<BIGINT_LIMBS>,
}

impl Bigint {
    /// Construct a bigint representing 0.
    #[inline(always)]
    pub const fn new() -> Self {
        Self {
            data: StackVec::new(),
        }
    }

    /// Construct a bigint from an integer.
    #[inline(always)]
    pub fn from_u32(value: u32) -> Self {
        Self {
            data: StackVec::from_u32(value),
        }
    }

    /// Construct a bigint from an integer.
    #[inline(always)]
    pub fn from_u64(value: u64) -> Self {
        Self {
            data: StackVec::from_u64(value),
        }
    }

    #[inline(always)]
    pub fn hi64(&self) -> (u64, bool) {
        self.data.hi64()
    }

    /// Multiply in place by `base^exp`, as if by exponentiation.
    #[inline(always)]
    pub fn pow(&mut self, base: u32, exp: u32) -> Option<()> {
        let (odd, shift) = split_radix(base);
        if odd != 0 {
            pow::<BIGINT_LIMBS>(&mut self.data, odd, exp)?;
        }
        if shift != 0 {
            shl(&mut self.data, (exp * shift) as usize)?;
        }
        Some(())
    }

    /// Calculate the bit-length of the big-integer.
    #[inline(always)]
    pub fn bit_length(&self) -> u32 {
        bit_length(&self.data)
    }
}

impl ops::MulAssign<&Bigint> for Bigint {
    fn mul_assign(&mut self, rhs: &Bigint) {
        self.data *= &rhs.data;
    }
}

impl Default for Bigint {
    fn default() -> Self {
        Self::new()
    }
}

/// Number of bits in a Bigfloat.
///
/// This needs to be at least the number of bits required to store
/// a Bigfloat, which is `F::EXPONENT_BIAS + F::BITS`.
/// Bias ≅ 1075, with 64 extra for the digits.
#[cfg(feature = "radix")]
const BIGFLOAT_BITS: usize = 1200;

/// The number of limbs for the Bigfloat.
#[cfg(feature = "radix")]
const BIGFLOAT_LIMBS: usize = BIGFLOAT_BITS / Limb::BITS as usize;

/// Storage for a big floating-point type.
///
/// This is used for the algorithm with a non-finite digit count, which creates
/// a representation of `b+h` and the float scaled into the range `[1, radix)`.
#[cfg(feature = "radix")]
#[derive(Clone, PartialEq, Eq)]
pub struct Bigfloat {
    /// Significant digits for the float, stored in a big integer in LE order.
    ///
    /// This only needs ~1075 bits for the exponent, and ~64 more for the
    /// significant digits, since it's based on a theoretical representation
    /// of the halfway point. This means we can have a significantly smaller
    /// representation. The largest 64-bit float exponent in magnitude is
    /// 1074 (for the subnormal `2^-1074`), which requires the same number
    /// of bits in any radix.
    pub data: StackVec<BIGFLOAT_LIMBS>,
    /// Binary exponent for the float type.
    pub exp: i32,
}

#[cfg(feature = "radix")]
impl Bigfloat {
    /// Construct a bigfloat representing 0.
    #[inline(always)]
    pub const fn new() -> Self {
        Self {
            data: StackVec::new(),
            exp: 0,
        }
    }

    /// Construct a bigfloat from an extended-precision float.
    #[inline(always)]
    pub fn from_float(fp: ExtendedFloat80) -> Self {
        Self {
            data: StackVec::from_u64(fp.mant),
            exp: fp.exp,
        }
    }

    /// Construct a bigfloat from an integer.
    #[inline(always)]
    pub fn from_u32(value: u32) -> Self {
        Self {
            data: StackVec::from_u32(value),
            exp: 0,
        }
    }

    /// Construct a bigfloat from an integer.
    #[inline(always)]
    pub fn from_u64(value: u64) -> Self {
        Self {
            data: StackVec::from_u64(value),
            exp: 0,
        }
    }

    /// Multiply in place by `base^exp`, as if by exponentiation.
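    ///
    /// Unlike `Bigint::pow`, the power-of-two part of the radix only
    /// adjusts the binary exponent. A rough sketch (illustrative, not a
    /// doc-test):
    ///
    /// ```rust,ignore
    /// let mut x = Bigfloat::from_u64(1);
    /// x.pow(10, 2).unwrap();       // multiply by 10^2 == 5^2 * 2^2
    /// assert_eq!(&*x.data, &[25]); // the odd part, 5^2
    /// assert_eq!(x.exp, 2);        // the power-of-two part
    /// ```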
    #[inline(always)]
    pub fn pow(&mut self, base: u32, exp: u32) -> Option<()> {
        let (odd, shift) = split_radix(base);
        if odd != 0 {
            pow::<BIGFLOAT_LIMBS>(&mut self.data, odd, exp)?;
        }
        if shift != 0 {
            self.exp += (exp * shift) as i32;
        }
        Some(())
    }

    /// Shift-left the entire buffer `n` bits, where `n` is less than the
    /// limb bit size.
    #[inline(always)]
    pub fn shl_bits(&mut self, n: usize) -> Option<()> {
        shl_bits(&mut self.data, n)
    }

    /// Shift-left the entire buffer `n` limbs.
    #[inline(always)]
    pub fn shl_limbs(&mut self, n: usize) -> Option<()> {
        shl_limbs(&mut self.data, n)
    }

    /// Shift-left the entire buffer `n` bits.
    #[inline(always)]
    pub fn shl(&mut self, n: usize) -> Option<()> {
        shl(&mut self.data, n)
    }

    /// Get number of leading zero bits in the storage.
    /// Assumes the value is normalized.
    #[inline(always)]
    pub fn leading_zeros(&self) -> u32 {
        leading_zeros(&self.data)
    }
}

#[cfg(feature = "radix")]
impl ops::MulAssign<&Bigfloat> for Bigfloat {
    #[inline(always)]
    #[allow(clippy::suspicious_op_assign_impl)] // reason="intended increment"
    #[allow(clippy::unwrap_used)] // reason="exceeding the bounds is a developer error"
    fn mul_assign(&mut self, rhs: &Bigfloat) {
        large_mul(&mut self.data, &rhs.data).unwrap();
        self.exp += rhs.exp;
    }
}

#[cfg(feature = "radix")]
impl Default for Bigfloat {
    fn default() -> Self {
        Self::new()
    }
}

// VEC
// ---

/// Simple stack vector implementation.
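///
/// A rough sketch of the core API (illustrative, not a doc-test):
///
/// ```rust,ignore
/// let mut vec = StackVec::<4>::new();
/// assert_eq!(vec.capacity(), 4);
/// vec.try_push(1).unwrap();
/// vec.try_extend(&[2, 3]).unwrap();
/// assert_eq!(vec.len(), 3);
/// assert_eq!(vec.pop(), Some(3));
/// ```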
#[derive(Clone)]
pub struct StackVec<const SIZE: usize> {
    /// The raw buffer for the elements.
    data: [mem::MaybeUninit<Limb>; SIZE],
    /// The number of elements in the array (we never need more than
    /// `u16::MAX`).
    length: u16,
}

/// Extract the hi bits from the buffer.
///
/// NOTE: Modifying this to remove the unsafety, which we statically
/// check directly in every caller, leads to a ~20% degradation in
/// performance.
/// - `rview`   - A reversed view over a slice.
/// - `fn`      - The callback to extract the high bits.
macro_rules! hi {
    (@1 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        $fn(unsafe { index_unchecked!($rview[0]) as $t })
    }};

    // # Safety
    //
    // Safe as long as `stackvec.len() >= 2`.
    (@2 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        let r0 = unsafe { index_unchecked!($rview[0]) as $t };
        let r1 = unsafe { index_unchecked!($rview[1]) as $t };
        $fn(r0, r1)
    }};

    // # Safety
    //
    // Safe as long as `stackvec.len() >= 2`.
    (@nonzero2 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        let (v, n) = hi!(@2 $self, $rview, $t, $fn);
        (v, n || unsafe { nonzero($self, 2) })
    }};

    // # Safety
    //
    // Safe as long as `stackvec.len() >= 3`.
    (@3 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        let r0 = unsafe { index_unchecked!($rview[0]) as $t };
        let r1 = unsafe { index_unchecked!($rview[1]) as $t };
        let r2 = unsafe { index_unchecked!($rview[2]) as $t };
        $fn(r0, r1, r2)
    }};

    // # Safety
    //
    // Safe as long as `stackvec.len() >= 3`.
    (@nonzero3 $self:ident, $rview:ident, $t:ident, $fn:ident) => {{
        let (v, n) = hi!(@3 $self, $rview, $t, $fn);
        (v, n || unsafe { nonzero($self, 3) })
    }};
}

impl<const SIZE: usize> StackVec<SIZE> {
    /// Construct an empty vector.
    #[must_use]
    #[inline(always)]
    pub const fn new() -> Self {
        Self {
            length: 0,
            data: [mem::MaybeUninit::uninit(); SIZE],
        }
    }

    /// Get a mutable ptr to the current start of the big integer.
    #[must_use]
    #[inline(always)]
    pub fn as_mut_ptr(&mut self) -> *mut Limb {
        self.data.as_mut_ptr().cast::<Limb>()
    }

    /// Get a ptr to the current start of the big integer.
    #[must_use]
    #[inline(always)]
    pub fn as_ptr(&self) -> *const Limb {
        self.data.as_ptr().cast::<Limb>()
    }

    /// Construct a vector from an existing slice.
    #[must_use]
    #[inline(always)]
    pub fn try_from(x: &[Limb]) -> Option<Self> {
        let mut vec = Self::new();
        vec.try_extend(x)?;
        Some(vec)
    }

    /// Sets the length of a vector.
    ///
    /// This will explicitly set the size of the vector, without actually
    /// modifying its buffers, so it is up to the caller to ensure that the
    /// vector is actually the specified size.
    ///
    /// # Safety
    ///
    /// Safe as long as `len` is less than `SIZE`.
    #[inline(always)]
    pub unsafe fn set_len(&mut self, len: usize) {
        debug_assert!(len <= u16::MAX as usize, "indexing must fit in 16 bits");
        debug_assert!(len <= SIZE, "cannot exceed our array bounds");
        self.length = len as u16;
    }

    /// Get the number of elements stored in the vector.
    #[must_use]
    #[inline(always)]
    pub const fn len(&self) -> usize {
        self.length as usize
    }

    /// If the vector is empty.
    #[must_use]
    #[inline(always)]
    pub const fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// The number of items the vector can hold.
    #[must_use]
    #[inline(always)]
    pub const fn capacity(&self) -> usize {
        SIZE
    }

    /// Append an item to the vector, without bounds checking.
    ///
    /// # Safety
    ///
    /// Safe if `self.len() < self.capacity()`.
    #[inline(always)]
    unsafe fn push_unchecked(&mut self, value: Limb) {
        debug_assert!(self.len() < self.capacity(), "cannot exceed our array bounds");
        // SAFETY: safe, since the length is less than the capacity.
        unsafe {
            let len = self.len();
            let ptr = self.as_mut_ptr().add(len);
            ptr.write(value);
            self.length += 1;
        }
    }

    /// Append an item to the vector.
    #[inline(always)]
    pub fn try_push(&mut self, value: Limb) -> Option<()> {
        if self.len() < self.capacity() {
            // SAFETY: safe, since the length is less than the capacity.
            unsafe { self.push_unchecked(value) };
            Some(())
        } else {
            None
        }
    }

    /// Remove an item from the end of a vector, without bounds checking.
    ///
    /// # Safety
    ///
    /// Safe if `self.len() > 0`.
    #[inline(always)]
    unsafe fn pop_unchecked(&mut self) -> Limb {
        debug_assert!(!self.is_empty(), "cannot pop a value if none exists");
        self.length -= 1;
        // SAFETY: safe if `self.length > 0`.
        // We have a trivial drop and copy, so this is safe.
        unsafe { ptr::read(self.as_mut_ptr().add(self.len())) }
    }

    /// Remove an item from the end of the vector and return it, or None if
    /// empty.
    #[inline(always)]
    pub fn pop(&mut self) -> Option<Limb> {
        if self.is_empty() {
            None
        } else {
            // SAFETY: safe, since `self.len() > 0`.
            unsafe { Some(self.pop_unchecked()) }
        }
    }

    /// Add items from a slice to the vector, without bounds checking.
    ///
    /// # Safety
    ///
    /// Safe if `self.len() + slc.len() <= self.capacity()`.
    #[inline(always)]
    unsafe fn extend_unchecked(&mut self, slc: &[Limb]) {
        let index = self.len();
        let new_len = index + slc.len();
        debug_assert!(self.len() + slc.len() <= self.capacity(), "cannot exceed our array bounds");
        let src = slc.as_ptr();
        // SAFETY: safe if `self.len() + slc.len() <= self.capacity()`.
        unsafe {
            let dst = self.as_mut_ptr().add(index);
            ptr::copy_nonoverlapping(src, dst, slc.len());
            self.set_len(new_len);
        }
    }

    /// Copy elements from a slice and append them to the vector.
    #[inline(always)]
    pub fn try_extend(&mut self, slc: &[Limb]) -> Option<()> {
        if self.len() + slc.len() <= self.capacity() {
            // SAFETY: safe, since `self.len() + slc.len() <= self.capacity()`.
            unsafe { self.extend_unchecked(slc) };
            Some(())
        } else {
            None
        }
    }

    /// Truncate vector to new length, dropping any items after `len`.
    ///
    /// # Safety
    ///
    /// Safe as long as `len <= self.capacity()`.
    unsafe fn truncate_unchecked(&mut self, len: usize) {
        debug_assert!(len <= self.capacity(), "cannot exceed our array bounds");
        self.length = len as u16;
    }

    /// Resize the buffer, without bounds checking.
    ///
    /// # Safety
    ///
    /// Safe as long as `len <= self.capacity()`.
    #[inline(always)]
    pub unsafe fn resize_unchecked(&mut self, len: usize, value: Limb) {
        debug_assert!(len <= self.capacity(), "cannot exceed our array bounds");
        let old_len = self.len();
        if len > old_len {
            // We have a trivial drop, so there's no worry here.
            // Just, don't set the length until all values have been written,
            // so we don't accidentally read uninitialized memory.

            let count = len - old_len;
            for index in 0..count {
                // SAFETY: safe if `len <= self.capacity()`.
                unsafe {
                    let dst = self.as_mut_ptr().add(old_len + index);
                    ptr::write(dst, value);
                }
            }
            self.length = len as u16;
        } else {
            // SAFETY: safe, since `len <= self.len()`.
            unsafe { self.truncate_unchecked(len) };
        }
    }

    /// Try to resize the buffer.
    ///
    /// If the new length is smaller than the current length, truncate
    /// the input. If it's larger, then append elements to the buffer.
    #[inline(always)]
    pub fn try_resize(&mut self, len: usize, value: Limb) -> Option<()> {
        if len > self.capacity() {
            None
        } else {
            // SAFETY: safe, since `len <= self.capacity()`.
            unsafe { self.resize_unchecked(len, value) };
            Some(())
        }
    }

    // HI

    /// Get the high 16 bits from the vector.
    #[inline(always)]
    pub fn hi16(&self) -> (u16, bool) {
        let rview = self.rview();
        // SAFETY: the buffer must have at least `rview.len()` limbs, which
        // we check in the match.
        unsafe {
            match rview.len() {
                0 => (0, false),
                1 if Limb::BITS == 32 => hi!(@1 self, rview, u32, u32_to_hi16_1),
                1 => hi!(@1 self, rview, u64, u64_to_hi16_1),
                _ if Limb::BITS == 32 => hi!(@nonzero2 self, rview, u32, u32_to_hi16_2),
                _ => hi!(@nonzero2 self, rview, u64, u64_to_hi16_2),
            }
        }
    }

    /// Get the high 32 bits from the vector.
    #[inline(always)]
    pub fn hi32(&self) -> (u32, bool) {
        let rview = self.rview();
        // SAFETY: the buffer must have at least `rview.len()` limbs, which
        // we check in the match.
        unsafe {
            match rview.len() {
                0 => (0, false),
                1 if Limb::BITS == 32 => hi!(@1 self, rview, u32, u32_to_hi32_1),
                1 => hi!(@1 self, rview, u64, u64_to_hi32_1),
                _ if Limb::BITS == 32 => hi!(@nonzero2 self, rview, u32, u32_to_hi32_2),
                _ => hi!(@nonzero2 self, rview, u64, u64_to_hi32_2),
            }
        }
    }

    /// Get the high 64 bits from the vector.
    #[inline(always)]
    pub fn hi64(&self) -> (u64, bool) {
        let rview = self.rview();
        // SAFETY: the buffer must have at least `rview.len()` limbs, which
        // we check in the match.
        unsafe {
            match rview.len() {
                0 => (0, false),
                1 if Limb::BITS == 32 => hi!(@1 self, rview, u32, u32_to_hi64_1),
                1 => hi!(@1 self, rview, u64, u64_to_hi64_1),
                2 if Limb::BITS == 32 => hi!(@2 self, rview, u32, u32_to_hi64_2),
                2 => hi!(@2 self, rview, u64, u64_to_hi64_2),
                _ if Limb::BITS == 32 => hi!(@nonzero3 self, rview, u32, u32_to_hi64_3),
                _ => hi!(@nonzero2 self, rview, u64, u64_to_hi64_2),
            }
        }
    }

    // FROM

    /// Create `StackVec` from u16 value.
    #[must_use]
    #[inline(always)]
    pub fn from_u16(x: u16) -> Self {
        let mut vec = Self::new();
        assert!(1 <= vec.capacity(), "cannot exceed our array bounds");
        _ = vec.try_push(x as Limb);
        vec.normalize();
        vec
    }

    /// Create `StackVec` from u32 value.
    #[must_use]
    #[inline(always)]
    pub fn from_u32(x: u32) -> Self {
        let mut vec = Self::new();
        debug_assert!(1 <= vec.capacity(), "cannot exceed our array bounds");
        assert!(1 <= SIZE, "cannot exceed our array bounds");
        _ = vec.try_push(x as Limb);
        vec.normalize();
        vec
    }

    /// Create `StackVec` from u64 value.
    #[must_use]
    #[inline(always)]
    pub fn from_u64(x: u64) -> Self {
        let mut vec = Self::new();
        debug_assert!(2 <= vec.capacity(), "cannot exceed our array bounds");
        assert!(2 <= SIZE, "cannot exceed our array bounds");
        if Limb::BITS == 32 {
            _ = vec.try_push(x as Limb);
            _ = vec.try_push((x >> 32) as Limb);
        } else {
            _ = vec.try_push(x as Limb);
        }
        vec.normalize();
        vec
    }

    // INDEX

    /// Create a reverse view of the vector for indexing.
    #[must_use]
    #[inline(always)]
    pub fn rview(&self) -> ReverseView<Limb> {
        ReverseView {
            inner: self,
        }
    }

    // MATH

    /// Normalize the integer, so any leading zero values are removed.
    #[inline(always)]
    pub fn normalize(&mut self) {
        // We don't care if this wraps: the index is bounds-checked.
        while let Some(&value) = self.get(self.len().wrapping_sub(1)) {
            if value == 0 {
                self.length -= 1;
            } else {
                break;
            }
        }
    }

    /// Get if the big integer is normalized.
    #[must_use]
    #[inline(always)]
    pub fn is_normalized(&self) -> bool {
        // We don't care if this wraps: the index is bounds-checked.
        self.get(self.len().wrapping_sub(1)) != Some(&0)
    }

    /// Calculate the fast quotient for a single limb-bit quotient.
    ///
    /// This requires a non-normalized divisor, where there are at least
    /// `integral_binary_factor` 0 bits set, to ensure at most a single
    /// digit will be produced for a single base.
    ///
    /// Warning: This is not a general-purpose division algorithm,
    /// it is highly specialized for peeling off singular digits.
    #[inline(always)]
    #[cfg(feature = "radix")]
    pub fn quorem(&mut self, y: &Self) -> Limb {
        large_quorem(self, y)
    }

    /// `AddAssign` small integer.
    #[inline(always)]
    pub fn add_small(&mut self, y: Limb) -> Option<()> {
        small_add(self, y)
    }

    /// `MulAssign` small integer.
    #[inline(always)]
    pub fn mul_small(&mut self, y: Limb) -> Option<()> {
        small_mul(self, y)
    }
}

impl<const SIZE: usize> PartialEq for StackVec<SIZE> {
    #[inline(always)]
    #[allow(clippy::op_ref)] // reason="need to convert to slice for equality"
    fn eq(&self, other: &Self) -> bool {
        use core::ops::Deref;
        self.len() == other.len() && self.deref() == other.deref()
    }
}

impl<const SIZE: usize> Eq for StackVec<SIZE> {
}

impl<const SIZE: usize> cmp::PartialOrd for StackVec<SIZE> {
    #[inline(always)]
    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl<const SIZE: usize> cmp::Ord for StackVec<SIZE> {
    #[inline(always)]
    fn cmp(&self, other: &Self) -> cmp::Ordering {
        compare(self, other)
    }
}

impl<const SIZE: usize> ops::Deref for StackVec<SIZE> {
    type Target = [Limb];
    #[inline(always)]
    fn deref(&self) -> &[Limb] {
        debug_assert!(self.len() <= self.capacity(), "cannot exceed our array bounds");
        // SAFETY: safe since `self.data[..self.len()]` must be initialized
        // and `self.len() <= self.capacity()`.
        unsafe {
            let ptr = self.data.as_ptr() as *const Limb;
            slice::from_raw_parts(ptr, self.len())
        }
    }
}

impl<const SIZE: usize> ops::DerefMut for StackVec<SIZE> {
    #[inline(always)]
    fn deref_mut(&mut self) -> &mut [Limb] {
        debug_assert!(self.len() <= self.capacity(), "cannot exceed our array bounds");
        // SAFETY: safe since `self.data[..self.len()]` must be initialized
        // and `self.len() <= self.capacity()`.
        unsafe {
            let ptr = self.data.as_mut_ptr() as *mut Limb;
            slice::from_raw_parts_mut(ptr, self.len())
        }
    }
}

impl<const SIZE: usize> ops::MulAssign<&[Limb]> for StackVec<SIZE> {
    #[inline(always)]
    #[allow(clippy::unwrap_used)] // reason="exceeding the bounds is a developer error"
    fn mul_assign(&mut self, rhs: &[Limb]) {
        large_mul(self, rhs).unwrap();
    }
}

impl<const SIZE: usize> Default for StackVec<SIZE> {
    fn default() -> Self {
        Self::new()
    }
}

// REVERSE VIEW

/// Reverse, immutable view of a sequence.
pub struct ReverseView<'a, T: 'a> {
    inner: &'a [T],
}

impl<'a, T: 'a> ReverseView<'a, T> {
    /// Get a reference to a value, without bounds checking.
    ///
    /// # Safety
    ///
    /// Safe if forward indexing would be safe for the type,
    /// or `index < self.inner.len()`.
    #[inline(always)]
    pub unsafe fn get_unchecked(&self, index: usize) -> &T {
        debug_assert!(index < self.inner.len(), "cannot exceed our array bounds");
        let len = self.inner.len();
        // SAFETY: safe as long as `index < len`, so `len - index - 1` is in
        // `[0, len)`.
        unsafe { self.inner.get_unchecked(len - index - 1) }
    }

    /// Get a reference to a value.
    #[inline(always)]
    pub fn get(&self, index: usize) -> Option<&T> {
        let len = self.inner.len();
        // We don't care if this wraps: the index is bounds-checked.
        self.inner.get(len.wrapping_sub(index + 1))
    }

    /// Get the length of the inner buffer.
    #[inline(always)]
    pub const fn len(&self) -> usize {
        self.inner.len()
    }

    /// If the vector is empty.
    #[inline(always)]
    pub const fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }
}

impl<T> ops::Index<usize> for ReverseView<'_, T> {
    type Output = T;

    #[inline(always)]
    fn index(&self, index: usize) -> &T {
        let len = self.inner.len();
        &(*self.inner)[len - index - 1]
    }
}

// HI
// --

/// Check if any of the remaining bits are non-zero.
///
/// # Safety
///
/// Safe as long as `rindex <= x.len()`. This is only called with an
/// `rindex` that comes directly from the caller, and replacing the
/// unchecked indexing with safe indexing leads to a ~20% degradation
/// in performance.
#[must_use]
#[inline(always)]
pub unsafe fn nonzero(x: &[Limb], rindex: usize) -> bool {
    debug_assert!(rindex <= x.len(), "cannot exceed our array bounds");
    let len = x.len();
    // SAFETY: safe if `rindex <= x.len()`, since then `len - rindex <= x.len()`.
    let slc = unsafe { &index_unchecked!(x[..len - rindex]) };
    slc.iter().rev().any(|&x| x != 0)
}

// These return the high X bits and if the bits were truncated.

/// Shift 32-bit integer to high 16-bits.
#[must_use]
#[inline(always)]
pub const fn u32_to_hi16_1(r0: u32) -> (u16, bool) {
    let r0 = u32_to_hi32_1(r0).0;
    ((r0 >> 16) as u16, r0 as u16 != 0)
}

/// Shift 2 32-bit integers to high 16-bits.
#[must_use]
#[inline(always)]
pub const fn u32_to_hi16_2(r0: u32, r1: u32) -> (u16, bool) {
    let (r0, n) = u32_to_hi32_2(r0, r1);
    ((r0 >> 16) as u16, n || r0 as u16 != 0)
}

/// Shift 32-bit integer to high 32-bits.
#[must_use]
#[inline(always)]
pub const fn u32_to_hi32_1(r0: u32) -> (u32, bool) {
    let ls = r0.leading_zeros();
    (r0 << ls, false)
}

/// Shift 2 32-bit integers to high 32-bits.
#[must_use]
#[inline(always)]
pub const fn u32_to_hi32_2(r0: u32, r1: u32) -> (u32, bool) {
    let ls = r0.leading_zeros();
    let rs = 32 - ls;
    let v = match ls {
        0 => r0,
        _ => (r0 << ls) | (r1 >> rs),
    };
    let n = r1 << ls != 0;
    (v, n)
}

/// Shift 32-bit integer to high 64-bits.
#[must_use]
#[inline(always)]
pub const fn u32_to_hi64_1(r0: u32) -> (u64, bool) {
    u64_to_hi64_1(r0 as u64)
}

/// Shift 2 32-bit integers to high 64-bits.
#[must_use]
#[inline(always)]
pub const fn u32_to_hi64_2(r0: u32, r1: u32) -> (u64, bool) {
    let r0 = (r0 as u64) << 32;
    let r1 = r1 as u64;
    u64_to_hi64_1(r0 | r1)
}

/// Shift 3 32-bit integers to high 64-bits.
#[must_use]
#[inline(always)]
pub const fn u32_to_hi64_3(r0: u32, r1: u32, r2: u32) -> (u64, bool) {
    let r0 = r0 as u64;
    let r1 = (r1 as u64) << 32;
    let r2 = r2 as u64;
    u64_to_hi64_2(r0, r1 | r2)
}

/// Shift 64-bit integer to high 16-bits.
#[must_use]
#[inline(always)]
pub const fn u64_to_hi16_1(r0: u64) -> (u16, bool) {
    let r0 = u64_to_hi64_1(r0).0;
    ((r0 >> 48) as u16, r0 as u16 != 0)
}

/// Shift 2 64-bit integers to high 16-bits.
#[must_use]
#[inline(always)]
pub const fn u64_to_hi16_2(r0: u64, r1: u64) -> (u16, bool) {
    let (r0, n) = u64_to_hi64_2(r0, r1);
    ((r0 >> 48) as u16, n || r0 as u16 != 0)
}

/// Shift 64-bit integer to high 32-bits.
#[must_use]
#[inline(always)]
pub const fn u64_to_hi32_1(r0: u64) -> (u32, bool) {
    let r0 = u64_to_hi64_1(r0).0;
    ((r0 >> 32) as u32, r0 as u32 != 0)
}

/// Shift 2 64-bit integers to high 32-bits.
#[must_use]
#[inline(always)]
pub const fn u64_to_hi32_2(r0: u64, r1: u64) -> (u32, bool) {
    let (r0, n) = u64_to_hi64_2(r0, r1);
    ((r0 >> 32) as u32, n || r0 as u32 != 0)
}

/// Shift 64-bit integer to high 64-bits.
#[must_use]
#[inline(always)]
pub const fn u64_to_hi64_1(r0: u64) -> (u64, bool) {
    let ls = r0.leading_zeros();
    (r0 << ls, false)
}

/// Shift 2 64-bit integers to high 64-bits.
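///
/// For example (illustrative, not a doc-test): `r0` is normalized to the
/// high bits, and the top bits of `r1` are pulled in after it.
///
/// ```rust,ignore
/// // ls == 63, so the value is (1 << 63) | (3 >> 1), and the bit of r1
/// // shifted out (3 << 63 != 0) marks the result as truncated.
/// assert_eq!(u64_to_hi64_2(1, 3), ((1 << 63) | 1, true));
/// ```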
#[must_use]
#[inline(always)]
pub const fn u64_to_hi64_2(r0: u64, r1: u64) -> (u64, bool) {
    let ls = r0.leading_zeros();
    let rs = 64 - ls;
    let v = match ls {
        0 => r0,
        _ => (r0 << ls) | (r1 >> rs),
    };
    let n = r1 << ls != 0;
    (v, n)
}

// POWERS
// ------

/// MulAssign by a power.
///
/// Theoretically...
///
/// Use an exponentiation by squaring method, since it reduces the time
/// complexity of the multiplication to ~`O(log(n))` for the squaring,
/// and `O(n*m)` for the result. Since `m` is typically a lower-order
/// factor, this significantly reduces the number of multiplications
/// we need to do. Iteratively multiplying by small powers follows
/// the nth triangular number series, which scales as `O(p^2)`, but
/// where `p` is `n+m`. In short, it scales very poorly.
///
/// Practically...
///
/// Exponentiation by Squaring:
///     running 2 tests
///     test bigcomp_f32_lexical ... bench:       1,018 ns/iter (+/- 78)
///     test bigcomp_f64_lexical ... bench:       3,639 ns/iter (+/- 1,007)
///
/// Exponentiation by Iterative Small Powers:
///     running 2 tests
///     test bigcomp_f32_lexical ... bench:         518 ns/iter (+/- 31)
///     test bigcomp_f64_lexical ... bench:         583 ns/iter (+/- 47)
///
/// Exponentiation by Iterative Large Powers (of 2):
///     running 2 tests
///     test bigcomp_f32_lexical ... bench:         671 ns/iter (+/- 31)
///     test bigcomp_f64_lexical ... bench:       1,394 ns/iter (+/- 47)
///
/// The following benchmarks were run on `1 * 5^300`, using native `pow`,
/// a version with only small powers, and one with pre-computed powers
/// of `5^(3 * max_exp)`, rather than `5^(5 * max_exp)`.
///
/// However, using large powers is crucial for good performance for higher
/// powers.
///     pow/default             time:   [426.20 ns 427.96 ns 429.89 ns]
///     pow/small               time:   [2.9270 us 2.9411 us 2.9565 us]
///     pow/large:3             time:   [838.51 ns 842.21 ns 846.27 ns]
///
/// Even using worst-case scenarios, exponentiation by squaring is
/// significantly slower for our workloads. Just multiply by small powers,
/// in simple cases, and use pre-calculated large powers in other cases.
///
/// Furthermore, using sufficiently big large powers is also crucial for
/// performance. This is a trade-off of binary size and performance, and
/// using a single value at ~`5^(5 * max_exp)` seems optimal.
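///
/// A rough usage sketch (illustrative, not a doc-test), assuming 64-bit
/// limbs:
///
/// ```rust,ignore
/// let mut x = StackVec::<BIGINT_LIMBS>::from_u64(1);
/// pow(&mut x, 5, 20).unwrap();
/// assert_eq!(&*x, &[95_367_431_640_625]); // 5^20 fits in one limb
/// ```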
#[allow(clippy::doc_markdown)] // reason="not attempted to be referencing items"
#[allow(clippy::missing_inline_in_public_items)] // reason="only public for testing"
pub fn pow<const SIZE: usize>(x: &mut StackVec<SIZE>, base: u32, mut exp: u32) -> Option<()> {
    // Minimize the number of iterations for large exponents: just
    // do a few steps with large powers.
    #[cfg(not(feature = "compact"))]
    {
        let (large, step) = get_large_int_power(base);
        while exp >= step {
            large_mul(x, large)?;
            exp -= step;
        }
    }

    // Now use our pre-computed small powers iteratively.
    let small_step = if Limb::BITS == 32 {
        u32_power_limit(base)
    } else {
        u64_power_limit(base)
    };
    let max_native = (base as Limb).pow(small_step);
    while exp >= small_step {
        small_mul(x, max_native)?;
        exp -= small_step;
    }
    if exp != 0 {
        let small_power = f64::int_pow_fast_path(exp as usize, base);
        small_mul(x, small_power as Limb)?;
    }
    Some(())
}

// SCALAR
// ------

/// Add two small integers and return the resulting value and whether
/// overflow happened.
#[must_use]
#[inline(always)]
pub const fn scalar_add(x: Limb, y: Limb) -> (Limb, bool) {
    x.overflowing_add(y)
}

/// Multiply two small integers (with carry), returning the overflow
/// contribution.
///
/// Returns the (low, high) components.
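///
/// For example (illustrative, not a doc-test), with 64-bit limbs:
///
/// ```rust,ignore
/// // (2^64 - 1)^2 == 2^128 - 2^65 + 1, so the low limb is 1 and the
/// // high limb is 2^64 - 2.
/// assert_eq!(scalar_mul(u64::MAX, u64::MAX, 0), (1, u64::MAX - 1));
/// ```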
#[must_use]
#[inline(always)]
pub const fn scalar_mul(x: Limb, y: Limb, carry: Limb) -> (Limb, Limb) {
    // Cannot overflow, as long as `Wide` is twice the width of `Limb`.
    // This is because the following is always true:
    // `Wide::MAX - (Narrow::MAX * Narrow::MAX) >= Narrow::MAX`
    let z: Wide = (x as Wide) * (y as Wide) + (carry as Wide);
    (z as Limb, (z >> Limb::BITS) as Limb)
}

// SMALL
// -----

/// Add small integer to bigint starting from offset.
#[inline(always)]
pub fn small_add_from<const SIZE: usize>(
    x: &mut StackVec<SIZE>,
    y: Limb,
    start: usize,
) -> Option<()> {
    let mut index = start;
    let mut carry = y;
    while carry != 0 && index < x.len() {
        // NOTE: No unsafety needed, since the compiler will optimize
        // out the bounds checks.
        let result = scalar_add(x[index], carry);
        x[index] = result.0;
        carry = result.1 as Limb;
        index += 1;
    }
    // If we carried past all the elements, add to the end of the buffer.
    if carry != 0 {
        x.try_push(carry)?;
    }
    Some(())
}

/// Add small integer to bigint.
#[inline(always)]
pub fn small_add<const SIZE: usize>(x: &mut StackVec<SIZE>, y: Limb) -> Option<()> {
    small_add_from(x, y, 0)
}

/// Multiply bigint by small integer.
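///
/// For example (illustrative, not a doc-test), with 64-bit limbs:
///
/// ```rust,ignore
/// let mut x = StackVec::<4>::from_u64(u64::MAX);
/// small_mul(&mut x, 2).unwrap();
/// // (2^64 - 1) * 2 == 2^65 - 2, in little-endian limbs.
/// assert_eq!(&*x, &[u64::MAX - 1, 1]);
/// ```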
#[inline(always)]
pub fn small_mul<const SIZE: usize>(x: &mut StackVec<SIZE>, y: Limb) -> Option<()> {
    let mut carry = 0;
    for xi in x.iter_mut() {
        let result = scalar_mul(*xi, y, carry);
        *xi = result.0;
        carry = result.1;
    }
    // If we carried past all the elements, add to the end of the buffer.
    if carry != 0 {
        x.try_push(carry)?;
    }
    Some(())
}

// LARGE
// -----

/// Add bigint to bigint starting from offset.
#[allow(clippy::missing_inline_in_public_items)] // reason="only public for testing"
pub fn large_add_from<const SIZE: usize>(
    x: &mut StackVec<SIZE>,
    y: &[Limb],
    start: usize,
) -> Option<()> {
    // The effective `x` buffer is from `start..x.len()`, so we need to treat
    // that as the current range. If the effective `y` buffer is longer, we
    // need to resize `x` to `y`'s length, plus the start index.
    if y.len() > x.len().saturating_sub(start) {
        // Return `None` if we can't extend the buffer: this avoids any
        // out-of-bounds accesses afterwards.
        x.try_resize(y.len() + start, 0)?;
    }

    // Iteratively add elements from `y` to `x`.
    let mut carry = false;
    for index in 0..y.len() {
        let xi = &mut x[start + index];
        let yi = y[index];

        // Only one of the two ops can overflow, since we add at most
        // `Limb::max_value() + Limb::max_value()`. Add the previous carry,
        // and store the current carry for the next.
        let result = scalar_add(*xi, yi);
        *xi = result.0;
        let mut tmp = result.1;
        if carry {
            let result = scalar_add(*xi, 1);
            *xi = result.0;
            tmp |= result.1;
        }
        carry = tmp;
    }

    // Handle overflow.
    if carry {
        small_add_from(x, 1, y.len() + start)?;
    }
    Some(())
}

/// Add bigint to bigint.
#[inline(always)]
pub fn large_add<const SIZE: usize>(x: &mut StackVec<SIZE>, y: &[Limb]) -> Option<()> {
    large_add_from(x, y, 0)
}

/// Grade-school multiplication algorithm.
///
/// Slow, naive algorithm, using limb-bit bases and just shifting left for
/// each iteration. This could be optimized with numerous other algorithms,
/// but it's extremely simple, and works in O(n*m) time, which is fine
/// by me. Each iteration, of which there are `m` iterations, requires
/// `n` multiplications, and `n` additions, or grade-school multiplication.
///
/// Don't use Karatsuba multiplication, since our implementation seems to
/// be slower asymptotically, which is likely just due to the small sizes
/// we deal with here. For example, running on the following data:
///
/// ```text
/// const SMALL_X: &[u32] = &[
///     766857581, 3588187092, 1583923090, 2204542082, 1564708913, 2695310100, 3676050286,
///     1022770393, 468044626, 446028186
/// ];
/// const SMALL_Y: &[u32] = &[
///     3945492125, 3250752032, 1282554898, 1708742809, 1131807209, 3171663979, 1353276095,
///     1678845844, 2373924447, 3640713171
/// ];
/// const LARGE_X: &[u32] = &[
///     3647536243, 2836434412, 2154401029, 1297917894, 137240595, 790694805, 2260404854,
///     3872698172, 690585094, 99641546, 3510774932, 1672049983, 2313458559, 2017623719,
///     638180197, 1140936565, 1787190494, 1797420655, 14113450, 2350476485, 3052941684,
///     1993594787, 2901001571, 4156930025, 1248016552, 848099908, 2660577483, 4030871206,
///     692169593, 2835966319, 1781364505, 4266390061, 1813581655, 4210899844, 2137005290,
///     2346701569, 3715571980, 3386325356, 1251725092, 2267270902, 474686922, 2712200426,
///     197581715, 3087636290, 1379224439, 1258285015, 3230794403, 2759309199, 1494932094,
///     326310242
/// ];
/// const LARGE_Y: &[u32] = &[
///     1574249566, 868970575, 76716509, 3198027972, 1541766986, 1095120699, 3891610505,
///     2322545818, 1677345138, 865101357, 2650232883, 2831881215, 3985005565, 2294283760,
///     3468161605, 393539559, 3665153349, 1494067812, 106699483, 2596454134, 797235106,
///     705031740, 1209732933, 2732145769, 4122429072, 141002534, 790195010, 4014829800,
///     1303930792, 3649568494, 308065964, 1233648836, 2807326116, 79326486, 1262500691,
///     621809229, 2258109428, 3819258501, 171115668, 1139491184, 2979680603, 1333372297,
///     1657496603, 2790845317, 4090236532, 4220374789, 601876604, 1828177209, 2372228171,
///     2247372529
/// ];
/// ```
///
/// We get the following results:
///
/// ```text
/// mul/small:long          time:   [220.23 ns 221.47 ns 222.81 ns]
/// Found 4 outliers among 100 measurements (4.00%)
///   2 (2.00%) high mild
///   2 (2.00%) high severe
/// mul/small:karatsuba     time:   [233.88 ns 234.63 ns 235.44 ns]
/// Found 11 outliers among 100 measurements (11.00%)
///   8 (8.00%) high mild
///   3 (3.00%) high severe
/// mul/large:long          time:   [1.9365 us 1.9455 us 1.9558 us]
/// Found 12 outliers among 100 measurements (12.00%)
///   7 (7.00%) high mild
///   5 (5.00%) high severe
/// mul/large:karatsuba     time:   [4.4250 us 4.4515 us 4.4812 us]
/// ```
///
/// In short, Karatsuba multiplication is never worthwhile for our use-case.
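///
/// A rough usage sketch (illustrative, not a doc-test), with 64-bit limbs:
///
/// ```rust,ignore
/// // (2^64 - 1)^2 == 2^128 - 2^65 + 1, in little-endian limbs.
/// let z = long_mul::<4>(&[u64::MAX], &[u64::MAX]).unwrap();
/// assert_eq!(&*z, &[1, u64::MAX - 1]);
/// ```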
#[must_use]
#[allow(clippy::needless_range_loop)] // reason="required for performance, see benches"
#[allow(clippy::missing_inline_in_public_items)] // reason="only public for testing"
pub fn long_mul<const SIZE: usize>(x: &[Limb], y: &[Limb]) -> Option<StackVec<SIZE>> {
    // Using the immutable value, multiply by all the scalars in y, using
    // the algorithm defined above. Use a single buffer to avoid
    // frequent reallocations. Handle the first element separately to
    // avoid a redundant addition, since we know `y.len() >= 1`.
    let mut z = StackVec::<SIZE>::try_from(x)?;
    if let Some(&y0) = y.first() {
        small_mul(&mut z, y0)?;

        // NOTE: Don't use enumerate/skip since it's slow.
        for index in 1..y.len() {
            let yi = y[index];
            if yi != 0 {
                let mut zi = StackVec::<SIZE>::try_from(x)?;
                small_mul(&mut zi, yi)?;
                large_add_from(&mut z, &zi, index)?;
            }
        }
    }

    z.normalize();
    Some(z)
}

/// Multiply bigint by bigint using grade-school multiplication algorithm.
#[inline(always)]
pub fn large_mul<const SIZE: usize>(x: &mut StackVec<SIZE>, y: &[Limb]) -> Option<()> {
    // Karatsuba multiplication never makes sense, so just use grade school
    // multiplication.
    if y.len() == 1 {
        // SAFETY: safe since `y.len() == 1`.
        // NOTE: The compiler does not seem to optimize this out correctly.
        small_mul(x, unsafe { index_unchecked!(y[0]) })?;
    } else {
        *x = long_mul(y, x)?;
    }
    Some(())
}

/// Emit a single digit for the quotient and store the remainder in-place.
///
/// An extremely efficient division algorithm for small quotients, requiring
/// you to know the full range of the quotient prior to use. For example,
/// with a quotient that can range from [0, 10), you must have 4 leading
/// zeros in the divisor, so we can use a single-limb division to get
/// an accurate estimate of the quotient. Since we always underestimate
/// the quotient, we can add 1 and then emit the digit.
///
/// Requires a non-normalized denominator, with at least [1-6] leading
/// zeros, depending on the base (for example, 1 for base 2, 6 for base 36).
///
/// Adapted from David M. Gay's dtoa, and therefore under an MIT license:
///     www.netlib.org/fp/dtoa.c
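///
/// A worked single-limb example (illustrative, not a doc-test), in base 10:
///
/// ```rust,ignore
/// // 123 / 100: the estimate is 123 / (100 + 1) == 1, the remainder
/// // 123 - 1 * 100 == 23 is stored in-place, and since 23 < 100 the
/// // estimate needs no correction.
/// let mut x = StackVec::<4>::try_from(&[123]).unwrap();
/// assert_eq!(x.quorem(&StackVec::try_from(&[100]).unwrap()), 1);
/// assert_eq!(&*x, &[23]);
/// ```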
#[cfg(feature = "radix")]
#[allow(clippy::many_single_char_names)] // reason = "mathematical names of variables"
pub fn large_quorem<const SIZE: usize>(x: &mut StackVec<SIZE>, y: &[Limb]) -> Limb {
    // If we have an empty divisor, error out early.
    assert!(!y.is_empty(), "large_quorem:: division by zero error.");
    assert!(x.len() <= y.len(), "large_quorem:: oversized numerator.");
    let mask = Limb::MAX as Wide;

    // Numerator is smaller than the denominator, so the quotient is always 0.
    if x.len() < y.len() {
        return 0;
    }

    // Calculate our initial estimate for q.
    let xm_1 = x[x.len() - 1];
    let yn_1 = y[y.len() - 1];
    let mut q = xm_1 / (yn_1 + 1);

    // Need to calculate the remainder if we don't have a 0 quotient.
    if q != 0 {
        let mut borrow: Wide = 0;
        let mut carry: Wide = 0;
        for j in 0..x.len() {
            let yj = y[j] as Wide;
            let p = yj * q as Wide + carry;
            carry = p >> Limb::BITS;
            let xj = x[j] as Wide;
            let t = xj.wrapping_sub(p & mask).wrapping_sub(borrow);
            borrow = (t >> Limb::BITS) & 1;
            x[j] = t as Limb;
        }
        x.normalize();
    }

    // Check if we underestimated the quotient.
    if compare(x, y) != cmp::Ordering::Less {
        q += 1;
        let mut borrow: Wide = 0;
        let mut carry: Wide = 0;
        for j in 0..x.len() {
            let yj = y[j] as Wide;
            let p = yj + carry;
            carry = p >> Limb::BITS;
            let xj = x[j] as Wide;
            let t = xj.wrapping_sub(p & mask).wrapping_sub(borrow);
            borrow = (t >> Limb::BITS) & 1;
            x[j] = t as Limb;
        }
        x.normalize();
    }

    q
}

// COMPARE
// -------

/// Compare `x` to `y`, in little-endian order.
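///
/// For example (illustrative, not a doc-test), since the most-significant
/// limb is last:
///
/// ```rust,ignore
/// // [1, 3] represents 3*B + 1 and [2, 2] represents 2*B + 2, where B
/// // is the limb base, so the former is greater.
/// assert_eq!(compare(&[1, 3], &[2, 2]), cmp::Ordering::Greater);
/// ```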
#[must_use]
#[inline(always)]
pub fn compare(x: &[Limb], y: &[Limb]) -> cmp::Ordering {
    match x.len().cmp(&y.len()) {
        cmp::Ordering::Equal => {
            let iter = x.iter().rev().zip(y.iter().rev());
            for (&xi, yi) in iter {
                match xi.cmp(yi) {
                    cmp::Ordering::Equal => (),
                    ord => return ord,
                }
            }
            // Equal case.
            cmp::Ordering::Equal
        },
        ord => ord,
    }
}

// SHIFT
// -----

/// Shift-left `n` bits inside a buffer.
#[inline(always)]
pub fn shl_bits<const SIZE: usize>(x: &mut StackVec<SIZE>, n: usize) -> Option<()> {
    debug_assert!(n != 0, "cannot shift left by 0 bits");

    // Internally, for each item, we shift left by n, and add the previous
    // right shifted limb-bits.
    // For example, we transform (for u8) shifted left 2, to:
    //      b10100100 b01000010
    //      b10 b10010001 b00001000
    debug_assert!(n < Limb::BITS as usize, "cannot shift left more bits than in our limb");
    let rshift = Limb::BITS as usize - n;
    let lshift = n;
    let mut prev: Limb = 0;
    for xi in x.iter_mut() {
        let tmp = *xi;
        *xi <<= lshift;
        *xi |= prev >> rshift;
        prev = tmp;
    }

    // Always push the carry, even if it creates a non-normal result.
    let carry = prev >> rshift;
    if carry != 0 {
        x.try_push(carry)?;
    }

    Some(())
}

/// Shift-left `n` limbs inside a buffer.
#[inline(always)]
pub fn shl_limbs<const SIZE: usize>(x: &mut StackVec<SIZE>, n: usize) -> Option<()> {
    debug_assert!(n != 0, "cannot shift left by 0 limbs");
    if n + x.len() > x.capacity() {
        None
    } else if !x.is_empty() {
        let len = n + x.len();
        let x_len = x.len();
        let ptr = x.as_mut_ptr();
        let src = ptr;
        // SAFETY: since `x` is not empty, and `x.len() + n <= x.capacity()`.
        unsafe {
            // Move the elements.
            let dst = ptr.add(n);
            ptr::copy(src, dst, x_len);
            // Write our 0s.
            ptr::write_bytes(ptr, 0, n);
            x.set_len(len);
        }
        Some(())
    } else {
        Some(())
    }
}

/// Shift-left the buffer by `n` bits.
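///
/// The shift is split into a bitwise part and a limb-wise part. For
/// example (illustrative, not a doc-test), with 64-bit limbs:
///
/// ```rust,ignore
/// let mut x = StackVec::<4>::from_u64(1);
/// // 70 == 1 * 64 + 6: shift 6 bits, then 1 whole limb.
/// shl(&mut x, 70).unwrap();
/// assert_eq!(&*x, &[0, 64]); // 2^70 == 64 * 2^64
/// ```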
#[must_use]
#[inline(always)]
pub fn shl<const SIZE: usize>(x: &mut StackVec<SIZE>, n: usize) -> Option<()> {
    let rem = n % Limb::BITS as usize;
    let div = n / Limb::BITS as usize;
    if rem != 0 {
        shl_bits(x, rem)?;
    }
    if div != 0 {
        shl_limbs(x, div)?;
    }
    Some(())
}

/// Get number of leading zero bits in the storage.
#[must_use]
#[inline(always)]
pub fn leading_zeros(x: &[Limb]) -> u32 {
    let length = x.len();
    // `wrapping_sub` is fine: on an empty buffer, `get` just returns `None`.
    if let Some(&value) = x.get(length.wrapping_sub(1)) {
        value.leading_zeros()
    } else {
        0
    }
}

/// Calculate the bit-length of the big-integer.
#[must_use]
#[inline(always)]
pub fn bit_length(x: &[Limb]) -> u32 {
    let nlz = leading_zeros(x);
    Limb::BITS * x.len() as u32 - nlz
}

// RADIX
// -----

/// Split the radix into an odd component and a power-of-two shift.
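///
/// The returned pair `(odd, shift)` satisfies `radix == odd << shift`,
/// where an odd component of `0` means the radix is a pure power of two.
/// For example (illustrative, not a doc-test):
///
/// ```rust,ignore
/// assert_eq!(split_radix(10), (5, 1)); // 10 == 5 * 2^1
/// assert_eq!(split_radix(8), (0, 3));  // 8 == 2^3
/// ```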
#[must_use]
#[inline(always)]
#[cfg(feature = "radix")]
pub const fn split_radix(radix: u32) -> (u32, u32) {
    match radix {
        2 => (0, 1),
        3 => (3, 0),
        4 => (0, 2),
        5 => (5, 0),
        6 => (3, 1),
        7 => (7, 0),
        8 => (0, 3),
        9 => (9, 0),
        10 => (5, 1),
        11 => (11, 0),
        12 => (3, 2),
        13 => (13, 0),
        14 => (7, 1),
        15 => (15, 0),
        16 => (0, 4),
        17 => (17, 0),
        18 => (9, 1),
        19 => (19, 0),
        20 => (5, 2),
        21 => (21, 0),
        22 => (11, 1),
        23 => (23, 0),
        24 => (3, 3),
        25 => (25, 0),
        26 => (13, 1),
        27 => (27, 0),
        28 => (7, 2),
        29 => (29, 0),
        30 => (15, 1),
        31 => (31, 0),
        32 => (0, 5),
        33 => (33, 0),
        34 => (17, 1),
        35 => (35, 0),
        36 => (9, 2),
        // Any other radix should be unreachable.
        _ => (0, 0),
    }
}

/// Split the radix into an odd component and a power-of-two shift.
#[must_use]
#[inline(always)]
#[cfg(all(feature = "power-of-two", not(feature = "radix")))]
pub const fn split_radix(radix: u32) -> (u32, u32) {
    match radix {
        // Is also needed for decimal floats, due to `negative_digit_comp`.
        2 => (0, 1),
        4 => (0, 2),
        // Is also needed for decimal floats, due to `negative_digit_comp`.
        5 => (5, 0),
        8 => (0, 3),
        10 => (5, 1),
        16 => (0, 4),
        32 => (0, 5),
        // Any other radix should be unreachable.
        _ => (0, 0),
    }
}

/// Split the radix into an odd component and a power-of-two shift.
#[must_use]
#[inline(always)]
#[cfg(not(feature = "power-of-two"))]
pub const fn split_radix(radix: u32) -> (u32, u32) {
    match radix {
        // Is also needed for decimal floats, due to `negative_digit_comp`.
        2 => (0, 1),
        // Is also needed for decimal floats, due to `negative_digit_comp`.
        5 => (5, 0),
        10 => (5, 1),
        // Any other radix should be unreachable.
        _ => (0, 0),
    }
}

// LIMB
// ----

//  Type for a single limb of the big integer.
//
//  A limb is analogous to a digit in base 10, except, it stores 32-bit
//  or 64-bit numbers instead. We want types where 64-bit multiplication
//  is well-supported by the architecture, rather than emulated in 3
//  instructions. The quickest way to check this support is using a
//  cross-compiler for numerous architectures, along with the following
//  source file and command:
//
//  Compile with `gcc main.c -c -S -O3 -masm=intel`
//
//  And the source code is:
//  ```text
//  #include <stdint.h>
//
//  struct i128 {
//      uint64_t hi;
//      uint64_t lo;
//  };
//
//  // Type your code here, or load an example.
//  struct i128 square(uint64_t x, uint64_t y) {
//      __int128 prod = (__int128)x * (__int128)y;
//      struct i128 z;
//      z.hi = (uint64_t)(prod >> 64);
//      z.lo = (uint64_t)prod;
//      return z;
//  }
//  ```
//
//  If the result contains `call __multi3`, then the multiplication
//  is emulated by the compiler. Otherwise, it's natively supported.
//
//  This should cover all known 64-bit platforms supported by Rust.
//      https://forge.rust-lang.org/platform-support.html
//
//  # Supported
//
//  Platforms where native 128-bit multiplication is explicitly supported:
//      - x86_64 (Supported via `MUL`).
//      - mips64 (Supported via `DMULTU`, which `HI` and `LO` can be read-from).
//      - s390x (Supported via `MLGR`).
//
//  # Efficient
//
//  Platforms where native 64-bit multiplication is supported and
//  you can extract hi-lo for 64-bit multiplications.
//      - aarch64 (Requires `UMULH` and `MUL` to capture high and low bits).
//      - powerpc64 (Requires `MULHDU` and `MULLD` to capture high and low
//        bits).
//      - riscv64 (Requires `MUL` and `MULH` to capture high and low bits).
//
//  # Unsupported
//
//  Platforms where native 128-bit multiplication is not supported,
//  requiring software emulation.
//      sparc64 (`UMUL` only supports double-word arguments).
//      sparcv9 (Same as sparc64).
//
//  These tests are run via `xcross`, my own library for C cross-compiling,
//  which supports numerous targets (far in excess of Rust's tier 1 support,
//  or rust-embedded/cross's list). xcross may be found here:
//      https://github.com/Alexhuszagh/xcross
//
//  To compile for the given target, run:
//      `xcross gcc main.c -c -S -O3 --target $target`
//
//  All 32-bit architectures inherently do not have support. That means
//  we can essentially look for 64-bit architectures that are not SPARC.
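//
//  A rough Rust equivalent of the C probe above (illustrative): on the
//  supported 64-bit targets, this compiles to a single widening multiply,
//  rather than a call to a software routine like `__multi3`.
//
//  ```rust
//  fn widening_mul(x: u64, y: u64) -> (u64, u64) {
//      let prod = (x as u128) * (y as u128);
//      (prod as u64, (prod >> 64) as u64) // (lo, hi)
//  }
//  ```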

#[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))]
pub type Limb = u64;
#[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))]
pub type Wide = u128;
#[cfg(all(target_pointer_width = "64", not(target_arch = "sparc")))]
pub type SignedWide = i128;

#[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))]
pub type Limb = u32;
#[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))]
pub type Wide = u64;
#[cfg(not(all(target_pointer_width = "64", not(target_arch = "sparc"))))]
pub type SignedWide = i64;