mz_row_spine/
lib.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10//! Types and traits in support of containers for row-encoded byte slices.
11//!
12//! This includes the vanilla `bytes_container` that holds byte slices in contiguous
13//! allocations, as well as a `dictionary` encoding wrapper that is able to rewrite
14//! the byte slices to use spare tags in each column to reference common values.
15
16pub use self::dictionary::DatumContainer;
17pub use self::dictionary::DatumSeq;
18pub use self::offset_opt::OffsetOptimized;
19pub use self::spines::{
20    RowBatcher, RowBuilder, RowRowBatcher, RowRowBuilder, RowRowColPagedBuilder, RowRowSpine,
21    RowSpine, RowValBatcher, RowValBuilder, RowValSpine, ValRowBatcher, ValRowBuilder,
22    ValRowColPagedBuilder, ValRowSpine,
23};
24use differential_dataflow::trace::implementations::OffsetList;
25
/// Enable per-column dictionary compression in row containers.
///
/// A single process-wide flag, initialized to `false` (compression disabled).
/// It is a plain `AtomicBool`, so callers toggle it with `store` and read it
/// with `load`.
pub static DICTIONARY_COMPRESSION: std::sync::atomic::AtomicBool =
    std::sync::atomic::AtomicBool::new(false);
29
30/// Spines specialized to contain `Row` types in keys and values.
31mod spines {
32    use std::rc::Rc;
33
34    use columnation::Columnation;
35    use differential_dataflow::trace::implementations::Layout;
36    use differential_dataflow::trace::implementations::Update;
37    use differential_dataflow::trace::implementations::merge_batcher::MergeBatcher;
38    use differential_dataflow::trace::implementations::ord_neu::{OrdKeyBatch, OrdValBatch};
39    use differential_dataflow::trace::implementations::spine_fueled::Spine;
40    use differential_dataflow::trace::rc_blanket_impls::RcBuilder;
41    use mz_repr::Row;
42    use mz_timely_util::columnation::{ColInternalMerger, ColumnationStack};
43
44    use crate::{DatumContainer, OffsetOptimized};
45
    /// Batcher matching `mz_compute::typedefs::KeyValBatcher`, redeclared
    /// locally so this crate does not need to depend on `mz_compute`.
    type KeyValBatcher<K, V, T, D> = MergeBatcher<ColInternalMerger<(K, V), T, D>>;
    /// Key-only specialization of [`KeyValBatcher`]: the value type is fixed to `()`.
    type KeyBatcher<K, T, D> = KeyValBatcher<K, (), T, D>;
50
    /// Spine of reference-counted [`OrdValBatch`]es laid out by [`RowRowLayout`]:
    /// both keys and values are `Row`s.
    pub type RowRowSpine<T, R> = Spine<Rc<OrdValBatch<RowRowLayout<((Row, Row), T, R)>>>>;
    /// Batcher for updates with `Row` keys and `Row` values.
    pub type RowRowBatcher<T, R> = KeyValBatcher<Row, Row, T, R>;
    /// [`RcBuilder`] wrapper around `crate::dictionary::builders::RowRowBuilder`.
    pub type RowRowBuilder<T, R> = RcBuilder<crate::dictionary::builders::RowRowBuilder<T, R>>;

    /// `RowRowBuilder` variant that consumes [`Column`] chunks. Pairs with
    /// [`Col2ValPagedBatcher`] for the spillable arrange path. Installs a
    /// dictionary codec at seal time, gathering statistics from the sealed
    /// `Column` chain, so paged arrangements compress on the same footing as the
    /// columnation-fed [`RowRowBuilder`].
    ///
    /// [`Col2ValPagedBatcher`]: mz_timely_util::columnar::Col2ValPagedBatcher
    /// [`Column`]: mz_timely_util::columnar::Column
    pub type RowRowColPagedBuilder<T, R> =
        RcBuilder<crate::dictionary::builders::RowRowColPagedBuilder<T, R>>;

    /// Spine of reference-counted [`OrdValBatch`]es laid out by [`RowValLayout`]:
    /// `Row` keys with values of type `V`.
    pub type RowValSpine<V, T, R> = Spine<Rc<OrdValBatch<RowValLayout<((Row, V), T, R)>>>>;
    /// Batcher for updates with `Row` keys and values of type `V`.
    pub type RowValBatcher<V, T, R> = KeyValBatcher<Row, V, T, R>;
    /// [`RcBuilder`] wrapper around `crate::dictionary::builders::RowValBuilder`.
    pub type RowValBuilder<V, T, R> =
        RcBuilder<crate::dictionary::builders::RowValBuilder<V, T, R>>;

    /// Spine of reference-counted [`OrdKeyBatch`]es laid out by [`RowLayout`]:
    /// `Row` keys and unit (`()`) values.
    pub type RowSpine<T, R> = Spine<Rc<OrdKeyBatch<RowLayout<((Row, ()), T, R)>>>>;
    /// Batcher for key-only updates with `Row` keys.
    pub type RowBatcher<T, R> = KeyBatcher<Row, T, R>;
    /// [`RcBuilder`] wrapper around `crate::dictionary::builders::RowBuilder`.
    pub type RowBuilder<T, R> = RcBuilder<crate::dictionary::builders::RowBuilder<T, R>>;

    /// Spine of reference-counted [`OrdValBatch`]es laid out by [`ValRowLayout`]:
    /// keys of type `K` with `Row` values.
    pub type ValRowSpine<K, T, R> = Spine<Rc<OrdValBatch<ValRowLayout<((K, Row), T, R)>>>>;
    /// Batcher for updates with keys of type `K` and `Row` values.
    pub type ValRowBatcher<K, T, R> = KeyValBatcher<K, Row, T, R>;
    /// [`RcBuilder`] wrapper around `crate::dictionary::builders::ValRowBuilder`.
    pub type ValRowBuilder<K, T, R> =
        RcBuilder<crate::dictionary::builders::ValRowBuilder<K, T, R>>;

    /// `ValRowBuilder` variant that consumes [`Column`] chunks. Pairs with
    /// `Col2ValPagedBatcher<K, Row, T, R>` for the spillable arrange path where
    /// keys are arbitrary `Columnar` values (e.g. `UpsertKey`) and values are
    /// packed `Row` bytes. Installs a dictionary codec on the value container at
    /// seal time, gathering statistics from the sealed `Column` chain; keys are
    /// not `Row`-shaped and so are left uncompressed.
    ///
    /// [`Column`]: mz_timely_util::columnar::Column
    pub type ValRowColPagedBuilder<K, T, R> =
        RcBuilder<crate::dictionary::builders::ValRowColPagedBuilder<K, T, R>>;
90
    /// A layout based on timely stacks
    ///
    /// Selected for updates whose key and value are both `Row`. The type is a
    /// marker only: its single field is a `PhantomData` tying it to `U`.
    pub struct RowRowLayout<U: Update<Key = Row, Val = Row>> {
        phantom: std::marker::PhantomData<U>,
    }
    /// Marker layout for updates with `Row` keys and an unconstrained value type.
    pub struct RowValLayout<U: Update<Key = Row>> {
        phantom: std::marker::PhantomData<U>,
    }
    /// Marker layout for key-only updates: `Row` keys and `()` values.
    pub struct RowLayout<U: Update<Key = Row, Val = ()>> {
        phantom: std::marker::PhantomData<U>,
    }
    /// Mirror of [`RowValLayout`] with the roles swapped: arbitrary `Columnation`
    /// keys with `Row` values stored as packed bytes in a [`DatumContainer`].
    pub struct ValRowLayout<U: Update<Val = Row>> {
        phantom: std::marker::PhantomData<U>,
    }
106
    /// `Row` keys and `Row` values both live in [`DatumContainer`]s; times and
    /// diffs live in [`ColumnationStack`]s; offsets use [`OffsetOptimized`].
    impl<U: Update<Key = Row, Val = Row>> Layout for RowRowLayout<U>
    where
        U::Time: Columnation,
        U::Diff: Columnation,
    {
        type KeyContainer = DatumContainer;
        type ValContainer = DatumContainer;
        type TimeContainer = ColumnationStack<U::Time>;
        type DiffContainer = ColumnationStack<U::Diff>;
        type OffsetContainer = OffsetOptimized;
    }
    /// `Row` keys live in a [`DatumContainer`]; values, times, and diffs live in
    /// [`ColumnationStack`]s, which is why `U::Val` must be `Columnation` here.
    impl<U: Update<Key = Row>> Layout for RowValLayout<U>
    where
        U::Val: Columnation,
        U::Time: Columnation,
        U::Diff: Columnation,
    {
        type KeyContainer = DatumContainer;
        type ValContainer = ColumnationStack<U::Val>;
        type TimeContainer = ColumnationStack<U::Time>;
        type DiffContainer = ColumnationStack<U::Diff>;
        type OffsetContainer = OffsetOptimized;
    }
    /// `Row` keys live in a [`DatumContainer`]; the unit values use a
    /// `ColumnationStack<()>`.
    impl<U: Update<Key = Row, Val = ()>> Layout for RowLayout<U>
    where
        U::Time: Columnation,
        U::Diff: Columnation,
    {
        type KeyContainer = DatumContainer;
        type ValContainer = ColumnationStack<()>;
        type TimeContainer = ColumnationStack<U::Time>;
        type DiffContainer = ColumnationStack<U::Diff>;
        type OffsetContainer = OffsetOptimized;
    }
    /// Keys live in a [`ColumnationStack`] (so `U::Key` must be `Columnation`);
    /// `Row` values live in a [`DatumContainer`].
    impl<U: Update<Val = Row>> Layout for ValRowLayout<U>
    where
        U::Key: Columnation,
        U::Time: Columnation,
        U::Diff: Columnation,
    {
        type KeyContainer = ColumnationStack<U::Key>;
        type ValContainer = DatumContainer;
        type TimeContainer = ColumnationStack<U::Time>;
        type DiffContainer = ColumnationStack<U::Diff>;
        type OffsetContainer = OffsetOptimized;
    }
153}
154
155#[cfg(test)]
156mod tests {
157    use crate::DatumContainer;
158    use differential_dataflow::trace::implementations::BatchContainer;
159    use mz_repr::adt::date::Date;
160    use mz_repr::adt::interval::Interval;
161    use mz_repr::{Datum, Row, SqlScalarType};
162
    /// Round-trips datums through a [`DatumContainer`]: packs them into a `Row`,
    /// pushes the row, and checks that the entry at index 0 yields the same
    /// datums. Covers the empty row, every scalar type's interesting datums, and
    /// a hand-picked list of integer, date, interval, bytes, and string values.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // unsupported operation: integer-to-pointer casts and `ptr::with_exposed_provenance` are not supported
    fn test_round_trip() {
        /// Packs `datums` into a row, stores it in a fresh container, and asserts
        /// that the stored entry decodes back to exactly `datums`.
        fn round_trip(datums: Vec<Datum>) {
            let row = Row::pack(datums.clone());

            let mut container = DatumContainer::with_capacity(row.byte_len());
            container.push_own(&row);

            // When run under miri this catches undefined bytes written to data
            // eg by calling push_copy! on a type which contains undefined padding values
            // NOTE(review): the test is currently ignored under miri (see the
            // attribute on `test_round_trip`), so this only takes effect if that
            // ignore is lifted.
            println!("{:?}", container.index(0).iter.data);

            let datums2 = container.index(0).collect::<Vec<_>>();
            assert_eq!(datums, datums2);
        }

        // The empty row.
        round_trip(vec![]);
        // One row holding every interesting datum of every scalar type.
        round_trip(
            SqlScalarType::enumerate()
                .iter()
                .flat_map(|r#type| r#type.interesting_datums())
                .collect(),
        );
        // Hand-picked values; the unsigned integers step through each byte
        // boundary (1 << 8, 1 << 16, ...).
        round_trip(vec![
            Datum::Null,
            Datum::Null,
            Datum::False,
            Datum::True,
            Datum::Int16(-21),
            Datum::Int32(-42),
            Datum::Int64(-2_147_483_648 - 42),
            Datum::UInt8(0),
            Datum::UInt8(1),
            Datum::UInt16(0),
            Datum::UInt16(1),
            Datum::UInt16(1 << 8),
            Datum::UInt32(0),
            Datum::UInt32(1),
            Datum::UInt32(1 << 8),
            Datum::UInt32(1 << 16),
            Datum::UInt32(1 << 24),
            Datum::UInt64(0),
            Datum::UInt64(1),
            Datum::UInt64(1 << 8),
            Datum::UInt64(1 << 16),
            Datum::UInt64(1 << 24),
            Datum::UInt64(1 << 32),
            Datum::UInt64(1 << 40),
            Datum::UInt64(1 << 48),
            Datum::UInt64(1 << 56),
            Datum::Date(Date::from_pg_epoch(365 * 45 + 21).unwrap()),
            Datum::Interval(Interval {
                months: 312,
                ..Default::default()
            }),
            Datum::Interval(Interval::new(0, 0, 1_012_312)),
            Datum::Bytes(&[]),
            Datum::Bytes(&[0, 2, 1, 255]),
            Datum::String(""),
            Datum::String("العَرَبِيَّة"),
        ]);
    }
226
    /// Exercises the *compressed* encode→decode paths, which the dyncfg-gated
    /// `test_round_trip` never reaches (it installs no codec). We drive the codec
    /// directly: observe a sample, build a codec via both `new_from([c1, c2])`
    /// (the merge path) and `new_safe` (the safe-tag path), then round-trip every
    /// row through it. We additionally assert the dictionary actually engaged, so
    /// the test keeps covering the compressed branch rather than silently
    /// degrading to raw fall-through.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // integer-to-pointer casts in row decoding are unsupported under miri
    fn test_codec_round_trip() {
        use crate::row_codec::ColumnsCodec;

        // Rows with a small set of repeated, multi-byte string values, so the
        // dictionary installs entries (MisraGries keeps values with len > 1 and
        // count > 1). Mixing in an integer column exercises the raw fall-through
        // (and thus the new soundness `debug_assert`) alongside dictionary hits.
        let values = ["apple", "banana", "cherry"];
        let rows: Vec<Row> = (0..3_000)
            .map(|i| {
                Row::pack_slice(&[
                    Datum::String(values[i % values.len()]),
                    Datum::Int64(i64::try_from(i).unwrap()),
                    Datum::String(values[(i / 7) % values.len()]),
                ])
            })
            .collect();

        // Accumulate statistics in two independent observers, so the merge in
        // `new_from([&stats1, &stats2])` is actually exercised.
        let mut stats1 = ColumnsCodec::default();
        let mut stats2 = ColumnsCodec::default();
        // Encoded output is discarded here; the loop only calls `encode` so the
        // observers see the rows.
        let mut scratch = Vec::new();
        for (i, row) in rows.iter().enumerate() {
            scratch.clear();
            // Even-indexed rows go to `stats1`, odd-indexed rows to `stats2`.
            let stats = if i % 2 == 0 { &mut stats1 } else { &mut stats2 };
            stats.encode(ColumnsCodec::borrow_row(row), &mut scratch);
        }

        let merged = ColumnsCodec::new_from([&stats1, &stats2]);
        let safe = stats1.new_safe();
        for mut codec in [merged, safe] {
            let mut compressed_any = false;
            for row in &rows {
                let mut buf = Vec::new();
                codec.encode(ColumnsCodec::borrow_row(row), &mut buf);

                let decoded = codec.decode(&buf).collect::<Vec<_>>();
                let expected = ColumnsCodec::borrow_row(row).collect::<Vec<_>>();
                assert_eq!(decoded, expected, "round-trip mismatch for {row:?}");

                // An encoding shorter than the raw row bytes is taken as
                // evidence that the dictionary was used for this row.
                compressed_any |= buf.len() < row.data().len();
            }
            assert!(
                compressed_any,
                "dictionary never engaged; test no longer covers the compressed path",
            );
        }
    }
285
    /// Regression test for a dictionary-codec soundness bug in the safe-install
    /// path (`new_safe`), reachable with the paged batcher enabled.
    ///
    /// A from-scratch container stores its pre-install rows *raw* while gathering
    /// statistics, then installs a *safe* codec via `new_safe`. `new_safe` used to
    /// discard the first-byte bitmap gathered over those raw rows. That bitmap is
    /// soundness-critical: a later `new_from` merge consults it to decide which
    /// one-byte tags are free to hand out as dictionary keys. With the bitmap
    /// dropped, the merge could assign a dictionary tag equal to a raw datum's
    /// first byte, after which `decode` resolves that literal datum to the
    /// dictionary entry — returning the wrong value.
    ///
    /// We drive the lifecycle directly: observe short strings (first byte
    /// `StringTiny`) into the pre-install statistics, install a safe codec, then
    /// feed it many distinct *long* strings (first byte `StringShort`)
    /// post-install so the merge has heavy hitters to compress. Merging via
    /// `new_from` and re-encoding the short strings then exercises the raw
    /// fall-through whose first byte the merge must not have claimed as a tag.
    /// Before the fix the `StringTiny` tag was handed out and the round-trip
    /// produced a long string (and tripped `encode`'s soundness `debug_assert`).
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // integer-to-pointer casts in row decoding are unsupported under miri
    fn test_safe_codec_merge_bitmap_carryover() {
        use crate::row_codec::ColumnsCodec;

        // Short strings: length < 256, so they encode with the `StringTiny` tag.
        // Unique, so MisraGries never makes them dictionary entries; they always
        // fall through raw, exposing their first byte.
        let short_rows: Vec<Row> = (0..256)
            .map(|i| Row::pack_slice(&[Datum::String(&format!("s{i}"))]))
            .collect();
        // Long strings: length >= 256, so they encode with the `StringShort` tag —
        // a *different* first byte than the short strings. Distinct values, each
        // repeated, so the post-install codec accrues many heavy hitters and the
        // merge assigns dictionary tags across the low byte range, reaching the
        // short strings' `StringTiny` tag unless the bitmap reserves it.
        // `{i:0>300}` left-pads each number with `0` to 300 characters.
        let long_values: Vec<String> = (0..64).map(|i| format!("{i:0>300}")).collect();

        // Pre-install statistics observe only the short strings' first bytes.
        let mut stats = ColumnsCodec::default();
        // Encoded output is discarded; `encode` is called only so the codec
        // observes the rows.
        let mut scratch = Vec::new();
        for row in &short_rows {
            scratch.clear();
            stats.encode(ColumnsCodec::borrow_row(row), &mut scratch);
        }

        // Install a safe codec, then feed it the long strings post-install so it
        // accrues heavy hitters (and observes only the `StringShort` first byte).
        let mut safe = stats.new_safe();
        // Eight passes, so each of the 64 long values is observed eight times.
        for _ in 0..8 {
            for v in &long_values {
                let row = Row::pack_slice(&[Datum::String(v)]);
                scratch.clear();
                safe.encode(ColumnsCodec::borrow_row(&row), &mut scratch);
            }
        }

        // Merge, then round-trip the short strings. With the bitmap carried over,
        // no dictionary tag collides with the short strings' first byte; without
        // it, one does.
        let mut merged = ColumnsCodec::new_from([&safe]);
        for row in &short_rows {
            let mut buf = Vec::new();
            merged.encode(ColumnsCodec::borrow_row(row), &mut buf);
            let decoded = merged.decode(&buf).collect::<Vec<_>>();
            let expected = ColumnsCodec::borrow_row(row).collect::<Vec<_>>();
            assert_eq!(decoded, expected, "round-trip mismatch for {row:?}");
        }
    }
355
356    /// Confirms the structural assumption underpinning `SAFE_TAG_BASE`: every
357    /// datum the row format produces encodes with a first byte strictly less
358    /// than `SAFE_TAG_BASE`. If `mz_repr` ever introduces a tag that crosses
359    /// the boundary, `DictionaryCodec::new_safe` would assign a dictionary tag
360    /// that collides with a literal datum first-byte, breaking decoding.
361    #[mz_ore::test]
362    fn test_safe_tag_base() {
363        use crate::row_codec::SAFE_TAG_BASE;
364        let check = |datum: Datum| {
365            let row = Row::pack_slice(&[datum]);
366            let data = row.data();
367            assert!(!data.is_empty(), "empty encoding for {datum:?}");
368            assert!(
369                data[0] < SAFE_TAG_BASE,
370                "datum {datum:?} encodes with first byte {} >= SAFE_TAG_BASE ({}); \
371                 a new row tag has crossed the safe boundary",
372                data[0],
373                SAFE_TAG_BASE,
374            );
375        };
376        for ty in SqlScalarType::enumerate().iter() {
377            for datum in ty.interesting_datums() {
378                check(datum);
379            }
380        }
381    }
382
    /// A batch built via the builder's `push`/`done` path (as the `reduce` operator
    /// does) that stays under `STATS_THRESHOLD` never installs a codec at build time.
    /// `done` now promotes the gathered statistics into the codec slot, so the batch
    /// carries a codec + heavy-hitter summary and does not poison a later merge.
    ///
    /// This drives that container lifecycle directly: gather raw (well under the
    /// threshold), promote at "done", then merge two such containers the way a spine
    /// compaction does. With promotion the merge takes the `new_from` path and
    /// compresses; without it both inputs are codec-less and the merge stays raw.
    /// Every merged row must still round-trip.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // integer-to-pointer casts in row decoding are unsupported under miri
    fn push_done_promotion_avoids_merge_poison() {
        use std::sync::atomic::Ordering;
        use timely::container::PushInto;

        // Gate the dictionary path on. Safe for other tests: the flag only controls
        // whether `DatumContainer` gathers stats; it never changes decode results.
        // NOTE(review): the flag is never reset to `false` here, so it stays
        // enabled for any test that runs later in the same process.
        crate::DICTIONARY_COMPRESSION.store(true, Ordering::Relaxed);

        // Low-cardinality rows, well under `STATS_THRESHOLD` (64Ki): a repeated
        // multi-byte string the dictionary compresses, plus an integer column that
        // exercises raw fall-through.
        let rows: Vec<Row> = (0..2_000i64)
            .map(|i| {
                Row::pack_slice(&[
                    Datum::Int64(i % 8),
                    Datum::String("a repeated string value"),
                ])
            })
            .collect();

        // Build a container the way the push/done path does: gather raw without ever
        // crossing `STATS_THRESHOLD`, optionally promoting at "done".
        let build = |promote: bool| {
            let mut c = DatumContainer::with_capacity(rows.len());
            for row in &rows {
                c.push_into(row);
            }
            if promote {
                c.promote_stats_to_codec();
            }
            c
        };

        // Merge two containers as a spine compaction does: allocate via
        // `merge_capacity`, then copy every row through.
        let merge = |a: &DatumContainer, b: &DatumContainer| {
            let mut m = DatumContainer::merge_capacity(a, b);
            for i in 0..a.len() {
                m.push_into(a.index(i));
            }
            for i in 0..b.len() {
                m.push_into(b.index(i));
            }
            m
        };

        // Sums the second callback argument over every allocation the container
        // reports. Presumably that argument is the capacity, as the closure
        // parameter name suggests — confirm against `DatumContainer::heap_size`.
        let heap = |c: &DatumContainer| {
            let mut size = 0;
            c.heap_size(|_, cap| size += cap);
            size
        };

        // Codec-less inputs (no promotion): the merge cannot `new_from` and stays raw.
        let poisoned = merge(&build(false), &build(false));
        // Promoted inputs carry a codec + summary: the merge `new_from`s and compresses.
        let compressed = merge(&build(true), &build(true));

        // Round-trip: every merged row decodes back to the corresponding input row
        // (the merge here concatenates a's rows then b's rows, no consolidation).
        assert_eq!(compressed.len(), rows.len() * 2);
        for i in 0..compressed.len() {
            let got = compressed.index(i).collect::<Vec<_>>();
            // Both inputs were built from `rows`, so merged row `i` corresponds
            // to `rows[i % rows.len()]`.
            let want = rows[i % rows.len()].iter().collect::<Vec<_>>();
            assert_eq!(got, want, "merged row {i} round-trips");
        }

        // The promoted merge must actually compress relative to the poisoned one,
        // confirming promotion carried a usable summary into `new_from`.
        assert!(
            heap(&compressed) < heap(&poisoned),
            "promotion should let the merge compress: compressed={} poisoned={}",
            heap(&compressed),
            heap(&poisoned),
        );
    }
470}
471
472/// A `[u8]`-specialized container.
473mod bytes_container {
474
475    use differential_dataflow::trace::implementations::BatchContainer;
476    use timely::container::PushInto;
477
478    use mz_ore::region::Region;
479
480    /// A slice container with four bytes overhead per slice.
481    pub struct BytesContainer {
482        /// Total length of `batches`, maintained because recomputation is expensive.
483        length: usize,
484        batches: Vec<BytesBatch>,
485    }
486
487    impl BytesContainer {
488        /// Visit contained allocations to determine their size and capacity.
489        #[inline]
490        pub fn heap_size(&self, mut callback: impl FnMut(usize, usize)) {
491            // Calculate heap size for local, stash, and stash entries
492            callback(
493                self.batches.len() * std::mem::size_of::<BytesBatch>(),
494                self.batches.capacity() * std::mem::size_of::<BytesBatch>(),
495            );
496            for batch in self.batches.iter() {
497                batch.offsets.heap_size(&mut callback);
498                callback(batch.storage.len(), batch.storage.capacity());
499            }
500        }
501    }
502
503    impl BatchContainer for BytesContainer {
504        type Owned = Vec<u8>;
505        type ReadItem<'a> = &'a [u8];
506
507        #[inline]
508        fn into_owned<'a>(item: Self::ReadItem<'a>) -> Self::Owned {
509            item.to_vec()
510        }
511
512        #[inline]
513        fn clone_onto<'a>(item: Self::ReadItem<'a>, other: &mut Self::Owned) {
514            other.clear();
515            other.extend_from_slice(item);
516        }
517
518        #[inline(always)]
519        fn push_ref(&mut self, item: Self::ReadItem<'_>) {
520            self.push_into(item);
521        }
522
523        #[inline(always)]
524        fn push_own(&mut self, item: &Self::Owned) {
525            self.push_into(item.as_slice())
526        }
527
528        fn clear(&mut self) {
529            self.batches.clear();
530            self.batches.push(BytesBatch::with_capacities(0, 0));
531            self.length = 0;
532        }
533
534        fn with_capacity(size: usize) -> Self {
535            Self {
536                length: 0,
537                batches: vec![BytesBatch::with_capacities(size, size)],
538            }
539        }
540
541        fn merge_capacity(cont1: &Self, cont2: &Self) -> Self {
542            let mut item_cap = 1;
543            let mut byte_cap = 0;
544            for batch in cont1.batches.iter() {
545                item_cap += batch.offsets.len() - 1;
546                byte_cap += batch.storage.len();
547            }
548            for batch in cont2.batches.iter() {
549                item_cap += batch.offsets.len() - 1;
550                byte_cap += batch.storage.len();
551            }
552            Self {
553                length: 0,
554                batches: vec![BytesBatch::with_capacities(item_cap, byte_cap)],
555            }
556        }
557
558        #[inline(always)]
559        fn reborrow<'b, 'a: 'b>(item: Self::ReadItem<'a>) -> Self::ReadItem<'b> {
560            item
561        }
562
563        #[inline]
564        fn index(&self, mut index: usize) -> Self::ReadItem<'_> {
565            for batch in self.batches.iter() {
566                if index < batch.len() {
567                    return batch.index(index);
568                }
569                index -= batch.len();
570            }
571            panic!("Index out of bounds");
572        }
573
574        #[inline(always)]
575        fn len(&self) -> usize {
576            self.length
577        }
578    }
579
580    impl PushInto<&[u8]> for BytesContainer {
581        #[inline]
582        fn push_into(&mut self, item: &[u8]) {
583            self.length += 1;
584            if let Some(batch) = self.batches.last_mut() {
585                let success = batch.try_push(item);
586                if !success {
587                    // double the lengths from `batch`.
588                    let item_cap = 2 * batch.offsets.len();
589                    let byte_cap = std::cmp::max(2 * batch.storage.capacity(), item.len());
590                    let mut new_batch = BytesBatch::with_capacities(item_cap, byte_cap);
591                    assert!(new_batch.try_push(item));
592                    self.batches.push(new_batch);
593                }
594            }
595        }
596    }
597
598    /// A batch of slice storage.
599    ///
600    /// The backing storage for this batch will not be resized.
601    pub struct BytesBatch {
602        offsets: crate::OffsetOptimized,
603        storage: Region<u8>,
604        len: usize,
605    }
606
607    impl BytesBatch {
608        /// Either accepts the slice and returns true,
609        /// or does not and returns false.
610        fn try_push(&mut self, slice: &[u8]) -> bool {
611            if self.storage.len() + slice.len() <= self.storage.capacity() {
612                self.storage.extend_from_slice(slice);
613                self.offsets.push_into(self.storage.len());
614                self.len += 1;
615                true
616            } else {
617                false
618            }
619        }
620        #[inline]
621        fn index(&self, index: usize) -> &[u8] {
622            let lower = self.offsets.index(index);
623            let upper = self.offsets.index(index + 1);
624            &self.storage[lower..upper]
625        }
626        #[inline(always)]
627        fn len(&self) -> usize {
628            debug_assert_eq!(self.len, self.offsets.len() - 1);
629            self.len
630        }
631
632        fn with_capacities(item_cap: usize, byte_cap: usize) -> Self {
633            // TODO: be wary of `byte_cap` greater than 2^32.
634            let mut offsets = crate::OffsetOptimized::with_capacity(item_cap + 1);
635            offsets.push_into(0);
636            Self {
637                offsets,
638                storage: Region::new_auto(byte_cap.next_power_of_two()),
639                len: 0,
640            }
641        }
642    }
643}
644
645mod offset_opt {
646    use differential_dataflow::trace::implementations::BatchContainer;
647    use differential_dataflow::trace::implementations::OffsetList;
648    use timely::container::PushInto;
649
650    enum OffsetStride {
651        Empty,
652        Zero,
653        Striding(usize, usize),
654        Saturated(usize, usize, usize),
655    }
656
657    impl OffsetStride {
658        /// Accepts or rejects a newly pushed element.
659        #[inline]
660        fn push(&mut self, item: usize) -> bool {
661            match self {
662                OffsetStride::Empty => {
663                    if item == 0 {
664                        *self = OffsetStride::Zero;
665                        true
666                    } else {
667                        false
668                    }
669                }
670                OffsetStride::Zero => {
671                    *self = OffsetStride::Striding(item, 2);
672                    true
673                }
674                OffsetStride::Striding(stride, count) => {
675                    if item == *stride * *count {
676                        *count += 1;
677                        true
678                    } else if item == *stride * (*count - 1) {
679                        *self = OffsetStride::Saturated(*stride, *count, 1);
680                        true
681                    } else {
682                        false
683                    }
684                }
685                OffsetStride::Saturated(stride, count, reps) => {
686                    if item == *stride * (*count - 1) {
687                        *reps += 1;
688                        true
689                    } else {
690                        false
691                    }
692                }
693            }
694        }
695
696        #[inline]
697        fn index(&self, index: usize) -> usize {
698            match self {
699                OffsetStride::Empty => {
700                    panic!("Empty OffsetStride")
701                }
702                OffsetStride::Zero => 0,
703                OffsetStride::Striding(stride, _steps) => *stride * index,
704                OffsetStride::Saturated(stride, steps, _reps) => {
705                    if index < *steps {
706                        *stride * index
707                    } else {
708                        *stride * (*steps - 1)
709                    }
710                }
711            }
712        }
713
714        #[inline]
715        fn len(&self) -> usize {
716            match self {
717                OffsetStride::Empty => 0,
718                OffsetStride::Zero => 1,
719                OffsetStride::Striding(_stride, steps) => *steps,
720                OffsetStride::Saturated(_stride, steps, reps) => *steps + *reps,
721            }
722        }
723    }
724
725    pub struct OffsetOptimized {
726        strided: OffsetStride,
727        spilled: OffsetList,
728    }
729
730    impl BatchContainer for OffsetOptimized {
731        type Owned = usize;
732        type ReadItem<'a> = usize;
733
734        #[inline]
735        fn into_owned<'a>(item: Self::ReadItem<'a>) -> Self::Owned {
736            item
737        }
738
739        #[inline]
740        fn push_ref(&mut self, item: Self::ReadItem<'_>) {
741            self.push_into(item)
742        }
743
744        #[inline]
745        fn push_own(&mut self, item: &Self::Owned) {
746            self.push_into(*item)
747        }
748
749        fn clear(&mut self) {
750            self.strided = OffsetStride::Empty;
751            self.spilled.clear();
752        }
753
754        fn with_capacity(_size: usize) -> Self {
755            Self {
756                strided: OffsetStride::Empty,
757                spilled: OffsetList::with_capacity(0),
758            }
759        }
760
761        fn merge_capacity(_cont1: &Self, _cont2: &Self) -> Self {
762            Self {
763                strided: OffsetStride::Empty,
764                spilled: OffsetList::with_capacity(0),
765            }
766        }
767
768        #[inline]
769        fn reborrow<'b, 'a: 'b>(item: Self::ReadItem<'a>) -> Self::ReadItem<'b> {
770            item
771        }
772
773        #[inline]
774        fn index(&self, index: usize) -> Self::ReadItem<'_> {
775            if index < self.strided.len() {
776                self.strided.index(index)
777            } else {
778                self.spilled.index(index - self.strided.len())
779            }
780        }
781
782        #[inline]
783        fn len(&self) -> usize {
784            self.strided.len() + self.spilled.len()
785        }
786    }
787
788    impl PushInto<usize> for OffsetOptimized {
789        #[inline]
790        fn push_into(&mut self, item: usize) {
791            if !self.spilled.is_empty() {
792                self.spilled.push(item);
793            } else {
794                let inserted = self.strided.push(item);
795                if !inserted {
796                    self.spilled.push(item);
797                }
798            }
799        }
800    }
801
802    impl OffsetOptimized {
803        pub fn heap_size(&self, callback: impl FnMut(usize, usize)) {
804            crate::offset_list_size(&self.spilled, callback);
805        }
806    }
807}
808
809/// Helper to compute the size of an [`OffsetList`] in memory.
810#[inline]
811pub(crate) fn offset_list_size(data: &OffsetList, mut callback: impl FnMut(usize, usize)) {
812    // Private `vec_size` because we should only use it where data isn't region-allocated.
813    // `T: Copy` makes sure the implementation is correct even if types change!
814    #[inline(always)]
815    fn vec_size<T: Copy>(data: &Vec<T>, mut callback: impl FnMut(usize, usize)) {
816        let size_of_t = std::mem::size_of::<T>();
817        callback(data.len() * size_of_t, data.capacity() * size_of_t);
818    }
819
820    vec_size(&data.smol, &mut callback);
821    vec_size(&data.chonk, callback);
822}
823
824/// A `Row`-specialized container using dictionary compression.
825///
826/// The approach is to establish for each column lists of common values, and to use "unoccupied"
827/// tags in the row encoding (e.g. where we would indicate types) to replace these common values.
828/// This substitution is opt-in, in that we don't need to do it, and in particular do not do it
829/// while we are collecting preliminary information about common values, and then start to use it
830/// once we believe we have enough information. Once we have started to use the substitutions we
831/// cannot change the meaning of a reserved byte pattern, for the container we are populating.
832///
833/// Each from-scratch container observes `STATS_THRESHOLD` records before establishing a mapping
834/// from spare tags to common values. Containers that are formed from merging other containers
835/// use those input containers' common values to populate a codec and use it immediately.
836///
837/// The dictionary behavior is controlled by the `DICTIONARY_COMPRESSION` flag, which if disabled
838/// prevents the construction of codecs, which when absent simply cause the wrapper to behave as
839/// a no-op that fails to use any spare tags for common values. The flag is set once, when a
840/// replica is created (from compute's `InstanceConfig::arrangement_dictionary_compression`, itself
841/// captured from the `enable_arrangement_dictionary_compression_alpha` dyncfg at that moment), and is
842/// not changed for the life of the process; flipping the dyncfg only affects replicas created
843/// afterwards. Even with the flag fixed, a single replica can hold a mix of compressed and
844/// uncompressed containers — e.g. containers that never observed enough records to install a
845/// codec, or that were merged from uncompressed inputs.
846mod dictionary {
847
848    use differential_dataflow::trace::implementations::BatchContainer;
849
850    use mz_repr::{Row, RowRef};
851
852    use super::row_codec::{ColumnsCodec, ColumnsIter};
853
854    /// Wrapper types that exist to support the creation of dictionary codecs.
855    ///
856    /// These types interpose at the seal() call, to traverse the data that is being sealed and
857    /// then construct codecs that are used to encode the row-shaped keys and values. There are
858    /// several variants, corresponding to the RowRow, RowVal, and Row-only spine types.
859    pub mod builders {
860
861        use columnar::{Columnar, Index};
862        use columnation::Columnation;
863        use differential_dataflow::difference::Semigroup;
864        use differential_dataflow::lattice::Lattice;
865        use differential_dataflow::trace::Builder;
866        use differential_dataflow::trace::Description;
867        use differential_dataflow::trace::implementations::ord_neu::{OrdKeyBatch, OrdKeyBuilder};
868        use differential_dataflow::trace::implementations::ord_neu::{OrdValBatch, OrdValBuilder};
869        use mz_timely_util::columnar::Column;
870        use mz_timely_util::columnation::ColumnationStack as TimelyStack;
871        use timely::progress::Timestamp;
872
873        use mz_repr::{Row, RowRef};
874
875        use super::super::row_codec::ColumnsCodec;
876        use super::{DatumContainer, DatumSeq};
877        use crate::DICTIONARY_COMPRESSION;
878        use crate::spines::{RowLayout, RowRowLayout, RowValLayout, ValRowLayout};
879
880        /// Gather encoding statistics across `rows` and produce a codec from them.
881        ///
882        /// Accepts anything that borrows as a [`RowRef`], so it serves both the
883        /// columnation-fed builders (which yield `&Row`) and the paged builders
884        /// (which yield `&RowRef` straight out of a [`Column`] chunk).
885        ///
886        /// Returns `None` when dictionary compression is disabled.
887        fn build_codec<'a, B>(rows: impl IntoIterator<Item = &'a B>) -> Option<ColumnsCodec>
888        where
889            B: std::borrow::Borrow<RowRef> + ?Sized + 'a,
890        {
891            if !DICTIONARY_COMPRESSION.load(std::sync::atomic::Ordering::Relaxed) {
892                return None;
893            }
894            let mut stats = ColumnsCodec::default();
895            for row in rows {
896                let row = row.borrow();
897                if !row.is_empty() {
898                    // Gather stats only; the encoded output would be thrown away here, so
899                    // `observe` skips the per-value lookup and the throwaway-buffer memcpy
900                    // that `encode` would do (see `ColumnsCodec::observe`).
901                    stats.observe(DatumSeq::borrow_as(row).bytes_iter());
902                }
903            }
904            Some(ColumnsCodec::new_from([&stats]))
905        }
906
907        pub struct RowRowBuilder<
908            T: Lattice + Timestamp + Columnation,
909            R: Ord + Semigroup + Columnation + 'static,
910        > {
911            inner: OrdValBuilder<RowRowLayout<((Row, Row), T, R)>, TimelyStack<((Row, Row), T, R)>>,
912        }
913
914        impl<T: Lattice + Timestamp + Columnation, R: Ord + Semigroup + Columnation + 'static>
915            Builder for RowRowBuilder<T, R>
916        {
917            type Input = TimelyStack<((Row, Row), T, R)>;
918            type Time = T;
919            type Output = OrdValBatch<RowRowLayout<((Row, Row), T, R)>>;
920
921            fn with_capacity(keys: usize, vals: usize, upds: usize) -> Self {
922                Self {
923                    inner: Builder::with_capacity(keys, vals, upds),
924                }
925            }
926            fn push(&mut self, chunk: &mut Self::Input) {
927                self.inner.push(chunk)
928            }
929            fn done(self, description: Description<Self::Time>) -> Self::Output {
930                // The push/done build path (e.g. the `reduce` operator, which builds
931                // batches with `Builder::new()` + `push` + `done` rather than `seal`)
932                // never runs `seal`'s codec install. Install a codec here from the
933                // statistics gathered during `push`, mirroring `seal` — but without
934                // building a dictionary or re-encoding the rows; see
935                // `DatumContainer::promote_stats_to_codec` for why a codec-less batch
936                // must be avoided even though its rows stay raw.
937                let mut inner = self.inner;
938                inner.result.keys.promote_stats_to_codec();
939                inner.result.vals.vals.promote_stats_to_codec();
940                inner.done(description)
941            }
942            fn seal(
943                chain: &mut Vec<Self::Input>,
944                description: Description<Self::Time>,
945            ) -> Self::Output {
946                let key_codec = build_codec(
947                    chain
948                        .iter()
949                        .flat_map(|link| link.iter().map(|((k, _), _, _)| k)),
950                );
951                let val_codec = build_codec(
952                    chain
953                        .iter()
954                        .flat_map(|link| link.iter().map(|((_, v), _, _)| v)),
955                );
956
957                use differential_dataflow::trace::implementations::BuilderInput;
958
959                let (keys, vals, upds) = <Self::Input as BuilderInput<
960                    DatumContainer,
961                    DatumContainer,
962                >>::key_val_upd_counts(&chain[..]);
963                let mut builder = Self::with_capacity(keys, vals, upds);
964                // The seal path installs a codec directly, so the per-container stats
965                // gatherer (which `with_capacity` may have allocated) is dead weight and
966                // would contradict the `stats: None once codec installed` invariant.
967                builder.inner.result.keys.codec = key_codec;
968                builder.inner.result.keys.stats = None;
969                builder.inner.result.vals.vals.codec = val_codec;
970                builder.inner.result.vals.vals.stats = None;
971
972                for mut chunk in chain.drain(..) {
973                    builder.push(&mut chunk);
974                }
975
976                builder.done(description)
977            }
978        }
979
980        pub struct RowValBuilder<
981            V: Ord + Clone + Columnation + 'static,
982            T: Lattice + Timestamp + Columnation,
983            R: Ord + Semigroup + Columnation + 'static,
984        > {
985            inner: OrdValBuilder<RowValLayout<((Row, V), T, R)>, TimelyStack<((Row, V), T, R)>>,
986        }
987
988        impl<
989            V: Ord + Clone + Columnation,
990            T: Lattice + Timestamp + Columnation,
991            R: Ord + Semigroup + Columnation + 'static,
992        > Builder for RowValBuilder<V, T, R>
993        {
994            type Input = TimelyStack<((Row, V), T, R)>;
995            type Time = T;
996            type Output = OrdValBatch<RowValLayout<((Row, V), T, R)>>;
997
998            fn with_capacity(keys: usize, vals: usize, upds: usize) -> Self {
999                Self {
1000                    inner: Builder::with_capacity(keys, vals, upds),
1001                }
1002            }
1003            fn push(&mut self, chunk: &mut Self::Input) {
1004                self.inner.push(chunk)
1005            }
1006            fn done(self, description: Description<Self::Time>) -> Self::Output {
1007                // See `RowRowBuilder::done`: install a codec on the `Row`-shaped key
1008                // container for the push/done (e.g. `reduce`) path that skips `seal`.
1009                let mut inner = self.inner;
1010                inner.result.keys.promote_stats_to_codec();
1011                inner.done(description)
1012            }
1013            fn seal(
1014                chain: &mut Vec<Self::Input>,
1015                description: Description<Self::Time>,
1016            ) -> Self::Output {
1017                let key_codec = build_codec(
1018                    chain
1019                        .iter()
1020                        .flat_map(|link| link.iter().map(|((k, _), _, _)| k)),
1021                );
1022
1023                use differential_dataflow::trace::implementations::BuilderInput;
1024
1025                let (keys, vals, upds) = <Self::Input as BuilderInput<
1026                    DatumContainer,
1027                    TimelyStack<V>,
1028                >>::key_val_upd_counts(&chain[..]);
1029                let mut builder = Self::with_capacity(keys, vals, upds);
1030                // See `RowRowBuilder::seal`: drop the now-redundant stats gatherer.
1031                builder.inner.result.keys.codec = key_codec;
1032                builder.inner.result.keys.stats = None;
1033
1034                for mut chunk in chain.drain(..) {
1035                    builder.push(&mut chunk);
1036                }
1037
1038                builder.done(description)
1039            }
1040        }
1041
1042        pub struct RowBuilder<
1043            T: Lattice + Timestamp + Columnation,
1044            R: Ord + Semigroup + Columnation + 'static,
1045        > {
1046            inner: OrdKeyBuilder<RowLayout<((Row, ()), T, R)>, TimelyStack<((Row, ()), T, R)>>,
1047        }
1048
1049        impl<T: Lattice + Timestamp + Columnation, R: Ord + Semigroup + Columnation + 'static>
1050            Builder for RowBuilder<T, R>
1051        {
1052            type Input = TimelyStack<((Row, ()), T, R)>;
1053            type Time = T;
1054            type Output = OrdKeyBatch<RowLayout<((Row, ()), T, R)>>;
1055
1056            fn with_capacity(keys: usize, vals: usize, upds: usize) -> Self {
1057                Self {
1058                    inner: Builder::with_capacity(keys, vals, upds),
1059                }
1060            }
1061            fn push(&mut self, chunk: &mut Self::Input) {
1062                self.inner.push(chunk)
1063            }
1064            fn done(self, description: Description<Self::Time>) -> Self::Output {
1065                // See `RowRowBuilder::done`: install a codec on the `Row`-shaped key
1066                // container for the push/done (e.g. `reduce`) path that skips `seal`.
1067                let mut inner = self.inner;
1068                inner.result.keys.promote_stats_to_codec();
1069                inner.done(description)
1070            }
1071            fn seal(
1072                chain: &mut Vec<Self::Input>,
1073                description: Description<Self::Time>,
1074            ) -> Self::Output {
1075                let key_codec = build_codec(
1076                    chain
1077                        .iter()
1078                        .flat_map(|link| link.iter().map(|((k, _), _, _)| k)),
1079                );
1080
1081                use differential_dataflow::trace::implementations::BuilderInput;
1082
1083                let (keys, vals, upds) = <Self::Input as BuilderInput<
1084                    DatumContainer,
1085                    TimelyStack<()>,
1086                >>::key_val_upd_counts(&chain[..]);
1087                let mut builder = Self::with_capacity(keys, vals, upds);
1088                // See `RowRowBuilder::seal`: drop the now-redundant stats gatherer.
1089                builder.inner.result.keys.codec = key_codec;
1090                builder.inner.result.keys.stats = None;
1091
1092                for mut chunk in chain.drain(..) {
1093                    builder.push(&mut chunk);
1094                }
1095
1096                builder.done(description)
1097            }
1098        }
1099
1100        /// Mirror of [`RowValBuilder`] with the roles swapped: arbitrary keys and
1101        /// `Row` *values*, so the dictionary codec is built for and installed on the
1102        /// value container.
1103        pub struct ValRowBuilder<
1104            K: Ord + Clone + Columnation + 'static,
1105            T: Lattice + Timestamp + Columnation,
1106            R: Ord + Semigroup + Columnation + 'static,
1107        > {
1108            inner: OrdValBuilder<ValRowLayout<((K, Row), T, R)>, TimelyStack<((K, Row), T, R)>>,
1109        }
1110
1111        impl<
1112            K: Ord + Clone + Columnation,
1113            T: Lattice + Timestamp + Columnation,
1114            R: Ord + Semigroup + Columnation + 'static,
1115        > Builder for ValRowBuilder<K, T, R>
1116        {
1117            type Input = TimelyStack<((K, Row), T, R)>;
1118            type Time = T;
1119            type Output = OrdValBatch<ValRowLayout<((K, Row), T, R)>>;
1120
1121            fn with_capacity(keys: usize, vals: usize, upds: usize) -> Self {
1122                Self {
1123                    inner: Builder::with_capacity(keys, vals, upds),
1124                }
1125            }
1126            fn push(&mut self, chunk: &mut Self::Input) {
1127                self.inner.push(chunk)
1128            }
1129            fn done(self, description: Description<Self::Time>) -> Self::Output {
1130                // See `RowRowBuilder::done`: install a codec on the `Row`-shaped value
1131                // container for the push/done (e.g. `reduce`) path that skips `seal`.
1132                let mut inner = self.inner;
1133                inner.result.vals.vals.promote_stats_to_codec();
1134                inner.done(description)
1135            }
1136            fn seal(
1137                chain: &mut Vec<Self::Input>,
1138                description: Description<Self::Time>,
1139            ) -> Self::Output {
1140                let val_codec = build_codec(
1141                    chain
1142                        .iter()
1143                        .flat_map(|link| link.iter().map(|((_, v), _, _)| v)),
1144                );
1145
1146                use differential_dataflow::trace::implementations::BuilderInput;
1147
1148                let (keys, vals, upds) = <Self::Input as BuilderInput<
1149                    TimelyStack<K>,
1150                    DatumContainer,
1151                >>::key_val_upd_counts(&chain[..]);
1152                let mut builder = Self::with_capacity(keys, vals, upds);
1153                // See `RowRowBuilder::seal`: drop the now-redundant stats gatherer.
1154                builder.inner.result.vals.vals.codec = val_codec;
1155                builder.inner.result.vals.vals.stats = None;
1156
1157                for mut chunk in chain.drain(..) {
1158                    builder.push(&mut chunk);
1159                }
1160
1161                builder.done(description)
1162            }
1163        }
1164
1165        /// Paged counterpart of [`RowRowBuilder`] that consumes [`Column`]
1166        /// chunks instead of columnation stacks. Mirrors `RowRowBuilder::seal`:
1167        /// it gathers key and value statistics from the sealed chain and
1168        /// installs codecs directly, then drops the per-container stats gatherer.
1169        pub struct RowRowColPagedBuilder<
1170            T: Lattice + Timestamp + Columnation + Columnar,
1171            R: Ord + Semigroup + Columnation + Columnar + Clone + 'static,
1172        > {
1173            inner: OrdValBuilder<RowRowLayout<((Row, Row), T, R)>, Column<((Row, Row), T, R)>>,
1174        }
1175
1176        impl<
1177            T: Lattice + Timestamp + Columnation + Columnar,
1178            R: Ord + Semigroup + Columnation + Columnar + Clone + 'static,
1179        > Builder for RowRowColPagedBuilder<T, R>
1180        {
1181            type Input = Column<((Row, Row), T, R)>;
1182            type Time = T;
1183            type Output = OrdValBatch<RowRowLayout<((Row, Row), T, R)>>;
1184
1185            fn with_capacity(keys: usize, vals: usize, upds: usize) -> Self {
1186                Self {
1187                    inner: Builder::with_capacity(keys, vals, upds),
1188                }
1189            }
1190            fn push(&mut self, chunk: &mut Self::Input) {
1191                self.inner.push(chunk)
1192            }
1193            fn done(self, description: Description<Self::Time>) -> Self::Output {
1194                self.inner.done(description)
1195            }
1196            fn seal(
1197                chain: &mut Vec<Self::Input>,
1198                description: Description<Self::Time>,
1199            ) -> Self::Output {
1200                // `into_index_iter` yields the value column's `Row`s as `&RowRef`,
1201                // which `build_codec` consumes directly.
1202                let key_codec = build_codec(
1203                    chain
1204                        .iter()
1205                        .flat_map(|c| c.borrow().into_index_iter().map(|((k, _), _, _)| k)),
1206                );
1207                let val_codec = build_codec(
1208                    chain
1209                        .iter()
1210                        .flat_map(|c| c.borrow().into_index_iter().map(|((_, v), _, _)| v)),
1211                );
1212
1213                use differential_dataflow::trace::implementations::BuilderInput;
1214
1215                let (keys, vals, upds) = <Self::Input as BuilderInput<
1216                    DatumContainer,
1217                    DatumContainer,
1218                >>::key_val_upd_counts(&chain[..]);
1219                let mut builder = Self::with_capacity(keys, vals, upds);
1220                // See `RowRowBuilder::seal`: install the codecs and drop the
1221                // now-redundant per-container stats gatherer.
1222                builder.inner.result.keys.codec = key_codec;
1223                builder.inner.result.keys.stats = None;
1224                builder.inner.result.vals.vals.codec = val_codec;
1225                builder.inner.result.vals.vals.stats = None;
1226
1227                for mut chunk in chain.drain(..) {
1228                    builder.push(&mut chunk);
1229                }
1230
1231                builder.done(description)
1232            }
1233        }
1234
1235        /// Paged counterpart of [`ValRowBuilder`] that consumes [`Column`]
1236        /// chunks. Keys are arbitrary `Columnar` values (not `Row`-shaped) and
1237        /// stay uncompressed; only the value container receives a codec.
1238        pub struct ValRowColPagedBuilder<
1239            K: Ord + Clone + Columnation + Columnar + 'static,
1240            T: Lattice + Timestamp + Columnation + Columnar,
1241            R: Ord + Semigroup + Columnation + Columnar + Clone + 'static,
1242        > {
1243            inner: OrdValBuilder<ValRowLayout<((K, Row), T, R)>, Column<((K, Row), T, R)>>,
1244        }
1245
1246        impl<
1247            K: Ord + Clone + Columnation + Columnar + 'static,
1248            T: Lattice + Timestamp + Columnation + Columnar,
1249            R: Ord + Semigroup + Columnation + Columnar + Clone + 'static,
1250        > Builder for ValRowColPagedBuilder<K, T, R>
1251        where
1252            for<'a> columnar::Ref<'a, K>: Copy + Ord,
1253            for<'a, 'b> &'a K: PartialEq<columnar::Ref<'b, K>>,
1254            for<'a> TimelyStack<K>: timely::container::PushInto<columnar::Ref<'a, K>>,
1255        {
1256            type Input = Column<((K, Row), T, R)>;
1257            type Time = T;
1258            type Output = OrdValBatch<ValRowLayout<((K, Row), T, R)>>;
1259
1260            fn with_capacity(keys: usize, vals: usize, upds: usize) -> Self {
1261                Self {
1262                    inner: Builder::with_capacity(keys, vals, upds),
1263                }
1264            }
1265            fn push(&mut self, chunk: &mut Self::Input) {
1266                self.inner.push(chunk)
1267            }
1268            fn done(self, description: Description<Self::Time>) -> Self::Output {
1269                self.inner.done(description)
1270            }
1271            fn seal(
1272                chain: &mut Vec<Self::Input>,
1273                description: Description<Self::Time>,
1274            ) -> Self::Output {
1275                let val_codec = build_codec(
1276                    chain
1277                        .iter()
1278                        .flat_map(|c| c.borrow().into_index_iter().map(|((_, v), _, _)| v)),
1279                );
1280
1281                use differential_dataflow::trace::implementations::BuilderInput;
1282
1283                let (keys, vals, upds) = <Self::Input as BuilderInput<
1284                    TimelyStack<K>,
1285                    DatumContainer,
1286                >>::key_val_upd_counts(&chain[..]);
1287                let mut builder = Self::with_capacity(keys, vals, upds);
1288                // See `RowRowBuilder::seal`: drop the now-redundant stats gatherer.
1289                builder.inner.result.vals.vals.codec = val_codec;
1290                builder.inner.result.vals.vals.stats = None;
1291
1292                for mut chunk in chain.drain(..) {
1293                    builder.push(&mut chunk);
1294                }
1295
1296                builder.done(description)
1297            }
1298        }
1299    }
1300
1301    pub struct DatumContainer {
1302        /// Encoder/decoder used to translate between row bytes and the stored bytes.
1303        /// `None` until enough pushes have been observed (or if compression is disabled).
1304        codec: Option<ColumnsCodec>,
1305        /// The stored, possibly-encoded, row bytes.
1306        inner: super::bytes_container::BytesContainer,
1307        /// Staging buffer for ingested `Row` types.
1308        staging: Vec<u8>,
1309        /// Statistics gatherer, used to build a safe codec after enough pushes.
1310        /// `None` once the codec has been installed or if compression is disabled.
1311        stats: Option<ColumnsCodec>,
1312    }
1313
1314    impl BatchContainer for DatumContainer {
1315        type Owned = Row;
1316        type ReadItem<'a> = DatumSeq<'a>;
1317
1318        fn with_capacity(size: usize) -> Self {
1319            let stats = if crate::DICTIONARY_COMPRESSION.load(std::sync::atomic::Ordering::Relaxed)
1320            {
1321                Some(Default::default())
1322            } else {
1323                None
1324            };
1325
1326            Self {
1327                codec: None,
1328                inner: BatchContainer::with_capacity(size),
1329                staging: Vec::new(),
1330                stats,
1331            }
1332        }
1333        fn merge_capacity(cont1: &Self, cont2: &Self) -> Self {
1334            // We only build a merged codec when *both* inputs carry one. A codec is
1335            // sound only for the data whose tag usage it observed, so we cannot reuse
1336            // one side's codec to decode the other side's rows. When exactly one side
1337            // is compressed we conservatively produce an uncompressed container rather
1338            // than risk a tag collision; the merged container re-gathers stats and may
1339            // install a fresh codec later via the `STATS_THRESHOLD` path.
1340            let codec = match (&cont1.codec, &cont2.codec) {
1341                (Some(c1), Some(c2)) => Some(ColumnsCodec::new_from([c1, c2])),
1342                _ => None,
1343            };
1344
1345            Self {
1346                codec,
1347                inner: BatchContainer::merge_capacity(&cont1.inner, &cont2.inner),
1348                staging: Vec::new(),
1349                stats: None,
1350            }
1351        }
1352        #[inline]
1353        fn index(&self, index: usize) -> Self::ReadItem<'_> {
1354            let data = self.inner.index(index);
1355            let iter = if let Some(codec) = &self.codec {
1356                codec.decode(data)
1357            } else {
1358                // Safety: without a codec we only push rows or datumseqs into `self.inner`.
1359                // Each retrieved byte slice should be row-encoded data, as long as we have
1360                // not unset the codec in the interim.
1361                unsafe { ColumnsIter::without_codec(data) }
1362            };
1363            DatumSeq { iter }
1364        }
1365        #[inline(always)]
1366        fn len(&self) -> usize {
1367            self.inner.len()
1368        }
1369
1370        #[inline(always)]
1371        fn reborrow<'b, 'a: 'b>(item: Self::ReadItem<'a>) -> Self::ReadItem<'b> {
1372            item
1373        }
1374
1375        #[inline(always)]
1376        fn into_owned<'a>(item: Self::ReadItem<'a>) -> Self::Owned {
1377            // Fast path: unencoded data is already row-formatted bytes.
1378            if item.iter.index.is_none() {
1379                // SAFETY: `iter.data` is raw row-encoded bytes when there is no codec.
1380                return unsafe { Row::from_bytes_unchecked(item.iter.data) };
1381            }
1382            Row::pack(item)
1383        }
1384
1385        #[inline(always)]
1386        fn clone_onto<'a>(item: Self::ReadItem<'a>, other: &mut Self::Owned) {
1387            // Fast path: unencoded data is already row-formatted bytes.
1388            if item.iter.index.is_none() {
1389                let mut packer = other.packer();
1390                // SAFETY: `iter.data` is raw row-encoded bytes when there is no codec.
1391                unsafe { packer.extend_by_slice_unchecked(item.iter.data) };
1392                return;
1393            }
1394            other.packer().extend(item);
1395        }
1396
1397        #[inline(always)]
1398        fn push_ref(&mut self, item: Self::ReadItem<'_>) {
1399            // Fast path: both sides unencoded — push raw bytes directly.
1400            if self.codec.is_none() && self.stats.is_none() && item.iter.index.is_none() {
1401                self.inner.push_ref(item.iter.data);
1402                return;
1403            }
1404            self.push_into(item);
1405        }
1406
1407        #[inline(always)]
1408        fn push_own(&mut self, item: &Self::Owned) {
1409            // Fast path: container is unencoded — push raw row bytes directly.
1410            if self.codec.is_none() && self.stats.is_none() {
1411                self.inner.push_ref(item.data());
1412                return;
1413            }
1414            self.push_into(item);
1415        }
1416
1417        #[inline(always)]
1418        fn clear(&mut self) {
1419            self.inner.clear();
1420            self.staging.clear();
1421            // Reset to the same state as a fresh `with_capacity`: drop any installed
1422            // codec and restore stats gathering (if compression is enabled). Keeping a
1423            // now-empty codec would leave `codec.is_some()`, which permanently routes
1424            // pushes down the encode path with an empty dictionary and prevents the
1425            // `STATS_THRESHOLD` install logic from ever re-engaging compression.
1426            self.codec = None;
1427            self.stats = if crate::DICTIONARY_COMPRESSION.load(std::sync::atomic::Ordering::Relaxed)
1428            {
1429                Some(Default::default())
1430            } else {
1431                None
1432            };
1433        }
1434    }
1435
1436    impl DatumContainer {
1437        /// Visit contained allocations to determine their size and capacity.
1438        #[inline]
1439        pub fn heap_size(&self, mut callback: impl FnMut(usize, usize)) {
1440            self.inner.heap_size(&mut callback);
1441            // The staging buffer and the (possibly absent) codec and stats gatherer all
1442            // hold heap allocations that the bare `inner` accounting misses.
1443            callback(self.staging.len(), self.staging.capacity());
1444            if let Some(codec) = &self.codec {
1445                codec.heap_size(&mut callback);
1446            }
1447            if let Some(stats) = &self.stats {
1448                stats.heap_size(&mut callback);
1449            }
1450        }
1451
1452        /// Promote a gathered-but-uninstalled statistics summary into the codec slot.
1453        ///
1454        /// A container filled via the builder's `push`/`done` path — as the `reduce`
1455        /// operator does, building batches with `Builder::new()` + `push` + `done`
1456        /// rather than `seal` — gathers statistics on every push but never reaches
1457        /// `seal`'s codec install, and only crosses the mid-formation
1458        /// `STATS_THRESHOLD` install if it grows past it. A smaller such container
1459        /// would otherwise be finalized with no codec at all, even with the flag on.
1460        ///
1461        /// That is a problem not because this batch needs compressing — its rows are
1462        /// already stored raw and we deliberately do *not* re-encode them here — but
1463        /// because a codec-less batch poisons future merges: [`Self::merge_capacity`]
1464        /// keys off the presence of a codec, so a codec-less input forces the merged
1465        /// container onto the uncompressed path. Moving the gathered statistics into
1466        /// the codec slot leaves the batch carrying a codec whose retained heavy-hitter
1467        /// summary a later merge can rebuild from via `ColumnsCodec::new_from`, while
1468        /// installing no dictionary: the empty `decode` map resolves every stored
1469        /// (raw) column through the literal-datum fall-through, so reads stay correct.
1470        ///
1471        /// We move the summary as-is rather than building a dictionary via `new_safe`
1472        /// / `new_from` (which reset the summary): unlike `seal` and the mid-formation
1473        /// install, `done` has no further rows to re-observe, so a reset summary would
1474        /// leave the eventual merge nothing to rebuild from.
1475        pub(crate) fn promote_stats_to_codec(&mut self) {
1476            if self.codec.is_none() {
1477                self.codec = self.stats.take();
1478            }
1479        }
1480    }
1481
1482    use timely::container::PushInto;
1483    impl PushInto<Row> for DatumContainer {
1484        #[inline(always)]
1485        fn push_into(&mut self, item: Row) {
1486            self.push_into(&item);
1487        }
1488    }
1489
1490    impl PushInto<&Row> for DatumContainer {
1491        #[inline(always)]
1492        fn push_into(&mut self, item: &Row) {
1493            self.push_into(DatumSeq::borrow_as(item));
1494        }
1495    }
1496
1497    impl PushInto<&RowRef> for DatumContainer {
1498        #[inline(always)]
1499        fn push_into(&mut self, item: &RowRef) {
1500            self.push_into(DatumSeq::borrow_as(item));
1501        }
1502    }
1503
1504    /// Number of pushes a from-scratch container observes before it turns its
1505    /// gathered stats into a safe codec.
1506    ///
1507    /// A safe codec has at most `256 - SAFE_TAG_BASE` (= 134) dictionary slots per
1508    /// column, so we only need to identify ~134 genuinely-popular values. The
1509    /// `MisraGries` summary retains up to `2 * k` (= 1024) distinct candidates
1510    /// between tidies and reduces to `k` (= 512), comfortably more than 134, so the
1511    /// threshold just needs to be large enough that heavy hitters accumulate counts
1512    /// well above 1 before we freeze the codec. 64Ki pushes gives that headroom while
1513    /// keeping the pre-codec (uncompressed) window short.
1514    const STATS_THRESHOLD: usize = 64 * 1024;
1515
1516    impl PushInto<DatumSeq<'_>> for DatumContainer {
1517        #[inline]
1518        fn push_into(&mut self, item: DatumSeq<'_>) {
1519            // Fast path: container and item are both unencoded.
1520            // This is the hot path when dictionary compression is disabled.
1521            if self.codec.is_none() && self.stats.is_none() && item.iter.index.is_none() {
1522                self.inner.push_ref(item.iter.data);
1523                return;
1524            }
1525
1526            // Check if we've gathered enough stats to install a safe codec.
1527            if self.codec.is_none() && self.stats.is_some() && self.inner.len() >= STATS_THRESHOLD {
1528                let stats = self.stats.take().unwrap();
1529                self.codec = Some(stats.new_safe());
1530            }
1531
1532            if let Some(codec) = &mut self.codec {
1533                // Encode using the installed codec.
1534                codec.encode(item.bytes_iter(), &mut self.staging);
1535            } else if let Some(stats) = &mut self.stats {
1536                // Stats-gathering phase: feed the statistics but store raw bytes.
1537                // `observe` updates the heavy-hitter/tag summaries without encoding, so
1538                // we copy each row exactly once (below) instead of also encoding it into
1539                // a buffer we would immediately discard.
1540                stats.observe(item.bytes_iter());
1541                for slice in item.bytes_iter() {
1542                    self.staging.extend_from_slice(slice);
1543                }
1544            } else {
1545                // No codec, no stats: raw copy.
1546                for slice in item.bytes_iter() {
1547                    self.staging.extend_from_slice(slice);
1548                }
1549            }
1550            self.inner.push_ref(&self.staging[..]);
1551            self.staging.clear();
1552        }
1553    }
1554
1555    use mz_repr::{Datum, read_datum};
1556
    /// A reference that can be resolved to a sequence of `Datum`s.
    ///
    /// This type must "compare" as if decoded to a `Row`, which means it needs to track
    /// various nuances of `Row::cmp`, which at the moment is first by length, and then by
    /// the raw binary slice backing the row. Neither of those are explicit in this struct.
    /// We will need to produce them in order to perform comparisons.
    #[derive(Debug)]
    pub struct DatumSeq<'a> {
        /// Per-column view of the row's bytes. `iter.index` is `None` when `iter.data`
        /// is read without consulting a codec, and references the codec otherwise.
        pub iter: ColumnsIter<'a>,
    }
1567
1568    impl<'a> DatumSeq<'a> {
1569        #[inline(always)]
1570        fn borrow_as(other: &'a RowRef) -> Self {
1571            Self {
1572                iter: ColumnsCodec::borrow_row(other),
1573            }
1574        }
1575
1576        /// Borrow a `Row` as a `DatumSeq` so that it can be used to seek into a
1577        /// trace whose key/value container is a [`DatumContainer`].
1578        #[inline]
1579        pub fn from_row(row: &'a Row) -> Self {
1580            Self::borrow_as(row)
1581        }
1582
1583        #[inline]
1584        pub fn to_row(&self) -> Row {
1585            // Fast path: unencoded data is already row-formatted bytes.
1586            if self.iter.index.is_none() {
1587                return unsafe { Row::from_bytes_unchecked(self.iter.data) };
1588            }
1589            Row::pack(*self)
1590        }
1591    }
1592
    // `DatumSeq` only wraps a `ColumnsIter`, which is itself `Copy`, so copying it
    // is cheap. `Copy`/`Clone` are written by hand rather than derived.
    impl<'a> Copy for DatumSeq<'a> {}
    impl<'a> Clone for DatumSeq<'a> {
        /// Return a bitwise copy of the sequence.
        #[inline(always)]
        fn clone(&self) -> Self {
            // Canonical `Clone` for a `Copy` type.
            *self
        }
    }
1600
1601    use std::cmp::Ordering;
1602    impl<'a, 'b> PartialEq<DatumSeq<'a>> for DatumSeq<'b> {
1603        #[inline(always)]
1604        fn eq(&self, other: &DatumSeq<'a>) -> bool {
1605            // Fast path: both sides are unencoded raw row bytes.
1606            if self.iter.index.is_none() && other.iter.index.is_none() {
1607                return self.iter.data == other.iter.data;
1608            }
1609            Iterator::eq(self.iter, other.iter)
1610        }
1611    }
1612    impl<'a> Eq for DatumSeq<'a> {}
1613    impl<'a, 'b> PartialOrd<DatumSeq<'a>> for DatumSeq<'b> {
1614        #[inline(always)]
1615        fn partial_cmp(&self, other: &DatumSeq<'a>) -> Option<Ordering> {
1616            // Fast path: both sides are unencoded raw row bytes.
1617            if self.iter.index.is_none() && other.iter.index.is_none() {
1618                let left = self.iter.data;
1619                let right = other.iter.data;
1620                return Some(match left.len().cmp(&right.len()) {
1621                    Ordering::Equal => left.cmp(right),
1622                    other => other,
1623                });
1624            }
1625            // Slow path: at least one side is dictionary-encoded.
1626            // Fused length + lexicographic comparison in a single pass per side.
1627            // Row ordering is: shorter < longer; equal lengths compared lexicographically.
1628            //
1629            // We compare byte-by-byte (via `flatten`) rather than slice-by-slice on
1630            // purpose: a dictionary tag expands to a multi-byte value on one side while
1631            // the other side may store those same bytes raw, so the per-column slice
1632            // boundaries do not line up between the two iterators. Decoding to a flat
1633            // byte stream is the only representation in which both sides are directly
1634            // comparable. This path is cold — it only runs when at least one operand is
1635            // dictionary-encoded; the common unencoded case is handled by the fast path
1636            // above with a single slice comparison.
1637            let mut left = self.iter.flatten();
1638            let mut right = other.iter.flatten();
1639            let mut first_diff = Ordering::Equal;
1640            loop {
1641                match (left.next(), right.next()) {
1642                    (Some(l), Some(r)) => {
1643                        if first_diff == Ordering::Equal {
1644                            first_diff = l.cmp(r);
1645                        }
1646                    }
1647                    // Left exhausted first: left is shorter, so Less.
1648                    (None, Some(_)) => return Some(Ordering::Less),
1649                    // Right exhausted first: right is shorter, so Greater.
1650                    (Some(_), None) => return Some(Ordering::Greater),
1651                    // Same length: use first lexicographic difference.
1652                    (None, None) => return Some(first_diff),
1653                }
1654            }
1655        }
1656    }
1657    impl<'a> Ord for DatumSeq<'a> {
1658        #[inline(always)]
1659        fn cmp(&self, other: &Self) -> Ordering {
1660            self.partial_cmp(other).unwrap()
1661        }
1662    }
1663
1664    impl<'a> PartialEq<&'a Row> for DatumSeq<'a> {
1665        #[inline(always)]
1666        fn eq(&self, other: &&'a Row) -> bool {
1667            self.eq(&Self::borrow_as(*other))
1668        }
1669    }
1670
1671    // Lifetimes decoupled (`'b` independent of `'a`): the arrange machinery
1672    // requires `for<'b> DatumSeq<'a>: PartialEq<&'b RowRef>`, i.e. a fixed
1673    // `DatumSeq` must compare against a `&RowRef` of any lifetime.
1674    impl<'a, 'b> PartialEq<&'b RowRef> for DatumSeq<'a> {
1675        #[inline(always)]
1676        fn eq(&self, other: &&'b RowRef) -> bool {
1677            self.eq(&DatumSeq::borrow_as(*other))
1678        }
1679    }
1680
1681    impl<'a> DatumSeq<'a> {
1682        #[inline(always)]
1683        pub fn bytes_iter(self) -> ColumnsIter<'a> {
1684            self.iter
1685        }
1686    }
1687
1688    impl<'a> Iterator for DatumSeq<'a> {
1689        type Item = Datum<'a>;
1690        #[inline(always)]
1691        fn next(&mut self) -> Option<Self::Item> {
1692            // Delegate to `ColumnsIter`, which handles both the codec and no-codec
1693            // cases. The no-codec scan hot path is served directly by `extend_datums`
1694            // (which decodes without going through this iterator), so the only callers
1695            // left here are the codec-encoded `extend_datums`/`to_row` paths and tests;
1696            // none warrant a dedicated no-codec fast path.
1697            self.iter
1698                .next()
1699                .map(|mut bytes| unsafe { read_datum(&mut bytes) })
1700        }
1701    }
1702
1703    use mz_repr::RowArena;
1704    use mz_repr::fixed_length::ExtendDatums;
1705    impl<'long> ExtendDatums for DatumSeq<'long> {
1706        #[inline]
1707        fn extend_datums<'a>(
1708            &'a self,
1709            _arena: &'a RowArena,
1710            target: &mut Vec<Datum<'a>>,
1711            max: Option<usize>,
1712        ) {
1713            // Branch on codec presence ONCE per row rather than once per datum.
1714            // With no codec (the common, feature-off case) push raw datums in a
1715            // tight loop, matching the pre-dictionary path; with a codec, fall
1716            // back to the per-column iterator. This keeps the codec check out of
1717            // the per-datum loop — the source of the feature-off scan overhead.
1718            if self.iter.index.is_none() {
1719                let mut data = self.iter.data;
1720                match max {
1721                    Some(max) => {
1722                        let mut n = 0;
1723                        while n < max && !data.is_empty() {
1724                            target.push(unsafe { read_datum(&mut data) });
1725                            n += 1;
1726                        }
1727                    }
1728                    None => {
1729                        while !data.is_empty() {
1730                            target.push(unsafe { read_datum(&mut data) });
1731                        }
1732                    }
1733                }
1734            } else {
1735                match max {
1736                    Some(max) => target.extend((*self).take(max)),
1737                    None => target.extend(*self),
1738                }
1739            }
1740        }
1741    }
1742}
1743
/// Types implementing the encoding and decoding of row-encoded byte sequences.
///
/// It is unsafe to use these types to encode byte sequences that are not row-encoded,
/// as they are parsed out of contiguous `[u8]` slices using `mz_repr::read_datum`.
1748mod row_codec {
1749
1750    pub use self::misra_gries::MisraGries;
1751    pub use columns::{ColumnsCodec, ColumnsIter};
1752    pub use dictionary::DictionaryCodec;
1753    #[cfg(test)]
1754    pub use dictionary::SAFE_TAG_BASE;
1755
1756    // Deterministic hasher state for the codecs' hash maps: a fixed-seed
1757    // `ahash::RandomState` shared with `mz_timely_util`'s consolidation hasher, so
1758    // the heavy-hitter summaries — and therefore which values each codec compresses
1759    // — are identical across runs and replicas, as the old `BTreeMap` backing was.
1760    use mz_timely_util::hash::fixed_state;
1761
1762    // The codecs encode and decode `[u8]` data specific to the `[Row]` encoding. They
1763    // soundly decode data they themselves encoded from valid `[Row]` data, but may be
1764    // unsound if asked to decode data that was not row-encoded, or was encoded with a
1765    // different codec. `ColumnsCodec` (a per-column wrapper around `DictionaryCodec`) is
1766    // the only codec the spine instantiates; the methods are inherent rather than behind
1767    // a `Codec` trait because nothing ever dispatches over codecs generically.
1768
1769    mod columns {
1770
1771        use mz_repr::{RowRef, read_datum};
1772
1773        use super::DictionaryCodec;
1774
        /// Independently encodes each column.
        ///
        /// The same type serves as a statistics gatherer (fed through `observe`) and
        /// as an installed codec (used through `encode` / `decode`).
        #[derive(Default, Debug)]
        pub struct ColumnsCodec {
            /// One dictionary codec per column position; `encode` and `observe` append
            /// default entries when a row has more columns than seen so far.
            columns: Vec<DictionaryCodec>,
        }
1780
1781        impl ColumnsCodec {
1782            /// Decode a row-encoded byte slice into per-column byte slices.
1783            pub(crate) fn decode<'a>(&'a self, bytes: &'a [u8]) -> ColumnsIter<'a> {
1784                ColumnsIter {
1785                    index: Some(self),
1786                    column: 0,
1787                    data: bytes,
1788                }
1789            }
1790            /// Encode a sequence of column byte slices, updating per-column statistics.
1791            pub(crate) fn encode<'a, I>(&mut self, iter: I, output: &mut Vec<u8>)
1792            where
1793                I: IntoIterator<Item = &'a [u8]>,
1794            {
1795                for (index, bytes) in iter.into_iter().enumerate() {
1796                    if self.columns.len() <= index {
1797                        self.columns.push(Default::default());
1798                    }
1799                    self.columns[index].encode(std::iter::once(bytes), output);
1800                }
1801            }
1802
1803            /// Construct a codec valid for the union of the supplied codecs' data.
1804            pub(crate) fn new_from<'a>(stats: impl IntoIterator<Item = &'a Self>) -> Self {
1805                // An empty `stats` iterator yields a zero-column codec, which encodes and
1806                // decodes nothing; callers merging no inputs get an inert (but sound) codec.
1807                let stats = stats.into_iter().collect::<Vec<_>>();
1808                let cols = stats.iter().map(|s| s.columns.len()).max().unwrap_or(0);
1809                let mut columns = Vec::with_capacity(cols);
1810                let default: DictionaryCodec = Default::default();
1811                for index in 0..cols {
1812                    columns.push(DictionaryCodec::new_from(
1813                        stats
1814                            .iter()
1815                            .map(|s| s.columns.get(index).unwrap_or(&default)),
1816                    ));
1817                }
1818                Self { columns }
1819            }
1820
1821            /// Reveal a row's bytes for fast-path comparison, with no codec to consult.
1822            #[inline(always)]
1823            pub(crate) fn borrow_row(row: &RowRef) -> ColumnsIter<'_> {
1824                ColumnsIter {
1825                    index: None,
1826                    column: 0,
1827                    data: row.data(),
1828                }
1829            }
1830        }
1831
1832        impl ColumnsCodec {
1833            /// Visit contained allocations to determine their size and capacity.
1834            pub(crate) fn heap_size(&self, callback: &mut impl FnMut(usize, usize)) {
1835                let elem = std::mem::size_of::<DictionaryCodec>();
1836                callback(self.columns.len() * elem, self.columns.capacity() * elem);
1837                for column in &self.columns {
1838                    column.heap_size(callback);
1839                }
1840            }
1841        }
1842
1843        impl ColumnsCodec {
1844            /// Record a row's column values in the statistics without encoding.
1845            ///
1846            /// Used during the stats-gathering phase, where we want the heavy-hitter
1847            /// and tag-usage information but store the row raw, so encoding into a
1848            /// throwaway buffer would be pure waste.
1849            #[inline]
1850            pub(crate) fn observe<'a, I>(&mut self, iter: I)
1851            where
1852                I: IntoIterator<Item = &'a [u8]>,
1853            {
1854                for (index, bytes) in iter.into_iter().enumerate() {
1855                    if self.columns.len() <= index {
1856                        self.columns.push(Default::default());
1857                    }
1858                    self.columns[index].observe(bytes);
1859                }
1860            }
1861        }
1862
1863        impl ColumnsCodec {
1864            /// Construct a codec using only structurally safe tags.
1865            ///
1866            /// Consumes `self`: this is only ever called on stats that have just been
1867            /// `take`n out of a container and are about to be discarded, so we move the
1868            /// per-column `MisraGries` summaries through rather than cloning them.
1869            pub(crate) fn new_safe(self) -> Self {
1870                let columns = self
1871                    .columns
1872                    .into_iter()
1873                    .map(DictionaryCodec::new_safe)
1874                    .collect();
1875                Self { columns }
1876            }
1877        }
1878
        /// Iterator over the per-column byte slices of one row.
        ///
        /// Each item is either a dictionary entry (when the codec maps the column's
        /// leading byte to one) or the raw bytes of a single datum.
        #[derive(Debug, Copy, Clone)]
        pub struct ColumnsIter<'a> {
            // `None` when iterating an owned row directly, with no codec to consult.
            pub index: Option<&'a ColumnsCodec>,
            // Position of the next column to yield; selects the per-column dictionary.
            pub column: usize,
            // Bytes not yet consumed; iteration ends once this is empty.
            pub data: &'a [u8],
        }
1886
1887        impl<'a> Iterator for ColumnsIter<'a> {
1888            type Item = &'a [u8];
1889            #[inline(always)]
1890            fn next(&mut self) -> Option<Self::Item> {
1891                if self.data.is_empty() {
1892                    None
1893                } else if let Some(bytes) = self
1894                    .index
1895                    .as_ref()
1896                    .and_then(|i| i.columns.get(self.column))
1897                    .and_then(|i| i.decode.get(self.data[0].into()))
1898                {
1899                    self.data = &self.data[1..];
1900                    self.column += 1;
1901                    Some(bytes)
1902                } else {
1903                    let mut data = self.data;
1904                    let data_len = data.len();
1905                    unsafe {
1906                        read_datum(&mut data);
1907                    }
1908                    let (prev, next) = self.data.split_at(data_len - data.len());
1909                    self.data = next;
1910                    self.column += 1;
1911                    Some(prev)
1912                }
1913            }
1914        }
1915
        impl<'a> ColumnsIter<'a> {
            /// Create a column iterator without a codec.
            ///
            /// This requires the data to be row-formatted, and it will be erroneous otherwise.
            ///
            /// # Safety
            ///
            /// `data` must be row-formatted bytes: iteration parses it with
            /// `read_datum` inside an `unsafe` block and does not validate the input.
            #[inline(always)]
            pub unsafe fn without_codec(data: &'a [u8]) -> Self {
                Self {
                    // No codec: every column is read as a literal datum.
                    index: None,
                    column: 0,
                    data,
                }
            }
        }
1929    }
1930
1931    /// A dictionary encoding codec for `[Row]` data.
1932    ///
1933    /// The dictionary harvests unused tags within each column and uses them to
1934    /// represent popular values within that column. There are two mechanisms it
1935    /// uses to accomplish this:
1936    ///
1937    /// 1. Statically free tags: `SAFE_TAG_BASE` is taken as an exclusive upper bound
1938    ///    on the tags that will be used by `[Row]`, and tags greater or equal to this
1939    ///    value are always safe to use.
1940    /// 2. Dynamically free tags: having seen an entire collection, we can use any
1941    ///    tag not otherwise used by the collection, as it would not be ambiguous.
1942    ///
    /// It goes without saying that if either of these approaches is incorrect,
    /// there are calamitous unsoundness implications.
1945    mod dictionary {
1946        // The `encode` map is a pure value->tag lookup table (never iterated for logic),
1947        // so `mz_ore::collections::HashMap`'s order-hiding would suffice — but it offers
1948        // no fixed-seed constructor, and we want the same deterministic hasher as the
1949        // summary above. `heap_size`'s `keys()` walk is an order-insensitive sum.
1950        #![allow(clippy::disallowed_types)]
1951
1952        use std::collections::HashMap;
1953
1954        use super::fixed_state;
1955        pub use super::{BytesMap, MisraGries};
1956
        /// First byte value that is structurally unused by the datum encoding.
        /// All byte values >= this are safe to use as dictionary tags without
        /// observing the data, since no datum's first byte can have this value.
        /// With the current value this leaves `256 - 122 = 134` safe tags per column.
        ///
        /// `mz_repr`'s `Row` `Tag` enum currently has 94 variants (discriminants
        /// 0..=93), so the truly tight bound is 94. We deliberately pick a larger,
        /// round-ish constant to leave headroom for new tags without having to also
        /// bump the safe set, and the `test_safe_tag_base` test pins the real
        /// invariant: every datum the row format produces must encode with a first
        /// byte strictly less than this value. If a future tag crosses the boundary
        /// that test fails loudly rather than silently corrupting decoding.
        pub const SAFE_TAG_BASE: u8 = 122;
1969
        /// Per-column dictionary codec. Encodes column byte slices, replacing popular
        /// values with spare tags; decoding is performed by `ColumnsIter` reading the
        /// `decode` map directly.
        #[derive(Default, Debug)]
        pub struct DictionaryCodec {
            // Looked up once per value on the encode path; mostly misses (only popular
            // values compress), so a hash map beats a `BTreeMap`'s byte-slice walk. The
            // map is only ever read via `get` — never iterated — so its hasher seed has
            // no observable effect; the populated maps are built with `fixed_state` in
            // `new_from`/`new_safe` for consistency, while the derived-`Default` (stats
            // accumulator) variant stays empty and is never consulted.
            encode: HashMap<Vec<u8>, u8, ahash::RandomState>,
            // Tag -> dictionary value, indexed by the tag byte; `ColumnsIter` reads it
            // directly when decoding.
            pub decode: BytesMap,
            // `.0`: heavy-hitter summary of observed column values.
            // `.1`: 256-bit bitmap of observed first bytes; tag `t` is recorded as bit
            //       `t >> 2` of word `t % 4`.
            stats: (MisraGries<Vec<u8>>, [u64; 4]),
        }
1985
1986        impl DictionaryCodec {
1987            /// Encode a sequence of byte slices.
1988            ///
1989            /// Encoding also records statistics about the structure of the input.
1990            ///
1991            /// Decoding has no symmetric method here: a column's bytes are decoded by
1992            /// `ColumnsIter`, which consults the `decode` map directly.
1993            pub(super) fn encode<'a, I>(&mut self, iter: I, output: &mut Vec<u8>)
1994            where
1995                I: IntoIterator<Item = &'a [u8]>,
1996            {
1997                for bytes in iter.into_iter() {
1998                    debug_assert!(
1999                        !bytes.is_empty(),
2000                        "row encoding never yields empty column slices",
2001                    );
2002                    // If we have an index referencing `bytes`, use the index key.
2003                    if let Some(b) = self.encode.get(bytes) {
2004                        output.push(*b);
2005                    } else {
2006                        // Raw fall-through. Soundness rests on `bytes[0]` never being a
2007                        // tag we hand out as a dictionary key: `new_from`/`new_safe` only
2008                        // assign dictionary tags from first-byte values that were never
2009                        // observed (or are `>= SAFE_TAG_BASE`, which no datum first-byte
2010                        // can equal). If a literal datum's first byte collided with a
2011                        // dictionary tag, `decode` would resolve it to the dictionary
2012                        // entry instead of reading the datum. This `debug_assert` makes
2013                        // the load-bearing "no later first-byte outside the observed
2014                        // union" invariant self-checking.
2015                        debug_assert!(
2016                            self.decode.get(bytes[0].into()).is_none(),
2017                            "raw datum first-byte {} collides with a dictionary tag; \
2018                             decode would be ambiguous",
2019                            bytes[0],
2020                        );
2021                        output.extend(bytes);
2022                    }
2023                    self.observe(bytes);
2024                }
2025            }
2026
2027            /// Construct a new encoder from supplied statistics.
2028            pub(super) fn new_from<'a>(stats: impl IntoIterator<Item = &'a Self>) -> Self {
2029                // Collect most popular bytes from combined containers.
2030                let mut mg = MisraGries::default();
2031                let mut tags: [u64; 4] = [0; 4];
2032                for stat in stats.into_iter() {
2033                    for (thing, count) in stat.stats.0.clone().done() {
2034                        mg.update(thing, count);
2035                    }
2036                    tags[0] |= stat.stats.1[0];
2037                    tags[1] |= stat.stats.1[1];
2038                    tags[2] |= stat.stats.1[2];
2039                    tags[3] |= stat.stats.1[3];
2040                }
2041                let mut mg = mg
2042                    .done()
2043                    .into_iter()
2044                    .filter(|(next_bytes, count)| next_bytes.len() > 1 && count > &1);
2045                // Establish encoding and decoding rules.
2046                let mut encode = HashMap::with_hasher(fixed_state());
2047                let mut decode = BytesMap::default();
2048                for tag in 0..=255 {
2049                    let tag_idx: usize = (tag % 4).into();
2050                    let shift = tag >> 2;
2051                    if (tags[tag_idx] >> shift) & 0x01 != 0 {
2052                        // Tag is used by a literal datum first-byte; reserve the slot.
2053                        decode.push(None);
2054                    } else if let Some((next_bytes, _count)) = mg.next() {
2055                        decode.push(Some(&next_bytes[..]));
2056                        encode.insert(next_bytes, tag);
2057                    } else {
2058                        // Unused tag, but the heavy-hitter supply is exhausted. We must
2059                        // still push a slot so that `decode`'s index stays aligned with
2060                        // the tag value: every iteration pushes exactly once, keeping the
2061                        // map length 256 and `decode.get(tag)` addressable by tag.
2062                        decode.push(None);
2063                    }
2064                }
2065
2066                Self {
2067                    encode,
2068                    decode,
2069                    stats: (MisraGries::default(), [0u64; 4]),
2070                }
2071            }
2072        }
2073
2074        impl DictionaryCodec {
2075            /// Visit contained allocations to determine their size and capacity.
2076            ///
2077            /// The `encode` table is approximated as one logical entry's worth of bytes
2078            /// per element for size and its reserved `capacity()` for capacity; the
2079            /// dominant terms (the owned key bytes and the `decode` map's byte arena)
2080            /// are accounted exactly.
2081            pub fn heap_size(&self, callback: &mut impl FnMut(usize, usize)) {
2082                let entry = std::mem::size_of::<(Vec<u8>, u8)>();
2083                callback(self.encode.len() * entry, self.encode.capacity() * entry);
2084                for key in self.encode.keys() {
2085                    callback(key.len(), key.capacity());
2086                }
2087                self.decode.heap_size(callback);
2088                self.stats.0.heap_size(callback);
2089            }
2090
2091            /// Record a single column value in this codec's statistics without
2092            /// producing any encoded output.
2093            ///
2094            /// Statistics come in two decoupled parts, with very different costs and
2095            /// purposes:
2096            ///
2097            /// 1. The tag bitmap (`stats.1`) records which first-byte values have been
2098            ///    observed. It is cheap (four `u64` ORs) and *soundness critical*:
2099            ///    `new_from`'s dynamic-tag path only hands out tags that this bitmap
2100            ///    reports as unused, so it must stay accurate for the entire life of the
2101            ///    codec, including on the hot encode path.
2102            /// 2. The MisraGries summary (`stats.0`) tracks heavy hitters and only
2103            ///    affects *which* values a future codec compresses, never correctness.
2104            ///    It is the expensive part (a `BTreeMap` insert per column per row). We
2105            ///    keep feeding it after install, on the hot encode path, on purpose: a
2106            ///    later merge rebuilds the merged codec from these summaries via
2107            ///    `new_from`. If we froze the summary at install time, then as the
2108            ///    collection evolves — records cancel under consolidation, the popular
2109            ///    set drifts — the codec could never reclaim slots for newly-popular
2110            ///    values and would eventually be left compressing values that no longer
2111            ///    occur, ceasing to compress the ones that do.
2112            #[inline]
2113            pub fn observe(&mut self, bytes: &[u8]) {
2114                debug_assert!(
2115                    !bytes.is_empty(),
2116                    "row encoding never yields empty column slices",
2117                );
2118                let tag = bytes[0];
2119                let tag_idx: usize = (tag % 4).into();
2120                self.stats.1[tag_idx] |= 1 << (tag >> 2);
2121                self.stats.0.insert_ref(bytes);
2122            }
2123
2124            /// Construct a codec using only structurally safe tags (>= SAFE_TAG_BASE).
2125            /// These tags never collide with datum first-bytes, so the codec can be
2126            /// installed without observing all data first.
2127            pub(super) fn new_safe(stats: Self) -> Self {
2128                // The container stores its pre-install rows raw, so the first-byte
2129                // bitmap (`stats.1`) gathered while observing them must carry over to
2130                // the installed codec. The bitmap is soundness-critical: a later
2131                // `new_from` merge consults it to decide which one-byte tags are free
2132                // to hand out as dictionary keys. If we dropped it here, the merge
2133                // could assign a dictionary tag equal to a pre-install datum's first
2134                // byte, after which `decode` would resolve that literal datum to the
2135                // dictionary entry. The MisraGries summary (`stats.0`), by contrast,
2136                // is consumed below to seed the dictionary and is reset, since the
2137                // installed codec re-accumulates it from rows it sees post-install.
2138                let (mg, observed_tags) = stats.stats;
2139                let mut mg = mg
2140                    .done()
2141                    .into_iter()
2142                    .filter(|(next_bytes, count)| next_bytes.len() > 1 && count > &1);
2143                let mut encode = HashMap::with_hasher(fixed_state());
2144                let mut decode = BytesMap::default();
2145                // Fill slots 0..SAFE_TAG_BASE with None (reserved for datum tags).
2146                for _ in 0..SAFE_TAG_BASE {
2147                    decode.push(None);
2148                }
2149                // Assign dictionary entries to safe tags.
2150                for tag in SAFE_TAG_BASE..=255 {
2151                    if let Some((next_bytes, _count)) = mg.next() {
2152                        decode.push(Some(&next_bytes[..]));
2153                        encode.insert(next_bytes, tag);
2154                    }
2155                }
2156                Self {
2157                    encode,
2158                    decode,
2159                    stats: (MisraGries::default(), observed_tags),
2160                }
2161            }
2162        }
2163    }
2164
2165    /// A map from `0 .. something` to `Option<&[u8]>`.
2166    ///
2167    /// Non-empty slices are pushed in order, and can be retrieved by index.
2168    /// Pushing an empty slice is equivalent to pushing `None`.
2169    #[derive(Debug)]
2170    pub struct BytesMap {
2171        offsets: Vec<usize>,
2172        bytes: Vec<u8>,
2173    }
2174    impl Default for BytesMap {
2175        #[inline(always)]
2176        fn default() -> Self {
2177            Self {
2178                offsets: vec![0],
2179                bytes: Vec::new(),
2180            }
2181        }
2182    }
2183    impl BytesMap {
2184        #[inline]
2185        fn push(&mut self, input: Option<&[u8]>) {
2186            if let Some(bytes) = input {
2187                self.bytes.extend(bytes);
2188            }
2189            self.offsets.push(self.bytes.len());
2190        }
2191        /// Visit contained allocations to determine their size and capacity.
2192        fn heap_size(&self, callback: &mut impl FnMut(usize, usize)) {
2193            let off = std::mem::size_of::<usize>();
2194            callback(self.offsets.len() * off, self.offsets.capacity() * off);
2195            callback(self.bytes.len(), self.bytes.capacity());
2196        }
2197        #[inline]
2198        fn get(&self, index: usize) -> Option<&[u8]> {
2199            if index < self.offsets.len() - 1 {
2200                let lower = self.offsets[index];
2201                let upper = self.offsets[index + 1];
2202                if lower < upper {
2203                    Some(&self.bytes[lower..upper])
2204                } else {
2205                    None
2206                }
2207            } else {
2208                None
2209            }
2210        }
2211    }
2212
2213    mod misra_gries {
2214        // The summary must iterate its entries (to extract heavy hitters in `done`, to
2215        // `tidy`, and to size itself), which `mz_ore::collections::HashMap` deliberately
2216        // forbids. We instead get determinism from the fixed-seed hasher (`fixed_state`)
2217        // plus the total-order sort in `done`; `tidy`/`heap_size` are order-insensitive.
2218        #![allow(clippy::disallowed_types)]
2219
2220        use std::collections::HashMap;
2221        use std::hash::Hash;
2222
2223        use super::fixed_state;
2224
2225        /// Maintains a summary of "heavy hitters" in a presented collection of items.
2226        ///
2227        /// Uses a hash map internally so that repeated observations of the same
2228        /// element only allocate once (on first sighting), and so the per-element
2229        /// `insert_ref` is an O(1) hash rather than an O(log n) walk of byte-slice
2230        /// comparisons. This is the hot path: one lookup per column per row, fed both
2231        /// while gathering stats and on the steady-state encode path. The hasher is
2232        /// fixed-seed (see [`fixed_state`]) so the summary — and thus which values a
2233        /// codec compresses — stays deterministic across runs and replicas.
2234        ///
2235        /// Tidy is performed when the number of *distinct* elements exceeds `2 * k`,
2236        /// reducing to at most `k` entries.
2237        #[derive(Clone, Debug)]
2238        pub struct MisraGries<T: Ord + Hash> {
2239            inner: HashMap<T, usize, ahash::RandomState>,
2240            k: usize,
2241        }
2242
2243        impl<T: Ord + Hash> Default for MisraGries<T> {
2244            #[inline(always)]
2245            fn default() -> Self {
2246                Self {
2247                    inner: HashMap::with_hasher(fixed_state()),
2248                    k: 512,
2249                }
2250            }
2251        }
2252
2253        impl<T: Ord + Hash> MisraGries<T> {
2254            /// Inserts an additional element to the summary.
2255            #[inline(always)]
2256            pub fn insert(&mut self, element: T) {
2257                self.update(element, 1);
2258            }
2259            /// Inserts multiple copies of an element to the summary.
2260            #[inline]
2261            pub fn update(&mut self, element: T, count: usize) {
2262                *self.inner.entry(element).or_insert(0) += count;
2263                if self.inner.len() > 2 * self.k {
2264                    self.tidy();
2265                }
2266            }
2267
2268            /// Completes the summary, and extracts the items and their counts.
2269            pub fn done(self) -> Vec<(T, usize)> {
2270                let mut result: Vec<_> = self.inner.into_iter().collect();
2271                // Descending count, ties broken by key, so the values a codec selects
2272                // are deterministic regardless of hash-map iteration order.
2273                result.sort_by(|x, y| y.1.cmp(&x.1).then_with(|| x.0.cmp(&y.0)));
2274                result
2275            }
2276
2277            /// Reduces the summary down to at most `k` distinct items by
2278            /// subtracting the (k+1)-th largest count from all entries and
2279            /// discarding those that drop to zero or below.
2280            fn tidy(&mut self) {
2281                let mut counts: Vec<usize> = self.inner.values().copied().collect();
2282                counts.sort_unstable_by(|a, b| b.cmp(a));
2283                // The (k+1)-th largest count, or 0 if fewer than k+1 entries.
2284                let sub_weight = counts.get(self.k).copied().unwrap_or(0);
2285                if sub_weight > 0 {
2286                    self.inner.retain(|_, count| {
2287                        *count = count.saturating_sub(sub_weight);
2288                        *count > 0
2289                    });
2290                }
2291            }
2292        }
2293
2294        impl MisraGries<Vec<u8>> {
2295            /// Visit contained allocations to determine their size and capacity.
2296            ///
2297            /// The hash table is approximated as one logical entry per element for
2298            /// size and its reserved `capacity()` for capacity; the owned key bytes
2299            /// are accounted exactly.
2300            pub fn heap_size(&self, callback: &mut impl FnMut(usize, usize)) {
2301                let entry = std::mem::size_of::<(Vec<u8>, usize)>();
2302                callback(self.inner.len() * entry, self.inner.capacity() * entry);
2303                for key in self.inner.keys() {
2304                    callback(key.len(), key.capacity());
2305                }
2306            }
2307
2308            /// Insert a borrowed byte slice, only allocating if the key is new.
2309            #[inline]
2310            pub fn insert_ref(&mut self, element: &[u8]) {
2311                if let Some(count) = self.inner.get_mut(element) {
2312                    *count += 1;
2313                } else {
2314                    self.insert(element.to_owned());
2315                }
2316            }
2317        }
2318
2319        impl<T: Ord + Hash> std::ops::AddAssign for MisraGries<T> {
2320            fn add_assign(&mut self, rhs: Self) {
2321                for (element, count) in rhs.done() {
2322                    self.update(element, count);
2323                }
2324            }
2325        }
2326    }
2327}
mz_row_spine/lib.rs

mz_row_spine/
lib.rs