mz_persist_types/stats/
structured.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use std::collections::BTreeMap;
11
12use mz_proto::{ProtoType, RustType, TryFromProtoError};
13use proptest::prelude::*;
14use proptest::strategy::Strategy;
15use proptest_derive::Arbitrary;
16use serde::ser::{SerializeMap, SerializeStruct};
17
18use crate::stats::{
19    ColumnStatKinds, ColumnStats, ColumnarStats, DynStats, OptionStats, ProtoStructStats,
20    TrimStats, any_columnar_stats, proto_dyn_stats,
21};
22
23/// Statistics about a column of a struct type with a uniform schema (the same
24/// columns and associated `T: Data` types in each instance of the struct).
25#[derive(Arbitrary, Default, Clone)]
26pub struct StructStats {
27    /// The count of structs in the column.
28    pub len: usize,
29    /// Statistics about each of the columns in the struct.
30    ///
31    /// This will often be all of the columns, but it's not guaranteed. Persist
32    /// reserves the right to prune statistics about some or all of the columns.
33    #[proptest(strategy = "any_struct_stats_cols()")]
34    pub cols: BTreeMap<String, ColumnarStats>,
35}
36
37impl std::fmt::Debug for StructStats {
38    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
39        std::fmt::Display::fmt(&self.debug_json(), f)
40    }
41}
42
43impl serde::Serialize for StructStats {
44    fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
45        let StructStats { len, cols } = self;
46        let mut s = s.serialize_struct("StructStats", 2)?;
47        let () = s.serialize_field("len", len)?;
48        let () = s.serialize_field("cols", &DynStatsCols(cols))?;
49        s.end()
50    }
51}
52
53impl DynStats for StructStats {
54    fn debug_json(&self) -> serde_json::Value {
55        let mut cols = serde_json::Map::new();
56        cols.insert("len".to_owned(), self.len.into());
57        for (name, stats) in self.cols.iter() {
58            cols.insert(name.clone(), stats.debug_json());
59        }
60        cols.into()
61    }
62    fn into_columnar_stats(self) -> ColumnarStats {
63        ColumnarStats {
64            nulls: None,
65            values: ColumnStatKinds::Struct(self),
66        }
67    }
68}
69
70impl StructStats {
71    /// Returns the statistics for the specified column in the struct, if they exist.
72    ///
73    /// This will often be all of the columns, but it's not guaranteed. Persist
74    /// reserves the right to prune statistics about some or all of the columns.
75    pub fn col(&self, name: &str) -> Option<&ColumnarStats> {
76        self.cols.get(name)
77    }
78}
79
80impl ColumnStats for StructStats {
81    type Ref<'a> = ();
82
83    fn lower<'a>(&'a self) -> Option<Self::Ref<'a>> {
84        // Not meaningful for structs
85        None
86    }
87    fn upper<'a>(&'a self) -> Option<Self::Ref<'a>> {
88        // Not meaningful for structs
89        None
90    }
91    fn none_count(&self) -> usize {
92        0
93    }
94}
95
96impl ColumnStats for OptionStats<StructStats> {
97    type Ref<'a> = Option<()>;
98
99    fn lower<'a>(&'a self) -> Option<Self::Ref<'a>> {
100        self.some.lower().map(Some)
101    }
102    fn upper<'a>(&'a self) -> Option<Self::Ref<'a>> {
103        self.some.upper().map(Some)
104    }
105    fn none_count(&self) -> usize {
106        self.none
107    }
108}
109
110impl RustType<ProtoStructStats> for StructStats {
111    fn into_proto(&self) -> ProtoStructStats {
112        ProtoStructStats {
113            len: self.len.into_proto(),
114            cols: self
115                .cols
116                .iter()
117                .map(|(k, v)| (k.into_proto(), RustType::into_proto(v)))
118                .collect(),
119        }
120    }
121
122    fn from_proto(proto: ProtoStructStats) -> Result<Self, TryFromProtoError> {
123        let mut cols = BTreeMap::new();
124        for (k, v) in proto.cols {
125            cols.insert(k.into_rust()?, v.into_rust()?);
126        }
127        Ok(StructStats {
128            len: proto.len.into_rust()?,
129            cols,
130        })
131    }
132}
133
134impl TrimStats for ProtoStructStats {
135    fn trim(&mut self) {
136        use proto_dyn_stats::*;
137
138        for value in self.cols.values_mut() {
139            match &mut value.kind {
140                Some(Kind::Primitive(stats)) => stats.trim(),
141                Some(Kind::Bytes(stats)) => stats.trim(),
142                Some(Kind::Struct(stats)) => stats.trim(),
143                Some(Kind::None(())) => (),
144                None => {}
145            }
146        }
147    }
148}
149
150struct DynStatsCols<'a>(&'a BTreeMap<String, ColumnarStats>);
151
152impl serde::Serialize for DynStatsCols<'_> {
153    fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
154        let mut s = s.serialize_map(Some(self.0.len()))?;
155        for (k, v) in self.0.iter() {
156            let v = v.debug_json();
157            let () = s.serialize_entry(k, &v)?;
158        }
159        s.end()
160    }
161}
162
163/// Returns a [`Strategy`] for generating arbitrary [`StructStats`].
164pub(crate) fn any_struct_stats_cols() -> impl Strategy<Value = BTreeMap<String, ColumnarStats>> {
165    proptest::collection::btree_map(any::<String>(), any_columnar_stats(), 1..5)
166}
167
168#[cfg(test)]
169mod tests {
170    use prost::Message;
171
172    use super::*;
173    use crate::stats::primitive::PrimitiveStats;
174    use crate::stats::{BytesStats, ColumnNullStats, ColumnStatKinds, trim_to_budget};
175
176    #[mz_ore::test]
177    fn struct_trim_to_budget() {
178        #[track_caller]
179        fn testcase(cols: &[(&str, usize)], required: Option<&str>) {
180            let cols = cols
181                .iter()
182                .map(|(key, cost)| {
183                    let stats = BytesStats::Primitive(PrimitiveStats {
184                        lower: vec![],
185                        upper: vec![0u8; *cost],
186                    });
187                    let stats = ColumnarStats {
188                        nulls: None,
189                        values: ColumnStatKinds::Bytes(stats),
190                    };
191                    ((*key).to_owned(), stats)
192                })
193                .collect();
194            let mut stats: ProtoStructStats = RustType::into_proto(&StructStats { len: 0, cols });
195            let mut budget = stats.encoded_len().next_power_of_two();
196            while budget > 0 {
197                let cost_before = stats.encoded_len();
198                let trimmed = trim_to_budget(&mut stats, budget, |col| Some(col) == required);
199                let cost_after = stats.encoded_len();
200                assert!(cost_before >= cost_after);
201                assert_eq!(trimmed, cost_before - cost_after);
202                if let Some(required) = required {
203                    assert!(stats.cols.contains_key(required));
204                } else {
205                    assert!(cost_after <= budget);
206                }
207                budget = budget / 2;
208            }
209        }
210
211        testcase(&[], None);
212        testcase(&[("a", 100)], None);
213        testcase(&[("a", 1), ("b", 2), ("c", 4)], None);
214        testcase(&[("a", 1), ("b", 2), ("c", 4)], Some("b"));
215    }
216
217    // Confirm that fields are being trimmed from largest to smallest.
218    #[mz_ore::test]
219    fn trim_order_regression() {
220        fn column_stats(lower: &'static str, upper: &'static str) -> ColumnarStats {
221            let stats = PrimitiveStats {
222                lower: lower.to_owned(),
223                upper: upper.to_owned(),
224            };
225            ColumnarStats {
226                nulls: None,
227                values: ColumnStatKinds::Primitive(stats.into()),
228            }
229        }
230        let stats = StructStats {
231            len: 2,
232            cols: BTreeMap::from([
233                ("foo".to_owned(), column_stats("a", "b")),
234                (
235                    "bar".to_owned(),
236                    column_stats("aaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaab"),
237                ),
238            ]),
239        };
240
241        // The threshold here is arbitrary... we just care that there's some budget where
242        // we'll discard the large field before the small one.
243        let mut proto_stats = RustType::into_proto(&stats);
244        trim_to_budget(&mut proto_stats, 30, |_| false);
245        assert!(proto_stats.cols.contains_key("foo"));
246        assert!(!proto_stats.cols.contains_key("bar"));
247    }
248
249    // Regression test for a bug found by a customer: trim_to_budget method only
250    // operates on the top level struct columns. This (sorta) worked before
251    // materialize#19309, but now there are always two columns at the top level, "ok" and
252    // "err", and the real columns are all nested under "ok".
253    #[mz_ore::test]
254    fn stats_trim_to_budget_regression_recursion() {
255        fn str_stats(n: usize, l: &str, u: &str) -> ColumnarStats {
256            ColumnarStats {
257                nulls: Some(ColumnNullStats { count: n }),
258                values: ColumnStatKinds::Primitive(
259                    PrimitiveStats {
260                        lower: l.to_owned(),
261                        upper: u.to_owned(),
262                    }
263                    .into(),
264                ),
265            }
266        }
267
268        const BIG: usize = 100;
269
270        // Model our ok/err structure for SourceData stats for a RelationDesc
271        // with wide columns.
272        let mut cols = BTreeMap::new();
273        for col in 'a'..='z' {
274            let col = col.to_string();
275            let stats = str_stats(2, "", &col.repeat(BIG));
276            cols.insert(col, stats);
277        }
278        cols.insert("foo_timestamp".to_string(), str_stats(2, "foo", "foo"));
279        let source_data_stats = StructStats {
280            len: 2,
281            cols: BTreeMap::from([
282                ("err".to_owned(), str_stats(2, "", "")),
283                (
284                    "ok".to_owned(),
285                    ColumnarStats {
286                        nulls: None,
287                        values: ColumnStatKinds::Struct(StructStats { len: 2, cols }),
288                    },
289                ),
290            ]),
291        };
292        let mut proto_stats = RustType::into_proto(&source_data_stats);
293        let trimmed = trim_to_budget(&mut proto_stats, BIG, |x| {
294            x.ends_with("timestamp") || x == "err"
295        });
296        // Sanity-check that the test is trimming something.
297        assert!(trimmed > 0);
298        // We don't want to trim either "ok" or "err".
299        assert!(proto_stats.cols.contains_key("ok"));
300        assert!(proto_stats.cols.contains_key("err"));
301        // Assert that we kept the timestamp column.
302        let ok = proto_stats.cols.get("ok").unwrap();
303        let proto_dyn_stats::Kind::Struct(ok_struct) = ok.kind.as_ref().unwrap() else {
304            panic!("ok was of unexpected type {:?}", ok);
305        };
306        assert!(ok_struct.cols.contains_key("foo_timestamp"));
307    }
308}