mz_persist_types/stats/
json.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use std::collections::BTreeMap;
11use std::fmt::Debug;
12
13use mz_proto::{IntoRustIfSome, ProtoType, RustType, TryFromProtoError};
14use proptest::prelude::*;
15use proptest::strategy::{Strategy, Union};
16use serde_json::json;
17
18use crate::stats::primitive::{PrimitiveStats, any_primitive_stats};
19use crate::stats::{
20    DynStats, ProtoJsonMapElementStats, ProtoJsonMapStats, ProtoJsonStats, TrimStats,
21    proto_json_stats,
22};
23
/// Aggregate statistics about a column of Json elements.
///
/// Each element could be any of a JsonNull, a bool, a string, a numeric, a list,
/// or a map/object. The column might be a single type but could also be a
/// mixture of any subset of these types.
#[derive(Clone)]
pub enum JsonStats {
    /// A sentinel that indicates there were no elements.
    ///
    /// This is also the `Default` value of this type.
    None,
    /// There were elements from more than one category of: bools, strings,
    /// numerics, lists, maps.
    ///
    /// Decoding a proto whose kind is unset also produces this variant, since
    /// an unknown kind has to be assumed to hold any value.
    Mixed,
    /// A sentinel that indicates all elements were `Datum::JsonNull`s.
    JsonNulls,
    /// The min and max bools, or None if there were none.
    Bools(PrimitiveStats<bool>),
    /// The min and max strings, or None if there were none.
    Strings(PrimitiveStats<String>),
    /// The min and max numerics, or None if there were none.
    /// Since we don't have a decimal type here yet, this is stored in serialized
    /// form.
    Numerics(PrimitiveStats<Vec<u8>>),
    /// A sentinel that indicates all elements were `Datum::List`s.
    ///
    /// TODO: We could also do something for list indexes analogous to what we
    /// do for map keys, but it initially seems much less likely that a user
    /// would expect that to work with pushdown, so don't bother keeping the
    /// stats until someone asks for it.
    Lists,
    /// Recursive statistics about the set of keys present in any maps/objects
    /// in the column, or None if there were no maps/objects.
    ///
    /// The map is keyed by the JSON object key.
    Maps(BTreeMap<String, JsonMapElementStats>),
}
57
/// Statistics for the values stored under a single key of a JSON map/object.
///
/// This is the value type of the map held by [`JsonStats::Maps`].
#[derive(Default, Clone)]
pub struct JsonMapElementStats {
    /// A length associated with this key's stats; it is round-tripped through
    /// the `len` field of the proto representation.
    // NOTE(review): presumably the size estimate consulted when trimming stats
    // to a budget -- confirm against `trim_to_budget_jsonb`.
    pub len: usize,
    /// Recursive statistics about the values found under this key.
    pub stats: JsonStats,
}
63
64impl Default for JsonStats {
65    fn default() -> Self {
66        JsonStats::None
67    }
68}
69
70impl Debug for JsonStats {
71    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
72        Debug::fmt(&self.debug_json(), f)
73    }
74}
75
76impl JsonStats {
77    pub fn debug_json(&self) -> serde_json::Value {
78        match self {
79            JsonStats::None => json!({}),
80            JsonStats::Mixed => "json_mixed".into(),
81            JsonStats::JsonNulls => "json_nulls".into(),
82            JsonStats::Bools(x) => x.debug_json(),
83            JsonStats::Strings(x) => x.debug_json(),
84            JsonStats::Numerics(x) => x.debug_json(),
85            JsonStats::Lists => "json_lists".into(),
86            JsonStats::Maps(x) => x
87                .iter()
88                .map(|(k, v)| (k.clone(), v.debug_json()))
89                .collect::<serde_json::Map<_, _>>()
90                .into(),
91        }
92    }
93}
94
95impl JsonMapElementStats {
96    pub fn debug_json(&self) -> serde_json::Value {
97        json!({"len": self.len, "stats": self.stats.debug_json()})
98    }
99}
100
101impl RustType<ProtoJsonStats> for JsonStats {
102    fn into_proto(&self) -> ProtoJsonStats {
103        ProtoJsonStats {
104            kind: Some(match self {
105                JsonStats::None => proto_json_stats::Kind::None(()),
106                JsonStats::Mixed => proto_json_stats::Kind::Mixed(()),
107                JsonStats::JsonNulls => proto_json_stats::Kind::JsonNulls(()),
108                JsonStats::Bools(x) => proto_json_stats::Kind::Bools(RustType::into_proto(x)),
109                JsonStats::Strings(x) => proto_json_stats::Kind::Strings(RustType::into_proto(x)),
110                JsonStats::Numerics(x) => proto_json_stats::Kind::Numerics(RustType::into_proto(x)),
111                JsonStats::Lists => proto_json_stats::Kind::Lists(()),
112                JsonStats::Maps(x) => proto_json_stats::Kind::Maps(ProtoJsonMapStats {
113                    elements: x
114                        .iter()
115                        .map(|(k, v)| ProtoJsonMapElementStats {
116                            name: k.into_proto(),
117                            len: v.len.into_proto(),
118                            stats: Some(RustType::into_proto(&v.stats)),
119                        })
120                        .collect(),
121                }),
122            }),
123        }
124    }
125
126    fn from_proto(proto: ProtoJsonStats) -> Result<Self, TryFromProtoError> {
127        Ok(match proto.kind {
128            Some(proto_json_stats::Kind::None(())) => JsonStats::None,
129            Some(proto_json_stats::Kind::Mixed(())) => JsonStats::Mixed,
130            Some(proto_json_stats::Kind::JsonNulls(())) => JsonStats::JsonNulls,
131            Some(proto_json_stats::Kind::Bools(x)) => JsonStats::Bools(x.into_rust()?),
132            Some(proto_json_stats::Kind::Strings(x)) => JsonStats::Strings(x.into_rust()?),
133            Some(proto_json_stats::Kind::Numerics(x)) => JsonStats::Numerics(x.into_rust()?),
134            Some(proto_json_stats::Kind::Lists(())) => JsonStats::Lists,
135            Some(proto_json_stats::Kind::Maps(x)) => {
136                let mut elements = BTreeMap::new();
137                for x in x.elements {
138                    let stats = JsonMapElementStats {
139                        len: x.len.into_rust()?,
140                        stats: x.stats.into_rust_if_some("JsonMapElementStats::stats")?,
141                    };
142                    elements.insert(x.name.into_rust()?, stats);
143                }
144                JsonStats::Maps(elements)
145            }
146            // Unknown JSON stats type: assume this might have any value.
147            None => JsonStats::Mixed,
148        })
149    }
150}
151
152impl TrimStats for ProtoJsonStats {
153    fn trim(&mut self) {
154        use proto_json_stats::*;
155        match &mut self.kind {
156            Some(Kind::Strings(stats)) => {
157                stats.trim();
158            }
159            Some(Kind::Maps(stats)) => {
160                for value in &mut stats.elements {
161                    if let Some(stats) = &mut value.stats {
162                        stats.trim();
163                    }
164                }
165            }
166            Some(
167                Kind::None(_)
168                | Kind::Mixed(_)
169                | Kind::JsonNulls(_)
170                | Kind::Bools(_)
171                | Kind::Numerics(_)
172                | Kind::Lists(_),
173            ) => {}
174            None => {}
175        }
176    }
177}
178
179/// Returns a [`Strategy`] for generating abritrary [`JsonStats`].
180pub(crate) fn any_json_stats() -> impl Strategy<Value = JsonStats> {
181    let leaf = Union::new(vec![
182        any::<()>().prop_map(|_| JsonStats::None).boxed(),
183        any::<()>().prop_map(|_| JsonStats::Mixed).boxed(),
184        any::<()>().prop_map(|_| JsonStats::JsonNulls).boxed(),
185        any_primitive_stats::<bool>()
186            .prop_map(JsonStats::Bools)
187            .boxed(),
188        any_primitive_stats::<String>()
189            .prop_map(JsonStats::Strings)
190            .boxed(),
191        any::<()>().prop_map(|_| JsonStats::Lists).boxed(),
192    ]);
193    leaf.prop_recursive(2, 5, 3, |inner| {
194        (proptest::collection::btree_map(any::<String>(), inner, 0..3)).prop_map(|cols| {
195            let cols = cols
196                .into_iter()
197                .map(|(k, stats)| (k, JsonMapElementStats { len: 1, stats }))
198                .collect();
199            JsonStats::Maps(cols)
200        })
201    })
202}
203
204#[cfg(test)]
205mod tests {
206    use prost::Message;
207
208    use super::*;
209    use crate::stats::trim_to_budget_jsonb;
210
211    #[mz_ore::test]
212    fn jsonb_trim_to_budget() {
213        #[track_caller]
214        fn testcase(cols: &[(&str, usize)], required: Option<&str>) {
215            let cols = cols
216                .iter()
217                .map(|(key, cost)| {
218                    let stats = JsonStats::Numerics(PrimitiveStats {
219                        lower: vec![],
220                        upper: vec![0u8; *cost],
221                    });
222                    let len = stats.debug_json().to_string().len();
223                    ((*key).to_owned(), JsonMapElementStats { len, stats })
224                })
225                .collect();
226
227            // Serialize into proto and extract the necessary type.
228            let stats: ProtoJsonStats = RustType::into_proto(&JsonStats::Maps(cols));
229            let ProtoJsonStats {
230                kind: Some(proto_json_stats::Kind::Maps(mut stats)),
231            } = stats
232            else {
233                panic!("serialized produced wrong type!");
234            };
235
236            let mut budget = stats.encoded_len().next_power_of_two();
237            while budget > 0 {
238                let cost_before = stats.encoded_len();
239                trim_to_budget_jsonb(&mut stats, &mut budget, &|col| Some(col) == required);
240                let cost_after = stats.encoded_len();
241                assert!(cost_before >= cost_after);
242
243                // Assert force keep columns were kept.
244                if let Some(required) = required {
245                    assert!(
246                        stats
247                            .elements
248                            .iter()
249                            .any(|element| element.name == required)
250                    );
251                } else {
252                    assert!(cost_after <= budget);
253                }
254
255                budget = budget / 2;
256            }
257        }
258
259        testcase(&[], None);
260        testcase(&[("a", 100)], None);
261        testcase(&[("a", 1), ("b", 2), ("c", 4)], None);
262        testcase(&[("a", 1), ("b", 2), ("c", 4)], Some("b"));
263    }
264
    // Smoke test for trimming nested map stats: a top-level map with a small
    // string column "a" and a nested "context" map holding a large numeric
    // column "b" and a small bool column "c". Trimming is run twice from the
    // same starting stats, first without and then with a force-kept column.
    #[mz_ore::test]
    fn jsonb_trim_to_budget_smoke() {
        let og_stats = JsonStats::Maps(
            [
                (
                    "a".to_string(),
                    JsonMapElementStats {
                        len: 1,
                        stats: JsonStats::Strings(PrimitiveStats {
                            lower: "foobar".to_string(),
                            upper: "foobaz".to_string(),
                        }),
                    },
                ),
                (
                    "context".to_string(),
                    JsonMapElementStats {
                        len: 100,
                        stats: JsonStats::Maps(
                            [
                                (
                                    "b".to_string(),
                                    JsonMapElementStats {
                                        len: 99,
                                        stats: JsonStats::Numerics(PrimitiveStats {
                                            lower: vec![],
                                            upper: vec![42u8; 99],
                                        }),
                                    },
                                ),
                                (
                                    "c".to_string(),
                                    JsonMapElementStats {
                                        len: 1,
                                        stats: JsonStats::Bools(PrimitiveStats {
                                            lower: false,
                                            upper: true,
                                        }),
                                    },
                                ),
                            ]
                            .into(),
                        ),
                    },
                ),
            ]
            .into(),
        );

        // Serialize into proto and extract the necessary type.
        let stats: ProtoJsonStats = RustType::into_proto(&og_stats);
        let ProtoJsonStats {
            kind: Some(proto_json_stats::Kind::Maps(mut stats)),
        } = stats
        else {
            panic!("serialized produced wrong type!");
        };

        let mut budget_shortfall = 50;
        // We should recurse into the "context" message and only drop the "b" column.
        trim_to_budget_jsonb(&mut stats, &mut budget_shortfall, &|_name| false);

        // Index the surviving top-level elements by name so each can be checked.
        let mut elements = stats
            .elements
            .into_iter()
            .map(|element| (element.name.clone(), element))
            .collect::<BTreeMap<String, _>>();
        assert!(elements.remove("a").is_some());

        let context = elements.remove("context").expect("trimmed too much");
        let Some(ProtoJsonStats {
            kind: Some(proto_json_stats::Kind::Maps(context)),
        }) = context.stats
        else {
            panic!("serialized produced wrong type!")
        };

        // We should only have one element in "context" because we trimmed "b".
        assert_eq!(context.elements.len(), 1);
        assert_eq!(context.elements[0].name, "c");

        // Redo the trimming, force keeping the largest column.

        // Serialize into proto and extract the necessary type.
        let stats: ProtoJsonStats = RustType::into_proto(&og_stats);
        let ProtoJsonStats {
            kind: Some(proto_json_stats::Kind::Maps(mut stats)),
        } = stats
        else {
            panic!("serialized produced wrong type!");
        };

        let mut budget_shortfall = 50;
        // We're force keeping "b" which is larger than our budget_shortfall, so we should drop
        // everything else.
        trim_to_budget_jsonb(&mut stats, &mut budget_shortfall, &|name| name == "b");

        // Only "context" (the parent of the force-kept column) should remain.
        assert_eq!(stats.elements.len(), 1);
        assert_eq!(stats.elements[0].name, "context");

        let Some(ProtoJsonStats {
            kind: Some(proto_json_stats::Kind::Maps(context)),
        }) = &stats.elements[0].stats
        else {
            panic!("serialized produced wrong type!")
        };

        // Within "context", only the force-kept "b" column should remain.
        assert_eq!(context.elements.len(), 1);
        assert_eq!(context.elements[0].name, "b");
    }
375
376    // Regression test for a bug found during code review of initial stats
377    // trimming PR.
378    #[mz_ore::test]
379    fn stats_trim_regression_json() {
380        // Make sure we recursively trim json string and map stats by asserting
381        // that the goes down after trimming.
382        #[track_caller]
383        fn testcase(stats: JsonStats) {
384            let mut stats = stats.into_proto();
385            let before = stats.encoded_len();
386            stats.trim();
387            let after = stats.encoded_len();
388            assert!(after < before, "{} vs {}: {:?}", after, before, stats);
389        }
390
391        let col = JsonStats::Strings(PrimitiveStats {
392            lower: "foobar".into(),
393            upper: "foobaz".into(),
394        });
395        testcase(col.clone());
396        let mut cols = BTreeMap::new();
397        cols.insert("col".into(), JsonMapElementStats { len: 1, stats: col });
398        testcase(JsonStats::Maps(cols));
399    }
400}