mz_persist_types/stats/
json.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use std::collections::BTreeMap;
11use std::fmt::Debug;
12
13use mz_ore::str::redact;
14use mz_proto::{IntoRustIfSome, ProtoType, RustType, TryFromProtoError};
15use proptest::prelude::*;
16use proptest::strategy::{Strategy, Union};
17use serde_json::json;
18
19use crate::stats::primitive::{PrimitiveStats, any_primitive_stats};
20use crate::stats::{
21    DynStats, ProtoJsonMapElementStats, ProtoJsonMapStats, ProtoJsonStats, TrimStats,
22    proto_json_stats,
23};
24
25// Aggregate statistics about a column of Json elements.
26//
27// Each element could be any of a JsonNull, a bool, a string, a numeric, a list,
28// or a map/object. The column might be a single type but could also be a
29// mixture of any subset of these types.
30#[derive(Clone)]
31pub enum JsonStats {
32    /// A sentinel that indicates there were no elements.
33    None,
34    /// There were elements from more than one category of: bools, strings,
35    /// numerics, lists, maps.
36    Mixed,
37    /// A sentinel that indicates all elements were `Datum::JsonNull`s.
38    JsonNulls,
39    /// The min and max bools, or None if there were none.
40    Bools(PrimitiveStats<bool>),
41    /// The min and max strings, or None if there were none.
42    Strings(PrimitiveStats<String>),
43    /// The min and max numerics, or None if there were none.
44    /// Since we don't have a decimal type here yet, this is stored in serialized
45    /// form.
46    Numerics(PrimitiveStats<Vec<u8>>),
47    /// A sentinel that indicates all elements were `Datum::List`s.
48    ///
49    /// TODO: We could also do something for list indexes analogous to what we
50    /// do for map keys, but it initially seems much less likely that a user
51    /// would expect that to work with pushdown, so don't bother keeping the
52    /// stats until someone asks for it.
53    Lists,
54    /// Recursive statistics about the set of keys present in any maps/objects
55    /// in the column, or None if there were no maps/objects.
56    Maps(BTreeMap<String, JsonMapElementStats>),
57}
58
59#[derive(Default, Clone)]
60pub struct JsonMapElementStats {
61    pub len: usize,
62    pub stats: JsonStats,
63}
64
65impl Default for JsonStats {
66    fn default() -> Self {
67        JsonStats::None
68    }
69}
70
71impl Debug for JsonStats {
72    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
73        match self {
74            JsonStats::None => f.write_str("None"),
75            JsonStats::Mixed => f.write_str("Mixed"),
76            JsonStats::JsonNulls => f.write_str("JsonNulls"),
77            JsonStats::Bools(stats) => f.debug_tuple("Bools").field(stats).finish(),
78            JsonStats::Strings(stats) => f.debug_tuple("Strings").field(stats).finish(),
79            JsonStats::Numerics(stats) => f.debug_tuple("Numerics").field(stats).finish(),
80            JsonStats::Lists => f.write_str("Lists"),
81            JsonStats::Maps(stats) => {
82                let mut f = f.debug_tuple("Maps");
83                for (k, v) in stats.iter() {
84                    f.field(&(redact(k), v.len, &v.stats));
85                }
86                f.finish()
87            }
88        }
89    }
90}
91
92impl JsonStats {
93    pub fn debug_json(&self) -> serde_json::Value {
94        match self {
95            JsonStats::None => json!({}),
96            JsonStats::Mixed => "json_mixed".into(),
97            JsonStats::JsonNulls => "json_nulls".into(),
98            JsonStats::Bools(x) => x.debug_json(),
99            JsonStats::Strings(x) => x.debug_json(),
100            JsonStats::Numerics(x) => x.debug_json(),
101            JsonStats::Lists => "json_lists".into(),
102            JsonStats::Maps(x) => x
103                .iter()
104                .map(|(k, v)| (k.clone(), v.debug_json()))
105                .collect::<serde_json::Map<_, _>>()
106                .into(),
107        }
108    }
109}
110
111impl JsonMapElementStats {
112    pub fn debug_json(&self) -> serde_json::Value {
113        json!({"len": self.len, "stats": self.stats.debug_json()})
114    }
115}
116
117impl RustType<ProtoJsonStats> for JsonStats {
118    fn into_proto(&self) -> ProtoJsonStats {
119        ProtoJsonStats {
120            kind: Some(match self {
121                JsonStats::None => proto_json_stats::Kind::None(()),
122                JsonStats::Mixed => proto_json_stats::Kind::Mixed(()),
123                JsonStats::JsonNulls => proto_json_stats::Kind::JsonNulls(()),
124                JsonStats::Bools(x) => proto_json_stats::Kind::Bools(RustType::into_proto(x)),
125                JsonStats::Strings(x) => proto_json_stats::Kind::Strings(RustType::into_proto(x)),
126                JsonStats::Numerics(x) => proto_json_stats::Kind::Numerics(RustType::into_proto(x)),
127                JsonStats::Lists => proto_json_stats::Kind::Lists(()),
128                JsonStats::Maps(x) => proto_json_stats::Kind::Maps(ProtoJsonMapStats {
129                    elements: x
130                        .iter()
131                        .map(|(k, v)| ProtoJsonMapElementStats {
132                            name: k.into_proto(),
133                            len: v.len.into_proto(),
134                            stats: Some(RustType::into_proto(&v.stats)),
135                        })
136                        .collect(),
137                }),
138            }),
139        }
140    }
141
142    fn from_proto(proto: ProtoJsonStats) -> Result<Self, TryFromProtoError> {
143        Ok(match proto.kind {
144            Some(proto_json_stats::Kind::None(())) => JsonStats::None,
145            Some(proto_json_stats::Kind::Mixed(())) => JsonStats::Mixed,
146            Some(proto_json_stats::Kind::JsonNulls(())) => JsonStats::JsonNulls,
147            Some(proto_json_stats::Kind::Bools(x)) => JsonStats::Bools(x.into_rust()?),
148            Some(proto_json_stats::Kind::Strings(x)) => JsonStats::Strings(x.into_rust()?),
149            Some(proto_json_stats::Kind::Numerics(x)) => JsonStats::Numerics(x.into_rust()?),
150            Some(proto_json_stats::Kind::Lists(())) => JsonStats::Lists,
151            Some(proto_json_stats::Kind::Maps(x)) => {
152                let mut elements = BTreeMap::new();
153                for x in x.elements {
154                    let stats = JsonMapElementStats {
155                        len: x.len.into_rust()?,
156                        stats: x.stats.into_rust_if_some("JsonMapElementStats::stats")?,
157                    };
158                    elements.insert(x.name.into_rust()?, stats);
159                }
160                JsonStats::Maps(elements)
161            }
162            // Unknown JSON stats type: assume this might have any value.
163            None => JsonStats::Mixed,
164        })
165    }
166}
167
168impl TrimStats for ProtoJsonStats {
169    fn trim(&mut self) {
170        use proto_json_stats::*;
171        match &mut self.kind {
172            Some(Kind::Strings(stats)) => {
173                stats.trim();
174            }
175            Some(Kind::Maps(stats)) => {
176                for value in &mut stats.elements {
177                    if let Some(stats) = &mut value.stats {
178                        stats.trim();
179                    }
180                }
181            }
182            Some(
183                Kind::None(_)
184                | Kind::Mixed(_)
185                | Kind::JsonNulls(_)
186                | Kind::Bools(_)
187                | Kind::Numerics(_)
188                | Kind::Lists(_),
189            ) => {}
190            None => {}
191        }
192    }
193}
194
195/// Returns a [`Strategy`] for generating abritrary [`JsonStats`].
196pub(crate) fn any_json_stats() -> impl Strategy<Value = JsonStats> {
197    let leaf = Union::new(vec![
198        any::<()>().prop_map(|_| JsonStats::None).boxed(),
199        any::<()>().prop_map(|_| JsonStats::Mixed).boxed(),
200        any::<()>().prop_map(|_| JsonStats::JsonNulls).boxed(),
201        any_primitive_stats::<bool>()
202            .prop_map(JsonStats::Bools)
203            .boxed(),
204        any_primitive_stats::<String>()
205            .prop_map(JsonStats::Strings)
206            .boxed(),
207        any::<()>().prop_map(|_| JsonStats::Lists).boxed(),
208    ]);
209    leaf.prop_recursive(2, 5, 3, |inner| {
210        (proptest::collection::btree_map(any::<String>(), inner, 0..3)).prop_map(|cols| {
211            let cols = cols
212                .into_iter()
213                .map(|(k, stats)| (k, JsonMapElementStats { len: 1, stats }))
214                .collect();
215            JsonStats::Maps(cols)
216        })
217    })
218}
219
#[cfg(test)]
mod tests {
    use prost::Message;

    use super::*;
    use crate::stats::trim_to_budget_jsonb;

    /// Serializes `stats` into proto and extracts the map stats.
    ///
    /// Panics (reported at the caller's location) if `stats` does not
    /// serialize to the `Maps` kind.
    #[track_caller]
    fn into_proto_map_stats(stats: &JsonStats) -> ProtoJsonMapStats {
        let proto: ProtoJsonStats = RustType::into_proto(stats);
        let ProtoJsonStats {
            kind: Some(proto_json_stats::Kind::Maps(map_stats)),
        } = proto
        else {
            panic!("serialized produced wrong type!");
        };
        map_stats
    }

    #[mz_ore::test]
    fn jsonb_trim_to_budget() {
        /// Builds a map with one numeric column per `(key, cost)` pair, then
        /// trims it repeatedly with a halving budget, checking that the
        /// encoded size never grows and that `required` is never dropped.
        #[track_caller]
        fn testcase(cols: &[(&str, usize)], required: Option<&str>) {
            let cols = cols
                .iter()
                .map(|(key, cost)| {
                    let stats = JsonStats::Numerics(PrimitiveStats {
                        lower: vec![],
                        upper: vec![0u8; *cost],
                    });
                    let len = stats.debug_json().to_string().len();
                    ((*key).to_owned(), JsonMapElementStats { len, stats })
                })
                .collect();

            let mut stats = into_proto_map_stats(&JsonStats::Maps(cols));

            let mut budget = stats.encoded_len().next_power_of_two();
            while budget > 0 {
                let cost_before = stats.encoded_len();
                trim_to_budget_jsonb(&mut stats, &mut budget, &|col| Some(col) == required);
                let cost_after = stats.encoded_len();
                assert!(cost_before >= cost_after);

                // Assert force keep columns were kept.
                if let Some(required) = required {
                    assert!(
                        stats
                            .elements
                            .iter()
                            .any(|element| element.name == required)
                    );
                } else {
                    assert!(cost_after <= budget);
                }

                budget /= 2;
            }
        }

        testcase(&[], None);
        testcase(&[("a", 100)], None);
        testcase(&[("a", 1), ("b", 2), ("c", 4)], None);
        testcase(&[("a", 1), ("b", 2), ("c", 4)], Some("b"));
    }

    #[mz_ore::test]
    fn jsonb_trim_to_budget_smoke() {
        let og_stats = JsonStats::Maps(
            [
                (
                    "a".to_string(),
                    JsonMapElementStats {
                        len: 1,
                        stats: JsonStats::Strings(PrimitiveStats {
                            lower: "foobar".to_string(),
                            upper: "foobaz".to_string(),
                        }),
                    },
                ),
                (
                    "context".to_string(),
                    JsonMapElementStats {
                        len: 100,
                        stats: JsonStats::Maps(
                            [
                                (
                                    "b".to_string(),
                                    JsonMapElementStats {
                                        len: 99,
                                        stats: JsonStats::Numerics(PrimitiveStats {
                                            lower: vec![],
                                            upper: vec![42u8; 99],
                                        }),
                                    },
                                ),
                                (
                                    "c".to_string(),
                                    JsonMapElementStats {
                                        len: 1,
                                        stats: JsonStats::Bools(PrimitiveStats {
                                            lower: false,
                                            upper: true,
                                        }),
                                    },
                                ),
                            ]
                            .into(),
                        ),
                    },
                ),
            ]
            .into(),
        );

        let mut stats = into_proto_map_stats(&og_stats);

        let mut budget_shortfall = 50;
        // We should recurse into the "context" message and only drop the "b" column.
        trim_to_budget_jsonb(&mut stats, &mut budget_shortfall, &|_name| false);

        let mut elements = stats
            .elements
            .into_iter()
            .map(|element| (element.name.clone(), element))
            .collect::<BTreeMap<String, _>>();
        assert!(elements.remove("a").is_some());

        let context = elements.remove("context").expect("trimmed too much");
        let Some(ProtoJsonStats {
            kind: Some(proto_json_stats::Kind::Maps(context)),
        }) = context.stats
        else {
            panic!("serialized produced wrong type!")
        };

        // We should only have one element in "context" because we trimmed "b".
        assert_eq!(context.elements.len(), 1);
        assert_eq!(context.elements[0].name, "c");

        // Redo the trimming, force keeping the largest column.
        let mut stats = into_proto_map_stats(&og_stats);

        let mut budget_shortfall = 50;
        // We're force keeping "b" which is larger than our budget_shortfall, so we should drop
        // everything else.
        trim_to_budget_jsonb(&mut stats, &mut budget_shortfall, &|name| name == "b");

        assert_eq!(stats.elements.len(), 1);
        assert_eq!(stats.elements[0].name, "context");

        let Some(ProtoJsonStats {
            kind: Some(proto_json_stats::Kind::Maps(context)),
        }) = &stats.elements[0].stats
        else {
            panic!("serialized produced wrong type!")
        };

        assert_eq!(context.elements.len(), 1);
        assert_eq!(context.elements[0].name, "b");
    }

    // Regression test for a bug found during code review of initial stats
    // trimming PR.
    #[mz_ore::test]
    fn stats_trim_regression_json() {
        // Make sure we recursively trim json string and map stats by asserting
        // that the encoded size goes down after trimming.
        #[track_caller]
        fn testcase(stats: JsonStats) {
            let mut stats = stats.into_proto();
            let before = stats.encoded_len();
            stats.trim();
            let after = stats.encoded_len();
            assert!(after < before, "{} vs {}: {:?}", after, before, stats);
        }

        let col = JsonStats::Strings(PrimitiveStats {
            lower: "foobar".into(),
            upper: "foobaz".into(),
        });
        testcase(col.clone());
        let mut cols = BTreeMap::new();
        cols.insert("col".into(), JsonMapElementStats { len: 1, stats: col });
        testcase(JsonStats::Maps(cols));
    }
}
414        testcase(JsonStats::Maps(cols));
415    }
416}