mz_persist_types/stats/
primitive.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use std::fmt::{self, Debug};
11
12use mz_proto::{ProtoType, RustType, TryFromProtoError};
13use proptest::arbitrary::Arbitrary;
14use proptest::strategy::Strategy;
15use serde::Serialize;
16
17use crate::stats::{
18    BytesStats, ColumnStatKinds, ColumnStats, ColumnarStats, DynStats, OptionStats,
19    ProtoPrimitiveBytesStats, ProtoPrimitiveStats, TrimStats, proto_primitive_stats,
20};
21use crate::timestamp::try_parse_monotonic_iso8601_timestamp;
22
23/// The length to truncate `Vec<u8>` and `String` stats to.
24//
25// Ideally, this would be in LaunchDarkly, but the plumbing is tough.
26pub const TRUNCATE_LEN: usize = 100;
27
28/// Whether a truncated value should be a lower or upper bound on the original.
29pub enum TruncateBound {
30    /// Truncate such that the result is <= the original.
31    Lower,
32    /// Truncate such that the result is >= the original.
33    Upper,
34}
35
36/// Truncates a u8 slice to the given maximum byte length.
37///
38/// If `bound` is Lower, the returned value will sort <= the original value, and
39/// if `bound` is Upper, it will sort >= the original value.
40///
41/// Lower bounds will always return Some. Upper bounds might return None if the
42/// part that fits in `max_len` is entirely made of `u8::MAX`.
43pub fn truncate_bytes(x: &[u8], max_len: usize, bound: TruncateBound) -> Option<Vec<u8>> {
44    if x.len() <= max_len {
45        return Some(x.to_owned());
46    }
47    match bound {
48        // Any truncation is a lower bound.
49        TruncateBound::Lower => Some(x[..max_len].to_owned()),
50        TruncateBound::Upper => {
51            for idx in (0..max_len).rev() {
52                if x[idx] < u8::MAX {
53                    let mut ret = x[..=idx].to_owned();
54                    ret[idx] += 1;
55                    return Some(ret);
56                }
57            }
58            None
59        }
60    }
61}
62
63/// Truncates a string to the given maximum byte length.
64///
65/// The returned value is a valid utf-8 string. If `bound` is Lower, it will
66/// sort <= the original string, and if `bound` is Upper, it will sort >= the
67/// original string.
68///
69/// Lower bounds will always return Some. Upper bounds might return None if the
70/// part that fits in `max_len` is entirely made of `char::MAX` (so in practice,
71/// probably ~never).
72pub fn truncate_string(x: &str, max_len: usize, bound: TruncateBound) -> Option<String> {
73    if x.len() <= max_len {
74        return Some(x.to_owned());
75    }
76    // For the output to be valid utf-8, we have to truncate along a char
77    // boundary.
78    let truncation_idx = x
79        .char_indices()
80        .map(|(idx, c)| idx + c.len_utf8())
81        .take_while(|char_end| *char_end <= max_len)
82        .last()
83        .unwrap_or(0);
84    let truncated = &x[..truncation_idx];
85    match bound {
86        // Any truncation is a lower bound.
87        TruncateBound::Lower => Some(truncated.to_owned()),
88        TruncateBound::Upper => {
89            // See if we can find a char that's not already the max. If so, take
90            // the last of these and increment it.
91            for (idx, c) in truncated.char_indices().rev() {
92                if let Ok(new_last_char) = char::try_from(u32::from(c) + 1) {
93                    // NB: It's technically possible for `new_last_char` to be
94                    // more bytes than `c`, which means we could go over
95                    // max_len. It isn't a hard requirement for the initial
96                    // caller of this, so don't bother with the complexity yet.
97                    let mut ret = String::with_capacity(idx + new_last_char.len_utf8());
98                    ret.push_str(&truncated[..idx]);
99                    ret.push(new_last_char);
100                    return Some(ret);
101                }
102            }
103            None
104        }
105    }
106}
107
108/// Statistics about a primitive non-optional column.
109#[derive(Copy, Clone)]
110pub struct PrimitiveStats<T> {
111    /// An inclusive lower bound on the data contained in the column.
112    ///
113    /// This will often be a tight bound, but it's not guaranteed. Persist
114    /// reserves the right to (for example) invent smaller bounds for long byte
115    /// strings. SUBTLE: This means that this exact value may not be present in
116    /// the column.
117    ///
118    /// Similarly, if the column is empty, this will contain `T: Default`.
119    /// Emptiness will be indicated in statistics higher up (i.e.
120    /// [`crate::stats::StructStats`]).
121    pub lower: T,
122    /// Same as [Self::lower] but an (also inclusive) upper bound.
123    pub upper: T,
124}
125
126impl<T: Serialize> Debug for PrimitiveStats<T>
127where
128    // Note: We introduce this bound so we can have a different impl for PrimitiveStats<Vec<u8>>.
129    PrimitiveStats<T>: RustType<ProtoPrimitiveStats>,
130{
131    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
132        let l = serde_json::to_value(&self.lower).expect("valid json");
133        let u = serde_json::to_value(&self.upper).expect("valid json");
134        Debug::fmt(&serde_json::json!({"lower": l, "upper": u}), f)
135    }
136}
137
138impl Debug for PrimitiveStats<Vec<u8>> {
139    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
140        Debug::fmt(&self.debug_json(), f)
141    }
142}
143
144impl<T: Serialize> DynStats for PrimitiveStats<T>
145where
146    PrimitiveStats<T>: RustType<ProtoPrimitiveStats>
147        + Into<PrimitiveStatsVariants>
148        + Clone
149        + Send
150        + Sync
151        + 'static,
152{
153    fn debug_json(&self) -> serde_json::Value {
154        let l = serde_json::to_value(&self.lower).expect("valid json");
155        let u = serde_json::to_value(&self.upper).expect("valid json");
156        serde_json::json!({"lower": l, "upper": u})
157    }
158
159    fn into_columnar_stats(self) -> ColumnarStats {
160        ColumnarStats {
161            nulls: None,
162            values: ColumnStatKinds::Primitive(self.into()),
163        }
164    }
165}
166
167impl DynStats for PrimitiveStats<Vec<u8>> {
168    fn debug_json(&self) -> serde_json::Value {
169        serde_json::json!({
170            "lower": hex::encode(&self.lower),
171            "upper": hex::encode(&self.upper),
172        })
173    }
174
175    fn into_columnar_stats(self) -> ColumnarStats {
176        ColumnarStats {
177            nulls: None,
178            values: ColumnStatKinds::Bytes(BytesStats::Primitive(self)),
179        }
180    }
181}
182
183/// This macro implements the [`ColumnStats`] trait for all variants of [`PrimitiveStats`].
184macro_rules! stats_primitive {
185    ($data:ty, $ref_ty:ty, $ref:ident, $variant:ident) => {
186        impl ColumnStats for PrimitiveStats<$data> {
187            type Ref<'a> = $ref_ty;
188
189            fn lower<'a>(&'a self) -> Option<Self::Ref<'a>> {
190                Some(self.lower.$ref())
191            }
192            fn upper<'a>(&'a self) -> Option<Self::Ref<'a>> {
193                Some(self.upper.$ref())
194            }
195            fn none_count(&self) -> usize {
196                0
197            }
198        }
199
200        impl ColumnStats for OptionStats<PrimitiveStats<$data>> {
201            type Ref<'a> = Option<$ref_ty>;
202
203            fn lower<'a>(&'a self) -> Option<Self::Ref<'a>> {
204                Some(self.some.lower())
205            }
206            fn upper<'a>(&'a self) -> Option<Self::Ref<'a>> {
207                Some(self.some.upper())
208            }
209            fn none_count(&self) -> usize {
210                self.none
211            }
212        }
213
214        impl From<PrimitiveStats<$data>> for PrimitiveStatsVariants {
215            fn from(value: PrimitiveStats<$data>) -> Self {
216                PrimitiveStatsVariants::$variant(value)
217            }
218        }
219    };
220}
221
222stats_primitive!(bool, bool, clone, Bool);
223stats_primitive!(u8, u8, clone, U8);
224stats_primitive!(u16, u16, clone, U16);
225stats_primitive!(u32, u32, clone, U32);
226stats_primitive!(u64, u64, clone, U64);
227stats_primitive!(i8, i8, clone, I8);
228stats_primitive!(i16, i16, clone, I16);
229stats_primitive!(i32, i32, clone, I32);
230stats_primitive!(i64, i64, clone, I64);
231stats_primitive!(f32, f32, clone, F32);
232stats_primitive!(f64, f64, clone, F64);
233stats_primitive!(String, &'a str, as_str, String);
234
235/// This macro implements the [`RustType`] trait for all variants of [`PrimitiveStats`].
236macro_rules! primitive_stats_rust_type {
237    ($typ:ty, $lower:ident, $upper:ident) => {
238        impl RustType<ProtoPrimitiveStats> for PrimitiveStats<$typ> {
239            fn into_proto(&self) -> ProtoPrimitiveStats {
240                ProtoPrimitiveStats {
241                    lower: Some(proto_primitive_stats::Lower::$lower(
242                        self.lower.into_proto(),
243                    )),
244                    upper: Some(proto_primitive_stats::Upper::$upper(
245                        self.upper.into_proto(),
246                    )),
247                }
248            }
249
250            fn from_proto(proto: ProtoPrimitiveStats) -> Result<Self, TryFromProtoError> {
251                let lower = proto.lower.ok_or_else(|| {
252                    TryFromProtoError::missing_field("ProtoPrimitiveStats::lower")
253                })?;
254                let lower = match lower {
255                    proto_primitive_stats::Lower::$lower(x) => x.into_rust()?,
256                    _ => {
257                        return Err(TryFromProtoError::missing_field(
258                            "proto_primitive_stats::Lower::$lower",
259                        ));
260                    }
261                };
262                let upper = proto
263                    .upper
264                    .ok_or_else(|| TryFromProtoError::missing_field("ProtoPrimitiveStats::max"))?;
265                let upper = match upper {
266                    proto_primitive_stats::Upper::$upper(x) => x.into_rust()?,
267                    _ => {
268                        return Err(TryFromProtoError::missing_field(
269                            "proto_primitive_stats::Upper::$upper",
270                        ));
271                    }
272                };
273                Ok(PrimitiveStats { lower, upper })
274            }
275        }
276    };
277}
278
279primitive_stats_rust_type!(bool, LowerBool, UpperBool);
280primitive_stats_rust_type!(u8, LowerU8, UpperU8);
281primitive_stats_rust_type!(u16, LowerU16, UpperU16);
282primitive_stats_rust_type!(u32, LowerU32, UpperU32);
283primitive_stats_rust_type!(u64, LowerU64, UpperU64);
284primitive_stats_rust_type!(i8, LowerI8, UpperI8);
285primitive_stats_rust_type!(i16, LowerI16, UpperI16);
286primitive_stats_rust_type!(i32, LowerI32, UpperI32);
287primitive_stats_rust_type!(i64, LowerI64, UpperI64);
288primitive_stats_rust_type!(f32, LowerF32, UpperF32);
289primitive_stats_rust_type!(f64, LowerF64, UpperF64);
290primitive_stats_rust_type!(String, LowerString, UpperString);
291
292impl RustType<ProtoPrimitiveBytesStats> for PrimitiveStats<Vec<u8>> {
293    fn into_proto(&self) -> ProtoPrimitiveBytesStats {
294        ProtoPrimitiveBytesStats {
295            lower: self.lower.into_proto(),
296            upper: self.upper.into_proto(),
297        }
298    }
299
300    fn from_proto(proto: ProtoPrimitiveBytesStats) -> Result<Self, TryFromProtoError> {
301        let lower = proto.lower.into_rust()?;
302        let upper = proto.upper.into_rust()?;
303        Ok(PrimitiveStats { lower, upper })
304    }
305}
306
307impl TrimStats for ProtoPrimitiveStats {
308    fn trim(&mut self) {
309        use proto_primitive_stats::*;
310        match (&mut self.lower, &mut self.upper) {
311            (Some(Lower::LowerString(lower)), Some(Upper::UpperString(upper))) => {
312                // If the lower and upper strings both look like iso8601
313                // timestamps, then (1) they're small and (2) that's an
314                // extremely high signal that we might want to keep them around
315                // for filtering. We technically could still recover useful
316                // bounds here in the interpret code, but the complexity isn't
317                // worth it, so just skip any trimming.
318                if try_parse_monotonic_iso8601_timestamp(lower).is_some()
319                    && try_parse_monotonic_iso8601_timestamp(upper).is_some()
320                {
321                    return;
322                }
323
324                #[allow(clippy::disallowed_methods)]
325                let common_prefix = lower
326                    .char_indices()
327                    .zip(upper.chars())
328                    .take_while(|((_, x), y)| x == y)
329                    .last();
330                if let Some(((o, x), y)) = common_prefix {
331                    let new_len = o + std::cmp::max(x.len_utf8(), y.len_utf8());
332                    *lower = truncate_string(lower, new_len, TruncateBound::Lower)
333                        .expect("lower bound should always truncate");
334                    if let Some(new_upper) = truncate_string(upper, new_len, TruncateBound::Upper) {
335                        *upper = new_upper;
336                    }
337                }
338            }
339            _ => {}
340        }
341    }
342}
343
344impl TrimStats for ProtoPrimitiveBytesStats {
345    fn trim(&mut self) {
346        #[allow(clippy::disallowed_methods)]
347        let common_prefix = self
348            .lower
349            .iter()
350            .zip(self.upper.iter())
351            .take_while(|(x, y)| x == y)
352            .count();
353        self.lower = truncate_bytes(&self.lower, common_prefix + 1, TruncateBound::Lower)
354            .expect("lower bound should always truncate");
355        if let Some(upper) = truncate_bytes(&self.upper, common_prefix + 1, TruncateBound::Upper) {
356            self.upper = upper;
357        }
358    }
359}
360
361/// Enum wrapper around [`PrimitiveStats`] for each variant that we support.
362#[derive(Debug, Clone)]
363pub enum PrimitiveStatsVariants {
364    /// [`bool`]
365    Bool(PrimitiveStats<bool>),
366    /// [`u8`]
367    U8(PrimitiveStats<u8>),
368    /// [`u16`]
369    U16(PrimitiveStats<u16>),
370    /// [`u32`]
371    U32(PrimitiveStats<u32>),
372    /// [`u64`]
373    U64(PrimitiveStats<u64>),
374    /// [`i8`]
375    I8(PrimitiveStats<i8>),
376    /// [`i16`]
377    I16(PrimitiveStats<i16>),
378    /// [`i32`]
379    I32(PrimitiveStats<i32>),
380    /// [`i64`]
381    I64(PrimitiveStats<i64>),
382    /// [`f32`]
383    F32(PrimitiveStats<f32>),
384    /// [`f64`]
385    F64(PrimitiveStats<f64>),
386    /// [`String`]
387    String(PrimitiveStats<String>),
388}
389
390impl DynStats for PrimitiveStatsVariants {
391    fn debug_json(&self) -> serde_json::Value {
392        use PrimitiveStatsVariants::*;
393
394        match self {
395            Bool(stats) => stats.debug_json(),
396            U8(stats) => stats.debug_json(),
397            U16(stats) => stats.debug_json(),
398            U32(stats) => stats.debug_json(),
399            U64(stats) => stats.debug_json(),
400            I8(stats) => stats.debug_json(),
401            I16(stats) => stats.debug_json(),
402            I32(stats) => stats.debug_json(),
403            I64(stats) => stats.debug_json(),
404            F32(stats) => stats.debug_json(),
405            F64(stats) => stats.debug_json(),
406            String(stats) => stats.debug_json(),
407        }
408    }
409
410    fn into_columnar_stats(self) -> ColumnarStats {
411        ColumnarStats {
412            nulls: None,
413            values: ColumnStatKinds::Primitive(self),
414        }
415    }
416}
417
418impl RustType<ProtoPrimitiveStats> for PrimitiveStatsVariants {
419    fn into_proto(&self) -> ProtoPrimitiveStats {
420        use PrimitiveStatsVariants::*;
421
422        match self {
423            Bool(stats) => RustType::into_proto(stats),
424            U8(stats) => RustType::into_proto(stats),
425            U16(stats) => RustType::into_proto(stats),
426            U32(stats) => RustType::into_proto(stats),
427            U64(stats) => RustType::into_proto(stats),
428            I8(stats) => RustType::into_proto(stats),
429            I16(stats) => RustType::into_proto(stats),
430            I32(stats) => RustType::into_proto(stats),
431            I64(stats) => RustType::into_proto(stats),
432            F32(stats) => RustType::into_proto(stats),
433            F64(stats) => RustType::into_proto(stats),
434            String(stats) => RustType::into_proto(stats),
435        }
436    }
437
438    fn from_proto(proto: ProtoPrimitiveStats) -> Result<Self, mz_proto::TryFromProtoError> {
439        use crate::stats::proto_primitive_stats::{Lower, Upper};
440
441        let stats = match (proto.lower, proto.upper) {
442            (Some(Lower::LowerBool(lower)), Some(Upper::UpperBool(upper))) => {
443                PrimitiveStatsVariants::Bool(PrimitiveStats { lower, upper })
444            }
445            (Some(Lower::LowerU8(lower)), Some(Upper::UpperU8(upper))) => {
446                PrimitiveStatsVariants::U8(PrimitiveStats {
447                    lower: lower.into_rust()?,
448                    upper: upper.into_rust()?,
449                })
450            }
451            (Some(Lower::LowerU16(lower)), Some(Upper::UpperU16(upper))) => {
452                PrimitiveStatsVariants::U16(PrimitiveStats {
453                    lower: lower.into_rust()?,
454                    upper: upper.into_rust()?,
455                })
456            }
457            (Some(Lower::LowerU32(lower)), Some(Upper::UpperU32(upper))) => {
458                PrimitiveStatsVariants::U32(PrimitiveStats { lower, upper })
459            }
460            (Some(Lower::LowerU64(lower)), Some(Upper::UpperU64(upper))) => {
461                PrimitiveStatsVariants::U64(PrimitiveStats { lower, upper })
462            }
463            (Some(Lower::LowerI8(lower)), Some(Upper::UpperI8(upper))) => {
464                PrimitiveStatsVariants::I8(PrimitiveStats {
465                    lower: lower.into_rust()?,
466                    upper: upper.into_rust()?,
467                })
468            }
469            (Some(Lower::LowerI16(lower)), Some(Upper::UpperI16(upper))) => {
470                PrimitiveStatsVariants::I16(PrimitiveStats {
471                    lower: lower.into_rust()?,
472                    upper: upper.into_rust()?,
473                })
474            }
475            (Some(Lower::LowerI32(lower)), Some(Upper::UpperI32(upper))) => {
476                PrimitiveStatsVariants::I32(PrimitiveStats { lower, upper })
477            }
478            (Some(Lower::LowerI64(lower)), Some(Upper::UpperI64(upper))) => {
479                PrimitiveStatsVariants::I64(PrimitiveStats { lower, upper })
480            }
481            (Some(Lower::LowerF32(lower)), Some(Upper::UpperF32(upper))) => {
482                PrimitiveStatsVariants::F32(PrimitiveStats { lower, upper })
483            }
484            (Some(Lower::LowerF64(lower)), Some(Upper::UpperF64(upper))) => {
485                PrimitiveStatsVariants::F64(PrimitiveStats { lower, upper })
486            }
487            (Some(Lower::LowerString(lower)), Some(Upper::UpperString(upper))) => {
488                PrimitiveStatsVariants::String(PrimitiveStats { lower, upper })
489            }
490            (lower, upper) => {
491                return Err(TryFromProtoError::missing_field(format!(
492                    "expected lower and upper to be the same type, found {} and {}",
493                    std::any::type_name_of_val(&lower),
494                    std::any::type_name_of_val(&upper)
495                )));
496            }
497        };
498
499        Ok(stats)
500    }
501}
502
503/// Returns a [`Strategy`] for generating arbitrary [`PrimitiveStats`].
504pub(crate) fn any_primitive_stats<T>() -> impl Strategy<Value = PrimitiveStats<T>>
505where
506    T: Arbitrary + Ord + Serialize,
507    PrimitiveStats<T>: RustType<ProtoPrimitiveStats>,
508{
509    Strategy::prop_map(proptest::arbitrary::any::<(T, T)>(), |(x0, x1)| {
510        if x0 <= x1 {
511            PrimitiveStats {
512                lower: x0,
513                upper: x1,
514            }
515        } else {
516            PrimitiveStats {
517                lower: x1,
518                upper: x0,
519            }
520        }
521    })
522}
523
524/// Returns a [`Strategy`] for generating arbitrary [`PrimitiveStats<Vec<u8>>`].
525pub(crate) fn any_primitive_vec_u8_stats() -> impl Strategy<Value = PrimitiveStats<Vec<u8>>> {
526    Strategy::prop_map(
527        proptest::arbitrary::any::<(Vec<u8>, Vec<u8>)>(),
528        |(x0, x1)| {
529            if x0 <= x1 {
530                PrimitiveStats {
531                    lower: x0,
532                    upper: x1,
533                }
534            } else {
535                PrimitiveStats {
536                    lower: x1,
537                    upper: x0,
538                }
539            }
540        },
541    )
542}
543
544#[cfg(test)]
545mod tests {
546    use arrow::array::{BinaryArray, StringArray};
547    use proptest::prelude::*;
548
549    use super::*;
550    use crate::stats::ColumnarStatsBuilder;
551
552    #[mz_ore::test]
553    fn test_truncate_bytes() {
554        #[track_caller]
555        fn testcase(x: &[u8], max_len: usize, upper_should_exist: bool) {
556            let lower = truncate_bytes(x, max_len, TruncateBound::Lower)
557                .expect("lower should always exist");
558            assert!(lower.len() <= max_len);
559            assert!(lower.as_slice() <= x);
560            let upper = truncate_bytes(x, max_len, TruncateBound::Upper);
561            assert_eq!(upper_should_exist, upper.is_some());
562            if let Some(upper) = upper {
563                assert!(upper.len() <= max_len);
564                assert!(upper.as_slice() >= x);
565            }
566        }
567
568        testcase(&[], 0, true);
569        testcase(&[], 1, true);
570        testcase(&[1], 0, false);
571        testcase(&[1], 1, true);
572        testcase(&[1], 2, true);
573        testcase(&[1, 2], 1, true);
574        testcase(&[1, 255], 2, true);
575        testcase(&[255, 255], 2, true);
576        testcase(&[255, 255, 255], 2, false);
577    }
578
579    #[mz_ore::test]
580    #[cfg_attr(miri, ignore)] // too slow
581    fn test_truncate_bytes_proptest() {
582        fn testcase(x: &[u8]) {
583            for max_len in 0..=x.len() {
584                let lower = truncate_bytes(x, max_len, TruncateBound::Lower)
585                    .expect("lower should always exist");
586                let upper = truncate_bytes(x, max_len, TruncateBound::Upper);
587                assert!(lower.len() <= max_len);
588                assert!(lower.as_slice() <= x);
589                if let Some(upper) = upper {
590                    assert!(upper.len() <= max_len);
591                    assert!(upper.as_slice() >= x);
592                }
593            }
594        }
595
596        proptest!(|(x in any::<Vec<u8>>())| {
597            // The proptest! macro interferes with rustfmt.
598            testcase(x.as_slice())
599        });
600    }
601
602    #[mz_ore::test]
603    fn test_truncate_string() {
604        #[track_caller]
605        fn testcase(x: &str, max_len: usize, upper_should_exist: bool) {
606            let lower = truncate_string(x, max_len, TruncateBound::Lower)
607                .expect("lower should always exist");
608            let upper = truncate_string(x, max_len, TruncateBound::Upper);
609            assert!(lower.len() <= max_len);
610            assert!(lower.as_str() <= x);
611            assert_eq!(upper_should_exist, upper.is_some());
612            if let Some(upper) = upper {
613                assert!(upper.len() <= max_len);
614                assert!(upper.as_str() >= x);
615            }
616        }
617
618        testcase("", 0, true);
619        testcase("1", 0, false);
620        testcase("1", 1, true);
621        testcase("12", 1, true);
622        testcase("⛄", 0, false);
623        testcase("⛄", 1, false);
624        testcase("⛄", 3, true);
625        testcase("\u{10FFFF}", 3, false);
626        testcase("\u{10FFFF}", 4, true);
627        testcase("\u{10FFFF}", 5, true);
628        testcase("⛄⛄", 3, true);
629        testcase("⛄⛄", 4, true);
630        testcase("⛄\u{10FFFF}", 6, true);
631        testcase("⛄\u{10FFFF}", 7, true);
632        testcase("\u{10FFFF}\u{10FFFF}", 7, false);
633        testcase("\u{10FFFF}\u{10FFFF}", 8, true);
634
635        // Just because I find this to be delightful.
636        assert_eq!(
637            truncate_string("⛄⛄", 3, TruncateBound::Upper),
638            Some("⛅".to_string())
639        );
640    }
641
642    #[mz_ore::test]
643    #[cfg_attr(miri, ignore)] // too slow
644    fn test_truncate_string_proptest() {
645        fn testcase(x: &str) {
646            for max_len in 0..=x.len() {
647                let lower = truncate_string(x, max_len, TruncateBound::Lower)
648                    .expect("lower should always exist");
649                let upper = truncate_string(x, max_len, TruncateBound::Upper);
650                assert!(lower.len() <= max_len);
651                assert!(lower.as_str() <= x);
652                if let Some(upper) = upper {
653                    // As explained in a comment in the impl, we don't quite
654                    // treat the max_len as a hard bound here. Give it a little
655                    // wiggle room.
656                    assert!(upper.len() <= max_len + char::MAX.len_utf8());
657                    assert!(upper.as_str() >= x);
658                }
659            }
660        }
661
662        proptest!(|(x in any::<String>())| {
663            // The proptest! macro interferes with rustfmt.
664            testcase(x.as_str())
665        });
666    }
667
668    #[mz_ore::test]
669    #[cfg_attr(miri, ignore)] // too slow
670    fn proptest_cost_trim() {
671        fn primitive_stats<T, A>(vals: &[T]) -> (&[T], PrimitiveStats<T>)
672        where
673            T: Clone,
674            A: arrow::array::Array + From<Vec<T>>,
675            PrimitiveStats<T>: ColumnarStatsBuilder<T, ArrowColumn = A>,
676        {
677            let column = A::from(vals.to_vec());
678            let stats = <PrimitiveStats<T> as ColumnarStatsBuilder<T>>::from_column(&column);
679
680            (vals, stats)
681        }
682
683        fn testcase<T: PartialOrd + Clone + Debug, P>(xs_stats: (&[T], PrimitiveStats<T>))
684        where
685            PrimitiveStats<T>: RustType<P> + DynStats,
686            P: TrimStats,
687        {
688            let (xs, stats) = xs_stats;
689            for x in xs {
690                assert!(&stats.lower <= x);
691                assert!(&stats.upper >= x);
692            }
693
694            let mut proto_stats = RustType::into_proto(&stats);
695            let cost_before = proto_stats.encoded_len();
696            proto_stats.trim();
697            assert!(proto_stats.encoded_len() <= cost_before);
698            let stats: PrimitiveStats<T> = RustType::from_proto(proto_stats).unwrap();
699            for x in xs {
700                assert!(&stats.lower <= x);
701                assert!(&stats.upper >= x);
702            }
703        }
704
705        proptest!(|(a in any::<bool>(), b in any::<bool>())| {
706            testcase(primitive_stats(&[a, b]))
707        });
708        proptest!(|(a in any::<u8>(), b in any::<u8>())| {
709            testcase(primitive_stats(&[a, b]))
710        });
711        proptest!(|(a in any::<u16>(), b in any::<u16>())| {
712            testcase(primitive_stats(&[a, b]))
713        });
714        proptest!(|(a in any::<u32>(), b in any::<u32>())| {
715            testcase(primitive_stats(&[a, b]))
716        });
717        proptest!(|(a in any::<u64>(), b in any::<u64>())| {
718            testcase(primitive_stats(&[a, b]))
719        });
720        proptest!(|(a in any::<i8>(), b in any::<i8>())| {
721            testcase(primitive_stats(&[a, b]))
722        });
723        proptest!(|(a in any::<i16>(), b in any::<i16>())| {
724            testcase(primitive_stats(&[a, b]))
725        });
726        proptest!(|(a in any::<i32>(), b in any::<i32>())| {
727            testcase(primitive_stats(&[a, b]))
728        });
729        proptest!(|(a in any::<i64>(), b in any::<i64>())| {
730            testcase(primitive_stats(&[a, b]))
731        });
732        proptest!(|(a in any::<f32>(), b in any::<f32>())| {
733            testcase(primitive_stats(&[a, b]))
734        });
735        proptest!(|(a in any::<f64>(), b in any::<f64>())| {
736            testcase(primitive_stats(&[a, b]))
737        });
738
739        // Construct strings that are "interesting" in that they have some
740        // (possibly empty) shared prefix.
741        proptest!(|(prefix in any::<String>(), a in any::<String>(), b in any::<String>())| {
742            let vals = &[format!("{}{}", prefix, a), format!("{}{}", prefix, b)];
743            let col = StringArray::from(vals.to_vec());
744            let stats = PrimitiveStats::<String>::from_column(&col);
745            testcase((&vals[..], stats))
746        });
747
748        // Construct strings that are "interesting" in that they have some
749        // (possibly empty) shared prefix.
750        proptest!(|(prefix in any::<Vec<u8>>(), a in any::<Vec<u8>>(), b in any::<Vec<u8>>())| {
751            let mut sa = prefix.clone();
752            sa.extend(&a);
753            let mut sb = prefix;
754            sb.extend(&b);
755            let vals = &[sa, sb];
756            let array = BinaryArray::from_vec(vals.iter().map(|v| v.as_ref()).collect());
757            let stats = PrimitiveStats::<Vec<u8>>::from_column(&array);
758            testcase((vals, stats));
759        });
760    }
761
762    // Regression test for a bug where "lossless" trimming would truncate an
763    // upper and lower bound that both parsed as our special iso8601 timestamps
764    // into something that no longer did.
765    #[mz_ore::test]
766    fn stats_trim_iso8601_recursion() {
767        use proto_primitive_stats::*;
768
769        let orig = PrimitiveStats {
770            lower: "2023-08-19T12:00:00.000Z".to_owned(),
771            upper: "2023-08-20T12:00:00.000Z".to_owned(),
772        };
773        let mut stats = RustType::into_proto(&orig);
774        stats.trim();
775        // Before the fix, this resulted in "2023-08-" and "2023-08.".
776        assert_eq!(stats.lower.unwrap(), Lower::LowerString(orig.lower));
777        assert_eq!(stats.upper.unwrap(), Upper::UpperString(orig.upper));
778    }
779}