mz_persist_types/stats/
primitive.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use std::fmt::{self, Debug};
11
12use mz_proto::{ProtoType, RustType, TryFromProtoError};
13use proptest::arbitrary::Arbitrary;
14use proptest::strategy::Strategy;
15use serde::Serialize;
16
17use crate::stats::{
18    BytesStats, ColumnStatKinds, ColumnStats, ColumnarStats, DynStats, OptionStats,
19    ProtoPrimitiveBytesStats, ProtoPrimitiveStats, TrimStats, proto_primitive_stats,
20};
21use crate::timestamp::try_parse_monotonic_iso8601_timestamp;
22
23/// The length to truncate `Vec<u8>` and `String` stats to.
24//
25// Ideally, this would be in LaunchDarkly, but the plumbing is tough.
26pub const TRUNCATE_LEN: usize = 100;
27
28/// Whether a truncated value should be a lower or upper bound on the original.
29pub enum TruncateBound {
30    /// Truncate such that the result is <= the original.
31    Lower,
32    /// Truncate such that the result is >= the original.
33    Upper,
34}
35
36/// Truncates a u8 slice to the given maximum byte length.
37///
38/// If `bound` is Lower, the returned value will sort <= the original value, and
39/// if `bound` is Upper, it will sort >= the original value.
40///
41/// Lower bounds will always return Some. Upper bounds might return None if the
42/// part that fits in `max_len` is entirely made of `u8::MAX`.
43pub fn truncate_bytes(x: &[u8], max_len: usize, bound: TruncateBound) -> Option<Vec<u8>> {
44    if x.len() <= max_len {
45        return Some(x.to_owned());
46    }
47    match bound {
48        // Any truncation is a lower bound.
49        TruncateBound::Lower => Some(x[..max_len].to_owned()),
50        TruncateBound::Upper => {
51            for idx in (0..max_len).rev() {
52                if x[idx] < u8::MAX {
53                    let mut ret = x[..=idx].to_owned();
54                    ret[idx] += 1;
55                    return Some(ret);
56                }
57            }
58            None
59        }
60    }
61}
62
63/// Truncates a string to the given maximum byte length.
64///
65/// The returned value is a valid utf-8 string. If `bound` is Lower, it will
66/// sort <= the original string, and if `bound` is Upper, it will sort >= the
67/// original string.
68///
69/// Lower bounds will always return Some. Upper bounds might return None if the
70/// part that fits in `max_len` is entirely made of `char::MAX` (so in practice,
71/// probably ~never).
72pub fn truncate_string(x: &str, max_len: usize, bound: TruncateBound) -> Option<String> {
73    if x.len() <= max_len {
74        return Some(x.to_owned());
75    }
76    // For the output to be valid utf-8, we have to truncate along a char
77    // boundary.
78    let truncation_idx = x
79        .char_indices()
80        .map(|(idx, c)| idx + c.len_utf8())
81        .take_while(|char_end| *char_end <= max_len)
82        .last()
83        .unwrap_or(0);
84    let truncated = &x[..truncation_idx];
85    match bound {
86        // Any truncation is a lower bound.
87        TruncateBound::Lower => Some(truncated.to_owned()),
88        TruncateBound::Upper => {
89            // See if we can find a char that's not already the max. If so, take
90            // the last of these and increment it.
91            for (idx, c) in truncated.char_indices().rev() {
92                if let Ok(new_last_char) = char::try_from(u32::from(c) + 1) {
93                    // NB: It's technically possible for `new_last_char` to be
94                    // more bytes than `c`, which means we could go over
95                    // max_len. It isn't a hard requirement for the initial
96                    // caller of this, so don't bother with the complexity yet.
97                    let mut ret = String::with_capacity(idx + new_last_char.len_utf8());
98                    ret.push_str(&truncated[..idx]);
99                    ret.push(new_last_char);
100                    return Some(ret);
101                }
102            }
103            None
104        }
105    }
106}
107
108/// Statistics about a primitive non-optional column.
109#[derive(Copy, Clone)]
110pub struct PrimitiveStats<T> {
111    /// An inclusive lower bound on the data contained in the column.
112    ///
113    /// This will often be a tight bound, but it's not guaranteed. Persist
114    /// reserves the right to (for example) invent smaller bounds for long byte
115    /// strings. SUBTLE: This means that this exact value may not be present in
116    /// the column.
117    ///
118    /// Similarly, if the column is empty, this will contain `T: Default`.
119    /// Emptiness will be indicated in statistics higher up (i.e.
120    /// [`crate::stats::StructStats`]).
121    pub lower: T,
122    /// Same as [Self::lower] but an (also inclusive) upper bound.
123    pub upper: T,
124}
125
126impl<T: Serialize> Debug for PrimitiveStats<T>
127where
128    // Note: We introduce this bound so we can have a different impl for PrimitiveStats<Vec<u8>>.
129    PrimitiveStats<T>: RustType<ProtoPrimitiveStats>,
130{
131    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
132        let l = serde_json::to_value(&self.lower).expect("valid json");
133        let u = serde_json::to_value(&self.upper).expect("valid json");
134        Debug::fmt(&serde_json::json!({"lower": l, "upper": u}), f)
135    }
136}
137
138impl Debug for PrimitiveStats<Vec<u8>> {
139    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
140        Debug::fmt(&self.debug_json(), f)
141    }
142}
143
144impl<T: Serialize> DynStats for PrimitiveStats<T>
145where
146    PrimitiveStats<T>: RustType<ProtoPrimitiveStats>
147        + Into<PrimitiveStatsVariants>
148        + Clone
149        + Send
150        + Sync
151        + 'static,
152{
153    fn debug_json(&self) -> serde_json::Value {
154        let l = serde_json::to_value(&self.lower).expect("valid json");
155        let u = serde_json::to_value(&self.upper).expect("valid json");
156        serde_json::json!({"lower": l, "upper": u})
157    }
158
159    fn into_columnar_stats(self) -> ColumnarStats {
160        ColumnarStats {
161            nulls: None,
162            values: ColumnStatKinds::Primitive(self.into()),
163        }
164    }
165}
166
167impl DynStats for PrimitiveStats<Vec<u8>> {
168    fn debug_json(&self) -> serde_json::Value {
169        serde_json::json!({
170            "lower": hex::encode(&self.lower),
171            "upper": hex::encode(&self.upper),
172        })
173    }
174
175    fn into_columnar_stats(self) -> ColumnarStats {
176        ColumnarStats {
177            nulls: None,
178            values: ColumnStatKinds::Bytes(BytesStats::Primitive(self)),
179        }
180    }
181}
182
183/// This macro implements the [`ColumnStats`] trait for all variants of [`PrimitiveStats`].
184macro_rules! stats_primitive {
185    ($data:ty, $ref_ty:ty, $ref:ident, $variant:ident) => {
186        impl ColumnStats for PrimitiveStats<$data> {
187            type Ref<'a> = $ref_ty;
188
189            fn lower<'a>(&'a self) -> Option<Self::Ref<'a>> {
190                Some(self.lower.$ref())
191            }
192            fn upper<'a>(&'a self) -> Option<Self::Ref<'a>> {
193                Some(self.upper.$ref())
194            }
195            fn none_count(&self) -> usize {
196                0
197            }
198        }
199
200        impl ColumnStats for OptionStats<PrimitiveStats<$data>> {
201            type Ref<'a> = Option<$ref_ty>;
202
203            fn lower<'a>(&'a self) -> Option<Self::Ref<'a>> {
204                Some(self.some.lower())
205            }
206            fn upper<'a>(&'a self) -> Option<Self::Ref<'a>> {
207                Some(self.some.upper())
208            }
209            fn none_count(&self) -> usize {
210                self.none
211            }
212        }
213
214        impl From<PrimitiveStats<$data>> for PrimitiveStatsVariants {
215            fn from(value: PrimitiveStats<$data>) -> Self {
216                PrimitiveStatsVariants::$variant(value)
217            }
218        }
219    };
220}
221
222stats_primitive!(bool, bool, clone, Bool);
223stats_primitive!(u8, u8, clone, U8);
224stats_primitive!(u16, u16, clone, U16);
225stats_primitive!(u32, u32, clone, U32);
226stats_primitive!(u64, u64, clone, U64);
227stats_primitive!(i8, i8, clone, I8);
228stats_primitive!(i16, i16, clone, I16);
229stats_primitive!(i32, i32, clone, I32);
230stats_primitive!(i64, i64, clone, I64);
231stats_primitive!(f32, f32, clone, F32);
232stats_primitive!(f64, f64, clone, F64);
233stats_primitive!(String, &'a str, as_str, String);
234
235/// This macro implements the [`RustType`] trait for all variants of [`PrimitiveStats`].
236macro_rules! primitive_stats_rust_type {
237    ($typ:ty, $lower:ident, $upper:ident) => {
238        impl RustType<ProtoPrimitiveStats> for PrimitiveStats<$typ> {
239            fn into_proto(&self) -> ProtoPrimitiveStats {
240                ProtoPrimitiveStats {
241                    lower: Some(proto_primitive_stats::Lower::$lower(
242                        self.lower.into_proto(),
243                    )),
244                    upper: Some(proto_primitive_stats::Upper::$upper(
245                        self.upper.into_proto(),
246                    )),
247                }
248            }
249
250            fn from_proto(proto: ProtoPrimitiveStats) -> Result<Self, TryFromProtoError> {
251                let lower = proto.lower.ok_or_else(|| {
252                    TryFromProtoError::missing_field("ProtoPrimitiveStats::lower")
253                })?;
254                let lower = match lower {
255                    proto_primitive_stats::Lower::$lower(x) => x.into_rust()?,
256                    _ => {
257                        return Err(TryFromProtoError::missing_field(
258                            "proto_primitive_stats::Lower::$lower",
259                        ));
260                    }
261                };
262                let upper = proto
263                    .upper
264                    .ok_or_else(|| TryFromProtoError::missing_field("ProtoPrimitiveStats::max"))?;
265                let upper = match upper {
266                    proto_primitive_stats::Upper::$upper(x) => x.into_rust()?,
267                    _ => {
268                        return Err(TryFromProtoError::missing_field(
269                            "proto_primitive_stats::Upper::$upper",
270                        ));
271                    }
272                };
273                Ok(PrimitiveStats { lower, upper })
274            }
275        }
276    };
277}
278
279primitive_stats_rust_type!(bool, LowerBool, UpperBool);
280primitive_stats_rust_type!(u8, LowerU8, UpperU8);
281primitive_stats_rust_type!(u16, LowerU16, UpperU16);
282primitive_stats_rust_type!(u32, LowerU32, UpperU32);
283primitive_stats_rust_type!(u64, LowerU64, UpperU64);
284primitive_stats_rust_type!(i8, LowerI8, UpperI8);
285primitive_stats_rust_type!(i16, LowerI16, UpperI16);
286primitive_stats_rust_type!(i32, LowerI32, UpperI32);
287primitive_stats_rust_type!(i64, LowerI64, UpperI64);
288primitive_stats_rust_type!(f32, LowerF32, UpperF32);
289primitive_stats_rust_type!(f64, LowerF64, UpperF64);
290primitive_stats_rust_type!(String, LowerString, UpperString);
291
292impl RustType<ProtoPrimitiveBytesStats> for PrimitiveStats<Vec<u8>> {
293    fn into_proto(&self) -> ProtoPrimitiveBytesStats {
294        ProtoPrimitiveBytesStats {
295            lower: self.lower.into_proto(),
296            upper: self.upper.into_proto(),
297        }
298    }
299
300    fn from_proto(proto: ProtoPrimitiveBytesStats) -> Result<Self, TryFromProtoError> {
301        let lower = proto.lower.into_rust()?;
302        let upper = proto.upper.into_rust()?;
303        Ok(PrimitiveStats { lower, upper })
304    }
305}
306
307impl TrimStats for ProtoPrimitiveStats {
308    fn trim(&mut self) {
309        use proto_primitive_stats::*;
310        match (&mut self.lower, &mut self.upper) {
311            (Some(Lower::LowerString(lower)), Some(Upper::UpperString(upper))) => {
312                // If the lower and upper strings both look like iso8601
313                // timestamps, then (1) they're small and (2) that's an
314                // extremely high signal that we might want to keep them around
315                // for filtering. We technically could still recover useful
316                // bounds here in the interpret code, but the complexity isn't
317                // worth it, so just skip any trimming.
318                if try_parse_monotonic_iso8601_timestamp(lower).is_some()
319                    && try_parse_monotonic_iso8601_timestamp(upper).is_some()
320                {
321                    return;
322                }
323
324                let common_prefix = lower
325                    .char_indices()
326                    .zip(upper.chars())
327                    .take_while(|((_, x), y)| x == y)
328                    .last();
329                if let Some(((o, x), y)) = common_prefix {
330                    let new_len = o + std::cmp::max(x.len_utf8(), y.len_utf8());
331                    *lower = truncate_string(lower, new_len, TruncateBound::Lower)
332                        .expect("lower bound should always truncate");
333                    if let Some(new_upper) = truncate_string(upper, new_len, TruncateBound::Upper) {
334                        *upper = new_upper;
335                    }
336                }
337            }
338            _ => {}
339        }
340    }
341}
342
343impl TrimStats for ProtoPrimitiveBytesStats {
344    fn trim(&mut self) {
345        let common_prefix = self
346            .lower
347            .iter()
348            .zip(self.upper.iter())
349            .take_while(|(x, y)| x == y)
350            .count();
351        self.lower = truncate_bytes(&self.lower, common_prefix + 1, TruncateBound::Lower)
352            .expect("lower bound should always truncate");
353        if let Some(upper) = truncate_bytes(&self.upper, common_prefix + 1, TruncateBound::Upper) {
354            self.upper = upper;
355        }
356    }
357}
358
359/// Enum wrapper around [`PrimitiveStats`] for each variant that we support.
360#[derive(Debug, Clone)]
361pub enum PrimitiveStatsVariants {
362    /// [`bool`]
363    Bool(PrimitiveStats<bool>),
364    /// [`u8`]
365    U8(PrimitiveStats<u8>),
366    /// [`u16`]
367    U16(PrimitiveStats<u16>),
368    /// [`u32`]
369    U32(PrimitiveStats<u32>),
370    /// [`u64`]
371    U64(PrimitiveStats<u64>),
372    /// [`i8`]
373    I8(PrimitiveStats<i8>),
374    /// [`i16`]
375    I16(PrimitiveStats<i16>),
376    /// [`i32`]
377    I32(PrimitiveStats<i32>),
378    /// [`i64`]
379    I64(PrimitiveStats<i64>),
380    /// [`f32`]
381    F32(PrimitiveStats<f32>),
382    /// [`f64`]
383    F64(PrimitiveStats<f64>),
384    /// [`String`]
385    String(PrimitiveStats<String>),
386}
387
388impl DynStats for PrimitiveStatsVariants {
389    fn debug_json(&self) -> serde_json::Value {
390        use PrimitiveStatsVariants::*;
391
392        match self {
393            Bool(stats) => stats.debug_json(),
394            U8(stats) => stats.debug_json(),
395            U16(stats) => stats.debug_json(),
396            U32(stats) => stats.debug_json(),
397            U64(stats) => stats.debug_json(),
398            I8(stats) => stats.debug_json(),
399            I16(stats) => stats.debug_json(),
400            I32(stats) => stats.debug_json(),
401            I64(stats) => stats.debug_json(),
402            F32(stats) => stats.debug_json(),
403            F64(stats) => stats.debug_json(),
404            String(stats) => stats.debug_json(),
405        }
406    }
407
408    fn into_columnar_stats(self) -> ColumnarStats {
409        ColumnarStats {
410            nulls: None,
411            values: ColumnStatKinds::Primitive(self),
412        }
413    }
414}
415
416impl RustType<ProtoPrimitiveStats> for PrimitiveStatsVariants {
417    fn into_proto(&self) -> ProtoPrimitiveStats {
418        use PrimitiveStatsVariants::*;
419
420        match self {
421            Bool(stats) => RustType::into_proto(stats),
422            U8(stats) => RustType::into_proto(stats),
423            U16(stats) => RustType::into_proto(stats),
424            U32(stats) => RustType::into_proto(stats),
425            U64(stats) => RustType::into_proto(stats),
426            I8(stats) => RustType::into_proto(stats),
427            I16(stats) => RustType::into_proto(stats),
428            I32(stats) => RustType::into_proto(stats),
429            I64(stats) => RustType::into_proto(stats),
430            F32(stats) => RustType::into_proto(stats),
431            F64(stats) => RustType::into_proto(stats),
432            String(stats) => RustType::into_proto(stats),
433        }
434    }
435
436    fn from_proto(proto: ProtoPrimitiveStats) -> Result<Self, mz_proto::TryFromProtoError> {
437        use crate::stats::proto_primitive_stats::{Lower, Upper};
438
439        let stats = match (proto.lower, proto.upper) {
440            (Some(Lower::LowerBool(lower)), Some(Upper::UpperBool(upper))) => {
441                PrimitiveStatsVariants::Bool(PrimitiveStats { lower, upper })
442            }
443            (Some(Lower::LowerU8(lower)), Some(Upper::UpperU8(upper))) => {
444                PrimitiveStatsVariants::U8(PrimitiveStats {
445                    lower: lower.into_rust()?,
446                    upper: upper.into_rust()?,
447                })
448            }
449            (Some(Lower::LowerU16(lower)), Some(Upper::UpperU16(upper))) => {
450                PrimitiveStatsVariants::U16(PrimitiveStats {
451                    lower: lower.into_rust()?,
452                    upper: upper.into_rust()?,
453                })
454            }
455            (Some(Lower::LowerU32(lower)), Some(Upper::UpperU32(upper))) => {
456                PrimitiveStatsVariants::U32(PrimitiveStats { lower, upper })
457            }
458            (Some(Lower::LowerU64(lower)), Some(Upper::UpperU64(upper))) => {
459                PrimitiveStatsVariants::U64(PrimitiveStats { lower, upper })
460            }
461            (Some(Lower::LowerI8(lower)), Some(Upper::UpperI8(upper))) => {
462                PrimitiveStatsVariants::I8(PrimitiveStats {
463                    lower: lower.into_rust()?,
464                    upper: upper.into_rust()?,
465                })
466            }
467            (Some(Lower::LowerI16(lower)), Some(Upper::UpperI16(upper))) => {
468                PrimitiveStatsVariants::I16(PrimitiveStats {
469                    lower: lower.into_rust()?,
470                    upper: upper.into_rust()?,
471                })
472            }
473            (Some(Lower::LowerI32(lower)), Some(Upper::UpperI32(upper))) => {
474                PrimitiveStatsVariants::I32(PrimitiveStats { lower, upper })
475            }
476            (Some(Lower::LowerI64(lower)), Some(Upper::UpperI64(upper))) => {
477                PrimitiveStatsVariants::I64(PrimitiveStats { lower, upper })
478            }
479            (Some(Lower::LowerF32(lower)), Some(Upper::UpperF32(upper))) => {
480                PrimitiveStatsVariants::F32(PrimitiveStats { lower, upper })
481            }
482            (Some(Lower::LowerF64(lower)), Some(Upper::UpperF64(upper))) => {
483                PrimitiveStatsVariants::F64(PrimitiveStats { lower, upper })
484            }
485            (Some(Lower::LowerString(lower)), Some(Upper::UpperString(upper))) => {
486                PrimitiveStatsVariants::String(PrimitiveStats { lower, upper })
487            }
488            (lower, upper) => {
489                return Err(TryFromProtoError::missing_field(format!(
490                    "expected lower and upper to be the same type, found {} and {}",
491                    std::any::type_name_of_val(&lower),
492                    std::any::type_name_of_val(&upper)
493                )));
494            }
495        };
496
497        Ok(stats)
498    }
499}
500
501/// Returns a [`Strategy`] for generating arbitrary [`PrimitiveStats`].
502pub(crate) fn any_primitive_stats<T>() -> impl Strategy<Value = PrimitiveStats<T>>
503where
504    T: Arbitrary + Ord + Serialize,
505    PrimitiveStats<T>: RustType<ProtoPrimitiveStats>,
506{
507    Strategy::prop_map(proptest::arbitrary::any::<(T, T)>(), |(x0, x1)| {
508        if x0 <= x1 {
509            PrimitiveStats {
510                lower: x0,
511                upper: x1,
512            }
513        } else {
514            PrimitiveStats {
515                lower: x1,
516                upper: x0,
517            }
518        }
519    })
520}
521
522/// Returns a [`Strategy`] for generating arbitrary [`PrimitiveStats<Vec<u8>>`].
523pub(crate) fn any_primitive_vec_u8_stats() -> impl Strategy<Value = PrimitiveStats<Vec<u8>>> {
524    Strategy::prop_map(
525        proptest::arbitrary::any::<(Vec<u8>, Vec<u8>)>(),
526        |(x0, x1)| {
527            if x0 <= x1 {
528                PrimitiveStats {
529                    lower: x0,
530                    upper: x1,
531                }
532            } else {
533                PrimitiveStats {
534                    lower: x1,
535                    upper: x0,
536                }
537            }
538        },
539    )
540}
541
542#[cfg(test)]
543mod tests {
544    use arrow::array::{BinaryArray, StringArray};
545    use proptest::prelude::*;
546
547    use super::*;
548    use crate::stats::ColumnarStatsBuilder;
549
550    #[mz_ore::test]
551    fn test_truncate_bytes() {
552        #[track_caller]
553        fn testcase(x: &[u8], max_len: usize, upper_should_exist: bool) {
554            let lower = truncate_bytes(x, max_len, TruncateBound::Lower)
555                .expect("lower should always exist");
556            assert!(lower.len() <= max_len);
557            assert!(lower.as_slice() <= x);
558            let upper = truncate_bytes(x, max_len, TruncateBound::Upper);
559            assert_eq!(upper_should_exist, upper.is_some());
560            if let Some(upper) = upper {
561                assert!(upper.len() <= max_len);
562                assert!(upper.as_slice() >= x);
563            }
564        }
565
566        testcase(&[], 0, true);
567        testcase(&[], 1, true);
568        testcase(&[1], 0, false);
569        testcase(&[1], 1, true);
570        testcase(&[1], 2, true);
571        testcase(&[1, 2], 1, true);
572        testcase(&[1, 255], 2, true);
573        testcase(&[255, 255], 2, true);
574        testcase(&[255, 255, 255], 2, false);
575    }
576
577    #[mz_ore::test]
578    #[cfg_attr(miri, ignore)] // too slow
579    fn test_truncate_bytes_proptest() {
580        fn testcase(x: &[u8]) {
581            for max_len in 0..=x.len() {
582                let lower = truncate_bytes(x, max_len, TruncateBound::Lower)
583                    .expect("lower should always exist");
584                let upper = truncate_bytes(x, max_len, TruncateBound::Upper);
585                assert!(lower.len() <= max_len);
586                assert!(lower.as_slice() <= x);
587                if let Some(upper) = upper {
588                    assert!(upper.len() <= max_len);
589                    assert!(upper.as_slice() >= x);
590                }
591            }
592        }
593
594        proptest!(|(x in any::<Vec<u8>>())| {
595            // The proptest! macro interferes with rustfmt.
596            testcase(x.as_slice())
597        });
598    }
599
600    #[mz_ore::test]
601    fn test_truncate_string() {
602        #[track_caller]
603        fn testcase(x: &str, max_len: usize, upper_should_exist: bool) {
604            let lower = truncate_string(x, max_len, TruncateBound::Lower)
605                .expect("lower should always exist");
606            let upper = truncate_string(x, max_len, TruncateBound::Upper);
607            assert!(lower.len() <= max_len);
608            assert!(lower.as_str() <= x);
609            assert_eq!(upper_should_exist, upper.is_some());
610            if let Some(upper) = upper {
611                assert!(upper.len() <= max_len);
612                assert!(upper.as_str() >= x);
613            }
614        }
615
616        testcase("", 0, true);
617        testcase("1", 0, false);
618        testcase("1", 1, true);
619        testcase("12", 1, true);
620        testcase("⛄", 0, false);
621        testcase("⛄", 1, false);
622        testcase("⛄", 3, true);
623        testcase("\u{10FFFF}", 3, false);
624        testcase("\u{10FFFF}", 4, true);
625        testcase("\u{10FFFF}", 5, true);
626        testcase("⛄⛄", 3, true);
627        testcase("⛄⛄", 4, true);
628        testcase("⛄\u{10FFFF}", 6, true);
629        testcase("⛄\u{10FFFF}", 7, true);
630        testcase("\u{10FFFF}\u{10FFFF}", 7, false);
631        testcase("\u{10FFFF}\u{10FFFF}", 8, true);
632
633        // Just because I find this to be delightful.
634        assert_eq!(
635            truncate_string("⛄⛄", 3, TruncateBound::Upper),
636            Some("⛅".to_string())
637        );
638    }
639
640    #[mz_ore::test]
641    #[cfg_attr(miri, ignore)] // too slow
642    fn test_truncate_string_proptest() {
643        fn testcase(x: &str) {
644            for max_len in 0..=x.len() {
645                let lower = truncate_string(x, max_len, TruncateBound::Lower)
646                    .expect("lower should always exist");
647                let upper = truncate_string(x, max_len, TruncateBound::Upper);
648                assert!(lower.len() <= max_len);
649                assert!(lower.as_str() <= x);
650                if let Some(upper) = upper {
651                    // As explained in a comment in the impl, we don't quite
652                    // treat the max_len as a hard bound here. Give it a little
653                    // wiggle room.
654                    assert!(upper.len() <= max_len + char::MAX.len_utf8());
655                    assert!(upper.as_str() >= x);
656                }
657            }
658        }
659
660        proptest!(|(x in any::<String>())| {
661            // The proptest! macro interferes with rustfmt.
662            testcase(x.as_str())
663        });
664    }
665
666    #[mz_ore::test]
667    #[cfg_attr(miri, ignore)] // too slow
668    fn proptest_cost_trim() {
669        fn primitive_stats<T, A>(vals: &[T]) -> (&[T], PrimitiveStats<T>)
670        where
671            T: Clone,
672            A: arrow::array::Array + From<Vec<T>>,
673            PrimitiveStats<T>: ColumnarStatsBuilder<T, ArrowColumn = A>,
674        {
675            let column = A::from(vals.to_vec());
676            let stats = <PrimitiveStats<T> as ColumnarStatsBuilder<T>>::from_column(&column);
677
678            (vals, stats)
679        }
680
681        fn testcase<T: PartialOrd + Clone + Debug, P>(xs_stats: (&[T], PrimitiveStats<T>))
682        where
683            PrimitiveStats<T>: RustType<P> + DynStats,
684            P: TrimStats,
685        {
686            let (xs, stats) = xs_stats;
687            for x in xs {
688                assert!(&stats.lower <= x);
689                assert!(&stats.upper >= x);
690            }
691
692            let mut proto_stats = RustType::into_proto(&stats);
693            let cost_before = proto_stats.encoded_len();
694            proto_stats.trim();
695            assert!(proto_stats.encoded_len() <= cost_before);
696            let stats: PrimitiveStats<T> = RustType::from_proto(proto_stats).unwrap();
697            for x in xs {
698                assert!(&stats.lower <= x);
699                assert!(&stats.upper >= x);
700            }
701        }
702
703        proptest!(|(a in any::<bool>(), b in any::<bool>())| {
704            testcase(primitive_stats(&[a, b]))
705        });
706        proptest!(|(a in any::<u8>(), b in any::<u8>())| {
707            testcase(primitive_stats(&[a, b]))
708        });
709        proptest!(|(a in any::<u16>(), b in any::<u16>())| {
710            testcase(primitive_stats(&[a, b]))
711        });
712        proptest!(|(a in any::<u32>(), b in any::<u32>())| {
713            testcase(primitive_stats(&[a, b]))
714        });
715        proptest!(|(a in any::<u64>(), b in any::<u64>())| {
716            testcase(primitive_stats(&[a, b]))
717        });
718        proptest!(|(a in any::<i8>(), b in any::<i8>())| {
719            testcase(primitive_stats(&[a, b]))
720        });
721        proptest!(|(a in any::<i16>(), b in any::<i16>())| {
722            testcase(primitive_stats(&[a, b]))
723        });
724        proptest!(|(a in any::<i32>(), b in any::<i32>())| {
725            testcase(primitive_stats(&[a, b]))
726        });
727        proptest!(|(a in any::<i64>(), b in any::<i64>())| {
728            testcase(primitive_stats(&[a, b]))
729        });
730        proptest!(|(a in any::<f32>(), b in any::<f32>())| {
731            testcase(primitive_stats(&[a, b]))
732        });
733        proptest!(|(a in any::<f64>(), b in any::<f64>())| {
734            testcase(primitive_stats(&[a, b]))
735        });
736
737        // Construct strings that are "interesting" in that they have some
738        // (possibly empty) shared prefix.
739        proptest!(|(prefix in any::<String>(), a in any::<String>(), b in any::<String>())| {
740            let vals = &[format!("{}{}", prefix, a), format!("{}{}", prefix, b)];
741            let col = StringArray::from(vals.to_vec());
742            let stats = PrimitiveStats::<String>::from_column(&col);
743            testcase((&vals[..], stats))
744        });
745
746        // Construct strings that are "interesting" in that they have some
747        // (possibly empty) shared prefix.
748        proptest!(|(prefix in any::<Vec<u8>>(), a in any::<Vec<u8>>(), b in any::<Vec<u8>>())| {
749            let mut sa = prefix.clone();
750            sa.extend(&a);
751            let mut sb = prefix;
752            sb.extend(&b);
753            let vals = &[sa, sb];
754            let array = BinaryArray::from_vec(vals.iter().map(|v| v.as_ref()).collect());
755            let stats = PrimitiveStats::<Vec<u8>>::from_column(&array);
756            testcase((vals, stats));
757        });
758    }
759
760    // Regression test for a bug where "lossless" trimming would truncate an
761    // upper and lower bound that both parsed as our special iso8601 timestamps
762    // into something that no longer did.
763    #[mz_ore::test]
764    fn stats_trim_iso8601_recursion() {
765        use proto_primitive_stats::*;
766
767        let orig = PrimitiveStats {
768            lower: "2023-08-19T12:00:00.000Z".to_owned(),
769            upper: "2023-08-20T12:00:00.000Z".to_owned(),
770        };
771        let mut stats = RustType::into_proto(&orig);
772        stats.trim();
773        // Before the fix, this resulted in "2023-08-" and "2023-08.".
774        assert_eq!(stats.lower.unwrap(), Lower::LowerString(orig.lower));
775        assert_eq!(stats.upper.unwrap(), Upper::UpperString(orig.upper));
776    }
777}