mz_persist_types/stats/
primitive.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use std::fmt::{self, Debug};
11
12use mz_ore::str::redact;
13use mz_proto::{ProtoType, RustType, TryFromProtoError};
14use proptest::arbitrary::Arbitrary;
15use proptest::strategy::Strategy;
16use serde::Serialize;
17
18use crate::stats::{
19    BytesStats, ColumnStatKinds, ColumnStats, ColumnarStats, DynStats, OptionStats,
20    ProtoPrimitiveBytesStats, ProtoPrimitiveStats, TrimStats, proto_primitive_stats,
21};
22use crate::timestamp::try_parse_monotonic_iso8601_timestamp;
23
24/// The length to truncate `Vec<u8>` and `String` stats to.
25//
26// Ideally, this would be in LaunchDarkly, but the plumbing is tough.
27pub const TRUNCATE_LEN: usize = 100;
28
29/// Whether a truncated value should be a lower or upper bound on the original.
30pub enum TruncateBound {
31    /// Truncate such that the result is <= the original.
32    Lower,
33    /// Truncate such that the result is >= the original.
34    Upper,
35}
36
37/// Truncates a u8 slice to the given maximum byte length.
38///
39/// If `bound` is Lower, the returned value will sort <= the original value, and
40/// if `bound` is Upper, it will sort >= the original value.
41///
42/// Lower bounds will always return Some. Upper bounds might return None if the
43/// part that fits in `max_len` is entirely made of `u8::MAX`.
44pub fn truncate_bytes(x: &[u8], max_len: usize, bound: TruncateBound) -> Option<Vec<u8>> {
45    if x.len() <= max_len {
46        return Some(x.to_owned());
47    }
48    match bound {
49        // Any truncation is a lower bound.
50        TruncateBound::Lower => Some(x[..max_len].to_owned()),
51        TruncateBound::Upper => {
52            for idx in (0..max_len).rev() {
53                if x[idx] < u8::MAX {
54                    let mut ret = x[..=idx].to_owned();
55                    ret[idx] += 1;
56                    return Some(ret);
57                }
58            }
59            None
60        }
61    }
62}
63
64/// Truncates a string to the given maximum byte length.
65///
66/// The returned value is a valid utf-8 string. If `bound` is Lower, it will
67/// sort <= the original string, and if `bound` is Upper, it will sort >= the
68/// original string.
69///
70/// Lower bounds will always return Some. Upper bounds might return None if the
71/// part that fits in `max_len` is entirely made of `char::MAX` (so in practice,
72/// probably ~never).
73pub fn truncate_string(x: &str, max_len: usize, bound: TruncateBound) -> Option<String> {
74    if x.len() <= max_len {
75        return Some(x.to_owned());
76    }
77    // For the output to be valid utf-8, we have to truncate along a char
78    // boundary.
79    let truncation_idx = x
80        .char_indices()
81        .map(|(idx, c)| idx + c.len_utf8())
82        .take_while(|char_end| *char_end <= max_len)
83        .last()
84        .unwrap_or(0);
85    let truncated = &x[..truncation_idx];
86    match bound {
87        // Any truncation is a lower bound.
88        TruncateBound::Lower => Some(truncated.to_owned()),
89        TruncateBound::Upper => {
90            // See if we can find a char that's not already the max. If so, take
91            // the last of these and increment it.
92            for (idx, c) in truncated.char_indices().rev() {
93                if let Ok(new_last_char) = char::try_from(u32::from(c) + 1) {
94                    // NB: It's technically possible for `new_last_char` to be
95                    // more bytes than `c`, which means we could go over
96                    // max_len. It isn't a hard requirement for the initial
97                    // caller of this, so don't bother with the complexity yet.
98                    let mut ret = String::with_capacity(idx + new_last_char.len_utf8());
99                    ret.push_str(&truncated[..idx]);
100                    ret.push(new_last_char);
101                    return Some(ret);
102                }
103            }
104            None
105        }
106    }
107}
108
109/// Statistics about a primitive non-optional column.
110#[derive(Copy, Clone)]
111pub struct PrimitiveStats<T> {
112    /// An inclusive lower bound on the data contained in the column.
113    ///
114    /// This will often be a tight bound, but it's not guaranteed. Persist
115    /// reserves the right to (for example) invent smaller bounds for long byte
116    /// strings. SUBTLE: This means that this exact value may not be present in
117    /// the column.
118    ///
119    /// Similarly, if the column is empty, this will contain `T: Default`.
120    /// Emptiness will be indicated in statistics higher up (i.e.
121    /// [`crate::stats::StructStats`]).
122    pub lower: T,
123    /// Same as [Self::lower] but an (also inclusive) upper bound.
124    pub upper: T,
125}
126
127impl<T: Serialize> Debug for PrimitiveStats<T>
128where
129    // Note: We introduce this bound so we can have a different impl for PrimitiveStats<Vec<u8>>.
130    PrimitiveStats<T>: RustType<ProtoPrimitiveStats>,
131{
132    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
133        let l = serde_json::to_value(&self.lower).expect("valid json");
134        let u = serde_json::to_value(&self.upper).expect("valid json");
135        f.debug_struct("PrimitiveStats")
136            .field("lower", &redact(l))
137            .field("upper", &redact(u))
138            .finish()
139    }
140}
141
142impl Debug for PrimitiveStats<Vec<u8>> {
143    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
144        f.debug_struct("PrimitiveStats")
145            .field("lower", &redact(hex::encode(&self.lower)))
146            .field("upper", &redact(hex::encode(&self.upper)))
147            .finish()
148    }
149}
150
151impl<T: Serialize> DynStats for PrimitiveStats<T>
152where
153    PrimitiveStats<T>: RustType<ProtoPrimitiveStats>
154        + Into<PrimitiveStatsVariants>
155        + Clone
156        + Send
157        + Sync
158        + 'static,
159{
160    fn debug_json(&self) -> serde_json::Value {
161        let l = serde_json::to_value(&self.lower).expect("valid json");
162        let u = serde_json::to_value(&self.upper).expect("valid json");
163        serde_json::json!({"lower": l, "upper": u})
164    }
165
166    fn into_columnar_stats(self) -> ColumnarStats {
167        ColumnarStats {
168            nulls: None,
169            values: ColumnStatKinds::Primitive(self.into()),
170        }
171    }
172}
173
174impl DynStats for PrimitiveStats<Vec<u8>> {
175    fn debug_json(&self) -> serde_json::Value {
176        serde_json::json!({
177            "lower": hex::encode(&self.lower),
178            "upper": hex::encode(&self.upper),
179        })
180    }
181
182    fn into_columnar_stats(self) -> ColumnarStats {
183        ColumnarStats {
184            nulls: None,
185            values: ColumnStatKinds::Bytes(BytesStats::Primitive(self)),
186        }
187    }
188}
189
190/// This macro implements the [`ColumnStats`] trait for all variants of [`PrimitiveStats`].
191macro_rules! stats_primitive {
192    ($data:ty, $ref_ty:ty, $ref:ident, $variant:ident) => {
193        impl ColumnStats for PrimitiveStats<$data> {
194            type Ref<'a> = $ref_ty;
195
196            fn lower<'a>(&'a self) -> Option<Self::Ref<'a>> {
197                Some(self.lower.$ref())
198            }
199            fn upper<'a>(&'a self) -> Option<Self::Ref<'a>> {
200                Some(self.upper.$ref())
201            }
202            fn none_count(&self) -> usize {
203                0
204            }
205        }
206
207        impl ColumnStats for OptionStats<PrimitiveStats<$data>> {
208            type Ref<'a> = Option<$ref_ty>;
209
210            fn lower<'a>(&'a self) -> Option<Self::Ref<'a>> {
211                Some(self.some.lower())
212            }
213            fn upper<'a>(&'a self) -> Option<Self::Ref<'a>> {
214                Some(self.some.upper())
215            }
216            fn none_count(&self) -> usize {
217                self.none
218            }
219        }
220
221        impl From<PrimitiveStats<$data>> for PrimitiveStatsVariants {
222            fn from(value: PrimitiveStats<$data>) -> Self {
223                PrimitiveStatsVariants::$variant(value)
224            }
225        }
226    };
227}
228
229stats_primitive!(bool, bool, clone, Bool);
230stats_primitive!(u8, u8, clone, U8);
231stats_primitive!(u16, u16, clone, U16);
232stats_primitive!(u32, u32, clone, U32);
233stats_primitive!(u64, u64, clone, U64);
234stats_primitive!(i8, i8, clone, I8);
235stats_primitive!(i16, i16, clone, I16);
236stats_primitive!(i32, i32, clone, I32);
237stats_primitive!(i64, i64, clone, I64);
238stats_primitive!(f32, f32, clone, F32);
239stats_primitive!(f64, f64, clone, F64);
240stats_primitive!(String, &'a str, as_str, String);
241
242/// This macro implements the [`RustType`] trait for all variants of [`PrimitiveStats`].
243macro_rules! primitive_stats_rust_type {
244    ($typ:ty, $lower:ident, $upper:ident) => {
245        impl RustType<ProtoPrimitiveStats> for PrimitiveStats<$typ> {
246            fn into_proto(&self) -> ProtoPrimitiveStats {
247                ProtoPrimitiveStats {
248                    lower: Some(proto_primitive_stats::Lower::$lower(
249                        self.lower.into_proto(),
250                    )),
251                    upper: Some(proto_primitive_stats::Upper::$upper(
252                        self.upper.into_proto(),
253                    )),
254                }
255            }
256
257            fn from_proto(proto: ProtoPrimitiveStats) -> Result<Self, TryFromProtoError> {
258                let lower = proto.lower.ok_or_else(|| {
259                    TryFromProtoError::missing_field("ProtoPrimitiveStats::lower")
260                })?;
261                let lower = match lower {
262                    proto_primitive_stats::Lower::$lower(x) => x.into_rust()?,
263                    _ => {
264                        return Err(TryFromProtoError::missing_field(
265                            "proto_primitive_stats::Lower::$lower",
266                        ));
267                    }
268                };
269                let upper = proto
270                    .upper
271                    .ok_or_else(|| TryFromProtoError::missing_field("ProtoPrimitiveStats::max"))?;
272                let upper = match upper {
273                    proto_primitive_stats::Upper::$upper(x) => x.into_rust()?,
274                    _ => {
275                        return Err(TryFromProtoError::missing_field(
276                            "proto_primitive_stats::Upper::$upper",
277                        ));
278                    }
279                };
280                Ok(PrimitiveStats { lower, upper })
281            }
282        }
283    };
284}
285
286primitive_stats_rust_type!(bool, LowerBool, UpperBool);
287primitive_stats_rust_type!(u8, LowerU8, UpperU8);
288primitive_stats_rust_type!(u16, LowerU16, UpperU16);
289primitive_stats_rust_type!(u32, LowerU32, UpperU32);
290primitive_stats_rust_type!(u64, LowerU64, UpperU64);
291primitive_stats_rust_type!(i8, LowerI8, UpperI8);
292primitive_stats_rust_type!(i16, LowerI16, UpperI16);
293primitive_stats_rust_type!(i32, LowerI32, UpperI32);
294primitive_stats_rust_type!(i64, LowerI64, UpperI64);
295primitive_stats_rust_type!(f32, LowerF32, UpperF32);
296primitive_stats_rust_type!(f64, LowerF64, UpperF64);
297primitive_stats_rust_type!(String, LowerString, UpperString);
298
299impl RustType<ProtoPrimitiveBytesStats> for PrimitiveStats<Vec<u8>> {
300    fn into_proto(&self) -> ProtoPrimitiveBytesStats {
301        ProtoPrimitiveBytesStats {
302            lower: self.lower.into_proto(),
303            upper: self.upper.into_proto(),
304        }
305    }
306
307    fn from_proto(proto: ProtoPrimitiveBytesStats) -> Result<Self, TryFromProtoError> {
308        let lower = proto.lower.into_rust()?;
309        let upper = proto.upper.into_rust()?;
310        Ok(PrimitiveStats { lower, upper })
311    }
312}
313
314impl TrimStats for ProtoPrimitiveStats {
315    fn trim(&mut self) {
316        use proto_primitive_stats::*;
317        match (&mut self.lower, &mut self.upper) {
318            (Some(Lower::LowerString(lower)), Some(Upper::UpperString(upper))) => {
319                // If the lower and upper strings both look like iso8601
320                // timestamps, then (1) they're small and (2) that's an
321                // extremely high signal that we might want to keep them around
322                // for filtering. We technically could still recover useful
323                // bounds here in the interpret code, but the complexity isn't
324                // worth it, so just skip any trimming.
325                if try_parse_monotonic_iso8601_timestamp(lower).is_some()
326                    && try_parse_monotonic_iso8601_timestamp(upper).is_some()
327                {
328                    return;
329                }
330
331                #[allow(clippy::disallowed_methods)]
332                let common_prefix = lower
333                    .char_indices()
334                    .zip(upper.chars())
335                    .take_while(|((_, x), y)| x == y)
336                    .last();
337                if let Some(((o, x), y)) = common_prefix {
338                    let new_len = o + std::cmp::max(x.len_utf8(), y.len_utf8());
339                    *lower = truncate_string(lower, new_len, TruncateBound::Lower)
340                        .expect("lower bound should always truncate");
341                    if let Some(new_upper) = truncate_string(upper, new_len, TruncateBound::Upper) {
342                        *upper = new_upper;
343                    }
344                }
345            }
346            _ => {}
347        }
348    }
349}
350
351impl TrimStats for ProtoPrimitiveBytesStats {
352    fn trim(&mut self) {
353        #[allow(clippy::disallowed_methods)]
354        let common_prefix = self
355            .lower
356            .iter()
357            .zip(self.upper.iter())
358            .take_while(|(x, y)| x == y)
359            .count();
360        self.lower = truncate_bytes(&self.lower, common_prefix + 1, TruncateBound::Lower)
361            .expect("lower bound should always truncate");
362        if let Some(upper) = truncate_bytes(&self.upper, common_prefix + 1, TruncateBound::Upper) {
363            self.upper = upper;
364        }
365    }
366}
367
368/// Enum wrapper around [`PrimitiveStats`] for each variant that we support.
369#[derive(Debug, Clone)]
370pub enum PrimitiveStatsVariants {
371    /// [`bool`]
372    Bool(PrimitiveStats<bool>),
373    /// [`u8`]
374    U8(PrimitiveStats<u8>),
375    /// [`u16`]
376    U16(PrimitiveStats<u16>),
377    /// [`u32`]
378    U32(PrimitiveStats<u32>),
379    /// [`u64`]
380    U64(PrimitiveStats<u64>),
381    /// [`i8`]
382    I8(PrimitiveStats<i8>),
383    /// [`i16`]
384    I16(PrimitiveStats<i16>),
385    /// [`i32`]
386    I32(PrimitiveStats<i32>),
387    /// [`i64`]
388    I64(PrimitiveStats<i64>),
389    /// [`f32`]
390    F32(PrimitiveStats<f32>),
391    /// [`f64`]
392    F64(PrimitiveStats<f64>),
393    /// [`String`]
394    String(PrimitiveStats<String>),
395}
396
397impl DynStats for PrimitiveStatsVariants {
398    fn debug_json(&self) -> serde_json::Value {
399        use PrimitiveStatsVariants::*;
400
401        match self {
402            Bool(stats) => stats.debug_json(),
403            U8(stats) => stats.debug_json(),
404            U16(stats) => stats.debug_json(),
405            U32(stats) => stats.debug_json(),
406            U64(stats) => stats.debug_json(),
407            I8(stats) => stats.debug_json(),
408            I16(stats) => stats.debug_json(),
409            I32(stats) => stats.debug_json(),
410            I64(stats) => stats.debug_json(),
411            F32(stats) => stats.debug_json(),
412            F64(stats) => stats.debug_json(),
413            String(stats) => stats.debug_json(),
414        }
415    }
416
417    fn into_columnar_stats(self) -> ColumnarStats {
418        ColumnarStats {
419            nulls: None,
420            values: ColumnStatKinds::Primitive(self),
421        }
422    }
423}
424
425impl RustType<ProtoPrimitiveStats> for PrimitiveStatsVariants {
426    fn into_proto(&self) -> ProtoPrimitiveStats {
427        use PrimitiveStatsVariants::*;
428
429        match self {
430            Bool(stats) => RustType::into_proto(stats),
431            U8(stats) => RustType::into_proto(stats),
432            U16(stats) => RustType::into_proto(stats),
433            U32(stats) => RustType::into_proto(stats),
434            U64(stats) => RustType::into_proto(stats),
435            I8(stats) => RustType::into_proto(stats),
436            I16(stats) => RustType::into_proto(stats),
437            I32(stats) => RustType::into_proto(stats),
438            I64(stats) => RustType::into_proto(stats),
439            F32(stats) => RustType::into_proto(stats),
440            F64(stats) => RustType::into_proto(stats),
441            String(stats) => RustType::into_proto(stats),
442        }
443    }
444
445    fn from_proto(proto: ProtoPrimitiveStats) -> Result<Self, mz_proto::TryFromProtoError> {
446        use crate::stats::proto_primitive_stats::{Lower, Upper};
447
448        let stats = match (proto.lower, proto.upper) {
449            (Some(Lower::LowerBool(lower)), Some(Upper::UpperBool(upper))) => {
450                PrimitiveStatsVariants::Bool(PrimitiveStats { lower, upper })
451            }
452            (Some(Lower::LowerU8(lower)), Some(Upper::UpperU8(upper))) => {
453                PrimitiveStatsVariants::U8(PrimitiveStats {
454                    lower: lower.into_rust()?,
455                    upper: upper.into_rust()?,
456                })
457            }
458            (Some(Lower::LowerU16(lower)), Some(Upper::UpperU16(upper))) => {
459                PrimitiveStatsVariants::U16(PrimitiveStats {
460                    lower: lower.into_rust()?,
461                    upper: upper.into_rust()?,
462                })
463            }
464            (Some(Lower::LowerU32(lower)), Some(Upper::UpperU32(upper))) => {
465                PrimitiveStatsVariants::U32(PrimitiveStats { lower, upper })
466            }
467            (Some(Lower::LowerU64(lower)), Some(Upper::UpperU64(upper))) => {
468                PrimitiveStatsVariants::U64(PrimitiveStats { lower, upper })
469            }
470            (Some(Lower::LowerI8(lower)), Some(Upper::UpperI8(upper))) => {
471                PrimitiveStatsVariants::I8(PrimitiveStats {
472                    lower: lower.into_rust()?,
473                    upper: upper.into_rust()?,
474                })
475            }
476            (Some(Lower::LowerI16(lower)), Some(Upper::UpperI16(upper))) => {
477                PrimitiveStatsVariants::I16(PrimitiveStats {
478                    lower: lower.into_rust()?,
479                    upper: upper.into_rust()?,
480                })
481            }
482            (Some(Lower::LowerI32(lower)), Some(Upper::UpperI32(upper))) => {
483                PrimitiveStatsVariants::I32(PrimitiveStats { lower, upper })
484            }
485            (Some(Lower::LowerI64(lower)), Some(Upper::UpperI64(upper))) => {
486                PrimitiveStatsVariants::I64(PrimitiveStats { lower, upper })
487            }
488            (Some(Lower::LowerF32(lower)), Some(Upper::UpperF32(upper))) => {
489                PrimitiveStatsVariants::F32(PrimitiveStats { lower, upper })
490            }
491            (Some(Lower::LowerF64(lower)), Some(Upper::UpperF64(upper))) => {
492                PrimitiveStatsVariants::F64(PrimitiveStats { lower, upper })
493            }
494            (Some(Lower::LowerString(lower)), Some(Upper::UpperString(upper))) => {
495                PrimitiveStatsVariants::String(PrimitiveStats { lower, upper })
496            }
497            (lower, upper) => {
498                return Err(TryFromProtoError::missing_field(format!(
499                    "expected lower and upper to be the same type, found {} and {}",
500                    std::any::type_name_of_val(&lower),
501                    std::any::type_name_of_val(&upper)
502                )));
503            }
504        };
505
506        Ok(stats)
507    }
508}
509
510/// Returns a [`Strategy`] for generating arbitrary [`PrimitiveStats`].
511pub(crate) fn any_primitive_stats<T>() -> impl Strategy<Value = PrimitiveStats<T>>
512where
513    T: Arbitrary + Ord + Serialize,
514    PrimitiveStats<T>: RustType<ProtoPrimitiveStats>,
515{
516    Strategy::prop_map(proptest::arbitrary::any::<(T, T)>(), |(x0, x1)| {
517        if x0 <= x1 {
518            PrimitiveStats {
519                lower: x0,
520                upper: x1,
521            }
522        } else {
523            PrimitiveStats {
524                lower: x1,
525                upper: x0,
526            }
527        }
528    })
529}
530
531/// Returns a [`Strategy`] for generating arbitrary [`PrimitiveStats<Vec<u8>>`].
532pub(crate) fn any_primitive_vec_u8_stats() -> impl Strategy<Value = PrimitiveStats<Vec<u8>>> {
533    Strategy::prop_map(
534        proptest::arbitrary::any::<(Vec<u8>, Vec<u8>)>(),
535        |(x0, x1)| {
536            if x0 <= x1 {
537                PrimitiveStats {
538                    lower: x0,
539                    upper: x1,
540                }
541            } else {
542                PrimitiveStats {
543                    lower: x1,
544                    upper: x0,
545                }
546            }
547        },
548    )
549}
550
551#[cfg(test)]
552mod tests {
553    use arrow::array::{BinaryArray, StringArray};
554    use proptest::prelude::*;
555
556    use super::*;
557    use crate::stats::ColumnarStatsBuilder;
558
559    #[mz_ore::test]
560    fn test_truncate_bytes() {
561        #[track_caller]
562        fn testcase(x: &[u8], max_len: usize, upper_should_exist: bool) {
563            let lower = truncate_bytes(x, max_len, TruncateBound::Lower)
564                .expect("lower should always exist");
565            assert!(lower.len() <= max_len);
566            assert!(lower.as_slice() <= x);
567            let upper = truncate_bytes(x, max_len, TruncateBound::Upper);
568            assert_eq!(upper_should_exist, upper.is_some());
569            if let Some(upper) = upper {
570                assert!(upper.len() <= max_len);
571                assert!(upper.as_slice() >= x);
572            }
573        }
574
575        testcase(&[], 0, true);
576        testcase(&[], 1, true);
577        testcase(&[1], 0, false);
578        testcase(&[1], 1, true);
579        testcase(&[1], 2, true);
580        testcase(&[1, 2], 1, true);
581        testcase(&[1, 255], 2, true);
582        testcase(&[255, 255], 2, true);
583        testcase(&[255, 255, 255], 2, false);
584    }
585
586    #[mz_ore::test]
587    #[cfg_attr(miri, ignore)] // too slow
588    fn test_truncate_bytes_proptest() {
589        fn testcase(x: &[u8]) {
590            for max_len in 0..=x.len() {
591                let lower = truncate_bytes(x, max_len, TruncateBound::Lower)
592                    .expect("lower should always exist");
593                let upper = truncate_bytes(x, max_len, TruncateBound::Upper);
594                assert!(lower.len() <= max_len);
595                assert!(lower.as_slice() <= x);
596                if let Some(upper) = upper {
597                    assert!(upper.len() <= max_len);
598                    assert!(upper.as_slice() >= x);
599                }
600            }
601        }
602
603        proptest!(|(x in any::<Vec<u8>>())| {
604            // The proptest! macro interferes with rustfmt.
605            testcase(x.as_slice())
606        });
607    }
608
609    #[mz_ore::test]
610    fn test_truncate_string() {
611        #[track_caller]
612        fn testcase(x: &str, max_len: usize, upper_should_exist: bool) {
613            let lower = truncate_string(x, max_len, TruncateBound::Lower)
614                .expect("lower should always exist");
615            let upper = truncate_string(x, max_len, TruncateBound::Upper);
616            assert!(lower.len() <= max_len);
617            assert!(lower.as_str() <= x);
618            assert_eq!(upper_should_exist, upper.is_some());
619            if let Some(upper) = upper {
620                assert!(upper.len() <= max_len);
621                assert!(upper.as_str() >= x);
622            }
623        }
624
625        testcase("", 0, true);
626        testcase("1", 0, false);
627        testcase("1", 1, true);
628        testcase("12", 1, true);
629        testcase("⛄", 0, false);
630        testcase("⛄", 1, false);
631        testcase("⛄", 3, true);
632        testcase("\u{10FFFF}", 3, false);
633        testcase("\u{10FFFF}", 4, true);
634        testcase("\u{10FFFF}", 5, true);
635        testcase("⛄⛄", 3, true);
636        testcase("⛄⛄", 4, true);
637        testcase("⛄\u{10FFFF}", 6, true);
638        testcase("⛄\u{10FFFF}", 7, true);
639        testcase("\u{10FFFF}\u{10FFFF}", 7, false);
640        testcase("\u{10FFFF}\u{10FFFF}", 8, true);
641
642        // Just because I find this to be delightful.
643        assert_eq!(
644            truncate_string("⛄⛄", 3, TruncateBound::Upper),
645            Some("⛅".to_string())
646        );
647    }
648
649    #[mz_ore::test]
650    #[cfg_attr(miri, ignore)] // too slow
651    fn test_truncate_string_proptest() {
652        fn testcase(x: &str) {
653            for max_len in 0..=x.len() {
654                let lower = truncate_string(x, max_len, TruncateBound::Lower)
655                    .expect("lower should always exist");
656                let upper = truncate_string(x, max_len, TruncateBound::Upper);
657                assert!(lower.len() <= max_len);
658                assert!(lower.as_str() <= x);
659                if let Some(upper) = upper {
660                    // As explained in a comment in the impl, we don't quite
661                    // treat the max_len as a hard bound here. Give it a little
662                    // wiggle room.
663                    assert!(upper.len() <= max_len + char::MAX.len_utf8());
664                    assert!(upper.as_str() >= x);
665                }
666            }
667        }
668
669        proptest!(|(x in any::<String>())| {
670            // The proptest! macro interferes with rustfmt.
671            testcase(x.as_str())
672        });
673    }
674
675    #[mz_ore::test]
676    #[cfg_attr(miri, ignore)] // too slow
677    fn proptest_cost_trim() {
678        fn primitive_stats<T, A>(vals: &[T]) -> (&[T], PrimitiveStats<T>)
679        where
680            T: Clone,
681            A: arrow::array::Array + From<Vec<T>>,
682            PrimitiveStats<T>: ColumnarStatsBuilder<T, ArrowColumn = A>,
683        {
684            let column = A::from(vals.to_vec());
685            let stats = <PrimitiveStats<T> as ColumnarStatsBuilder<T>>::from_column(&column);
686
687            (vals, stats)
688        }
689
690        fn testcase<T: PartialOrd + Clone + Debug, P>(xs_stats: (&[T], PrimitiveStats<T>))
691        where
692            PrimitiveStats<T>: RustType<P> + DynStats,
693            P: TrimStats,
694        {
695            let (xs, stats) = xs_stats;
696            for x in xs {
697                assert!(&stats.lower <= x);
698                assert!(&stats.upper >= x);
699            }
700
701            let mut proto_stats = RustType::into_proto(&stats);
702            let cost_before = proto_stats.encoded_len();
703            proto_stats.trim();
704            assert!(proto_stats.encoded_len() <= cost_before);
705            let stats: PrimitiveStats<T> = RustType::from_proto(proto_stats).unwrap();
706            for x in xs {
707                assert!(&stats.lower <= x);
708                assert!(&stats.upper >= x);
709            }
710        }
711
712        proptest!(|(a in any::<bool>(), b in any::<bool>())| {
713            testcase(primitive_stats(&[a, b]))
714        });
715        proptest!(|(a in any::<u8>(), b in any::<u8>())| {
716            testcase(primitive_stats(&[a, b]))
717        });
718        proptest!(|(a in any::<u16>(), b in any::<u16>())| {
719            testcase(primitive_stats(&[a, b]))
720        });
721        proptest!(|(a in any::<u32>(), b in any::<u32>())| {
722            testcase(primitive_stats(&[a, b]))
723        });
724        proptest!(|(a in any::<u64>(), b in any::<u64>())| {
725            testcase(primitive_stats(&[a, b]))
726        });
727        proptest!(|(a in any::<i8>(), b in any::<i8>())| {
728            testcase(primitive_stats(&[a, b]))
729        });
730        proptest!(|(a in any::<i16>(), b in any::<i16>())| {
731            testcase(primitive_stats(&[a, b]))
732        });
733        proptest!(|(a in any::<i32>(), b in any::<i32>())| {
734            testcase(primitive_stats(&[a, b]))
735        });
736        proptest!(|(a in any::<i64>(), b in any::<i64>())| {
737            testcase(primitive_stats(&[a, b]))
738        });
739        proptest!(|(a in any::<f32>(), b in any::<f32>())| {
740            testcase(primitive_stats(&[a, b]))
741        });
742        proptest!(|(a in any::<f64>(), b in any::<f64>())| {
743            testcase(primitive_stats(&[a, b]))
744        });
745
746        // Construct strings that are "interesting" in that they have some
747        // (possibly empty) shared prefix.
748        proptest!(|(prefix in any::<String>(), a in any::<String>(), b in any::<String>())| {
749            let vals = &[format!("{}{}", prefix, a), format!("{}{}", prefix, b)];
750            let col = StringArray::from(vals.to_vec());
751            let stats = PrimitiveStats::<String>::from_column(&col);
752            testcase((&vals[..], stats))
753        });
754
755        // Construct strings that are "interesting" in that they have some
756        // (possibly empty) shared prefix.
757        proptest!(|(prefix in any::<Vec<u8>>(), a in any::<Vec<u8>>(), b in any::<Vec<u8>>())| {
758            let mut sa = prefix.clone();
759            sa.extend(&a);
760            let mut sb = prefix;
761            sb.extend(&b);
762            let vals = &[sa, sb];
763            let array = BinaryArray::from_vec(vals.iter().map(|v| v.as_ref()).collect());
764            let stats = PrimitiveStats::<Vec<u8>>::from_column(&array);
765            testcase((vals, stats));
766        });
767    }
768
769    // Regression test for a bug where "lossless" trimming would truncate an
770    // upper and lower bound that both parsed as our special iso8601 timestamps
771    // into something that no longer did.
772    #[mz_ore::test]
773    fn stats_trim_iso8601_recursion() {
774        use proto_primitive_stats::*;
775
776        let orig = PrimitiveStats {
777            lower: "2023-08-19T12:00:00.000Z".to_owned(),
778            upper: "2023-08-20T12:00:00.000Z".to_owned(),
779        };
780        let mut stats = RustType::into_proto(&orig);
781        stats.trim();
782        // Before the fix, this resulted in "2023-08-" and "2023-08.".
783        assert_eq!(stats.lower.unwrap(), Lower::LowerString(orig.lower));
784        assert_eq!(stats.upper.unwrap(), Upper::UpperString(orig.upper));
785    }
786}