parquet/file/
statistics.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains definitions for working with Parquet statistics.
19//!
20//! Though some common methods are available on enum, use pattern match to extract
21//! actual min and max values from statistics, see below:
22//!
23//! # Examples
24//! ```rust
25//! use parquet::file::statistics::Statistics;
26//!
27//! let stats = Statistics::int32(Some(1), Some(10), None, Some(3), true);
28//! assert_eq!(stats.null_count_opt(), Some(3));
29//! assert!(stats.is_min_max_deprecated());
30//! assert!(stats.min_is_exact());
31//! assert!(stats.max_is_exact());
32//!
33//! match stats {
34//!     Statistics::Int32(ref typed) => {
35//!         assert_eq!(typed.min_opt(), Some(&1));
36//!         assert_eq!(typed.max_opt(), Some(&10));
37//!     }
38//!     _ => {}
39//! }
40//! ```
41
42use std::fmt;
43
44use crate::format::Statistics as TStatistics;
45
46use crate::basic::Type;
47use crate::data_type::private::ParquetValueType;
48use crate::data_type::*;
49use crate::errors::{ParquetError, Result};
50use crate::util::bit_util::FromBytes;
51
52pub(crate) mod private {
53    use super::*;
54
55    pub trait MakeStatistics {
56        fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
57        where
58            Self: Sized;
59    }
60
61    macro_rules! gen_make_statistics {
62        ($value_ty:ty, $stat:ident) => {
63            impl MakeStatistics for $value_ty {
64                fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
65                where
66                    Self: Sized,
67                {
68                    Statistics::$stat(statistics)
69                }
70            }
71        };
72    }
73
74    gen_make_statistics!(bool, Boolean);
75    gen_make_statistics!(i32, Int32);
76    gen_make_statistics!(i64, Int64);
77    gen_make_statistics!(Int96, Int96);
78    gen_make_statistics!(f32, Float);
79    gen_make_statistics!(f64, Double);
80    gen_make_statistics!(ByteArray, ByteArray);
81    gen_make_statistics!(FixedLenByteArray, FixedLenByteArray);
82}
83
/// Generates a named constructor on `Statistics` for a single variant.
///
/// `$func` is the constructor name, `$vtype` the type of the `min`/`max`
/// arguments, and `$stat` the variant that wraps the resulting
/// [`ValueStatistics`].
macro_rules! statistics_new_func {
    ($func:ident, $vtype:ty, $stat:ident) => {
        #[doc = concat!("Creates new statistics for `", stringify!($stat), "` column type.")]
        pub fn $func(
            min: $vtype,
            max: $vtype,
            distinct: Option<u64>,
            nulls: Option<u64>,
            is_deprecated: bool,
        ) -> Self {
            let typed = ValueStatistics::new(min, max, distinct, nulls, is_deprecated);
            Self::$stat(typed)
        }
    };
}
105
// Macro that forwards a no-argument getter to the typed statistics held by
// whichever `Statistics` variant `$self` currently is.
macro_rules! statistics_enum_func {
    ($self:ident, $func:ident) => {{
        match $self {
            Statistics::Boolean(inner) => inner.$func(),
            Statistics::Int32(inner) => inner.$func(),
            Statistics::Int64(inner) => inner.$func(),
            Statistics::Int96(inner) => inner.$func(),
            Statistics::Float(inner) => inner.$func(),
            Statistics::Double(inner) => inner.$func(),
            Statistics::ByteArray(inner) => inner.$func(),
            Statistics::FixedLenByteArray(inner) => inner.$func(),
        }
    }};
}
121
122/// Converts Thrift definition into `Statistics`.
123pub fn from_thrift(
124    physical_type: Type,
125    thrift_stats: Option<TStatistics>,
126) -> Result<Option<Statistics>> {
127    Ok(match thrift_stats {
128        Some(stats) => {
129            // Number of nulls recorded, when it is not available, we just mark it as 0.
130            // TODO this should be `None` if there is no information about NULLS.
131            // see https://github.com/apache/arrow-rs/pull/6216/files
132            let null_count = stats.null_count.unwrap_or(0);
133
134            if null_count < 0 {
135                return Err(ParquetError::General(format!(
136                    "Statistics null count is negative {}",
137                    null_count
138                )));
139            }
140
141            // Generic null count.
142            let null_count = Some(null_count as u64);
143            // Generic distinct count (count of distinct values occurring)
144            let distinct_count = stats.distinct_count.map(|value| value as u64);
145            // Whether or not statistics use deprecated min/max fields.
146            let old_format = stats.min_value.is_none() && stats.max_value.is_none();
147            // Generic min value as bytes.
148            let min = if old_format {
149                stats.min
150            } else {
151                stats.min_value
152            };
153            // Generic max value as bytes.
154            let max = if old_format {
155                stats.max
156            } else {
157                stats.max_value
158            };
159
160            // Values are encoded using PLAIN encoding definition, except that
161            // variable-length byte arrays do not include a length prefix.
162            //
163            // Instead of using actual decoder, we manually convert values.
164            let res = match physical_type {
165                Type::BOOLEAN => Statistics::boolean(
166                    min.map(|data| data[0] != 0),
167                    max.map(|data| data[0] != 0),
168                    distinct_count,
169                    null_count,
170                    old_format,
171                ),
172                Type::INT32 => Statistics::int32(
173                    min.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
174                    max.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
175                    distinct_count,
176                    null_count,
177                    old_format,
178                ),
179                Type::INT64 => Statistics::int64(
180                    min.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
181                    max.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
182                    distinct_count,
183                    null_count,
184                    old_format,
185                ),
186                Type::INT96 => {
187                    // INT96 statistics may not be correct, because comparison is signed
188                    // byte-wise, not actual timestamps. It is recommended to ignore
189                    // min/max statistics for INT96 columns.
190                    let min = if let Some(data) = min {
191                        assert_eq!(data.len(), 12);
192                        Some(Int96::try_from_le_slice(&data)?)
193                    } else {
194                        None
195                    };
196                    let max = if let Some(data) = max {
197                        assert_eq!(data.len(), 12);
198                        Some(Int96::try_from_le_slice(&data)?)
199                    } else {
200                        None
201                    };
202                    Statistics::int96(min, max, distinct_count, null_count, old_format)
203                }
204                Type::FLOAT => Statistics::float(
205                    min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
206                    max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
207                    distinct_count,
208                    null_count,
209                    old_format,
210                ),
211                Type::DOUBLE => Statistics::double(
212                    min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
213                    max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
214                    distinct_count,
215                    null_count,
216                    old_format,
217                ),
218                Type::BYTE_ARRAY => Statistics::ByteArray(
219                    ValueStatistics::new(
220                        min.map(ByteArray::from),
221                        max.map(ByteArray::from),
222                        distinct_count,
223                        null_count,
224                        old_format,
225                    )
226                    .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
227                    .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
228                ),
229                Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray(
230                    ValueStatistics::new(
231                        min.map(ByteArray::from).map(FixedLenByteArray::from),
232                        max.map(ByteArray::from).map(FixedLenByteArray::from),
233                        distinct_count,
234                        null_count,
235                        old_format,
236                    )
237                    .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
238                    .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
239                ),
240            };
241
242            Some(res)
243        }
244        None => None,
245    })
246}
247
248/// Convert Statistics into Thrift definition.
249pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> {
250    let stats = stats?;
251
252    // record null count if it can fit in i64
253    let null_count = stats
254        .null_count_opt()
255        .and_then(|value| i64::try_from(value).ok());
256
257    // record distinct count if it can fit in i64
258    let distinct_count = stats
259        .distinct_count_opt()
260        .and_then(|value| i64::try_from(value).ok());
261
262    let mut thrift_stats = TStatistics {
263        max: None,
264        min: None,
265        null_count,
266        distinct_count,
267        max_value: None,
268        min_value: None,
269        is_max_value_exact: None,
270        is_min_value_exact: None,
271    };
272
273    // Get min/max if set.
274    let (min, max, min_exact, max_exact) = (
275        stats.min_bytes_opt().map(|x| x.to_vec()),
276        stats.max_bytes_opt().map(|x| x.to_vec()),
277        Some(stats.min_is_exact()),
278        Some(stats.max_is_exact()),
279    );
280    if stats.is_min_max_backwards_compatible() {
281        // Copy to deprecated min, max values for compatibility with older readers
282        thrift_stats.min.clone_from(&min);
283        thrift_stats.max.clone_from(&max);
284    }
285
286    if !stats.is_min_max_deprecated() {
287        thrift_stats.min_value = min;
288        thrift_stats.max_value = max;
289    }
290
291    thrift_stats.is_min_value_exact = min_exact;
292    thrift_stats.is_max_value_exact = max_exact;
293
294    Some(thrift_stats)
295}
296
297/// Strongly typed statistics for a column chunk within a row group.
298///
299/// This structure is a natively typed, in memory representation of the
300/// [`Statistics`] structure in a parquet file footer. The statistics stored in
301/// this structure can be used by query engines to skip decoding pages while
302/// reading parquet data.
303///
304/// Page level statistics are stored separately, in [NativeIndex].
305///
306/// [`Statistics`]: crate::format::Statistics
307/// [NativeIndex]: crate::file::page_index::index::NativeIndex
308#[derive(Debug, Clone, PartialEq)]
309pub enum Statistics {
310    /// Statistics for Boolean column
311    Boolean(ValueStatistics<bool>),
312    /// Statistics for Int32 column
313    Int32(ValueStatistics<i32>),
314    /// Statistics for Int64 column
315    Int64(ValueStatistics<i64>),
316    /// Statistics for Int96 column
317    Int96(ValueStatistics<Int96>),
318    /// Statistics for Float column
319    Float(ValueStatistics<f32>),
320    /// Statistics for Double column
321    Double(ValueStatistics<f64>),
322    /// Statistics for ByteArray column
323    ByteArray(ValueStatistics<ByteArray>),
324    /// Statistics for FixedLenByteArray column
325    FixedLenByteArray(ValueStatistics<FixedLenByteArray>),
326}
327
328impl<T: ParquetValueType> From<ValueStatistics<T>> for Statistics {
329    fn from(t: ValueStatistics<T>) -> Self {
330        T::make_statistics(t)
331    }
332}
333
334impl Statistics {
335    /// Creates new statistics for a column type
336    pub fn new<T: ParquetValueType>(
337        min: Option<T>,
338        max: Option<T>,
339        distinct_count: Option<u64>,
340        null_count: Option<u64>,
341        is_deprecated: bool,
342    ) -> Self {
343        Self::from(ValueStatistics::new(
344            min,
345            max,
346            distinct_count,
347            null_count,
348            is_deprecated,
349        ))
350    }
351
352    statistics_new_func![boolean, Option<bool>, Boolean];
353
354    statistics_new_func![int32, Option<i32>, Int32];
355
356    statistics_new_func![int64, Option<i64>, Int64];
357
358    statistics_new_func![int96, Option<Int96>, Int96];
359
360    statistics_new_func![float, Option<f32>, Float];
361
362    statistics_new_func![double, Option<f64>, Double];
363
364    statistics_new_func![byte_array, Option<ByteArray>, ByteArray];
365
366    statistics_new_func![
367        fixed_len_byte_array,
368        Option<FixedLenByteArray>,
369        FixedLenByteArray
370    ];
371
372    /// Returns `true` if statistics have old `min` and `max` fields set.
373    /// This means that the column order is likely to be undefined, which, for old files
374    /// could mean a signed sort order of values.
375    ///
376    /// Refer to [`ColumnOrder`](crate::basic::ColumnOrder) and
377    /// [`SortOrder`](crate::basic::SortOrder) for more information.
378    pub fn is_min_max_deprecated(&self) -> bool {
379        statistics_enum_func![self, is_min_max_deprecated]
380    }
381
382    /// Old versions of parquet stored statistics in `min` and `max` fields, ordered
383    /// using signed comparison. This resulted in an undefined ordering for unsigned
384    /// quantities, such as booleans and unsigned integers.
385    ///
386    /// These fields were therefore deprecated in favour of `min_value` and `max_value`,
387    /// which have a type-defined sort order.
388    ///
389    /// However, not all readers have been updated. For backwards compatibility, this method
390    /// returns `true` if the statistics within this have a signed sort order, that is
391    /// compatible with being stored in the deprecated `min` and `max` fields
392    pub fn is_min_max_backwards_compatible(&self) -> bool {
393        statistics_enum_func![self, is_min_max_backwards_compatible]
394    }
395
396    /// Returns optional value of number of distinct values occurring.
397    /// When it is `None`, the value should be ignored.
398    #[deprecated(since = "53.0.0", note = "Use `distinct_count_opt` method instead")]
399    pub fn distinct_count(&self) -> Option<u64> {
400        self.distinct_count_opt()
401    }
402
403    /// Returns optional value of number of distinct values occurring.
404    /// When it is `None`, the value should be ignored.
405    pub fn distinct_count_opt(&self) -> Option<u64> {
406        statistics_enum_func![self, distinct_count]
407    }
408
409    /// Returns number of null values for the column.
410    /// Note that this includes all nulls when column is part of the complex type.
411    ///
412    /// Note this API returns 0 if the null count is not available.
413    #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")]
414    pub fn null_count(&self) -> u64 {
415        // 0 to remain consistent behavior prior to `null_count_opt`
416        self.null_count_opt().unwrap_or(0)
417    }
418
419    /// Returns `true` if statistics collected any null values, `false` otherwise.
420    #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")]
421    #[allow(deprecated)]
422    pub fn has_nulls(&self) -> bool {
423        self.null_count() > 0
424    }
425
426    /// Returns number of null values for the column, if known.
427    /// Note that this includes all nulls when column is part of the complex type.
428    ///
429    /// Note this API returns Some(0) even if the null count was not present
430    /// in the statistics.
431    /// See <https://github.com/apache/arrow-rs/pull/6216/files>
432    pub fn null_count_opt(&self) -> Option<u64> {
433        statistics_enum_func![self, null_count_opt]
434    }
435
436    /// Whether or not min and max values are set.
437    /// Normally both min/max values will be set to `Some(value)` or `None`.
438    #[deprecated(
439        since = "53.0.0",
440        note = "Use `min_bytes_opt` and `max_bytes_opt` methods instead"
441    )]
442    pub fn has_min_max_set(&self) -> bool {
443        statistics_enum_func![self, _internal_has_min_max_set]
444    }
445
446    /// Returns `true` if the min value is set, and is an exact min value.
447    pub fn min_is_exact(&self) -> bool {
448        statistics_enum_func![self, min_is_exact]
449    }
450
451    /// Returns `true` if the max value is set, and is an exact max value.
452    pub fn max_is_exact(&self) -> bool {
453        statistics_enum_func![self, max_is_exact]
454    }
455
456    /// Returns slice of bytes that represent min value, if min value is known.
457    pub fn min_bytes_opt(&self) -> Option<&[u8]> {
458        statistics_enum_func![self, min_bytes_opt]
459    }
460
461    /// Returns slice of bytes that represent min value.
462    /// Panics if min value is not set.
463    #[deprecated(since = "53.0.0", note = "Use `max_bytes_opt` instead")]
464    pub fn min_bytes(&self) -> &[u8] {
465        self.min_bytes_opt().unwrap()
466    }
467
468    /// Returns slice of bytes that represent max value, if max value is known.
469    pub fn max_bytes_opt(&self) -> Option<&[u8]> {
470        statistics_enum_func![self, max_bytes_opt]
471    }
472
473    /// Returns slice of bytes that represent max value.
474    /// Panics if max value is not set.
475    #[deprecated(since = "53.0.0", note = "Use `max_bytes_opt` instead")]
476    pub fn max_bytes(&self) -> &[u8] {
477        self.max_bytes_opt().unwrap()
478    }
479
480    /// Returns physical type associated with statistics.
481    pub fn physical_type(&self) -> Type {
482        match self {
483            Statistics::Boolean(_) => Type::BOOLEAN,
484            Statistics::Int32(_) => Type::INT32,
485            Statistics::Int64(_) => Type::INT64,
486            Statistics::Int96(_) => Type::INT96,
487            Statistics::Float(_) => Type::FLOAT,
488            Statistics::Double(_) => Type::DOUBLE,
489            Statistics::ByteArray(_) => Type::BYTE_ARRAY,
490            Statistics::FixedLenByteArray(_) => Type::FIXED_LEN_BYTE_ARRAY,
491        }
492    }
493}
494
495impl fmt::Display for Statistics {
496    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
497        match self {
498            Statistics::Boolean(typed) => write!(f, "{typed}"),
499            Statistics::Int32(typed) => write!(f, "{typed}"),
500            Statistics::Int64(typed) => write!(f, "{typed}"),
501            Statistics::Int96(typed) => write!(f, "{typed}"),
502            Statistics::Float(typed) => write!(f, "{typed}"),
503            Statistics::Double(typed) => write!(f, "{typed}"),
504            Statistics::ByteArray(typed) => write!(f, "{typed}"),
505            Statistics::FixedLenByteArray(typed) => write!(f, "{typed}"),
506        }
507    }
508}
509
510/// Typed implementation for [`Statistics`].
511pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;
512
513/// Typed statistics for one column chunk
514///
515/// See [`Statistics`] for more details
516#[derive(Clone, Eq, PartialEq)]
517pub struct ValueStatistics<T> {
518    min: Option<T>,
519    max: Option<T>,
520    // Distinct count could be omitted in some cases
521    distinct_count: Option<u64>,
522    null_count: Option<u64>,
523
524    // Whether or not the min or max values are exact, or truncated.
525    is_max_value_exact: bool,
526    is_min_value_exact: bool,
527
528    /// If `true` populate the deprecated `min` and `max` fields instead of
529    /// `min_value` and `max_value`
530    is_min_max_deprecated: bool,
531
532    /// If `true` the statistics are compatible with the deprecated `min` and
533    /// `max` fields. See [`ValueStatistics::is_min_max_backwards_compatible`]
534    is_min_max_backwards_compatible: bool,
535}
536
537impl<T: ParquetValueType> ValueStatistics<T> {
538    /// Creates new typed statistics.
539    pub fn new(
540        min: Option<T>,
541        max: Option<T>,
542        distinct_count: Option<u64>,
543        null_count: Option<u64>,
544        is_min_max_deprecated: bool,
545    ) -> Self {
546        Self {
547            is_max_value_exact: max.is_some(),
548            is_min_value_exact: min.is_some(),
549            min,
550            max,
551            distinct_count,
552            null_count,
553            is_min_max_deprecated,
554            is_min_max_backwards_compatible: is_min_max_deprecated,
555        }
556    }
557
558    /// Set whether the stored `min` field represents the exact
559    /// minimum, or just a bound on the minimum value.
560    ///
561    /// see [`Self::min_is_exact`]
562    pub fn with_min_is_exact(self, is_min_value_exact: bool) -> Self {
563        Self {
564            is_min_value_exact,
565            ..self
566        }
567    }
568
569    /// Set whether the stored `max` field represents the exact
570    /// maximum, or just a bound on the maximum value.
571    ///
572    /// see [`Self::max_is_exact`]
573    pub fn with_max_is_exact(self, is_max_value_exact: bool) -> Self {
574        Self {
575            is_max_value_exact,
576            ..self
577        }
578    }
579
580    /// Set whether to write the deprecated `min` and `max` fields
581    /// for compatibility with older parquet writers
582    ///
583    /// This should only be enabled if the field is signed,
584    /// see [`Self::is_min_max_backwards_compatible`]
585    pub fn with_backwards_compatible_min_max(self, backwards_compatible: bool) -> Self {
586        Self {
587            is_min_max_backwards_compatible: backwards_compatible,
588            ..self
589        }
590    }
591
592    /// Returns min value of the statistics.
593    ///
594    /// Panics if min value is not set, e.g. all values are `null`.
595    /// Use `has_min_max_set` method to check that.
596    #[deprecated(since = "53.0.0", note = "Use `min_opt` instead")]
597    pub fn min(&self) -> &T {
598        self.min.as_ref().unwrap()
599    }
600
601    /// Returns min value of the statistics, if known.
602    pub fn min_opt(&self) -> Option<&T> {
603        self.min.as_ref()
604    }
605
606    /// Returns max value of the statistics.
607    ///
608    /// Panics if max value is not set, e.g. all values are `null`.
609    /// Use `has_min_max_set` method to check that.
610    #[deprecated(since = "53.0.0", note = "Use `max_opt` instead")]
611    pub fn max(&self) -> &T {
612        self.max.as_ref().unwrap()
613    }
614
615    /// Returns max value of the statistics, if known.
616    pub fn max_opt(&self) -> Option<&T> {
617        self.max.as_ref()
618    }
619
620    /// Returns min value as bytes of the statistics, if min value is known.
621    pub fn min_bytes_opt(&self) -> Option<&[u8]> {
622        self.min_opt().map(AsBytes::as_bytes)
623    }
624
625    /// Returns min value as bytes of the statistics.
626    ///
627    /// Panics if min value is not set, use `has_min_max_set` method to check
628    /// if values are set.
629    #[deprecated(since = "53.0.0", note = "Use `min_bytes_opt` instead")]
630    pub fn min_bytes(&self) -> &[u8] {
631        self.min_bytes_opt().unwrap()
632    }
633
634    /// Returns max value as bytes of the statistics, if max value is known.
635    pub fn max_bytes_opt(&self) -> Option<&[u8]> {
636        self.max_opt().map(AsBytes::as_bytes)
637    }
638
639    /// Returns max value as bytes of the statistics.
640    ///
641    /// Panics if max value is not set, use `has_min_max_set` method to check
642    /// if values are set.
643    #[deprecated(since = "53.0.0", note = "Use `max_bytes_opt` instead")]
644    pub fn max_bytes(&self) -> &[u8] {
645        self.max_bytes_opt().unwrap()
646    }
647
648    /// Whether or not min and max values are set.
649    /// Normally both min/max values will be set to `Some(value)` or `None`.
650    #[deprecated(since = "53.0.0", note = "Use `min_opt` and `max_opt` methods instead")]
651    pub fn has_min_max_set(&self) -> bool {
652        self._internal_has_min_max_set()
653    }
654
655    /// Whether or not min and max values are set.
656    /// Normally both min/max values will be set to `Some(value)` or `None`.
657    pub(crate) fn _internal_has_min_max_set(&self) -> bool {
658        self.min.is_some() && self.max.is_some()
659    }
660
661    /// Whether or not max value is set, and is an exact value.
662    pub fn max_is_exact(&self) -> bool {
663        self.max.is_some() && self.is_max_value_exact
664    }
665
666    /// Whether or not min value is set, and is an exact value.
667    pub fn min_is_exact(&self) -> bool {
668        self.min.is_some() && self.is_min_value_exact
669    }
670
671    /// Returns optional value of number of distinct values occurring.
672    pub fn distinct_count(&self) -> Option<u64> {
673        self.distinct_count
674    }
675
676    /// Returns number of null values for the column.
677    /// Note that this includes all nulls when column is part of the complex type.
678    #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")]
679    pub fn null_count(&self) -> u64 {
680        // 0 to remain consistent behavior prior to `null_count_opt`
681        self.null_count_opt().unwrap_or(0)
682    }
683
684    /// Returns null count.
685    pub fn null_count_opt(&self) -> Option<u64> {
686        self.null_count
687    }
688
689    /// Returns `true` if statistics were created using old min/max fields.
690    fn is_min_max_deprecated(&self) -> bool {
691        self.is_min_max_deprecated
692    }
693
694    /// Old versions of parquet stored statistics in `min` and `max` fields, ordered
695    /// using signed comparison. This resulted in an undefined ordering for unsigned
696    /// quantities, such as booleans and unsigned integers.
697    ///
698    /// These fields were therefore deprecated in favour of `min_value` and `max_value`,
699    /// which have a type-defined sort order.
700    ///
701    /// However, not all readers have been updated. For backwards compatibility, this method
702    /// returns `true` if the statistics within this have a signed sort order, that is
703    /// compatible with being stored in the deprecated `min` and `max` fields
704    pub fn is_min_max_backwards_compatible(&self) -> bool {
705        self.is_min_max_backwards_compatible
706    }
707}
708
709impl<T: ParquetValueType> fmt::Display for ValueStatistics<T> {
710    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
711        write!(f, "{{")?;
712        write!(f, "min: ")?;
713        match self.min {
714            Some(ref value) => write!(f, "{value}")?,
715            None => write!(f, "N/A")?,
716        }
717        write!(f, ", max: ")?;
718        match self.max {
719            Some(ref value) => write!(f, "{value}")?,
720            None => write!(f, "N/A")?,
721        }
722        write!(f, ", distinct_count: ")?;
723        match self.distinct_count {
724            Some(value) => write!(f, "{value}")?,
725            None => write!(f, "N/A")?,
726        }
727        write!(f, ", null_count: ")?;
728        match self.null_count {
729            Some(value) => write!(f, "{value}")?,
730            None => write!(f, "N/A")?,
731        }
732        write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?;
733        write!(f, ", max_value_exact: {}", self.is_max_value_exact)?;
734        write!(f, ", min_value_exact: {}", self.is_min_value_exact)?;
735        write!(f, "}}")
736    }
737}
738
739impl<T: ParquetValueType> fmt::Debug for ValueStatistics<T> {
740    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
741        write!(
742            f,
743            "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {:?}, \
744             min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}",
745            self.min,
746            self.max,
747            self.distinct_count,
748            self.null_count,
749            self.is_min_max_deprecated,
750            self.is_min_max_backwards_compatible,
751            self.is_max_value_exact,
752            self.is_min_value_exact
753        )
754    }
755}
756
757#[cfg(test)]
758mod tests {
759    use super::*;
760
761    #[test]
762    fn test_statistics_min_max_bytes() {
763        let stats = Statistics::int32(Some(-123), Some(234), None, Some(1), false);
764        assert_eq!(stats.min_bytes_opt(), Some((-123).as_bytes()));
765        assert_eq!(stats.max_bytes_opt(), Some(234.as_bytes()));
766
767        let stats = Statistics::byte_array(
768            Some(ByteArray::from(vec![1, 2, 3])),
769            Some(ByteArray::from(vec![3, 4, 5])),
770            None,
771            Some(1),
772            true,
773        );
774        assert_eq!(stats.min_bytes_opt().unwrap(), &[1, 2, 3]);
775        assert_eq!(stats.max_bytes_opt().unwrap(), &[3, 4, 5]);
776    }
777
778    #[test]
779    #[should_panic(expected = "General(\"Statistics null count is negative -10\")")]
780    fn test_statistics_negative_null_count() {
781        let thrift_stats = TStatistics {
782            max: None,
783            min: None,
784            null_count: Some(-10),
785            distinct_count: None,
786            max_value: None,
787            min_value: None,
788            is_max_value_exact: None,
789            is_min_value_exact: None,
790        };
791
792        from_thrift(Type::INT32, Some(thrift_stats)).unwrap();
793    }
794
795    #[test]
796    fn test_statistics_thrift_none() {
797        assert_eq!(from_thrift(Type::INT32, None).unwrap(), None);
798        assert_eq!(from_thrift(Type::BYTE_ARRAY, None).unwrap(), None);
799    }
800
801    #[test]
802    fn test_statistics_debug() {
803        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
804        assert_eq!(
805            format!("{stats:?}"),
806            "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: Some(12), \
807             min_max_deprecated: true, min_max_backwards_compatible: true, max_value_exact: true, min_value_exact: true})"
808        );
809
810        let stats = Statistics::int32(None, None, None, Some(7), false);
811        assert_eq!(
812            format!("{stats:?}"),
813            "Int32({min: None, max: None, distinct_count: None, null_count: Some(7), \
814             min_max_deprecated: false, min_max_backwards_compatible: false, max_value_exact: false, min_value_exact: false})"
815        )
816    }
817
818    #[test]
819    fn test_statistics_display() {
820        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
821        assert_eq!(
822            format!("{stats}"),
823            "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
824        );
825
826        let stats = Statistics::int64(None, None, None, Some(7), false);
827        assert_eq!(
828            format!("{stats}"),
829            "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \
830             false, max_value_exact: false, min_value_exact: false}"
831        );
832
833        let stats = Statistics::int96(
834            Some(Int96::from(vec![1, 0, 0])),
835            Some(Int96::from(vec![2, 3, 4])),
836            None,
837            Some(3),
838            true,
839        );
840        assert_eq!(
841            format!("{stats}"),
842            "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \
843             min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
844        );
845
846        let stats = Statistics::ByteArray(
847            ValueStatistics::new(
848                Some(ByteArray::from(vec![1u8])),
849                Some(ByteArray::from(vec![2u8])),
850                Some(5),
851                Some(7),
852                false,
853            )
854            .with_max_is_exact(false)
855            .with_min_is_exact(false),
856        );
857        assert_eq!(
858            format!("{stats}"),
859            "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
860        );
861    }
862
863    #[test]
864    fn test_statistics_partial_eq() {
865        let expected = Statistics::int32(Some(12), Some(45), None, Some(11), true);
866
867        assert!(Statistics::int32(Some(12), Some(45), None, Some(11), true) == expected);
868        assert!(Statistics::int32(Some(11), Some(45), None, Some(11), true) != expected);
869        assert!(Statistics::int32(Some(12), Some(44), None, Some(11), true) != expected);
870        assert!(Statistics::int32(Some(12), Some(45), None, Some(23), true) != expected);
871        assert!(Statistics::int32(Some(12), Some(45), None, Some(11), false) != expected);
872
873        assert!(
874            Statistics::int32(Some(12), Some(45), None, Some(11), false)
875                != Statistics::int64(Some(12), Some(45), None, Some(11), false)
876        );
877
878        assert!(
879            Statistics::boolean(Some(false), Some(true), None, None, true)
880                != Statistics::double(Some(1.2), Some(4.5), None, None, true)
881        );
882
883        assert!(
884            Statistics::byte_array(
885                Some(ByteArray::from(vec![1, 2, 3])),
886                Some(ByteArray::from(vec![1, 2, 3])),
887                None,
888                None,
889                true
890            ) != Statistics::fixed_len_byte_array(
891                Some(ByteArray::from(vec![1, 2, 3]).into()),
892                Some(ByteArray::from(vec![1, 2, 3]).into()),
893                None,
894                None,
895                true,
896            )
897        );
898
899        assert!(
900            Statistics::byte_array(
901                Some(ByteArray::from(vec![1, 2, 3])),
902                Some(ByteArray::from(vec![1, 2, 3])),
903                None,
904                None,
905                true,
906            ) != Statistics::ByteArray(
907                ValueStatistics::new(
908                    Some(ByteArray::from(vec![1, 2, 3])),
909                    Some(ByteArray::from(vec![1, 2, 3])),
910                    None,
911                    None,
912                    true,
913                )
914                .with_max_is_exact(false)
915            )
916        );
917
918        assert!(
919            Statistics::fixed_len_byte_array(
920                Some(FixedLenByteArray::from(vec![1, 2, 3])),
921                Some(FixedLenByteArray::from(vec![1, 2, 3])),
922                None,
923                None,
924                true,
925            ) != Statistics::FixedLenByteArray(
926                ValueStatistics::new(
927                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
928                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
929                    None,
930                    None,
931                    true,
932                )
933                .with_min_is_exact(false)
934            )
935        );
936    }
937
938    #[test]
939    fn test_statistics_from_thrift() {
940        // Helper method to check statistics conversion.
941        fn check_stats(stats: Statistics) {
942            let tpe = stats.physical_type();
943            let thrift_stats = to_thrift(Some(&stats));
944            assert_eq!(from_thrift(tpe, thrift_stats).unwrap(), Some(stats));
945        }
946
947        check_stats(Statistics::boolean(
948            Some(false),
949            Some(true),
950            None,
951            Some(7),
952            true,
953        ));
954        check_stats(Statistics::boolean(
955            Some(false),
956            Some(true),
957            None,
958            Some(7),
959            true,
960        ));
961        check_stats(Statistics::boolean(
962            Some(false),
963            Some(true),
964            None,
965            Some(0),
966            false,
967        ));
968        check_stats(Statistics::boolean(
969            Some(true),
970            Some(true),
971            None,
972            Some(7),
973            true,
974        ));
975        check_stats(Statistics::boolean(
976            Some(false),
977            Some(false),
978            None,
979            Some(7),
980            true,
981        ));
982        check_stats(Statistics::boolean(None, None, None, Some(7), true));
983
984        check_stats(Statistics::int32(
985            Some(-100),
986            Some(500),
987            None,
988            Some(7),
989            true,
990        ));
991        check_stats(Statistics::int32(
992            Some(-100),
993            Some(500),
994            None,
995            Some(0),
996            false,
997        ));
998        check_stats(Statistics::int32(None, None, None, Some(7), true));
999
1000        check_stats(Statistics::int64(
1001            Some(-100),
1002            Some(200),
1003            None,
1004            Some(7),
1005            true,
1006        ));
1007        check_stats(Statistics::int64(
1008            Some(-100),
1009            Some(200),
1010            None,
1011            Some(0),
1012            false,
1013        ));
1014        check_stats(Statistics::int64(None, None, None, Some(7), true));
1015
1016        check_stats(Statistics::float(Some(1.2), Some(3.4), None, Some(7), true));
1017        check_stats(Statistics::float(
1018            Some(1.2),
1019            Some(3.4),
1020            None,
1021            Some(0),
1022            false,
1023        ));
1024        check_stats(Statistics::float(None, None, None, Some(7), true));
1025
1026        check_stats(Statistics::double(
1027            Some(1.2),
1028            Some(3.4),
1029            None,
1030            Some(7),
1031            true,
1032        ));
1033        check_stats(Statistics::double(
1034            Some(1.2),
1035            Some(3.4),
1036            None,
1037            Some(0),
1038            false,
1039        ));
1040        check_stats(Statistics::double(None, None, None, Some(7), true));
1041
1042        check_stats(Statistics::byte_array(
1043            Some(ByteArray::from(vec![1, 2, 3])),
1044            Some(ByteArray::from(vec![3, 4, 5])),
1045            None,
1046            Some(7),
1047            true,
1048        ));
1049        check_stats(Statistics::byte_array(None, None, None, Some(7), true));
1050
1051        check_stats(Statistics::fixed_len_byte_array(
1052            Some(ByteArray::from(vec![1, 2, 3]).into()),
1053            Some(ByteArray::from(vec![3, 4, 5]).into()),
1054            None,
1055            Some(7),
1056            true,
1057        ));
1058        check_stats(Statistics::fixed_len_byte_array(
1059            None,
1060            None,
1061            None,
1062            Some(7),
1063            true,
1064        ));
1065    }
1066
1067    #[test]
1068    fn test_count_encoding() {
1069        statistics_count_test(None, None);
1070        statistics_count_test(Some(0), Some(0));
1071        statistics_count_test(Some(100), Some(2000));
1072        statistics_count_test(Some(1), None);
1073        statistics_count_test(None, Some(1));
1074    }
1075
1076    #[test]
1077    fn test_count_encoding_distinct_too_large() {
1078        // statistics are stored using i64, so test trying to store larger values
1079        let statistics = make_bool_stats(Some(u64::MAX), Some(100));
1080        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
1081        assert_eq!(thrift_stats.distinct_count, None); // can't store u64 max --> null
1082        assert_eq!(thrift_stats.null_count, Some(100));
1083    }
1084
1085    #[test]
1086    fn test_count_encoding_null_too_large() {
1087        // statistics are stored using i64, so test trying to store larger values
1088        let statistics = make_bool_stats(Some(100), Some(u64::MAX));
1089        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
1090        assert_eq!(thrift_stats.distinct_count, Some(100));
1091        assert_eq!(thrift_stats.null_count, None); // can' store u64 max --> null
1092    }
1093
1094    #[test]
1095    fn test_count_decoding_null_invalid() {
1096        let tstatistics = TStatistics {
1097            null_count: Some(-42),
1098            ..Default::default()
1099        };
1100        let err = from_thrift(Type::BOOLEAN, Some(tstatistics)).unwrap_err();
1101        assert_eq!(
1102            err.to_string(),
1103            "Parquet error: Statistics null count is negative -42"
1104        );
1105    }
1106
1107    /// Writes statistics to thrift and reads them back and ensures:
1108    /// - The statistics are the same
1109    /// - The statistics written to thrift are the same as the original statistics
1110    fn statistics_count_test(distinct_count: Option<u64>, null_count: Option<u64>) {
1111        let statistics = make_bool_stats(distinct_count, null_count);
1112
1113        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
1114        assert_eq!(thrift_stats.null_count.map(|c| c as u64), null_count);
1115        assert_eq!(
1116            thrift_stats.distinct_count.map(|c| c as u64),
1117            distinct_count
1118        );
1119
1120        let round_tripped = from_thrift(Type::BOOLEAN, Some(thrift_stats))
1121            .unwrap()
1122            .unwrap();
1123        // TODO: remove branch when we no longer support assuming null_count==None in the thrift
1124        // means null_count = Some(0)
1125        if null_count.is_none() {
1126            assert_ne!(round_tripped, statistics);
1127            assert!(round_tripped.null_count_opt().is_some());
1128            assert_eq!(round_tripped.null_count_opt(), Some(0));
1129            assert_eq!(round_tripped.min_bytes_opt(), statistics.min_bytes_opt());
1130            assert_eq!(round_tripped.max_bytes_opt(), statistics.max_bytes_opt());
1131            assert_eq!(
1132                round_tripped.distinct_count_opt(),
1133                statistics.distinct_count_opt()
1134            );
1135        } else {
1136            assert_eq!(round_tripped, statistics);
1137        }
1138    }
1139
1140    fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) -> Statistics {
1141        let min = Some(true);
1142        let max = Some(false);
1143        let is_min_max_deprecated = false;
1144
1145        // test is about the counts, so we aren't really testing the min/max values
1146        Statistics::Boolean(ValueStatistics::new(
1147            min,
1148            max,
1149            distinct_count,
1150            null_count,
1151            is_min_max_deprecated,
1152        ))
1153    }
1154}