1use std::fmt;
43
44use crate::format::Statistics as TStatistics;
45
46use crate::basic::Type;
47use crate::data_type::private::ParquetValueType;
48use crate::data_type::*;
49use crate::errors::{ParquetError, Result};
50use crate::util::bit_util::FromBytes;
51
52pub(crate) mod private {
53 use super::*;
54
55 pub trait MakeStatistics {
56 fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
57 where
58 Self: Sized;
59 }
60
61 macro_rules! gen_make_statistics {
62 ($value_ty:ty, $stat:ident) => {
63 impl MakeStatistics for $value_ty {
64 fn make_statistics(statistics: ValueStatistics<Self>) -> Statistics
65 where
66 Self: Sized,
67 {
68 Statistics::$stat(statistics)
69 }
70 }
71 };
72 }
73
74 gen_make_statistics!(bool, Boolean);
75 gen_make_statistics!(i32, Int32);
76 gen_make_statistics!(i64, Int64);
77 gen_make_statistics!(Int96, Int96);
78 gen_make_statistics!(f32, Float);
79 gen_make_statistics!(f64, Double);
80 gen_make_statistics!(ByteArray, ByteArray);
81 gen_make_statistics!(FixedLenByteArray, FixedLenByteArray);
82}
83
/// Generates a public constructor named `$ctor` on [`Statistics`].
///
/// The generated function takes `min`/`max` of type `$value` together with the distinct count,
/// null count and deprecation flag, and returns the `Statistics::$variant` variant.
macro_rules! statistics_new_func {
    ($ctor:ident, $value:ty, $variant:ident) => {
        #[doc = concat!("Creates new statistics for `", stringify!($variant), "` column type.")]
        pub fn $ctor(
            min: $value,
            max: $value,
            distinct: Option<u64>,
            nulls: Option<u64>,
            is_deprecated: bool,
        ) -> Self {
            let typed = ValueStatistics::new(min, max, distinct, nulls, is_deprecated);
            Statistics::$variant(typed)
        }
    };
}
105
/// Forwards a zero-argument method call to the typed statistics held by any
/// [`Statistics`] variant.
///
/// `$this` must be a reference to a [`Statistics`] value (typically `self`); `$method` is the
/// method invoked on the inner [`ValueStatistics`].
macro_rules! statistics_enum_func {
    ($this:ident, $method:ident) => {{
        match $this {
            Statistics::Boolean(inner) => inner.$method(),
            Statistics::Int32(inner) => inner.$method(),
            Statistics::Int64(inner) => inner.$method(),
            Statistics::Int96(inner) => inner.$method(),
            Statistics::Float(inner) => inner.$method(),
            Statistics::Double(inner) => inner.$method(),
            Statistics::ByteArray(inner) => inner.$method(),
            Statistics::FixedLenByteArray(inner) => inner.$method(),
        }
    }};
}
121
122pub fn from_thrift(
124 physical_type: Type,
125 thrift_stats: Option<TStatistics>,
126) -> Result<Option<Statistics>> {
127 Ok(match thrift_stats {
128 Some(stats) => {
129 let null_count = stats.null_count.unwrap_or(0);
133
134 if null_count < 0 {
135 return Err(ParquetError::General(format!(
136 "Statistics null count is negative {}",
137 null_count
138 )));
139 }
140
141 let null_count = Some(null_count as u64);
143 let distinct_count = stats.distinct_count.map(|value| value as u64);
145 let old_format = stats.min_value.is_none() && stats.max_value.is_none();
147 let min = if old_format {
149 stats.min
150 } else {
151 stats.min_value
152 };
153 let max = if old_format {
155 stats.max
156 } else {
157 stats.max_value
158 };
159
160 let res = match physical_type {
165 Type::BOOLEAN => Statistics::boolean(
166 min.map(|data| data[0] != 0),
167 max.map(|data| data[0] != 0),
168 distinct_count,
169 null_count,
170 old_format,
171 ),
172 Type::INT32 => Statistics::int32(
173 min.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
174 max.map(|data| i32::from_le_bytes(data[..4].try_into().unwrap())),
175 distinct_count,
176 null_count,
177 old_format,
178 ),
179 Type::INT64 => Statistics::int64(
180 min.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
181 max.map(|data| i64::from_le_bytes(data[..8].try_into().unwrap())),
182 distinct_count,
183 null_count,
184 old_format,
185 ),
186 Type::INT96 => {
187 let min = if let Some(data) = min {
191 assert_eq!(data.len(), 12);
192 Some(Int96::try_from_le_slice(&data)?)
193 } else {
194 None
195 };
196 let max = if let Some(data) = max {
197 assert_eq!(data.len(), 12);
198 Some(Int96::try_from_le_slice(&data)?)
199 } else {
200 None
201 };
202 Statistics::int96(min, max, distinct_count, null_count, old_format)
203 }
204 Type::FLOAT => Statistics::float(
205 min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
206 max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())),
207 distinct_count,
208 null_count,
209 old_format,
210 ),
211 Type::DOUBLE => Statistics::double(
212 min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
213 max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())),
214 distinct_count,
215 null_count,
216 old_format,
217 ),
218 Type::BYTE_ARRAY => Statistics::ByteArray(
219 ValueStatistics::new(
220 min.map(ByteArray::from),
221 max.map(ByteArray::from),
222 distinct_count,
223 null_count,
224 old_format,
225 )
226 .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
227 .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
228 ),
229 Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray(
230 ValueStatistics::new(
231 min.map(ByteArray::from).map(FixedLenByteArray::from),
232 max.map(ByteArray::from).map(FixedLenByteArray::from),
233 distinct_count,
234 null_count,
235 old_format,
236 )
237 .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
238 .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
239 ),
240 };
241
242 Some(res)
243 }
244 None => None,
245 })
246}
247
248pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> {
250 let stats = stats?;
251
252 let null_count = stats
254 .null_count_opt()
255 .and_then(|value| i64::try_from(value).ok());
256
257 let distinct_count = stats
259 .distinct_count_opt()
260 .and_then(|value| i64::try_from(value).ok());
261
262 let mut thrift_stats = TStatistics {
263 max: None,
264 min: None,
265 null_count,
266 distinct_count,
267 max_value: None,
268 min_value: None,
269 is_max_value_exact: None,
270 is_min_value_exact: None,
271 };
272
273 let (min, max, min_exact, max_exact) = (
275 stats.min_bytes_opt().map(|x| x.to_vec()),
276 stats.max_bytes_opt().map(|x| x.to_vec()),
277 Some(stats.min_is_exact()),
278 Some(stats.max_is_exact()),
279 );
280 if stats.is_min_max_backwards_compatible() {
281 thrift_stats.min.clone_from(&min);
283 thrift_stats.max.clone_from(&max);
284 }
285
286 if !stats.is_min_max_deprecated() {
287 thrift_stats.min_value = min;
288 thrift_stats.max_value = max;
289 }
290
291 thrift_stats.is_min_value_exact = min_exact;
292 thrift_stats.is_max_value_exact = max_exact;
293
294 Some(thrift_stats)
295}
296
/// Column statistics, with one variant per Parquet physical type.
///
/// Each variant wraps the [`ValueStatistics`] for the value type used by that physical type.
#[derive(Debug, Clone, PartialEq)]
pub enum Statistics {
    /// Statistics for a `BOOLEAN` column.
    Boolean(ValueStatistics<bool>),
    /// Statistics for an `INT32` column.
    Int32(ValueStatistics<i32>),
    /// Statistics for an `INT64` column.
    Int64(ValueStatistics<i64>),
    /// Statistics for an `INT96` column.
    Int96(ValueStatistics<Int96>),
    /// Statistics for a `FLOAT` column.
    Float(ValueStatistics<f32>),
    /// Statistics for a `DOUBLE` column.
    Double(ValueStatistics<f64>),
    /// Statistics for a `BYTE_ARRAY` column.
    ByteArray(ValueStatistics<ByteArray>),
    /// Statistics for a `FIXED_LEN_BYTE_ARRAY` column.
    FixedLenByteArray(ValueStatistics<FixedLenByteArray>),
}
327
328impl<T: ParquetValueType> From<ValueStatistics<T>> for Statistics {
329 fn from(t: ValueStatistics<T>) -> Self {
330 T::make_statistics(t)
331 }
332}
333
334impl Statistics {
335 pub fn new<T: ParquetValueType>(
337 min: Option<T>,
338 max: Option<T>,
339 distinct_count: Option<u64>,
340 null_count: Option<u64>,
341 is_deprecated: bool,
342 ) -> Self {
343 Self::from(ValueStatistics::new(
344 min,
345 max,
346 distinct_count,
347 null_count,
348 is_deprecated,
349 ))
350 }
351
352 statistics_new_func![boolean, Option<bool>, Boolean];
353
354 statistics_new_func![int32, Option<i32>, Int32];
355
356 statistics_new_func![int64, Option<i64>, Int64];
357
358 statistics_new_func![int96, Option<Int96>, Int96];
359
360 statistics_new_func![float, Option<f32>, Float];
361
362 statistics_new_func![double, Option<f64>, Double];
363
364 statistics_new_func![byte_array, Option<ByteArray>, ByteArray];
365
366 statistics_new_func![
367 fixed_len_byte_array,
368 Option<FixedLenByteArray>,
369 FixedLenByteArray
370 ];
371
372 pub fn is_min_max_deprecated(&self) -> bool {
379 statistics_enum_func![self, is_min_max_deprecated]
380 }
381
382 pub fn is_min_max_backwards_compatible(&self) -> bool {
393 statistics_enum_func![self, is_min_max_backwards_compatible]
394 }
395
396 #[deprecated(since = "53.0.0", note = "Use `distinct_count_opt` method instead")]
399 pub fn distinct_count(&self) -> Option<u64> {
400 self.distinct_count_opt()
401 }
402
403 pub fn distinct_count_opt(&self) -> Option<u64> {
406 statistics_enum_func![self, distinct_count]
407 }
408
409 #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")]
414 pub fn null_count(&self) -> u64 {
415 self.null_count_opt().unwrap_or(0)
417 }
418
419 #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")]
421 #[allow(deprecated)]
422 pub fn has_nulls(&self) -> bool {
423 self.null_count() > 0
424 }
425
426 pub fn null_count_opt(&self) -> Option<u64> {
433 statistics_enum_func![self, null_count_opt]
434 }
435
436 #[deprecated(
439 since = "53.0.0",
440 note = "Use `min_bytes_opt` and `max_bytes_opt` methods instead"
441 )]
442 pub fn has_min_max_set(&self) -> bool {
443 statistics_enum_func![self, _internal_has_min_max_set]
444 }
445
446 pub fn min_is_exact(&self) -> bool {
448 statistics_enum_func![self, min_is_exact]
449 }
450
451 pub fn max_is_exact(&self) -> bool {
453 statistics_enum_func![self, max_is_exact]
454 }
455
456 pub fn min_bytes_opt(&self) -> Option<&[u8]> {
458 statistics_enum_func![self, min_bytes_opt]
459 }
460
461 #[deprecated(since = "53.0.0", note = "Use `max_bytes_opt` instead")]
464 pub fn min_bytes(&self) -> &[u8] {
465 self.min_bytes_opt().unwrap()
466 }
467
468 pub fn max_bytes_opt(&self) -> Option<&[u8]> {
470 statistics_enum_func![self, max_bytes_opt]
471 }
472
473 #[deprecated(since = "53.0.0", note = "Use `max_bytes_opt` instead")]
476 pub fn max_bytes(&self) -> &[u8] {
477 self.max_bytes_opt().unwrap()
478 }
479
480 pub fn physical_type(&self) -> Type {
482 match self {
483 Statistics::Boolean(_) => Type::BOOLEAN,
484 Statistics::Int32(_) => Type::INT32,
485 Statistics::Int64(_) => Type::INT64,
486 Statistics::Int96(_) => Type::INT96,
487 Statistics::Float(_) => Type::FLOAT,
488 Statistics::Double(_) => Type::DOUBLE,
489 Statistics::ByteArray(_) => Type::BYTE_ARRAY,
490 Statistics::FixedLenByteArray(_) => Type::FIXED_LEN_BYTE_ARRAY,
491 }
492 }
493}
494
495impl fmt::Display for Statistics {
496 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
497 match self {
498 Statistics::Boolean(typed) => write!(f, "{typed}"),
499 Statistics::Int32(typed) => write!(f, "{typed}"),
500 Statistics::Int64(typed) => write!(f, "{typed}"),
501 Statistics::Int96(typed) => write!(f, "{typed}"),
502 Statistics::Float(typed) => write!(f, "{typed}"),
503 Statistics::Double(typed) => write!(f, "{typed}"),
504 Statistics::ByteArray(typed) => write!(f, "{typed}"),
505 Statistics::FixedLenByteArray(typed) => write!(f, "{typed}"),
506 }
507 }
508}
509
/// [`ValueStatistics`] for the value type `T::T` associated with the [`DataType`] `T`.
pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;
512
/// Statistics for the values of a single type `T`.
///
/// All fields are private; they are set through [`ValueStatistics::new`] and the `with_*`
/// builder methods and read through the accessor methods.
#[derive(Clone, Eq, PartialEq)]
pub struct ValueStatistics<T> {
    /// Minimum value, if set.
    min: Option<T>,
    /// Maximum value, if set.
    max: Option<T>,
    /// Number of distinct values, if set.
    distinct_count: Option<u64>,
    /// Number of null values, if set.
    null_count: Option<u64>,

    /// Whether `max` is flagged as exact; only reported by `max_is_exact` when `max` is set.
    is_max_value_exact: bool,
    /// Whether `min` is flagged as exact; only reported by `min_is_exact` when `min` is set.
    is_min_value_exact: bool,

    /// Whether the min/max values are flagged as deprecated.
    is_min_max_deprecated: bool,

    /// Whether the min/max values are flagged as backwards compatible.
    is_min_max_backwards_compatible: bool,
}
536
537impl<T: ParquetValueType> ValueStatistics<T> {
538 pub fn new(
540 min: Option<T>,
541 max: Option<T>,
542 distinct_count: Option<u64>,
543 null_count: Option<u64>,
544 is_min_max_deprecated: bool,
545 ) -> Self {
546 Self {
547 is_max_value_exact: max.is_some(),
548 is_min_value_exact: min.is_some(),
549 min,
550 max,
551 distinct_count,
552 null_count,
553 is_min_max_deprecated,
554 is_min_max_backwards_compatible: is_min_max_deprecated,
555 }
556 }
557
558 pub fn with_min_is_exact(self, is_min_value_exact: bool) -> Self {
563 Self {
564 is_min_value_exact,
565 ..self
566 }
567 }
568
569 pub fn with_max_is_exact(self, is_max_value_exact: bool) -> Self {
574 Self {
575 is_max_value_exact,
576 ..self
577 }
578 }
579
580 pub fn with_backwards_compatible_min_max(self, backwards_compatible: bool) -> Self {
586 Self {
587 is_min_max_backwards_compatible: backwards_compatible,
588 ..self
589 }
590 }
591
592 #[deprecated(since = "53.0.0", note = "Use `min_opt` instead")]
597 pub fn min(&self) -> &T {
598 self.min.as_ref().unwrap()
599 }
600
601 pub fn min_opt(&self) -> Option<&T> {
603 self.min.as_ref()
604 }
605
606 #[deprecated(since = "53.0.0", note = "Use `max_opt` instead")]
611 pub fn max(&self) -> &T {
612 self.max.as_ref().unwrap()
613 }
614
615 pub fn max_opt(&self) -> Option<&T> {
617 self.max.as_ref()
618 }
619
620 pub fn min_bytes_opt(&self) -> Option<&[u8]> {
622 self.min_opt().map(AsBytes::as_bytes)
623 }
624
625 #[deprecated(since = "53.0.0", note = "Use `min_bytes_opt` instead")]
630 pub fn min_bytes(&self) -> &[u8] {
631 self.min_bytes_opt().unwrap()
632 }
633
634 pub fn max_bytes_opt(&self) -> Option<&[u8]> {
636 self.max_opt().map(AsBytes::as_bytes)
637 }
638
639 #[deprecated(since = "53.0.0", note = "Use `max_bytes_opt` instead")]
644 pub fn max_bytes(&self) -> &[u8] {
645 self.max_bytes_opt().unwrap()
646 }
647
648 #[deprecated(since = "53.0.0", note = "Use `min_opt` and `max_opt` methods instead")]
651 pub fn has_min_max_set(&self) -> bool {
652 self._internal_has_min_max_set()
653 }
654
655 pub(crate) fn _internal_has_min_max_set(&self) -> bool {
658 self.min.is_some() && self.max.is_some()
659 }
660
661 pub fn max_is_exact(&self) -> bool {
663 self.max.is_some() && self.is_max_value_exact
664 }
665
666 pub fn min_is_exact(&self) -> bool {
668 self.min.is_some() && self.is_min_value_exact
669 }
670
671 pub fn distinct_count(&self) -> Option<u64> {
673 self.distinct_count
674 }
675
676 #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")]
679 pub fn null_count(&self) -> u64 {
680 self.null_count_opt().unwrap_or(0)
682 }
683
684 pub fn null_count_opt(&self) -> Option<u64> {
686 self.null_count
687 }
688
689 fn is_min_max_deprecated(&self) -> bool {
691 self.is_min_max_deprecated
692 }
693
694 pub fn is_min_max_backwards_compatible(&self) -> bool {
705 self.is_min_max_backwards_compatible
706 }
707}
708
709impl<T: ParquetValueType> fmt::Display for ValueStatistics<T> {
710 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
711 write!(f, "{{")?;
712 write!(f, "min: ")?;
713 match self.min {
714 Some(ref value) => write!(f, "{value}")?,
715 None => write!(f, "N/A")?,
716 }
717 write!(f, ", max: ")?;
718 match self.max {
719 Some(ref value) => write!(f, "{value}")?,
720 None => write!(f, "N/A")?,
721 }
722 write!(f, ", distinct_count: ")?;
723 match self.distinct_count {
724 Some(value) => write!(f, "{value}")?,
725 None => write!(f, "N/A")?,
726 }
727 write!(f, ", null_count: ")?;
728 match self.null_count {
729 Some(value) => write!(f, "{value}")?,
730 None => write!(f, "N/A")?,
731 }
732 write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?;
733 write!(f, ", max_value_exact: {}", self.is_max_value_exact)?;
734 write!(f, ", min_value_exact: {}", self.is_min_value_exact)?;
735 write!(f, "}}")
736 }
737}
738
739impl<T: ParquetValueType> fmt::Debug for ValueStatistics<T> {
740 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
741 write!(
742 f,
743 "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {:?}, \
744 min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}",
745 self.min,
746 self.max,
747 self.distinct_count,
748 self.null_count,
749 self.is_min_max_deprecated,
750 self.is_min_max_backwards_compatible,
751 self.is_max_value_exact,
752 self.is_min_value_exact
753 )
754 }
755}
756
/// Unit tests for statistics construction, formatting, equality and Thrift conversion.
#[cfg(test)]
mod tests {
    use super::*;

    /// Min/max byte accessors return the byte representation of the stored values.
    #[test]
    fn test_statistics_min_max_bytes() {
        let stats = Statistics::int32(Some(-123), Some(234), None, Some(1), false);
        assert_eq!(stats.min_bytes_opt(), Some((-123).as_bytes()));
        assert_eq!(stats.max_bytes_opt(), Some(234.as_bytes()));

        let stats = Statistics::byte_array(
            Some(ByteArray::from(vec![1, 2, 3])),
            Some(ByteArray::from(vec![3, 4, 5])),
            None,
            Some(1),
            true,
        );
        assert_eq!(stats.min_bytes_opt().unwrap(), &[1, 2, 3]);
        assert_eq!(stats.max_bytes_opt().unwrap(), &[3, 4, 5]);
    }

    /// A negative null count makes `from_thrift` return an error; the panic asserted here
    /// comes from unwrapping that error.
    #[test]
    #[should_panic(expected = "General(\"Statistics null count is negative -10\")")]
    fn test_statistics_negative_null_count() {
        let thrift_stats = TStatistics {
            max: None,
            min: None,
            null_count: Some(-10),
            distinct_count: None,
            max_value: None,
            min_value: None,
            is_max_value_exact: None,
            is_min_value_exact: None,
        };

        from_thrift(Type::INT32, Some(thrift_stats)).unwrap();
    }

    /// Absent Thrift statistics convert to `None` regardless of the physical type.
    #[test]
    fn test_statistics_thrift_none() {
        assert_eq!(from_thrift(Type::INT32, None).unwrap(), None);
        assert_eq!(from_thrift(Type::BYTE_ARRAY, None).unwrap(), None);
    }

    /// Pins the exact `Debug` output, with and without min/max values.
    #[test]
    fn test_statistics_debug() {
        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
        assert_eq!(
            format!("{stats:?}"),
            "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: Some(12), \
             min_max_deprecated: true, min_max_backwards_compatible: true, max_value_exact: true, min_value_exact: true})"
        );

        let stats = Statistics::int32(None, None, None, Some(7), false);
        assert_eq!(
            format!("{stats:?}"),
            "Int32({min: None, max: None, distinct_count: None, null_count: Some(7), \
             min_max_deprecated: false, min_max_backwards_compatible: false, max_value_exact: false, min_value_exact: false})"
        )
    }

    /// Pins the exact `Display` output for several variants, including unset values and
    /// explicitly inexact min/max.
    #[test]
    fn test_statistics_display() {
        let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true);
        assert_eq!(
            format!("{stats}"),
            "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
        );

        let stats = Statistics::int64(None, None, None, Some(7), false);
        assert_eq!(
            format!("{stats}"),
            "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \
             false, max_value_exact: false, min_value_exact: false}"
        );

        let stats = Statistics::int96(
            Some(Int96::from(vec![1, 0, 0])),
            Some(Int96::from(vec![2, 3, 4])),
            None,
            Some(3),
            true,
        );
        assert_eq!(
            format!("{stats}"),
            "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \
             min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
        );

        let stats = Statistics::ByteArray(
            ValueStatistics::new(
                Some(ByteArray::from(vec![1u8])),
                Some(ByteArray::from(vec![2u8])),
                Some(5),
                Some(7),
                false,
            )
            .with_max_is_exact(false)
            .with_min_is_exact(false),
        );
        assert_eq!(
            format!("{stats}"),
            "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
        );
    }

    /// Equality takes the variant, every value, the deprecation flag and the exactness
    /// flags into account.
    #[test]
    fn test_statistics_partial_eq() {
        let expected = Statistics::int32(Some(12), Some(45), None, Some(11), true);

        // Each inequality below differs from `expected` in exactly one argument.
        assert!(Statistics::int32(Some(12), Some(45), None, Some(11), true) == expected);
        assert!(Statistics::int32(Some(11), Some(45), None, Some(11), true) != expected);
        assert!(Statistics::int32(Some(12), Some(44), None, Some(11), true) != expected);
        assert!(Statistics::int32(Some(12), Some(45), None, Some(23), true) != expected);
        assert!(Statistics::int32(Some(12), Some(45), None, Some(11), false) != expected);

        // Different variants never compare equal, even with matching arguments.
        assert!(
            Statistics::int32(Some(12), Some(45), None, Some(11), false)
                != Statistics::int64(Some(12), Some(45), None, Some(11), false)
        );

        assert!(
            Statistics::boolean(Some(false), Some(true), None, None, true)
                != Statistics::double(Some(1.2), Some(4.5), None, None, true)
        );

        assert!(
            Statistics::byte_array(
                Some(ByteArray::from(vec![1, 2, 3])),
                Some(ByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true
            ) != Statistics::fixed_len_byte_array(
                Some(ByteArray::from(vec![1, 2, 3]).into()),
                Some(ByteArray::from(vec![1, 2, 3]).into()),
                None,
                None,
                true,
            )
        );

        // Same values, but the max is flagged as inexact on one side.
        assert!(
            Statistics::byte_array(
                Some(ByteArray::from(vec![1, 2, 3])),
                Some(ByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true,
            ) != Statistics::ByteArray(
                ValueStatistics::new(
                    Some(ByteArray::from(vec![1, 2, 3])),
                    Some(ByteArray::from(vec![1, 2, 3])),
                    None,
                    None,
                    true,
                )
                .with_max_is_exact(false)
            )
        );

        // Same values, but the min is flagged as inexact on one side.
        assert!(
            Statistics::fixed_len_byte_array(
                Some(FixedLenByteArray::from(vec![1, 2, 3])),
                Some(FixedLenByteArray::from(vec![1, 2, 3])),
                None,
                None,
                true,
            ) != Statistics::FixedLenByteArray(
                ValueStatistics::new(
                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
                    None,
                    None,
                    true,
                )
                .with_min_is_exact(false)
            )
        );
    }

    /// Statistics of every physical type except INT96 survive a `to_thrift` / `from_thrift`
    /// round trip.
    #[test]
    fn test_statistics_from_thrift() {
        // Helper: convert to Thrift and back, then compare with the input.
        fn check_stats(stats: Statistics) {
            let tpe = stats.physical_type();
            let thrift_stats = to_thrift(Some(&stats));
            assert_eq!(from_thrift(tpe, thrift_stats).unwrap(), Some(stats));
        }

        check_stats(Statistics::boolean(
            Some(false),
            Some(true),
            None,
            Some(7),
            true,
        ));
        check_stats(Statistics::boolean(
            Some(false),
            Some(true),
            None,
            Some(7),
            true,
        ));
        check_stats(Statistics::boolean(
            Some(false),
            Some(true),
            None,
            Some(0),
            false,
        ));
        check_stats(Statistics::boolean(
            Some(true),
            Some(true),
            None,
            Some(7),
            true,
        ));
        check_stats(Statistics::boolean(
            Some(false),
            Some(false),
            None,
            Some(7),
            true,
        ));
        check_stats(Statistics::boolean(None, None, None, Some(7), true));

        check_stats(Statistics::int32(
            Some(-100),
            Some(500),
            None,
            Some(7),
            true,
        ));
        check_stats(Statistics::int32(
            Some(-100),
            Some(500),
            None,
            Some(0),
            false,
        ));
        check_stats(Statistics::int32(None, None, None, Some(7), true));

        check_stats(Statistics::int64(
            Some(-100),
            Some(200),
            None,
            Some(7),
            true,
        ));
        check_stats(Statistics::int64(
            Some(-100),
            Some(200),
            None,
            Some(0),
            false,
        ));
        check_stats(Statistics::int64(None, None, None, Some(7), true));

        check_stats(Statistics::float(Some(1.2), Some(3.4), None, Some(7), true));
        check_stats(Statistics::float(
            Some(1.2),
            Some(3.4),
            None,
            Some(0),
            false,
        ));
        check_stats(Statistics::float(None, None, None, Some(7), true));

        check_stats(Statistics::double(
            Some(1.2),
            Some(3.4),
            None,
            Some(7),
            true,
        ));
        check_stats(Statistics::double(
            Some(1.2),
            Some(3.4),
            None,
            Some(0),
            false,
        ));
        check_stats(Statistics::double(None, None, None, Some(7), true));

        check_stats(Statistics::byte_array(
            Some(ByteArray::from(vec![1, 2, 3])),
            Some(ByteArray::from(vec![3, 4, 5])),
            None,
            Some(7),
            true,
        ));
        check_stats(Statistics::byte_array(None, None, None, Some(7), true));

        check_stats(Statistics::fixed_len_byte_array(
            Some(ByteArray::from(vec![1, 2, 3]).into()),
            Some(ByteArray::from(vec![3, 4, 5]).into()),
            None,
            Some(7),
            true,
        ));
        check_stats(Statistics::fixed_len_byte_array(
            None,
            None,
            None,
            Some(7),
            true,
        ));
    }

    /// Distinct and null counts are encoded to Thrift for set and unset combinations.
    #[test]
    fn test_count_encoding() {
        statistics_count_test(None, None);
        statistics_count_test(Some(0), Some(0));
        statistics_count_test(Some(100), Some(2000));
        statistics_count_test(Some(1), None);
        statistics_count_test(None, Some(1));
    }

    /// A distinct count that does not fit into `i64` is dropped when encoding to Thrift.
    #[test]
    fn test_count_encoding_distinct_too_large() {
        let statistics = make_bool_stats(Some(u64::MAX), Some(100));
        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
        // `u64::MAX` cannot be represented as `i64`, so no distinct count is written.
        assert_eq!(thrift_stats.distinct_count, None);
        assert_eq!(thrift_stats.null_count, Some(100));
    }

    /// A null count that does not fit into `i64` is dropped when encoding to Thrift.
    #[test]
    fn test_count_encoding_null_too_large() {
        let statistics = make_bool_stats(Some(100), Some(u64::MAX));
        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
        assert_eq!(thrift_stats.distinct_count, Some(100));
        // `u64::MAX` cannot be represented as `i64`, so no null count is written.
        assert_eq!(thrift_stats.null_count, None);
    }

    /// A negative null count is reported as an error with a descriptive message.
    #[test]
    fn test_count_decoding_null_invalid() {
        let tstatistics = TStatistics {
            null_count: Some(-42),
            ..Default::default()
        };
        let err = from_thrift(Type::BOOLEAN, Some(tstatistics)).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Parquet error: Statistics null count is negative -42"
        );
    }

    /// Encodes boolean statistics with the given counts to Thrift, checks the encoded counts
    /// and then checks the result of decoding them again.
    fn statistics_count_test(distinct_count: Option<u64>, null_count: Option<u64>) {
        let statistics = make_bool_stats(distinct_count, null_count);

        let thrift_stats = to_thrift(Some(&statistics)).unwrap();
        assert_eq!(thrift_stats.null_count.map(|c| c as u64), null_count);
        assert_eq!(
            thrift_stats.distinct_count.map(|c| c as u64),
            distinct_count
        );

        let round_tripped = from_thrift(Type::BOOLEAN, Some(thrift_stats))
            .unwrap()
            .unwrap();
        if null_count.is_none() {
            // `from_thrift` reads a missing null count as `Some(0)`, so the round-tripped
            // statistics differ from the input in the null count only.
            assert_ne!(round_tripped, statistics);
            assert!(round_tripped.null_count_opt().is_some());
            assert_eq!(round_tripped.null_count_opt(), Some(0));
            assert_eq!(round_tripped.min_bytes_opt(), statistics.min_bytes_opt());
            assert_eq!(round_tripped.max_bytes_opt(), statistics.max_bytes_opt());
            assert_eq!(
                round_tripped.distinct_count_opt(),
                statistics.distinct_count_opt()
            );
        } else {
            assert_eq!(round_tripped, statistics);
        }
    }

    /// Builds non-deprecated boolean statistics with fixed min/max values and the given
    /// counts.
    fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) -> Statistics {
        let min = Some(true);
        let max = Some(false);
        let is_min_max_deprecated = false;

        Statistics::Boolean(ValueStatistics::new(
            min,
            max,
            distinct_count,
            null_count,
            is_min_max_deprecated,
        ))
    }
}