arrow_schema/datatype.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::fmt;
19use std::str::FromStr;
20use std::sync::Arc;
21
22use crate::{ArrowError, Field, FieldRef, Fields, UnionFields};
23
/// Datatypes supported by this implementation of Apache Arrow.
///
/// The variants of this enum include primitive fixed size types as well as
/// parametric or nested types. See [`Schema.fbs`] for Arrow's specification.
///
/// # Examples
///
/// Primitive types
/// ```
/// # use arrow_schema::DataType;
/// // create a new 32-bit signed integer
/// let data_type = DataType::Int32;
/// ```
///
/// Nested Types
/// ```
/// # use arrow_schema::{DataType, Field};
/// # use std::sync::Arc;
/// // create a new list of 32-bit signed integers directly
/// let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true)));
/// // Create the same list type with constructor
/// let list_data_type2 = DataType::new_list(DataType::Int32, true);
/// assert_eq!(list_data_type, list_data_type2);
/// ```
///
/// Dictionary Types
/// ```
/// # use arrow_schema::{DataType};
/// // String Dictionary (key type Int32 and value type Utf8)
/// let data_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
/// ```
///
/// Timestamp Types
/// ```
/// # use arrow_schema::{DataType, TimeUnit};
/// // timestamp with millisecond precision without timezone specified
/// let data_type = DataType::Timestamp(TimeUnit::Millisecond, None);
/// // timestamp with nanosecond precision in UTC timezone
/// let data_type = DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into()));
/// ```
///
/// # Display and FromStr
///
/// The `Display` and `FromStr` implementations for `DataType` are
/// human-readable, parseable, and reversible.
///
/// ```
/// # use arrow_schema::DataType;
/// let data_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
/// let data_type_string = data_type.to_string();
/// assert_eq!(data_type_string, "Dictionary(Int32, Utf8)");
/// // display can be parsed back into the original type
/// let parsed_data_type: DataType = data_type.to_string().parse().unwrap();
/// assert_eq!(data_type, parsed_data_type);
/// ```
///
/// # Nested Support
/// Currently, the Rust implementation supports the following nested types:
///  - `List<T>`
///  - `LargeList<T>`
///  - `FixedSizeList<T>`
///  - `Struct<T, U, V, ...>`
///  - `Union<T, U, V, ...>`
///  - `Map<K, V>`
///
/// Nested types can themselves be nested within other arrays.
/// For more information on these types please see
/// [the physical memory layout of Apache Arrow]
///
/// [`Schema.fbs`]: https://github.com/apache/arrow/blob/main/format/Schema.fbs
/// [the physical memory layout of Apache Arrow]: https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum DataType {
    /// Null type
    Null,
    /// A boolean datatype representing the values `true` and `false`.
    Boolean,
    /// A signed 8-bit integer.
    Int8,
    /// A signed 16-bit integer.
    Int16,
    /// A signed 32-bit integer.
    Int32,
    /// A signed 64-bit integer.
    Int64,
    /// An unsigned 8-bit integer.
    UInt8,
    /// An unsigned 16-bit integer.
    UInt16,
    /// An unsigned 32-bit integer.
    UInt32,
    /// An unsigned 64-bit integer.
    UInt64,
    /// A 16-bit floating point number.
    Float16,
    /// A 32-bit floating point number.
    Float32,
    /// A 64-bit floating point number.
    Float64,
    /// A timestamp with an optional timezone.
    ///
    /// Time is measured as a Unix epoch, counting the seconds from
    /// 00:00:00.000 on 1 January 1970, excluding leap seconds,
    /// as a signed 64-bit integer.
    ///
    /// The time zone is a string indicating the name of a time zone, one of:
    ///
    /// * As used in the Olson time zone database (the "tz database" or
    ///   "tzdata"), such as "America/New_York"
    /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
    ///
    /// Timestamps with a non-empty timezone
    /// ------------------------------------
    ///
    /// If a Timestamp column has a non-empty timezone value, its epoch is
    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
    /// (the Unix epoch), regardless of the Timestamp's own timezone.
    ///
    /// Therefore, timestamp values with a non-empty timezone correspond to
    /// physical points in time together with some additional information about
    /// how the data was obtained and/or how to display it (the timezone).
    ///
    ///   For example, the timestamp value 0 with the timezone string "Europe/Paris"
    ///   corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
    ///   application may prefer to display it as "January 1st 1970, 01h00" in
    ///   the Europe/Paris timezone (which is the same physical point in time).
    ///
    /// One consequence is that timestamp values with a non-empty timezone
    /// can be compared and ordered directly, since they all share the same
    /// well-known point of reference (the Unix epoch).
    ///
    /// Timestamps with an unset / empty timezone
    /// -----------------------------------------
    ///
    /// If a Timestamp column has no timezone value, its epoch is
    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
    ///
    /// Therefore, timestamp values without a timezone cannot be meaningfully
    /// interpreted as physical points in time, but only as calendar / clock
    /// indications ("wall clock time") in an unspecified timezone.
    ///
    ///   For example, the timestamp value 0 with an empty timezone string
    ///   corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
    ///   is not enough information to interpret it as a well-defined physical
    ///   point in time.
    ///
    /// One consequence is that timestamp values without a timezone cannot
    /// be reliably compared or ordered, since they may have different points of
    /// reference.  In particular, it is *not* possible to interpret an unset
    /// or empty timezone as the same as "UTC".
    ///
    /// Conversion between timezones
    /// ----------------------------
    ///
    /// If a Timestamp column has a non-empty timezone, changing the timezone
    /// to a different non-empty value is a metadata-only operation:
    /// the timestamp values need not change as their point of reference remains
    /// the same (the Unix epoch).
    ///
    /// However, if a Timestamp column has no timezone value, changing it to a
    /// non-empty value requires thinking about the desired semantics.
    /// One possibility is to assume that the original timestamp values are
    /// relative to the epoch of the timezone being set; timestamp values should
    /// then be adjusted to the Unix epoch (for example, changing the timezone from
    /// empty to "Europe/Paris" would require converting the timestamp values
    /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
    /// nevertheless correct).
    ///
    /// ```
    /// # use arrow_schema::{DataType, TimeUnit};
    /// DataType::Timestamp(TimeUnit::Second, None);
    /// DataType::Timestamp(TimeUnit::Second, Some("literal".into()));
    /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into()));
    /// ```
    ///
    /// Timezone representation
    /// -----------------------
    /// It is possible to use either the timezone string representation, such as "UTC", or the absolute time zone offset "+00:00".
    /// For timezones with fixed offsets, such as "UTC" or "JST", the offset representation is recommended, as it is more explicit and less ambiguous.
    ///
    /// Most arrow-rs functionalities use the absolute offset representation,
    /// such as [`PrimitiveArray::with_timezone_utc`] that applies a
    /// UTC timezone to timestamp arrays.
    ///
    /// [`PrimitiveArray::with_timezone_utc`]: https://docs.rs/arrow/latest/arrow/array/struct.PrimitiveArray.html#method.with_timezone_utc
    ///
    /// Timezone string parsing
    /// -----------------------
    /// When feature `chrono-tz` is not enabled, allowed timezone strings are fixed offsets of the form "+09:00", "-09" or "+0930".
    ///
    /// When feature `chrono-tz` is enabled, additional strings supported by [chrono_tz](https://docs.rs/chrono-tz/latest/chrono_tz/)
    /// are also allowed, which include [IANA database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
    /// timezones.
    Timestamp(TimeUnit, Option<Arc<str>>),
    /// A signed 32-bit date representing the elapsed time since UNIX epoch (1970-01-01)
    /// in days.
    Date32,
    /// A signed 64-bit date representing the elapsed time since UNIX epoch (1970-01-01)
    /// in milliseconds.
    ///
    /// # Valid Ranges
    ///
    /// According to the Arrow specification ([Schema.fbs]), values of Date64
    /// are treated as the number of *days*, in milliseconds, since the UNIX
    /// epoch. Therefore, values of this type must be evenly divisible by
    /// `86_400_000`, the number of milliseconds in a standard day.
    ///
    /// It is not valid to store milliseconds that do not represent an exact
    /// day. The reason for this restriction is compatibility with other
    /// languages' native libraries (specifically Java), which historically
    /// lacked a dedicated date type and only supported timestamps.
    ///
    /// # Validation
    ///
    /// This library does not validate or enforce that Date64 values are evenly
    /// divisible by `86_400_000` for performance and usability reasons. Date64
    /// values are treated similarly to `Timestamp(TimeUnit::Millisecond,
    /// None)`: values will be displayed with a time of day if the value does
    /// not represent an exact day, and arithmetic will be done at the
    /// millisecond granularity.
    ///
    /// # Recommendation
    ///
    /// Users should prefer [`Date32`] to cleanly represent the number
    /// of days, or one of the Timestamp variants to include time as part of the
    /// representation, depending on their use case.
    ///
    /// # Further Reading
    ///
    /// For more details, see [#5288](https://github.com/apache/arrow-rs/issues/5288).
    ///
    /// [`Date32`]: Self::Date32
    /// [Schema.fbs]: https://github.com/apache/arrow/blob/main/format/Schema.fbs
    Date64,
    /// A signed 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
    /// Must be either seconds or milliseconds.
    Time32(TimeUnit),
    /// A signed 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
    /// Must be either microseconds or nanoseconds.
    Time64(TimeUnit),
    /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds.
    Duration(TimeUnit),
    /// A "calendar" interval which models types that don't necessarily
    /// have a precise duration without the context of a base timestamp (e.g.
    /// days can differ in length during daylight saving time transitions).
    Interval(IntervalUnit),
    /// Opaque binary data of variable length.
    ///
    /// A single Binary array can store up to [`i32::MAX`] bytes
    /// of binary data in total.
    Binary,
    /// Opaque binary data of fixed size.
    /// Enum parameter specifies the number of bytes per value.
    FixedSizeBinary(i32),
    /// Opaque binary data of variable length and 64-bit offsets.
    ///
    /// A single LargeBinary array can store up to [`i64::MAX`] bytes
    /// of binary data in total.
    LargeBinary,
    /// Opaque binary data of variable length.
    ///
    /// Logically the same as [`Binary`], but the internal representation uses a view
    /// struct that contains the string length and either the string's entire data
    /// inline (for small strings) or an inlined prefix, an index of another buffer,
    /// and an offset pointing to a slice in that buffer (for non-small strings).
    ///
    /// [`Binary`]: Self::Binary
    BinaryView,
    /// A variable-length string in Unicode with UTF-8 encoding.
    ///
    /// A single Utf8 array can store up to [`i32::MAX`] bytes
    /// of string data in total.
    Utf8,
    /// A variable-length string in Unicode with UTF-8 encoding and 64-bit offsets.
    ///
    /// A single LargeUtf8 array can store up to [`i64::MAX`] bytes
    /// of string data in total.
    LargeUtf8,
    /// A variable-length string in Unicode with UTF-8 encoding.
    ///
    /// Logically the same as [`Utf8`], but the internal representation uses a view
    /// struct that contains the string length and either the string's entire data
    /// inline (for small strings) or an inlined prefix, an index of another buffer,
    /// and an offset pointing to a slice in that buffer (for non-small strings).
    ///
    /// [`Utf8`]: Self::Utf8
    Utf8View,
    /// A list of some logical data type with variable length.
    ///
    /// A single List array can store up to [`i32::MAX`] elements in total.
    List(FieldRef),

    /// (NOT YET FULLY SUPPORTED)  A list of some logical data type with variable length.
    ///
    /// Logically the same as [`List`], but the internal representation differs in how child
    /// data is referenced, allowing flexibility in how data is laid out.
    ///
    /// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s.
    ///
    /// [`List`]: Self::List
    ListView(FieldRef),
    /// A list of some logical data type with fixed length.
    FixedSizeList(FieldRef, i32),
    /// A list of some logical data type with variable length and 64-bit offsets.
    ///
    /// A single LargeList array can store up to [`i64::MAX`] elements in total.
    LargeList(FieldRef),

    /// (NOT YET FULLY SUPPORTED)  A list of some logical data type with variable length and 64-bit offsets.
    ///
    /// Logically the same as [`LargeList`], but the internal representation differs in how child
    /// data is referenced, allowing flexibility in how data is laid out.
    ///
    /// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s.
    ///
    /// [`LargeList`]: Self::LargeList
    LargeListView(FieldRef),
    /// A nested datatype that contains a number of sub-fields.
    Struct(Fields),
    /// A nested datatype that can represent slots of differing types. Components:
    ///
    /// 1. [`UnionFields`]
    /// 2. The type of union (Sparse or Dense)
    Union(UnionFields, UnionMode),
    /// A dictionary encoded array (`key_type`, `value_type`), where
    /// each array element is an index of `key_type` into an
    /// associated dictionary of `value_type`.
    ///
    /// Dictionary arrays are used to store columns of `value_type`
    /// that contain many repeated values using less memory, but with
    /// a higher CPU overhead for some operations.
    ///
    /// This type is mostly used to represent low cardinality string
    /// arrays or a limited set of primitive types as integers.
    Dictionary(Box<DataType>, Box<DataType>),
    /// Exact 128-bit width decimal value with precision and scale
    ///
    /// * precision is the total number of digits
    /// * scale is the number of digits past the decimal
    ///
    /// For example the number 123.45 has precision 5 and scale 2.
    ///
    /// In certain situations, scale could be a negative number. For
    /// negative scale, it is the number of padding zeros to the right
    /// of the digits.
    ///
    /// For example the number 12300 could be treated as a decimal
    /// with precision 3 and scale -2.
    Decimal128(u8, i8),
    /// Exact 256-bit width decimal value with precision and scale
    ///
    /// * precision is the total number of digits
    /// * scale is the number of digits past the decimal
    ///
    /// For example the number 123.45 has precision 5 and scale 2.
    ///
    /// In certain situations, scale could be a negative number. For
    /// negative scale, it is the number of padding zeros to the right
    /// of the digits.
    ///
    /// For example the number 12300 could be treated as a decimal
    /// with precision 3 and scale -2.
    Decimal256(u8, i8),
    /// A Map is a logical nested type that is represented as
    ///
    /// `List<entries: Struct<key: K, value: V>>`
    ///
    /// The keys and values are each respectively contiguous.
    /// The key and value types are not constrained, but keys should be
    /// hashable and unique.
    /// Whether the keys are sorted can be set in the `bool` after the `Field`.
    ///
    /// In a field with Map type, the field has a child Struct field, which then
    /// has two children: the first the key type and the second the value type.
    /// The names of the child fields may be respectively "entries", "key", and
    /// "value", but this is not enforced.
    Map(FieldRef, bool),
    /// A run-end encoding (REE) is a variation of run-length encoding (RLE). These
    /// encodings are well-suited for representing data containing sequences of the
    /// same value, called runs. Each run is represented as a value and an integer giving
    /// the index in the array where the run ends.
    ///
    /// A run-end encoded array has no buffers by itself, but has two child arrays. The
    /// first child array, called the run ends array, holds either 16, 32, or 64-bit
    /// signed integers. The actual values of each run are held in the second child array.
    ///
    /// These child arrays are prescribed the standard names of "run_ends" and "values"
    /// respectively.
    RunEndEncoded(FieldRef, FieldRef),
}
415
/// An absolute length of time in seconds, milliseconds, microseconds or nanoseconds.
///
/// Used as the resolution parameter of [`DataType::Timestamp`],
/// [`DataType::Time32`], [`DataType::Time64`] and [`DataType::Duration`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum TimeUnit {
    /// Time in seconds.
    Second,
    /// Time in milliseconds.
    Millisecond,
    /// Time in microseconds.
    Microsecond,
    /// Time in nanoseconds.
    Nanosecond,
}
429
/// YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO interval in SQL style.
///
/// Selects the representation used by [`DataType::Interval`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum IntervalUnit {
    /// Indicates the number of elapsed whole months, stored as 4-byte integers.
    YearMonth,
    /// Indicates the number of elapsed days and milliseconds,
    /// stored as 2 contiguous 32-bit integers (days, milliseconds) (8 bytes in total).
    DayTime,
    /// A triple of the number of elapsed months, days, and nanoseconds.
    /// The values are stored contiguously in 16-byte blocks. Months and
    /// days are encoded as 32-bit integers and nanoseconds is encoded as a
    /// 64-bit integer. All integers are signed. Each field is independent
    /// (e.g. there is no constraint that nanoseconds have the same sign
    /// as days or that the quantity of nanoseconds represents less
    /// than a day's worth of time).
    MonthDayNano,
}
448
/// Sparse or Dense union layouts
///
/// Second component of [`DataType::Union`].
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum UnionMode {
    /// Sparse union layout
    Sparse,
    /// Dense union layout
    Dense,
}
458
459impl fmt::Display for DataType {
460    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
461        match &self {
462            DataType::Struct(fields) => {
463                write!(f, "Struct(")?;
464                if !fields.is_empty() {
465                    let fields_str = fields
466                        .iter()
467                        .map(|f| format!("{} {}", f.name(), f.data_type()))
468                        .collect::<Vec<_>>()
469                        .join(", ");
470                    write!(f, "{}", fields_str)?;
471                }
472                write!(f, ")")?;
473                Ok(())
474            }
475            _ => write!(f, "{self:?}"),
476        }
477    }
478}
479
/// Parses `str` into a `DataType`.
///
/// This is the reverse of [`DataType`]'s `Display`
/// impl, and maintains the invariant that
/// `data_type.to_string().parse::<DataType>().unwrap() == data_type`
///
/// Parsing is delegated to `crate::datatype_parse::parse_data_type`, and
/// its result (including any [`ArrowError`]) is returned unchanged.
///
/// # Example
/// ```
/// use arrow_schema::DataType;
///
/// let data_type: DataType = "Int32".parse().unwrap();
/// assert_eq!(data_type, DataType::Int32);
/// ```
impl FromStr for DataType {
    type Err = ArrowError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        crate::datatype_parse::parse_data_type(s)
    }
}
500
501impl TryFrom<&str> for DataType {
502    type Error = ArrowError;
503
504    fn try_from(value: &str) -> Result<Self, Self::Error> {
505        value.parse()
506    }
507}
508
509impl DataType {
510    /// Returns true if the type is primitive: (numeric, temporal).
511    #[inline]
512    pub fn is_primitive(&self) -> bool {
513        self.is_numeric() || self.is_temporal()
514    }
515
516    /// Returns true if this type is numeric: (UInt*, Int*, Float*, Decimal*).
517    #[inline]
518    pub fn is_numeric(&self) -> bool {
519        use DataType::*;
520        matches!(
521            self,
522            UInt8
523                | UInt16
524                | UInt32
525                | UInt64
526                | Int8
527                | Int16
528                | Int32
529                | Int64
530                | Float16
531                | Float32
532                | Float64
533                | Decimal128(_, _)
534                | Decimal256(_, _)
535        )
536    }
537
538    /// Returns true if this type is temporal: (Date*, Time*, Duration, or Interval).
539    #[inline]
540    pub fn is_temporal(&self) -> bool {
541        use DataType::*;
542        matches!(
543            self,
544            Date32 | Date64 | Timestamp(_, _) | Time32(_) | Time64(_) | Duration(_) | Interval(_)
545        )
546    }
547
548    /// Returns true if this type is floating: (Float*).
549    #[inline]
550    pub fn is_floating(&self) -> bool {
551        use DataType::*;
552        matches!(self, Float16 | Float32 | Float64)
553    }
554
555    /// Returns true if this type is integer: (Int*, UInt*).
556    #[inline]
557    pub fn is_integer(&self) -> bool {
558        self.is_signed_integer() || self.is_unsigned_integer()
559    }
560
561    /// Returns true if this type is signed integer: (Int*).
562    #[inline]
563    pub fn is_signed_integer(&self) -> bool {
564        use DataType::*;
565        matches!(self, Int8 | Int16 | Int32 | Int64)
566    }
567
568    /// Returns true if this type is unsigned integer: (UInt*).
569    #[inline]
570    pub fn is_unsigned_integer(&self) -> bool {
571        use DataType::*;
572        matches!(self, UInt8 | UInt16 | UInt32 | UInt64)
573    }
574
575    /// Returns true if this type is valid as a dictionary key
576    #[inline]
577    pub fn is_dictionary_key_type(&self) -> bool {
578        self.is_integer()
579    }
580
581    /// Returns true if this type is valid for run-ends array in RunArray
582    #[inline]
583    pub fn is_run_ends_type(&self) -> bool {
584        use DataType::*;
585        matches!(self, Int16 | Int32 | Int64)
586    }
587
588    /// Returns true if this type is nested (List, FixedSizeList, LargeList, ListView. LargeListView, Struct, Union,
589    /// or Map), or a dictionary of a nested type
590    #[inline]
591    pub fn is_nested(&self) -> bool {
592        use DataType::*;
593        match self {
594            Dictionary(_, v) => DataType::is_nested(v.as_ref()),
595            RunEndEncoded(_, v) => DataType::is_nested(v.data_type()),
596            List(_)
597            | FixedSizeList(_, _)
598            | LargeList(_)
599            | ListView(_)
600            | LargeListView(_)
601            | Struct(_)
602            | Union(_, _)
603            | Map(_, _) => true,
604            _ => false,
605        }
606    }
607
608    /// Returns true if this type is DataType::Null.
609    #[inline]
610    pub fn is_null(&self) -> bool {
611        use DataType::*;
612        matches!(self, Null)
613    }
614
615    /// Compares the datatype with another, ignoring nested field names
616    /// and metadata.
617    pub fn equals_datatype(&self, other: &DataType) -> bool {
618        match (&self, other) {
619            (DataType::List(a), DataType::List(b))
620            | (DataType::LargeList(a), DataType::LargeList(b))
621            | (DataType::ListView(a), DataType::ListView(b))
622            | (DataType::LargeListView(a), DataType::LargeListView(b)) => {
623                a.is_nullable() == b.is_nullable() && a.data_type().equals_datatype(b.data_type())
624            }
625            (DataType::FixedSizeList(a, a_size), DataType::FixedSizeList(b, b_size)) => {
626                a_size == b_size
627                    && a.is_nullable() == b.is_nullable()
628                    && a.data_type().equals_datatype(b.data_type())
629            }
630            (DataType::Struct(a), DataType::Struct(b)) => {
631                a.len() == b.len()
632                    && a.iter().zip(b).all(|(a, b)| {
633                        a.is_nullable() == b.is_nullable()
634                            && a.data_type().equals_datatype(b.data_type())
635                    })
636            }
637            (DataType::Map(a_field, a_is_sorted), DataType::Map(b_field, b_is_sorted)) => {
638                a_field.is_nullable() == b_field.is_nullable()
639                    && a_field.data_type().equals_datatype(b_field.data_type())
640                    && a_is_sorted == b_is_sorted
641            }
642            (DataType::Dictionary(a_key, a_value), DataType::Dictionary(b_key, b_value)) => {
643                a_key.equals_datatype(b_key) && a_value.equals_datatype(b_value)
644            }
645            (
646                DataType::RunEndEncoded(a_run_ends, a_values),
647                DataType::RunEndEncoded(b_run_ends, b_values),
648            ) => {
649                a_run_ends.is_nullable() == b_run_ends.is_nullable()
650                    && a_run_ends
651                        .data_type()
652                        .equals_datatype(b_run_ends.data_type())
653                    && a_values.is_nullable() == b_values.is_nullable()
654                    && a_values.data_type().equals_datatype(b_values.data_type())
655            }
656            (
657                DataType::Union(a_union_fields, a_union_mode),
658                DataType::Union(b_union_fields, b_union_mode),
659            ) => {
660                a_union_mode == b_union_mode
661                    && a_union_fields.len() == b_union_fields.len()
662                    && a_union_fields.iter().all(|a| {
663                        b_union_fields.iter().any(|b| {
664                            a.0 == b.0
665                                && a.1.is_nullable() == b.1.is_nullable()
666                                && a.1.data_type().equals_datatype(b.1.data_type())
667                        })
668                    })
669            }
670            _ => self == other,
671        }
672    }
673
674    /// Returns the byte width of this type if it is a primitive type
675    ///
676    /// Returns `None` if not a primitive type
677    #[inline]
678    pub fn primitive_width(&self) -> Option<usize> {
679        match self {
680            DataType::Null => None,
681            DataType::Boolean => None,
682            DataType::Int8 | DataType::UInt8 => Some(1),
683            DataType::Int16 | DataType::UInt16 | DataType::Float16 => Some(2),
684            DataType::Int32 | DataType::UInt32 | DataType::Float32 => Some(4),
685            DataType::Int64 | DataType::UInt64 | DataType::Float64 => Some(8),
686            DataType::Timestamp(_, _) => Some(8),
687            DataType::Date32 | DataType::Time32(_) => Some(4),
688            DataType::Date64 | DataType::Time64(_) => Some(8),
689            DataType::Duration(_) => Some(8),
690            DataType::Interval(IntervalUnit::YearMonth) => Some(4),
691            DataType::Interval(IntervalUnit::DayTime) => Some(8),
692            DataType::Interval(IntervalUnit::MonthDayNano) => Some(16),
693            DataType::Decimal128(_, _) => Some(16),
694            DataType::Decimal256(_, _) => Some(32),
695            DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => None,
696            DataType::Binary | DataType::LargeBinary | DataType::BinaryView => None,
697            DataType::FixedSizeBinary(_) => None,
698            DataType::List(_)
699            | DataType::ListView(_)
700            | DataType::LargeList(_)
701            | DataType::LargeListView(_)
702            | DataType::Map(_, _) => None,
703            DataType::FixedSizeList(_, _) => None,
704            DataType::Struct(_) => None,
705            DataType::Union(_, _) => None,
706            DataType::Dictionary(_, _) => None,
707            DataType::RunEndEncoded(_, _) => None,
708        }
709    }
710
711    /// Return size of this instance in bytes.
712    ///
713    /// Includes the size of `Self`.
714    pub fn size(&self) -> usize {
715        std::mem::size_of_val(self)
716            + match self {
717                DataType::Null
718                | DataType::Boolean
719                | DataType::Int8
720                | DataType::Int16
721                | DataType::Int32
722                | DataType::Int64
723                | DataType::UInt8
724                | DataType::UInt16
725                | DataType::UInt32
726                | DataType::UInt64
727                | DataType::Float16
728                | DataType::Float32
729                | DataType::Float64
730                | DataType::Date32
731                | DataType::Date64
732                | DataType::Time32(_)
733                | DataType::Time64(_)
734                | DataType::Duration(_)
735                | DataType::Interval(_)
736                | DataType::Binary
737                | DataType::FixedSizeBinary(_)
738                | DataType::LargeBinary
739                | DataType::BinaryView
740                | DataType::Utf8
741                | DataType::LargeUtf8
742                | DataType::Utf8View
743                | DataType::Decimal128(_, _)
744                | DataType::Decimal256(_, _) => 0,
745                DataType::Timestamp(_, s) => s.as_ref().map(|s| s.len()).unwrap_or_default(),
746                DataType::List(field)
747                | DataType::ListView(field)
748                | DataType::FixedSizeList(field, _)
749                | DataType::LargeList(field)
750                | DataType::LargeListView(field)
751                | DataType::Map(field, _) => field.size(),
752                DataType::Struct(fields) => fields.size(),
753                DataType::Union(fields, _) => fields.size(),
754                DataType::Dictionary(dt1, dt2) => dt1.size() + dt2.size(),
755                DataType::RunEndEncoded(run_ends, values) => {
756                    run_ends.size() - std::mem::size_of_val(run_ends) + values.size()
757                        - std::mem::size_of_val(values)
758                }
759            }
760    }
761
762    /// Check to see if `self` is a superset of `other`
763    ///
764    /// If DataType is a nested type, then it will check to see if the nested type is a superset of the other nested type
765    /// else it will check to see if the DataType is equal to the other DataType
766    pub fn contains(&self, other: &DataType) -> bool {
767        match (self, other) {
768            (DataType::List(f1), DataType::List(f2))
769            | (DataType::LargeList(f1), DataType::LargeList(f2))
770            | (DataType::ListView(f1), DataType::ListView(f2))
771            | (DataType::LargeListView(f1), DataType::LargeListView(f2)) => f1.contains(f2),
772            (DataType::FixedSizeList(f1, s1), DataType::FixedSizeList(f2, s2)) => {
773                s1 == s2 && f1.contains(f2)
774            }
775            (DataType::Map(f1, s1), DataType::Map(f2, s2)) => s1 == s2 && f1.contains(f2),
776            (DataType::Struct(f1), DataType::Struct(f2)) => f1.contains(f2),
777            (DataType::Union(f1, s1), DataType::Union(f2, s2)) => {
778                s1 == s2
779                    && f1
780                        .iter()
781                        .all(|f1| f2.iter().any(|f2| f1.0 == f2.0 && f1.1.contains(f2.1)))
782            }
783            (DataType::Dictionary(k1, v1), DataType::Dictionary(k2, v2)) => {
784                k1.contains(k2) && v1.contains(v2)
785            }
786            _ => self == other,
787        }
788    }
789
790    /// Create a [`DataType::List`] with elements of the specified type
791    /// and nullability, and conventionally named inner [`Field`] (`"item"`).
792    ///
793    /// To specify field level metadata, construct the inner [`Field`]
794    /// directly via [`Field::new`] or [`Field::new_list_field`].
795    pub fn new_list(data_type: DataType, nullable: bool) -> Self {
796        DataType::List(Arc::new(Field::new_list_field(data_type, nullable)))
797    }
798
799    /// Create a [`DataType::LargeList`] with elements of the specified type
800    /// and nullability, and conventionally named inner [`Field`] (`"item"`).
801    ///
802    /// To specify field level metadata, construct the inner [`Field`]
803    /// directly via [`Field::new`] or [`Field::new_list_field`].
804    pub fn new_large_list(data_type: DataType, nullable: bool) -> Self {
805        DataType::LargeList(Arc::new(Field::new_list_field(data_type, nullable)))
806    }
807
808    /// Create a [`DataType::FixedSizeList`] with elements of the specified type, size
809    /// and nullability, and conventionally named inner [`Field`] (`"item"`).
810    ///
811    /// To specify field level metadata, construct the inner [`Field`]
812    /// directly via [`Field::new`] or [`Field::new_list_field`].
813    pub fn new_fixed_size_list(data_type: DataType, size: i32, nullable: bool) -> Self {
814        DataType::FixedSizeList(Arc::new(Field::new_list_field(data_type, nullable)), size)
815    }
816}
817
/// The maximum precision for [`DataType::Decimal128`] values.
pub const DECIMAL128_MAX_PRECISION: u8 = 38;

/// The maximum scale for [`DataType::Decimal128`] values.
pub const DECIMAL128_MAX_SCALE: i8 = 38;

/// The maximum precision for [`DataType::Decimal256`] values.
pub const DECIMAL256_MAX_PRECISION: u8 = 76;

/// The maximum scale for [`DataType::Decimal256`] values.
pub const DECIMAL256_MAX_SCALE: i8 = 76;

/// The default scale for [`DataType::Decimal128`] and [`DataType::Decimal256`]
/// values.
pub const DECIMAL_DEFAULT_SCALE: i8 = 10;
833
834#[cfg(test)]
835mod tests {
836    use super::*;
837
838    #[test]
839    #[cfg(feature = "serde")]
840    fn serde_struct_type() {
841        use std::collections::HashMap;
842
843        let kv_array = [("k".to_string(), "v".to_string())];
844        let field_metadata: HashMap<String, String> = kv_array.iter().cloned().collect();
845
846        // Non-empty map: should be converted as JSON obj { ... }
847        let first_name =
848            Field::new("first_name", DataType::Utf8, false).with_metadata(field_metadata);
849
850        // Empty map: should be omitted.
851        let last_name =
852            Field::new("last_name", DataType::Utf8, false).with_metadata(HashMap::default());
853
854        let person = DataType::Struct(Fields::from(vec![
855            first_name,
856            last_name,
857            Field::new(
858                "address",
859                DataType::Struct(Fields::from(vec![
860                    Field::new("street", DataType::Utf8, false),
861                    Field::new("zip", DataType::UInt16, false),
862                ])),
863                false,
864            ),
865        ]));
866
867        let serialized = serde_json::to_string(&person).unwrap();
868
869        // NOTE that this is testing the default (derived) serialization format, not the
870        // JSON format specified in metadata.md
871
872        assert_eq!(
873            "{\"Struct\":[\
874             {\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{\"k\":\"v\"}},\
875             {\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}},\
876             {\"name\":\"address\",\"data_type\":{\"Struct\":\
877             [{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}},\
878             {\"name\":\"zip\",\"data_type\":\"UInt16\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}}\
879             ]},\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}}]}",
880            serialized
881        );
882
883        let deserialized = serde_json::from_str(&serialized).unwrap();
884
885        assert_eq!(person, deserialized);
886    }
887
    #[test]
    fn test_list_datatype_equality() {
        // Checks that `equals_datatype` ignores the names of nested fields but
        // still distinguishes nullability and child data types.
        //
        // Plain lists: `list_b` differs from `list_a` only in the child field
        // name, `list_c` in child nullability, `list_d` in the element type.
        let list_a = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true)));
        let list_b = DataType::List(Arc::new(Field::new("array", DataType::Int32, true)));
        let list_c = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false)));
        let list_d = DataType::List(Arc::new(Field::new_list_field(DataType::UInt32, true)));
        assert!(list_a.equals_datatype(&list_b));
        assert!(!list_a.equals_datatype(&list_c));
        assert!(!list_b.equals_datatype(&list_c));
        assert!(!list_a.equals_datatype(&list_d));

        // Fixed-size lists: `list_e` and `list_f` differ only in field names
        // (outer and nested); `list_g` has a different child type and nullability.
        let list_e =
            DataType::FixedSizeList(Arc::new(Field::new_list_field(list_a.clone(), false)), 3);
        let list_f =
            DataType::FixedSizeList(Arc::new(Field::new("array", list_b.clone(), false)), 3);
        let list_g = DataType::FixedSizeList(
            Arc::new(Field::new_list_field(DataType::FixedSizeBinary(3), true)),
            3,
        );
        assert!(list_e.equals_datatype(&list_f));
        assert!(!list_e.equals_datatype(&list_g));
        assert!(!list_f.equals_datatype(&list_g));

        // Structs: `list_h`/`list_i` differ only in nested names, `list_j`
        // flips the nullability of `f1`. `list_k`/`list_m` have different
        // field names but identical types; `list_l` swaps Utf8 for LargeUtf8.
        let list_h = DataType::Struct(Fields::from(vec![Field::new("f1", list_e, true)]));
        let list_i = DataType::Struct(Fields::from(vec![Field::new("f1", list_f.clone(), true)]));
        let list_j = DataType::Struct(Fields::from(vec![Field::new("f1", list_f.clone(), false)]));
        let list_k = DataType::Struct(Fields::from(vec![
            Field::new("f1", list_f.clone(), false),
            Field::new("f2", list_g.clone(), false),
            Field::new("f3", DataType::Utf8, true),
        ]));
        let list_l = DataType::Struct(Fields::from(vec![
            Field::new("ff1", list_f.clone(), false),
            Field::new("ff2", list_g.clone(), false),
            Field::new("ff3", DataType::LargeUtf8, true),
        ]));
        let list_m = DataType::Struct(Fields::from(vec![
            Field::new("ff1", list_f, false),
            Field::new("ff2", list_g, false),
            Field::new("ff3", DataType::Utf8, true),
        ]));
        assert!(list_h.equals_datatype(&list_i));
        assert!(!list_h.equals_datatype(&list_j));
        assert!(!list_k.equals_datatype(&list_l));
        assert!(list_k.equals_datatype(&list_m));

        // Maps: `list_o` differs from `list_n` only in field names, `list_p`
        // in the boolean flag of the variant, `list_q` in the nullability of
        // the nested list's child, `list_r` in the nullability of the field.
        let list_n = DataType::Map(Arc::new(Field::new("f1", list_a.clone(), true)), true);
        let list_o = DataType::Map(Arc::new(Field::new("f2", list_b.clone(), true)), true);
        let list_p = DataType::Map(Arc::new(Field::new("f2", list_b.clone(), true)), false);
        let list_q = DataType::Map(Arc::new(Field::new("f2", list_c.clone(), true)), true);
        let list_r = DataType::Map(Arc::new(Field::new("f1", list_a.clone(), false)), true);

        assert!(list_n.equals_datatype(&list_o));
        assert!(!list_n.equals_datatype(&list_p));
        assert!(!list_n.equals_datatype(&list_q));
        assert!(!list_n.equals_datatype(&list_r));

        // Dictionaries: `list_t` differs from `list_s` only in the nested
        // field name, `list_u` in the key type, `list_v` in the nullability of
        // the value list's child. The remaining list fixtures are moved here.
        let list_s = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_a));
        let list_t = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_b.clone()));
        let list_u = DataType::Dictionary(Box::new(DataType::Int8), Box::new(list_b));
        let list_v = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_c));

        assert!(list_s.equals_datatype(&list_t));
        assert!(!list_s.equals_datatype(&list_u));
        assert!(!list_s.equals_datatype(&list_v));

        // Unions. NOTE(review): the comments below assume `UnionFields::new`
        // pairs each type id with the field at the same position - confirm.
        // Baseline: id 1 -> Utf8, id 2 -> UInt8.
        let union_a = DataType::Union(
            UnionFields::new(
                vec![1, 2],
                vec![
                    Field::new("f1", DataType::Utf8, false),
                    Field::new("f2", DataType::UInt8, false),
                ],
            ),
            UnionMode::Sparse,
        );
        // Same as `union_a` except for the field names.
        let union_b = DataType::Union(
            UnionFields::new(
                vec![1, 2],
                vec![
                    Field::new("ff1", DataType::Utf8, false),
                    Field::new("ff2", DataType::UInt8, false),
                ],
            ),
            UnionMode::Sparse,
        );
        // Same id/type pairs as `union_a`, listed in the opposite order.
        let union_c = DataType::Union(
            UnionFields::new(
                vec![2, 1],
                vec![
                    Field::new("fff2", DataType::UInt8, false),
                    Field::new("fff1", DataType::Utf8, false),
                ],
            ),
            UnionMode::Sparse,
        );
        // Different data types behind the ids (Int8 / UInt8 instead of Utf8 / UInt8).
        let union_d = DataType::Union(
            UnionFields::new(
                vec![2, 1],
                vec![
                    Field::new("fff1", DataType::Int8, false),
                    Field::new("fff2", DataType::UInt8, false),
                ],
            ),
            UnionMode::Sparse,
        );
        // Same as `union_a` except that `f1` is nullable.
        let union_e = DataType::Union(
            UnionFields::new(
                vec![1, 2],
                vec![
                    Field::new("f1", DataType::Utf8, true),
                    Field::new("f2", DataType::UInt8, false),
                ],
            ),
            UnionMode::Sparse,
        );

        assert!(union_a.equals_datatype(&union_b));
        assert!(union_a.equals_datatype(&union_c));
        assert!(!union_a.equals_datatype(&union_d));
        assert!(!union_a.equals_datatype(&union_e));

        // Run-end encoded: `list_x` differs from `list_w` only in field names,
        // `list_y` in the run-ends data type, `list_z` in run-ends nullability.
        let list_w = DataType::RunEndEncoded(
            Arc::new(Field::new("f1", DataType::Int64, true)),
            Arc::new(Field::new("f2", DataType::Utf8, true)),
        );
        let list_x = DataType::RunEndEncoded(
            Arc::new(Field::new("ff1", DataType::Int64, true)),
            Arc::new(Field::new("ff2", DataType::Utf8, true)),
        );
        let list_y = DataType::RunEndEncoded(
            Arc::new(Field::new("ff1", DataType::UInt16, true)),
            Arc::new(Field::new("ff2", DataType::Utf8, true)),
        );
        let list_z = DataType::RunEndEncoded(
            Arc::new(Field::new("f1", DataType::Int64, false)),
            Arc::new(Field::new("f2", DataType::Utf8, true)),
        );

        assert!(list_w.equals_datatype(&list_x));
        assert!(!list_w.equals_datatype(&list_y));
        assert!(!list_w.equals_datatype(&list_z));
    }
1032
1033    #[test]
1034    fn create_struct_type() {
1035        let _person = DataType::Struct(Fields::from(vec![
1036            Field::new("first_name", DataType::Utf8, false),
1037            Field::new("last_name", DataType::Utf8, false),
1038            Field::new(
1039                "address",
1040                DataType::Struct(Fields::from(vec![
1041                    Field::new("street", DataType::Utf8, false),
1042                    Field::new("zip", DataType::UInt16, false),
1043                ])),
1044                false,
1045            ),
1046        ]));
1047    }
1048
1049    #[test]
1050    fn test_nested() {
1051        let list = DataType::List(Arc::new(Field::new("foo", DataType::Utf8, true)));
1052        let list_view = DataType::ListView(Arc::new(Field::new("foo", DataType::Utf8, true)));
1053        let large_list_view =
1054            DataType::LargeListView(Arc::new(Field::new("foo", DataType::Utf8, true)));
1055
1056        assert!(!DataType::is_nested(&DataType::Boolean));
1057        assert!(!DataType::is_nested(&DataType::Int32));
1058        assert!(!DataType::is_nested(&DataType::Utf8));
1059        assert!(DataType::is_nested(&list));
1060        assert!(DataType::is_nested(&list_view));
1061        assert!(DataType::is_nested(&large_list_view));
1062
1063        assert!(!DataType::is_nested(&DataType::Dictionary(
1064            Box::new(DataType::Int32),
1065            Box::new(DataType::Boolean)
1066        )));
1067        assert!(!DataType::is_nested(&DataType::Dictionary(
1068            Box::new(DataType::Int32),
1069            Box::new(DataType::Int64)
1070        )));
1071        assert!(!DataType::is_nested(&DataType::Dictionary(
1072            Box::new(DataType::Int32),
1073            Box::new(DataType::LargeUtf8)
1074        )));
1075        assert!(DataType::is_nested(&DataType::Dictionary(
1076            Box::new(DataType::Int32),
1077            Box::new(list)
1078        )));
1079    }
1080
1081    #[test]
1082    fn test_integer() {
1083        // is_integer
1084        assert!(DataType::is_integer(&DataType::Int32));
1085        assert!(DataType::is_integer(&DataType::UInt64));
1086        assert!(!DataType::is_integer(&DataType::Float16));
1087
1088        // is_signed_integer
1089        assert!(DataType::is_signed_integer(&DataType::Int32));
1090        assert!(!DataType::is_signed_integer(&DataType::UInt64));
1091        assert!(!DataType::is_signed_integer(&DataType::Float16));
1092
1093        // is_unsigned_integer
1094        assert!(!DataType::is_unsigned_integer(&DataType::Int32));
1095        assert!(DataType::is_unsigned_integer(&DataType::UInt64));
1096        assert!(!DataType::is_unsigned_integer(&DataType::Float16));
1097
1098        // is_dictionary_key_type
1099        assert!(DataType::is_dictionary_key_type(&DataType::Int32));
1100        assert!(DataType::is_dictionary_key_type(&DataType::UInt64));
1101        assert!(!DataType::is_dictionary_key_type(&DataType::Float16));
1102    }
1103
1104    #[test]
1105    fn test_floating() {
1106        assert!(DataType::is_floating(&DataType::Float16));
1107        assert!(!DataType::is_floating(&DataType::Int32));
1108    }
1109
1110    #[test]
1111    fn test_datatype_is_null() {
1112        assert!(DataType::is_null(&DataType::Null));
1113        assert!(!DataType::is_null(&DataType::Int32));
1114    }
1115
1116    #[test]
1117    fn size_should_not_regress() {
1118        assert_eq!(std::mem::size_of::<DataType>(), 24);
1119    }
1120
1121    #[test]
1122    #[should_panic(expected = "duplicate type id: 1")]
1123    fn test_union_with_duplicated_type_id() {
1124        let type_ids = vec![1, 1];
1125        let _union = DataType::Union(
1126            UnionFields::new(
1127                type_ids,
1128                vec![
1129                    Field::new("f1", DataType::Int32, false),
1130                    Field::new("f2", DataType::Utf8, false),
1131                ],
1132            ),
1133            UnionMode::Dense,
1134        );
1135    }
1136
1137    #[test]
1138    fn test_try_from_str() {
1139        let data_type: DataType = "Int32".try_into().unwrap();
1140        assert_eq!(data_type, DataType::Int32);
1141    }
1142
1143    #[test]
1144    fn test_from_str() {
1145        let data_type: DataType = "UInt64".parse().unwrap();
1146        assert_eq!(data_type, DataType::UInt64);
1147    }
1148}