arrow_array/array/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! The concrete array definitions
19
20mod binary_array;
21
22use crate::types::*;
23use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer};
24use arrow_data::ArrayData;
25use arrow_schema::{DataType, IntervalUnit, TimeUnit};
26use std::any::Any;
27use std::sync::Arc;
28
29pub use binary_array::*;
30
31mod boolean_array;
32pub use boolean_array::*;
33
34mod byte_array;
35pub use byte_array::*;
36
37mod dictionary_array;
38pub use dictionary_array::*;
39
40mod fixed_size_binary_array;
41pub use fixed_size_binary_array::*;
42
43mod fixed_size_list_array;
44pub use fixed_size_list_array::*;
45
46mod list_array;
47pub use list_array::*;
48
49mod map_array;
50pub use map_array::*;
51
52mod null_array;
53pub use null_array::*;
54
55mod primitive_array;
56pub use primitive_array::*;
57
58mod string_array;
59pub use string_array::*;
60
61mod struct_array;
62pub use struct_array::*;
63
64mod union_array;
65pub use union_array::*;
66
67mod run_array;
68
69pub use run_array::*;
70
71mod byte_view_array;
72
73pub use byte_view_array::*;
74
75mod list_view_array;
76
77pub use list_view_array::*;
78
79use crate::iterator::ArrayIter;
80
81/// An array in the [arrow columnar format](https://arrow.apache.org/docs/format/Columnar.html)
82pub trait Array: std::fmt::Debug + Send + Sync {
83    /// Returns the array as [`Any`] so that it can be
84    /// downcasted to a specific implementation.
85    ///
86    /// # Example:
87    ///
88    /// ```
89    /// # use std::sync::Arc;
90    /// # use arrow_array::{Int32Array, RecordBatch};
91    /// # use arrow_schema::{Schema, Field, DataType, ArrowError};
92    ///
93    /// let id = Int32Array::from(vec![1, 2, 3, 4, 5]);
94    /// let batch = RecordBatch::try_new(
95    ///     Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])),
96    ///     vec![Arc::new(id)]
97    /// ).unwrap();
98    ///
99    /// let int32array = batch
100    ///     .column(0)
101    ///     .as_any()
102    ///     .downcast_ref::<Int32Array>()
103    ///     .expect("Failed to downcast");
104    /// ```
105    fn as_any(&self) -> &dyn Any;
106
107    /// Returns the underlying data of this array
108    fn to_data(&self) -> ArrayData;
109
110    /// Returns the underlying data of this array
111    ///
112    /// Unlike [`Array::to_data`] this consumes self, allowing it avoid unnecessary clones
113    fn into_data(self) -> ArrayData;
114
115    /// Returns a reference to the [`DataType`] of this array.
116    ///
117    /// # Example:
118    ///
119    /// ```
120    /// use arrow_schema::DataType;
121    /// use arrow_array::{Array, Int32Array};
122    ///
123    /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]);
124    ///
125    /// assert_eq!(*array.data_type(), DataType::Int32);
126    /// ```
127    fn data_type(&self) -> &DataType;
128
129    /// Returns a zero-copy slice of this array with the indicated offset and length.
130    ///
131    /// # Example:
132    ///
133    /// ```
134    /// use arrow_array::{Array, Int32Array};
135    ///
136    /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]);
137    /// // Make slice over the values [2, 3, 4]
138    /// let array_slice = array.slice(1, 3);
139    ///
140    /// assert_eq!(&array_slice, &Int32Array::from(vec![2, 3, 4]));
141    /// ```
142    fn slice(&self, offset: usize, length: usize) -> ArrayRef;
143
144    /// Returns the length (i.e., number of elements) of this array.
145    ///
146    /// # Example:
147    ///
148    /// ```
149    /// use arrow_array::{Array, Int32Array};
150    ///
151    /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]);
152    ///
153    /// assert_eq!(array.len(), 5);
154    /// ```
155    fn len(&self) -> usize;
156
157    /// Returns whether this array is empty.
158    ///
159    /// # Example:
160    ///
161    /// ```
162    /// use arrow_array::{Array, Int32Array};
163    ///
164    /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]);
165    ///
166    /// assert_eq!(array.is_empty(), false);
167    /// ```
168    fn is_empty(&self) -> bool;
169
170    /// Returns the offset into the underlying data used by this array(-slice).
171    /// Note that the underlying data can be shared by many arrays.
172    /// This defaults to `0`.
173    ///
174    /// # Example:
175    ///
176    /// ```
177    /// use arrow_array::{Array, BooleanArray};
178    ///
179    /// let array = BooleanArray::from(vec![false, false, true, true]);
180    /// let array_slice = array.slice(1, 3);
181    ///
182    /// assert_eq!(array.offset(), 0);
183    /// assert_eq!(array_slice.offset(), 1);
184    /// ```
185    fn offset(&self) -> usize;
186
187    /// Returns the null buffer of this array if any.
188    ///
189    /// The null buffer contains the "physical" nulls of an array, that is how
190    /// the nulls are represented in the underlying arrow format.
191    ///
192    /// The physical representation is efficient, but is sometimes non intuitive
193    /// for certain array types such as those with nullable child arrays like
194    /// [`DictionaryArray::values`], [`RunArray::values`] or [`UnionArray`], or without a
195    /// null buffer, such as [`NullArray`].
196    ///
197    /// To determine if each element of such an array is "logically" null,
198    /// use the slower [`Array::logical_nulls`] to obtain a computed mask.
199    fn nulls(&self) -> Option<&NullBuffer>;
200
201    /// Returns a potentially computed [`NullBuffer`] that represents the logical
202    /// null values of this array, if any.
203    ///
204    /// Logical nulls represent the values that are null in the array,
205    /// regardless of the underlying physical arrow representation.
206    ///
207    /// For most array types, this is equivalent to the "physical" nulls
208    /// returned by [`Array::nulls`]. It is different for the following cases, because which
209    /// elements are null is not encoded in a single null buffer:
210    ///
211    /// * [`DictionaryArray`] where [`DictionaryArray::values`] contains nulls
212    /// * [`RunArray`] where [`RunArray::values`] contains nulls
213    /// * [`NullArray`] where all indices are nulls
214    /// * [`UnionArray`] where the selected values contains nulls
215    ///
216    /// In these cases a logical [`NullBuffer`] will be computed, encoding the
217    /// logical nullability of these arrays, beyond what is encoded in
218    /// [`Array::nulls`]
219    fn logical_nulls(&self) -> Option<NullBuffer> {
220        self.nulls().cloned()
221    }
222
223    /// Returns whether the element at `index` is null according to [`Array::nulls`]
224    ///
225    /// Note: For performance reasons, this method returns nullability solely as determined by the
226    /// null buffer. This difference can lead to surprising results, for example, [`NullArray::is_null`] always
227    /// returns `false` as the array lacks a null buffer. Similarly [`DictionaryArray`], [`RunArray`] and [`UnionArray`] may
228    /// encode nullability in their children. See [`Self::logical_nulls`] for more information.
229    ///
230    /// # Example:
231    ///
232    /// ```
233    /// use arrow_array::{Array, Int32Array, NullArray};
234    ///
235    /// let array = Int32Array::from(vec![Some(1), None]);
236    /// assert_eq!(array.is_null(0), false);
237    /// assert_eq!(array.is_null(1), true);
238    ///
239    /// // NullArrays do not have a null buffer, and therefore always
240    /// // return false for is_null.
241    /// let array = NullArray::new(1);
242    /// assert_eq!(array.is_null(0), false);
243    /// ```
244    fn is_null(&self, index: usize) -> bool {
245        self.nulls().map(|n| n.is_null(index)).unwrap_or_default()
246    }
247
248    /// Returns whether the element at `index` is *not* null, the
249    /// opposite of [`Self::is_null`].
250    ///
251    /// # Example:
252    ///
253    /// ```
254    /// use arrow_array::{Array, Int32Array};
255    ///
256    /// let array = Int32Array::from(vec![Some(1), None]);
257    ///
258    /// assert_eq!(array.is_valid(0), true);
259    /// assert_eq!(array.is_valid(1), false);
260    /// ```
261    fn is_valid(&self, index: usize) -> bool {
262        !self.is_null(index)
263    }
264
265    /// Returns the total number of physical null values in this array.
266    ///
267    /// Note: this method returns the physical null count, i.e. that encoded in [`Array::nulls`],
268    /// see [`Array::logical_nulls`] for logical nullability
269    ///
270    /// # Example:
271    ///
272    /// ```
273    /// use arrow_array::{Array, Int32Array};
274    ///
275    /// // Construct an array with values [1, NULL, NULL]
276    /// let array = Int32Array::from(vec![Some(1), None, None]);
277    ///
278    /// assert_eq!(array.null_count(), 2);
279    /// ```
280    fn null_count(&self) -> usize {
281        self.nulls().map(|n| n.null_count()).unwrap_or_default()
282    }
283
284    /// Returns the total number of logical null values in this array.
285    ///
286    /// Note: this method returns the logical null count, i.e. that encoded in
287    /// [`Array::logical_nulls`]. In general this is equivalent to [`Array::null_count`] but may differ in the
288    /// presence of logical nullability, see [`Array::nulls`] and [`Array::logical_nulls`].
289    ///
290    /// # Example:
291    ///
292    /// ```
293    /// use arrow_array::{Array, Int32Array};
294    ///
295    /// // Construct an array with values [1, NULL, NULL]
296    /// let array = Int32Array::from(vec![Some(1), None, None]);
297    ///
298    /// assert_eq!(array.logical_null_count(), 2);
299    /// ```
300    fn logical_null_count(&self) -> usize {
301        self.logical_nulls()
302            .map(|n| n.null_count())
303            .unwrap_or_default()
304    }
305
306    /// Returns `false` if the array is guaranteed to not contain any logical nulls
307    ///
308    /// This is generally equivalent to `Array::logical_null_count() != 0` unless determining
309    /// the logical nulls is expensive, in which case this method can return true even for an
310    /// array without nulls.
311    ///
312    /// This is also generally equivalent to `Array::null_count() != 0` but may differ in the
313    /// presence of logical nullability, see [`Array::logical_null_count`] and [`Array::null_count`].
314    ///
315    /// Implementations will return `true` unless they can cheaply prove no logical nulls
316    /// are present. For example a [`DictionaryArray`] with nullable values will still return true,
317    /// even if the nulls present in [`DictionaryArray::values`] are not referenced by any key,
318    /// and therefore would not appear in [`Array::logical_nulls`].
319    fn is_nullable(&self) -> bool {
320        // TODO this is not necessarily perfect default implementation, since null_count() and logical_null_count() are not always equivalent
321        self.null_count() != 0
322    }
323
324    /// Returns the total number of bytes of memory pointed to by this array.
325    /// The buffers store bytes in the Arrow memory format, and include the data as well as the validity map.
326    /// Note that this does not always correspond to the exact memory usage of an array,
327    /// since multiple arrays can share the same buffers or slices thereof.
328    fn get_buffer_memory_size(&self) -> usize;
329
330    /// Returns the total number of bytes of memory occupied physically by this array.
331    /// This value will always be greater than returned by `get_buffer_memory_size()` and
332    /// includes the overhead of the data structures that contain the pointers to the various buffers.
333    fn get_array_memory_size(&self) -> usize;
334}
335
/// A reference-counted reference to a generic `Array`
///
/// This is an alias for `Arc<dyn Array>`: cloning an `ArrayRef` clones the [`Arc`] pointer
/// rather than the array it points to.
pub type ArrayRef = Arc<dyn Array>;
338
339/// Ergonomics: Allow use of an ArrayRef as an `&dyn Array`
340impl Array for ArrayRef {
341    fn as_any(&self) -> &dyn Any {
342        self.as_ref().as_any()
343    }
344
345    fn to_data(&self) -> ArrayData {
346        self.as_ref().to_data()
347    }
348
349    fn into_data(self) -> ArrayData {
350        self.to_data()
351    }
352
353    fn data_type(&self) -> &DataType {
354        self.as_ref().data_type()
355    }
356
357    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
358        self.as_ref().slice(offset, length)
359    }
360
361    fn len(&self) -> usize {
362        self.as_ref().len()
363    }
364
365    fn is_empty(&self) -> bool {
366        self.as_ref().is_empty()
367    }
368
369    fn offset(&self) -> usize {
370        self.as_ref().offset()
371    }
372
373    fn nulls(&self) -> Option<&NullBuffer> {
374        self.as_ref().nulls()
375    }
376
377    fn logical_nulls(&self) -> Option<NullBuffer> {
378        self.as_ref().logical_nulls()
379    }
380
381    fn is_null(&self, index: usize) -> bool {
382        self.as_ref().is_null(index)
383    }
384
385    fn is_valid(&self, index: usize) -> bool {
386        self.as_ref().is_valid(index)
387    }
388
389    fn null_count(&self) -> usize {
390        self.as_ref().null_count()
391    }
392
393    fn logical_null_count(&self) -> usize {
394        self.as_ref().logical_null_count()
395    }
396
397    fn is_nullable(&self) -> bool {
398        self.as_ref().is_nullable()
399    }
400
401    fn get_buffer_memory_size(&self) -> usize {
402        self.as_ref().get_buffer_memory_size()
403    }
404
405    fn get_array_memory_size(&self) -> usize {
406        self.as_ref().get_array_memory_size()
407    }
408}
409
410impl<T: Array> Array for &T {
411    fn as_any(&self) -> &dyn Any {
412        T::as_any(self)
413    }
414
415    fn to_data(&self) -> ArrayData {
416        T::to_data(self)
417    }
418
419    fn into_data(self) -> ArrayData {
420        self.to_data()
421    }
422
423    fn data_type(&self) -> &DataType {
424        T::data_type(self)
425    }
426
427    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
428        T::slice(self, offset, length)
429    }
430
431    fn len(&self) -> usize {
432        T::len(self)
433    }
434
435    fn is_empty(&self) -> bool {
436        T::is_empty(self)
437    }
438
439    fn offset(&self) -> usize {
440        T::offset(self)
441    }
442
443    fn nulls(&self) -> Option<&NullBuffer> {
444        T::nulls(self)
445    }
446
447    fn logical_nulls(&self) -> Option<NullBuffer> {
448        T::logical_nulls(self)
449    }
450
451    fn is_null(&self, index: usize) -> bool {
452        T::is_null(self, index)
453    }
454
455    fn is_valid(&self, index: usize) -> bool {
456        T::is_valid(self, index)
457    }
458
459    fn null_count(&self) -> usize {
460        T::null_count(self)
461    }
462
463    fn logical_null_count(&self) -> usize {
464        T::logical_null_count(self)
465    }
466
467    fn is_nullable(&self) -> bool {
468        T::is_nullable(self)
469    }
470
471    fn get_buffer_memory_size(&self) -> usize {
472        T::get_buffer_memory_size(self)
473    }
474
475    fn get_array_memory_size(&self) -> usize {
476        T::get_array_memory_size(self)
477    }
478}
479
/// A generic trait for accessing the values of an [`Array`]
///
/// This trait helps write specialized implementations of algorithms for
/// different array types. Specialized implementations allow the compiler
/// to optimize the code for the specific array type, which can lead to
/// significant performance improvements.
///
/// # Example
/// For example, to write three different implementations of a string length function
/// for [`StringArray`], [`LargeStringArray`], and [`StringViewArray`], you can write
///
/// ```
/// # use std::sync::Arc;
/// # use arrow_array::{ArrayAccessor, ArrayRef, ArrowPrimitiveType, OffsetSizeTrait, PrimitiveArray};
/// # use arrow_buffer::ArrowNativeType;
/// # use arrow_array::cast::AsArray;
/// # use arrow_array::iterator::ArrayIter;
/// # use arrow_array::types::{Int32Type, Int64Type};
/// # use arrow_schema::{ArrowError, DataType};
/// /// This function takes a dynamically typed `ArrayRef` and
/// /// calls one of three specialized implementations
/// fn character_length(arg: ArrayRef) -> Result<ArrayRef, ArrowError> {
///     match arg.data_type() {
///         DataType::Utf8 => {
///             // downcast the ArrayRef to a StringArray and call the specialized implementation
///             let string_array = arg.as_string::<i32>();
///             character_length_general::<Int32Type, _>(string_array)
///         }
///         DataType::LargeUtf8 => {
///             character_length_general::<Int64Type, _>(arg.as_string::<i64>())
///         }
///         DataType::Utf8View => {
///             character_length_general::<Int32Type, _>(arg.as_string_view())
///         }
///         _ => Err(ArrowError::InvalidArgumentError("Unsupported data type".to_string())),
///     }
/// }
///
/// /// A generic implementation of the character_length function
/// /// This function uses the `ArrayAccessor` trait to access the values of the array
/// /// so the compiler can generate specialized implementations for different array types
/// ///
/// /// Returns a new array with the length of each string in the input array
/// /// * Int32Array for Utf8 and Utf8View arrays (lengths are 32-bit integers)
/// /// * Int64Array for LargeUtf8 arrays (lengths are 64-bit integers)
/// ///
/// /// This is generic on the type of the primitive array (different string arrays have
/// /// different lengths) and the type of the array accessor (different string arrays
/// /// have different ways to access the values)
/// fn character_length_general<'a, T: ArrowPrimitiveType, V: ArrayAccessor<Item = &'a str>>(
///     array: V,
/// ) -> Result<ArrayRef, ArrowError>
/// where
///     T::Native: OffsetSizeTrait,
/// {
///     let iter = ArrayIter::new(array);
///     // Create a Int32Array / Int64Array with the length of each string
///     let result = iter
///         .map(|string| {
///             string.map(|string: &str| {
///                 T::Native::from_usize(string.chars().count())
///                     .expect("should not fail as string.chars will always return integer")
///             })
///         })
///         .collect::<PrimitiveArray<T>>();
///
///     // Return the result as a new ArrayRef (dynamically typed)
///     Ok(Arc::new(result) as ArrayRef)
/// }
/// ```
///
/// # Validity
///
/// An [`ArrayAccessor`] must always return a well-defined value for an index
/// that is within the bounds `0..Array::len`, including for null indexes where
/// [`Array::is_null`] is true.
///
/// The value at null indexes is unspecified, and implementations must not rely
/// on a specific value such as [`Default::default`] being returned; however, it
/// must not be undefined.
pub trait ArrayAccessor: Array {
    /// The Arrow type of the element being accessed.
    type Item: Send + Sync;

    /// Returns the element at `index`
    /// # Panics
    /// Panics if `index` is outside the bounds of the array
    fn value(&self, index: usize) -> Self::Item;

    /// Returns the element at `index`
    /// # Safety
    /// Caller is responsible for ensuring that `index` is within the bounds of the array
    unsafe fn value_unchecked(&self, index: usize) -> Self::Item;
}
574
/// A trait for Arrow String Arrays, currently three types are supported:
/// - `StringArray`
/// - `LargeStringArray`
/// - `StringViewArray`
///
/// This trait helps to abstract over the different types of string arrays
/// so that we don't need to duplicate the implementation for each type.
///
/// Note that the trait is implemented for *references* to these arrays
/// (e.g. `&'a StringViewArray`), so that the accessed items can be `&'a str`.
pub trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
    /// Returns true if all data within this string array is ASCII
    fn is_ascii(&self) -> bool;

    /// Constructs a new iterator over this array, returned as an [`ArrayIter`]
    fn iter(&self) -> ArrayIter<Self>;
}
589
590impl<'a, O: OffsetSizeTrait> StringArrayType<'a> for &'a GenericStringArray<O> {
591    fn is_ascii(&self) -> bool {
592        GenericStringArray::<O>::is_ascii(self)
593    }
594
595    fn iter(&self) -> ArrayIter<Self> {
596        GenericStringArray::<O>::iter(self)
597    }
598}
599impl<'a> StringArrayType<'a> for &'a StringViewArray {
600    fn is_ascii(&self) -> bool {
601        StringViewArray::is_ascii(self)
602    }
603
604    fn iter(&self) -> ArrayIter<Self> {
605        StringViewArray::iter(self)
606    }
607}
608
609impl PartialEq for dyn Array + '_ {
610    fn eq(&self, other: &Self) -> bool {
611        self.to_data().eq(&other.to_data())
612    }
613}
614
615impl<T: Array> PartialEq<T> for dyn Array + '_ {
616    fn eq(&self, other: &T) -> bool {
617        self.to_data().eq(&other.to_data())
618    }
619}
620
621impl PartialEq for NullArray {
622    fn eq(&self, other: &NullArray) -> bool {
623        self.to_data().eq(&other.to_data())
624    }
625}
626
627impl<T: ArrowPrimitiveType> PartialEq for PrimitiveArray<T> {
628    fn eq(&self, other: &PrimitiveArray<T>) -> bool {
629        self.to_data().eq(&other.to_data())
630    }
631}
632
633impl<K: ArrowDictionaryKeyType> PartialEq for DictionaryArray<K> {
634    fn eq(&self, other: &Self) -> bool {
635        self.to_data().eq(&other.to_data())
636    }
637}
638
639impl PartialEq for BooleanArray {
640    fn eq(&self, other: &BooleanArray) -> bool {
641        self.to_data().eq(&other.to_data())
642    }
643}
644
645impl<OffsetSize: OffsetSizeTrait> PartialEq for GenericStringArray<OffsetSize> {
646    fn eq(&self, other: &Self) -> bool {
647        self.to_data().eq(&other.to_data())
648    }
649}
650
651impl<OffsetSize: OffsetSizeTrait> PartialEq for GenericBinaryArray<OffsetSize> {
652    fn eq(&self, other: &Self) -> bool {
653        self.to_data().eq(&other.to_data())
654    }
655}
656
657impl PartialEq for FixedSizeBinaryArray {
658    fn eq(&self, other: &Self) -> bool {
659        self.to_data().eq(&other.to_data())
660    }
661}
662
663impl<OffsetSize: OffsetSizeTrait> PartialEq for GenericListArray<OffsetSize> {
664    fn eq(&self, other: &Self) -> bool {
665        self.to_data().eq(&other.to_data())
666    }
667}
668
669impl<OffsetSize: OffsetSizeTrait> PartialEq for GenericListViewArray<OffsetSize> {
670    fn eq(&self, other: &Self) -> bool {
671        self.to_data().eq(&other.to_data())
672    }
673}
674
675impl PartialEq for MapArray {
676    fn eq(&self, other: &Self) -> bool {
677        self.to_data().eq(&other.to_data())
678    }
679}
680
681impl PartialEq for FixedSizeListArray {
682    fn eq(&self, other: &Self) -> bool {
683        self.to_data().eq(&other.to_data())
684    }
685}
686
687impl PartialEq for StructArray {
688    fn eq(&self, other: &Self) -> bool {
689        self.to_data().eq(&other.to_data())
690    }
691}
692
693impl<T: ByteViewType + ?Sized> PartialEq for GenericByteViewArray<T> {
694    fn eq(&self, other: &Self) -> bool {
695        self.to_data().eq(&other.to_data())
696    }
697}
698
699/// Constructs an array using the input `data`.
700/// Returns a reference-counted `Array` instance.
701pub fn make_array(data: ArrayData) -> ArrayRef {
702    match data.data_type() {
703        DataType::Boolean => Arc::new(BooleanArray::from(data)) as ArrayRef,
704        DataType::Int8 => Arc::new(Int8Array::from(data)) as ArrayRef,
705        DataType::Int16 => Arc::new(Int16Array::from(data)) as ArrayRef,
706        DataType::Int32 => Arc::new(Int32Array::from(data)) as ArrayRef,
707        DataType::Int64 => Arc::new(Int64Array::from(data)) as ArrayRef,
708        DataType::UInt8 => Arc::new(UInt8Array::from(data)) as ArrayRef,
709        DataType::UInt16 => Arc::new(UInt16Array::from(data)) as ArrayRef,
710        DataType::UInt32 => Arc::new(UInt32Array::from(data)) as ArrayRef,
711        DataType::UInt64 => Arc::new(UInt64Array::from(data)) as ArrayRef,
712        DataType::Float16 => Arc::new(Float16Array::from(data)) as ArrayRef,
713        DataType::Float32 => Arc::new(Float32Array::from(data)) as ArrayRef,
714        DataType::Float64 => Arc::new(Float64Array::from(data)) as ArrayRef,
715        DataType::Date32 => Arc::new(Date32Array::from(data)) as ArrayRef,
716        DataType::Date64 => Arc::new(Date64Array::from(data)) as ArrayRef,
717        DataType::Time32(TimeUnit::Second) => Arc::new(Time32SecondArray::from(data)) as ArrayRef,
718        DataType::Time32(TimeUnit::Millisecond) => {
719            Arc::new(Time32MillisecondArray::from(data)) as ArrayRef
720        }
721        DataType::Time64(TimeUnit::Microsecond) => {
722            Arc::new(Time64MicrosecondArray::from(data)) as ArrayRef
723        }
724        DataType::Time64(TimeUnit::Nanosecond) => {
725            Arc::new(Time64NanosecondArray::from(data)) as ArrayRef
726        }
727        DataType::Timestamp(TimeUnit::Second, _) => {
728            Arc::new(TimestampSecondArray::from(data)) as ArrayRef
729        }
730        DataType::Timestamp(TimeUnit::Millisecond, _) => {
731            Arc::new(TimestampMillisecondArray::from(data)) as ArrayRef
732        }
733        DataType::Timestamp(TimeUnit::Microsecond, _) => {
734            Arc::new(TimestampMicrosecondArray::from(data)) as ArrayRef
735        }
736        DataType::Timestamp(TimeUnit::Nanosecond, _) => {
737            Arc::new(TimestampNanosecondArray::from(data)) as ArrayRef
738        }
739        DataType::Interval(IntervalUnit::YearMonth) => {
740            Arc::new(IntervalYearMonthArray::from(data)) as ArrayRef
741        }
742        DataType::Interval(IntervalUnit::DayTime) => {
743            Arc::new(IntervalDayTimeArray::from(data)) as ArrayRef
744        }
745        DataType::Interval(IntervalUnit::MonthDayNano) => {
746            Arc::new(IntervalMonthDayNanoArray::from(data)) as ArrayRef
747        }
748        DataType::Duration(TimeUnit::Second) => {
749            Arc::new(DurationSecondArray::from(data)) as ArrayRef
750        }
751        DataType::Duration(TimeUnit::Millisecond) => {
752            Arc::new(DurationMillisecondArray::from(data)) as ArrayRef
753        }
754        DataType::Duration(TimeUnit::Microsecond) => {
755            Arc::new(DurationMicrosecondArray::from(data)) as ArrayRef
756        }
757        DataType::Duration(TimeUnit::Nanosecond) => {
758            Arc::new(DurationNanosecondArray::from(data)) as ArrayRef
759        }
760        DataType::Binary => Arc::new(BinaryArray::from(data)) as ArrayRef,
761        DataType::LargeBinary => Arc::new(LargeBinaryArray::from(data)) as ArrayRef,
762        DataType::FixedSizeBinary(_) => Arc::new(FixedSizeBinaryArray::from(data)) as ArrayRef,
763        DataType::BinaryView => Arc::new(BinaryViewArray::from(data)) as ArrayRef,
764        DataType::Utf8 => Arc::new(StringArray::from(data)) as ArrayRef,
765        DataType::LargeUtf8 => Arc::new(LargeStringArray::from(data)) as ArrayRef,
766        DataType::Utf8View => Arc::new(StringViewArray::from(data)) as ArrayRef,
767        DataType::List(_) => Arc::new(ListArray::from(data)) as ArrayRef,
768        DataType::LargeList(_) => Arc::new(LargeListArray::from(data)) as ArrayRef,
769        DataType::ListView(_) => Arc::new(ListViewArray::from(data)) as ArrayRef,
770        DataType::LargeListView(_) => Arc::new(LargeListViewArray::from(data)) as ArrayRef,
771        DataType::Struct(_) => Arc::new(StructArray::from(data)) as ArrayRef,
772        DataType::Map(_, _) => Arc::new(MapArray::from(data)) as ArrayRef,
773        DataType::Union(_, _) => Arc::new(UnionArray::from(data)) as ArrayRef,
774        DataType::FixedSizeList(_, _) => Arc::new(FixedSizeListArray::from(data)) as ArrayRef,
775        DataType::Dictionary(ref key_type, _) => match key_type.as_ref() {
776            DataType::Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)) as ArrayRef,
777            DataType::Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)) as ArrayRef,
778            DataType::Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)) as ArrayRef,
779            DataType::Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)) as ArrayRef,
780            DataType::UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)) as ArrayRef,
781            DataType::UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)) as ArrayRef,
782            DataType::UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)) as ArrayRef,
783            DataType::UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)) as ArrayRef,
784            dt => panic!("Unexpected dictionary key type {dt:?}"),
785        },
786        DataType::RunEndEncoded(ref run_ends_type, _) => match run_ends_type.data_type() {
787            DataType::Int16 => Arc::new(RunArray::<Int16Type>::from(data)) as ArrayRef,
788            DataType::Int32 => Arc::new(RunArray::<Int32Type>::from(data)) as ArrayRef,
789            DataType::Int64 => Arc::new(RunArray::<Int64Type>::from(data)) as ArrayRef,
790            dt => panic!("Unexpected data type for run_ends array {dt:?}"),
791        },
792        DataType::Null => Arc::new(NullArray::from(data)) as ArrayRef,
793        DataType::Decimal128(_, _) => Arc::new(Decimal128Array::from(data)) as ArrayRef,
794        DataType::Decimal256(_, _) => Arc::new(Decimal256Array::from(data)) as ArrayRef,
795        dt => panic!("Unexpected data type {dt:?}"),
796    }
797}
798
799/// Creates a new empty array
800///
801/// ```
802/// use std::sync::Arc;
803/// use arrow_schema::DataType;
804/// use arrow_array::{ArrayRef, Int32Array, new_empty_array};
805///
806/// let empty_array = new_empty_array(&DataType::Int32);
807/// let array: ArrayRef = Arc::new(Int32Array::from(vec![] as Vec<i32>));
808///
809/// assert_eq!(&array, &empty_array);
810/// ```
811pub fn new_empty_array(data_type: &DataType) -> ArrayRef {
812    let data = ArrayData::new_empty(data_type);
813    make_array(data)
814}
815
816/// Creates a new array of `data_type` of length `length` filled
817/// entirely of `NULL` values
818///
819/// ```
820/// use std::sync::Arc;
821/// use arrow_schema::DataType;
822/// use arrow_array::{ArrayRef, Int32Array, new_null_array};
823///
824/// let null_array = new_null_array(&DataType::Int32, 3);
825/// let array: ArrayRef = Arc::new(Int32Array::from(vec![None, None, None]));
826///
827/// assert_eq!(&array, &null_array);
828/// ```
829pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef {
830    make_array(ArrayData::new_null(data_type, length))
831}
832
833/// Helper function that gets offset from an [`ArrayData`]
834///
835/// # Safety
836///
837/// - ArrayData must contain a valid [`OffsetBuffer`] as its first buffer
838unsafe fn get_offsets<O: ArrowNativeType>(data: &ArrayData) -> OffsetBuffer<O> {
839    match data.is_empty() && data.buffers()[0].is_empty() {
840        true => OffsetBuffer::new_empty(),
841        false => {
842            let buffer =
843                ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len() + 1);
844            // Safety:
845            // ArrayData is valid
846            unsafe { OffsetBuffer::new_unchecked(buffer) }
847        }
848    }
849}
850
851/// Helper function for printing potentially long arrays.
852fn print_long_array<A, F>(array: &A, f: &mut std::fmt::Formatter, print_item: F) -> std::fmt::Result
853where
854    A: Array,
855    F: Fn(&A, usize, &mut std::fmt::Formatter) -> std::fmt::Result,
856{
857    let head = std::cmp::min(10, array.len());
858
859    for i in 0..head {
860        if array.is_null(i) {
861            writeln!(f, "  null,")?;
862        } else {
863            write!(f, "  ")?;
864            print_item(array, i, f)?;
865            writeln!(f, ",")?;
866        }
867    }
868    if array.len() > 10 {
869        if array.len() > 20 {
870            writeln!(f, "  ...{} elements...,", array.len() - 20)?;
871        }
872
873        let tail = std::cmp::max(head, array.len() - 10);
874
875        for i in tail..array.len() {
876            if array.is_null(i) {
877                writeln!(f, "  null,")?;
878            } else {
879                write!(f, "  ")?;
880                print_item(array, i, f)?;
881                writeln!(f, ",")?;
882            }
883        }
884    }
885    Ok(())
886}
887
888#[cfg(test)]
889mod tests {
890    use super::*;
891    use crate::cast::{as_union_array, downcast_array};
892    use crate::downcast_run_array;
893    use arrow_buffer::MutableBuffer;
894    use arrow_schema::{Field, Fields, UnionFields, UnionMode};
895
896    #[test]
897    fn test_empty_primitive() {
898        let array = new_empty_array(&DataType::Int32);
899        let a = array.as_any().downcast_ref::<Int32Array>().unwrap();
900        assert_eq!(a.len(), 0);
901        let expected: &[i32] = &[];
902        assert_eq!(a.values(), expected);
903    }
904
905    #[test]
906    fn test_empty_variable_sized() {
907        let array = new_empty_array(&DataType::Utf8);
908        let a = array.as_any().downcast_ref::<StringArray>().unwrap();
909        assert_eq!(a.len(), 0);
910        assert_eq!(a.value_offsets()[0], 0i32);
911    }
912
913    #[test]
914    fn test_empty_list_primitive() {
915        let data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false)));
916        let array = new_empty_array(&data_type);
917        let a = array.as_any().downcast_ref::<ListArray>().unwrap();
918        assert_eq!(a.len(), 0);
919        assert_eq!(a.value_offsets()[0], 0i32);
920    }
921
922    #[test]
923    fn test_null_boolean() {
924        let array = new_null_array(&DataType::Boolean, 9);
925        let a = array.as_any().downcast_ref::<BooleanArray>().unwrap();
926        assert_eq!(a.len(), 9);
927        for i in 0..9 {
928            assert!(a.is_null(i));
929        }
930    }
931
932    #[test]
933    fn test_null_primitive() {
934        let array = new_null_array(&DataType::Int32, 9);
935        let a = array.as_any().downcast_ref::<Int32Array>().unwrap();
936        assert_eq!(a.len(), 9);
937        for i in 0..9 {
938            assert!(a.is_null(i));
939        }
940    }
941
942    #[test]
943    fn test_null_struct() {
944        // It is possible to create a null struct containing a non-nullable child
945        // see https://github.com/apache/arrow-rs/pull/3244 for details
946        let struct_type = DataType::Struct(vec![Field::new("data", DataType::Int64, false)].into());
947        let array = new_null_array(&struct_type, 9);
948
949        let a = array.as_any().downcast_ref::<StructArray>().unwrap();
950        assert_eq!(a.len(), 9);
951        assert_eq!(a.column(0).len(), 9);
952        for i in 0..9 {
953            assert!(a.is_null(i));
954        }
955
956        // Make sure we can slice the resulting array.
957        a.slice(0, 5);
958    }
959
960    #[test]
961    fn test_null_variable_sized() {
962        let array = new_null_array(&DataType::Utf8, 9);
963        let a = array.as_any().downcast_ref::<StringArray>().unwrap();
964        assert_eq!(a.len(), 9);
965        assert_eq!(a.value_offsets()[9], 0i32);
966        for i in 0..9 {
967            assert!(a.is_null(i));
968        }
969    }
970
971    #[test]
972    fn test_null_list_primitive() {
973        let data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true)));
974        let array = new_null_array(&data_type, 9);
975        let a = array.as_any().downcast_ref::<ListArray>().unwrap();
976        assert_eq!(a.len(), 9);
977        assert_eq!(a.value_offsets()[9], 0i32);
978        for i in 0..9 {
979            assert!(a.is_null(i));
980        }
981    }
982
983    #[test]
984    fn test_null_map() {
985        let data_type = DataType::Map(
986            Arc::new(Field::new(
987                "entry",
988                DataType::Struct(Fields::from(vec![
989                    Field::new("key", DataType::Utf8, false),
990                    Field::new("value", DataType::Int32, true),
991                ])),
992                false,
993            )),
994            false,
995        );
996        let array = new_null_array(&data_type, 9);
997        let a = array.as_any().downcast_ref::<MapArray>().unwrap();
998        assert_eq!(a.len(), 9);
999        assert_eq!(a.value_offsets()[9], 0i32);
1000        for i in 0..9 {
1001            assert!(a.is_null(i));
1002        }
1003    }
1004
1005    #[test]
1006    fn test_null_dictionary() {
1007        let values =
1008            vec![None, None, None, None, None, None, None, None, None] as Vec<Option<&str>>;
1009
1010        let array: DictionaryArray<Int8Type> = values.into_iter().collect();
1011        let array = Arc::new(array) as ArrayRef;
1012
1013        let null_array = new_null_array(array.data_type(), 9);
1014        assert_eq!(&array, &null_array);
1015        assert_eq!(
1016            array.to_data().buffers()[0].len(),
1017            null_array.to_data().buffers()[0].len()
1018        );
1019    }
1020
1021    #[test]
1022    fn test_null_union() {
1023        for mode in [UnionMode::Sparse, UnionMode::Dense] {
1024            let data_type = DataType::Union(
1025                UnionFields::new(
1026                    vec![2, 1],
1027                    vec![
1028                        Field::new("foo", DataType::Int32, true),
1029                        Field::new("bar", DataType::Int64, true),
1030                    ],
1031                ),
1032                mode,
1033            );
1034            let array = new_null_array(&data_type, 4);
1035
1036            let array = as_union_array(array.as_ref());
1037            assert_eq!(array.len(), 4);
1038            assert_eq!(array.null_count(), 0);
1039            assert_eq!(array.logical_null_count(), 4);
1040
1041            for i in 0..4 {
1042                let a = array.value(i);
1043                assert_eq!(a.len(), 1);
1044                assert_eq!(a.null_count(), 1);
1045                assert_eq!(a.logical_null_count(), 1);
1046                assert!(a.is_null(0))
1047            }
1048
1049            array.to_data().validate_full().unwrap();
1050        }
1051    }
1052
1053    #[test]
1054    #[allow(unused_parens)]
1055    fn test_null_runs() {
1056        for r in [DataType::Int16, DataType::Int32, DataType::Int64] {
1057            let data_type = DataType::RunEndEncoded(
1058                Arc::new(Field::new("run_ends", r, false)),
1059                Arc::new(Field::new("values", DataType::Utf8, true)),
1060            );
1061
1062            let array = new_null_array(&data_type, 4);
1063            let array = array.as_ref();
1064
1065            downcast_run_array! {
1066                array => {
1067                    assert_eq!(array.len(), 4);
1068                    assert_eq!(array.null_count(), 0);
1069                    assert_eq!(array.logical_null_count(), 4);
1070                    assert_eq!(array.values().len(), 1);
1071                    assert_eq!(array.values().null_count(), 1);
1072                    assert_eq!(array.run_ends().len(), 4);
1073                    assert_eq!(array.run_ends().values(), &[4]);
1074
1075                    let idx = array.get_physical_indices(&[0, 1, 2, 3]).unwrap();
1076                    assert_eq!(idx, &[0,0,0,0]);
1077                }
1078                d => unreachable!("{d}")
1079            }
1080        }
1081    }
1082
1083    #[test]
1084    fn test_null_fixed_size_binary() {
1085        for size in [1, 2, 7] {
1086            let array = new_null_array(&DataType::FixedSizeBinary(size), 6);
1087            let array = array
1088                .as_ref()
1089                .as_any()
1090                .downcast_ref::<FixedSizeBinaryArray>()
1091                .unwrap();
1092
1093            assert_eq!(array.len(), 6);
1094            assert_eq!(array.null_count(), 6);
1095            assert_eq!(array.logical_null_count(), 6);
1096            array.iter().for_each(|x| assert!(x.is_none()));
1097        }
1098    }
1099
1100    #[test]
1101    fn test_memory_size_null() {
1102        let null_arr = NullArray::new(32);
1103
1104        assert_eq!(0, null_arr.get_buffer_memory_size());
1105        assert_eq!(
1106            std::mem::size_of::<usize>(),
1107            null_arr.get_array_memory_size()
1108        );
1109    }
1110
1111    #[test]
1112    fn test_memory_size_primitive() {
1113        let arr = PrimitiveArray::<Int64Type>::from_iter_values(0..128);
1114        let empty = PrimitiveArray::<Int64Type>::from(ArrayData::new_empty(arr.data_type()));
1115
1116        // subtract empty array to avoid magic numbers for the size of additional fields
1117        assert_eq!(
1118            arr.get_array_memory_size() - empty.get_array_memory_size(),
1119            128 * std::mem::size_of::<i64>()
1120        );
1121    }
1122
1123    #[test]
1124    fn test_memory_size_primitive_sliced() {
1125        let arr = PrimitiveArray::<Int64Type>::from_iter_values(0..128);
1126        let slice1 = arr.slice(0, 64);
1127        let slice2 = arr.slice(64, 64);
1128
1129        // both slices report the full buffer memory usage, even though the buffers are shared
1130        assert_eq!(slice1.get_array_memory_size(), arr.get_array_memory_size());
1131        assert_eq!(slice2.get_array_memory_size(), arr.get_array_memory_size());
1132    }
1133
1134    #[test]
1135    fn test_memory_size_primitive_nullable() {
1136        let arr: PrimitiveArray<Int64Type> = (0..128)
1137            .map(|i| if i % 20 == 0 { Some(i) } else { None })
1138            .collect();
1139        let empty_with_bitmap = PrimitiveArray::<Int64Type>::from(
1140            ArrayData::builder(arr.data_type().clone())
1141                .add_buffer(MutableBuffer::new(0).into())
1142                .null_bit_buffer(Some(MutableBuffer::new_null(0).into()))
1143                .build()
1144                .unwrap(),
1145        );
1146
1147        // expected size is the size of the PrimitiveArray struct,
1148        // which includes the optional validity buffer
1149        // plus one buffer on the heap
1150        assert_eq!(
1151            std::mem::size_of::<PrimitiveArray<Int64Type>>(),
1152            empty_with_bitmap.get_array_memory_size()
1153        );
1154
1155        // subtract empty array to avoid magic numbers for the size of additional fields
1156        // the size of the validity bitmap is rounded up to 64 bytes
1157        assert_eq!(
1158            arr.get_array_memory_size() - empty_with_bitmap.get_array_memory_size(),
1159            128 * std::mem::size_of::<i64>() + 64
1160        );
1161    }
1162
1163    #[test]
1164    fn test_memory_size_dictionary() {
1165        let values = PrimitiveArray::<Int64Type>::from_iter_values(0..16);
1166        let keys = PrimitiveArray::<Int16Type>::from_iter_values(
1167            (0..256).map(|i| (i % values.len()) as i16),
1168        );
1169
1170        let dict_data_type = DataType::Dictionary(
1171            Box::new(keys.data_type().clone()),
1172            Box::new(values.data_type().clone()),
1173        );
1174        let dict_data = keys
1175            .into_data()
1176            .into_builder()
1177            .data_type(dict_data_type)
1178            .child_data(vec![values.into_data()])
1179            .build()
1180            .unwrap();
1181
1182        let empty_data = ArrayData::new_empty(&DataType::Dictionary(
1183            Box::new(DataType::Int16),
1184            Box::new(DataType::Int64),
1185        ));
1186
1187        let arr = DictionaryArray::<Int16Type>::from(dict_data);
1188        let empty = DictionaryArray::<Int16Type>::from(empty_data);
1189
1190        let expected_keys_size = 256 * std::mem::size_of::<i16>();
1191        assert_eq!(
1192            arr.keys().get_array_memory_size() - empty.keys().get_array_memory_size(),
1193            expected_keys_size
1194        );
1195
1196        let expected_values_size = 16 * std::mem::size_of::<i64>();
1197        assert_eq!(
1198            arr.values().get_array_memory_size() - empty.values().get_array_memory_size(),
1199            expected_values_size
1200        );
1201
1202        let expected_size = expected_keys_size + expected_values_size;
1203        assert_eq!(
1204            arr.get_array_memory_size() - empty.get_array_memory_size(),
1205            expected_size
1206        );
1207    }
1208
1209    /// Test function that takes an &dyn Array
1210    fn compute_my_thing(arr: &dyn Array) -> bool {
1211        !arr.is_empty()
1212    }
1213
1214    #[test]
1215    fn test_array_ref_as_array() {
1216        let arr: Int32Array = vec![1, 2, 3].into_iter().map(Some).collect();
1217
1218        // works well!
1219        assert!(compute_my_thing(&arr));
1220
1221        // Should also work when wrapped as an ArrayRef
1222        let arr: ArrayRef = Arc::new(arr);
1223        assert!(compute_my_thing(&arr));
1224        assert!(compute_my_thing(arr.as_ref()));
1225    }
1226
1227    #[test]
1228    fn test_downcast_array() {
1229        let array: Int32Array = vec![1, 2, 3].into_iter().map(Some).collect();
1230
1231        let boxed: ArrayRef = Arc::new(array);
1232        let array: Int32Array = downcast_array(&boxed);
1233
1234        let expected: Int32Array = vec![1, 2, 3].into_iter().map(Some).collect();
1235        assert_eq!(array, expected);
1236    }
1237}