1use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24 bit_util, i256, ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
/// Borrowed slice of [`Buffer`]s.
///
/// Deprecated and hidden from the generated documentation; see the
/// deprecation note for the replacement.
#[doc(hidden)]
#[deprecated(note = "Use [Buffer]")]
pub type Buffers<'a> = &'a [Buffer];
37
38#[inline]
39pub(crate) fn contains_nulls(
40 null_bit_buffer: Option<&NullBuffer>,
41 offset: usize,
42 len: usize,
43) -> bool {
44 match null_bit_buffer {
45 Some(buffer) => {
46 match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
47 Some((start, end)) => start != 0 || end != len,
48 None => len != 0, }
50 }
51 None => false, }
53}
54
55#[inline]
56pub(crate) fn count_nulls(
57 null_bit_buffer: Option<&NullBuffer>,
58 offset: usize,
59 len: usize,
60) -> usize {
61 if let Some(buf) = null_bit_buffer {
62 let buffer = buf.buffer();
63 len - buffer.count_set_bits_offset(offset + buf.offset(), len)
64 } else {
65 0
66 }
67}
68
69#[inline]
71pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
72 let empty_buffer = MutableBuffer::new(0);
73 match data_type {
74 DataType::Null => [empty_buffer, MutableBuffer::new(0)],
75 DataType::Boolean => {
76 let bytes = bit_util::ceil(capacity, 8);
77 let buffer = MutableBuffer::new(bytes);
78 [buffer, empty_buffer]
79 }
80 DataType::UInt8
81 | DataType::UInt16
82 | DataType::UInt32
83 | DataType::UInt64
84 | DataType::Int8
85 | DataType::Int16
86 | DataType::Int32
87 | DataType::Int64
88 | DataType::Float16
89 | DataType::Float32
90 | DataType::Float64
91 | DataType::Date32
92 | DataType::Time32(_)
93 | DataType::Date64
94 | DataType::Time64(_)
95 | DataType::Duration(_)
96 | DataType::Timestamp(_, _)
97 | DataType::Interval(_) => [
98 MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
99 empty_buffer,
100 ],
101 DataType::Utf8 | DataType::Binary => {
102 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
103 buffer.push(0i32);
105 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
106 }
107 DataType::LargeUtf8 | DataType::LargeBinary => {
108 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
109 buffer.push(0i64);
111 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
112 }
113 DataType::BinaryView | DataType::Utf8View => [
114 MutableBuffer::new(capacity * mem::size_of::<u128>()),
115 empty_buffer,
116 ],
117 DataType::List(_) | DataType::Map(_, _) => {
118 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
120 buffer.push(0i32);
121 [buffer, empty_buffer]
122 }
123 DataType::ListView(_) => [
124 MutableBuffer::new(capacity * mem::size_of::<i32>()),
125 MutableBuffer::new(capacity * mem::size_of::<i32>()),
126 ],
127 DataType::LargeList(_) => {
128 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
130 buffer.push(0i64);
131 [buffer, empty_buffer]
132 }
133 DataType::LargeListView(_) => [
134 MutableBuffer::new(capacity * mem::size_of::<i64>()),
135 MutableBuffer::new(capacity * mem::size_of::<i64>()),
136 ],
137 DataType::FixedSizeBinary(size) => {
138 [MutableBuffer::new(capacity * *size as usize), empty_buffer]
139 }
140 DataType::Dictionary(k, _) => [
141 MutableBuffer::new(capacity * k.primitive_width().unwrap()),
142 empty_buffer,
143 ],
144 DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
145 [empty_buffer, MutableBuffer::new(0)]
146 }
147 DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => [
148 MutableBuffer::new(capacity * mem::size_of::<u8>()),
149 empty_buffer,
150 ],
151 DataType::Union(_, mode) => {
152 let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
153 match mode {
154 UnionMode::Sparse => [type_ids, empty_buffer],
155 UnionMode::Dense => {
156 let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
157 [type_ids, offsets]
158 }
159 }
160 }
161 }
162}
163
/// Untyped description of an array: a logical [`DataType`] together with the
/// buffers, child arrays and optional validity information that back it.
#[derive(Debug, Clone)]
pub struct ArrayData {
    /// Logical type of the array.
    data_type: DataType,

    /// Number of elements in the array.
    len: usize,

    /// Number of leading elements skipped in `buffers`; `slice` advances it.
    offset: usize,

    /// Data buffers; their number and kind are checked against
    /// `layout(&data_type)` by `validate`.
    buffers: Vec<Buffer>,

    /// Child arrays, used by nested types.
    child_data: Vec<ArrayData>,

    /// Validity information; `None` is treated as "no nulls" by `is_null`
    /// and `null_count`.
    nulls: Option<NullBuffer>,
}
233
/// Reference-counted, shared handle to an [`ArrayData`].
pub type ArrayDataRef = Arc<ArrayData>;
236
237impl ArrayData {
238 pub unsafe fn new_unchecked(
255 data_type: DataType,
256 len: usize,
257 null_count: Option<usize>,
258 null_bit_buffer: Option<Buffer>,
259 offset: usize,
260 buffers: Vec<Buffer>,
261 child_data: Vec<ArrayData>,
262 ) -> Self {
263 ArrayDataBuilder {
264 data_type,
265 len,
266 null_count,
267 null_bit_buffer,
268 nulls: None,
269 offset,
270 buffers,
271 child_data,
272 }
273 .build_unchecked()
274 }
275
276 pub fn try_new(
287 data_type: DataType,
288 len: usize,
289 null_bit_buffer: Option<Buffer>,
290 offset: usize,
291 buffers: Vec<Buffer>,
292 child_data: Vec<ArrayData>,
293 ) -> Result<Self, ArrowError> {
294 if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
298 let needed_len = bit_util::ceil(len + offset, 8);
299 if null_bit_buffer.len() < needed_len {
300 return Err(ArrowError::InvalidArgumentError(format!(
301 "null_bit_buffer size too small. got {} needed {}",
302 null_bit_buffer.len(),
303 needed_len
304 )));
305 }
306 }
307 let new_self = unsafe {
309 Self::new_unchecked(
310 data_type,
311 len,
312 None,
313 null_bit_buffer,
314 offset,
315 buffers,
316 child_data,
317 )
318 };
319
320 new_self.validate_data()?;
325 Ok(new_self)
326 }
327
    /// Returns a new [`ArrayDataBuilder`] for an array of type `data_type`.
    #[inline]
    pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
        ArrayDataBuilder::new(data_type)
    }
333
    /// Returns a reference to the [`DataType`] of this array.
    #[inline]
    pub const fn data_type(&self) -> &DataType {
        &self.data_type
    }
339
340 pub fn buffers(&self) -> &[Buffer] {
342 &self.buffers
343 }
344
345 pub fn child_data(&self) -> &[ArrayData] {
348 &self.child_data[..]
349 }
350
351 #[inline]
353 pub fn is_null(&self, i: usize) -> bool {
354 match &self.nulls {
355 Some(v) => v.is_null(i),
356 None => false,
357 }
358 }
359
    /// Returns a reference to the [`NullBuffer`] of this array, if it has one.
    #[inline]
    pub fn nulls(&self) -> Option<&NullBuffer> {
        self.nulls.as_ref()
    }
367
    /// Returns `true` if the element at index `i` is not null; the exact
    /// negation of [`Self::is_null`].
    #[inline]
    pub fn is_valid(&self, i: usize) -> bool {
        !self.is_null(i)
    }
373
    /// Returns the number of elements in this array.
    #[inline]
    pub const fn len(&self) -> usize {
        self.len
    }
379
380 #[inline]
382 pub const fn is_empty(&self) -> bool {
383 self.len == 0
384 }
385
    /// Returns the element offset of this array into its buffers.
    #[inline]
    pub const fn offset(&self) -> usize {
        self.offset
    }
391
392 #[inline]
394 pub fn null_count(&self) -> usize {
395 self.nulls
396 .as_ref()
397 .map(|x| x.null_count())
398 .unwrap_or_default()
399 }
400
401 pub fn get_buffer_memory_size(&self) -> usize {
413 let mut size = 0;
414 for buffer in &self.buffers {
415 size += buffer.capacity();
416 }
417 if let Some(bitmap) = &self.nulls {
418 size += bitmap.buffer().capacity()
419 }
420 for child in &self.child_data {
421 size += child.get_buffer_memory_size();
422 }
423 size
424 }
425
426 pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
439 let mut result: usize = 0;
440 let layout = layout(&self.data_type);
441
442 for spec in layout.buffers.iter() {
443 match spec {
444 BufferSpec::FixedWidth { byte_width, .. } => {
445 let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
446 ArrowError::ComputeError(
447 "Integer overflow computing buffer size".to_string(),
448 )
449 })?;
450 result += buffer_size;
451 }
452 BufferSpec::VariableWidth => {
453 let buffer_len: usize;
454 match self.data_type {
455 DataType::Utf8 | DataType::Binary => {
456 let offsets = self.typed_offsets::<i32>()?;
457 buffer_len = (offsets[self.len] - offsets[0] ) as usize;
458 }
459 DataType::LargeUtf8 | DataType::LargeBinary => {
460 let offsets = self.typed_offsets::<i64>()?;
461 buffer_len = (offsets[self.len] - offsets[0]) as usize;
462 }
463 _ => {
464 return Err(ArrowError::NotYetImplemented(format!(
465 "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
466 self.data_type
467 )))
468 }
469 };
470 result += buffer_len;
471 }
472 BufferSpec::BitMap => {
473 let buffer_size = bit_util::ceil(self.len, 8);
474 result += buffer_size;
475 }
476 BufferSpec::AlwaysNull => {
477 }
479 }
480 }
481
482 if self.nulls().is_some() {
483 result += bit_util::ceil(self.len, 8);
484 }
485
486 for child in &self.child_data {
487 result += child.get_slice_memory_size()?;
488 }
489 Ok(result)
490 }
491
492 pub fn get_array_memory_size(&self) -> usize {
501 let mut size = mem::size_of_val(self);
502
503 for buffer in &self.buffers {
505 size += mem::size_of::<Buffer>();
506 size += buffer.capacity();
507 }
508 if let Some(nulls) = &self.nulls {
509 size += nulls.buffer().capacity();
510 }
511 for child in &self.child_data {
512 size += child.get_array_memory_size();
513 }
514
515 size
516 }
517
518 pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
526 assert!((offset + length) <= self.len());
527
528 if let DataType::Struct(_) = self.data_type() {
529 let new_offset = self.offset + offset;
531 let new_data = ArrayData {
532 data_type: self.data_type().clone(),
533 len: length,
534 offset: new_offset,
535 buffers: self.buffers.clone(),
536 child_data: self
538 .child_data()
539 .iter()
540 .map(|data| data.slice(offset, length))
541 .collect(),
542 nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
543 };
544
545 new_data
546 } else {
547 let mut new_data = self.clone();
548
549 new_data.len = length;
550 new_data.offset = offset + self.offset;
551 new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
552
553 new_data
554 }
555 }
556
557 pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
563 &self.buffers()[buffer].typed_data()[self.offset..]
564 }
565
    /// Returns a new [`ArrayData`] of type `data_type` with `len` elements,
    /// backed by zeroed buffers.
    ///
    /// For most types an all-null [`NullBuffer`] of length `len` is attached.
    /// `Null`, `Union` and `RunEndEncoded` get no null buffer here (for the
    /// latter two, nullness is carried by the child arrays created below).
    ///
    /// # Panics
    ///
    /// Panics for a union without fields, when `len` does not fit the dense
    /// union offset or run-end integer type, and for any data type that is
    /// neither fixed-width nor listed in the `match` below.
    pub fn new_null(data_type: &DataType, len: usize) -> Self {
        // Number of bytes needed for a `len`-bit bitmap.
        let bit_len = bit_util::ceil(len, 8);
        // Helper producing a zero-filled immutable buffer of `len` bytes.
        let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));

        let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
            // Fixed-width types: a single zeroed values buffer.
            Some(width) => (vec![zeroed(width * len)], vec![], true),
            None => match data_type {
                DataType::Null => (vec![], vec![], false),
                DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
                // `len + 1` zeroed 32-bit offsets plus an empty values buffer.
                DataType::Binary | DataType::Utf8 => {
                    (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
                }
                // One zeroed 16-byte view per element.
                DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
                // `len + 1` zeroed 64-bit offsets plus an empty values buffer.
                DataType::LargeBinary | DataType::LargeUtf8 => {
                    (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
                }
                DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
                // Zeroed offsets over an empty child array.
                DataType::List(f) | DataType::Map(f, _) => (
                    vec![zeroed((len + 1) * 4)],
                    vec![ArrayData::new_empty(f.data_type())],
                    true,
                ),
                DataType::LargeList(f) => (
                    vec![zeroed((len + 1) * 8)],
                    vec![ArrayData::new_empty(f.data_type())],
                    true,
                ),
                // The child holds `list_len` null values per (null) list slot.
                DataType::FixedSizeList(f, list_len) => (
                    vec![],
                    vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
                    true,
                ),
                // Every struct field becomes a null child of the same length.
                DataType::Struct(fields) => (
                    vec![],
                    fields
                        .iter()
                        .map(|f| Self::new_null(f.data_type(), len))
                        .collect(),
                    true,
                ),
                // Zeroed keys over an empty dictionary.
                DataType::Dictionary(k, v) => (
                    vec![zeroed(k.primitive_width().unwrap() * len)],
                    vec![ArrayData::new_empty(v.as_ref())],
                    true,
                ),
                DataType::Union(f, mode) => {
                    // Every slot uses the type id of the first union field.
                    let (id, _) = f.iter().next().unwrap();
                    let ids = Buffer::from_iter(std::iter::repeat(id).take(len));
                    let buffers = match mode {
                        UnionMode::Sparse => vec![ids],
                        UnionMode::Dense => {
                            // Dense unions additionally need offsets 0..len
                            // into the first child.
                            let end_offset = i32::from_usize(len).unwrap();
                            vec![ids, Buffer::from_iter(0_i32..end_offset)]
                        }
                    };

                    // Sparse: every child has `len` nulls. Dense: only the
                    // first child holds the nulls, the others are empty.
                    let children = f
                        .iter()
                        .enumerate()
                        .map(|(idx, (_, f))| {
                            if idx == 0 || *mode == UnionMode::Sparse {
                                Self::new_null(f.data_type(), len)
                            } else {
                                Self::new_empty(f.data_type())
                            }
                        })
                        .collect();

                    (buffers, children, false)
                }
                DataType::RunEndEncoded(r, v) => {
                    // A single run ending at `len`, paired with one null value.
                    let runs = match r.data_type() {
                        DataType::Int16 => {
                            let i = i16::from_usize(len).expect("run overflow");
                            Buffer::from_slice_ref([i])
                        }
                        DataType::Int32 => {
                            let i = i32::from_usize(len).expect("run overflow");
                            Buffer::from_slice_ref([i])
                        }
                        DataType::Int64 => {
                            let i = i64::from_usize(len).expect("run overflow");
                            Buffer::from_slice_ref([i])
                        }
                        dt => unreachable!("Invalid run ends data type {dt}"),
                    };

                    let builder = ArrayData::builder(r.data_type().clone())
                        .len(1)
                        .buffers(vec![runs]);

                    // SAFETY: the run-ends array is built from one value of
                    // the matching integer type, with length 1.
                    let runs = unsafe { builder.build_unchecked() };
                    (
                        vec![],
                        vec![runs, ArrayData::new_null(v.data_type(), 1)],
                        false,
                    )
                }
                // Any other type is expected to have been handled by the
                // `Some(width)` arm above.
                d => unreachable!("{d}"),
            },
        };

        let mut builder = ArrayDataBuilder::new(data_type.clone())
            .len(len)
            .buffers(buffers)
            .child_data(child_data);

        if has_nulls {
            builder = builder.nulls(Some(NullBuffer::new_null(len)))
        }

        // SAFETY: buffers, children and nulls were constructed above to match
        // `data_type` and `len`.
        unsafe { builder.build_unchecked() }
    }
684
    /// Returns a new empty [`ArrayData`] of type `data_type`; equivalent to
    /// [`Self::new_null`] with a length of 0.
    pub fn new_empty(data_type: &DataType) -> Self {
        Self::new_null(data_type, 0)
    }
689
690 pub fn align_buffers(&mut self) {
699 let layout = layout(&self.data_type);
700 for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
701 if let BufferSpec::FixedWidth { alignment, .. } = spec {
702 if buffer.as_ptr().align_offset(*alignment) != 0 {
703 *buffer = Buffer::from_slice_ref(buffer.as_ref());
704 }
705 }
706 }
707 for data in self.child_data.iter_mut() {
709 data.align_buffers()
710 }
711 }
712
    /// Performs the structural validation of this array: buffer count, sizes
    /// and alignment against the layout of the data type, null buffer
    /// consistency, child arrays, and the first/last offsets of
    /// variable-width types.
    ///
    /// The individual values are not inspected here; see
    /// [`Self::validate_data`] for the additional checks.
    ///
    /// # Errors
    ///
    /// Returns [`ArrowError::InvalidArgumentError`] describing the first
    /// violation found.
    pub fn validate(&self) -> Result<(), ArrowError> {
        // Each buffer must cover this many elements.
        let len_plus_offset = self.len + self.offset;

        // Expected buffer layout for this data type.
        let layout = layout(&self.data_type);

        if !layout.can_contain_null_mask && self.nulls.is_some() {
            return Err(ArrowError::InvalidArgumentError(format!(
                "Arrays of type {:?} cannot contain a null bitmask",
                self.data_type,
            )));
        }

        // Variadic layouts may have extra buffers; all others must match the
        // layout exactly.
        if self.buffers.len() < layout.buffers.len()
            || (!layout.variadic && self.buffers.len() != layout.buffers.len())
        {
            return Err(ArrowError::InvalidArgumentError(format!(
                "Expected {} buffers in array of type {:?}, got {}",
                layout.buffers.len(),
                self.data_type,
                self.buffers.len(),
            )));
        }

        for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
            match spec {
                BufferSpec::FixedWidth {
                    byte_width,
                    alignment,
                } => {
                    // Saturate so that an overflowing size can never pass.
                    let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);

                    if buffer.len() < min_buffer_size {
                        return Err(ArrowError::InvalidArgumentError(format!(
                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
                            min_buffer_size, i, self.data_type, buffer.len()
                        )));
                    }

                    let align_offset = buffer.as_ptr().align_offset(*alignment);
                    if align_offset != 0 {
                        return Err(ArrowError::InvalidArgumentError(format!(
                            "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
                            self.data_type, align_offset.min(alignment - align_offset)
                        )));
                    }
                }
                BufferSpec::VariableWidth => {
                    // No size check in this loop; Utf8/Binary offsets are
                    // checked against the values buffer further down.
                }
                BufferSpec::BitMap => {
                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
                    if buffer.len() < min_buffer_size {
                        return Err(ArrowError::InvalidArgumentError(format!(
                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
                            min_buffer_size, i, self.data_type, buffer.len()
                        )));
                    }
                }
                BufferSpec::AlwaysNull => {
                    // Nothing is checked for this buffer kind.
                }
            }
        }

        // Null buffer consistency: count, backing size and logical length.
        if let Some(nulls) = self.nulls() {
            if nulls.null_count() > self.len {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "null_count {} for an array exceeds length of {} elements",
                    nulls.null_count(),
                    self.len
                )));
            }

            let actual_len = nulls.validity().len();
            let needed_len = bit_util::ceil(len_plus_offset, 8);
            if actual_len < needed_len {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
                )));
            }

            if nulls.len() != self.len {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "null buffer incorrect size. got {} expected {}",
                    nulls.len(),
                    self.len
                )));
            }
        }

        self.validate_child_data()?;

        // Additional type-specific checks.
        match &self.data_type {
            DataType::Utf8 | DataType::Binary => {
                self.validate_offsets::<i32>(self.buffers[1].len())?;
            }
            DataType::LargeUtf8 | DataType::LargeBinary => {
                self.validate_offsets::<i64>(self.buffers[1].len())?;
            }
            DataType::Dictionary(key_type, _value_type) => {
                if !DataType::is_dictionary_key_type(key_type) {
                    return Err(ArrowError::InvalidArgumentError(format!(
                        "Dictionary key type must be integer, but was {key_type}"
                    )));
                }
            }
            DataType::RunEndEncoded(run_ends_type, _) => {
                if run_ends_type.is_nullable() {
                    return Err(ArrowError::InvalidArgumentError(
                        "The nullable should be set to false for the field defining run_ends array.".to_string()
                    ));
                }
                if !DataType::is_run_ends_type(run_ends_type.data_type()) {
                    return Err(ArrowError::InvalidArgumentError(format!(
                        "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
                        run_ends_type.data_type()
                    )));
                }
            }
            _ => {}
        };

        Ok(())
    }
855
856 fn typed_offsets<T: ArrowNativeType + num::Num>(&self) -> Result<&[T], ArrowError> {
863 if self.len == 0 && self.buffers[0].is_empty() {
865 return Ok(&[]);
866 }
867
868 self.typed_buffer(0, self.len + 1)
869 }
870
871 fn typed_buffer<T: ArrowNativeType + num::Num>(
873 &self,
874 idx: usize,
875 len: usize,
876 ) -> Result<&[T], ArrowError> {
877 let buffer = &self.buffers[idx];
878
879 let required_len = (len + self.offset) * mem::size_of::<T>();
880
881 if buffer.len() < required_len {
882 return Err(ArrowError::InvalidArgumentError(format!(
883 "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
884 idx,
885 self.data_type,
886 required_len,
887 buffer.len()
888 )));
889 }
890
891 Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
892 }
893
894 fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
897 &self,
898 values_length: usize,
899 ) -> Result<(), ArrowError> {
900 let offsets = self.typed_offsets::<T>()?;
902 if offsets.is_empty() {
903 return Ok(());
904 }
905
906 let first_offset = offsets[0].to_usize().ok_or_else(|| {
907 ArrowError::InvalidArgumentError(format!(
908 "Error converting offset[0] ({}) to usize for {}",
909 offsets[0], self.data_type
910 ))
911 })?;
912
913 let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
914 ArrowError::InvalidArgumentError(format!(
915 "Error converting offset[{}] ({}) to usize for {}",
916 self.len, offsets[self.len], self.data_type
917 ))
918 })?;
919
920 if first_offset > values_length {
921 return Err(ArrowError::InvalidArgumentError(format!(
922 "First offset {} of {} is larger than values length {}",
923 first_offset, self.data_type, values_length,
924 )));
925 }
926
927 if last_offset > values_length {
928 return Err(ArrowError::InvalidArgumentError(format!(
929 "Last offset {} of {} is larger than values length {}",
930 last_offset, self.data_type, values_length,
931 )));
932 }
933
934 if first_offset > last_offset {
935 return Err(ArrowError::InvalidArgumentError(format!(
936 "First offset {} in {} is smaller than last offset {}",
937 first_offset, self.data_type, last_offset,
938 )));
939 }
940
941 Ok(())
942 }
943
944 fn validate_offsets_and_sizes<T: ArrowNativeType + num::Num + std::fmt::Display>(
947 &self,
948 values_length: usize,
949 ) -> Result<(), ArrowError> {
950 let offsets: &[T] = self.typed_buffer(0, self.len)?;
951 let sizes: &[T] = self.typed_buffer(1, self.len)?;
952 for i in 0..values_length {
953 let size = sizes[i].to_usize().ok_or_else(|| {
954 ArrowError::InvalidArgumentError(format!(
955 "Error converting size[{}] ({}) to usize for {}",
956 i, sizes[i], self.data_type
957 ))
958 })?;
959 let offset = offsets[i].to_usize().ok_or_else(|| {
960 ArrowError::InvalidArgumentError(format!(
961 "Error converting offset[{}] ({}) to usize for {}",
962 i, offsets[i], self.data_type
963 ))
964 })?;
965 if size
966 .checked_add(offset)
967 .expect("Offset and size have exceeded the usize boundary")
968 > values_length
969 {
970 return Err(ArrowError::InvalidArgumentError(format!(
971 "Size {} at index {} is larger than the remaining values for {}",
972 size, i, self.data_type
973 )));
974 }
975 }
976 Ok(())
977 }
978
979 fn validate_child_data(&self) -> Result<(), ArrowError> {
981 match &self.data_type {
982 DataType::List(field) | DataType::Map(field, _) => {
983 let values_data = self.get_single_valid_child_data(field.data_type())?;
984 self.validate_offsets::<i32>(values_data.len)?;
985 Ok(())
986 }
987 DataType::LargeList(field) => {
988 let values_data = self.get_single_valid_child_data(field.data_type())?;
989 self.validate_offsets::<i64>(values_data.len)?;
990 Ok(())
991 }
992 DataType::ListView(field) => {
993 let values_data = self.get_single_valid_child_data(field.data_type())?;
994 self.validate_offsets_and_sizes::<i32>(values_data.len)?;
995 Ok(())
996 }
997 DataType::LargeListView(field) => {
998 let values_data = self.get_single_valid_child_data(field.data_type())?;
999 self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1000 Ok(())
1001 }
1002 DataType::FixedSizeList(field, list_size) => {
1003 let values_data = self.get_single_valid_child_data(field.data_type())?;
1004
1005 let list_size: usize = (*list_size).try_into().map_err(|_| {
1006 ArrowError::InvalidArgumentError(format!(
1007 "{} has a negative list_size {}",
1008 self.data_type, list_size
1009 ))
1010 })?;
1011
1012 let expected_values_len = self.len
1013 .checked_mul(list_size)
1014 .expect("integer overflow computing expected number of expected values in FixedListSize");
1015
1016 if values_data.len < expected_values_len {
1017 return Err(ArrowError::InvalidArgumentError(format!(
1018 "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1019 values_data.len, list_size, list_size, self.data_type
1020 )));
1021 }
1022
1023 Ok(())
1024 }
1025 DataType::Struct(fields) => {
1026 self.validate_num_child_data(fields.len())?;
1027 for (i, field) in fields.iter().enumerate() {
1028 let field_data = self.get_valid_child_data(i, field.data_type())?;
1029
1030 if field_data.len < self.len {
1032 return Err(ArrowError::InvalidArgumentError(format!(
1033 "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1034 self.data_type, i, field.name(), field_data.len, self.len
1035 )));
1036 }
1037 }
1038 Ok(())
1039 }
1040 DataType::RunEndEncoded(run_ends_field, values_field) => {
1041 self.validate_num_child_data(2)?;
1042 let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1043 let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1044 if run_ends_data.len != values_data.len {
1045 return Err(ArrowError::InvalidArgumentError(format!(
1046 "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1047 run_ends_data.len, values_data.len
1048 )));
1049 }
1050 if run_ends_data.nulls.is_some() {
1051 return Err(ArrowError::InvalidArgumentError(
1052 "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1053 ));
1054 }
1055 Ok(())
1056 }
1057 DataType::Union(fields, mode) => {
1058 self.validate_num_child_data(fields.len())?;
1059
1060 for (i, (_, field)) in fields.iter().enumerate() {
1061 let field_data = self.get_valid_child_data(i, field.data_type())?;
1062
1063 if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1064 return Err(ArrowError::InvalidArgumentError(format!(
1065 "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1066 i, field_data.len, self.len + self.offset
1067 )));
1068 }
1069 }
1070 Ok(())
1071 }
1072 DataType::Dictionary(_key_type, value_type) => {
1073 self.get_single_valid_child_data(value_type)?;
1074 Ok(())
1075 }
1076 _ => {
1077 if !self.child_data.is_empty() {
1079 return Err(ArrowError::InvalidArgumentError(format!(
1080 "Expected no child arrays for type {} but got {}",
1081 self.data_type,
1082 self.child_data.len()
1083 )));
1084 }
1085 Ok(())
1086 }
1087 }
1088 }
1089
1090 fn get_single_valid_child_data(
1094 &self,
1095 expected_type: &DataType,
1096 ) -> Result<&ArrayData, ArrowError> {
1097 self.validate_num_child_data(1)?;
1098 self.get_valid_child_data(0, expected_type)
1099 }
1100
1101 fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1103 if self.child_data.len() != expected_len {
1104 Err(ArrowError::InvalidArgumentError(format!(
1105 "Value data for {} should contain {} child data array(s), had {}",
1106 self.data_type,
1107 expected_len,
1108 self.child_data.len()
1109 )))
1110 } else {
1111 Ok(())
1112 }
1113 }
1114
1115 fn get_valid_child_data(
1118 &self,
1119 i: usize,
1120 expected_type: &DataType,
1121 ) -> Result<&ArrayData, ArrowError> {
1122 let values_data = self.child_data.get(i).ok_or_else(|| {
1123 ArrowError::InvalidArgumentError(format!(
1124 "{} did not have enough child arrays. Expected at least {} but had only {}",
1125 self.data_type,
1126 i + 1,
1127 self.child_data.len()
1128 ))
1129 })?;
1130
1131 if expected_type != &values_data.data_type {
1132 return Err(ArrowError::InvalidArgumentError(format!(
1133 "Child type mismatch for {}. Expected {} but child data had {}",
1134 self.data_type, expected_type, values_data.data_type
1135 )));
1136 }
1137
1138 values_data.validate()?;
1139 Ok(values_data)
1140 }
1141
1142 pub fn validate_data(&self) -> Result<(), ArrowError> {
1158 self.validate()?;
1159
1160 self.validate_nulls()?;
1161 self.validate_values()?;
1162 Ok(())
1163 }
1164
1165 pub fn validate_full(&self) -> Result<(), ArrowError> {
1170 self.validate_data()?;
1171 self.child_data
1173 .iter()
1174 .enumerate()
1175 .try_for_each(|(i, child_data)| {
1176 child_data.validate_full().map_err(|e| {
1177 ArrowError::InvalidArgumentError(format!(
1178 "{} child #{} invalid: {}",
1179 self.data_type, i, e
1180 ))
1181 })
1182 })?;
1183 Ok(())
1184 }
1185
1186 pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1196 if let Some(nulls) = &self.nulls {
1197 let actual = nulls.len() - nulls.inner().count_set_bits();
1198 if actual != nulls.null_count() {
1199 return Err(ArrowError::InvalidArgumentError(format!(
1200 "null_count value ({}) doesn't match actual number of nulls in array ({})",
1201 nulls.null_count(),
1202 actual
1203 )));
1204 }
1205 }
1206
1207 match &self.data_type {
1212 DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1213 if !f.is_nullable() {
1214 self.validate_non_nullable(None, &self.child_data[0])?
1215 }
1216 }
1217 DataType::FixedSizeList(field, len) => {
1218 let child = &self.child_data[0];
1219 if !field.is_nullable() {
1220 match &self.nulls {
1221 Some(nulls) => {
1222 let element_len = *len as usize;
1223 let expanded = nulls.expand(element_len);
1224 self.validate_non_nullable(Some(&expanded), child)?;
1225 }
1226 None => self.validate_non_nullable(None, child)?,
1227 }
1228 }
1229 }
1230 DataType::Struct(fields) => {
1231 for (field, child) in fields.iter().zip(&self.child_data) {
1232 if !field.is_nullable() {
1233 self.validate_non_nullable(self.nulls(), child)?
1234 }
1235 }
1236 }
1237 _ => {}
1238 }
1239
1240 Ok(())
1241 }
1242
1243 fn validate_non_nullable(
1245 &self,
1246 mask: Option<&NullBuffer>,
1247 child: &ArrayData,
1248 ) -> Result<(), ArrowError> {
1249 let mask = match mask {
1250 Some(mask) => mask,
1251 None => {
1252 return match child.null_count() {
1253 0 => Ok(()),
1254 _ => Err(ArrowError::InvalidArgumentError(format!(
1255 "non-nullable child of type {} contains nulls not present in parent {}",
1256 child.data_type, self.data_type
1257 ))),
1258 }
1259 }
1260 };
1261
1262 match child.nulls() {
1263 Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1264 "non-nullable child of type {} contains nulls not present in parent",
1265 child.data_type
1266 ))),
1267 _ => Ok(()),
1268 }
1269 }
1270
    /// Validates the values stored in this array, dispatching on its data
    /// type:
    ///
    /// * `Utf8` / `LargeUtf8`: offsets and UTF-8 content
    /// * `Binary` / `LargeBinary`: every offset against the values buffer
    /// * `BinaryView` / `Utf8View`: the views against the remaining buffers
    /// * `List` / `Map` / `LargeList`: every offset against the child length
    /// * `Dictionary`: keys against the dictionary length
    /// * `RunEndEncoded`: the run-ends child
    ///
    /// All other types, including `Union`, are accepted without inspection.
    ///
    /// # Panics
    ///
    /// Indexes `buffers` and `child_data` directly, so it panics when the
    /// buffers or children required by the type are missing.
    pub fn validate_values(&self) -> Result<(), ArrowError> {
        match &self.data_type {
            DataType::Utf8 => self.validate_utf8::<i32>(),
            DataType::LargeUtf8 => self.validate_utf8::<i64>(),
            DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
            DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
            DataType::BinaryView => {
                // Buffer 0 holds the views; the rest are the data buffers.
                let views = self.typed_buffer::<u128>(0, self.len)?;
                validate_binary_view(views, &self.buffers[1..])
            }
            DataType::Utf8View => {
                let views = self.typed_buffer::<u128>(0, self.len)?;
                validate_string_view(views, &self.buffers[1..])
            }
            DataType::List(_) | DataType::Map(_, _) => {
                let child = &self.child_data[0];
                self.validate_offsets_full::<i32>(child.len)
            }
            DataType::LargeList(_) => {
                let child = &self.child_data[0];
                self.validate_offsets_full::<i64>(child.len)
            }
            DataType::Union(_, _) => {
                // Union values are not validated here.
                Ok(())
            }
            DataType::Dictionary(key_type, _value_type) => {
                // Keys must lie in 0..dictionary_length.
                let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
                let max_value = dictionary_length - 1;
                match key_type.as_ref() {
                    DataType::UInt8 => self.check_bounds::<u8>(max_value),
                    DataType::UInt16 => self.check_bounds::<u16>(max_value),
                    DataType::UInt32 => self.check_bounds::<u32>(max_value),
                    DataType::UInt64 => self.check_bounds::<u64>(max_value),
                    DataType::Int8 => self.check_bounds::<i8>(max_value),
                    DataType::Int16 => self.check_bounds::<i16>(max_value),
                    DataType::Int32 => self.check_bounds::<i32>(max_value),
                    DataType::Int64 => self.check_bounds::<i64>(max_value),
                    _ => unreachable!(),
                }
            }
            DataType::RunEndEncoded(run_ends, _values) => {
                let run_ends_data = self.child_data()[0].clone();
                match run_ends.data_type() {
                    DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
                    DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
                    DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
                    _ => unreachable!(),
                }
            }
            _ => {
                // No value-level validation for the remaining types.
                Ok(())
            }
        }
    }
1336
    /// Walks all offsets of this array and calls `validate(i, start..end)`
    /// for every element `i`, where `start..end` is the range spanned by
    /// offsets `i` and `i + 1`.
    ///
    /// Before `validate` is called, each offset must convert to `usize`, must
    /// not exceed `offset_limit`, and must not be smaller than the offset
    /// before it. The first failure is returned as an
    /// [`ArrowError::InvalidArgumentError`].
    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
    where
        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
        V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
    {
        self.typed_offsets::<T>()?
            .iter()
            .enumerate()
            .map(|(i, x)| {
                // Step 1: convert the raw offset to usize.
                let r = x.to_usize().ok_or_else(|| {
                    ArrowError::InvalidArgumentError(format!(
                        "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
                    );
                // Step 2: check it against the limit.
                match r {
                    Ok(n) if n <= offset_limit => Ok((i, n)),
                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
                        "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
                    ),
                    Err(e) => Err(e),
                }
            })
            // Step 3: pair each offset with its predecessor (the scan state,
            // initially 0) and reject decreasing offsets.
            .scan(0_usize, |start, end| {
                match end {
                    Ok((i, end)) if *start <= end => {
                        let range = Some(Ok((i, *start..end)));
                        *start = end;
                        range
                    }
                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
                        i - 1, start, end))
                    )),
                    Err(err) => Some(Err(err)),
                }
            })
            // The entry produced for the very first offset has no preceding
            // offset and describes no element, so it is dropped.
            .skip(1)
            .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
                let (item_index, range) = res?;
                // Offset position `k` closes element `k - 1`.
                validate(item_index-1, range)
            })
    }
1391
1392 fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1395 where
1396 T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1397 {
1398 let values_buffer = &self.buffers[1].as_slice();
1399 if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1400 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1402 if !values_str.is_char_boundary(range.start)
1403 || !values_str.is_char_boundary(range.end)
1404 {
1405 return Err(ArrowError::InvalidArgumentError(format!(
1406 "incomplete utf-8 byte sequence from index {string_index}"
1407 )));
1408 }
1409 Ok(())
1410 })
1411 } else {
1412 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1414 std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1415 ArrowError::InvalidArgumentError(format!(
1416 "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1417 ))
1418 })?;
1419 Ok(())
1420 })
1421 }
1422 }
1423
1424 fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1427 where
1428 T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1429 {
1430 self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1431 Ok(())
1434 })
1435 }
1436
1437 fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1440 where
1441 T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1442 {
1443 let required_len = self.len + self.offset;
1444 let buffer = &self.buffers[0];
1445
1446 assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1449
1450 let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1452
1453 indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1454 if self.is_null(i) {
1456 return Ok(());
1457 }
1458 let dict_index: i64 = dict_index.try_into().map_err(|_| {
1459 ArrowError::InvalidArgumentError(format!(
1460 "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1461 ))
1462 })?;
1463
1464 if dict_index < 0 || dict_index > max_value {
1465 return Err(ArrowError::InvalidArgumentError(format!(
1466 "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1467 )));
1468 }
1469 Ok(())
1470 })
1471 }
1472
1473 fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1475 where
1476 T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1477 {
1478 let values = self.typed_buffer::<T>(0, self.len)?;
1479 let mut prev_value: i64 = 0_i64;
1480 values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1481 let value: i64 = inp_value.try_into().map_err(|_| {
1482 ArrowError::InvalidArgumentError(format!(
1483 "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1484 ))
1485 })?;
1486 if value <= 0_i64 {
1487 return Err(ArrowError::InvalidArgumentError(format!(
1488 "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1489 )));
1490 }
1491 if ix > 0 && value <= prev_value {
1492 return Err(ArrowError::InvalidArgumentError(format!(
1493 "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1494 )));
1495 }
1496
1497 prev_value = value;
1498 Ok(())
1499 })?;
1500
1501 if prev_value.as_usize() < (self.offset + self.len) {
1502 return Err(ArrowError::InvalidArgumentError(format!(
1503 "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1504 self.offset + self.len
1505 )));
1506 }
1507 Ok(())
1508 }
1509
1510 pub fn ptr_eq(&self, other: &Self) -> bool {
1514 if self.offset != other.offset
1515 || self.len != other.len
1516 || self.data_type != other.data_type
1517 || self.buffers.len() != other.buffers.len()
1518 || self.child_data.len() != other.child_data.len()
1519 {
1520 return false;
1521 }
1522
1523 match (&self.nulls, &other.nulls) {
1524 (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1525 (Some(_), None) | (None, Some(_)) => return false,
1526 _ => {}
1527 };
1528
1529 if !self
1530 .buffers
1531 .iter()
1532 .zip(other.buffers.iter())
1533 .all(|(a, b)| a.as_ptr() == b.as_ptr())
1534 {
1535 return false;
1536 }
1537
1538 self.child_data
1539 .iter()
1540 .zip(other.child_data.iter())
1541 .all(|(a, b)| a.ptr_eq(b))
1542 }
1543
1544 pub fn into_builder(self) -> ArrayDataBuilder {
1546 self.into()
1547 }
1548}
1549
1550pub fn layout(data_type: &DataType) -> DataTypeLayout {
1553 use arrow_schema::IntervalUnit::*;
1556
1557 match data_type {
1558 DataType::Null => DataTypeLayout {
1559 buffers: vec![],
1560 can_contain_null_mask: false,
1561 variadic: false,
1562 },
1563 DataType::Boolean => DataTypeLayout {
1564 buffers: vec![BufferSpec::BitMap],
1565 can_contain_null_mask: true,
1566 variadic: false,
1567 },
1568 DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1569 DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1570 DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1571 DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1572 DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1573 DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1574 DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1575 DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1576 DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1577 DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1578 DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1579 DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1580 DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1581 DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1582 DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1583 DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1584 DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1585 DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1586 DataType::Interval(MonthDayNano) => {
1587 DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1588 }
1589 DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1590 DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1591 DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1592 DataType::FixedSizeBinary(size) => {
1593 let spec = BufferSpec::FixedWidth {
1594 byte_width: (*size).try_into().unwrap(),
1595 alignment: mem::align_of::<u8>(),
1596 };
1597 DataTypeLayout {
1598 buffers: vec![spec],
1599 can_contain_null_mask: true,
1600 variadic: false,
1601 }
1602 }
1603 DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1604 DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1605 DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1606 DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1607 DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1608 DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1610 DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1611 DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1612 DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1613 DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1614 DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), DataType::Union(_, mode) => {
1617 let type_ids = BufferSpec::FixedWidth {
1618 byte_width: mem::size_of::<i8>(),
1619 alignment: mem::align_of::<i8>(),
1620 };
1621
1622 DataTypeLayout {
1623 buffers: match mode {
1624 UnionMode::Sparse => {
1625 vec![type_ids]
1626 }
1627 UnionMode::Dense => {
1628 vec![
1629 type_ids,
1630 BufferSpec::FixedWidth {
1631 byte_width: mem::size_of::<i32>(),
1632 alignment: mem::align_of::<i32>(),
1633 },
1634 ]
1635 }
1636 },
1637 can_contain_null_mask: false,
1638 variadic: false,
1639 }
1640 }
1641 DataType::Dictionary(key_type, _value_type) => layout(key_type),
1642 }
1643}
1644
/// Layout specification for a data type: the buffers an array of that type
/// carries and whether it may have a null mask.
#[derive(Debug, PartialEq, Eq)]
pub struct DataTypeLayout {
    /// Specification of each buffer, in buffer order.
    pub buffers: Vec<BufferSpec>,

    /// Whether arrays of this type may carry a null mask.
    pub can_contain_null_mask: bool,

    /// Whether arrays of this type may carry more buffers than the ones listed
    /// in `buffers`. Only the view layout built by [`DataTypeLayout::new_view`]
    /// sets this; every other layout has exactly `buffers.len()` buffers.
    pub variadic: bool,
}

impl DataTypeLayout {
    /// Layout with a single fixed-width buffer whose element size and alignment
    /// are those of `T`; a null mask is allowed.
    pub fn new_fixed_width<T>() -> Self {
        Self {
            buffers: vec![BufferSpec::FixedWidth {
                byte_width: mem::size_of::<T>(),
                alignment: mem::align_of::<T>(),
            }],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Layout with no buffers that may still carry a null mask.
    pub fn new_nullable_empty() -> Self {
        Self {
            buffers: vec![],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Layout with no buffers and no null mask.
    pub fn new_empty() -> Self {
        Self {
            buffers: vec![],
            can_contain_null_mask: false,
            variadic: false,
        }
    }

    /// Layout with a fixed-width buffer of `T` followed by a variable-width
    /// buffer; a null mask is allowed.
    pub fn new_binary<T>() -> Self {
        Self {
            buffers: vec![
                // Fixed-width entries of type `T`.
                BufferSpec::FixedWidth {
                    byte_width: mem::size_of::<T>(),
                    alignment: mem::align_of::<T>(),
                },
                // Variable-width bytes.
                BufferSpec::VariableWidth,
            ],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Layout of the view types: one buffer of 16-byte (`u128`) entries, plus
    /// any number of additional buffers (`variadic` is `true`).
    pub fn new_view() -> Self {
        Self {
            buffers: vec![BufferSpec::FixedWidth {
                byte_width: mem::size_of::<u128>(),
                alignment: mem::align_of::<u128>(),
            }],
            can_contain_null_mask: true,
            variadic: true,
        }
    }

    /// Layout of the list-view types: exactly two fixed-width buffers of `T`;
    /// a null mask is allowed.
    ///
    /// The layout is not variadic: a list-view array has no buffers beyond
    /// these two, so extra buffers must not be accepted.
    pub fn new_list_view<T>() -> Self {
        let fixed_width_of_t = || BufferSpec::FixedWidth {
            byte_width: mem::size_of::<T>(),
            alignment: mem::align_of::<T>(),
        };
        Self {
            buffers: vec![fixed_width_of_t(), fixed_width_of_t()],
            can_contain_null_mask: true,
            variadic: false,
        }
    }
}

/// Specification of a single buffer within a [`DataTypeLayout`].
#[derive(Debug, PartialEq, Eq)]
pub enum BufferSpec {
    /// Buffer of fixed-width elements.
    FixedWidth {
        /// Size of one element in bytes.
        byte_width: usize,
        /// Required alignment of the buffer in bytes.
        alignment: usize,
    },
    /// Buffer of variable-width data.
    VariableWidth,
    /// Bit-packed buffer (used for the `Boolean` layout).
    BitMap,
    /// Not constructed anywhere in this part of the file, hence the
    /// `dead_code` allowance.
    #[allow(dead_code)]
    AlwaysNull,
}
1776
1777impl PartialEq for ArrayData {
1778 fn eq(&self, other: &Self) -> bool {
1779 equal::equal(self, other)
1780 }
1781}
1782
/// Builder for [`ArrayData`]: collects the individual parts of an array and
/// assembles them with `build`, `build_aligned` or `build_unchecked`.
#[derive(Debug)]
pub struct ArrayDataBuilder {
    /// Data type of the array being built.
    data_type: DataType,
    /// Number of elements in the array.
    len: usize,
    /// Optional null count supplied by the caller; when present it is used
    /// as-is for a null mask created from `null_bit_buffer`.
    null_count: Option<usize>,
    /// Raw validity bitmap; only consulted when `nulls` is `None`.
    null_bit_buffer: Option<Buffer>,
    /// Ready-made null buffer; takes precedence over `null_bit_buffer`.
    nulls: Option<NullBuffer>,
    /// Offset of the array, in elements.
    offset: usize,
    /// Data buffers of the array.
    buffers: Vec<Buffer>,
    /// Child arrays of the array.
    child_data: Vec<ArrayData>,
}
1795
1796impl ArrayDataBuilder {
1797 #[inline]
1798 pub const fn new(data_type: DataType) -> Self {
1800 Self {
1801 data_type,
1802 len: 0,
1803 null_count: None,
1804 null_bit_buffer: None,
1805 nulls: None,
1806 offset: 0,
1807 buffers: vec![],
1808 child_data: vec![],
1809 }
1810 }
1811
1812 pub fn data_type(self, data_type: DataType) -> Self {
1814 Self { data_type, ..self }
1815 }
1816
1817 #[inline]
1818 #[allow(clippy::len_without_is_empty)]
1819 pub const fn len(mut self, n: usize) -> Self {
1821 self.len = n;
1822 self
1823 }
1824
1825 pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
1827 self.nulls = nulls;
1828 self.null_count = None;
1829 self.null_bit_buffer = None;
1830 self
1831 }
1832
1833 pub fn null_count(mut self, null_count: usize) -> Self {
1835 self.null_count = Some(null_count);
1836 self
1837 }
1838
1839 pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
1841 self.nulls = None;
1842 self.null_bit_buffer = buf;
1843 self
1844 }
1845
1846 #[inline]
1848 pub const fn offset(mut self, n: usize) -> Self {
1849 self.offset = n;
1850 self
1851 }
1852
1853 pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
1855 self.buffers = v;
1856 self
1857 }
1858
1859 pub fn add_buffer(mut self, b: Buffer) -> Self {
1861 self.buffers.push(b);
1862 self
1863 }
1864
1865 pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
1867 self.buffers.extend(bs);
1868 self
1869 }
1870
1871 pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
1873 self.child_data = v;
1874 self
1875 }
1876
1877 pub fn add_child_data(mut self, r: ArrayData) -> Self {
1879 self.child_data.push(r);
1880 self
1881 }
1882
1883 #[allow(clippy::let_and_return)]
1890 pub unsafe fn build_unchecked(self) -> ArrayData {
1891 let data = self.build_impl();
1892 #[cfg(feature = "force_validate")]
1894 data.validate_data().unwrap();
1895 data
1896 }
1897
1898 unsafe fn build_impl(self) -> ArrayData {
1900 let nulls = self
1901 .nulls
1902 .or_else(|| {
1903 let buffer = self.null_bit_buffer?;
1904 let buffer = BooleanBuffer::new(buffer, self.offset, self.len);
1905 Some(match self.null_count {
1906 Some(n) => NullBuffer::new_unchecked(buffer, n),
1907 None => NullBuffer::new(buffer),
1908 })
1909 })
1910 .filter(|b| b.null_count() != 0);
1911
1912 ArrayData {
1913 data_type: self.data_type,
1914 len: self.len,
1915 offset: self.offset,
1916 buffers: self.buffers,
1917 child_data: self.child_data,
1918 nulls,
1919 }
1920 }
1921
1922 pub fn build(self) -> Result<ArrayData, ArrowError> {
1924 let data = unsafe { self.build_impl() };
1925 data.validate_data()?;
1926 Ok(data)
1927 }
1928
1929 pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
1945 let mut data = unsafe { self.build_impl() };
1946 data.align_buffers();
1947 data.validate_data()?;
1948 Ok(data)
1949 }
1950}
1951
1952impl From<ArrayData> for ArrayDataBuilder {
1953 fn from(d: ArrayData) -> Self {
1954 Self {
1955 data_type: d.data_type,
1956 len: d.len,
1957 offset: d.offset,
1958 buffers: d.buffers,
1959 child_data: d.child_data,
1960 nulls: d.nulls,
1961 null_bit_buffer: None,
1962 null_count: None,
1963 }
1964 }
1965}
1966
#[cfg(test)]
mod tests {
    // Unit tests for the builder, null counting, slicing, pointer equality,
    // buffer alignment and all-null view arrays.
    use super::*;
    use arrow_schema::{Field, Fields};

    // Returns a buffer holding `n` i32 values, each equal to 42.
    fn make_i32_buffer(n: usize) -> Buffer {
        Buffer::from_slice_ref(vec![42i32; n])
    }

    // Returns a buffer holding `n` f32 values, each equal to 42.0.
    fn make_f32_buffer(n: usize) -> Buffer {
        Buffer::from_slice_ref(vec![42f32; n])
    }

    // The builder keeps len/offset/buffers and derives the null count from the
    // validity bitmap restricted to the offset..offset + len window.
    #[test]
    fn test_builder() {
        // 25 values so that offset 5 + len 20 fits in the buffer.
        let v = (0..25).collect::<Vec<i32>>();
        let b1 = Buffer::from_slice_ref(&v);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(20)
            .offset(5)
            .add_buffer(b1)
            .null_bit_buffer(Some(Buffer::from([
                0b01011111, 0b10110101, 0b01100011, 0b00011110,
            ])))
            .build()
            .unwrap();

        assert_eq!(20, arr_data.len());
        // Bits 5..25 of the bitmap contain 10 set bits, leaving 10 nulls.
        assert_eq!(10, arr_data.null_count());
        assert_eq!(5, arr_data.offset());
        assert_eq!(1, arr_data.buffers().len());
        assert_eq!(
            Buffer::from_slice_ref(&v).as_slice(),
            arr_data.buffers()[0].as_slice()
        );
    }

    // Child data added through the builder is stored unchanged.
    #[test]
    fn test_builder_with_child_data() {
        let child_arr_data = ArrayData::try_new(
            DataType::Int32,
            5,
            None,
            0,
            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
            vec![],
        )
        .unwrap();

        let field = Arc::new(Field::new("x", DataType::Int32, true));
        let data_type = DataType::Struct(vec![field].into());

        let arr_data = ArrayData::builder(data_type)
            .len(5)
            .offset(0)
            .add_child_data(child_arr_data.clone())
            .build()
            .unwrap();

        assert_eq!(5, arr_data.len());
        assert_eq!(1, arr_data.child_data().len());
        assert_eq!(child_arr_data, arr_data.child_data()[0]);
    }

    // The null count is computed from the bitmap, with and without an offset.
    #[test]
    fn test_null_count() {
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        // 3 of 16 bits are set, so 13 slots are null.
        assert_eq!(13, arr_data.null_count());

        // Same bitmap, but only bits 2..14 are in view (offset 2, len 12).
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(12)
            .offset(2)
            // offset 2 + len 12 requires 14 values.
            .add_buffer(make_i32_buffer(14))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        // Bits 3 and 10 fall inside the window, leaving 10 nulls.
        assert_eq!(10, arr_data.null_count());
    }

    // The null buffer exposes the validity bytes that were passed in.
    #[test]
    fn test_null_buffer_ref() {
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        assert!(arr_data.nulls().is_some());
        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
    }

    // Slicing adjusts len, offset and the null count; slicing a slice composes.
    #[test]
    fn test_slice() {
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        // Dropping slot 0 (valid) keeps the null count unchanged.
        let new_data = data.slice(1, 15);
        assert_eq!(data.len() - 1, new_data.len());
        assert_eq!(1, new_data.offset());
        assert_eq!(data.null_count(), new_data.null_count());

        // Slice of a slice: this additionally drops slot 1, which is null.
        let new_data = new_data.slice(1, 14);
        assert_eq!(data.len() - 2, new_data.len());
        assert_eq!(2, new_data.offset());
        assert_eq!(data.null_count() - 1, new_data.null_count());
    }

    // `==` compares values while `ptr_eq` compares identity: clones are
    // pointer-equal, slices and arrays of other types are not.
    #[test]
    fn test_equality() {
        let int_data = ArrayData::builder(DataType::Int32)
            .len(1)
            .add_buffer(make_i32_buffer(1))
            .build()
            .unwrap();

        let float_data = ArrayData::builder(DataType::Float32)
            .len(1)
            .add_buffer(make_f32_buffer(1))
            .build()
            .unwrap();
        assert_ne!(int_data, float_data);
        assert!(!int_data.ptr_eq(&float_data));
        assert!(int_data.ptr_eq(&int_data));

        // The clone is deliberate: it is compared with the original by pointer.
        #[allow(clippy::redundant_clone)]
        let int_data_clone = int_data.clone();
        assert_eq!(int_data, int_data_clone);
        assert!(int_data.ptr_eq(&int_data_clone));
        assert!(int_data_clone.ptr_eq(&int_data));

        // An empty slice at offset 1 differs in offset and length.
        let int_data_slice = int_data_clone.slice(1, 0);
        assert!(int_data_slice.ptr_eq(&int_data_slice));
        assert!(!int_data.ptr_eq(&int_data_slice));
        assert!(!int_data_slice.ptr_eq(&int_data));

        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
        let string_data = ArrayData::try_new(
            DataType::Utf8,
            3,
            Some(Buffer::from_iter(vec![true, false, true])),
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();

        assert_ne!(float_data, string_data);
        assert!(!float_data.ptr_eq(&string_data));

        assert!(string_data.ptr_eq(&string_data));

        // The clone is deliberate: it is compared with the original by pointer.
        #[allow(clippy::redundant_clone)]
        let string_data_cloned = string_data.clone();
        assert!(string_data_cloned.ptr_eq(&string_data));
        assert!(string_data.ptr_eq(&string_data_cloned));

        let string_data_slice = string_data.slice(1, 2);
        assert!(string_data_slice.ptr_eq(&string_data_slice));
        assert!(!string_data_slice.ptr_eq(&string_data))
    }

    // `get_slice_memory_size` shrinks by the bytes that slicing removes.
    #[test]
    fn test_slice_memory_size() {
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        // 16 -> 14 elements: two i32 values (8 bytes) fewer.
        let new_data = data.slice(1, 14);
        assert_eq!(
            data.get_slice_memory_size().unwrap() - 8,
            new_data.get_slice_memory_size().unwrap()
        );
        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
        let string_data = ArrayData::try_new(
            DataType::Utf8,
            3,
            Some(Buffer::from_iter(vec![true, false, true])),
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
        let string_data_slice = string_data.slice(1, 2);
        // Dropping the first string removes one i32 offset (4 bytes) and its
        // 2 bytes of data.
        assert_eq!(
            string_data.get_slice_memory_size().unwrap() - 6,
            string_data_slice.get_slice_memory_size().unwrap()
        );
    }

    // `count_nulls` counts unset bits inside the requested window.
    #[test]
    fn test_count_nulls() {
        let buffer = Buffer::from([0b00010110, 0b10011111]);
        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
        // 9 of the 16 bits are set.
        let count = count_nulls(Some(&buffer), 0, 16);
        assert_eq!(count, 7);

        // Bits 4..12 contain 5 set bits.
        let count = count_nulls(Some(&buffer), 4, 8);
        assert_eq!(count, 3);
    }

    // `contains_nulls` only looks at the requested window; an empty window
    // never contains nulls.
    #[test]
    fn test_contains_nulls() {
        let buffer: Buffer =
            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
        assert!(contains_nulls(Some(&buffer), 0, 6));
        assert!(contains_nulls(Some(&buffer), 0, 3));
        assert!(!contains_nulls(Some(&buffer), 3, 2));
        assert!(!contains_nulls(Some(&buffer), 0, 0));
    }

    // A misaligned buffer is rejected by `validate` and accepted again after
    // `align_buffers`.
    #[test]
    fn test_alignment() {
        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
        // Slicing by one byte leaves the i32 data misaligned.
        let sliced = buffer.slice(1);

        let mut data = ArrayData {
            data_type: DataType::Int32,
            len: 0,
            offset: 0,
            buffers: vec![buffer],
            child_data: vec![],
            nulls: None,
        };
        data.validate_full().unwrap();

        // Swap in the misaligned buffer; validation must now fail.
        data.buffers[0] = sliced;
        let err = data.validate().unwrap_err();

        assert_eq!(
            err.to_string(),
            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
        );

        data.align_buffers();
        data.validate_full().unwrap();
    }

    // Same as `test_alignment`, but the misaligned buffer lives in the child of
    // a struct array.
    #[test]
    fn test_alignment_struct() {
        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
        // Slicing by one byte leaves the i32 data misaligned.
        let sliced = buffer.slice(1);

        let child_data = ArrayData {
            data_type: DataType::Int32,
            len: 0,
            offset: 0,
            buffers: vec![buffer],
            child_data: vec![],
            nulls: None,
        };

        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
        let mut data = ArrayData {
            data_type: schema,
            len: 0,
            offset: 0,
            buffers: vec![],
            child_data: vec![child_data],
            nulls: None,
        };
        data.validate_full().unwrap();

        // Swap in the misaligned buffer on the child; validation must now fail.
        data.child_data[0].buffers[0] = sliced;
        let err = data.validate().unwrap_err();

        assert_eq!(
            err.to_string(),
            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
        );

        data.align_buffers();
        data.validate_full().unwrap();
    }

    // `new_null` for the view types yields arrays in which every slot is null.
    #[test]
    fn test_null_view_types() {
        let array_len = 32;
        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
        assert_eq!(array.len(), array_len);
        for i in 0..array.len() {
            assert!(array.is_null(i));
        }

        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
        assert_eq!(array.len(), array_len);
        for i in 0..array.len() {
            assert!(array.is_null(i));
        }
    }
}