arrow_array/array/byte_view_array.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::array::print_long_array;
19use crate::builder::{ArrayBuilder, GenericByteViewBuilder};
20use crate::iterator::ArrayIter;
21use crate::types::bytes::ByteArrayNativeType;
22use crate::types::{BinaryViewType, ByteViewType, StringViewType};
23use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar};
24use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
25use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
26use arrow_schema::{ArrowError, DataType};
27use core::str;
28use num::ToPrimitive;
29use std::any::Any;
30use std::fmt::Debug;
31use std::marker::PhantomData;
32use std::sync::Arc;
33
34use super::ByteArrayType;
35
36/// [Variable-size Binary View Layout]: An array of variable length bytes views.
37///
38/// This array type is used to store variable length byte data (e.g. Strings, Binary)
39/// and has efficient operations such as `take`, `filter`, and comparison.
40///
41/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
42///
43/// This is different from [`GenericByteArray`], which also stores variable
44/// length byte data, as it represents strings with an offset and length. `take`
45/// and `filter` like operations are implemented by manipulating the "views"
46/// (`u128`) without modifying the bytes. Each view also stores an inlined
47/// prefix which speed up comparisons.
48///
49/// # See Also
50///
51/// * [`StringViewArray`] for storing utf8 encoded string data
52/// * [`BinaryViewArray`] for storing bytes
53/// * [`ByteView`] to interpret `u128`s layout of the views.
54///
55/// [`ByteView`]: arrow_data::ByteView
56///
57/// # Layout: "views" and buffers
58///
59/// A `GenericByteViewArray` stores variable length byte strings. An array of
60/// `N` elements is stored as `N` fixed length "views" and a variable number
61/// of variable length "buffers".
62///
63/// Each view is a `u128` value whose layout is different depending on the
64/// length of the string stored at that location:
65///
66/// ```text
67/// ┌──────┬────────────────────────┐
68/// │length│ string value │
69/// Strings (len <= 12) │ │ (padded with 0) │
70/// └──────┴────────────────────────┘
71/// 0 31 127
72///
73/// ┌───────┬───────┬───────┬───────┐
74/// │length │prefix │ buf │offset │
75/// Strings (len > 12) │ │ │ index │ │
76/// └───────┴───────┴───────┴───────┘
77/// 0 31 63 95 127
78/// ```
79///
80/// * Strings with length <= 12 are stored directly in the view. See
81/// [`Self::inline_value`] to access the inlined prefix from a short view.
82///
83/// * Strings with length > 12: The first four bytes are stored inline in the
84/// view and the entire string is stored in one of the buffers. See [`ByteView`]
85/// to access the fields of the these views.
86///
87/// As with other arrays, the optimized kernels in [`arrow_compute`] are likely
88/// the easiest and fastest way to work with this data. However, it is possible
89/// to access the views and buffers directly for more control.
90///
91/// For example
92///
93/// ```rust
94/// # use arrow_array::StringViewArray;
95/// # use arrow_array::Array;
96/// use arrow_data::ByteView;
97/// let array = StringViewArray::from(vec![
98/// "hello",
99/// "this string is longer than 12 bytes",
100/// "this string is also longer than 12 bytes"
101/// ]);
102///
103/// // ** Examine the first view (short string) **
104/// assert!(array.is_valid(0)); // Check for nulls
105/// let short_view: u128 = array.views()[0]; // "hello"
106/// // get length of the string
107/// let len = short_view as u32;
108/// assert_eq!(len, 5); // strings less than 12 bytes are stored in the view
109/// // SAFETY: `view` is a valid view
110/// let value = unsafe {
111/// StringViewArray::inline_value(&short_view, len as usize)
112/// };
113/// assert_eq!(value, b"hello");
114///
115/// // ** Examine the third view (long string) **
116/// assert!(array.is_valid(12)); // Check for nulls
117/// let long_view: u128 = array.views()[2]; // "this string is also longer than 12 bytes"
118/// let len = long_view as u32;
119/// assert_eq!(len, 40); // strings longer than 12 bytes are stored in the buffer
120/// let view = ByteView::from(long_view); // use ByteView to access the fields
121/// assert_eq!(view.length, 40);
122/// assert_eq!(view.buffer_index, 0);
123/// assert_eq!(view.offset, 35); // data starts after the first long string
124/// // Views for long strings store a 4 byte prefix
125/// let prefix = view.prefix.to_le_bytes();
126/// assert_eq!(&prefix, b"this");
127/// let value = array.value(2); // get the string value (see `value` implementation for how to access the bytes directly)
128/// assert_eq!(value, "this string is also longer than 12 bytes");
129/// ```
130///
131/// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
132///
133/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
134/// than they must point into a valid buffer. However, they can be out of order,
135/// non continuous and overlapping.
136///
137/// For example, in the following diagram, the strings "FishWasInTownToday" and
138/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
139/// separate buffer while the string "LavaMonster" is stored inlined in the
140/// view. In this case, the same bytes for "Fish" are used to store both strings.
141///
142/// [`ByteView`]: arrow_data::ByteView
143///
144/// ```text
145/// ┌───┐
146/// ┌──────┬──────┬──────┬──────┐ offset │...│
147/// "FishWasInTownTodayYay" │ 21 │ Fish │ 0 │ 115 │─ ─ 103 │Mr.│
148/// └──────┴──────┴──────┴──────┘ │ ┌ ─ ─ ─ ─ ▶ │Cru│
149/// ┌──────┬──────┬──────┬──────┐ │mpl│
150/// "CrumpleFacedFish" │ 16 │ Crum │ 0 │ 103 │─ ─│─ ─ ─ ┘ │eFa│
151/// └──────┴──────┴──────┴──────┘ │ced│
152/// ┌──────┬────────────────────┐ └ ─ ─ ─ ─ ─ ─ ─ ─ ▶│Fis│
153/// "LavaMonster" │ 11 │ LavaMonster\0 │ │hWa│
154/// └──────┴────────────────────┘ offset │sIn│
155/// 115 │Tow│
156/// │nTo│
157/// │day│
158/// u128 "views" │Yay│
159/// buffer 0 │...│
160/// └───┘
161/// ```
162pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
163 data_type: DataType,
164 views: ScalarBuffer<u128>,
165 buffers: Vec<Buffer>,
166 phantom: PhantomData<T>,
167 nulls: Option<NullBuffer>,
168}
169
170impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
171 fn clone(&self) -> Self {
172 Self {
173 data_type: T::DATA_TYPE,
174 views: self.views.clone(),
175 buffers: self.buffers.clone(),
176 nulls: self.nulls.clone(),
177 phantom: Default::default(),
178 }
179 }
180}
181
182impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
183 /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
184 ///
185 /// # Panics
186 ///
187 /// Panics if [`GenericByteViewArray::try_new`] returns an error
188 pub fn new(views: ScalarBuffer<u128>, buffers: Vec<Buffer>, nulls: Option<NullBuffer>) -> Self {
189 Self::try_new(views, buffers, nulls).unwrap()
190 }
191
192 /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
193 ///
194 /// # Errors
195 ///
196 /// * `views.len() != nulls.len()`
197 /// * [ByteViewType::validate] fails
198 pub fn try_new(
199 views: ScalarBuffer<u128>,
200 buffers: Vec<Buffer>,
201 nulls: Option<NullBuffer>,
202 ) -> Result<Self, ArrowError> {
203 T::validate(&views, &buffers)?;
204
205 if let Some(n) = nulls.as_ref() {
206 if n.len() != views.len() {
207 return Err(ArrowError::InvalidArgumentError(format!(
208 "Incorrect length of null buffer for {}ViewArray, expected {} got {}",
209 T::PREFIX,
210 views.len(),
211 n.len(),
212 )));
213 }
214 }
215
216 Ok(Self {
217 data_type: T::DATA_TYPE,
218 views,
219 buffers,
220 nulls,
221 phantom: Default::default(),
222 })
223 }
224
225 /// Create a new [`GenericByteViewArray`] from the provided parts, without validation
226 ///
227 /// # Safety
228 ///
229 /// Safe if [`Self::try_new`] would not error
230 pub unsafe fn new_unchecked(
231 views: ScalarBuffer<u128>,
232 buffers: Vec<Buffer>,
233 nulls: Option<NullBuffer>,
234 ) -> Self {
235 Self {
236 data_type: T::DATA_TYPE,
237 phantom: Default::default(),
238 views,
239 buffers,
240 nulls,
241 }
242 }
243
244 /// Create a new [`GenericByteViewArray`] of length `len` where all values are null
245 pub fn new_null(len: usize) -> Self {
246 Self {
247 data_type: T::DATA_TYPE,
248 views: vec![0; len].into(),
249 buffers: vec![],
250 nulls: Some(NullBuffer::new_null(len)),
251 phantom: Default::default(),
252 }
253 }
254
255 /// Create a new [`Scalar`] from `value`
256 pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
257 Scalar::new(Self::from_iter_values(std::iter::once(value)))
258 }
259
260 /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls
261 pub fn from_iter_values<Ptr, I>(iter: I) -> Self
262 where
263 Ptr: AsRef<T::Native>,
264 I: IntoIterator<Item = Ptr>,
265 {
266 let iter = iter.into_iter();
267 let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
268 for v in iter {
269 builder.append_value(v);
270 }
271 builder.finish()
272 }
273
274 /// Deconstruct this array into its constituent parts
275 pub fn into_parts(self) -> (ScalarBuffer<u128>, Vec<Buffer>, Option<NullBuffer>) {
276 (self.views, self.buffers, self.nulls)
277 }
278
279 /// Returns the views buffer
280 #[inline]
281 pub fn views(&self) -> &ScalarBuffer<u128> {
282 &self.views
283 }
284
285 /// Returns the buffers storing string data
286 #[inline]
287 pub fn data_buffers(&self) -> &[Buffer] {
288 &self.buffers
289 }
290
291 /// Returns the element at index `i`
292 /// # Panics
293 /// Panics if index `i` is out of bounds.
294 pub fn value(&self, i: usize) -> &T::Native {
295 assert!(
296 i < self.len(),
297 "Trying to access an element at index {} from a {}ViewArray of length {}",
298 i,
299 T::PREFIX,
300 self.len()
301 );
302
303 unsafe { self.value_unchecked(i) }
304 }
305
306 /// Returns the element at index `i` without bounds checking
307 ///
308 /// # Safety
309 ///
310 /// Caller is responsible for ensuring that the index is within the bounds
311 /// of the array
312 pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
313 let v = self.views.get_unchecked(idx);
314 let len = *v as u32;
315 let b = if len <= 12 {
316 Self::inline_value(v, len as usize)
317 } else {
318 let view = ByteView::from(*v);
319 let data = self.buffers.get_unchecked(view.buffer_index as usize);
320 let offset = view.offset as usize;
321 data.get_unchecked(offset..offset + len as usize)
322 };
323 T::Native::from_bytes_unchecked(b)
324 }
325
326 /// Returns the first `len` bytes the inline value of the view.
327 ///
328 /// # Safety
329 /// - The `view` must be a valid element from `Self::views()` that adheres to the view layout.
330 /// - The `len` must be the length of the inlined value. It should never be larger than 12.
331 #[inline(always)]
332 pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] {
333 debug_assert!(len <= 12);
334 std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len)
335 }
336
337 /// Constructs a new iterator for iterating over the values of this array
338 pub fn iter(&self) -> ArrayIter<&Self> {
339 ArrayIter::new(self)
340 }
341
342 /// Returns an iterator over the bytes of this array, including null values
343 pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
344 self.views.iter().map(move |v| {
345 let len = *v as u32;
346 if len <= 12 {
347 unsafe { Self::inline_value(v, len as usize) }
348 } else {
349 let view = ByteView::from(*v);
350 let data = &self.buffers[view.buffer_index as usize];
351 let offset = view.offset as usize;
352 unsafe { data.get_unchecked(offset..offset + len as usize) }
353 }
354 })
355 }
356
357 /// Returns an iterator over the first `prefix_len` bytes of each array
358 /// element, including null values.
359 ///
360 /// If `prefix_len` is larger than the element's length, the iterator will
361 /// return an empty slice (`&[]`).
362 pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = &[u8]> {
363 self.views().into_iter().map(move |v| {
364 let len = (*v as u32) as usize;
365
366 if len < prefix_len {
367 return &[] as &[u8];
368 }
369
370 if prefix_len <= 4 || len <= 12 {
371 unsafe { StringViewArray::inline_value(v, prefix_len) }
372 } else {
373 let view = ByteView::from(*v);
374 let data = unsafe {
375 self.data_buffers()
376 .get_unchecked(view.buffer_index as usize)
377 };
378 let offset = view.offset as usize;
379 unsafe { data.get_unchecked(offset..offset + prefix_len) }
380 }
381 })
382 }
383
384 /// Returns an iterator over the last `suffix_len` bytes of each array
385 /// element, including null values.
386 ///
387 /// Note that for [`StringViewArray`] the last bytes may start in the middle
388 /// of a UTF-8 codepoint, and thus may not be a valid `&str`.
389 ///
390 /// If `suffix_len` is larger than the element's length, the iterator will
391 /// return an empty slice (`&[]`).
392 pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item = &[u8]> {
393 self.views().into_iter().map(move |v| {
394 let len = (*v as u32) as usize;
395
396 if len < suffix_len {
397 return &[] as &[u8];
398 }
399
400 if len <= 12 {
401 unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
402 } else {
403 let view = ByteView::from(*v);
404 let data = unsafe {
405 self.data_buffers()
406 .get_unchecked(view.buffer_index as usize)
407 };
408 let offset = view.offset as usize;
409 unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) }
410 }
411 })
412 }
413
414 /// Returns a zero-copy slice of this array with the indicated offset and length.
415 pub fn slice(&self, offset: usize, length: usize) -> Self {
416 Self {
417 data_type: T::DATA_TYPE,
418 views: self.views.slice(offset, length),
419 buffers: self.buffers.clone(),
420 nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
421 phantom: Default::default(),
422 }
423 }
424
425 /// Returns a "compacted" version of this array
426 ///
427 /// The original array will *not* be modified
428 ///
429 /// # Garbage Collection
430 ///
431 /// Before GC:
432 /// ```text
433 /// ┌──────┐
434 /// │......│
435 /// │......│
436 /// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer
437 /// │ View 1 │─ ─ ─ ─ │......│ with data that
438 /// ├────────────────────┤ │......│ is not referred
439 /// │ View 2 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
440 /// └────────────────────┘ │......│ View 2
441 /// │......│
442 /// 2 views, refer to │......│
443 /// small portions of a └──────┘
444 /// large buffer
445 /// ```
446 ///
447 /// After GC:
448 ///
449 /// ```text
450 /// ┌────────────────────┐ ┌─────┐ After gc, only
451 /// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is
452 /// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by
453 /// │ View 2 │─ ─ ─ ─ └─────┘ the views is
454 /// └────────────────────┘ left
455 ///
456 ///
457 /// 2 views
458 /// ```
459 /// This method will compact the data buffers by recreating the view array and only include the data
460 /// that is pointed to by the views.
461 ///
462 /// Note that it will copy the array regardless of whether the original array is compact.
463 /// Use with caution as this can be an expensive operation, only use it when you are sure that the view
464 /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing.
465 ///
466 /// Note: this function does not attempt to canonicalize / deduplicate values. For this
467 /// feature see [`GenericByteViewBuilder::with_deduplicate_strings`].
468 pub fn gc(&self) -> Self {
469 let mut builder = GenericByteViewBuilder::<T>::with_capacity(self.len());
470
471 for v in self.iter() {
472 builder.append_option(v);
473 }
474
475 builder.finish()
476 }
477
478 /// Compare two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
479 ///
480 /// Comparing two ByteView types are non-trivial.
481 /// It takes a bit of patience to understand why we don't just compare two &[u8] directly.
482 ///
483 /// ByteView types give us the following two advantages, and we need to be careful not to lose them:
484 /// (1) For string/byte smaller than 12 bytes, the entire data is inlined in the view.
485 /// Meaning that reading one array element requires only one memory access
486 /// (two memory access required for StringArray, one for offset buffer, the other for value buffer).
487 ///
488 /// (2) For string/byte larger than 12 bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
489 /// thanks to the inlined 4 bytes.
490 /// Consider equality check:
491 /// If the first four bytes of the two strings are different, we can return false immediately (with just one memory access).
492 ///
493 /// If we directly compare two &[u8], we materialize the entire string (i.e., make multiple memory accesses), which might be unnecessary.
494 /// - Most of the time (eq, ord), we only need to look at the first 4 bytes to know the answer,
495 /// e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string.
496 ///
497 /// # Order check flow
498 /// (1) if both string are smaller than 12 bytes, we can directly compare the data inlined to the view.
499 /// (2) if any of the string is larger than 12 bytes, we need to compare the full string.
500 /// (2.1) if the inlined 4 bytes are different, we can return the result immediately.
501 /// (2.2) o.w., we need to compare the full string.
502 ///
503 /// # Safety
504 /// The left/right_idx must within range of each array
505 pub unsafe fn compare_unchecked(
506 left: &GenericByteViewArray<T>,
507 left_idx: usize,
508 right: &GenericByteViewArray<T>,
509 right_idx: usize,
510 ) -> std::cmp::Ordering {
511 let l_view = left.views().get_unchecked(left_idx);
512 let l_len = *l_view as u32;
513
514 let r_view = right.views().get_unchecked(right_idx);
515 let r_len = *r_view as u32;
516
517 if l_len <= 12 && r_len <= 12 {
518 let l_data = unsafe { GenericByteViewArray::<T>::inline_value(l_view, l_len as usize) };
519 let r_data = unsafe { GenericByteViewArray::<T>::inline_value(r_view, r_len as usize) };
520 return l_data.cmp(r_data);
521 }
522
523 // one of the string is larger than 12 bytes,
524 // we then try to compare the inlined data first
525 let l_inlined_data = unsafe { GenericByteViewArray::<T>::inline_value(l_view, 4) };
526 let r_inlined_data = unsafe { GenericByteViewArray::<T>::inline_value(r_view, 4) };
527 if r_inlined_data != l_inlined_data {
528 return l_inlined_data.cmp(r_inlined_data);
529 }
530
531 // unfortunately, we need to compare the full data
532 let l_full_data: &[u8] = unsafe { left.value_unchecked(left_idx).as_ref() };
533 let r_full_data: &[u8] = unsafe { right.value_unchecked(right_idx).as_ref() };
534
535 l_full_data.cmp(r_full_data)
536 }
537}
538
539impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
540 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
541 write!(f, "{}ViewArray\n[\n", T::PREFIX)?;
542 print_long_array(self, f, |array, index, f| {
543 std::fmt::Debug::fmt(&array.value(index), f)
544 })?;
545 write!(f, "]")
546 }
547}
548
549impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> {
550 fn as_any(&self) -> &dyn Any {
551 self
552 }
553
554 fn to_data(&self) -> ArrayData {
555 self.clone().into()
556 }
557
558 fn into_data(self) -> ArrayData {
559 self.into()
560 }
561
562 fn data_type(&self) -> &DataType {
563 &self.data_type
564 }
565
566 fn slice(&self, offset: usize, length: usize) -> ArrayRef {
567 Arc::new(self.slice(offset, length))
568 }
569
570 fn len(&self) -> usize {
571 self.views.len()
572 }
573
574 fn is_empty(&self) -> bool {
575 self.views.is_empty()
576 }
577
578 fn offset(&self) -> usize {
579 0
580 }
581
582 fn nulls(&self) -> Option<&NullBuffer> {
583 self.nulls.as_ref()
584 }
585
586 fn logical_null_count(&self) -> usize {
587 // More efficient that the default implementation
588 self.null_count()
589 }
590
591 fn get_buffer_memory_size(&self) -> usize {
592 let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::<usize>();
593 sum += self.views.inner().capacity();
594 if let Some(x) = &self.nulls {
595 sum += x.buffer().capacity()
596 }
597 sum
598 }
599
600 fn get_array_memory_size(&self) -> usize {
601 std::mem::size_of::<Self>() + self.get_buffer_memory_size()
602 }
603}
604
605impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> {
606 type Item = &'a T::Native;
607
608 fn value(&self, index: usize) -> Self::Item {
609 GenericByteViewArray::value(self, index)
610 }
611
612 unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
613 GenericByteViewArray::value_unchecked(self, index)
614 }
615}
616
617impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> {
618 type Item = Option<&'a T::Native>;
619 type IntoIter = ArrayIter<Self>;
620
621 fn into_iter(self) -> Self::IntoIter {
622 ArrayIter::new(self)
623 }
624}
625
626impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
627 fn from(value: ArrayData) -> Self {
628 let views = value.buffers()[0].clone();
629 let views = ScalarBuffer::new(views, value.offset(), value.len());
630 let buffers = value.buffers()[1..].to_vec();
631 Self {
632 data_type: T::DATA_TYPE,
633 views,
634 buffers,
635 nulls: value.nulls().cloned(),
636 phantom: Default::default(),
637 }
638 }
639}
640
641/// Efficiently convert a [`GenericByteArray`] to a [`GenericByteViewArray`]
642///
643/// For example this method can convert a [`StringArray`] to a
644/// [`StringViewArray`].
645///
646/// If the offsets are all less than u32::MAX, the new [`GenericByteViewArray`]
647/// is built without copying the underlying string data (views are created
648/// directly into the existing buffer)
649///
650/// [`StringArray`]: crate::StringArray
651impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
652where
653 FROM: ByteArrayType,
654 FROM::Offset: OffsetSizeTrait + ToPrimitive,
655 V: ByteViewType<Native = FROM::Native>,
656{
657 fn from(byte_array: &GenericByteArray<FROM>) -> Self {
658 let offsets = byte_array.offsets();
659
660 let can_reuse_buffer = match offsets.last() {
661 Some(offset) => offset.as_usize() < u32::MAX as usize,
662 None => true,
663 };
664
665 if can_reuse_buffer {
666 // build views directly pointing to the existing buffer
667 let len = byte_array.len();
668 let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len);
669 let str_values_buf = byte_array.values().clone();
670 let block = views_builder.append_block(str_values_buf);
671 for (i, w) in offsets.windows(2).enumerate() {
672 let offset = w[0].as_usize();
673 let end = w[1].as_usize();
674 let length = end - offset;
675
676 if byte_array.is_null(i) {
677 views_builder.append_null();
678 } else {
679 // Safety: the input was a valid array so it valid UTF8 (if string). And
680 // all offsets were valid
681 unsafe {
682 views_builder.append_view_unchecked(block, offset as u32, length as u32)
683 }
684 }
685 }
686 assert_eq!(views_builder.len(), len);
687 views_builder.finish()
688 } else {
689 // Otherwise, create a new buffer for large strings
690 // TODO: the original buffer could still be used
691 // by making multiple slices of u32::MAX length
692 GenericByteViewArray::<V>::from_iter(byte_array.iter())
693 }
694 }
695}
696
697impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
698 fn from(mut array: GenericByteViewArray<T>) -> Self {
699 let len = array.len();
700 array.buffers.insert(0, array.views.into_inner());
701 let builder = ArrayDataBuilder::new(T::DATA_TYPE)
702 .len(len)
703 .buffers(array.buffers)
704 .nulls(array.nulls);
705
706 unsafe { builder.build_unchecked() }
707 }
708}
709
710impl<'a, Ptr, T> FromIterator<&'a Option<Ptr>> for GenericByteViewArray<T>
711where
712 Ptr: AsRef<T::Native> + 'a,
713 T: ByteViewType + ?Sized,
714{
715 fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
716 iter.into_iter()
717 .map(|o| o.as_ref().map(|p| p.as_ref()))
718 .collect()
719 }
720}
721
722impl<Ptr, T: ByteViewType + ?Sized> FromIterator<Option<Ptr>> for GenericByteViewArray<T>
723where
724 Ptr: AsRef<T::Native>,
725{
726 fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
727 let iter = iter.into_iter();
728 let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
729 builder.extend(iter);
730 builder.finish()
731 }
732}
733
734/// A [`GenericByteViewArray`] of `[u8]`
735///
736/// See [`GenericByteViewArray`] for format and layout details.
737///
738/// # Example
739/// ```
740/// use arrow_array::BinaryViewArray;
741/// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], b"world", b"lulu", b"large payload over 12 bytes"]);
742/// assert_eq!(array.value(0), b"hello");
743/// assert_eq!(array.value(3), b"large payload over 12 bytes");
744/// ```
745pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;
746
747impl BinaryViewArray {
748 /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
749 /// If items not utf8 data, validate will fail and error returned.
750 pub fn to_string_view(self) -> Result<StringViewArray, ArrowError> {
751 StringViewType::validate(self.views(), self.data_buffers())?;
752 unsafe { Ok(self.to_string_view_unchecked()) }
753 }
754
755 /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
756 /// # Safety
757 /// Caller is responsible for ensuring that items in array are utf8 data.
758 pub unsafe fn to_string_view_unchecked(self) -> StringViewArray {
759 StringViewArray::new_unchecked(self.views, self.buffers, self.nulls)
760 }
761}
762
763impl From<Vec<&[u8]>> for BinaryViewArray {
764 fn from(v: Vec<&[u8]>) -> Self {
765 Self::from_iter_values(v)
766 }
767}
768
769impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
770 fn from(v: Vec<Option<&[u8]>>) -> Self {
771 v.into_iter().collect()
772 }
773}
774
775/// A [`GenericByteViewArray`] that stores utf8 data
776///
777/// See [`GenericByteViewArray`] for format and layout details.
778///
779/// # Example
780/// ```
781/// use arrow_array::StringViewArray;
782/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
783/// assert_eq!(array.value(0), "hello");
784/// assert_eq!(array.value(3), "large payload over 12 bytes");
785/// ```
786pub type StringViewArray = GenericByteViewArray<StringViewType>;
787
788impl StringViewArray {
789 /// Convert the [`StringViewArray`] to [`BinaryViewArray`]
790 pub fn to_binary_view(self) -> BinaryViewArray {
791 unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
792 }
793
794 /// Returns true if all data within this array is ASCII
795 pub fn is_ascii(&self) -> bool {
796 // Alternative (but incorrect): directly check the underlying buffers
797 // (1) Our string view might be sparse, i.e., a subset of the buffers,
798 // so even if the buffer is not ascii, we can still be ascii.
799 // (2) It is quite difficult to know the range of each buffer (unlike StringArray)
800 // This means that this operation is quite expensive, shall we cache the result?
801 // i.e. track `is_ascii` in the builder.
802 self.iter().all(|v| match v {
803 Some(v) => v.is_ascii(),
804 None => true,
805 })
806 }
807}
808
809impl From<Vec<&str>> for StringViewArray {
810 fn from(v: Vec<&str>) -> Self {
811 Self::from_iter_values(v)
812 }
813}
814
815impl From<Vec<Option<&str>>> for StringViewArray {
816 fn from(v: Vec<Option<&str>>) -> Self {
817 v.into_iter().collect()
818 }
819}
820
821impl From<Vec<String>> for StringViewArray {
822 fn from(v: Vec<String>) -> Self {
823 Self::from_iter_values(v)
824 }
825}
826
827impl From<Vec<Option<String>>> for StringViewArray {
828 fn from(v: Vec<Option<String>>) -> Self {
829 v.into_iter().collect()
830 }
831}
832
833#[cfg(test)]
834mod tests {
835 use crate::builder::{BinaryViewBuilder, StringViewBuilder};
836 use crate::{Array, BinaryViewArray, StringViewArray};
837 use arrow_buffer::{Buffer, ScalarBuffer};
838 use arrow_data::ByteView;
839
840 #[test]
841 fn try_new_string() {
842 let array = StringViewArray::from_iter_values(vec![
843 "hello",
844 "world",
845 "lulu",
846 "large payload over 12 bytes",
847 ]);
848 assert_eq!(array.value(0), "hello");
849 assert_eq!(array.value(3), "large payload over 12 bytes");
850 }
851
852 #[test]
853 fn try_new_binary() {
854 let array = BinaryViewArray::from_iter_values(vec![
855 b"hello".as_slice(),
856 b"world".as_slice(),
857 b"lulu".as_slice(),
858 b"large payload over 12 bytes".as_slice(),
859 ]);
860 assert_eq!(array.value(0), b"hello");
861 assert_eq!(array.value(3), b"large payload over 12 bytes");
862 }
863
864 #[test]
865 fn try_new_empty_string() {
866 // test empty array
867 let array = {
868 let mut builder = StringViewBuilder::new();
869 builder.finish()
870 };
871 assert!(array.is_empty());
872 }
873
874 #[test]
875 fn try_new_empty_binary() {
876 // test empty array
877 let array = {
878 let mut builder = BinaryViewBuilder::new();
879 builder.finish()
880 };
881 assert!(array.is_empty());
882 }
883
884 #[test]
885 fn test_append_string() {
886 // test builder append
887 let array = {
888 let mut builder = StringViewBuilder::new();
889 builder.append_value("hello");
890 builder.append_null();
891 builder.append_option(Some("large payload over 12 bytes"));
892 builder.finish()
893 };
894 assert_eq!(array.value(0), "hello");
895 assert!(array.is_null(1));
896 assert_eq!(array.value(2), "large payload over 12 bytes");
897 }
898
899 #[test]
900 fn test_append_binary() {
901 // test builder append
902 let array = {
903 let mut builder = BinaryViewBuilder::new();
904 builder.append_value(b"hello");
905 builder.append_null();
906 builder.append_option(Some(b"large payload over 12 bytes"));
907 builder.finish()
908 };
909 assert_eq!(array.value(0), b"hello");
910 assert!(array.is_null(1));
911 assert_eq!(array.value(2), b"large payload over 12 bytes");
912 }
913
914 #[test]
915 fn test_in_progress_recreation() {
916 let array = {
917 // make a builder with small block size.
918 let mut builder = StringViewBuilder::new().with_fixed_block_size(14);
919 builder.append_value("large payload over 12 bytes");
920 builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"));
921 builder.finish()
922 };
923 assert_eq!(array.value(0), "large payload over 12 bytes");
924 assert_eq!(array.value(1), "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created");
925 assert_eq!(2, array.buffers.len());
926 }
927
928 #[test]
929 #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")]
930 fn new_with_invalid_view_data() {
931 let v = "large payload over 12 bytes";
932 let view = ByteView::new(13, &v.as_bytes()[0..4])
933 .with_buffer_index(3)
934 .with_offset(1);
935 let views = ScalarBuffer::from(vec![view.into()]);
936 let buffers = vec![Buffer::from_slice_ref(v)];
937 StringViewArray::new(views, buffers, None);
938 }
939
940 #[test]
941 #[should_panic(
942 expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0"
943 )]
944 fn new_with_invalid_utf8_data() {
945 let v: Vec<u8> = vec![
946 // invalid UTF8
947 0xf0, 0x80, 0x80, 0x80, // more bytes to make it larger than 12
948 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
949 ];
950 let view = ByteView::new(v.len() as u32, &v[0..4]);
951 let views = ScalarBuffer::from(vec![view.into()]);
952 let buffers = vec![Buffer::from_slice_ref(v)];
953 StringViewArray::new(views, buffers, None);
954 }
955
956 #[test]
957 #[should_panic(expected = "View at index 0 contained non-zero padding for string of length 1")]
958 fn new_with_invalid_zero_padding() {
959 let mut data = [0; 12];
960 data[0] = b'H';
961 data[11] = 1; // no zero padding
962
963 let mut view_buffer = [0; 16];
964 view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes());
965 view_buffer[4..].copy_from_slice(&data);
966
967 let view = ByteView::from(u128::from_le_bytes(view_buffer));
968 let views = ScalarBuffer::from(vec![view.into()]);
969 let buffers = vec![];
970 StringViewArray::new(views, buffers, None);
971 }
972
973 #[test]
974 #[should_panic(expected = "Mismatch between embedded prefix and data")]
975 fn test_mismatch_between_embedded_prefix_and_data() {
976 let input_str_1 = "Hello, Rustaceans!";
977 let input_str_2 = "Hallo, Rustaceans!";
978 let length = input_str_1.len() as u32;
979 assert!(input_str_1.len() > 12);
980
981 let mut view_buffer = [0; 16];
982 view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
983 view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]);
984 view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes());
985 view_buffer[12..].copy_from_slice(&0u32.to_le_bytes());
986 let view = ByteView::from(u128::from_le_bytes(view_buffer));
987 let views = ScalarBuffer::from(vec![view.into()]);
988 let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())];
989
990 StringViewArray::new(views, buffers, None);
991 }
992
993 #[test]
994 fn test_gc() {
995 let test_data = [
996 Some("longer than 12 bytes"),
997 Some("short"),
998 Some("t"),
999 Some("longer than 12 bytes"),
1000 None,
1001 Some("short"),
1002 ];
1003
1004 let array = {
1005 let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // create multiple buffers
1006 test_data.into_iter().for_each(|v| builder.append_option(v));
1007 builder.finish()
1008 };
1009 assert!(array.buffers.len() > 1);
1010
1011 fn check_gc(to_test: &StringViewArray) {
1012 let gc = to_test.gc();
1013 assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len());
1014
1015 to_test.iter().zip(gc.iter()).for_each(|(a, b)| {
1016 assert_eq!(a, b);
1017 });
1018 assert_eq!(to_test.len(), gc.len());
1019 }
1020
1021 check_gc(&array);
1022 check_gc(&array.slice(1, 3));
1023 check_gc(&array.slice(2, 1));
1024 check_gc(&array.slice(2, 2));
1025 check_gc(&array.slice(3, 1));
1026 }
1027
1028 #[test]
1029 fn test_eq() {
1030 let test_data = [
1031 Some("longer than 12 bytes"),
1032 None,
1033 Some("short"),
1034 Some("again, this is longer than 12 bytes"),
1035 ];
1036
1037 let array1 = {
1038 let mut builder = StringViewBuilder::new().with_fixed_block_size(8);
1039 test_data.into_iter().for_each(|v| builder.append_option(v));
1040 builder.finish()
1041 };
1042 let array2 = {
1043 // create a new array with the same data but different layout
1044 let mut builder = StringViewBuilder::new().with_fixed_block_size(100);
1045 test_data.into_iter().for_each(|v| builder.append_option(v));
1046 builder.finish()
1047 };
1048 assert_eq!(array1, array1.clone());
1049 assert_eq!(array2, array2.clone());
1050 assert_eq!(array1, array2);
1051 }
1052}