arrow_array/array/
string_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::types::GenericStringType;
19use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait};
20use arrow_schema::{ArrowError, DataType};
21
/// A [`GenericByteArray`] for storing `str`
///
/// The `OffsetSize` parameter selects the offset type of the array; this file
/// defines the [`StringArray`] (`i32` offsets) and [`LargeStringArray`]
/// (`i64` offsets) aliases on top of it.
pub type GenericStringArray<OffsetSize> = GenericByteArray<GenericStringType<OffsetSize>>;
24
25impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
26    /// Get the data type of the array.
27    #[deprecated(note = "please use `Self::DATA_TYPE` instead")]
28    pub const fn get_data_type() -> DataType {
29        Self::DATA_TYPE
30    }
31
32    /// Returns the number of `Unicode Scalar Value` in the string at index `i`.
33    /// # Performance
34    /// This function has `O(n)` time complexity where `n` is the string length.
35    /// If you can make sure that all chars in the string are in the range `U+0x0000` ~ `U+0x007F`,
36    /// please use the function [`value_length`](#method.value_length) which has O(1) time complexity.
37    pub fn num_chars(&self, i: usize) -> usize {
38        self.value(i).chars().count()
39    }
40
41    /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
42    pub fn take_iter<'a>(
43        &'a self,
44        indexes: impl Iterator<Item = Option<usize>> + 'a,
45    ) -> impl Iterator<Item = Option<&'a str>> {
46        indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
47    }
48
49    /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
50    /// # Safety
51    ///
52    /// caller must ensure that the indexes in the iterator are less than the `array.len()`
53    pub unsafe fn take_iter_unchecked<'a>(
54        &'a self,
55        indexes: impl Iterator<Item = Option<usize>> + 'a,
56    ) -> impl Iterator<Item = Option<&'a str>> {
57        indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index)))
58    }
59
60    /// Fallibly creates a [`GenericStringArray`] from a [`GenericBinaryArray`] returning
61    /// an error if [`GenericBinaryArray`] contains invalid UTF-8 data
62    pub fn try_from_binary(v: GenericBinaryArray<OffsetSize>) -> Result<Self, ArrowError> {
63        let (offsets, values, nulls) = v.into_parts();
64        Self::try_new(offsets, values, nulls)
65    }
66}
67
68impl<OffsetSize: OffsetSizeTrait> From<GenericListArray<OffsetSize>>
69    for GenericStringArray<OffsetSize>
70{
71    fn from(v: GenericListArray<OffsetSize>) -> Self {
72        GenericBinaryArray::<OffsetSize>::from(v).into()
73    }
74}
75
76impl<OffsetSize: OffsetSizeTrait> From<GenericBinaryArray<OffsetSize>>
77    for GenericStringArray<OffsetSize>
78{
79    fn from(v: GenericBinaryArray<OffsetSize>) -> Self {
80        Self::try_from_binary(v).unwrap()
81    }
82}
83
84impl<OffsetSize: OffsetSizeTrait> From<Vec<Option<&str>>> for GenericStringArray<OffsetSize> {
85    fn from(v: Vec<Option<&str>>) -> Self {
86        v.into_iter().collect()
87    }
88}
89
90impl<OffsetSize: OffsetSizeTrait> From<Vec<&str>> for GenericStringArray<OffsetSize> {
91    fn from(v: Vec<&str>) -> Self {
92        Self::from_iter_values(v)
93    }
94}
95
96impl<OffsetSize: OffsetSizeTrait> From<Vec<Option<String>>> for GenericStringArray<OffsetSize> {
97    fn from(v: Vec<Option<String>>) -> Self {
98        v.into_iter().collect()
99    }
100}
101
102impl<OffsetSize: OffsetSizeTrait> From<Vec<String>> for GenericStringArray<OffsetSize> {
103    fn from(v: Vec<String>) -> Self {
104        Self::from_iter_values(v)
105    }
106}
107
/// A [`GenericStringArray`] of `str` using `i32` offsets
///
/// # Examples
///
/// Construction
///
/// ```
/// # use arrow_array::StringArray;
/// // Create from Vec<Option<&str>>
/// let arr = StringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);
/// // Create from Vec<&str>
/// let arr = StringArray::from(vec!["foo", "bar", "baz"]);
/// // Create from Vec<String>
/// let arr = StringArray::from(vec!["foo".to_string(), "bar".to_string()]);
/// // Create from iter/collect (the items here are Option<&str>)
/// let arr: StringArray = std::iter::repeat(Some("foo")).take(10).collect();
/// ```
///
/// Construction and Access
///
/// ```
/// # use arrow_array::StringArray;
/// let array = StringArray::from(vec![Some("foo"), None, Some("bar")]);
/// assert_eq!(array.value(0), "foo");
/// ```
///
/// See [`GenericByteArray`] for more information and examples
pub type StringArray = GenericStringArray<i32>;
134
/// A [`GenericStringArray`] of `str` using `i64` offsets
///
/// # Examples
///
/// Construction
///
/// ```
/// # use arrow_array::LargeStringArray;
/// // Create from Vec<Option<&str>>
/// let arr = LargeStringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);
/// // Create from Vec<&str>
/// let arr = LargeStringArray::from(vec!["foo", "bar", "baz"]);
/// // Create from Vec<String>
/// let arr = LargeStringArray::from(vec!["foo".to_string(), "bar".to_string()]);
/// // Create from iter/collect (the items here are Option<&str>)
/// let arr: LargeStringArray = std::iter::repeat(Some("foo")).take(10).collect();
/// ```
///
/// Construction and Access
///
/// ```
/// # use arrow_array::LargeStringArray;
/// let array = LargeStringArray::from(vec![Some("foo"), None, Some("bar")]);
/// assert_eq!(array.value(2), "bar");
/// ```
///
/// See [`GenericByteArray`] for more information and examples
pub type LargeStringArray = GenericStringArray<i64>;
161
162#[cfg(test)]
163mod tests {
164    use super::*;
165    use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder};
166    use crate::types::UInt8Type;
167    use crate::Array;
168    use arrow_buffer::Buffer;
169    use arrow_data::ArrayData;
170    use arrow_schema::Field;
171    use std::sync::Arc;
172
173    #[test]
174    fn test_string_array_from_u8_slice() {
175        let values: Vec<&str> = vec!["hello", "", "A£ऀ𖼚𝌆৩ƐZ"];
176
177        // Array data: ["hello", "", "A£ऀ𖼚𝌆৩ƐZ"]
178        let string_array = StringArray::from(values);
179
180        assert_eq!(3, string_array.len());
181        assert_eq!(0, string_array.null_count());
182        assert_eq!("hello", string_array.value(0));
183        assert_eq!("hello", unsafe { string_array.value_unchecked(0) });
184        assert_eq!("", string_array.value(1));
185        assert_eq!("", unsafe { string_array.value_unchecked(1) });
186        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", string_array.value(2));
187        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", unsafe {
188            string_array.value_unchecked(2)
189        });
190        assert_eq!(20, string_array.value_length(2)); // 1 + 2 + 3 + 4 + 4 + 3 + 2 + 1
191        assert_eq!(8, string_array.num_chars(2));
192        for i in 0..3 {
193            assert!(string_array.is_valid(i));
194            assert!(!string_array.is_null(i));
195        }
196    }
197
198    #[test]
199    #[should_panic(expected = "StringArray expects DataType::Utf8")]
200    fn test_string_array_from_int() {
201        let array = LargeStringArray::from(vec!["a", "b"]);
202        drop(StringArray::from(array.into_data()));
203    }
204
205    #[test]
206    fn test_large_string_array_from_u8_slice() {
207        let values: Vec<&str> = vec!["hello", "", "A£ऀ𖼚𝌆৩ƐZ"];
208
209        // Array data: ["hello", "", "A£ऀ𖼚𝌆৩ƐZ"]
210        let string_array = LargeStringArray::from(values);
211
212        assert_eq!(3, string_array.len());
213        assert_eq!(0, string_array.null_count());
214        assert_eq!("hello", string_array.value(0));
215        assert_eq!("hello", unsafe { string_array.value_unchecked(0) });
216        assert_eq!("", string_array.value(1));
217        assert_eq!("", unsafe { string_array.value_unchecked(1) });
218        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", string_array.value(2));
219        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", unsafe {
220            string_array.value_unchecked(2)
221        });
222        assert_eq!(5, string_array.value_offsets()[2]);
223        assert_eq!(20, string_array.value_length(2)); // 1 + 2 + 3 + 4 + 4 + 3 + 2 + 1
224        assert_eq!(8, string_array.num_chars(2));
225        for i in 0..3 {
226            assert!(string_array.is_valid(i));
227            assert!(!string_array.is_null(i));
228        }
229    }
230
231    #[test]
232    fn test_nested_string_array() {
233        let string_builder = StringBuilder::with_capacity(3, 10);
234        let mut list_of_string_builder = ListBuilder::new(string_builder);
235
236        list_of_string_builder.values().append_value("foo");
237        list_of_string_builder.values().append_value("bar");
238        list_of_string_builder.append(true);
239
240        list_of_string_builder.values().append_value("foobar");
241        list_of_string_builder.append(true);
242        let list_of_strings = list_of_string_builder.finish();
243
244        assert_eq!(list_of_strings.len(), 2);
245
246        let first_slot = list_of_strings.value(0);
247        let first_list = first_slot.as_any().downcast_ref::<StringArray>().unwrap();
248        assert_eq!(first_list.len(), 2);
249        assert_eq!(first_list.value(0), "foo");
250        assert_eq!(unsafe { first_list.value_unchecked(0) }, "foo");
251        assert_eq!(first_list.value(1), "bar");
252        assert_eq!(unsafe { first_list.value_unchecked(1) }, "bar");
253
254        let second_slot = list_of_strings.value(1);
255        let second_list = second_slot.as_any().downcast_ref::<StringArray>().unwrap();
256        assert_eq!(second_list.len(), 1);
257        assert_eq!(second_list.value(0), "foobar");
258        assert_eq!(unsafe { second_list.value_unchecked(0) }, "foobar");
259    }
260
261    #[test]
262    #[should_panic(
263        expected = "Trying to access an element at index 4 from a StringArray of length 3"
264    )]
265    fn test_string_array_get_value_index_out_of_bound() {
266        let values: [u8; 12] = [
267            b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
268        ];
269        let offsets: [i32; 4] = [0, 5, 5, 12];
270        let array_data = ArrayData::builder(DataType::Utf8)
271            .len(3)
272            .add_buffer(Buffer::from_slice_ref(offsets))
273            .add_buffer(Buffer::from_slice_ref(values))
274            .build()
275            .unwrap();
276        let string_array = StringArray::from(array_data);
277        string_array.value(4);
278    }
279
280    #[test]
281    fn test_string_array_fmt_debug() {
282        let arr: StringArray = vec!["hello", "arrow"].into();
283        assert_eq!(
284            "StringArray\n[\n  \"hello\",\n  \"arrow\",\n]",
285            format!("{arr:?}")
286        );
287    }
288
289    #[test]
290    fn test_large_string_array_fmt_debug() {
291        let arr: LargeStringArray = vec!["hello", "arrow"].into();
292        assert_eq!(
293            "LargeStringArray\n[\n  \"hello\",\n  \"arrow\",\n]",
294            format!("{arr:?}")
295        );
296    }
297
298    #[test]
299    fn test_string_array_from_iter() {
300        let data = [Some("hello"), None, Some("arrow")];
301        let data_vec = data.to_vec();
302        // from Vec<Option<&str>>
303        let array1 = StringArray::from(data_vec.clone());
304        // from Iterator<Option<&str>>
305        let array2: StringArray = data_vec.clone().into_iter().collect();
306        // from Iterator<Option<String>>
307        let array3: StringArray = data_vec
308            .into_iter()
309            .map(|x| x.map(|s| s.to_string()))
310            .collect();
311        // from Iterator<&Option<&str>>
312        let array4: StringArray = data.iter().collect::<StringArray>();
313
314        assert_eq!(array1, array2);
315        assert_eq!(array2, array3);
316        assert_eq!(array3, array4);
317    }
318
319    #[test]
320    fn test_string_array_from_iter_values() {
321        let data = ["hello", "hello2"];
322        let array1 = StringArray::from_iter_values(data.iter());
323
324        assert_eq!(array1.value(0), "hello");
325        assert_eq!(array1.value(1), "hello2");
326
327        // Also works with String types.
328        let data2 = ["goodbye".to_string(), "goodbye2".to_string()];
329        let array2 = StringArray::from_iter_values(data2.iter());
330
331        assert_eq!(array2.value(0), "goodbye");
332        assert_eq!(array2.value(1), "goodbye2");
333    }
334
335    #[test]
336    fn test_string_array_from_unbound_iter() {
337        // iterator that doesn't declare (upper) size bound
338        let string_iter = (0..)
339            .scan(0usize, |pos, i| {
340                if *pos < 10 {
341                    *pos += 1;
342                    Some(Some(format!("value {i}")))
343                } else {
344                    // actually returns up to 10 values
345                    None
346                }
347            })
348            // limited using take()
349            .take(100);
350
351        let (_, upper_size_bound) = string_iter.size_hint();
352        // the upper bound, defined by take above, is 100
353        assert_eq!(upper_size_bound, Some(100));
354        let string_array: StringArray = string_iter.collect();
355        // but the actual number of items in the array should be 10
356        assert_eq!(string_array.len(), 10);
357    }
358
359    #[test]
360    fn test_string_array_all_null() {
361        let data: Vec<Option<&str>> = vec![None];
362        let array = StringArray::from(data);
363        array
364            .into_data()
365            .validate_full()
366            .expect("All null array has valid array data");
367    }
368
369    #[test]
370    fn test_large_string_array_all_null() {
371        let data: Vec<Option<&str>> = vec![None];
372        let array = LargeStringArray::from(data);
373        array
374            .into_data()
375            .validate_full()
376            .expect("All null array has valid array data");
377    }
378
379    fn _test_generic_string_array_from_list_array<O: OffsetSizeTrait>() {
380        let values = b"HelloArrowAndParquet";
381        // "ArrowAndParquet"
382        let child_data = ArrayData::builder(DataType::UInt8)
383            .len(15)
384            .offset(5)
385            .add_buffer(Buffer::from(&values[..]))
386            .build()
387            .unwrap();
388
389        let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap());
390        let null_buffer = Buffer::from_slice_ref([0b101]);
391        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new(
392            "item",
393            DataType::UInt8,
394            false,
395        )));
396
397        // [None, Some("Parquet")]
398        let array_data = ArrayData::builder(data_type)
399            .len(2)
400            .offset(1)
401            .add_buffer(Buffer::from_slice_ref(offsets))
402            .null_bit_buffer(Some(null_buffer))
403            .add_child_data(child_data)
404            .build()
405            .unwrap();
406        let list_array = GenericListArray::<O>::from(array_data);
407        let string_array = GenericStringArray::<O>::from(list_array);
408
409        assert_eq!(2, string_array.len());
410        assert_eq!(1, string_array.null_count());
411        assert!(string_array.is_null(0));
412        assert!(string_array.is_valid(1));
413        assert_eq!("Parquet", string_array.value(1));
414    }
415
416    #[test]
417    fn test_string_array_from_list_array() {
418        _test_generic_string_array_from_list_array::<i32>();
419    }
420
421    #[test]
422    fn test_large_string_array_from_list_array() {
423        _test_generic_string_array_from_list_array::<i64>();
424    }
425
426    fn _test_generic_string_array_from_list_array_with_child_nulls_failed<O: OffsetSizeTrait>() {
427        let values = b"HelloArrow";
428        let child_data = ArrayData::builder(DataType::UInt8)
429            .len(10)
430            .add_buffer(Buffer::from(&values[..]))
431            .null_bit_buffer(Some(Buffer::from_slice_ref([0b1010101010])))
432            .build()
433            .unwrap();
434
435        let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap());
436
437        // It is possible to create a null struct containing a non-nullable child
438        // see https://github.com/apache/arrow-rs/pull/3244 for details
439        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new(
440            "item",
441            DataType::UInt8,
442            true,
443        )));
444
445        // [None, Some(b"Parquet")]
446        let array_data = ArrayData::builder(data_type)
447            .len(2)
448            .add_buffer(Buffer::from_slice_ref(offsets))
449            .add_child_data(child_data)
450            .build()
451            .unwrap();
452        let list_array = GenericListArray::<O>::from(array_data);
453        drop(GenericStringArray::<O>::from(list_array));
454    }
455
456    #[test]
457    #[should_panic(expected = "The child array cannot contain null values.")]
458    fn test_string_array_from_list_array_with_child_nulls_failed() {
459        _test_generic_string_array_from_list_array_with_child_nulls_failed::<i32>();
460    }
461
462    #[test]
463    #[should_panic(expected = "The child array cannot contain null values.")]
464    fn test_large_string_array_from_list_array_with_child_nulls_failed() {
465        _test_generic_string_array_from_list_array_with_child_nulls_failed::<i64>();
466    }
467
468    fn _test_generic_string_array_from_list_array_wrong_type<O: OffsetSizeTrait>() {
469        let values = b"HelloArrow";
470        let child_data = ArrayData::builder(DataType::UInt16)
471            .len(5)
472            .add_buffer(Buffer::from(&values[..]))
473            .build()
474            .unwrap();
475
476        let offsets = [0, 2, 3].map(|n| O::from_usize(n).unwrap());
477        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new(
478            "item",
479            DataType::UInt16,
480            false,
481        )));
482
483        let array_data = ArrayData::builder(data_type)
484            .len(2)
485            .add_buffer(Buffer::from_slice_ref(offsets))
486            .add_child_data(child_data)
487            .build()
488            .unwrap();
489        let list_array = GenericListArray::<O>::from(array_data);
490        drop(GenericStringArray::<O>::from(list_array));
491    }
492
493    #[test]
494    #[should_panic(
495        expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
496    )]
497    fn test_string_array_from_list_array_wrong_type() {
498        _test_generic_string_array_from_list_array_wrong_type::<i32>();
499    }
500
501    #[test]
502    #[should_panic(
503        expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
504    )]
505    fn test_large_string_array_from_list_array_wrong_type() {
506        _test_generic_string_array_from_list_array_wrong_type::<i64>();
507    }
508
509    #[test]
510    #[should_panic(
511        expected = "Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 0"
512    )]
513    fn test_list_array_utf8_validation() {
514        let mut builder = ListBuilder::new(PrimitiveBuilder::<UInt8Type>::new());
515        builder.values().append_value(0xFF);
516        builder.append(true);
517        let list = builder.finish();
518        let _ = StringArray::from(list);
519    }
520
521    #[test]
522    fn test_empty_offsets() {
523        let string = StringArray::from(
524            ArrayData::builder(DataType::Utf8)
525                .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
526                .build()
527                .unwrap(),
528        );
529        assert_eq!(string.len(), 0);
530        assert_eq!(string.value_offsets(), &[0]);
531
532        let string = LargeStringArray::from(
533            ArrayData::builder(DataType::LargeUtf8)
534                .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
535                .build()
536                .unwrap(),
537        );
538        assert_eq!(string.len(), 0);
539        assert_eq!(string.value_offsets(), &[0]);
540    }
541
542    #[test]
543    fn test_into_builder() {
544        let array: StringArray = vec!["hello", "arrow"].into();
545
546        // Append values
547        let mut builder = array.into_builder().unwrap();
548
549        builder.append_value("rust");
550
551        let expected: StringArray = vec!["hello", "arrow", "rust"].into();
552        let array = builder.finish();
553        assert_eq!(expected, array);
554    }
555
556    #[test]
557    fn test_into_builder_err() {
558        let array: StringArray = vec!["hello", "arrow"].into();
559
560        // Clone it, so we cannot get a mutable builder back
561        let shared_array = array.clone();
562
563        let err_return = array.into_builder().unwrap_err();
564        assert_eq!(&err_return, &shared_array);
565    }
566}