1use crate::types::GenericStringType;
19use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait};
20use arrow_schema::{ArrowError, DataType};
21
/// A [`GenericByteArray`] specialised to [`GenericStringType`], i.e. an array of
/// string values whose offsets are stored as `OffsetSize`.
pub type GenericStringArray<OffsetSize> = GenericByteArray<GenericStringType<OffsetSize>>;
24
25impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
26 #[deprecated(note = "please use `Self::DATA_TYPE` instead")]
28 pub const fn get_data_type() -> DataType {
29 Self::DATA_TYPE
30 }
31
32 pub fn num_chars(&self, i: usize) -> usize {
38 self.value(i).chars().count()
39 }
40
41 pub fn take_iter<'a>(
43 &'a self,
44 indexes: impl Iterator<Item = Option<usize>> + 'a,
45 ) -> impl Iterator<Item = Option<&'a str>> {
46 indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
47 }
48
49 pub unsafe fn take_iter_unchecked<'a>(
54 &'a self,
55 indexes: impl Iterator<Item = Option<usize>> + 'a,
56 ) -> impl Iterator<Item = Option<&'a str>> {
57 indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index)))
58 }
59
60 pub fn try_from_binary(v: GenericBinaryArray<OffsetSize>) -> Result<Self, ArrowError> {
63 let (offsets, values, nulls) = v.into_parts();
64 Self::try_new(offsets, values, nulls)
65 }
66}
67
68impl<OffsetSize: OffsetSizeTrait> From<GenericListArray<OffsetSize>>
69 for GenericStringArray<OffsetSize>
70{
71 fn from(v: GenericListArray<OffsetSize>) -> Self {
72 GenericBinaryArray::<OffsetSize>::from(v).into()
73 }
74}
75
76impl<OffsetSize: OffsetSizeTrait> From<GenericBinaryArray<OffsetSize>>
77 for GenericStringArray<OffsetSize>
78{
79 fn from(v: GenericBinaryArray<OffsetSize>) -> Self {
80 Self::try_from_binary(v).unwrap()
81 }
82}
83
84impl<OffsetSize: OffsetSizeTrait> From<Vec<Option<&str>>> for GenericStringArray<OffsetSize> {
85 fn from(v: Vec<Option<&str>>) -> Self {
86 v.into_iter().collect()
87 }
88}
89
90impl<OffsetSize: OffsetSizeTrait> From<Vec<&str>> for GenericStringArray<OffsetSize> {
91 fn from(v: Vec<&str>) -> Self {
92 Self::from_iter_values(v)
93 }
94}
95
96impl<OffsetSize: OffsetSizeTrait> From<Vec<Option<String>>> for GenericStringArray<OffsetSize> {
97 fn from(v: Vec<Option<String>>) -> Self {
98 v.into_iter().collect()
99 }
100}
101
102impl<OffsetSize: OffsetSizeTrait> From<Vec<String>> for GenericStringArray<OffsetSize> {
103 fn from(v: Vec<String>) -> Self {
104 Self::from_iter_values(v)
105 }
106}
107
/// A [`GenericStringArray`] that uses `i32` offsets.
pub type StringArray = GenericStringArray<i32>;
134
/// A [`GenericStringArray`] that uses `i64` offsets.
pub type LargeStringArray = GenericStringArray<i64>;
161
162#[cfg(test)]
163mod tests {
164 use super::*;
165 use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder};
166 use crate::types::UInt8Type;
167 use crate::Array;
168 use arrow_buffer::Buffer;
169 use arrow_data::ArrayData;
170 use arrow_schema::Field;
171 use std::sync::Arc;
172
/// Builds a `StringArray` from ASCII, empty and multi-byte UTF-8 values and
/// checks length, null count, values, validity and byte/char counts.
#[test]
fn test_string_array_from_u8_slice() {
    let expected = ["hello", "", "A£ऀ𖼚𝌆৩ƐZ"];
    let string_array = StringArray::from(expected.to_vec());

    assert_eq!(3, string_array.len());
    assert_eq!(0, string_array.null_count());

    for (i, want) in expected.iter().enumerate() {
        assert_eq!(*want, string_array.value(i));
        // `i` is always below the array length of 3 asserted above.
        assert_eq!(*want, unsafe { string_array.value_unchecked(i) });
        assert!(string_array.is_valid(i));
        assert!(!string_array.is_null(i));
    }

    // UTF-8 byte widths of the last value: 1 + 2 + 3 + 4 + 4 + 3 + 2 + 1.
    assert_eq!(20, string_array.value_length(2));
    assert_eq!(8, string_array.num_chars(2));
}
197
/// Feeding the data of a `LargeStringArray` into `StringArray::from` must
/// panic with a data-type mismatch message.
#[test]
#[should_panic(expected = "StringArray expects DataType::Utf8")]
fn test_string_array_from_int() {
    let large = LargeStringArray::from(vec!["a", "b"]);
    let large_data = large.into_data();
    drop(StringArray::from(large_data));
}
204
/// Builds a `LargeStringArray` from ASCII, empty and multi-byte UTF-8 values
/// and checks length, null count, values, offsets, validity and byte/char counts.
#[test]
fn test_large_string_array_from_u8_slice() {
    let expected = ["hello", "", "A£ऀ𖼚𝌆৩ƐZ"];
    let string_array = LargeStringArray::from(expected.to_vec());

    assert_eq!(3, string_array.len());
    assert_eq!(0, string_array.null_count());

    for (i, want) in expected.iter().enumerate() {
        assert_eq!(*want, string_array.value(i));
        // `i` is always below the array length of 3 asserted above.
        assert_eq!(*want, unsafe { string_array.value_unchecked(i) });
        assert!(string_array.is_valid(i));
        assert!(!string_array.is_null(i));
    }

    // "hello" (5 bytes) followed by "" puts the third value at offset 5.
    assert_eq!(5, string_array.value_offsets()[2]);
    // UTF-8 byte widths of the last value: 1 + 2 + 3 + 4 + 4 + 3 + 2 + 1.
    assert_eq!(20, string_array.value_length(2));
    assert_eq!(8, string_array.num_chars(2));
}
230
/// Builds a list-of-strings array with two slots and checks that each slot
/// downcasts to a `StringArray` holding the expected values.
#[test]
fn test_nested_string_array() {
    let mut builder = ListBuilder::new(StringBuilder::with_capacity(3, 10));

    // First slot: ["foo", "bar"]
    for item in ["foo", "bar"] {
        builder.values().append_value(item);
    }
    builder.append(true);

    // Second slot: ["foobar"]
    builder.values().append_value("foobar");
    builder.append(true);

    let list_of_strings = builder.finish();
    assert_eq!(list_of_strings.len(), 2);

    let slot0 = list_of_strings.value(0);
    let strings0 = slot0.as_any().downcast_ref::<StringArray>().unwrap();
    assert_eq!(strings0.len(), 2);
    assert_eq!(strings0.value(0), "foo");
    assert_eq!(unsafe { strings0.value_unchecked(0) }, "foo");
    assert_eq!(strings0.value(1), "bar");
    assert_eq!(unsafe { strings0.value_unchecked(1) }, "bar");

    let slot1 = list_of_strings.value(1);
    let strings1 = slot1.as_any().downcast_ref::<StringArray>().unwrap();
    assert_eq!(strings1.len(), 1);
    assert_eq!(strings1.value(0), "foobar");
    assert_eq!(unsafe { strings1.value_unchecked(0) }, "foobar");
}
260
/// Reading index 4 of a three-element `StringArray` through `value` must panic.
#[test]
#[should_panic(
    expected = "Trying to access an element at index 4 from a StringArray of length 3"
)]
fn test_string_array_get_value_index_out_of_bound() {
    // Twelve bytes split by the offsets below into "hello", "" and "parquet".
    let values: [u8; 12] = *b"helloparquet";
    let offsets: [i32; 4] = [0, 5, 5, 12];

    let builder = ArrayData::builder(DataType::Utf8)
        .len(3)
        .add_buffer(Buffer::from_slice_ref(offsets))
        .add_buffer(Buffer::from_slice_ref(values));
    let array_data = builder.build().unwrap();

    let string_array = StringArray::from(array_data);
    string_array.value(4);
}
279
/// Checks the `Debug` rendering of a two-element `StringArray`.
#[test]
fn test_string_array_fmt_debug() {
    let arr = StringArray::from(vec!["hello", "arrow"]);
    let rendered = format!("{arr:?}");
    assert_eq!("StringArray\n[\n \"hello\",\n \"arrow\",\n]", rendered);
}
288
/// Checks the `Debug` rendering of a two-element `LargeStringArray`.
#[test]
fn test_large_string_array_fmt_debug() {
    let arr = LargeStringArray::from(vec!["hello", "arrow"]);
    let rendered = format!("{arr:?}");
    assert_eq!("LargeStringArray\n[\n \"hello\",\n \"arrow\",\n]", rendered);
}
297
/// Four construction routes over the same optional values must all yield
/// equal arrays.
#[test]
fn test_string_array_from_iter() {
    let data = [Some("hello"), None, Some("arrow")];
    let data_vec = data.to_vec();

    // From a `Vec<Option<&str>>`.
    let from_vec = StringArray::from(data_vec.clone());
    // From an iterator of `Option<&str>`.
    let from_str_iter: StringArray = data_vec.clone().into_iter().collect();
    // From an iterator of `Option<String>`.
    let from_string_iter: StringArray = data_vec
        .into_iter()
        .map(|item| item.map(str::to_string))
        .collect();
    // From an iterator of `&Option<&str>`.
    let from_ref_iter: StringArray = data.iter().collect();

    assert_eq!(from_vec, from_str_iter);
    assert_eq!(from_str_iter, from_string_iter);
    assert_eq!(from_string_iter, from_ref_iter);
}
318
/// `from_iter_values` accepts iterators over both `&str` and `String` items.
#[test]
fn test_string_array_from_iter_values() {
    let borrowed = ["hello", "hello2"];
    let from_borrowed = StringArray::from_iter_values(borrowed.iter());
    assert_eq!(from_borrowed.value(0), "hello");
    assert_eq!(from_borrowed.value(1), "hello2");

    let owned = ["goodbye", "goodbye2"].map(String::from);
    let from_owned = StringArray::from_iter_values(owned.iter());
    assert_eq!(from_owned.value(0), "goodbye");
    assert_eq!(from_owned.value(1), "goodbye2");
}
334
/// Collecting from an iterator whose upper size hint (100) overstates the
/// number of items it really yields (10) must produce an array of length 10.
#[test]
fn test_string_array_from_unbound_iter() {
    let string_iter = (0..)
        .scan(0usize, |emitted, i| {
            // Stop after ten items, regardless of the `take(100)` below.
            if *emitted >= 10 {
                return None;
            }
            *emitted += 1;
            Some(Some(format!("value {i}")))
        })
        .take(100);

    let (_, upper_size_bound) = string_iter.size_hint();
    assert_eq!(upper_size_bound, Some(100));

    let string_array: StringArray = string_iter.collect();
    assert_eq!(string_array.len(), 10);
}
358
/// A `StringArray` holding a single null must pass full data validation.
#[test]
fn test_string_array_all_null() {
    let array = StringArray::from(vec![None::<&str>]);
    let array_data = array.into_data();
    array_data
        .validate_full()
        .expect("All null array has valid array data");
}
368
/// A `LargeStringArray` holding a single null must pass full data validation.
#[test]
fn test_large_string_array_all_null() {
    let array = LargeStringArray::from(vec![None::<&str>]);
    let array_data = array.into_data();
    array_data
        .validate_full()
        .expect("All null array has valid array data");
}
378
379 fn _test_generic_string_array_from_list_array<O: OffsetSizeTrait>() {
380 let values = b"HelloArrowAndParquet";
381 let child_data = ArrayData::builder(DataType::UInt8)
383 .len(15)
384 .offset(5)
385 .add_buffer(Buffer::from(&values[..]))
386 .build()
387 .unwrap();
388
389 let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap());
390 let null_buffer = Buffer::from_slice_ref([0b101]);
391 let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new(
392 "item",
393 DataType::UInt8,
394 false,
395 )));
396
397 let array_data = ArrayData::builder(data_type)
399 .len(2)
400 .offset(1)
401 .add_buffer(Buffer::from_slice_ref(offsets))
402 .null_bit_buffer(Some(null_buffer))
403 .add_child_data(child_data)
404 .build()
405 .unwrap();
406 let list_array = GenericListArray::<O>::from(array_data);
407 let string_array = GenericStringArray::<O>::from(list_array);
408
409 assert_eq!(2, string_array.len());
410 assert_eq!(1, string_array.null_count());
411 assert!(string_array.is_null(0));
412 assert!(string_array.is_valid(1));
413 assert_eq!("Parquet", string_array.value(1));
414 }
415
/// Runs the list-array conversion check with `i32` offsets.
#[test]
fn test_string_array_from_list_array() {
    _test_generic_string_array_from_list_array::<i32>()
}
420
/// Runs the list-array conversion check with `i64` offsets.
#[test]
fn test_large_string_array_from_list_array() {
    _test_generic_string_array_from_list_array::<i64>()
}
425
426 fn _test_generic_string_array_from_list_array_with_child_nulls_failed<O: OffsetSizeTrait>() {
427 let values = b"HelloArrow";
428 let child_data = ArrayData::builder(DataType::UInt8)
429 .len(10)
430 .add_buffer(Buffer::from(&values[..]))
431 .null_bit_buffer(Some(Buffer::from_slice_ref([0b1010101010])))
432 .build()
433 .unwrap();
434
435 let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap());
436
437 let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new(
440 "item",
441 DataType::UInt8,
442 true,
443 )));
444
445 let array_data = ArrayData::builder(data_type)
447 .len(2)
448 .add_buffer(Buffer::from_slice_ref(offsets))
449 .add_child_data(child_data)
450 .build()
451 .unwrap();
452 let list_array = GenericListArray::<O>::from(array_data);
453 drop(GenericStringArray::<O>::from(list_array));
454 }
455
/// With `i32` offsets, converting a list whose child has nulls must panic.
#[test]
#[should_panic(expected = "The child array cannot contain null values.")]
fn test_string_array_from_list_array_with_child_nulls_failed() {
    _test_generic_string_array_from_list_array_with_child_nulls_failed::<i32>()
}
461
/// With `i64` offsets, converting a list whose child has nulls must panic.
#[test]
#[should_panic(expected = "The child array cannot contain null values.")]
fn test_large_string_array_from_list_array_with_child_nulls_failed() {
    _test_generic_string_array_from_list_array_with_child_nulls_failed::<i64>()
}
467
468 fn _test_generic_string_array_from_list_array_wrong_type<O: OffsetSizeTrait>() {
469 let values = b"HelloArrow";
470 let child_data = ArrayData::builder(DataType::UInt16)
471 .len(5)
472 .add_buffer(Buffer::from(&values[..]))
473 .build()
474 .unwrap();
475
476 let offsets = [0, 2, 3].map(|n| O::from_usize(n).unwrap());
477 let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new(
478 "item",
479 DataType::UInt16,
480 false,
481 )));
482
483 let array_data = ArrayData::builder(data_type)
484 .len(2)
485 .add_buffer(Buffer::from_slice_ref(offsets))
486 .add_child_data(child_data)
487 .build()
488 .unwrap();
489 let list_array = GenericListArray::<O>::from(array_data);
490 drop(GenericStringArray::<O>::from(list_array));
491 }
492
/// With `i32` offsets, converting a `List<UInt16>` array must panic.
#[test]
#[should_panic(
    expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
)]
fn test_string_array_from_list_array_wrong_type() {
    _test_generic_string_array_from_list_array_wrong_type::<i32>()
}
500
/// With `i64` offsets, converting a `List<UInt16>` array must panic.
#[test]
#[should_panic(
    expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
)]
fn test_large_string_array_from_list_array_wrong_type() {
    _test_generic_string_array_from_list_array_wrong_type::<i64>()
}
508
/// Converting a list holding the single byte `0xFF` into a `StringArray` must
/// panic with a non-UTF-8 error.
#[test]
#[should_panic(
    expected = "Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 0"
)]
fn test_list_array_utf8_validation() {
    let mut list_builder = ListBuilder::new(PrimitiveBuilder::<UInt8Type>::new());
    list_builder.values().append_value(0xFF);
    list_builder.append(true);

    let invalid_list = list_builder.finish();
    drop(StringArray::from(invalid_list));
}
520
/// Arrays built from two empty buffers must report length 0 and expose a
/// single zero offset, for both the `Utf8` and `LargeUtf8` data types.
#[test]
fn test_empty_offsets() {
    let utf8_data = ArrayData::builder(DataType::Utf8)
        .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
        .build()
        .unwrap();
    let string = StringArray::from(utf8_data);
    assert_eq!(string.len(), 0);
    assert_eq!(string.value_offsets(), &[0]);

    let large_utf8_data = ArrayData::builder(DataType::LargeUtf8)
        .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
        .build()
        .unwrap();
    let string = LargeStringArray::from(large_utf8_data);
    assert_eq!(string.len(), 0);
    assert_eq!(string.value_offsets(), &[0]);
}
541
/// Turning an array back into a builder, appending one value and finishing
/// must yield the original values plus the appended one.
#[test]
fn test_into_builder() {
    let original = StringArray::from(vec!["hello", "arrow"]);

    let mut builder = original.into_builder().unwrap();
    builder.append_value("rust");
    let rebuilt = builder.finish();

    let expected = StringArray::from(vec!["hello", "arrow", "rust"]);
    assert_eq!(expected, rebuilt);
}
555
/// `into_builder` must fail while a clone of the array is still alive, and the
/// error value must equal the original array.
#[test]
fn test_into_builder_err() {
    let array = StringArray::from(vec!["hello", "arrow"]);

    // Keep a second handle alive — presumably it shares the underlying
    // buffers, which is what makes the conversion fail.
    let shared_array = array.clone();

    let returned = array.into_builder().unwrap_err();
    assert_eq!(&returned, &shared_array);
}
566}