arrow_cast/cast/
string.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::cast::*;
19use arrow_buffer::NullBuffer;
20
21pub(crate) fn value_to_string<O: OffsetSizeTrait>(
22    array: &dyn Array,
23    options: &CastOptions,
24) -> Result<ArrayRef, ArrowError> {
25    let mut builder = GenericStringBuilder::<O>::new();
26    let formatter = ArrayFormatter::try_new(array, &options.format_options)?;
27    let nulls = array.nulls();
28    for i in 0..array.len() {
29        match nulls.map(|x| x.is_null(i)).unwrap_or_default() {
30            true => builder.append_null(),
31            false => {
32                formatter.value(i).write(&mut builder)?;
33                // tell the builder the row is finished
34                builder.append_value("");
35            }
36        }
37    }
38    Ok(Arc::new(builder.finish()))
39}
40
41/// Parse UTF-8
42pub(crate) fn parse_string<P: Parser, O: OffsetSizeTrait>(
43    array: &dyn Array,
44    cast_options: &CastOptions,
45) -> Result<ArrayRef, ArrowError> {
46    let string_array = array.as_string::<O>();
47    parse_string_iter::<P, _, _>(string_array.iter(), cast_options, || {
48        string_array.nulls().cloned()
49    })
50}
51
52/// Parse UTF-8 View
53pub(crate) fn parse_string_view<P: Parser>(
54    array: &dyn Array,
55    cast_options: &CastOptions,
56) -> Result<ArrayRef, ArrowError> {
57    let string_view_array = array.as_string_view();
58    parse_string_iter::<P, _, _>(string_view_array.iter(), cast_options, || {
59        string_view_array.nulls().cloned()
60    })
61}
62
63fn parse_string_iter<
64    'a,
65    P: Parser,
66    I: Iterator<Item = Option<&'a str>>,
67    F: FnOnce() -> Option<NullBuffer>,
68>(
69    iter: I,
70    cast_options: &CastOptions,
71    nulls: F,
72) -> Result<ArrayRef, ArrowError> {
73    let array = if cast_options.safe {
74        let iter = iter.map(|x| x.and_then(P::parse));
75
76        // Benefit:
77        //     20% performance improvement
78        // Soundness:
79        //     The iterator is trustedLen because it comes from an `StringArray`.
80        unsafe { PrimitiveArray::<P>::from_trusted_len_iter(iter) }
81    } else {
82        let v = iter
83            .map(|x| match x {
84                Some(v) => P::parse(v).ok_or_else(|| {
85                    ArrowError::CastError(format!(
86                        "Cannot cast string '{}' to value of {:?} type",
87                        v,
88                        P::DATA_TYPE
89                    ))
90                }),
91                None => Ok(P::Native::default()),
92            })
93            .collect::<Result<Vec<_>, ArrowError>>()?;
94        PrimitiveArray::new(v.into(), nulls())
95    };
96
97    Ok(Arc::new(array) as ArrayRef)
98}
99
100/// Casts generic string arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.)
101pub(crate) fn cast_string_to_timestamp<O: OffsetSizeTrait, T: ArrowTimestampType>(
102    array: &dyn Array,
103    to_tz: &Option<Arc<str>>,
104    cast_options: &CastOptions,
105) -> Result<ArrayRef, ArrowError> {
106    let array = array.as_string::<O>();
107    let out: PrimitiveArray<T> = match to_tz {
108        Some(tz) => {
109            let tz: Tz = tz.as_ref().parse()?;
110            cast_string_to_timestamp_impl(array.iter(), &tz, cast_options)?
111        }
112        None => cast_string_to_timestamp_impl(array.iter(), &Utc, cast_options)?,
113    };
114    Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
115}
116
117/// Casts string view arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.)
118pub(crate) fn cast_view_to_timestamp<T: ArrowTimestampType>(
119    array: &dyn Array,
120    to_tz: &Option<Arc<str>>,
121    cast_options: &CastOptions,
122) -> Result<ArrayRef, ArrowError> {
123    let array = array.as_string_view();
124    let out: PrimitiveArray<T> = match to_tz {
125        Some(tz) => {
126            let tz: Tz = tz.as_ref().parse()?;
127            cast_string_to_timestamp_impl(array.iter(), &tz, cast_options)?
128        }
129        None => cast_string_to_timestamp_impl(array.iter(), &Utc, cast_options)?,
130    };
131    Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
132}
133
134fn cast_string_to_timestamp_impl<
135    'a,
136    I: Iterator<Item = Option<&'a str>>,
137    T: ArrowTimestampType,
138    Tz: TimeZone,
139>(
140    iter: I,
141    tz: &Tz,
142    cast_options: &CastOptions,
143) -> Result<PrimitiveArray<T>, ArrowError> {
144    if cast_options.safe {
145        let iter = iter.map(|v| {
146            v.and_then(|v| {
147                let naive = string_to_datetime(tz, v).ok()?.naive_utc();
148                T::make_value(naive)
149            })
150        });
151        // Benefit:
152        //     20% performance improvement
153        // Soundness:
154        //     The iterator is trustedLen because it comes from an `StringArray`.
155
156        Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) })
157    } else {
158        let vec = iter
159            .map(|v| {
160                v.map(|v| {
161                    let naive = string_to_datetime(tz, v)?.naive_utc();
162                    T::make_value(naive).ok_or_else(|| match T::UNIT {
163                        TimeUnit::Nanosecond => ArrowError::CastError(format!(
164                            "Overflow converting {naive} to Nanosecond. The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"
165                        )),
166                        _ => ArrowError::CastError(format!(
167                            "Overflow converting {naive} to {:?}",
168                            T::UNIT
169                        ))
170                    })
171                })
172                    .transpose()
173            })
174            .collect::<Result<Vec<Option<i64>>, _>>()?;
175
176        // Benefit:
177        //     20% performance improvement
178        // Soundness:
179        //     The iterator is trustedLen because it comes from an `StringArray`.
180        Ok(unsafe { PrimitiveArray::from_trusted_len_iter(vec.iter()) })
181    }
182}
183
184pub(crate) fn cast_string_to_interval<Offset, F, ArrowType>(
185    array: &dyn Array,
186    cast_options: &CastOptions,
187    parse_function: F,
188) -> Result<ArrayRef, ArrowError>
189where
190    Offset: OffsetSizeTrait,
191    ArrowType: ArrowPrimitiveType,
192    F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
193{
194    let string_array = array
195        .as_any()
196        .downcast_ref::<GenericStringArray<Offset>>()
197        .unwrap();
198    cast_string_to_interval_impl::<_, ArrowType, F>(
199        string_array.iter(),
200        cast_options,
201        parse_function,
202    )
203}
204
205pub(crate) fn cast_string_to_year_month_interval<Offset: OffsetSizeTrait>(
206    array: &dyn Array,
207    cast_options: &CastOptions,
208) -> Result<ArrayRef, ArrowError> {
209    cast_string_to_interval::<Offset, _, IntervalYearMonthType>(
210        array,
211        cast_options,
212        parse_interval_year_month,
213    )
214}
215
216pub(crate) fn cast_string_to_day_time_interval<Offset: OffsetSizeTrait>(
217    array: &dyn Array,
218    cast_options: &CastOptions,
219) -> Result<ArrayRef, ArrowError> {
220    cast_string_to_interval::<Offset, _, IntervalDayTimeType>(
221        array,
222        cast_options,
223        parse_interval_day_time,
224    )
225}
226
227pub(crate) fn cast_string_to_month_day_nano_interval<Offset: OffsetSizeTrait>(
228    array: &dyn Array,
229    cast_options: &CastOptions,
230) -> Result<ArrayRef, ArrowError> {
231    cast_string_to_interval::<Offset, _, IntervalMonthDayNanoType>(
232        array,
233        cast_options,
234        parse_interval_month_day_nano,
235    )
236}
237
238pub(crate) fn cast_view_to_interval<F, ArrowType>(
239    array: &dyn Array,
240    cast_options: &CastOptions,
241    parse_function: F,
242) -> Result<ArrayRef, ArrowError>
243where
244    ArrowType: ArrowPrimitiveType,
245    F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
246{
247    let string_view_array = array.as_any().downcast_ref::<StringViewArray>().unwrap();
248    cast_string_to_interval_impl::<_, ArrowType, F>(
249        string_view_array.iter(),
250        cast_options,
251        parse_function,
252    )
253}
254
255pub(crate) fn cast_view_to_year_month_interval(
256    array: &dyn Array,
257    cast_options: &CastOptions,
258) -> Result<ArrayRef, ArrowError> {
259    cast_view_to_interval::<_, IntervalYearMonthType>(
260        array,
261        cast_options,
262        parse_interval_year_month,
263    )
264}
265
266pub(crate) fn cast_view_to_day_time_interval(
267    array: &dyn Array,
268    cast_options: &CastOptions,
269) -> Result<ArrayRef, ArrowError> {
270    cast_view_to_interval::<_, IntervalDayTimeType>(array, cast_options, parse_interval_day_time)
271}
272
273pub(crate) fn cast_view_to_month_day_nano_interval(
274    array: &dyn Array,
275    cast_options: &CastOptions,
276) -> Result<ArrayRef, ArrowError> {
277    cast_view_to_interval::<_, IntervalMonthDayNanoType>(
278        array,
279        cast_options,
280        parse_interval_month_day_nano,
281    )
282}
283
284fn cast_string_to_interval_impl<'a, I, ArrowType, F>(
285    iter: I,
286    cast_options: &CastOptions,
287    parse_function: F,
288) -> Result<ArrayRef, ArrowError>
289where
290    I: Iterator<Item = Option<&'a str>>,
291    ArrowType: ArrowPrimitiveType,
292    F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
293{
294    let interval_array = if cast_options.safe {
295        let iter = iter.map(|v| v.and_then(|v| parse_function(v).ok()));
296
297        // Benefit:
298        //     20% performance improvement
299        // Soundness:
300        //     The iterator is trustedLen because it comes from an `StringArray`.
301        unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(iter) }
302    } else {
303        let vec = iter
304            .map(|v| v.map(parse_function).transpose())
305            .collect::<Result<Vec<_>, ArrowError>>()?;
306
307        // Benefit:
308        //     20% performance improvement
309        // Soundness:
310        //     The iterator is trustedLen because it comes from an `StringArray`.
311        unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(vec) }
312    };
313    Ok(Arc::new(interval_array) as ArrayRef)
314}
315
316/// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same
317/// offset size so re-encoding offset is unnecessary.
318pub(crate) fn cast_binary_to_string<O: OffsetSizeTrait>(
319    array: &dyn Array,
320    cast_options: &CastOptions,
321) -> Result<ArrayRef, ArrowError> {
322    let array = array
323        .as_any()
324        .downcast_ref::<GenericByteArray<GenericBinaryType<O>>>()
325        .unwrap();
326
327    match GenericStringArray::<O>::try_from_binary(array.clone()) {
328        Ok(a) => Ok(Arc::new(a)),
329        Err(e) => match cast_options.safe {
330            true => {
331                // Fallback to slow method to convert invalid sequences to nulls
332                let mut builder =
333                    GenericStringBuilder::<O>::with_capacity(array.len(), array.value_data().len());
334
335                let iter = array
336                    .iter()
337                    .map(|v| v.and_then(|v| std::str::from_utf8(v).ok()));
338
339                builder.extend(iter);
340                Ok(Arc::new(builder.finish()))
341            }
342            false => Err(e),
343        },
344    }
345}
346
347/// Casts Utf8 to Boolean
348pub(crate) fn cast_utf8_to_boolean<OffsetSize>(
349    from: &dyn Array,
350    cast_options: &CastOptions,
351) -> Result<ArrayRef, ArrowError>
352where
353    OffsetSize: OffsetSizeTrait,
354{
355    let array = from
356        .as_any()
357        .downcast_ref::<GenericStringArray<OffsetSize>>()
358        .unwrap();
359
360    let output_array = array
361        .iter()
362        .map(|value| match value {
363            Some(value) => match value.to_ascii_lowercase().trim() {
364                "t" | "tr" | "tru" | "true" | "y" | "ye" | "yes" | "on" | "1" => Ok(Some(true)),
365                "f" | "fa" | "fal" | "fals" | "false" | "n" | "no" | "of" | "off" | "0" => {
366                    Ok(Some(false))
367                }
368                invalid_value => match cast_options.safe {
369                    true => Ok(None),
370                    false => Err(ArrowError::CastError(format!(
371                        "Cannot cast value '{invalid_value}' to value of Boolean type",
372                    ))),
373                },
374            },
375            None => Ok(None),
376        })
377        .collect::<Result<BooleanArray, _>>()?;
378
379    Ok(Arc::new(output_array))
380}