arrow_cast/cast/
dictionary.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::cast::*;
19
20/// Attempts to cast an `ArrayDictionary` with index type K into
21/// `to_type` for supported types.
22///
23/// K is the key type
24pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
25    array: &dyn Array,
26    to_type: &DataType,
27    cast_options: &CastOptions,
28) -> Result<ArrayRef, ArrowError> {
29    use DataType::*;
30
31    match to_type {
32        Dictionary(to_index_type, to_value_type) => {
33            let dict_array = array
34                .as_any()
35                .downcast_ref::<DictionaryArray<K>>()
36                .ok_or_else(|| {
37                    ArrowError::ComputeError(
38                        "Internal Error: Cannot cast dictionary to DictionaryArray of expected type".to_string(),
39                    )
40                })?;
41
42            let keys_array: ArrayRef =
43                Arc::new(PrimitiveArray::<K>::from(dict_array.keys().to_data()));
44            let values_array = dict_array.values();
45            let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?;
46            let cast_values = cast_with_options(values_array, to_value_type, cast_options)?;
47
48            // Failure to cast keys (because they don't fit in the
49            // target type) results in NULL values;
50            if cast_keys.null_count() > keys_array.null_count() {
51                return Err(ArrowError::ComputeError(format!(
52                    "Could not convert {} dictionary indexes from {:?} to {:?}",
53                    cast_keys.null_count() - keys_array.null_count(),
54                    keys_array.data_type(),
55                    to_index_type
56                )));
57            }
58
59            let data = cast_keys.into_data();
60            let builder = data
61                .into_builder()
62                .data_type(to_type.clone())
63                .child_data(vec![cast_values.into_data()]);
64
65            // Safety
66            // Cast keys are still valid
67            let data = unsafe { builder.build_unchecked() };
68
69            // create the appropriate array type
70            let new_array: ArrayRef = match **to_index_type {
71                Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)),
72                Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)),
73                Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)),
74                Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)),
75                UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)),
76                UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)),
77                UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)),
78                UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)),
79                _ => {
80                    return Err(ArrowError::CastError(format!(
81                        "Unsupported type {to_index_type:?} for dictionary index"
82                    )));
83                }
84            };
85
86            Ok(new_array)
87        }
88        Utf8View => {
89            // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
90            // we handle it here to avoid the copy.
91            let dict_array = array
92                .as_dictionary::<K>()
93                .downcast_dict::<StringArray>()
94                .ok_or_else(|| {
95                    ArrowError::ComputeError(
96                        "Internal Error: Cannot cast Utf8View to StringArray of expected type"
97                            .to_string(),
98                    )
99                })?;
100
101            let string_view = view_from_dict_values::<K, StringViewType, GenericStringType<i32>>(
102                dict_array.values(),
103                dict_array.keys(),
104            )?;
105            Ok(Arc::new(string_view))
106        }
107        BinaryView => {
108            // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
109            // we handle it here to avoid the copy.
110            let dict_array = array
111                .as_dictionary::<K>()
112                .downcast_dict::<BinaryArray>()
113                .ok_or_else(|| {
114                    ArrowError::ComputeError(
115                        "Internal Error: Cannot cast BinaryView to BinaryArray of expected type"
116                            .to_string(),
117                    )
118                })?;
119
120            let binary_view = view_from_dict_values::<K, BinaryViewType, BinaryType>(
121                dict_array.values(),
122                dict_array.keys(),
123            )?;
124            Ok(Arc::new(binary_view))
125        }
126        _ => unpack_dictionary::<K>(array, to_type, cast_options),
127    }
128}
129
130fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V: ByteArrayType>(
131    array: &GenericByteArray<V>,
132    keys: &PrimitiveArray<K>,
133) -> Result<GenericByteViewArray<T>, ArrowError> {
134    let value_buffer = array.values();
135    let value_offsets = array.value_offsets();
136    let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
137    builder.append_block(value_buffer.clone());
138    for i in keys.iter() {
139        match i {
140            Some(v) => {
141                let idx = v.to_usize().ok_or_else(|| {
142                    ArrowError::ComputeError("Invalid dictionary index".to_string())
143                })?;
144
145                // Safety
146                // (1) The index is within bounds as they are offsets
147                // (2) The append_view is safe
148                unsafe {
149                    let offset = value_offsets.get_unchecked(idx).as_usize();
150                    let end = value_offsets.get_unchecked(idx + 1).as_usize();
151                    let length = end - offset;
152                    builder.append_view_unchecked(0, offset as u32, length as u32)
153                }
154            }
155            None => {
156                builder.append_null();
157            }
158        }
159    }
160    Ok(builder.finish())
161}
162
163// Unpack a dictionary where the keys are of type <K> into a flattened array of type to_type
164pub(crate) fn unpack_dictionary<K>(
165    array: &dyn Array,
166    to_type: &DataType,
167    cast_options: &CastOptions,
168) -> Result<ArrayRef, ArrowError>
169where
170    K: ArrowDictionaryKeyType,
171{
172    let dict_array = array.as_dictionary::<K>();
173    let cast_dict_values = cast_with_options(dict_array.values(), to_type, cast_options)?;
174    take(cast_dict_values.as_ref(), dict_array.keys(), None)
175}
176
177/// Pack a data type into a dictionary array passing the values through a primitive array
178pub(crate) fn pack_array_to_dictionary_via_primitive<K: ArrowDictionaryKeyType>(
179    array: &dyn Array,
180    primitive_type: DataType,
181    dict_value_type: &DataType,
182    cast_options: &CastOptions,
183) -> Result<ArrayRef, ArrowError> {
184    let primitive = cast_with_options(array, &primitive_type, cast_options)?;
185    let dict = cast_with_options(
186        primitive.as_ref(),
187        &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(primitive_type)),
188        cast_options,
189    )?;
190    cast_with_options(
191        dict.as_ref(),
192        &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(dict_value_type.clone())),
193        cast_options,
194    )
195}
196
197/// Attempts to encode an array into an `ArrayDictionary` with index
198/// type K and value (dictionary) type value_type
199///
200/// K is the key type
201pub(crate) fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
202    array: &dyn Array,
203    dict_value_type: &DataType,
204    cast_options: &CastOptions,
205) -> Result<ArrayRef, ArrowError> {
206    use DataType::*;
207
208    match *dict_value_type {
209        Int8 => pack_numeric_to_dictionary::<K, Int8Type>(array, dict_value_type, cast_options),
210        Int16 => pack_numeric_to_dictionary::<K, Int16Type>(array, dict_value_type, cast_options),
211        Int32 => pack_numeric_to_dictionary::<K, Int32Type>(array, dict_value_type, cast_options),
212        Int64 => pack_numeric_to_dictionary::<K, Int64Type>(array, dict_value_type, cast_options),
213        UInt8 => pack_numeric_to_dictionary::<K, UInt8Type>(array, dict_value_type, cast_options),
214        UInt16 => pack_numeric_to_dictionary::<K, UInt16Type>(array, dict_value_type, cast_options),
215        UInt32 => pack_numeric_to_dictionary::<K, UInt32Type>(array, dict_value_type, cast_options),
216        UInt64 => pack_numeric_to_dictionary::<K, UInt64Type>(array, dict_value_type, cast_options),
217        Decimal128(p, s) => {
218            let dict = pack_numeric_to_dictionary::<K, Decimal128Type>(
219                array,
220                dict_value_type,
221                cast_options,
222            )?;
223            let dict = dict
224                .as_dictionary::<K>()
225                .downcast_dict::<Decimal128Array>()
226                .ok_or_else(|| {
227                    ArrowError::ComputeError(
228                        "Internal Error: Cannot cast dict to Decimal128Array".to_string(),
229                    )
230                })?;
231            let value = dict.values().clone();
232            // Set correct precision/scale
233            let value = value.with_precision_and_scale(p, s)?;
234            Ok(Arc::new(DictionaryArray::<K>::try_new(
235                dict.keys().clone(),
236                Arc::new(value),
237            )?))
238        }
239        Decimal256(p, s) => {
240            let dict = pack_numeric_to_dictionary::<K, Decimal256Type>(
241                array,
242                dict_value_type,
243                cast_options,
244            )?;
245            let dict = dict
246                .as_dictionary::<K>()
247                .downcast_dict::<Decimal256Array>()
248                .ok_or_else(|| {
249                    ArrowError::ComputeError(
250                        "Internal Error: Cannot cast dict to Decimal256Array".to_string(),
251                    )
252                })?;
253            let value = dict.values().clone();
254            // Set correct precision/scale
255            let value = value.with_precision_and_scale(p, s)?;
256            Ok(Arc::new(DictionaryArray::<K>::try_new(
257                dict.keys().clone(),
258                Arc::new(value),
259            )?))
260        }
261        Float16 => {
262            pack_numeric_to_dictionary::<K, Float16Type>(array, dict_value_type, cast_options)
263        }
264        Float32 => {
265            pack_numeric_to_dictionary::<K, Float32Type>(array, dict_value_type, cast_options)
266        }
267        Float64 => {
268            pack_numeric_to_dictionary::<K, Float64Type>(array, dict_value_type, cast_options)
269        }
270        Date32 => pack_array_to_dictionary_via_primitive::<K>(
271            array,
272            DataType::Int32,
273            dict_value_type,
274            cast_options,
275        ),
276        Date64 => pack_array_to_dictionary_via_primitive::<K>(
277            array,
278            DataType::Int64,
279            dict_value_type,
280            cast_options,
281        ),
282        Time32(_) => pack_array_to_dictionary_via_primitive::<K>(
283            array,
284            DataType::Int32,
285            dict_value_type,
286            cast_options,
287        ),
288        Time64(_) => pack_array_to_dictionary_via_primitive::<K>(
289            array,
290            DataType::Int64,
291            dict_value_type,
292            cast_options,
293        ),
294        Timestamp(_, _) => pack_array_to_dictionary_via_primitive::<K>(
295            array,
296            DataType::Int64,
297            dict_value_type,
298            cast_options,
299        ),
300        Utf8 => {
301            // If the input is a view type, we can avoid casting (thus copying) the data
302            if array.data_type() == &DataType::Utf8View {
303                return string_view_to_dictionary::<K, i32>(array);
304            }
305            pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options)
306        }
307        LargeUtf8 => {
308            // If the input is a view type, we can avoid casting (thus copying) the data
309            if array.data_type() == &DataType::Utf8View {
310                return string_view_to_dictionary::<K, i64>(array);
311            }
312            pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options)
313        }
314        Binary => {
315            // If the input is a view type, we can avoid casting (thus copying) the data
316            if array.data_type() == &DataType::BinaryView {
317                return binary_view_to_dictionary::<K, i32>(array);
318            }
319            pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options)
320        }
321        LargeBinary => {
322            // If the input is a view type, we can avoid casting (thus copying) the data
323            if array.data_type() == &DataType::BinaryView {
324                return binary_view_to_dictionary::<K, i64>(array);
325            }
326            pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options)
327        }
328        _ => Err(ArrowError::CastError(format!(
329            "Unsupported output type for dictionary packing: {dict_value_type:?}"
330        ))),
331    }
332}
333
334// Packs the data from the primitive array of type <V> to a
335// DictionaryArray with keys of type K and values of value_type V
336pub(crate) fn pack_numeric_to_dictionary<K, V>(
337    array: &dyn Array,
338    dict_value_type: &DataType,
339    cast_options: &CastOptions,
340) -> Result<ArrayRef, ArrowError>
341where
342    K: ArrowDictionaryKeyType,
343    V: ArrowPrimitiveType,
344{
345    // attempt to cast the source array values to the target value type (the dictionary values type)
346    let cast_values = cast_with_options(array, dict_value_type, cast_options)?;
347    let values = cast_values.as_primitive::<V>();
348
349    let mut b = PrimitiveDictionaryBuilder::<K, V>::with_capacity(values.len(), values.len());
350
351    // copy each element one at a time
352    for i in 0..values.len() {
353        if values.is_null(i) {
354            b.append_null();
355        } else {
356            b.append(values.value(i))?;
357        }
358    }
359    Ok(Arc::new(b.finish()))
360}
361
362pub(crate) fn string_view_to_dictionary<K, O: OffsetSizeTrait>(
363    array: &dyn Array,
364) -> Result<ArrayRef, ArrowError>
365where
366    K: ArrowDictionaryKeyType,
367{
368    let mut b = GenericByteDictionaryBuilder::<K, GenericStringType<O>>::with_capacity(
369        array.len(),
370        1024,
371        1024,
372    );
373    let string_view = array
374        .as_any()
375        .downcast_ref::<StringViewArray>()
376        .ok_or_else(|| {
377            ArrowError::ComputeError("Internal Error: Cannot cast to StringViewArray".to_string())
378        })?;
379    for v in string_view.iter() {
380        match v {
381            Some(v) => {
382                b.append(v)?;
383            }
384            None => {
385                b.append_null();
386            }
387        }
388    }
389
390    Ok(Arc::new(b.finish()))
391}
392
393pub(crate) fn binary_view_to_dictionary<K, O: OffsetSizeTrait>(
394    array: &dyn Array,
395) -> Result<ArrayRef, ArrowError>
396where
397    K: ArrowDictionaryKeyType,
398{
399    let mut b = GenericByteDictionaryBuilder::<K, GenericBinaryType<O>>::with_capacity(
400        array.len(),
401        1024,
402        1024,
403    );
404    let binary_view = array
405        .as_any()
406        .downcast_ref::<BinaryViewArray>()
407        .ok_or_else(|| {
408            ArrowError::ComputeError("Internal Error: Cannot cast to BinaryViewArray".to_string())
409        })?;
410    for v in binary_view.iter() {
411        match v {
412            Some(v) => {
413                b.append(v)?;
414            }
415            None => {
416                b.append_null();
417            }
418        }
419    }
420
421    Ok(Arc::new(b.finish()))
422}
423
424// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
425// key types of K
426pub(crate) fn pack_byte_to_dictionary<K, T>(
427    array: &dyn Array,
428    cast_options: &CastOptions,
429) -> Result<ArrayRef, ArrowError>
430where
431    K: ArrowDictionaryKeyType,
432    T: ByteArrayType,
433{
434    let cast_values = cast_with_options(array, &T::DATA_TYPE, cast_options)?;
435    let values = cast_values
436        .as_any()
437        .downcast_ref::<GenericByteArray<T>>()
438        .ok_or_else(|| {
439            ArrowError::ComputeError("Internal Error: Cannot cast to GenericByteArray".to_string())
440        })?;
441    let mut b = GenericByteDictionaryBuilder::<K, T>::with_capacity(values.len(), 1024, 1024);
442
443    // copy each element one at a time
444    for i in 0..values.len() {
445        if values.is_null(i) {
446            b.append_null();
447        } else {
448            b.append(values.value(i))?;
449        }
450    }
451    Ok(Arc::new(b.finish()))
452}