arrow_array/builder/
primitive_dictionary_builder.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::builder::{ArrayBuilder, PrimitiveBuilder};
19use crate::types::ArrowDictionaryKeyType;
20use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray};
21use arrow_buffer::{ArrowNativeType, ToByteSlice};
22use arrow_schema::{ArrowError, DataType};
23use std::any::Any;
24use std::collections::HashMap;
25use std::sync::Arc;
26
27/// Wraps a type implementing `ToByteSlice` implementing `Hash` and `Eq` for it
28///
29/// This is necessary to handle types such as f32, which don't natively implement these
30#[derive(Debug)]
31struct Value<T>(T);
32
33impl<T: ToByteSlice> std::hash::Hash for Value<T> {
34    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
35        self.0.to_byte_slice().hash(state)
36    }
37}
38
39impl<T: ToByteSlice> PartialEq for Value<T> {
40    fn eq(&self, other: &Self) -> bool {
41        self.0.to_byte_slice().eq(other.0.to_byte_slice())
42    }
43}
44
45impl<T: ToByteSlice> Eq for Value<T> {}
46
47/// Builder for [`DictionaryArray`] of [`PrimitiveArray`](crate::array::PrimitiveArray)
48///
49/// # Example:
50///
51/// ```
52///
53/// # use arrow_array::builder::PrimitiveDictionaryBuilder;
54/// # use arrow_array::types::{UInt32Type, UInt8Type};
55/// # use arrow_array::{Array, UInt32Array, UInt8Array};
56///
57/// let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::new();
58///  builder.append(12345678).unwrap();
59///  builder.append_null();
60///  builder.append(22345678).unwrap();
61///  let array = builder.finish();
62///
63///  assert_eq!(
64///      array.keys(),
65///      &UInt8Array::from(vec![Some(0), None, Some(1)])
66///  );
67///
68///  // Values are polymorphic and so require a downcast.
69///  let av = array.values();
70///  let ava: &UInt32Array = av.as_any().downcast_ref::<UInt32Array>().unwrap();
71///  let avs: &[u32] = ava.values();
72///
73///  assert!(!array.is_null(0));
74///  assert!(array.is_null(1));
75///  assert!(!array.is_null(2));
76///
77///  assert_eq!(avs, &[12345678, 22345678]);
78/// ```
79#[derive(Debug)]
80pub struct PrimitiveDictionaryBuilder<K, V>
81where
82    K: ArrowPrimitiveType,
83    V: ArrowPrimitiveType,
84{
85    keys_builder: PrimitiveBuilder<K>,
86    values_builder: PrimitiveBuilder<V>,
87    map: HashMap<Value<V::Native>, usize>,
88}
89
90impl<K, V> Default for PrimitiveDictionaryBuilder<K, V>
91where
92    K: ArrowPrimitiveType,
93    V: ArrowPrimitiveType,
94{
95    fn default() -> Self {
96        Self::new()
97    }
98}
99
100impl<K, V> PrimitiveDictionaryBuilder<K, V>
101where
102    K: ArrowPrimitiveType,
103    V: ArrowPrimitiveType,
104{
105    /// Creates a new `PrimitiveDictionaryBuilder`.
106    pub fn new() -> Self {
107        Self {
108            keys_builder: PrimitiveBuilder::new(),
109            values_builder: PrimitiveBuilder::new(),
110            map: HashMap::new(),
111        }
112    }
113
114    /// Creates a new `PrimitiveDictionaryBuilder` from the provided keys and values builders.
115    ///
116    /// # Panics
117    ///
118    /// This method panics if `keys_builder` or `values_builder` is not empty.
119    pub fn new_from_empty_builders(
120        keys_builder: PrimitiveBuilder<K>,
121        values_builder: PrimitiveBuilder<V>,
122    ) -> Self {
123        assert!(
124            keys_builder.is_empty() && values_builder.is_empty(),
125            "keys and values builders must be empty"
126        );
127        Self {
128            keys_builder,
129            values_builder,
130            map: HashMap::new(),
131        }
132    }
133
134    /// Creates a new `PrimitiveDictionaryBuilder` from existing `PrimitiveBuilder`s of keys and values.
135    ///
136    /// # Safety
137    ///
138    /// caller must ensure that the passed in builders are valid for DictionaryArray.
139    pub unsafe fn new_from_builders(
140        keys_builder: PrimitiveBuilder<K>,
141        values_builder: PrimitiveBuilder<V>,
142    ) -> Self {
143        let keys = keys_builder.values_slice();
144        let values = values_builder.values_slice();
145        let mut map = HashMap::with_capacity(values.len());
146
147        keys.iter().zip(values.iter()).for_each(|(key, value)| {
148            map.insert(Value(*value), K::Native::to_usize(*key).unwrap());
149        });
150
151        Self {
152            keys_builder,
153            values_builder,
154            map,
155        }
156    }
157
158    /// Creates a new `PrimitiveDictionaryBuilder` with the provided capacities
159    ///
160    /// `keys_capacity`: the number of keys, i.e. length of array to build
161    /// `values_capacity`: the number of distinct dictionary values, i.e. size of dictionary
162    pub fn with_capacity(keys_capacity: usize, values_capacity: usize) -> Self {
163        Self {
164            keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
165            values_builder: PrimitiveBuilder::with_capacity(values_capacity),
166            map: HashMap::with_capacity(values_capacity),
167        }
168    }
169}
170
171impl<K, V> ArrayBuilder for PrimitiveDictionaryBuilder<K, V>
172where
173    K: ArrowDictionaryKeyType,
174    V: ArrowPrimitiveType,
175{
176    /// Returns the builder as an non-mutable `Any` reference.
177    fn as_any(&self) -> &dyn Any {
178        self
179    }
180
181    /// Returns the builder as an mutable `Any` reference.
182    fn as_any_mut(&mut self) -> &mut dyn Any {
183        self
184    }
185
186    /// Returns the boxed builder as a box of `Any`.
187    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
188        self
189    }
190
191    /// Returns the number of array slots in the builder
192    fn len(&self) -> usize {
193        self.keys_builder.len()
194    }
195
196    /// Builds the array and reset this builder.
197    fn finish(&mut self) -> ArrayRef {
198        Arc::new(self.finish())
199    }
200
201    /// Builds the array without resetting the builder.
202    fn finish_cloned(&self) -> ArrayRef {
203        Arc::new(self.finish_cloned())
204    }
205}
206
207impl<K, V> PrimitiveDictionaryBuilder<K, V>
208where
209    K: ArrowDictionaryKeyType,
210    V: ArrowPrimitiveType,
211{
212    #[inline]
213    fn get_or_insert_key(&mut self, value: V::Native) -> Result<K::Native, ArrowError> {
214        match self.map.get(&Value(value)) {
215            Some(&key) => {
216                Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?)
217            }
218            None => {
219                let key = self.values_builder.len();
220                self.values_builder.append_value(value);
221                self.map.insert(Value(value), key);
222                Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?)
223            }
224        }
225    }
226
227    /// Append a primitive value to the array. Return an existing index
228    /// if already present in the values array or a new index if the
229    /// value is appended to the values array.
230    #[inline]
231    pub fn append(&mut self, value: V::Native) -> Result<K::Native, ArrowError> {
232        let key = self.get_or_insert_key(value)?;
233        self.keys_builder.append_value(key);
234        Ok(key)
235    }
236
237    /// Append a value multiple times to the array.
238    /// This is the same as `append` but allows to append the same value multiple times without doing multiple lookups.
239    ///
240    /// Returns an error if the new index would overflow the key type.
241    pub fn append_n(&mut self, value: V::Native, count: usize) -> Result<K::Native, ArrowError> {
242        let key = self.get_or_insert_key(value)?;
243        self.keys_builder.append_value_n(key, count);
244        Ok(key)
245    }
246
247    /// Infallibly append a value to this builder
248    ///
249    /// # Panics
250    ///
251    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
252    #[inline]
253    pub fn append_value(&mut self, value: V::Native) {
254        self.append(value).expect("dictionary key overflow");
255    }
256
257    /// Infallibly append a value to this builder repeatedly `count` times.
258    /// This is the same as `append_value` but allows to append the same value multiple times without doing multiple lookups.
259    ///
260    /// # Panics
261    ///
262    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
263    pub fn append_values(&mut self, value: V::Native, count: usize) {
264        self.append_n(value, count)
265            .expect("dictionary key overflow");
266    }
267
268    /// Appends a null slot into the builder
269    #[inline]
270    pub fn append_null(&mut self) {
271        self.keys_builder.append_null()
272    }
273
274    /// Append `n` null slots into the builder
275    #[inline]
276    pub fn append_nulls(&mut self, n: usize) {
277        self.keys_builder.append_nulls(n)
278    }
279
280    /// Append an `Option` value into the builder
281    ///
282    /// # Panics
283    ///
284    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
285    #[inline]
286    pub fn append_option(&mut self, value: Option<V::Native>) {
287        match value {
288            None => self.append_null(),
289            Some(v) => self.append_value(v),
290        };
291    }
292
293    /// Append an `Option` value into the builder repeatedly `count` times.
294    /// This is the same as `append_option` but allows to append the same value multiple times without doing multiple lookups.
295    ///
296    /// # Panics
297    ///
298    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
299    pub fn append_options(&mut self, value: Option<V::Native>, count: usize) {
300        match value {
301            None => self.keys_builder.append_nulls(count),
302            Some(v) => self.append_values(v, count),
303        };
304    }
305
306    /// Builds the `DictionaryArray` and reset this builder.
307    pub fn finish(&mut self) -> DictionaryArray<K> {
308        self.map.clear();
309        let values = self.values_builder.finish();
310        let keys = self.keys_builder.finish();
311
312        let data_type =
313            DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
314
315        let builder = keys
316            .into_data()
317            .into_builder()
318            .data_type(data_type)
319            .child_data(vec![values.into_data()]);
320
321        DictionaryArray::from(unsafe { builder.build_unchecked() })
322    }
323
324    /// Builds the `DictionaryArray` without resetting the builder.
325    pub fn finish_cloned(&self) -> DictionaryArray<K> {
326        let values = self.values_builder.finish_cloned();
327        let keys = self.keys_builder.finish_cloned();
328
329        let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(V::DATA_TYPE));
330
331        let builder = keys
332            .into_data()
333            .into_builder()
334            .data_type(data_type)
335            .child_data(vec![values.into_data()]);
336
337        DictionaryArray::from(unsafe { builder.build_unchecked() })
338    }
339
340    /// Returns the current dictionary values buffer as a slice
341    pub fn values_slice(&self) -> &[V::Native] {
342        self.values_builder.values_slice()
343    }
344
345    /// Returns the current dictionary values buffer as a mutable slice
346    pub fn values_slice_mut(&mut self) -> &mut [V::Native] {
347        self.values_builder.values_slice_mut()
348    }
349
350    /// Returns the current null buffer as a slice
351    pub fn validity_slice(&self) -> Option<&[u8]> {
352        self.keys_builder.validity_slice()
353    }
354}
355
356impl<K: ArrowDictionaryKeyType, P: ArrowPrimitiveType> Extend<Option<P::Native>>
357    for PrimitiveDictionaryBuilder<K, P>
358{
359    #[inline]
360    fn extend<T: IntoIterator<Item = Option<P::Native>>>(&mut self, iter: T) {
361        for v in iter {
362            self.append_option(v)
363        }
364    }
365}
366
367#[cfg(test)]
368mod tests {
369    use super::*;
370
371    use crate::array::UInt32Array;
372    use crate::array::UInt8Array;
373    use crate::builder::Decimal128Builder;
374    use crate::types::{Decimal128Type, Int32Type, UInt32Type, UInt8Type};
375
376    #[test]
377    fn test_primitive_dictionary_builder() {
378        let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(3, 2);
379        builder.append(12345678).unwrap();
380        builder.append_null();
381        builder.append(22345678).unwrap();
382        let array = builder.finish();
383
384        assert_eq!(
385            array.keys(),
386            &UInt8Array::from(vec![Some(0), None, Some(1)])
387        );
388
389        // Values are polymorphic and so require a downcast.
390        let av = array.values();
391        let ava: &UInt32Array = av.as_any().downcast_ref::<UInt32Array>().unwrap();
392        let avs: &[u32] = ava.values();
393
394        assert!(!array.is_null(0));
395        assert!(array.is_null(1));
396        assert!(!array.is_null(2));
397
398        assert_eq!(avs, &[12345678, 22345678]);
399    }
400
401    #[test]
402    fn test_extend() {
403        let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
404        builder.extend([1, 2, 3, 1, 2, 3, 1, 2, 3].into_iter().map(Some));
405        builder.extend([4, 5, 1, 3, 1].into_iter().map(Some));
406        let dict = builder.finish();
407        assert_eq!(
408            dict.keys().values(),
409            &[0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 4, 0, 2, 0]
410        );
411        assert_eq!(dict.values().len(), 5);
412    }
413
414    #[test]
415    #[should_panic(expected = "DictionaryKeyOverflowError")]
416    fn test_primitive_dictionary_overflow() {
417        let mut builder =
418            PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(257, 257);
419        // 256 unique keys.
420        for i in 0..256 {
421            builder.append(i + 1000).unwrap();
422        }
423        // Special error if the key overflows (256th entry)
424        builder.append(1257).unwrap();
425    }
426
427    #[test]
428    fn test_primitive_dictionary_with_builders() {
429        let keys_builder = PrimitiveBuilder::<Int32Type>::new();
430        let values_builder = Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2));
431        let mut builder =
432            PrimitiveDictionaryBuilder::<Int32Type, Decimal128Type>::new_from_empty_builders(
433                keys_builder,
434                values_builder,
435            );
436        let dict_array = builder.finish();
437        assert_eq!(dict_array.value_type(), DataType::Decimal128(1, 2));
438        assert_eq!(
439            dict_array.data_type(),
440            &DataType::Dictionary(
441                Box::new(DataType::Int32),
442                Box::new(DataType::Decimal128(1, 2)),
443            )
444        );
445    }
446}