1use crate::builder::{ArrayBuilder, PrimitiveBuilder};
19use crate::types::ArrowDictionaryKeyType;
20use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray};
21use arrow_buffer::{ArrowNativeType, ToByteSlice};
22use arrow_schema::{ArrowError, DataType};
23use std::any::Any;
24use std::collections::HashMap;
25use std::sync::Arc;
26
27#[derive(Debug)]
31struct Value<T>(T);
32
33impl<T: ToByteSlice> std::hash::Hash for Value<T> {
34 fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
35 self.0.to_byte_slice().hash(state)
36 }
37}
38
39impl<T: ToByteSlice> PartialEq for Value<T> {
40 fn eq(&self, other: &Self) -> bool {
41 self.0.to_byte_slice().eq(other.0.to_byte_slice())
42 }
43}
44
45impl<T: ToByteSlice> Eq for Value<T> {}
46
47#[derive(Debug)]
80pub struct PrimitiveDictionaryBuilder<K, V>
81where
82 K: ArrowPrimitiveType,
83 V: ArrowPrimitiveType,
84{
85 keys_builder: PrimitiveBuilder<K>,
86 values_builder: PrimitiveBuilder<V>,
87 map: HashMap<Value<V::Native>, usize>,
88}
89
90impl<K, V> Default for PrimitiveDictionaryBuilder<K, V>
91where
92 K: ArrowPrimitiveType,
93 V: ArrowPrimitiveType,
94{
95 fn default() -> Self {
96 Self::new()
97 }
98}
99
100impl<K, V> PrimitiveDictionaryBuilder<K, V>
101where
102 K: ArrowPrimitiveType,
103 V: ArrowPrimitiveType,
104{
105 pub fn new() -> Self {
107 Self {
108 keys_builder: PrimitiveBuilder::new(),
109 values_builder: PrimitiveBuilder::new(),
110 map: HashMap::new(),
111 }
112 }
113
114 pub fn new_from_empty_builders(
120 keys_builder: PrimitiveBuilder<K>,
121 values_builder: PrimitiveBuilder<V>,
122 ) -> Self {
123 assert!(
124 keys_builder.is_empty() && values_builder.is_empty(),
125 "keys and values builders must be empty"
126 );
127 Self {
128 keys_builder,
129 values_builder,
130 map: HashMap::new(),
131 }
132 }
133
134 pub unsafe fn new_from_builders(
140 keys_builder: PrimitiveBuilder<K>,
141 values_builder: PrimitiveBuilder<V>,
142 ) -> Self {
143 let keys = keys_builder.values_slice();
144 let values = values_builder.values_slice();
145 let mut map = HashMap::with_capacity(values.len());
146
147 keys.iter().zip(values.iter()).for_each(|(key, value)| {
148 map.insert(Value(*value), K::Native::to_usize(*key).unwrap());
149 });
150
151 Self {
152 keys_builder,
153 values_builder,
154 map,
155 }
156 }
157
158 pub fn with_capacity(keys_capacity: usize, values_capacity: usize) -> Self {
163 Self {
164 keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
165 values_builder: PrimitiveBuilder::with_capacity(values_capacity),
166 map: HashMap::with_capacity(values_capacity),
167 }
168 }
169}
170
171impl<K, V> ArrayBuilder for PrimitiveDictionaryBuilder<K, V>
172where
173 K: ArrowDictionaryKeyType,
174 V: ArrowPrimitiveType,
175{
176 fn as_any(&self) -> &dyn Any {
178 self
179 }
180
181 fn as_any_mut(&mut self) -> &mut dyn Any {
183 self
184 }
185
186 fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
188 self
189 }
190
191 fn len(&self) -> usize {
193 self.keys_builder.len()
194 }
195
196 fn finish(&mut self) -> ArrayRef {
198 Arc::new(self.finish())
199 }
200
201 fn finish_cloned(&self) -> ArrayRef {
203 Arc::new(self.finish_cloned())
204 }
205}
206
207impl<K, V> PrimitiveDictionaryBuilder<K, V>
208where
209 K: ArrowDictionaryKeyType,
210 V: ArrowPrimitiveType,
211{
212 #[inline]
213 fn get_or_insert_key(&mut self, value: V::Native) -> Result<K::Native, ArrowError> {
214 match self.map.get(&Value(value)) {
215 Some(&key) => {
216 Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?)
217 }
218 None => {
219 let key = self.values_builder.len();
220 self.values_builder.append_value(value);
221 self.map.insert(Value(value), key);
222 Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?)
223 }
224 }
225 }
226
227 #[inline]
231 pub fn append(&mut self, value: V::Native) -> Result<K::Native, ArrowError> {
232 let key = self.get_or_insert_key(value)?;
233 self.keys_builder.append_value(key);
234 Ok(key)
235 }
236
237 pub fn append_n(&mut self, value: V::Native, count: usize) -> Result<K::Native, ArrowError> {
242 let key = self.get_or_insert_key(value)?;
243 self.keys_builder.append_value_n(key, count);
244 Ok(key)
245 }
246
247 #[inline]
253 pub fn append_value(&mut self, value: V::Native) {
254 self.append(value).expect("dictionary key overflow");
255 }
256
257 pub fn append_values(&mut self, value: V::Native, count: usize) {
264 self.append_n(value, count)
265 .expect("dictionary key overflow");
266 }
267
268 #[inline]
270 pub fn append_null(&mut self) {
271 self.keys_builder.append_null()
272 }
273
274 #[inline]
276 pub fn append_nulls(&mut self, n: usize) {
277 self.keys_builder.append_nulls(n)
278 }
279
280 #[inline]
286 pub fn append_option(&mut self, value: Option<V::Native>) {
287 match value {
288 None => self.append_null(),
289 Some(v) => self.append_value(v),
290 };
291 }
292
293 pub fn append_options(&mut self, value: Option<V::Native>, count: usize) {
300 match value {
301 None => self.keys_builder.append_nulls(count),
302 Some(v) => self.append_values(v, count),
303 };
304 }
305
306 pub fn finish(&mut self) -> DictionaryArray<K> {
308 self.map.clear();
309 let values = self.values_builder.finish();
310 let keys = self.keys_builder.finish();
311
312 let data_type =
313 DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
314
315 let builder = keys
316 .into_data()
317 .into_builder()
318 .data_type(data_type)
319 .child_data(vec![values.into_data()]);
320
321 DictionaryArray::from(unsafe { builder.build_unchecked() })
322 }
323
324 pub fn finish_cloned(&self) -> DictionaryArray<K> {
326 let values = self.values_builder.finish_cloned();
327 let keys = self.keys_builder.finish_cloned();
328
329 let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(V::DATA_TYPE));
330
331 let builder = keys
332 .into_data()
333 .into_builder()
334 .data_type(data_type)
335 .child_data(vec![values.into_data()]);
336
337 DictionaryArray::from(unsafe { builder.build_unchecked() })
338 }
339
340 pub fn values_slice(&self) -> &[V::Native] {
342 self.values_builder.values_slice()
343 }
344
345 pub fn values_slice_mut(&mut self) -> &mut [V::Native] {
347 self.values_builder.values_slice_mut()
348 }
349
350 pub fn validity_slice(&self) -> Option<&[u8]> {
352 self.keys_builder.validity_slice()
353 }
354}
355
356impl<K: ArrowDictionaryKeyType, P: ArrowPrimitiveType> Extend<Option<P::Native>>
357 for PrimitiveDictionaryBuilder<K, P>
358{
359 #[inline]
360 fn extend<T: IntoIterator<Item = Option<P::Native>>>(&mut self, iter: T) {
361 for v in iter {
362 self.append_option(v)
363 }
364 }
365}
366
367#[cfg(test)]
368mod tests {
369 use super::*;
370
371 use crate::array::UInt32Array;
372 use crate::array::UInt8Array;
373 use crate::builder::Decimal128Builder;
374 use crate::types::{Decimal128Type, Int32Type, UInt32Type, UInt8Type};
375
376 #[test]
377 fn test_primitive_dictionary_builder() {
378 let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(3, 2);
379 builder.append(12345678).unwrap();
380 builder.append_null();
381 builder.append(22345678).unwrap();
382 let array = builder.finish();
383
384 assert_eq!(
385 array.keys(),
386 &UInt8Array::from(vec![Some(0), None, Some(1)])
387 );
388
389 let av = array.values();
391 let ava: &UInt32Array = av.as_any().downcast_ref::<UInt32Array>().unwrap();
392 let avs: &[u32] = ava.values();
393
394 assert!(!array.is_null(0));
395 assert!(array.is_null(1));
396 assert!(!array.is_null(2));
397
398 assert_eq!(avs, &[12345678, 22345678]);
399 }
400
401 #[test]
402 fn test_extend() {
403 let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
404 builder.extend([1, 2, 3, 1, 2, 3, 1, 2, 3].into_iter().map(Some));
405 builder.extend([4, 5, 1, 3, 1].into_iter().map(Some));
406 let dict = builder.finish();
407 assert_eq!(
408 dict.keys().values(),
409 &[0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 4, 0, 2, 0]
410 );
411 assert_eq!(dict.values().len(), 5);
412 }
413
414 #[test]
415 #[should_panic(expected = "DictionaryKeyOverflowError")]
416 fn test_primitive_dictionary_overflow() {
417 let mut builder =
418 PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(257, 257);
419 for i in 0..256 {
421 builder.append(i + 1000).unwrap();
422 }
423 builder.append(1257).unwrap();
425 }
426
427 #[test]
428 fn test_primitive_dictionary_with_builders() {
429 let keys_builder = PrimitiveBuilder::<Int32Type>::new();
430 let values_builder = Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2));
431 let mut builder =
432 PrimitiveDictionaryBuilder::<Int32Type, Decimal128Type>::new_from_empty_builders(
433 keys_builder,
434 values_builder,
435 );
436 let dict_array = builder.finish();
437 assert_eq!(dict_array.value_type(), DataType::Decimal128(1, 2));
438 assert_eq!(
439 dict_array.data_type(),
440 &DataType::Dictionary(
441 Box::new(DataType::Int32),
442 Box::new(DataType::Decimal128(1, 2)),
443 )
444 );
445 }
446}