parquet/util/
interner.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::data_type::AsBytes;
19use hashbrown::HashTable;
20
21const DEFAULT_DEDUP_CAPACITY: usize = 4096;
22
23/// Storage trait for [`Interner`]
24pub trait Storage {
25    type Key: Copy;
26
27    type Value: AsBytes + PartialEq + ?Sized;
28
29    /// Gets an element by its key
30    fn get(&self, idx: Self::Key) -> &Self::Value;
31
32    /// Adds a new element, returning the key
33    fn push(&mut self, value: &Self::Value) -> Self::Key;
34
35    /// Return an estimate of the memory used in this storage, in bytes
36    #[allow(dead_code)] // not used in parquet_derive, so is dead there
37    fn estimated_memory_size(&self) -> usize;
38}
39
40/// A generic value interner supporting various different [`Storage`]
41#[derive(Debug, Default)]
42pub struct Interner<S: Storage> {
43    state: ahash::RandomState,
44
45    /// Used to provide a lookup from value to unique value
46    dedup: HashTable<S::Key>,
47
48    storage: S,
49}
50
51impl<S: Storage> Interner<S> {
52    /// Create a new `Interner` with the provided storage
53    pub fn new(storage: S) -> Self {
54        Self {
55            state: Default::default(),
56            dedup: HashTable::with_capacity(DEFAULT_DEDUP_CAPACITY),
57            storage,
58        }
59    }
60
61    /// Intern the value, returning the interned key, and if this was a new value
62    pub fn intern(&mut self, value: &S::Value) -> S::Key {
63        let hash = self.state.hash_one(value.as_bytes());
64
65        *self
66            .dedup
67            .entry(
68                hash,
69                |index| value == self.storage.get(*index),
70                |key| self.state.hash_one(self.storage.get(*key).as_bytes()),
71            )
72            .or_insert_with(|| self.storage.push(value))
73            .get()
74    }
75
76    /// Return estimate of the memory used, in bytes
77    #[allow(dead_code)] // not used in parquet_derive, so is dead there
78    pub fn estimated_memory_size(&self) -> usize {
79        self.storage.estimated_memory_size() +
80            // estimate size of dedup hashmap as just th size of the keys
81            self.dedup.capacity() + std::mem::size_of::<S::Key>()
82    }
83
84    /// Returns the storage for this interner
85    pub fn storage(&self) -> &S {
86        &self.storage
87    }
88
89    /// Unwraps the inner storage
90    #[cfg(feature = "arrow")]
91    pub fn into_inner(self) -> S {
92        self.storage
93    }
94}