parquet/bloom_filter/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Bloom filter implementation specific to Parquet, as described
19//! in the [spec][parquet-bf-spec].
20//!
21//! # Bloom Filter Size
22//!
23//! Parquet uses the [Split Block Bloom Filter][sbbf-paper] (SBBF) as its bloom filter
24//! implementation. For each column upon which bloom filters are enabled, the offset and length of an SBBF
25//! is stored in  the metadata for each row group in the parquet file. The size of each filter is
26//! initialized using a calculation based on the desired number of distinct values (NDV) and false
27//! positive probability (FPP). The FPP for a SBBF can be approximated as<sup>[1][bf-formulae]</sup>:
28//!
29//! ```text
30//! f = (1 - e^(-k * n / m))^k
31//! ```
32//!
33//! Where, `f` is the FPP, `k` the number of hash functions, `n` the NDV, and `m` the total number
34//! of bits in the bloom filter. This can be re-arranged to determine the total number of bits
35//! required to achieve a given FPP and NDV:
36//!
37//! ```text
38//! m = -k * n / ln(1 - f^(1/k))
39//! ```
40//!
41//! SBBFs use eight hash functions to cleanly fit in SIMD lanes<sup>[2][sbbf-paper]</sup>, therefore
42//! `k` is set to 8. The SBBF will spread those `m` bits accross a set of `b` blocks that
43//! are each 256 bits, i.e., 32 bytes, in size. The number of blocks is chosen as:
44//!
45//! ```text
46//! b = NP2(m/8) / 32
47//! ```
48//!
49//! Where, `NP2` denotes *the next power of two*, and `m` is divided by 8 to be represented as bytes.
50//!
51//! Here is a table of calculated sizes for various FPP and NDV:
52//!
53//! | NDV       | FPP       | b       | Size (KB) |
54//! |-----------|-----------|---------|-----------|
55//! | 10,000    | 0.1       | 256     | 8         |
56//! | 10,000    | 0.01      | 512     | 16        |
57//! | 10,000    | 0.001     | 1,024   | 32        |
58//! | 10,000    | 0.0001    | 1,024   | 32        |
59//! | 100,000   | 0.1       | 4,096   | 128       |
60//! | 100,000   | 0.01      | 4,096   | 128       |
61//! | 100,000   | 0.001     | 8,192   | 256       |
62//! | 100,000   | 0.0001    | 16,384  | 512       |
63//! | 100,000   | 0.00001   | 16,384  | 512       |
64//! | 1,000,000 | 0.1       | 32,768  | 1,024     |
65//! | 1,000,000 | 0.01      | 65,536  | 2,048     |
66//! | 1,000,000 | 0.001     | 65,536  | 2,048     |
67//! | 1,000,000 | 0.0001    | 131,072 | 4,096     |
68//! | 1,000,000 | 0.00001   | 131,072 | 4,096     |
69//! | 1,000,000 | 0.000001  | 262,144 | 8,192     |
70//!
71//! [parquet-bf-spec]: https://github.com/apache/parquet-format/blob/master/BloomFilter.md
72//! [sbbf-paper]: https://arxiv.org/pdf/2101.01719
73//! [bf-formulae]: http://tfk.mit.edu/pdf/bloom.pdf
74
75use crate::basic::{BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash};
76use crate::data_type::AsBytes;
77use crate::errors::{ParquetError, Result};
78use crate::file::metadata::ColumnChunkMetaData;
79use crate::file::reader::ChunkReader;
80use crate::parquet_thrift::{
81    ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol,
82    ThriftSliceInputProtocol, WriteThrift, WriteThriftField,
83};
84use crate::thrift_struct;
85use bytes::Bytes;
86use std::io::Write;
87use twox_hash::XxHash64;
88
89/// Salt as defined in the [spec](https://github.com/apache/parquet-format/blob/master/BloomFilter.md#technical-approach).
90const SALT: [u32; 8] = [
91    0x47b6137b_u32,
92    0x44974d91_u32,
93    0x8824ad5b_u32,
94    0xa2b7289d_u32,
95    0x705495c7_u32,
96    0x2df1424b_u32,
97    0x9efc4947_u32,
98    0x5c6bfb31_u32,
99];
100
101thrift_struct!(
102/// Bloom filter header is stored at beginning of Bloom filter data of each column
103/// and followed by its bitset.
104///
105pub struct BloomFilterHeader {
106  /// The size of bitset in bytes
107  1: required i32 num_bytes;
108  /// The algorithm for setting bits.
109  2: required BloomFilterAlgorithm algorithm;
110  /// The hash function used for Bloom filter
111  3: required BloomFilterHash hash;
112  /// The compression used in the Bloom filter
113  4: required BloomFilterCompression compression;
114}
115);
116
117/// Each block is 256 bits, broken up into eight contiguous "words", each consisting of 32 bits.
118/// Each word is thought of as an array of bits; each bit is either "set" or "not set".
119#[derive(Debug, Copy, Clone)]
120#[repr(transparent)]
121struct Block([u32; 8]);
122impl Block {
123    const ZERO: Block = Block([0; 8]);
124
125    /// takes as its argument a single unsigned 32-bit integer and returns a block in which each
126    /// word has exactly one bit set.
127    fn mask(x: u32) -> Self {
128        let mut result = [0_u32; 8];
129        for i in 0..8 {
130            // wrapping instead of checking for overflow
131            let y = x.wrapping_mul(SALT[i]);
132            let y = y >> 27;
133            result[i] = 1 << y;
134        }
135        Self(result)
136    }
137
138    #[inline]
139    #[cfg(not(target_endian = "little"))]
140    fn to_ne_bytes(self) -> [u8; 32] {
141        // SAFETY: [u32; 8] and [u8; 32] have the same size and neither has invalid bit patterns.
142        unsafe { std::mem::transmute(self.0) }
143    }
144
145    #[inline]
146    #[cfg(not(target_endian = "little"))]
147    fn to_le_bytes(self) -> [u8; 32] {
148        self.swap_bytes().to_ne_bytes()
149    }
150
151    #[inline]
152    #[cfg(not(target_endian = "little"))]
153    fn swap_bytes(mut self) -> Self {
154        self.0.iter_mut().for_each(|x| *x = x.swap_bytes());
155        self
156    }
157
158    /// setting every bit in the block that was also set in the result from mask
159    fn insert(&mut self, hash: u32) {
160        let mask = Self::mask(hash);
161        for i in 0..8 {
162            self[i] |= mask[i];
163        }
164    }
165
166    /// returns true when every bit that is set in the result of mask is also set in the block.
167    fn check(&self, hash: u32) -> bool {
168        let mask = Self::mask(hash);
169        for i in 0..8 {
170            if self[i] & mask[i] == 0 {
171                return false;
172            }
173        }
174        true
175    }
176}
177
178impl std::ops::Index<usize> for Block {
179    type Output = u32;
180
181    #[inline]
182    fn index(&self, index: usize) -> &Self::Output {
183        self.0.index(index)
184    }
185}
186
187impl std::ops::IndexMut<usize> for Block {
188    #[inline]
189    fn index_mut(&mut self, index: usize) -> &mut Self::Output {
190        self.0.index_mut(index)
191    }
192}
193
194/// A split block Bloom filter.
195///
196/// The creation of this structure is based on the [`crate::file::properties::BloomFilterProperties`]
197/// struct set via [`crate::file::properties::WriterProperties`] and is thus hidden by default.
198#[derive(Debug, Clone)]
199pub struct Sbbf(Vec<Block>);
200
201pub(crate) const SBBF_HEADER_SIZE_ESTIMATE: usize = 20;
202
203/// given an initial offset, and a byte buffer, try to read out a bloom filter header and return
204/// both the header and the offset after it (for bitset).
205pub(crate) fn chunk_read_bloom_filter_header_and_offset(
206    offset: u64,
207    buffer: Bytes,
208) -> Result<(BloomFilterHeader, u64), ParquetError> {
209    let (header, length) = read_bloom_filter_header_and_length(buffer)?;
210    Ok((header, offset + length))
211}
212
213/// given a [Bytes] buffer, try to read out a bloom filter header and return both the header and
214/// length of the header.
215#[inline]
216pub(crate) fn read_bloom_filter_header_and_length(
217    buffer: Bytes,
218) -> Result<(BloomFilterHeader, u64), ParquetError> {
219    read_bloom_filter_header_and_length_from_bytes(buffer.as_ref())
220}
221
222/// Given a byte slice, try to read out a bloom filter header and return both the header and
223/// length of the header.
224#[inline]
225fn read_bloom_filter_header_and_length_from_bytes(
226    buffer: &[u8],
227) -> Result<(BloomFilterHeader, u64), ParquetError> {
228    let total_length = buffer.len();
229    let mut prot = ThriftSliceInputProtocol::new(buffer);
230    let header = BloomFilterHeader::read_thrift(&mut prot)
231        .map_err(|e| ParquetError::General(format!("Could not read bloom filter header: {e}")))?;
232    Ok((header, (total_length - prot.as_slice().len()) as u64))
233}
234
235pub(crate) const BITSET_MIN_LENGTH: usize = 32;
236pub(crate) const BITSET_MAX_LENGTH: usize = 128 * 1024 * 1024;
237
238#[inline]
239fn optimal_num_of_bytes(num_bytes: usize) -> usize {
240    let num_bytes = num_bytes.min(BITSET_MAX_LENGTH);
241    let num_bytes = num_bytes.max(BITSET_MIN_LENGTH);
242    num_bytes.next_power_of_two()
243}
244
245// see http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf
246// given fpp = (1 - e^(-k * n / m)) ^ k
247// we have m = - k * n / ln(1 - fpp ^ (1 / k))
248// where k = number of hash functions, m = number of bits, n = number of distinct values
249#[inline]
250fn num_of_bits_from_ndv_fpp(ndv: u64, fpp: f64) -> usize {
251    let num_bits = -8.0 * ndv as f64 / (1.0 - fpp.powf(1.0 / 8.0)).ln();
252    num_bits as usize
253}
254
255impl Sbbf {
256    /// Create a new [Sbbf] with given number of distinct values and false positive probability.
257    /// Will return an error if `fpp` is greater than or equal to 1.0 or less than 0.0.
258    pub(crate) fn new_with_ndv_fpp(ndv: u64, fpp: f64) -> Result<Self, ParquetError> {
259        if !(0.0..1.0).contains(&fpp) {
260            return Err(ParquetError::General(format!(
261                "False positive probability must be between 0.0 and 1.0, got {fpp}"
262            )));
263        }
264        let num_bits = num_of_bits_from_ndv_fpp(ndv, fpp);
265        Ok(Self::new_with_num_of_bytes(num_bits / 8))
266    }
267
268    /// Create a new [Sbbf] with given number of bytes, the exact number of bytes will be adjusted
269    /// to the next power of two bounded by [BITSET_MIN_LENGTH] and [BITSET_MAX_LENGTH].
270    pub(crate) fn new_with_num_of_bytes(num_bytes: usize) -> Self {
271        let num_bytes = optimal_num_of_bytes(num_bytes);
272        assert_eq!(num_bytes % size_of::<Block>(), 0);
273        let num_blocks = num_bytes / size_of::<Block>();
274        let bitset = vec![Block::ZERO; num_blocks];
275        Self(bitset)
276    }
277
278    pub(crate) fn new(bitset: &[u8]) -> Self {
279        let data = bitset
280            .chunks_exact(4 * 8)
281            .map(|chunk| {
282                let mut block = Block::ZERO;
283                for (i, word) in chunk.chunks_exact(4).enumerate() {
284                    block[i] = u32::from_le_bytes(word.try_into().unwrap());
285                }
286                block
287            })
288            .collect::<Vec<Block>>();
289        Self(data)
290    }
291
292    /// Write the bloom filter data (header and then bitset) to the output. This doesn't
293    /// flush the writer in order to boost performance of bulk writing all blocks. Caller
294    /// must remember to flush the writer.
295    /// This method usually is used in conjunction with [`Self::from_bytes`] for serialization/deserialization.
296    pub fn write<W: Write>(&self, mut writer: W) -> Result<(), ParquetError> {
297        let mut protocol = ThriftCompactOutputProtocol::new(&mut writer);
298        self.header().write_thrift(&mut protocol).map_err(|e| {
299            ParquetError::General(format!("Could not write bloom filter header: {e}"))
300        })?;
301        self.write_bitset(&mut writer)?;
302        Ok(())
303    }
304
305    /// Write the bitset in serialized form to the writer.
306    #[cfg(not(target_endian = "little"))]
307    fn write_bitset<W: Write>(&self, mut writer: W) -> Result<(), ParquetError> {
308        for block in &self.0 {
309            writer
310                .write_all(block.to_le_bytes().as_slice())
311                .map_err(|e| {
312                    ParquetError::General(format!("Could not write bloom filter bit set: {e}"))
313                })?;
314        }
315        Ok(())
316    }
317
318    /// Write the bitset in serialized form to the writer.
319    #[cfg(target_endian = "little")]
320    fn write_bitset<W: Write>(&self, mut writer: W) -> Result<(), ParquetError> {
321        // Safety: Block is repr(transparent) and [u32; 8] can be reinterpreted as [u8; 32].
322        let slice = unsafe {
323            std::slice::from_raw_parts(
324                self.0.as_ptr() as *const u8,
325                self.0.len() * size_of::<Block>(),
326            )
327        };
328        writer.write_all(slice).map_err(|e| {
329            ParquetError::General(format!("Could not write bloom filter bit set: {e}"))
330        })?;
331        Ok(())
332    }
333
334    /// Create and populate [`BloomFilterHeader`] from this bitset for writing to serialized form
335    fn header(&self) -> BloomFilterHeader {
336        BloomFilterHeader {
337            // 8 i32 per block, 4 bytes per i32
338            num_bytes: self.0.len() as i32 * 4 * 8,
339            algorithm: BloomFilterAlgorithm::BLOCK,
340            hash: BloomFilterHash::XXHASH,
341            compression: BloomFilterCompression::UNCOMPRESSED,
342        }
343    }
344
345    /// Read a new bloom filter from the given offset in the given reader.
346    pub fn read_from_column_chunk<R: ChunkReader>(
347        column_metadata: &ColumnChunkMetaData,
348        reader: &R,
349    ) -> Result<Option<Self>, ParquetError> {
350        let offset: u64 = if let Some(offset) = column_metadata.bloom_filter_offset() {
351            offset
352                .try_into()
353                .map_err(|_| ParquetError::General("Bloom filter offset is invalid".to_string()))?
354        } else {
355            return Ok(None);
356        };
357
358        let buffer = match column_metadata.bloom_filter_length() {
359            Some(length) => reader.get_bytes(offset, length as usize),
360            None => reader.get_bytes(offset, SBBF_HEADER_SIZE_ESTIMATE),
361        }?;
362
363        let (header, bitset_offset) =
364            chunk_read_bloom_filter_header_and_offset(offset, buffer.clone())?;
365
366        match header.algorithm {
367            BloomFilterAlgorithm::BLOCK => {
368                // this match exists to future proof the singleton algorithm enum
369            }
370        }
371        match header.compression {
372            BloomFilterCompression::UNCOMPRESSED => {
373                // this match exists to future proof the singleton compression enum
374            }
375        }
376        match header.hash {
377            BloomFilterHash::XXHASH => {
378                // this match exists to future proof the singleton hash enum
379            }
380        }
381
382        let bitset = match column_metadata.bloom_filter_length() {
383            Some(_) => buffer.slice((bitset_offset - offset) as usize..),
384            None => {
385                let bitset_length: usize = header.num_bytes.try_into().map_err(|_| {
386                    ParquetError::General("Bloom filter length is invalid".to_string())
387                })?;
388                reader.get_bytes(bitset_offset, bitset_length)?
389            }
390        };
391
392        Ok(Some(Self::new(&bitset)))
393    }
394
395    #[inline]
396    fn hash_to_block_index(&self, hash: u64) -> usize {
397        // unchecked_mul is unstable, but in reality this is safe, we'd just use saturating mul
398        // but it will not saturate
399        (((hash >> 32).saturating_mul(self.0.len() as u64)) >> 32) as usize
400    }
401
402    /// Insert an [AsBytes] value into the filter
403    pub fn insert<T: AsBytes + ?Sized>(&mut self, value: &T) {
404        self.insert_hash(hash_as_bytes(value));
405    }
406
407    /// Insert a hash into the filter
408    fn insert_hash(&mut self, hash: u64) {
409        let block_index = self.hash_to_block_index(hash);
410        self.0[block_index].insert(hash as u32)
411    }
412
413    /// Check if an [AsBytes] value is probably present or definitely absent in the filter
414    pub fn check<T: AsBytes>(&self, value: &T) -> bool {
415        self.check_hash(hash_as_bytes(value))
416    }
417
418    /// Check if a hash is in the filter. May return
419    /// true for values that was never inserted ("false positive")
420    /// but will always return false if a hash has not been inserted.
421    fn check_hash(&self, hash: u64) -> bool {
422        let block_index = self.hash_to_block_index(hash);
423        self.0[block_index].check(hash as u32)
424    }
425
426    /// Return the total in memory size of this bloom filter in bytes
427    pub(crate) fn estimated_memory_size(&self) -> usize {
428        self.0.capacity() * std::mem::size_of::<Block>()
429    }
430
431    /// Reads a Sbff from Thrift encoded bytes
432    ///
433    /// # Examples
434    ///
435    /// ```no_run
436    /// # use parquet::errors::Result;
437    /// # use parquet::bloom_filter::Sbbf;
438    /// # fn main() -> Result<()> {
439    /// // In a real application, you would read serialized bloom filter bytes from a cache.
440    /// // This example demonstrates the deserialization process.
441    /// // Assuming you have bloom filter bytes from a Parquet file:
442    /// # let serialized_bytes: Vec<u8> = vec![];
443    /// let bloom_filter = Sbbf::from_bytes(&serialized_bytes)?;
444    /// // Now you can use the bloom filter to check for values
445    /// if bloom_filter.check(&"some_value") {
446    ///     println!("Value might be present (or false positive)");
447    /// } else {
448    ///     println!("Value is definitely not present");
449    /// }
450    /// # Ok(())
451    /// # }
452    /// ```
453    pub fn from_bytes(bytes: &[u8]) -> Result<Self, ParquetError> {
454        let (header, header_len) = read_bloom_filter_header_and_length_from_bytes(bytes)?;
455
456        let bitset_length: u64 = header
457            .num_bytes
458            .try_into()
459            .map_err(|_| ParquetError::General("Bloom filter length is invalid".to_string()))?;
460
461        // Validate that bitset consumes all remaining bytes
462        if header_len + bitset_length != bytes.len() as u64 {
463            return Err(ParquetError::General(format!(
464                "Bloom filter data contains extra bytes: expected {} total bytes, got {}",
465                header_len + bitset_length,
466                bytes.len()
467            )));
468        }
469
470        let start = header_len as usize;
471        let end = (header_len + bitset_length) as usize;
472        let bitset = bytes
473            .get(start..end)
474            .ok_or_else(|| ParquetError::General("Bloom filter bitset is invalid".to_string()))?;
475
476        Ok(Self::new(bitset))
477    }
478}
479
480// per spec we use xxHash with seed=0
481const SEED: u64 = 0;
482
483#[inline]
484fn hash_as_bytes<A: AsBytes + ?Sized>(value: &A) -> u64 {
485    XxHash64::oneshot(SEED, value.as_bytes())
486}
487
488#[cfg(test)]
489mod tests {
490    use super::*;
491
492    #[test]
493    fn test_hash_bytes() {
494        assert_eq!(hash_as_bytes(""), 17241709254077376921);
495    }
496
497    #[test]
498    fn test_mask_set_quick_check() {
499        for i in 0..1_000_000 {
500            let result = Block::mask(i);
501            assert!(result.0.iter().all(|&x| x.is_power_of_two()));
502        }
503    }
504
505    #[test]
506    fn test_block_insert_and_check() {
507        for i in 0..1_000_000 {
508            let mut block = Block::ZERO;
509            block.insert(i);
510            assert!(block.check(i));
511        }
512    }
513
514    #[test]
515    fn test_sbbf_insert_and_check() {
516        let mut sbbf = Sbbf(vec![Block::ZERO; 1_000]);
517        for i in 0..1_000_000 {
518            sbbf.insert(&i);
519            assert!(sbbf.check(&i));
520        }
521    }
522
523    #[test]
524    fn test_with_fixture() {
525        // bloom filter produced by parquet-mr/spark for a column of i64 f"a{i}" for i in 0..10
526        let bitset: &[u8] = &[
527            200, 1, 80, 20, 64, 68, 8, 109, 6, 37, 4, 67, 144, 80, 96, 32, 8, 132, 43, 33, 0, 5,
528            99, 65, 2, 0, 224, 44, 64, 78, 96, 4,
529        ];
530        let sbbf = Sbbf::new(bitset);
531        for a in 0..10i64 {
532            let value = format!("a{a}");
533            assert!(sbbf.check(&value.as_str()));
534        }
535    }
536
537    /// test the assumption that bloom filter header size should not exceed SBBF_HEADER_SIZE_ESTIMATE
538    /// essentially we are testing that the struct is packed with 4 i32 fields, each can be 1-5 bytes
539    /// so altogether it'll be 20 bytes at most.
540    #[test]
541    fn test_bloom_filter_header_size_assumption() {
542        let buffer: &[u8; 16] = &[21, 64, 28, 28, 0, 0, 28, 28, 0, 0, 28, 28, 0, 0, 0, 99];
543        let (
544            BloomFilterHeader {
545                algorithm,
546                compression,
547                hash,
548                num_bytes,
549            },
550            read_length,
551        ) = read_bloom_filter_header_and_length(Bytes::copy_from_slice(buffer)).unwrap();
552        assert_eq!(read_length, 15);
553        assert_eq!(algorithm, BloomFilterAlgorithm::BLOCK);
554        assert_eq!(compression, BloomFilterCompression::UNCOMPRESSED);
555        assert_eq!(hash, BloomFilterHash::XXHASH);
556        assert_eq!(num_bytes, 32_i32);
557        assert_eq!(20, SBBF_HEADER_SIZE_ESTIMATE);
558    }
559
560    #[test]
561    fn test_optimal_num_of_bytes() {
562        for (input, expected) in &[
563            (0, 32),
564            (9, 32),
565            (31, 32),
566            (32, 32),
567            (33, 64),
568            (99, 128),
569            (1024, 1024),
570            (999_000_000, 128 * 1024 * 1024),
571        ] {
572            assert_eq!(*expected, optimal_num_of_bytes(*input));
573        }
574    }
575
576    #[test]
577    fn test_num_of_bits_from_ndv_fpp() {
578        for (fpp, ndv, num_bits) in &[
579            (0.1, 10, 57),
580            (0.01, 10, 96),
581            (0.001, 10, 146),
582            (0.1, 100, 577),
583            (0.01, 100, 968),
584            (0.001, 100, 1460),
585            (0.1, 1000, 5772),
586            (0.01, 1000, 9681),
587            (0.001, 1000, 14607),
588            (0.1, 10000, 57725),
589            (0.01, 10000, 96815),
590            (0.001, 10000, 146076),
591            (0.1, 100000, 577254),
592            (0.01, 100000, 968152),
593            (0.001, 100000, 1460769),
594            (0.1, 1000000, 5772541),
595            (0.01, 1000000, 9681526),
596            (0.001, 1000000, 14607697),
597            (1e-50, 1_000_000_000_000, 14226231280773240832),
598        ] {
599            assert_eq!(*num_bits, num_of_bits_from_ndv_fpp(*ndv, *fpp) as u64);
600        }
601    }
602
603    #[test]
604    fn test_sbbf_write_round_trip() {
605        // Create a bloom filter with a 32-byte bitset (minimum size)
606        let bitset_bytes = vec![0u8; 32];
607        let mut original = Sbbf::new(&bitset_bytes);
608
609        // Insert some test values
610        let test_values = ["hello", "world", "rust", "parquet", "bloom", "filter"];
611        for value in &test_values {
612            original.insert(value);
613        }
614
615        // Serialize to bytes
616        let mut output = Vec::new();
617        original.write(&mut output).unwrap();
618
619        // Validate header was written correctly
620        let mut protocol = ThriftSliceInputProtocol::new(&output);
621        let header = BloomFilterHeader::read_thrift(&mut protocol).unwrap();
622        assert_eq!(header.num_bytes, bitset_bytes.len() as i32);
623        assert_eq!(header.algorithm, BloomFilterAlgorithm::BLOCK);
624        assert_eq!(header.hash, BloomFilterHash::XXHASH);
625        assert_eq!(header.compression, BloomFilterCompression::UNCOMPRESSED);
626
627        // Deserialize using from_bytes
628        let reconstructed = Sbbf::from_bytes(&output).unwrap();
629
630        // Most importantly: verify the bloom filter WORKS correctly after round-trip
631        // Note: bloom filters can have false positives, but should never have false negatives
632        // So we can't assert !check(), but we should verify inserted values are found
633        for value in &test_values {
634            assert!(
635                reconstructed.check(value),
636                "Value '{}' should be present after round-trip",
637                value
638            );
639        }
640    }
641}
parquet/bloom_filter/mod.rs

parquet/bloom_filter/
mod.rs