arrow_buffer/util/bit_iterator.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! Types for iterating over packed bitmasks

use crate::bit_chunk_iterator::{UnalignedBitChunk, UnalignedBitChunkIterator};
use crate::bit_util::{ceil, get_bit_raw};

/// Iterator over the bits within a packed bitmask
///
/// To efficiently iterate over just the set bits see [`BitIndexIterator`] and [`BitSliceIterator`]
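///
/// # Example
///
/// A minimal usage sketch; it assumes this module is exported from the crate
/// root as `arrow_buffer::bit_iterator`:
///
/// ```
/// use arrow_buffer::bit_iterator::BitIterator;
///
/// // Bit 0 is the least significant bit of the first byte
/// let mask = &[0b00000101];
/// let bits: Vec<bool> = BitIterator::new(mask, 0, 4).collect();
/// assert_eq!(bits, vec![true, false, true, false]);
/// ```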
pub struct BitIterator<'a> {
    buffer: &'a [u8],
    current_offset: usize,
    end_offset: usize,
}

impl<'a> BitIterator<'a> {
    /// Create a new [`BitIterator`] from the provided `buffer`,
    /// and `offset` and `len` in bits
    ///
    /// # Panics
    ///
    /// Panics if `buffer` is too short for the provided `offset` and `len`
    pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
        let end_offset = offset.checked_add(len).unwrap();
        let required_len = ceil(end_offset, 8);
        assert!(
            buffer.len() >= required_len,
            "BitIterator buffer too small, expected {required_len} got {}",
            buffer.len()
        );

        Self {
            buffer,
            current_offset: offset,
            end_offset,
        }
    }
}

impl Iterator for BitIterator<'_> {
    type Item = bool;

    fn next(&mut self) -> Option<Self::Item> {
        if self.current_offset == self.end_offset {
            return None;
        }
        // Safety:
        // `current_offset < end_offset`, and `new` verified `buffer`
        // holds at least `end_offset` bits
        let v = unsafe { get_bit_raw(self.buffer.as_ptr(), self.current_offset) };
        self.current_offset += 1;
        Some(v)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining_bits = self.end_offset - self.current_offset;
        (remaining_bits, Some(remaining_bits))
    }
}

impl ExactSizeIterator for BitIterator<'_> {}

impl DoubleEndedIterator for BitIterator<'_> {
    fn next_back(&mut self) -> Option<Self::Item> {
        if self.current_offset == self.end_offset {
            return None;
        }
        self.end_offset -= 1;
        // Safety:
        // `end_offset >= current_offset`, and `new` verified `buffer`
        // holds at least the original `end_offset` bits
        let v = unsafe { get_bit_raw(self.buffer.as_ptr(), self.end_offset) };
        Some(v)
    }
}

/// Iterator of contiguous ranges of set bits within a provided packed bitmask
///
/// Yields `(usize, usize)` pairs, each representing a half-open interval
/// `[start, end)` within which the corresponding bits in the provided mask are set
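///
/// # Example
///
/// A minimal usage sketch; it assumes this module is exported from the crate
/// root as `arrow_buffer::bit_iterator`:
///
/// ```
/// use arrow_buffer::bit_iterator::BitSliceIterator;
///
/// // Bits 1, 3, 4 and 5 are set
/// let mask = &[0b00111010];
/// let ranges: Vec<_> = BitSliceIterator::new(mask, 0, 8).collect();
/// assert_eq!(ranges, vec![(1, 2), (3, 6)]);
/// ```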
#[derive(Debug)]
pub struct BitSliceIterator<'a> {
    iter: UnalignedBitChunkIterator<'a>,
    len: usize,
    current_offset: i64,
    current_chunk: u64,
}

impl<'a> BitSliceIterator<'a> {
    /// Create a new [`BitSliceIterator`] from the provided `buffer`,
    /// and `offset` and `len` in bits
    pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
        let chunk = UnalignedBitChunk::new(buffer, offset, len);
        let mut iter = chunk.iter();

        let current_offset = -(chunk.lead_padding() as i64);
        let current_chunk = iter.next().unwrap_or(0);

        Self {
            iter,
            len,
            current_offset,
            current_chunk,
        }
    }

    /// Returns `Some((chunk_offset, bit_offset))` for the next chunk that has at
    /// least one bit set, or `None` if there is no such chunk.
    ///
    /// Where `chunk_offset` is the bit offset to the current `u64` chunk
    /// and `bit_offset` is the offset of the first `1` bit in that chunk
    fn advance_to_set_bit(&mut self) -> Option<(i64, u32)> {
        loop {
            if self.current_chunk != 0 {
                // Find the index of the first 1
                let bit_pos = self.current_chunk.trailing_zeros();
                return Some((self.current_offset, bit_pos));
            }

            self.current_chunk = self.iter.next()?;
            self.current_offset += 64;
        }
    }
}

impl Iterator for BitSliceIterator<'_> {
    type Item = (usize, usize);

    fn next(&mut self) -> Option<Self::Item> {
        // Used as termination condition
        if self.len == 0 {
            return None;
        }

        let (start_chunk, start_bit) = self.advance_to_set_bit()?;

        // Set the bits below `start_bit` so the `trailing_ones` scan below
        // runs from bit 0 of the chunk
        self.current_chunk |= (1 << start_bit) - 1;

        loop {
            if self.current_chunk != u64::MAX {
                // Find the index of the first 0
                let end_bit = self.current_chunk.trailing_ones();

                // Zero out up to end_bit
                self.current_chunk &= !((1 << end_bit) - 1);

                return Some((
                    (start_chunk + start_bit as i64) as usize,
                    (self.current_offset + end_bit as i64) as usize,
                ));
            }

            match self.iter.next() {
                Some(next) => {
                    self.current_chunk = next;
                    self.current_offset += 64;
                }
                None => {
                    // The run of set bits extends to the end of the mask
                    return Some((
                        (start_chunk + start_bit as i64) as usize,
                        std::mem::replace(&mut self.len, 0),
                    ));
                }
            }
        }
    }
}

/// An iterator over the `usize` indexes at which a provided bitmask is set
///
/// This provides the best performance on most masks, apart from those which contain
/// large runs and therefore favour [`BitSliceIterator`]
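///
/// # Example
///
/// A minimal usage sketch; it assumes this module is exported from the crate
/// root as `arrow_buffer::bit_iterator`:
///
/// ```
/// use arrow_buffer::bit_iterator::BitIndexIterator;
///
/// // Bits 0, 2 and 5 are set
/// let mask = &[0b00100101];
/// let indexes: Vec<_> = BitIndexIterator::new(mask, 0, 8).collect();
/// assert_eq!(indexes, vec![0, 2, 5]);
/// ```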
#[derive(Debug)]
pub struct BitIndexIterator<'a> {
    current_chunk: u64,
    chunk_offset: i64,
    iter: UnalignedBitChunkIterator<'a>,
}

impl<'a> BitIndexIterator<'a> {
    /// Create a new [`BitIndexIterator`] from the provided `buffer`,
    /// and `offset` and `len` in bits
    pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
        let chunks = UnalignedBitChunk::new(buffer, offset, len);
        let mut iter = chunks.iter();

        let current_chunk = iter.next().unwrap_or(0);
        let chunk_offset = -(chunks.lead_padding() as i64);

        Self {
            current_chunk,
            chunk_offset,
            iter,
        }
    }
}

impl Iterator for BitIndexIterator<'_> {
    type Item = usize;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if self.current_chunk != 0 {
                let bit_pos = self.current_chunk.trailing_zeros();
                // Clear the bit about to be yielded
                self.current_chunk ^= 1 << bit_pos;
                return Some((self.chunk_offset + bit_pos as i64) as usize);
            }

            self.current_chunk = self.iter.next()?;
            self.chunk_offset += 64;
        }
    }
}

/// Calls the provided closure for each index in the provided null mask that is set,
/// i.e. for each valid index, using an adaptive strategy based on the null count
///
/// Ideally this would be encapsulated in an [`Iterator`] that would determine the optimal
/// strategy up front, and then yield indexes based on this.
///
/// Unfortunately, external iteration based on the resulting [`Iterator`] would match on the
/// strategy variant on each call to [`Iterator::next`], and LLVM generally cannot eliminate this.
///
/// One solution to this might be internal iteration, e.g. [`Iterator::try_fold`], however,
/// it is currently [not possible] to override this for custom iterators in stable Rust.
///
/// As such this is the next best option.
///
/// [not possible]: https://github.com/rust-lang/rust/issues/69595
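///
/// # Example
///
/// A minimal usage sketch; it assumes this function is exported from the crate
/// root as `arrow_buffer::bit_iterator::try_for_each_valid_idx`:
///
/// ```
/// use arrow_buffer::bit_iterator::try_for_each_valid_idx;
///
/// // Bits 0 and 2 are valid, bits 1 and 3 are null
/// let nulls = &[0b00000101];
/// let mut valid = vec![];
/// try_for_each_valid_idx::<(), _>(4, 0, 2, Some(nulls), |idx| {
///     valid.push(idx);
///     Ok(())
/// })
/// .unwrap();
/// assert_eq!(valid, vec![0, 2]);
/// ```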
#[inline]
pub fn try_for_each_valid_idx<E, F: FnMut(usize) -> Result<(), E>>(
    len: usize,
    offset: usize,
    null_count: usize,
    nulls: Option<&[u8]>,
    f: F,
) -> Result<(), E> {
    let valid_count = len - null_count;

    if valid_count == len {
        // No nulls: every index is valid
        (0..len).try_for_each(f)
    } else if null_count != len {
        // Mixed: walk only the set bits of the null mask
        BitIndexIterator::new(nulls.unwrap(), offset, len).try_for_each(f)
    } else {
        // All null: nothing to visit
        Ok(())
    }
}

// Note: further tests located in arrow_select::filter module

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_bit_iterator_size_hint() {
        let mut b = BitIterator::new(&[0b00000011], 0, 2);
        assert_eq!(
            b.size_hint(),
            (2, Some(2)),
            "Expected size_hint to be (2, Some(2))"
        );

        b.next();
        assert_eq!(
            b.size_hint(),
            (1, Some(1)),
            "Expected size_hint to be (1, Some(1)) after one bit consumed"
        );

        b.next();
        assert_eq!(
            b.size_hint(),
            (0, Some(0)),
            "Expected size_hint to be (0, Some(0)) after all bits consumed"
        );
    }

    #[test]
    fn test_bit_iterator() {
        let mask = &[0b00010010, 0b00100011, 0b00000101, 0b00010001, 0b10010011];
        let actual: Vec<_> = BitIterator::new(mask, 0, 5).collect();
        assert_eq!(actual, &[false, true, false, false, true]);

        let actual: Vec<_> = BitIterator::new(mask, 4, 5).collect();
        assert_eq!(actual, &[true, false, false, false, true]);

        let actual: Vec<_> = BitIterator::new(mask, 12, 14).collect();
        assert_eq!(
            actual,
            &[
                false, true, false, false, true, false, true, false, false, false, false, false,
                true, false
            ]
        );

        assert_eq!(BitIterator::new(mask, 0, 0).count(), 0);
        assert_eq!(BitIterator::new(mask, 40, 0).count(), 0);
    }

    #[test]
    #[should_panic(expected = "BitIterator buffer too small, expected 3 got 2")]
    fn test_bit_iterator_bounds() {
        let mask = &[223, 23];
        BitIterator::new(mask, 17, 0);
    }
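
    // Illustrative addition: exercises the `DoubleEndedIterator` impl above,
    // checking that reverse iteration yields the same bits as forward
    // iteration, in the opposite order
    #[test]
    fn test_bit_iterator_rev() {
        let mask = &[0b00010010];
        let forward: Vec<_> = BitIterator::new(mask, 0, 5).collect();
        let mut reverse: Vec<_> = BitIterator::new(mask, 0, 5).rev().collect();
        reverse.reverse();
        assert_eq!(forward, &[false, true, false, false, true]);
        assert_eq!(reverse, forward);
    }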
}