arrow_buffer/buffer/
null.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::bit_iterator::{BitIndexIterator, BitIterator, BitSliceIterator};
19use crate::buffer::BooleanBuffer;
20use crate::{Buffer, MutableBuffer};
21
22/// A [`BooleanBuffer`] used to encode validity for arrow arrays
23///
24/// As per the [Arrow specification], array validity is encoded in a packed bitmask with a
25/// `true` value indicating the corresponding slot is not null, and `false` indicating
26/// that it is null.
27///
28/// [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps
29#[derive(Debug, Clone, Eq, PartialEq)]
30pub struct NullBuffer {
31    buffer: BooleanBuffer,
32    null_count: usize,
33}
34
35impl NullBuffer {
36    /// Create a new [`NullBuffer`] computing the null count
37    pub fn new(buffer: BooleanBuffer) -> Self {
38        let null_count = buffer.len() - buffer.count_set_bits();
39        Self { buffer, null_count }
40    }
41
42    /// Create a new [`NullBuffer`] of length `len` where all values are null
43    pub fn new_null(len: usize) -> Self {
44        Self {
45            buffer: BooleanBuffer::new_unset(len),
46            null_count: len,
47        }
48    }
49
50    /// Create a new [`NullBuffer`] of length `len` where all values are valid
51    ///
52    /// Note: it is more efficient to not set the null buffer if it is known to be all valid
53    pub fn new_valid(len: usize) -> Self {
54        Self {
55            buffer: BooleanBuffer::new_set(len),
56            null_count: 0,
57        }
58    }
59
60    /// Create a new [`NullBuffer`] with the provided `buffer` and `null_count`
61    ///
62    /// # Safety
63    ///
64    /// `buffer` must contain `null_count` `0` bits
65    pub unsafe fn new_unchecked(buffer: BooleanBuffer, null_count: usize) -> Self {
66        Self { buffer, null_count }
67    }
68
69    /// Computes the union of the nulls in two optional [`NullBuffer`]
70    ///
71    /// This is commonly used by binary operations where the result is NULL if either
72    /// of the input values is NULL. Handling the null mask separately in this way
73    /// can yield significant performance improvements over an iterator approach
74    pub fn union(lhs: Option<&NullBuffer>, rhs: Option<&NullBuffer>) -> Option<NullBuffer> {
75        match (lhs, rhs) {
76            (Some(lhs), Some(rhs)) => Some(Self::new(lhs.inner() & rhs.inner())),
77            (Some(n), None) | (None, Some(n)) => Some(n.clone()),
78            (None, None) => None,
79        }
80    }
81
82    /// Returns true if all nulls in `other` also exist in self
83    pub fn contains(&self, other: &NullBuffer) -> bool {
84        if other.null_count == 0 {
85            return true;
86        }
87        let lhs = self.inner().bit_chunks().iter_padded();
88        let rhs = other.inner().bit_chunks().iter_padded();
89        lhs.zip(rhs).all(|(l, r)| (l & !r) == 0)
90    }
91
92    /// Returns a new [`NullBuffer`] where each bit in the current null buffer
93    /// is repeated `count` times. This is useful for masking the nulls of
94    /// the child of a FixedSizeListArray based on its parent
95    pub fn expand(&self, count: usize) -> Self {
96        let capacity = self.buffer.len().checked_mul(count).unwrap();
97        let mut buffer = MutableBuffer::new_null(capacity);
98
99        // Expand each bit within `null_mask` into `element_len`
100        // bits, constructing the implicit mask of the child elements
101        for i in 0..self.buffer.len() {
102            if self.is_null(i) {
103                continue;
104            }
105            for j in 0..count {
106                crate::bit_util::set_bit(buffer.as_mut(), i * count + j)
107            }
108        }
109        Self {
110            buffer: BooleanBuffer::new(buffer.into(), 0, capacity),
111            null_count: self.null_count * count,
112        }
113    }
114
115    /// Returns the length of this [`NullBuffer`]
116    #[inline]
117    pub fn len(&self) -> usize {
118        self.buffer.len()
119    }
120
121    /// Returns the offset of this [`NullBuffer`] in bits
122    #[inline]
123    pub fn offset(&self) -> usize {
124        self.buffer.offset()
125    }
126
127    /// Returns true if this [`NullBuffer`] is empty
128    #[inline]
129    pub fn is_empty(&self) -> bool {
130        self.buffer.is_empty()
131    }
132
133    /// Returns the null count for this [`NullBuffer`]
134    #[inline]
135    pub fn null_count(&self) -> usize {
136        self.null_count
137    }
138
139    /// Returns `true` if the value at `idx` is not null
140    #[inline]
141    pub fn is_valid(&self, idx: usize) -> bool {
142        self.buffer.value(idx)
143    }
144
145    /// Returns `true` if the value at `idx` is null
146    #[inline]
147    pub fn is_null(&self, idx: usize) -> bool {
148        !self.is_valid(idx)
149    }
150
151    /// Returns the packed validity of this [`NullBuffer`] not including any offset
152    #[inline]
153    pub fn validity(&self) -> &[u8] {
154        self.buffer.values()
155    }
156
157    /// Slices this [`NullBuffer`] by the provided `offset` and `length`
158    pub fn slice(&self, offset: usize, len: usize) -> Self {
159        Self::new(self.buffer.slice(offset, len))
160    }
161
162    /// Returns an iterator over the bits in this [`NullBuffer`]
163    ///
164    /// * `true` indicates that the corresponding value is not NULL
165    /// * `false` indicates that the corresponding value is NULL
166    ///
167    /// Note: [`Self::valid_indices`] will be significantly faster for most use-cases
168    pub fn iter(&self) -> BitIterator<'_> {
169        self.buffer.iter()
170    }
171
172    /// Returns a [`BitIndexIterator`] over the valid indices in this [`NullBuffer`]
173    ///
174    /// Valid indices indicate the corresponding value is not NULL
175    pub fn valid_indices(&self) -> BitIndexIterator<'_> {
176        self.buffer.set_indices()
177    }
178
179    /// Returns a [`BitSliceIterator`] yielding contiguous ranges of valid indices
180    ///
181    /// Valid indices indicate the corresponding value is not NULL
182    pub fn valid_slices(&self) -> BitSliceIterator<'_> {
183        self.buffer.set_slices()
184    }
185
186    /// Calls the provided closure for each index in this null mask that is set
187    #[inline]
188    pub fn try_for_each_valid_idx<E, F: FnMut(usize) -> Result<(), E>>(
189        &self,
190        f: F,
191    ) -> Result<(), E> {
192        if self.null_count == self.len() {
193            return Ok(());
194        }
195        self.valid_indices().try_for_each(f)
196    }
197
198    /// Returns the inner [`BooleanBuffer`]
199    #[inline]
200    pub fn inner(&self) -> &BooleanBuffer {
201        &self.buffer
202    }
203
204    /// Returns the inner [`BooleanBuffer`]
205    #[inline]
206    pub fn into_inner(self) -> BooleanBuffer {
207        self.buffer
208    }
209
210    /// Returns the underlying [`Buffer`]
211    #[inline]
212    pub fn buffer(&self) -> &Buffer {
213        self.buffer.inner()
214    }
215}
216
217impl<'a> IntoIterator for &'a NullBuffer {
218    type Item = bool;
219    type IntoIter = BitIterator<'a>;
220
221    fn into_iter(self) -> Self::IntoIter {
222        self.buffer.iter()
223    }
224}
225
226impl From<BooleanBuffer> for NullBuffer {
227    fn from(value: BooleanBuffer) -> Self {
228        Self::new(value)
229    }
230}
231
232impl From<&[bool]> for NullBuffer {
233    fn from(value: &[bool]) -> Self {
234        BooleanBuffer::from(value).into()
235    }
236}
237
238impl From<Vec<bool>> for NullBuffer {
239    fn from(value: Vec<bool>) -> Self {
240        BooleanBuffer::from(value).into()
241    }
242}
243
244impl FromIterator<bool> for NullBuffer {
245    fn from_iter<T: IntoIterator<Item = bool>>(iter: T) -> Self {
246        BooleanBuffer::from_iter(iter).into()
247    }
248}
249
250#[cfg(test)]
251mod tests {
252    use super::*;
253    #[test]
254    fn test_size() {
255        // This tests that the niche optimisation eliminates the overhead of an option
256        assert_eq!(
257            std::mem::size_of::<NullBuffer>(),
258            std::mem::size_of::<Option<NullBuffer>>()
259        );
260    }
261}