roaring/bitmap/
serialization.rs

1use crate::bitmap::container::{Container, ARRAY_LIMIT};
2use crate::bitmap::store::{
3    ArrayStore, BitmapStore, Interval, Store, BITMAP_BYTES, BITMAP_LENGTH, RUN_ELEMENT_BYTES,
4    RUN_NUM_BYTES,
5};
6use crate::RoaringBitmap;
7use bytemuck::cast_slice_mut;
8use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
9use core::convert::Infallible;
10use std::error::Error;
11use std::io;
12
13use super::store::IntervalStore;
14
15pub(crate) const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
16pub(crate) const SERIAL_COOKIE: u16 = 12347;
17pub(crate) const NO_OFFSET_THRESHOLD: usize = 4;
18
19// Sizes of header structures
20pub(crate) const COOKIE_BYTES: usize = 4;
21pub(crate) const SIZE_BYTES: usize = 4;
22pub(crate) const DESCRIPTION_BYTES: usize = 4;
23pub(crate) const OFFSET_BYTES: usize = 4;
24
25impl RoaringBitmap {
26    /// Return the size in bytes of the serialized output.
27    /// This is compatible with the official C/C++, Java and Go implementations.
28    ///
29    /// # Examples
30    ///
31    /// ```rust
32    /// use roaring::RoaringBitmap;
33    ///
34    /// let rb1: RoaringBitmap = (1..4).collect();
35    /// let mut bytes = Vec::with_capacity(rb1.serialized_size());
36    /// rb1.serialize_into(&mut bytes).unwrap();
37    /// let rb2 = RoaringBitmap::deserialize_from(&bytes[..]).unwrap();
38    ///
39    /// assert_eq!(rb1, rb2);
40    /// ```
41    pub fn serialized_size(&self) -> usize {
42        let mut has_run_containers = false;
43        let size = self.containers.len();
44        let container_sizes: usize = self
45            .containers
46            .iter()
47            .map(|container| match container.store {
48                Store::Array(ref values) => values.byte_size(),
49                Store::Bitmap(..) => BITMAP_BYTES,
50                Store::Run(ref intervals) => {
51                    has_run_containers = true;
52                    intervals.byte_size()
53                }
54            })
55            .sum();
56
57        // header + container sizes
58        header_size(size, has_run_containers) + container_sizes
59    }
60
61    /// Serialize this bitmap into [the standard Roaring on-disk format][format].
62    /// This is compatible with the official C/C++, Java and Go implementations.
63    ///
64    /// [format]: https://github.com/RoaringBitmap/RoaringFormatSpec
65    ///
66    /// # Examples
67    ///
68    /// ```rust
69    /// use roaring::RoaringBitmap;
70    ///
71    /// let rb1: RoaringBitmap = (1..4).collect();
72    /// let mut bytes = vec![];
73    /// rb1.serialize_into(&mut bytes).unwrap();
74    /// let rb2 = RoaringBitmap::deserialize_from(&bytes[..]).unwrap();
75    ///
76    /// assert_eq!(rb1, rb2);
77    /// ```
78    pub fn serialize_into<W: io::Write>(&self, mut writer: W) -> io::Result<()> {
79        let has_run_containers = self.containers.iter().any(|c| matches!(c.store, Store::Run(_)));
80        let size = self.containers.len();
81
82        // Depending on if run containers are present or not write the appropriate header
83        if has_run_containers {
84            // The new format stores the container count in the most significant bits of the header
85            let cookie = SERIAL_COOKIE as u32 | ((size as u32 - 1) << 16);
86            writer.write_u32::<LittleEndian>(cookie)?;
87            // It is then followed by a bitset indicating which containers are run containers
88            let run_container_bitmap_size = size.div_ceil(8);
89            let mut run_container_bitmap = vec![0; run_container_bitmap_size];
90            for (i, container) in self.containers.iter().enumerate() {
91                if let Store::Run(_) = container.store {
92                    run_container_bitmap[i / 8] |= 1 << (i % 8);
93                }
94            }
95            writer.write_all(&run_container_bitmap)?;
96        } else {
97            // Write old format, cookie followed by container count
98            writer.write_u32::<LittleEndian>(SERIAL_COOKIE_NO_RUNCONTAINER)?;
99            writer.write_u32::<LittleEndian>(size as u32)?;
100        }
101
102        // Write the container descriptions
103        for container in &self.containers {
104            writer.write_u16::<LittleEndian>(container.key)?;
105            writer.write_u16::<LittleEndian>((container.len() - 1) as u16)?;
106        }
107
108        let mut offset = header_size(size, has_run_containers) as u32;
109        let has_offsets = if has_run_containers { size >= OFFSET_BYTES } else { true };
110        if has_offsets {
111            for container in &self.containers {
112                writer.write_u32::<LittleEndian>(offset)?;
113                match container.store {
114                    Store::Array(ref values) => {
115                        offset += values.len() as u32 * 2;
116                    }
117                    Store::Bitmap(..) => {
118                        offset += 8 * 1024;
119                    }
120                    Store::Run(ref intervals) => {
121                        offset += (RUN_NUM_BYTES
122                            + (intervals.run_amount() as usize * RUN_ELEMENT_BYTES))
123                            as u32;
124                    }
125                }
126            }
127        }
128
129        for container in &self.containers {
130            match container.store {
131                Store::Array(ref values) => {
132                    for &value in values.iter() {
133                        writer.write_u16::<LittleEndian>(value)?;
134                    }
135                }
136                Store::Bitmap(ref bits) => {
137                    for &value in bits.as_array() {
138                        writer.write_u64::<LittleEndian>(value)?;
139                    }
140                }
141                Store::Run(ref intervals) => {
142                    writer.write_u16::<LittleEndian>(intervals.run_amount() as u16)?;
143                    for iv in intervals.iter_intervals() {
144                        writer.write_u16::<LittleEndian>(iv.start())?;
145                        writer.write_u16::<LittleEndian>(iv.end() - iv.start())?;
146                    }
147                }
148            }
149        }
150
151        Ok(())
152    }
153
154    /// Deserialize a bitmap into memory from [the standard Roaring on-disk
155    /// format][format]. This is compatible with the official C/C++, Java and
156    /// Go implementations. This method checks that all of the internal values
157    /// are valid. If deserializing from a trusted source consider
158    /// [RoaringBitmap::deserialize_unchecked_from]
159    ///
160    /// [format]: https://github.com/RoaringBitmap/RoaringFormatSpec
161    ///
162    /// # Examples
163    ///
164    /// ```rust
165    /// use roaring::RoaringBitmap;
166    ///
167    /// let rb1: RoaringBitmap = (1..4).collect();
168    /// let mut bytes = vec![];
169    /// rb1.serialize_into(&mut bytes).unwrap();
170    /// let rb2 = RoaringBitmap::deserialize_from(&bytes[..]).unwrap();
171    ///
172    /// assert_eq!(rb1, rb2);
173    /// ```
174    pub fn deserialize_from<R: io::Read>(reader: R) -> io::Result<RoaringBitmap> {
175        RoaringBitmap::deserialize_from_impl(reader, ArrayStore::try_from, BitmapStore::try_from)
176    }
177
178    /// Deserialize a bitmap into memory from [the standard Roaring on-disk
179    /// format][format]. This is compatible with the official C/C++, Java and
180    /// Go implementations. This method is memory safe but will not check if
181    /// the data is a valid bitmap.
182    ///
183    /// [format]: https://github.com/RoaringBitmap/RoaringFormatSpec
184    ///
185    /// # Examples
186    ///
187    /// ```rust
188    /// use roaring::RoaringBitmap;
189    ///
190    /// let rb1: RoaringBitmap = (1..4).collect();
191    /// let mut bytes = vec![];
192    /// rb1.serialize_into(&mut bytes).unwrap();
193    /// let rb2 = RoaringBitmap::deserialize_unchecked_from(&bytes[..]).unwrap();
194    ///
195    /// assert_eq!(rb1, rb2);
196    /// ```
197    pub fn deserialize_unchecked_from<R: io::Read>(reader: R) -> io::Result<RoaringBitmap> {
198        RoaringBitmap::deserialize_from_impl::<R, _, Infallible, _, Infallible>(
199            reader,
200            |values| Ok(ArrayStore::from_vec_unchecked(values)),
201            |len, values| Ok(BitmapStore::from_unchecked(len, values)),
202        )
203    }
204
205    fn deserialize_from_impl<R, A, AErr, B, BErr>(
206        mut reader: R,
207        a: A,
208        b: B,
209    ) -> io::Result<RoaringBitmap>
210    where
211        R: io::Read,
212        A: Fn(Vec<u16>) -> Result<ArrayStore, AErr>,
213        AErr: Error + Send + Sync + 'static,
214        B: Fn(u64, Box<[u64; 1024]>) -> Result<BitmapStore, BErr>,
215        BErr: Error + Send + Sync + 'static,
216    {
217        // First read the cookie to determine which version of the format we are reading
218        let (size, has_offsets, has_run_containers) = {
219            let cookie = reader.read_u32::<LittleEndian>()?;
220            if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
221                (reader.read_u32::<LittleEndian>()? as usize, true, false)
222            } else if (cookie as u16) == SERIAL_COOKIE {
223                let size = ((cookie >> 16) + 1) as usize;
224                (size, size >= NO_OFFSET_THRESHOLD, true)
225            } else {
226                return Err(io::Error::other("unknown cookie value"));
227            }
228        };
229
230        // Read the run container bitmap if necessary
231        let run_container_bitmap = if has_run_containers {
232            let mut bitmap = vec![0u8; size.div_ceil(8)];
233            reader.read_exact(&mut bitmap)?;
234            Some(bitmap)
235        } else {
236            None
237        };
238
239        if size > u16::MAX as usize + 1 {
240            return Err(io::Error::other("size is greater than supported"));
241        }
242
243        // Read the container descriptions
244        let mut description_bytes = vec![0u8; size * DESCRIPTION_BYTES];
245        reader.read_exact(&mut description_bytes)?;
246        let mut description_bytes = &description_bytes[..];
247
248        if has_offsets {
249            let mut offsets = vec![0u8; size * OFFSET_BYTES];
250            reader.read_exact(&mut offsets)?;
251            drop(offsets); // Not useful when deserializing into memory
252        }
253
254        let mut containers = Vec::with_capacity(size);
255
256        let mut last_key = None::<u16>;
257        // Read each container
258        for i in 0..size {
259            let key = description_bytes.read_u16::<LittleEndian>()?;
260            if let Some(last_key) = last_key.replace(key) {
261                if key <= last_key {
262                    return Err(io::Error::new(
263                        io::ErrorKind::InvalidData,
264                        "container keys are not sorted",
265                    ));
266                }
267            }
268            let cardinality = u64::from(description_bytes.read_u16::<LittleEndian>()?) + 1;
269
270            // If the run container bitmap is present, check if this container is a run container
271            let is_run_container =
272                run_container_bitmap.as_ref().is_some_and(|bm| bm[i / 8] & (1 << (i % 8)) != 0);
273
274            let store = if is_run_container {
275                let runs = reader.read_u16::<LittleEndian>()?;
276                if runs == 0 {
277                    return Err(io::Error::new(
278                        io::ErrorKind::InvalidData,
279                        "run container with zero runs",
280                    ));
281                }
282                let mut intervals = vec![[0, 0]; runs as usize];
283                reader.read_exact(cast_slice_mut(&mut intervals))?;
284                intervals.iter_mut().for_each(|[s, len]| {
285                    *s = u16::from_le(*s);
286                    *len = u16::from_le(*len);
287                });
288
289                let mut last_end = None::<u16>;
290                let store = IntervalStore::from_vec_unchecked(
291                    intervals
292                        .into_iter()
293                        .map(|[s, len]| -> Result<Interval, io::ErrorKind> {
294                            let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?;
295                            if let Some(last_end) = last_end.replace(end) {
296                                if s <= last_end.saturating_add(1) {
297                                    // Range overlaps or would be contiguous with the previous range
298                                    return Err(io::ErrorKind::InvalidData);
299                                }
300                            }
301                            Ok(Interval::new_unchecked(s, end))
302                        })
303                        .collect::<Result<_, _>>()?,
304                );
305                Store::Run(store)
306            } else if cardinality <= ARRAY_LIMIT {
307                let mut values = vec![0; cardinality as usize];
308                reader.read_exact(cast_slice_mut(&mut values))?;
309                values.iter_mut().for_each(|n| *n = u16::from_le(*n));
310                let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
311                Store::Array(array)
312            } else {
313                let mut values = Box::new([0; BITMAP_LENGTH]);
314                reader.read_exact(cast_slice_mut(&mut values[..]))?;
315                values.iter_mut().for_each(|n| *n = u64::from_le(*n));
316                let bitmap = b(cardinality, values)
317                    .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
318                Store::Bitmap(bitmap)
319            };
320
321            containers.push(Container { key, store });
322        }
323
324        Ok(RoaringBitmap { containers })
325    }
326}
327
328fn header_size(size: usize, has_run_containers: bool) -> usize {
329    if has_run_containers {
330        // New format encodes the size (number of containers) into the 4 byte cookie
331        // Additionally a bitmap is included marking which containers are run containers
332        let run_container_bitmap_size = size.div_ceil(8);
333        // New format conditionally includes offsets if there are 4 or more containers
334        if size >= NO_OFFSET_THRESHOLD {
335            COOKIE_BYTES + ((DESCRIPTION_BYTES + OFFSET_BYTES) * size) + run_container_bitmap_size
336        } else {
337            COOKIE_BYTES + (DESCRIPTION_BYTES * size) + run_container_bitmap_size
338        }
339    } else {
340        // Old format encodes cookie followed by container count
341        // It also always includes the offsets
342        COOKIE_BYTES + SIZE_BYTES + ((DESCRIPTION_BYTES + OFFSET_BYTES) * size)
343    }
344}
345
346#[cfg(test)]
347mod test {
348    use crate::{bitmap::store::BITMAP_LENGTH, RoaringBitmap};
349    use proptest::prelude::*;
350
351    proptest! {
352        #[test]
353        fn test_serialization(
354            bitmap in RoaringBitmap::arbitrary(),
355        ) {
356            let mut buffer = Vec::new();
357            bitmap.serialize_into(&mut buffer).unwrap();
358            prop_assert_eq!(bitmap, RoaringBitmap::deserialize_from(buffer.as_slice()).unwrap());
359        }
360    }
361
362    #[test]
363    fn test_from_lsb0_bytes() {
364        const CONTAINER_OFFSET: u32 = u64::BITS * BITMAP_LENGTH as u32;
365        const CONTAINER_OFFSET_IN_BYTES: u32 = CONTAINER_OFFSET / 8;
366        let mut bytes = vec![0xff; CONTAINER_OFFSET_IN_BYTES as usize];
367        bytes.extend([0x00; CONTAINER_OFFSET_IN_BYTES as usize]);
368        bytes.extend([0b00000001, 0b00000010, 0b00000011, 0b00000100]);
369
370        let offset = 32;
371        let rb = RoaringBitmap::from_lsb0_bytes(offset, &bytes);
372        for i in 0..offset {
373            assert!(!rb.contains(i), "{i} should not be in the bitmap");
374        }
375        for i in offset..offset + CONTAINER_OFFSET {
376            assert!(rb.contains(i), "{i} should be in the bitmap");
377        }
378        for i in offset + CONTAINER_OFFSET..offset + CONTAINER_OFFSET * 2 {
379            assert!(!rb.contains(i), "{i} should not be in the bitmap");
380        }
381        for bit in [0, 9, 16, 17, 26] {
382            let i = bit + offset + CONTAINER_OFFSET * 2;
383            assert!(rb.contains(i), "{i} should be in the bitmap");
384        }
385
386        assert_eq!(rb.len(), CONTAINER_OFFSET as u64 + 5);
387
388        // Ensure the empty container is not created
389        let mut bytes = vec![0x00u8; CONTAINER_OFFSET_IN_BYTES as usize];
390        bytes.extend([0xff]);
391        let rb = RoaringBitmap::from_lsb0_bytes(0, &bytes);
392        assert_eq!(rb.min(), Some(CONTAINER_OFFSET));
393
394        let rb = RoaringBitmap::from_lsb0_bytes(8, &bytes);
395        assert_eq!(rb.min(), Some(CONTAINER_OFFSET + 8));
396
397        // Ensure we can set the last byte in an array container
398        let bytes = [0x80];
399        let rb = RoaringBitmap::from_lsb0_bytes(0xFFFFFFF8, &bytes);
400        assert_eq!(rb.len(), 1);
401        assert!(rb.contains(u32::MAX));
402
403        // Ensure we can set the last byte in a bitmap container
404        let bytes = vec![0xFF; 0x1_0000 / 8];
405        let rb = RoaringBitmap::from_lsb0_bytes(0xFFFF0000, &bytes);
406        assert_eq!(rb.len(), 0x1_0000);
407        assert!(rb.contains(u32::MAX));
408    }
409
410    #[test]
411    fn test_from_lsb0_bytes_not_multiple_of_8() {
412        const CONTAINER_OFFSET: u32 = u64::BITS * BITMAP_LENGTH as u32;
413        const CONTAINER_OFFSET_IN_BYTES: u32 = CONTAINER_OFFSET / 8;
414
415        let mut bytes = vec![0b0101_1001];
416        bytes.extend([0x00; CONTAINER_OFFSET_IN_BYTES as usize]);
417        bytes.extend([0b00000001, 0b00000010, 0b00000011, 0b00000100]);
418
419        let mut indices = vec![0, 3, 4, 6];
420        indices.extend([0, 9, 16, 17, 26].map(|i| 8 + CONTAINER_OFFSET + i));
421
422        for offset in 0..8 {
423            let rb = RoaringBitmap::from_lsb0_bytes(offset, &bytes);
424            for i in indices.iter().map(|&i| i + offset) {
425                assert!(rb.contains(i), "{i} should be in the bitmap");
426            }
427        }
428    }
429
430    #[test]
431    #[should_panic(expected = "<= 2^32")]
432    fn test_from_lsb0_bytes_overflow() {
433        let bytes = [0x01, 0x01];
434        RoaringBitmap::from_lsb0_bytes(u32::MAX - 7, &bytes);
435    }
436
437    #[test]
438    fn test_deserialize_overflow_s_plus_len() {
439        let data = vec![59, 48, 0, 0, 255, 130, 254, 59, 48, 2, 0, 41, 255, 255, 166, 197, 4, 0, 2];
440        let res = RoaringBitmap::deserialize_from(data.as_slice());
441        assert!(res.is_err());
442    }
443}