zip/read/
magic_finder.rs

1use std::io::{Read, Seek, SeekFrom};
2
3use memchr::memmem::{Finder, FinderRev};
4
5use crate::result::ZipResult;
6
7pub trait FinderDirection<'a> {
8    fn new(needle: &'a [u8]) -> Self;
9    fn reset_cursor(bounds: (u64, u64), window_size: usize) -> u64;
10    fn scope_window(window: &[u8], mid_window_offset: usize) -> (&[u8], usize);
11
12    fn needle(&self) -> &[u8];
13    fn find(&self, haystack: &[u8]) -> Option<usize>;
14    fn move_cursor(&self, cursor: u64, bounds: (u64, u64), window_size: usize) -> Option<u64>;
15    fn move_scope(&self, offset: usize) -> usize;
16}
17
18pub struct Forward<'a>(Finder<'a>);
19impl<'a> FinderDirection<'a> for Forward<'a> {
20    fn new(needle: &'a [u8]) -> Self {
21        Self(Finder::new(needle))
22    }
23
24    fn reset_cursor((start_inclusive, _): (u64, u64), _: usize) -> u64 {
25        start_inclusive
26    }
27
28    fn scope_window(window: &[u8], mid_window_offset: usize) -> (&[u8], usize) {
29        (&window[mid_window_offset..], mid_window_offset)
30    }
31
32    fn find(&self, haystack: &[u8]) -> Option<usize> {
33        self.0.find(haystack)
34    }
35
36    fn needle(&self) -> &[u8] {
37        self.0.needle()
38    }
39
40    fn move_cursor(&self, cursor: u64, bounds: (u64, u64), window_size: usize) -> Option<u64> {
41        let magic_overlap = self.needle().len().saturating_sub(1) as u64;
42        let next = cursor.saturating_add(window_size as u64 - magic_overlap);
43
44        if next >= bounds.1 {
45            None
46        } else {
47            Some(next)
48        }
49    }
50
51    fn move_scope(&self, offset: usize) -> usize {
52        offset + self.needle().len()
53    }
54}
55
56pub struct Backwards<'a>(FinderRev<'a>);
57impl<'a> FinderDirection<'a> for Backwards<'a> {
58    fn new(needle: &'a [u8]) -> Self {
59        Self(FinderRev::new(needle))
60    }
61
62    fn reset_cursor(bounds: (u64, u64), window_size: usize) -> u64 {
63        bounds
64            .1
65            .saturating_sub(window_size as u64)
66            .clamp(bounds.0, bounds.1)
67    }
68
69    fn scope_window(window: &[u8], mid_window_offset: usize) -> (&[u8], usize) {
70        (&window[..mid_window_offset], 0)
71    }
72
73    fn find(&self, haystack: &[u8]) -> Option<usize> {
74        self.0.rfind(haystack)
75    }
76
77    fn needle(&self) -> &[u8] {
78        self.0.needle()
79    }
80
81    fn move_cursor(&self, cursor: u64, bounds: (u64, u64), window_size: usize) -> Option<u64> {
82        let magic_overlap = self.needle().len().saturating_sub(1) as u64;
83
84        if cursor <= bounds.0 {
85            None
86        } else {
87            Some(
88                cursor
89                    .saturating_add(magic_overlap)
90                    .saturating_sub(window_size as u64)
91                    .clamp(bounds.0, bounds.1),
92            )
93        }
94    }
95
96    fn move_scope(&self, offset: usize) -> usize {
97        offset
98    }
99}
100
101/// A utility for finding magic symbols from the end of a seekable reader.
102///
103/// Can be repurposed to recycle the internal buffer.
104pub struct MagicFinder<Direction> {
105    buffer: Box<[u8]>,
106    pub(self) finder: Direction,
107    cursor: u64,
108    mid_buffer_offset: Option<usize>,
109    bounds: (u64, u64),
110}
111
112impl<'a, T: FinderDirection<'a>> MagicFinder<T> {
113    /// Create a new magic bytes finder to look within specific bounds.
114    pub fn new(magic_bytes: &'a [u8], start_inclusive: u64, end_exclusive: u64) -> Self {
115        const BUFFER_SIZE: usize = 2048;
116
117        // Smaller buffer size would be unable to locate bytes.
118        // Equal buffer size would stall (the window could not be moved).
119        debug_assert!(BUFFER_SIZE >= magic_bytes.len());
120
121        Self {
122            buffer: vec![0; BUFFER_SIZE].into_boxed_slice(),
123            finder: T::new(magic_bytes),
124            cursor: T::reset_cursor((start_inclusive, end_exclusive), BUFFER_SIZE),
125            mid_buffer_offset: None,
126            bounds: (start_inclusive, end_exclusive),
127        }
128    }
129
130    /// Repurpose the finder for different bytes or bounds.
131    pub fn repurpose(&mut self, magic_bytes: &'a [u8], bounds: (u64, u64)) -> &mut Self {
132        debug_assert!(self.buffer.len() >= magic_bytes.len());
133
134        self.finder = T::new(magic_bytes);
135        self.cursor = T::reset_cursor(bounds, self.buffer.len());
136        self.bounds = bounds;
137
138        // Reset the mid-buffer offset, to invalidate buffer content.
139        self.mid_buffer_offset = None;
140
141        self
142    }
143
144    /// Find the next magic bytes in the direction specified in the type.
145    pub fn next<R: Read + Seek>(&mut self, reader: &mut R) -> ZipResult<Option<u64>> {
146        loop {
147            if self.cursor < self.bounds.0 || self.cursor >= self.bounds.1 {
148                // The finder is consumed
149                break;
150            }
151
152            /* Position the window and ensure correct length */
153            let window_start = self.cursor;
154            let window_end = self
155                .cursor
156                .saturating_add(self.buffer.len() as u64)
157                .min(self.bounds.1);
158
159            if window_end <= window_start {
160                // Short-circuit on zero-sized windows to prevent loop
161                break;
162            }
163
164            let window = &mut self.buffer[..(window_end - window_start) as usize];
165
166            if self.mid_buffer_offset.is_none() {
167                reader.seek(SeekFrom::Start(window_start))?;
168                reader.read_exact(window)?;
169            }
170
171            let (window, window_start_offset) = match self.mid_buffer_offset {
172                Some(mid_buffer_offset) => T::scope_window(window, mid_buffer_offset),
173                None => (&*window, 0usize),
174            };
175
176            if let Some(offset) = self.finder.find(window) {
177                let magic_pos = window_start + window_start_offset as u64 + offset as u64;
178                reader.seek(SeekFrom::Start(magic_pos))?;
179
180                self.mid_buffer_offset = Some(self.finder.move_scope(window_start_offset + offset));
181
182                return Ok(Some(magic_pos));
183            }
184
185            self.mid_buffer_offset = None;
186
187            match self
188                .finder
189                .move_cursor(self.cursor, self.bounds, self.buffer.len())
190            {
191                Some(new_cursor) => {
192                    self.cursor = new_cursor;
193                }
194                None => {
195                    // Destroy the finder when we've reached the end of the bounds.
196                    self.bounds.0 = self.bounds.1;
197                    break;
198                }
199            }
200        }
201
202        Ok(None)
203    }
204}
205
206/// A magic bytes finder with an optimistic guess that is tried before
207/// the inner finder begins searching from end. This enables much faster
208/// lookup in files without appended junk, because the magic bytes will be
209/// found directly.
210///
211/// The guess can be marked as mandatory to produce an error. This is useful
212/// if the ArchiveOffset is known and auto-detection is not desired.
213pub struct OptimisticMagicFinder<Direction> {
214    inner: MagicFinder<Direction>,
215    initial_guess: Option<(u64, bool)>,
216}
217
218/// This is a temporary restriction, to avoid heap allocation in [`Self::next_back`].
219///
220/// We only use magic bytes of size 4 at the moment.
221const STACK_BUFFER_SIZE: usize = 8;
222
223impl<'a, Direction: FinderDirection<'a>> OptimisticMagicFinder<Direction> {
224    /// Create a new empty optimistic magic bytes finder.
225    pub fn new_empty() -> Self {
226        Self {
227            inner: MagicFinder::new(&[], 0, 0),
228            initial_guess: None,
229        }
230    }
231
232    /// Repurpose the finder for different bytes, bounds and initial guesses.
233    pub fn repurpose(
234        &mut self,
235        magic_bytes: &'a [u8],
236        bounds: (u64, u64),
237        initial_guess: Option<(u64, bool)>,
238    ) -> &mut Self {
239        debug_assert!(magic_bytes.len() <= STACK_BUFFER_SIZE);
240
241        self.inner.repurpose(magic_bytes, bounds);
242        self.initial_guess = initial_guess;
243
244        self
245    }
246
247    /// Equivalent to `next_back`, with an optional initial guess attempted before
248    /// proceeding with reading from the back of the reader.
249    pub fn next<R: Read + Seek>(&mut self, reader: &mut R) -> ZipResult<Option<u64>> {
250        if let Some((v, mandatory)) = self.initial_guess {
251            reader.seek(SeekFrom::Start(v))?;
252
253            let mut buffer = [0; STACK_BUFFER_SIZE];
254            let buffer = &mut buffer[..self.inner.finder.needle().len()];
255
256            // Attempt to match only if there's enough space for the needle
257            if v.saturating_add(buffer.len() as u64) <= self.inner.bounds.1 {
258                reader.read_exact(buffer)?;
259
260                // If a match is found, yield it.
261                if self.inner.finder.needle() == buffer {
262                    self.initial_guess.take();
263                    reader.seek(SeekFrom::Start(v))?;
264                    return Ok(Some(v));
265                }
266            }
267
268            // If a match is not found, but the initial guess was mandatory, return an error.
269            if mandatory {
270                return Ok(None);
271            }
272
273            // If the initial guess was not mandatory, remove it, as it was not found.
274            self.initial_guess.take();
275        }
276
277        self.inner.next(reader)
278    }
279}