Skip to main content

mz_ore/pager/
swap.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License in the LICENSE file at the
6// root of this repository, or online at
7//
8//     http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16//! Swap backend for the pager. See `mz_ore::pager` for the public API.
17
18use crate::pager::Handle;
19
/// Backing store of a swap-backed handle: the owned chunks together with a
/// prefix-sum index over their lengths.
#[derive(Debug)]
pub(crate) struct SwapInner {
    /// The owned chunks. The logical contents are these chunks concatenated
    /// in this order.
    pub(crate) chunks: Vec<Vec<u64>>,
    /// Running element totals: `prefix[i]` is the combined length of
    /// `chunks[..i]`. As built by `new`, `prefix[0] == 0` and the final entry
    /// equals the total element count.
    pub(crate) prefix: Vec<usize>,
}

impl SwapInner {
    /// Takes ownership of `chunks` and computes the cumulative length index.
    ///
    /// The resulting `prefix` has `chunks.len() + 1` entries and always starts
    /// with `0`, even when `chunks` is empty.
    pub(crate) fn new(chunks: Vec<Vec<u64>>) -> Self {
        let mut prefix = Vec::with_capacity(chunks.len() + 1);
        // Seed entry: zero elements precede the first chunk.
        prefix.push(0);
        prefix.extend(chunks.iter().scan(0usize, |running, chunk| {
            *running += chunk.len();
            Some(*running)
        }));
        Self { chunks, prefix }
    }

    /// Returns the total number of elements across all chunks, i.e. the last
    /// entry of `prefix`.
    ///
    /// # Panics
    ///
    /// Panics if `prefix` is empty; `new` never produces that state because it
    /// always pushes the leading `0`.
    pub(crate) fn total_len(&self) -> usize {
        self.prefix
            .last()
            .copied()
            .expect("SwapInner::prefix invariant: at least [0]")
    }
}
50
51pub(crate) fn pageout_swap(chunks: &mut [Vec<u64>]) -> Handle {
52    let mut taken: Vec<Vec<u64>> = Vec::with_capacity(chunks.len());
53    for c in chunks.iter_mut() {
54        taken.push(std::mem::take(c));
55    }
56    for c in &taken {
57        madvise_cold(c);
58    }
59    Handle::from_swap(SwapInner::new(taken))
60}
61
/// Proactively asks the kernel to reclaim (swap out) the resident pages of
/// `bytes` via `MADV_PAGEOUT`, holding RSS at the caller's budget right now
/// rather than waiting for kernel LRU to reclaim under pressure the way
/// `pageout_swap`'s `MADV_COLD` hint does.
///
/// Unlike `pageout_swap`, this takes a borrow and does **not** transfer
/// ownership: the allocation stays addressable in the caller's address space,
/// so a later read simply re-faults the swapped-out pages back in. That suits a
/// buffer the caller must keep reachable — e.g. the column pager's
/// lz4-compressed bytes kept in memory — but still wants evicted eagerly so the
/// budget is real instead of a fiction the kernel only honors at the pressure
/// cliff.
///
/// Only whole pages lying entirely inside `bytes` are advised, so an empty or
/// sub-page slice results in no `madvise` call at all. The call is
/// best-effort: the result of `madvise` is not reported to the caller.
///
/// On non-Linux targets this is a no-op, as is the `MADV_COLD` hint.
pub fn advise_pageout(bytes: &[u8]) {
    madvise_pageout(bytes);
}
79
80#[cfg(target_os = "linux")]
81fn madvise_cold(chunk: &[u64]) {
82    // `Vec<u64>` cannot exceed `isize::MAX` bytes, so this multiplication
83    // cannot overflow on any supported target. Use `checked_mul` for
84    // defense-in-depth: a corrupted length should fail loudly, not wrap.
85    let Some(len_bytes) = chunk.len().checked_mul(std::mem::size_of::<u64>()) else {
86        return;
87    };
88    // SAFETY: `(ptr, len_bytes)` describes the live `&[u64]` exactly.
89    unsafe { madvise_aligned(chunk.as_ptr().cast::<u8>(), len_bytes, libc::MADV_COLD) }
90}
91
92#[cfg(target_os = "linux")]
93fn madvise_pageout(bytes: &[u8]) {
94    // SAFETY: `(ptr, len)` describes the live `&[u8]` exactly.
95    unsafe { madvise_aligned(bytes.as_ptr(), bytes.len(), libc::MADV_PAGEOUT) }
96}
97
98/// Issues `madvise(advice)` over the page-aligned interior of the byte range
99/// `[base_ptr, base_ptr + len_bytes)`. `madvise` operates at page granularity,
100/// so the start rounds up and the end rounds down to page boundaries; a range
101/// that contains no whole page is skipped so we never advise pages we only
102/// partially own.
103///
104/// # Safety
105///
106/// `base_ptr` must point to the start of a live allocation of at least
107/// `len_bytes` bytes that stays valid for the duration of the call. `advice`
108/// must be a non-mutating hint (`MADV_COLD`/`MADV_PAGEOUT`): both only change
109/// the kernel's reclaim decision and leave the bytes readable, so concurrent
110/// reads of the range remain sound.
111#[cfg(target_os = "linux")]
112unsafe fn madvise_aligned(base_ptr: *const u8, len_bytes: usize, advice: libc::c_int) {
113    if len_bytes == 0 {
114        return;
115    }
116    let page = page_size();
117    let base_addr = base_ptr.addr();
118    // Round the start up and the end down to page boundaries. Both additions
119    // use `checked_add` so that an allocation sitting near the top of the
120    // address space can never silently wrap into a tiny range.
121    let Some(start_unaligned) = base_addr.checked_add(page - 1) else {
122        return;
123    };
124    let Some(end_unaligned) = base_addr.checked_add(len_bytes) else {
125        return;
126    };
127    let aligned_start_addr = start_unaligned & !(page - 1);
128    let aligned_end_addr = end_unaligned & !(page - 1);
129    if aligned_end_addr <= aligned_start_addr {
130        return;
131    }
132    let aligned_len = aligned_end_addr - aligned_start_addr;
133    // SAFETY: `aligned_start_addr` lies in `[base_addr, base_addr + len_bytes]`
134    // by construction (rounding up the start cannot exceed `end_unaligned`,
135    // which equals `base_addr + len_bytes`; the early-return above guarantees
136    // `start ≤ end`). That interval is within the live allocation the caller
137    // promised, so `byte_add` stays in-bounds and preserves provenance.
138    let aligned_ptr = unsafe { base_ptr.byte_add(aligned_start_addr - base_addr) }
139        .cast::<libc::c_void>()
140        .cast_mut();
141    // SAFETY: pointer/length describe a fully page-aligned subrange contained
142    // within the live allocation (justified above). The caller guarantees
143    // `advice` is a non-mutating reclaim hint, so concurrent reads of the range
144    // remain sound.
145    unsafe {
146        libc::madvise(aligned_ptr, aligned_len, advice);
147    }
148}
149
/// Non-Linux stand-in for the `MADV_COLD` hint: deliberately does nothing, so
/// callers need no platform-specific code.
#[cfg(not(target_os = "linux"))]
fn madvise_cold(_chunk: &[u64]) {}
152
/// Non-Linux stand-in for the `MADV_PAGEOUT` hint: deliberately does nothing,
/// so callers need no platform-specific code.
#[cfg(not(target_os = "linux"))]
fn madvise_pageout(_bytes: &[u8]) {}
155
156#[cfg(target_os = "linux")]
157fn page_size() -> usize {
158    // SAFETY: `sysconf` with a valid argument is safe.
159    let raw = unsafe { libc::sysconf(libc::_SC_PAGESIZE) };
160    usize::try_from(raw).expect("page size is positive and fits usize")
161}
162
163pub(crate) fn read_at_swap(handle: &Handle, ranges: &[(usize, usize)], dst: &mut Vec<u64>) {
164    let inner = handle
165        .swap_inner()
166        .expect("read_at_swap called on non-swap handle");
167    let total = inner.total_len();
168    let total_out: usize = ranges.iter().map(|(_, l)| *l).sum();
169    dst.reserve(total_out);
170    for &(off, len) in ranges {
171        let end = off.checked_add(len).expect("range offset+len overflow");
172        assert!(
173            end <= total,
174            "read range out of bounds: {off}+{len} > {total}"
175        );
176        copy_range(inner, off, len, dst);
177    }
178}
179
180fn copy_range(inner: &SwapInner, off: usize, len: usize, dst: &mut Vec<u64>) {
181    if len == 0 {
182        return;
183    }
184    let mut remaining = len;
185    let mut cur = off;
186    let mut idx = match inner.prefix.binary_search(&cur) {
187        Ok(i) => i,
188        Err(i) => i.saturating_sub(1),
189    };
190    while remaining > 0 {
191        let chunk_start = inner.prefix[idx];
192        let chunk = &inner.chunks[idx];
193        let local = cur - chunk_start;
194        let take = std::cmp::min(remaining, chunk.len() - local);
195        dst.extend_from_slice(&chunk[local..local + take]);
196        cur += take;
197        remaining -= take;
198        idx += 1;
199    }
200}
201
202pub(crate) fn take_swap(handle: Handle, dst: &mut Vec<u64>) {
203    let inner = match handle.into_swap_inner() {
204        Some(s) => s,
205        None => panic!("take_swap called on non-swap handle"),
206    };
207    dst.clear();
208    let mut chunks = inner.chunks;
209    if chunks.len() == 1 && dst.capacity() == 0 {
210        let only = chunks.pop().unwrap();
211        *dst = only;
212        return;
213    }
214    let total: usize = chunks.iter().map(|c| c.len()).sum();
215    dst.reserve(total);
216    for c in chunks {
217        dst.extend_from_slice(&c);
218    }
219}
220
// Unit tests for the swap backend: handle construction via `pageout_swap`,
// ranged reads via `read_at_swap`, whole-handle extraction via `take_swap`,
// and the borrow-only `advise_pageout` hint.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pager::Handle;

    // Paging out must empty the caller's slots and report the combined length.
    #[mz_ore::test]
    fn pageout_takes_chunks_and_records_lengths() {
        let a = vec![1u64, 2, 3];
        let b = vec![4u64, 5];
        let mut chunks = [a, b];
        let h: Handle = pageout_swap(&mut chunks);
        assert_eq!(h.len(), 5);
        assert!(chunks[0].is_empty());
        assert!(chunks[1].is_empty());
    }

    // A range that lies entirely inside one chunk.
    #[mz_ore::test]
    fn read_at_within_single_chunk() {
        let mut chunks = [vec![10u64, 11, 12, 13, 14]];
        let h = pageout_swap(&mut chunks);
        let mut dst = Vec::new();
        read_at_swap(&h, &[(1, 3)], &mut dst);
        assert_eq!(dst, vec![11, 12, 13]);
    }

    // A range that starts in the first chunk and ends in the second.
    #[mz_ore::test]
    fn read_at_spans_chunks() {
        let mut chunks = [vec![1u64, 2, 3], vec![4, 5, 6]];
        let h = pageout_swap(&mut chunks);
        let mut dst = Vec::new();
        read_at_swap(&h, &[(2, 3)], &mut dst);
        assert_eq!(dst, vec![3, 4, 5]);
    }

    // Multiple ranges are appended to `dst` back to back, in request order.
    #[mz_ore::test]
    fn read_at_many_concats() {
        let mut chunks = [vec![1u64, 2, 3, 4, 5]];
        let h = pageout_swap(&mut chunks);
        let mut dst = Vec::new();
        read_at_swap(&h, &[(0, 2), (3, 2)], &mut dst);
        assert_eq!(dst, vec![1, 2, 4, 5]);
    }

    // A range ending past the total length must panic with "out of bounds".
    #[mz_ore::test]
    #[should_panic(expected = "out of bounds")]
    fn read_at_panics_on_oob() {
        let mut chunks = [vec![1u64, 2]];
        let h = pageout_swap(&mut chunks);
        let mut dst = Vec::new();
        read_at_swap(&h, &[(1, 5)], &mut dst);
    }

    // With one chunk and an unallocated `dst`, `take_swap` must hand back the
    // very same allocation (pointer equality), i.e. no copy is made.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // unsupported operation: can't call foreign function `madvise` on OS `linux`
    fn take_single_chunk_zero_copy() {
        let v = vec![100u64; 1024];
        let ptr_before = v.as_ptr();
        let mut chunks = [v];
        let h = pageout_swap(&mut chunks);
        let mut dst = Vec::new();
        take_swap(h, &mut dst);
        assert_eq!(dst.len(), 1024);
        assert_eq!(
            dst.as_ptr(),
            ptr_before,
            "single-chunk take should be zero-copy"
        );
    }

    // With several chunks, `take_swap` concatenates them in order.
    #[mz_ore::test]
    fn take_multi_chunk_concats() {
        let mut chunks = [vec![1u64, 2], vec![3, 4, 5]];
        let h = pageout_swap(&mut chunks);
        let mut dst = Vec::new();
        take_swap(h, &mut dst);
        assert_eq!(dst, vec![1, 2, 3, 4, 5]);
    }

    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // unsupported operation: can't call foreign function `madvise` on OS `linux`
    fn advise_pageout_leaves_bytes_readable() {
        // `MADV_PAGEOUT` is a reclaim hint: the bytes must remain addressable
        // and unchanged afterwards (a read re-faults the pages back in). Use a
        // multi-page buffer so the page-aligned interior is non-empty.
        let pattern = |i: usize| u8::try_from(i % 251).expect("0..251 fits in u8");
        let bytes: Vec<u8> = (0..64 * 1024).map(pattern).collect();
        advise_pageout(&bytes);
        // Re-read after the advice; contents are preserved.
        assert!(bytes.iter().enumerate().all(|(i, &b)| b == pattern(i)));
    }

    #[mz_ore::test]
    fn advise_pageout_empty_and_subpage_are_noops() {
        // Neither an empty slice nor a sub-page slice contains a whole page, so
        // both skip the syscall entirely; they must not panic.
        advise_pageout(&[]);
        advise_pageout(&[1u8, 2, 3, 4]);
    }
}
315        // both skip the syscall entirely; they must not panic.
316        advise_pageout(&[]);
317        advise_pageout(&[1u8, 2, 3, 4]);
318    }
319}