1use crate::pager::Handle;
19
/// Owned chunk storage plus a running element-count index over those chunks.
#[derive(Debug)]
pub(crate) struct SwapInner {
    /// The stored chunks, in logical order.
    pub(crate) chunks: Vec<Vec<u64>>,
    /// Cumulative element counts as built by [`SwapInner::new`]: entry `i` is
    /// the number of elements that precede `chunks[i]`, and the final entry
    /// is the total element count. Holds `chunks.len() + 1` entries and
    /// always starts with `0`.
    pub(crate) prefix: Vec<usize>,
}

impl SwapInner {
    /// Takes ownership of `chunks` and computes the cumulative-length index.
    pub(crate) fn new(chunks: Vec<Vec<u64>>) -> Self {
        // One slot per chunk plus the leading zero, so no reallocation below.
        let mut prefix = Vec::with_capacity(chunks.len() + 1);
        prefix.push(0);
        prefix.extend(chunks.iter().scan(0usize, |running, chunk| {
            *running += chunk.len();
            Some(*running)
        }));
        Self { chunks, prefix }
    }

    /// Total number of `u64` elements across all chunks.
    ///
    /// # Panics
    ///
    /// Panics if `prefix` is empty, which `new` never produces.
    pub(crate) fn total_len(&self) -> usize {
        self.prefix
            .last()
            .copied()
            .expect("SwapInner::prefix invariant: at least [0]")
    }
}
50
51pub(crate) fn pageout_swap(chunks: &mut [Vec<u64>]) -> Handle {
52 let mut taken: Vec<Vec<u64>> = Vec::with_capacity(chunks.len());
53 for c in chunks.iter_mut() {
54 taken.push(std::mem::take(c));
55 }
56 for c in &taken {
57 madvise_cold(c);
58 }
59 Handle::from_swap(SwapInner::new(taken))
60}
61
62pub fn advise_pageout(bytes: &[u8]) {
77 madvise_pageout(bytes);
78}
79
80#[cfg(target_os = "linux")]
81fn madvise_cold(chunk: &[u64]) {
82 let Some(len_bytes) = chunk.len().checked_mul(std::mem::size_of::<u64>()) else {
86 return;
87 };
88 unsafe { madvise_aligned(chunk.as_ptr().cast::<u8>(), len_bytes, libc::MADV_COLD) }
90}
91
92#[cfg(target_os = "linux")]
93fn madvise_pageout(bytes: &[u8]) {
94 unsafe { madvise_aligned(bytes.as_ptr(), bytes.len(), libc::MADV_PAGEOUT) }
96}
97
98#[cfg(target_os = "linux")]
112unsafe fn madvise_aligned(base_ptr: *const u8, len_bytes: usize, advice: libc::c_int) {
113 if len_bytes == 0 {
114 return;
115 }
116 let page = page_size();
117 let base_addr = base_ptr.addr();
118 let Some(start_unaligned) = base_addr.checked_add(page - 1) else {
122 return;
123 };
124 let Some(end_unaligned) = base_addr.checked_add(len_bytes) else {
125 return;
126 };
127 let aligned_start_addr = start_unaligned & !(page - 1);
128 let aligned_end_addr = end_unaligned & !(page - 1);
129 if aligned_end_addr <= aligned_start_addr {
130 return;
131 }
132 let aligned_len = aligned_end_addr - aligned_start_addr;
133 let aligned_ptr = unsafe { base_ptr.byte_add(aligned_start_addr - base_addr) }
139 .cast::<libc::c_void>()
140 .cast_mut();
141 unsafe {
146 libc::madvise(aligned_ptr, aligned_len, advice);
147 }
148}
149
/// Non-Linux targets: the cold-page hint is not issued, so this is a no-op.
#[cfg(not(target_os = "linux"))]
fn madvise_cold(_chunk: &[u64]) {
    // Intentionally empty.
}
152
/// Non-Linux targets: the page-out hint is not issued, so this is a no-op.
#[cfg(not(target_os = "linux"))]
fn madvise_pageout(_bytes: &[u8]) {
    // Intentionally empty.
}
155
156#[cfg(target_os = "linux")]
157fn page_size() -> usize {
158 let raw = unsafe { libc::sysconf(libc::_SC_PAGESIZE) };
160 usize::try_from(raw).expect("page size is positive and fits usize")
161}
162
163pub(crate) fn read_at_swap(handle: &Handle, ranges: &[(usize, usize)], dst: &mut Vec<u64>) {
164 let inner = handle
165 .swap_inner()
166 .expect("read_at_swap called on non-swap handle");
167 let total = inner.total_len();
168 let total_out: usize = ranges.iter().map(|(_, l)| *l).sum();
169 dst.reserve(total_out);
170 for &(off, len) in ranges {
171 let end = off.checked_add(len).expect("range offset+len overflow");
172 assert!(
173 end <= total,
174 "read range out of bounds: {off}+{len} > {total}"
175 );
176 copy_range(inner, off, len, dst);
177 }
178}
179
180fn copy_range(inner: &SwapInner, off: usize, len: usize, dst: &mut Vec<u64>) {
181 if len == 0 {
182 return;
183 }
184 let mut remaining = len;
185 let mut cur = off;
186 let mut idx = match inner.prefix.binary_search(&cur) {
187 Ok(i) => i,
188 Err(i) => i.saturating_sub(1),
189 };
190 while remaining > 0 {
191 let chunk_start = inner.prefix[idx];
192 let chunk = &inner.chunks[idx];
193 let local = cur - chunk_start;
194 let take = std::cmp::min(remaining, chunk.len() - local);
195 dst.extend_from_slice(&chunk[local..local + take]);
196 cur += take;
197 remaining -= take;
198 idx += 1;
199 }
200}
201
202pub(crate) fn take_swap(handle: Handle, dst: &mut Vec<u64>) {
203 let inner = match handle.into_swap_inner() {
204 Some(s) => s,
205 None => panic!("take_swap called on non-swap handle"),
206 };
207 dst.clear();
208 let mut chunks = inner.chunks;
209 if chunks.len() == 1 && dst.capacity() == 0 {
210 let only = chunks.pop().unwrap();
211 *dst = only;
212 return;
213 }
214 let total: usize = chunks.iter().map(|c| c.len()).sum();
215 dst.reserve(total);
216 for c in chunks {
217 dst.extend_from_slice(&c);
218 }
219}
220
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pager::Handle;

    /// Pages out `chunks` and returns the resulting swap handle.
    fn swap_handle(mut chunks: Vec<Vec<u64>>) -> Handle {
        pageout_swap(&mut chunks)
    }

    /// Runs `read_at_swap` into a fresh vector and returns what was read.
    fn read(handle: &Handle, ranges: &[(usize, usize)]) -> Vec<u64> {
        let mut out = Vec::new();
        read_at_swap(handle, ranges, &mut out);
        out
    }

    #[mz_ore::test]
    fn pageout_takes_chunks_and_records_lengths() {
        let mut chunks = [vec![1u64, 2, 3], vec![4u64, 5]];
        let handle: Handle = pageout_swap(&mut chunks);
        assert_eq!(handle.len(), 5);
        // The caller's slots are left behind as empty vectors.
        assert!(chunks[0].is_empty());
        assert!(chunks[1].is_empty());
    }

    #[mz_ore::test]
    fn read_at_within_single_chunk() {
        let handle = swap_handle(vec![vec![10u64, 11, 12, 13, 14]]);
        assert_eq!(read(&handle, &[(1, 3)]), vec![11, 12, 13]);
    }

    #[mz_ore::test]
    fn read_at_spans_chunks() {
        let handle = swap_handle(vec![vec![1u64, 2, 3], vec![4, 5, 6]]);
        assert_eq!(read(&handle, &[(2, 3)]), vec![3, 4, 5]);
    }

    #[mz_ore::test]
    fn read_at_many_concats() {
        let handle = swap_handle(vec![vec![1u64, 2, 3, 4, 5]]);
        assert_eq!(read(&handle, &[(0, 2), (3, 2)]), vec![1, 2, 4, 5]);
    }

    #[mz_ore::test]
    #[should_panic(expected = "out of bounds")]
    fn read_at_panics_on_oob() {
        let handle = swap_handle(vec![vec![1u64, 2]]);
        let mut out = Vec::new();
        read_at_swap(&handle, &[(1, 5)], &mut out);
    }

    #[mz_ore::test]
    #[cfg_attr(miri, ignore)]
    fn take_single_chunk_zero_copy() {
        let data = vec![100u64; 1024];
        let original_ptr = data.as_ptr();
        let mut chunks = [data];
        let handle = pageout_swap(&mut chunks);
        let mut out = Vec::new();
        take_swap(handle, &mut out);
        assert_eq!(out.len(), 1024);
        assert_eq!(
            out.as_ptr(),
            original_ptr,
            "single-chunk take should be zero-copy"
        );
    }

    #[mz_ore::test]
    fn take_multi_chunk_concats() {
        let handle = swap_handle(vec![vec![1u64, 2], vec![3, 4, 5]]);
        let mut out = Vec::new();
        take_swap(handle, &mut out);
        assert_eq!(out, vec![1, 2, 3, 4, 5]);
    }

    #[mz_ore::test]
    #[cfg_attr(miri, ignore)]
    fn advise_pageout_leaves_bytes_readable() {
        let pattern = |i: usize| u8::try_from(i % 251).expect("0..251 fits in u8");
        let bytes: Vec<u8> = (0..64 * 1024).map(pattern).collect();
        advise_pageout(&bytes);
        // The hint must not change what a subsequent read observes.
        let intact = bytes
            .iter()
            .enumerate()
            .all(|(i, &byte)| byte == pattern(i));
        assert!(intact);
    }

    #[mz_ore::test]
    fn advise_pageout_empty_and_subpage_are_noops() {
        let empty: &[u8] = &[];
        let sub_page: &[u8] = &[1, 2, 3, 4];
        advise_pageout(empty);
        advise_pageout(sub_page);
    }
}