moka/common/frequency_sketch.rs
1// License and Copyright Notice:
2//
3// Some of the code and doc comments in this module were ported or copied from
4// a Java class `com.github.benmanes.caffeine.cache.FrequencySketch` of Caffeine.
5// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/main/java/com/github/benmanes/caffeine/cache/FrequencySketch.java
6//
7// The original code/comments from Caffeine are licensed under the Apache License,
8// Version 2.0 <https://github.com/ben-manes/caffeine/blob/master/LICENSE>
9//
10// Copyrights of the original code/comments are retained by their contributors.
11// For full authorship information, see the version control history of
12// https://github.com/ben-manes/caffeine/
13
14/// A probabilistic multi-set for estimating the popularity of an element within
15/// a time window. The maximum frequency of an element is limited to 15 (4-bits)
16/// and an aging process periodically halves the popularity of all elements.
17#[derive(Default)]
18pub(crate) struct FrequencySketch {
19 sample_size: u32,
20 table_mask: u64,
21 table: Box<[u64]>,
22 size: u32,
23}
24
25// A mixture of seeds from FNV-1a, CityHash, and Murmur3. (Taken from Caffeine)
26static SEED: [u64; 4] = [
27 0xc3a5_c85c_97cb_3127,
28 0xb492_b66f_be98_f273,
29 0x9ae1_6a3b_2f90_404f,
30 0xcbf2_9ce4_8422_2325,
31];
32
33static RESET_MASK: u64 = 0x7777_7777_7777_7777;
34
35static ONE_MASK: u64 = 0x1111_1111_1111_1111;
36
37// -------------------------------------------------------------------------------
38// Some of the code and doc comments in this module were ported or copied from
39// a Java class `com.github.benmanes.caffeine.cache.FrequencySketch` of Caffeine.
40// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/main/java/com/github/benmanes/caffeine/cache/FrequencySketch.java
41// -------------------------------------------------------------------------------
42//
43// FrequencySketch maintains a 4-bit CountMinSketch [1] with periodic aging to
44// provide the popularity history for the TinyLfu admission policy [2].
45// The time and space efficiency of the sketch allows it to cheaply estimate the
46// frequency of an entry in a stream of cache access events.
47//
48// The counter matrix is represented as a single dimensional array holding 16
49// counters per slot. A fixed depth of four balances the accuracy and cost,
50// resulting in a width of four times the length of the array. To retain an
51// accurate estimation the array's length equals the maximum number of entries
52// in the cache, increased to the closest power-of-two to exploit more efficient
53// bit masking. This configuration results in a confidence of 93.75% and error
54// bound of e / width.
55//
56// The frequency of all entries is aged periodically using a sampling window
57// based on the maximum number of entries in the cache. This is referred to as
58// the reset operation by TinyLfu and keeps the sketch fresh by dividing all
59// counters by two and subtracting based on the number of odd counters
60// found. The O(n) cost of aging is amortized, ideal for hardware pre-fetching,
61// and uses inexpensive bit manipulations per array location.
62//
63// [1] An Improved Data Stream Summary: The Count-Min Sketch and its Applications
64// http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf
65// [2] TinyLFU: A Highly Efficient Cache Admission Policy
66// https://dl.acm.org/citation.cfm?id=3149371
67//
68// -------------------------------------------------------------------------------
69
70impl FrequencySketch {
71 /// Initializes and increases the capacity of this `FrequencySketch` instance,
72 /// if necessary, to ensure that it can accurately estimate the popularity of
73 /// elements given the maximum size of the cache. This operation forgets all
74 /// previous counts when resizing.
75 pub(crate) fn ensure_capacity(&mut self, cap: u32) {
76 // The max byte size of the table, Box<[u64; table_size]>
77 //
78 // | Pointer width | Max size |
79 // |:-----------------|---------:|
80 // | 16 bit | 8 KiB |
81 // | 32 bit | 128 MiB |
82 // | 64 bit or bigger | 8 GiB |
83
84 let maximum = if cfg!(target_pointer_width = "16") {
85 cap.min(1024)
86 } else if cfg!(target_pointer_width = "32") {
87 cap.min(2u32.pow(24)) // about 16 millions
88 } else {
89 // Same to Caffeine's limit:
90 // `Integer.MAX_VALUE >>> 1` with `ceilingPowerOfTwo()` applied.
91 cap.min(2u32.pow(30)) // about 1 billion
92 };
93 let table_size = if maximum == 0 {
94 1
95 } else {
96 maximum.next_power_of_two()
97 };
98
99 if self.table.len() as u32 >= table_size {
100 return;
101 }
102
103 self.table = vec![0; table_size as usize].into_boxed_slice();
104 self.table_mask = table_size.saturating_sub(1) as u64;
105 self.sample_size = if cap == 0 {
106 10
107 } else {
108 maximum.saturating_mul(10).min(i32::MAX as u32)
109 };
110 }
111
112 /// Takes the hash value of an element, and returns the estimated number of
113 /// occurrences of the element, up to the maximum (15).
114 pub(crate) fn frequency(&self, hash: u64) -> u8 {
115 if self.table.is_empty() {
116 return 0;
117 }
118
119 let start = ((hash & 3) << 2) as u8;
120 let mut frequency = u8::MAX;
121 for i in 0..4 {
122 let index = self.index_of(hash, i);
123 let count = (self.table[index] >> ((start + i) << 2) & 0xF) as u8;
124 frequency = frequency.min(count);
125 }
126 frequency
127 }
128
129 /// Take a hash value of an element and increments the popularity of the
130 /// element if it does not exceed the maximum (15). The popularity of all
131 /// elements will be periodically down sampled when the observed events
132 /// exceeds a threshold. This process provides a frequency aging to allow
133 /// expired long term entries to fade away.
134 pub(crate) fn increment(&mut self, hash: u64) {
135 if self.table.is_empty() {
136 return;
137 }
138
139 let start = ((hash & 3) << 2) as u8;
140 let mut added = false;
141 for i in 0..4 {
142 let index = self.index_of(hash, i);
143 added |= self.increment_at(index, start + i);
144 }
145
146 if added {
147 self.size += 1;
148 if self.size >= self.sample_size {
149 self.reset();
150 }
151 }
152 }
153
154 /// Takes a table index (each entry has 16 counters) and counter index, and
155 /// increments the counter by 1 if it is not already at the maximum value
156 /// (15). Returns `true` if incremented.
157 fn increment_at(&mut self, table_index: usize, counter_index: u8) -> bool {
158 let offset = (counter_index as usize) << 2;
159 let mask = 0xF_u64 << offset;
160 if self.table[table_index] & mask != mask {
161 self.table[table_index] += 1u64 << offset;
162 true
163 } else {
164 false
165 }
166 }
167
168 /// Reduces every counter by half of its original value.
169 fn reset(&mut self) {
170 let mut count = 0u32;
171 for entry in self.table.iter_mut() {
172 // Count number of odd numbers.
173 count += (*entry & ONE_MASK).count_ones();
174 *entry = (*entry >> 1) & RESET_MASK;
175 }
176 self.size = (self.size >> 1) - (count >> 2);
177 }
178
179 /// Returns the table index for the counter at the specified depth.
180 fn index_of(&self, hash: u64, depth: u8) -> usize {
181 let i = depth as usize;
182 let mut hash = hash.wrapping_add(SEED[i]).wrapping_mul(SEED[i]);
183 hash = hash.wrapping_add(hash >> 32);
184 (hash & self.table_mask) as usize
185 }
186
187 #[cfg(feature = "unstable-debug-counters")]
188 pub(crate) fn table_size(&self) -> u64 {
189 (self.table.len() * std::mem::size_of::<u64>()) as u64
190 }
191}
192
193// Methods only available for testing.
194#[cfg(test)]
195impl FrequencySketch {
196 pub(crate) fn table_len(&self) -> usize {
197 self.table.len()
198 }
199}
200
201// Some test cases were ported from Caffeine at:
202// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/test/java/com/github/benmanes/caffeine/cache/FrequencySketchTest.java
203//
204// To see the debug prints, run test as `cargo test -- --nocapture`
205#[cfg(test)]
206mod tests {
207 use super::FrequencySketch;
208 use once_cell::sync::Lazy;
209 use std::hash::{BuildHasher, Hash, Hasher};
210
211 static ITEM: Lazy<u32> = Lazy::new(|| {
212 let mut buf = [0; 4];
213 getrandom::getrandom(&mut buf).unwrap();
214 unsafe { std::mem::transmute::<[u8; 4], u32>(buf) }
215 });
216
217 // This test was ported from Caffeine.
218 #[test]
219 fn increment_once() {
220 let mut sketch = FrequencySketch::default();
221 sketch.ensure_capacity(512);
222 let hasher = hasher();
223 let item_hash = hasher(*ITEM);
224 sketch.increment(item_hash);
225 assert_eq!(sketch.frequency(item_hash), 1);
226 }
227
228 // This test was ported from Caffeine.
229 #[test]
230 fn increment_max() {
231 let mut sketch = FrequencySketch::default();
232 sketch.ensure_capacity(512);
233 let hasher = hasher();
234 let item_hash = hasher(*ITEM);
235 for _ in 0..20 {
236 sketch.increment(item_hash);
237 }
238 assert_eq!(sketch.frequency(item_hash), 15);
239 }
240
241 // This test was ported from Caffeine.
242 #[test]
243 fn increment_distinct() {
244 let mut sketch = FrequencySketch::default();
245 sketch.ensure_capacity(512);
246 let hasher = hasher();
247 sketch.increment(hasher(*ITEM));
248 sketch.increment(hasher(ITEM.wrapping_add(1)));
249 assert_eq!(sketch.frequency(hasher(*ITEM)), 1);
250 assert_eq!(sketch.frequency(hasher(ITEM.wrapping_add(1))), 1);
251 assert_eq!(sketch.frequency(hasher(ITEM.wrapping_add(2))), 0);
252 }
253
254 // This test was ported from Caffeine.
255 #[test]
256 fn index_of_around_zero() {
257 let mut sketch = FrequencySketch::default();
258 sketch.ensure_capacity(512);
259 let mut indexes = std::collections::HashSet::new();
260 let hashes = [u64::MAX, 0, 1];
261 for hash in hashes.iter() {
262 for depth in 0..4 {
263 indexes.insert(sketch.index_of(*hash, depth));
264 }
265 }
266 assert_eq!(indexes.len(), 4 * hashes.len())
267 }
268
269 // This test was ported from Caffeine.
270 #[test]
271 fn reset() {
272 let mut reset = false;
273 let mut sketch = FrequencySketch::default();
274 sketch.ensure_capacity(64);
275 let hasher = hasher();
276
277 for i in 1..(20 * sketch.table.len() as u32) {
278 sketch.increment(hasher(i));
279 if sketch.size != i {
280 reset = true;
281 break;
282 }
283 }
284
285 assert!(reset);
286 assert!(sketch.size <= sketch.sample_size / 2);
287 }
288
289 // This test was ported from Caffeine.
290 #[test]
291 fn heavy_hitters() {
292 let mut sketch = FrequencySketch::default();
293 sketch.ensure_capacity(65_536);
294 let hasher = hasher();
295
296 for i in 100..100_000 {
297 sketch.increment(hasher(i));
298 }
299
300 for i in (0..10).step_by(2) {
301 for _ in 0..i {
302 sketch.increment(hasher(i));
303 }
304 }
305
306 // A perfect popularity count yields an array [0, 0, 2, 0, 4, 0, 6, 0, 8, 0]
307 let popularity = (0..10)
308 .map(|i| sketch.frequency(hasher(i)))
309 .collect::<Vec<_>>();
310
311 for (i, freq) in popularity.iter().enumerate() {
312 match i {
313 2 => assert!(freq <= &popularity[4]),
314 4 => assert!(freq <= &popularity[6]),
315 6 => assert!(freq <= &popularity[8]),
316 8 => (),
317 _ => assert!(freq <= &popularity[2]),
318 }
319 }
320 }
321
322 fn hasher<K: Hash>() -> impl Fn(K) -> u64 {
323 let build_hasher = std::collections::hash_map::RandomState::default();
324 move |key| {
325 let mut hasher = build_hasher.build_hasher();
326 key.hash(&mut hasher);
327 hasher.finish()
328 }
329 }
330}
331
332// Verify that some properties hold such as no panic occurs on any possible inputs.
333#[cfg(kani)]
334mod kani {
335 use super::FrequencySketch;
336
337 const CAPACITIES: &[u32] = &[
338 0,
339 1,
340 1024,
341 1025,
342 2u32.pow(24),
343 2u32.pow(24) + 1,
344 2u32.pow(30),
345 2u32.pow(30) + 1,
346 u32::MAX,
347 ];
348
349 #[kani::proof]
350 fn verify_ensure_capacity() {
351 // Check for arbitrary capacities.
352 let capacity = kani::any();
353 let mut sketch = FrequencySketch::default();
354 sketch.ensure_capacity(capacity);
355 }
356
357 #[kani::proof]
358 fn verify_frequency() {
359 // Check for some selected capacities.
360 for capacity in CAPACITIES {
361 let mut sketch = FrequencySketch::default();
362 sketch.ensure_capacity(*capacity);
363
364 // Check for arbitrary hashes.
365 let hash = kani::any();
366 let frequency = sketch.frequency(hash);
367 assert!(frequency <= 15);
368 }
369 }
370
371 #[kani::proof]
372 fn verify_increment() {
373 // Only check for small capacities. Because Kani Rust Verifier is a model
374 // checking tool, it will take much longer time (exponential) to check larger
375 // capacities here.
376 for capacity in &[0, 1, 128] {
377 let mut sketch = FrequencySketch::default();
378 sketch.ensure_capacity(*capacity);
379
380 // Check for arbitrary hashes.
381 let hash = kani::any();
382 sketch.increment(hash);
383 }
384 }
385
386 #[kani::proof]
387 fn verify_index_of() {
388 // Check for arbitrary capacities.
389 let capacity = kani::any();
390 let mut sketch = FrequencySketch::default();
391 sketch.ensure_capacity(capacity);
392
393 // Check for arbitrary hashes.
394 let hash = kani::any();
395 for i in 0..4 {
396 let index = sketch.index_of(hash, i);
397 assert!(index < sketch.table.len());
398 }
399 }
400}