moka/common/
frequency_sketch.rs

1// License and Copyright Notice:
2//
3// Some of the code and doc comments in this module were ported or copied from
4// a Java class `com.github.benmanes.caffeine.cache.FrequencySketch` of Caffeine.
5// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/main/java/com/github/benmanes/caffeine/cache/FrequencySketch.java
6//
7// The original code/comments from Caffeine are licensed under the Apache License,
8// Version 2.0 <https://github.com/ben-manes/caffeine/blob/master/LICENSE>
9//
10// Copyrights of the original code/comments are retained by their contributors.
11// For full authorship information, see the version control history of
12// https://github.com/ben-manes/caffeine/
13
14/// A probabilistic multi-set for estimating the popularity of an element within
15/// a time window. The maximum frequency of an element is limited to 15 (4-bits)
16/// and an aging process periodically halves the popularity of all elements.
17#[derive(Default)]
18pub(crate) struct FrequencySketch {
19    sample_size: u32,
20    table_mask: u64,
21    table: Box<[u64]>,
22    size: u32,
23}
24
25// A mixture of seeds from FNV-1a, CityHash, and Murmur3. (Taken from Caffeine)
26static SEED: [u64; 4] = [
27    0xc3a5_c85c_97cb_3127,
28    0xb492_b66f_be98_f273,
29    0x9ae1_6a3b_2f90_404f,
30    0xcbf2_9ce4_8422_2325,
31];
32
33static RESET_MASK: u64 = 0x7777_7777_7777_7777;
34
35static ONE_MASK: u64 = 0x1111_1111_1111_1111;
36
37// -------------------------------------------------------------------------------
38// Some of the code and doc comments in this module were ported or copied from
39// a Java class `com.github.benmanes.caffeine.cache.FrequencySketch` of Caffeine.
40// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/main/java/com/github/benmanes/caffeine/cache/FrequencySketch.java
41// -------------------------------------------------------------------------------
42//
43// FrequencySketch maintains a 4-bit CountMinSketch [1] with periodic aging to
44// provide the popularity history for the TinyLfu admission policy [2].
45// The time and space efficiency of the sketch allows it to cheaply estimate the
46// frequency of an entry in a stream of cache access events.
47//
48// The counter matrix is represented as a single dimensional array holding 16
49// counters per slot. A fixed depth of four balances the accuracy and cost,
50// resulting in a width of four times the length of the array. To retain an
51// accurate estimation the array's length equals the maximum number of entries
52// in the cache, increased to the closest power-of-two to exploit more efficient
53// bit masking. This configuration results in a confidence of 93.75% and error
54// bound of e / width.
55//
56// The frequency of all entries is aged periodically using a sampling window
57// based on the maximum number of entries in the cache. This is referred to as
58// the reset operation by TinyLfu and keeps the sketch fresh by dividing all
59// counters by two and subtracting based on the number of odd counters
60// found. The O(n) cost of aging is amortized, ideal for hardware pre-fetching,
61// and uses inexpensive bit manipulations per array location.
62//
63// [1] An Improved Data Stream Summary: The Count-Min Sketch and its Applications
64//     http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf
65// [2] TinyLFU: A Highly Efficient Cache Admission Policy
66//     https://dl.acm.org/citation.cfm?id=3149371
67//
68// -------------------------------------------------------------------------------
69
70impl FrequencySketch {
71    /// Initializes and increases the capacity of this `FrequencySketch` instance,
72    /// if necessary, to ensure that it can accurately estimate the popularity of
73    /// elements given the maximum size of the cache. This operation forgets all
74    /// previous counts when resizing.
75    pub(crate) fn ensure_capacity(&mut self, cap: u32) {
76        // The max byte size of the table, Box<[u64; table_size]>
77        //
78        // | Pointer width    | Max size |
79        // |:-----------------|---------:|
80        // | 16 bit           |    8 KiB |
81        // | 32 bit           |  128 MiB |
82        // | 64 bit or bigger |    8 GiB |
83
84        let maximum = if cfg!(target_pointer_width = "16") {
85            cap.min(1024)
86        } else if cfg!(target_pointer_width = "32") {
87            cap.min(2u32.pow(24)) // about 16 millions
88        } else {
89            // Same to Caffeine's limit:
90            //   `Integer.MAX_VALUE >>> 1` with `ceilingPowerOfTwo()` applied.
91            cap.min(2u32.pow(30)) // about 1 billion
92        };
93        let table_size = if maximum == 0 {
94            1
95        } else {
96            maximum.next_power_of_two()
97        };
98
99        if self.table.len() as u32 >= table_size {
100            return;
101        }
102
103        self.table = vec![0; table_size as usize].into_boxed_slice();
104        self.table_mask = table_size.saturating_sub(1) as u64;
105        self.sample_size = if cap == 0 {
106            10
107        } else {
108            maximum.saturating_mul(10).min(i32::MAX as u32)
109        };
110    }
111
112    /// Takes the hash value of an element, and returns the estimated number of
113    /// occurrences of the element, up to the maximum (15).
114    pub(crate) fn frequency(&self, hash: u64) -> u8 {
115        if self.table.is_empty() {
116            return 0;
117        }
118
119        let start = ((hash & 3) << 2) as u8;
120        let mut frequency = u8::MAX;
121        for i in 0..4 {
122            let index = self.index_of(hash, i);
123            let count = (self.table[index] >> ((start + i) << 2) & 0xF) as u8;
124            frequency = frequency.min(count);
125        }
126        frequency
127    }
128
129    /// Take a hash value of an element and increments the popularity of the
130    /// element if it does not exceed the maximum (15). The popularity of all
131    /// elements will be periodically down sampled when the observed events
132    /// exceeds a threshold. This process provides a frequency aging to allow
133    /// expired long term entries to fade away.
134    pub(crate) fn increment(&mut self, hash: u64) {
135        if self.table.is_empty() {
136            return;
137        }
138
139        let start = ((hash & 3) << 2) as u8;
140        let mut added = false;
141        for i in 0..4 {
142            let index = self.index_of(hash, i);
143            added |= self.increment_at(index, start + i);
144        }
145
146        if added {
147            self.size += 1;
148            if self.size >= self.sample_size {
149                self.reset();
150            }
151        }
152    }
153
154    /// Takes a table index (each entry has 16 counters) and counter index, and
155    /// increments the counter by 1 if it is not already at the maximum value
156    /// (15). Returns `true` if incremented.
157    fn increment_at(&mut self, table_index: usize, counter_index: u8) -> bool {
158        let offset = (counter_index as usize) << 2;
159        let mask = 0xF_u64 << offset;
160        if self.table[table_index] & mask != mask {
161            self.table[table_index] += 1u64 << offset;
162            true
163        } else {
164            false
165        }
166    }
167
168    /// Reduces every counter by half of its original value.
169    fn reset(&mut self) {
170        let mut count = 0u32;
171        for entry in self.table.iter_mut() {
172            // Count number of odd numbers.
173            count += (*entry & ONE_MASK).count_ones();
174            *entry = (*entry >> 1) & RESET_MASK;
175        }
176        self.size = (self.size >> 1) - (count >> 2);
177    }
178
179    /// Returns the table index for the counter at the specified depth.
180    fn index_of(&self, hash: u64, depth: u8) -> usize {
181        let i = depth as usize;
182        let mut hash = hash.wrapping_add(SEED[i]).wrapping_mul(SEED[i]);
183        hash = hash.wrapping_add(hash >> 32);
184        (hash & self.table_mask) as usize
185    }
186
187    #[cfg(feature = "unstable-debug-counters")]
188    pub(crate) fn table_size(&self) -> u64 {
189        (self.table.len() * std::mem::size_of::<u64>()) as u64
190    }
191}
192
193// Methods only available for testing.
194#[cfg(test)]
195impl FrequencySketch {
196    pub(crate) fn table_len(&self) -> usize {
197        self.table.len()
198    }
199}
200
201// Some test cases were ported from Caffeine at:
202// https://github.com/ben-manes/caffeine/blob/master/caffeine/src/test/java/com/github/benmanes/caffeine/cache/FrequencySketchTest.java
203//
204// To see the debug prints, run test as `cargo test -- --nocapture`
205#[cfg(test)]
206mod tests {
207    use super::FrequencySketch;
208    use once_cell::sync::Lazy;
209    use std::hash::{BuildHasher, Hash, Hasher};
210
211    static ITEM: Lazy<u32> = Lazy::new(|| {
212        let mut buf = [0; 4];
213        getrandom::getrandom(&mut buf).unwrap();
214        unsafe { std::mem::transmute::<[u8; 4], u32>(buf) }
215    });
216
217    // This test was ported from Caffeine.
218    #[test]
219    fn increment_once() {
220        let mut sketch = FrequencySketch::default();
221        sketch.ensure_capacity(512);
222        let hasher = hasher();
223        let item_hash = hasher(*ITEM);
224        sketch.increment(item_hash);
225        assert_eq!(sketch.frequency(item_hash), 1);
226    }
227
228    // This test was ported from Caffeine.
229    #[test]
230    fn increment_max() {
231        let mut sketch = FrequencySketch::default();
232        sketch.ensure_capacity(512);
233        let hasher = hasher();
234        let item_hash = hasher(*ITEM);
235        for _ in 0..20 {
236            sketch.increment(item_hash);
237        }
238        assert_eq!(sketch.frequency(item_hash), 15);
239    }
240
241    // This test was ported from Caffeine.
242    #[test]
243    fn increment_distinct() {
244        let mut sketch = FrequencySketch::default();
245        sketch.ensure_capacity(512);
246        let hasher = hasher();
247        sketch.increment(hasher(*ITEM));
248        sketch.increment(hasher(ITEM.wrapping_add(1)));
249        assert_eq!(sketch.frequency(hasher(*ITEM)), 1);
250        assert_eq!(sketch.frequency(hasher(ITEM.wrapping_add(1))), 1);
251        assert_eq!(sketch.frequency(hasher(ITEM.wrapping_add(2))), 0);
252    }
253
254    // This test was ported from Caffeine.
255    #[test]
256    fn index_of_around_zero() {
257        let mut sketch = FrequencySketch::default();
258        sketch.ensure_capacity(512);
259        let mut indexes = std::collections::HashSet::new();
260        let hashes = [u64::MAX, 0, 1];
261        for hash in hashes.iter() {
262            for depth in 0..4 {
263                indexes.insert(sketch.index_of(*hash, depth));
264            }
265        }
266        assert_eq!(indexes.len(), 4 * hashes.len())
267    }
268
269    // This test was ported from Caffeine.
270    #[test]
271    fn reset() {
272        let mut reset = false;
273        let mut sketch = FrequencySketch::default();
274        sketch.ensure_capacity(64);
275        let hasher = hasher();
276
277        for i in 1..(20 * sketch.table.len() as u32) {
278            sketch.increment(hasher(i));
279            if sketch.size != i {
280                reset = true;
281                break;
282            }
283        }
284
285        assert!(reset);
286        assert!(sketch.size <= sketch.sample_size / 2);
287    }
288
289    // This test was ported from Caffeine.
290    #[test]
291    fn heavy_hitters() {
292        let mut sketch = FrequencySketch::default();
293        sketch.ensure_capacity(65_536);
294        let hasher = hasher();
295
296        for i in 100..100_000 {
297            sketch.increment(hasher(i));
298        }
299
300        for i in (0..10).step_by(2) {
301            for _ in 0..i {
302                sketch.increment(hasher(i));
303            }
304        }
305
306        // A perfect popularity count yields an array [0, 0, 2, 0, 4, 0, 6, 0, 8, 0]
307        let popularity = (0..10)
308            .map(|i| sketch.frequency(hasher(i)))
309            .collect::<Vec<_>>();
310
311        for (i, freq) in popularity.iter().enumerate() {
312            match i {
313                2 => assert!(freq <= &popularity[4]),
314                4 => assert!(freq <= &popularity[6]),
315                6 => assert!(freq <= &popularity[8]),
316                8 => (),
317                _ => assert!(freq <= &popularity[2]),
318            }
319        }
320    }
321
322    fn hasher<K: Hash>() -> impl Fn(K) -> u64 {
323        let build_hasher = std::collections::hash_map::RandomState::default();
324        move |key| {
325            let mut hasher = build_hasher.build_hasher();
326            key.hash(&mut hasher);
327            hasher.finish()
328        }
329    }
330}
331
332// Verify that some properties hold such as no panic occurs on any possible inputs.
333#[cfg(kani)]
334mod kani {
335    use super::FrequencySketch;
336
337    const CAPACITIES: &[u32] = &[
338        0,
339        1,
340        1024,
341        1025,
342        2u32.pow(24),
343        2u32.pow(24) + 1,
344        2u32.pow(30),
345        2u32.pow(30) + 1,
346        u32::MAX,
347    ];
348
349    #[kani::proof]
350    fn verify_ensure_capacity() {
351        // Check for arbitrary capacities.
352        let capacity = kani::any();
353        let mut sketch = FrequencySketch::default();
354        sketch.ensure_capacity(capacity);
355    }
356
357    #[kani::proof]
358    fn verify_frequency() {
359        // Check for some selected capacities.
360        for capacity in CAPACITIES {
361            let mut sketch = FrequencySketch::default();
362            sketch.ensure_capacity(*capacity);
363
364            // Check for arbitrary hashes.
365            let hash = kani::any();
366            let frequency = sketch.frequency(hash);
367            assert!(frequency <= 15);
368        }
369    }
370
371    #[kani::proof]
372    fn verify_increment() {
373        // Only check for small capacities. Because Kani Rust Verifier is a model
374        // checking tool, it will take much longer time (exponential) to check larger
375        // capacities here.
376        for capacity in &[0, 1, 128] {
377            let mut sketch = FrequencySketch::default();
378            sketch.ensure_capacity(*capacity);
379
380            // Check for arbitrary hashes.
381            let hash = kani::any();
382            sketch.increment(hash);
383        }
384    }
385
386    #[kani::proof]
387    fn verify_index_of() {
388        // Check for arbitrary capacities.
389        let capacity = kani::any();
390        let mut sketch = FrequencySketch::default();
391        sketch.ensure_capacity(capacity);
392
393        // Check for arbitrary hashes.
394        let hash = kani::any();
395        for i in 0..4 {
396            let index = sketch.index_of(hash, i);
397            assert!(index < sketch.table.len());
398        }
399    }
400}