crc32c/
hw_x86_64.rs

1//! Implements crc32c with SSE 4.2 support.
2
3use crate::hw_tables;
4use crate::util::{self, U64Le};
5use std::arch::x86_64 as simd;
6
7/// Computes CRC-32C using the SSE 4.2 hardware instruction.
8pub unsafe fn crc32c(crci: u32, buffer: &[u8]) -> u32 {
9    let mut crc0 = u64::from(!crci);
10
11    let (begin, middle, end) = util::split(buffer);
12
13    // Leading bytes, up to the first one aligned on 8 bytes.
14    crc0 = crc_u8(crc0, begin);
15
16    // Most CPUs have a latency of 3 on these instructions,
17    // meaning we must use 3 of them at a time, to leverage
18    // hardware parallelism.
19
20    // First do chunks of size LONG * 3.
21    let chunk_size = (hw_tables::LONG * 3) / 8;
22    let last_chunk = middle.len() / chunk_size * chunk_size;
23
24    let (middle_first, middle_last) = middle.split_at(last_chunk);
25
26    crc0 = crc_u64_parallel3(crc0, chunk_size, &hw_tables::LONG_TABLE, middle_first);
27
28    // Now do chunks of size SHORT * 3.
29    let chunk_size = (hw_tables::SHORT * 3) / 8;
30    let last_chunk = middle_last.len() / chunk_size * chunk_size;
31
32    let (middle_last_first, middle_last_last) = middle_last.split_at(last_chunk);
33
34    crc0 = crc_u64_parallel3(crc0, chunk_size, &hw_tables::SHORT_TABLE, middle_last_first);
35
36    // Now the last part, less than SHORT * 3 but still a multiple of 8-bytes.
37    crc0 = crc_u64(crc0, middle_last_last);
38
39    // Final unaligned remainder.
40    crc0 = crc_u8(crc0, end);
41
42    !(crc0 as u32)
43}
44
45#[inline]
46#[target_feature(enable = "sse4.2")]
47unsafe fn crc_u8_append(crc: u64, next: u8) -> u64 {
48    u64::from(self::simd::_mm_crc32_u8(crc as u32, next))
49}
50
51#[inline]
52#[target_feature(enable = "sse4.2")]
53unsafe fn crc_u64_append(crc: u64, next: u64) -> u64 {
54    self::simd::_mm_crc32_u64(crc, next)
55}
56
57#[inline]
58unsafe fn crc_u8(crc: u64, buffer: &[u8]) -> u64 {
59    buffer
60        .iter()
61        .fold(crc, |crc, &next| crc_u8_append(crc, next))
62}
63
64#[inline]
65unsafe fn crc_u64(crc: u64, buffer: &[U64Le]) -> u64 {
66    buffer
67        .iter()
68        .fold(crc, |crc, &next| crc_u64_append(crc, next.get()))
69}
70
71/// Hardware-parallel version of the algorithm.
72///
73/// Calculates the CRC for a chunk of `chunk_size`,
74/// by dividing it in 3 separate blocks.
75///
76/// Uses a pre-made CRC table designed for the given chunk size.
77#[inline]
78unsafe fn crc_u64_parallel3(
79    crc: u64,
80    chunk_size: usize,
81    table: &hw_tables::CrcTable,
82    buffer: &[U64Le],
83) -> u64 {
84    buffer.chunks(chunk_size).fold(crc, |mut crc0, chunk| {
85        let mut crc1 = 0;
86        let mut crc2 = 0;
87
88        // Divide it in three.
89        let block_size = chunk_size / 3;
90
91        let mut blocks = chunk.chunks(block_size);
92        let a = blocks.next().unwrap();
93        let b = blocks.next().unwrap();
94        let c = blocks.next().unwrap();
95
96        for i in 0..block_size {
97            crc0 = crc_u64_append(crc0, a[i].get());
98            crc1 = crc_u64_append(crc1, b[i].get());
99            crc2 = crc_u64_append(crc2, c[i].get());
100        }
101
102        crc0 = table.shift_u64(crc0) ^ crc1;
103        crc0 = table.shift_u64(crc0) ^ crc2;
104
105        crc0
106    })
107}