1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
// Copyright Materialize, Inc. and contributors. All rights reserved.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0.

use std::collections::BTreeSet;

use mz_repr::{ColumnName, ColumnType, Datum, RelationDesc, Row};

/// Turns a [`Row`] into a byte buffer and provides a hash over that buffer.
pub trait Encode {
    /// Consumes `row` and returns its encoded representation as bytes.
    ///
    /// NOTE(review): the `_unchecked` suffix presumably means the row is not
    /// validated against the encoder's expected shape before encoding —
    /// confirm against the implementors.
    fn encode_unchecked(&self, row: Row) -> Vec<u8>;

    /// Given the output of a call to [`Encode::encode_unchecked`], returns
    /// a hash that is suitable for stable partitioning.
    ///
    /// This is a provided method: unless an implementor overrides it, the
    /// hash is computed by passing `buf` to `seahash::hash`.
    fn hash(&self, buf: &[u8]) -> u64 {
        // We use seahash as it outperforms pretty much all other options, and
        // has great mathematically proven statistical properties. It truly is a
        // remarkable non-cryptographic hash. More details can be found here:
        // https://docs.rs/seahash/latest/seahash/
        seahash::hash(buf)
    }
}

/// Bundled information sufficient to encode Datums.
///
/// Holds a datum together with a borrowed reference to its column type; both
/// are tied to the same lifetime `'a`.
#[derive(Debug)]
pub struct TypedDatum<'a> {
    /// The datum to be encoded.
    pub datum: Datum<'a>,
    /// The column type paired with `datum`.
    pub typ: &'a ColumnType,
}

impl<'a> TypedDatum<'a> {
    /// Pairs a datum and its type, for encoding.
    pub fn new(datum: Datum<'a>, typ: &'a ColumnType) -> Self {
        Self { datum, typ }
    }
}

/// Returns the columns of `desc` as `(name, type)` pairs with unique names.
///
/// Columns keep the order in which `desc` yields them. A column whose name
/// has not been claimed by an earlier column keeps that name unchanged. A
/// column whose name is already taken is renamed to its original name
/// followed by the first counter value (starting at 1) that produces an
/// unclaimed name. When the original name ends in an ASCII digit, a `_` is
/// placed between the name and the counter (`a1` becomes `a1_1`, not `a11`).
pub fn column_names_and_types(desc: RelationDesc) -> Vec<(ColumnName, ColumnType)> {
    let mut columns: Vec<(ColumnName, ColumnType)> = desc.into_iter().collect();

    // Names already claimed by the columns processed so far, including any
    // names generated while resolving earlier collisions.
    let mut taken = BTreeSet::new();
    for (name, _) in columns.iter_mut() {
        // The original name is the stem every candidate is built from. Both
        // its length and whether it ends in a digit stay fixed while we
        // search, so compute them once up front.
        let base_len = name.as_str().len();
        let needs_separator =
            matches!(name.as_str().chars().next_back(), Some(c) if c.is_ascii_digit());

        let mut counter = 1;
        while taken.contains(name) {
            // Drop the previous candidate's suffix and try the next counter.
            let text = name.as_mut_str();
            text.truncate(base_len);
            if needs_separator {
                text.push('_');
            }
            text.push_str(&counter.to_string());
            counter += 1;
        }
        taken.insert(name);
    }
    columns
}