mz_interchange/
encode.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use std::collections::BTreeSet;
11
12use mz_repr::{ColumnName, ColumnType, Datum, RelationDesc, Row};
13
14pub trait Encode {
15    fn encode_unchecked(&self, row: Row) -> Vec<u8>;
16
17    /// Given the output of a call to [`Encode::encode_unchecked`], returns
18    /// a hash that is suitable for stable partitioning.
19    fn hash(&self, buf: &[u8]) -> u64 {
20        // We use seahash as it outperforms pretty much all other options, and
21        // has great mathematically proven statistical properties. It truly is a
22        // remarkable non-cryptographic hash. More details can be found here:
23        // https://docs.rs/seahash/latest/seahash/
24        seahash::hash(buf)
25    }
26}
27
28/// Bundled information sufficient to encode Datums.
29#[derive(Debug)]
30pub struct TypedDatum<'a> {
31    pub datum: Datum<'a>,
32    pub typ: &'a ColumnType,
33}
34
35impl<'a> TypedDatum<'a> {
36    /// Pairs a datum and its type, for encoding.
37    pub fn new(datum: Datum<'a>, typ: &'a ColumnType) -> Self {
38        Self { datum, typ }
39    }
40}
41
42/// Extracts deduplicated column names and types from a relation description.
43pub fn column_names_and_types(desc: RelationDesc) -> Vec<(ColumnName, ColumnType)> {
44    // Invent names for columns that don't have a name.
45    let mut columns: Vec<_> = desc.into_iter().collect();
46
47    // Deduplicate names.
48    let mut seen = BTreeSet::new();
49    for (name, _ty) in &mut columns {
50        let stem_len = name.as_str().len();
51        let mut i = 1;
52        while seen.contains(name) {
53            name.as_mut_str().truncate(stem_len);
54            if name.as_str().ends_with(|c: char| c.is_ascii_digit()) {
55                name.as_mut_str().push('_');
56            }
57            name.as_mut_str().push_str(&i.to_string());
58            i += 1;
59        }
60        seen.insert(name);
61    }
62    columns
63}