mz_interchange/
encode.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use std::collections::BTreeSet;
11
12use mz_repr::{ColumnName, Datum, RelationDesc, Row, SqlColumnType};
13
/// Interface for turning a [`Row`] into a byte buffer and for hashing such a
/// buffer.
pub trait Encode {
    /// Consumes `row` and returns its encoded bytes.
    ///
    /// NOTE(review): the `_unchecked` suffix suggests implementations do not
    /// validate `row` before encoding it — confirm against the implementors.
    fn encode_unchecked(&self, row: Row) -> Vec<u8>;

    /// Given the output of a call to [`Encode::encode_unchecked`], returns
    /// a hash that is suitable for stable partitioning.
    ///
    /// The default implementation hashes `buf` with `seahash::hash`;
    /// implementors may override it.
    fn hash(&self, buf: &[u8]) -> u64 {
        // We use seahash as it outperforms pretty much all other options, and
        // has great mathematically proven statistical properties. It truly is a
        // remarkable non-cryptographic hash. More details can be found here:
        // https://docs.rs/seahash/latest/seahash/
        seahash::hash(buf)
    }
}
27
/// Bundled information sufficient to encode Datums.
///
/// Pairs a single [`Datum`] with a reference to the [`SqlColumnType`] that
/// describes it; both are tied to the lifetime `'a`.
#[derive(Debug)]
pub struct TypedDatum<'a> {
    /// The value to encode.
    pub datum: Datum<'a>,
    /// The column type paired with `datum`.
    pub typ: &'a SqlColumnType,
}
34
35impl<'a> TypedDatum<'a> {
36    /// Pairs a datum and its type, for encoding.
37    pub fn new(datum: Datum<'a>, typ: &'a SqlColumnType) -> Self {
38        Self { datum, typ }
39    }
40}
41
42/// Extracts deduplicated column names and types from a relation description.
43pub fn column_names_and_types(desc: RelationDesc) -> Vec<(ColumnName, SqlColumnType)> {
44    // Invent names for columns that don't have a name.
45    let mut columns: Vec<_> = desc.into_iter().collect();
46
47    let mut name = String::new();
48    // Deduplicate names.
49    let mut seen = BTreeSet::new();
50    for (column_name, _ty) in &mut columns {
51        name.clear();
52        name.push_str(column_name.as_str());
53        let stem_len = name.len();
54        let mut i = 1;
55        while seen.contains(&name) {
56            name.truncate(stem_len);
57            if name.ends_with(|c: char| c.is_ascii_digit()) {
58                name.push('_');
59            }
60            name.push_str(&i.to_string());
61            i += 1;
62        }
63        seen.insert(name.clone());
64        if column_name.as_str() != name {
65            *column_name.as_mut_boxed_str() = name.clone().into();
66        }
67    }
68    columns
69}