mz_interchange/encode.rs
1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use std::collections::BTreeSet;
11
12use mz_repr::{ColumnName, Datum, RelationDesc, Row, SqlColumnType};
13
14pub trait Encode {
15 fn encode_unchecked(&self, row: Row) -> Vec<u8>;
16
17 /// Given the output of a call to [`Encode::encode_unchecked`], returns
18 /// a hash that is suitable for stable partitioning.
19 fn hash(&self, buf: &[u8]) -> u64 {
20 // We use seahash as it outperforms pretty much all other options, and
21 // has great mathematically proven statistical properties. It truly is a
22 // remarkable non-cryptographic hash. More details can be found here:
23 // https://docs.rs/seahash/latest/seahash/
24 seahash::hash(buf)
25 }
26}
27
28/// Bundled information sufficient to encode Datums.
29#[derive(Debug)]
30pub struct TypedDatum<'a> {
31 pub datum: Datum<'a>,
32 pub typ: &'a SqlColumnType,
33}
34
35impl<'a> TypedDatum<'a> {
36 /// Pairs a datum and its type, for encoding.
37 pub fn new(datum: Datum<'a>, typ: &'a SqlColumnType) -> Self {
38 Self { datum, typ }
39 }
40}
41
42/// Extracts deduplicated column names and types from a relation description.
43pub fn column_names_and_types(desc: RelationDesc) -> Vec<(ColumnName, SqlColumnType)> {
44 // Invent names for columns that don't have a name.
45 let mut columns: Vec<_> = desc.into_iter().collect();
46
47 let mut name = String::new();
48 // Deduplicate names.
49 let mut seen = BTreeSet::new();
50 for (column_name, _ty) in &mut columns {
51 name.clear();
52 name.push_str(column_name.as_str());
53 let stem_len = name.len();
54 let mut i = 1;
55 while seen.contains(&name) {
56 name.truncate(stem_len);
57 if name.ends_with(|c: char| c.is_ascii_digit()) {
58 name.push('_');
59 }
60 name.push_str(&i.to_string());
61 i += 1;
62 }
63 seen.insert(name.clone());
64 if column_name.as_str() != name {
65 *column_name.as_mut_boxed_str() = name.clone().into();
66 }
67 }
68 columns
69}