mz_interchange/encode.rs
1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use std::collections::BTreeSet;
11
12use mz_repr::{ColumnName, ColumnType, Datum, RelationDesc, Row};
13
14pub trait Encode {
15 fn encode_unchecked(&self, row: Row) -> Vec<u8>;
16
17 /// Given the output of a call to [`Encode::encode_unchecked`], returns
18 /// a hash that is suitable for stable partitioning.
19 fn hash(&self, buf: &[u8]) -> u64 {
20 // We use seahash as it outperforms pretty much all other options, and
21 // has great mathematically proven statistical properties. It truly is a
22 // remarkable non-cryptographic hash. More details can be found here:
23 // https://docs.rs/seahash/latest/seahash/
24 seahash::hash(buf)
25 }
26}
27
28/// Bundled information sufficient to encode Datums.
29#[derive(Debug)]
30pub struct TypedDatum<'a> {
31 pub datum: Datum<'a>,
32 pub typ: &'a ColumnType,
33}
34
35impl<'a> TypedDatum<'a> {
36 /// Pairs a datum and its type, for encoding.
37 pub fn new(datum: Datum<'a>, typ: &'a ColumnType) -> Self {
38 Self { datum, typ }
39 }
40}
41
42/// Extracts deduplicated column names and types from a relation description.
43pub fn column_names_and_types(desc: RelationDesc) -> Vec<(ColumnName, ColumnType)> {
44 // Invent names for columns that don't have a name.
45 let mut columns: Vec<_> = desc.into_iter().collect();
46
47 // Deduplicate names.
48 let mut seen = BTreeSet::new();
49 for (name, _ty) in &mut columns {
50 let stem_len = name.as_str().len();
51 let mut i = 1;
52 while seen.contains(name) {
53 name.as_mut_str().truncate(stem_len);
54 if name.as_str().ends_with(|c: char| c.is_ascii_digit()) {
55 name.as_mut_str().push('_');
56 }
57 name.as_mut_str().push_str(&i.to_string());
58 i += 1;
59 }
60 seen.insert(name);
61 }
62 columns
63}