mz_repr/adt/char.rs
1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use std::error::Error;
11use std::fmt;
12
13use anyhow::bail;
14use mz_lowertest::MzReflect;
15use mz_ore::cast::CastFrom;
16use mz_proto::{RustType, TryFromProtoError};
17use proptest::arbitrary::Arbitrary;
18use proptest::strategy::{BoxedStrategy, Strategy};
19use serde::{Deserialize, Serialize};
20
21include!(concat!(env!("OUT_DIR"), "/mz_repr.adt.char.rs"));
22
/// Upper bound on the declared length of a character type, i.e. ten mebibytes
/// (10,485,760).
///
/// The value is taken from PostgreSQL; see
/// <https://github.com/postgres/postgres/blob/REL_14_0/src/include/access/htup_details.h#L577-L584>.
const MAX_LENGTH: u32 = 10 * 1024 * 1024;
25
/// Marker wrapper that tags a Rust string as carrying
/// [`ScalarType::Char`] semantics.
///
/// The wrapped string is exposed directly through the public tuple field.
///
/// [`ScalarType::Char`]: crate::ScalarType::Char
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Char<S>(pub S)
where
    S: AsRef<str>;
32
33/// The `length` of a [`ScalarType::Char`].
34///
35/// This newtype wrapper ensures that the length is within the valid range.
36///
37/// [`ScalarType::Char`]: crate::ScalarType::Char
38#[derive(
39 Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize, MzReflect,
40)]
41pub struct CharLength(pub(crate) u32);
42
43impl CharLength {
44 /// A length of one.
45 pub const ONE: CharLength = CharLength(1);
46
47 /// Consumes the newtype wrapper, returning the inner `u32`.
48 pub fn into_u32(self) -> u32 {
49 self.0
50 }
51}
52
53impl TryFrom<i64> for CharLength {
54 type Error = InvalidCharLengthError;
55
56 fn try_from(length: i64) -> Result<Self, Self::Error> {
57 match u32::try_from(length) {
58 Ok(length) if length > 0 && length < MAX_LENGTH => Ok(CharLength(length)),
59 _ => Err(InvalidCharLengthError),
60 }
61 }
62}
63
64impl Arbitrary for CharLength {
65 type Parameters = ();
66 type Strategy = BoxedStrategy<CharLength>;
67
68 fn arbitrary_with(_args: Self::Parameters) -> Self::Strategy {
69 proptest::arbitrary::any::<u32>()
70 // We cap the maximum CharLength to prevent generating massive
71 // strings which can greatly slow down tests and are relatively
72 // uninteresting.
73 .prop_map(|len| CharLength(len % 300))
74 .boxed()
75 }
76}
77
/// Error produced when a value that is not a valid length is converted into a
/// [`CharLength`].
#[derive(Clone, Debug)]
pub struct InvalidCharLengthError;
81
82impl fmt::Display for InvalidCharLengthError {
83 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
84 write!(
85 f,
86 "length for type character must be between 1 and {}",
87 MAX_LENGTH
88 )
89 }
90}
91
// Marks `InvalidCharLengthError` as a standard error type. The impl body is
// empty, so every `Error` method keeps its default implementation.
impl Error for InvalidCharLengthError {}
93
/// Selects what happens to the trailing whitespace of bpchar data.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum CharWhiteSpace {
    /// Strip trailing whitespace, which is the right form for storing bpchar
    /// data in Materialize. Datums hold bpchar data with the trailing
    /// whitespace removed so that equality behaves as it does in PG, while
    /// still allowing rows to be compared with bit-wise equality.
    Trim,
    /// Blank pad the string, which is the right form for returning bpchar
    /// data out of Materialize.
    Pad,
}

impl CharWhiteSpace {
    /// Applies this whitespace policy to `s` and returns the result as an
    /// owned `String`.
    ///
    /// * `Trim` removes trailing whitespace and ignores `length`.
    /// * `Pad` with `Some(length)` right-pads `s` with spaces up to `length`
    ///   characters; a string that is already longer is returned as is.
    /// * `Pad` with `None` returns `s` unchanged.
    fn process_str(&self, s: &str, length: Option<usize>) -> String {
        match (*self, length) {
            (CharWhiteSpace::Trim, _) => s.trim_end().to_owned(),
            (CharWhiteSpace::Pad, Some(pad_to)) => format!("{:<pad_to$}", s, pad_to = pad_to),
            // No width is available when e.g. printing lists.
            (CharWhiteSpace::Pad, None) => s.to_owned(),
        }
    }
}
119
120/// Returns `s` as a `String` with options to enforce char and varchar
121/// semantics.
122///
123/// # Arguments
124/// * `s` - The `str` to format
125/// * `length` - An optional maximum length for the string
126/// * `fail_on_len` - Return an error if `s`'s character count exceeds the
127/// specified maximum length.
128/// * `white_space` - Express how to handle trailing whitespace on `s`
129///
130/// This function should only fail when `fail_on_len` is `true` _and_ `length`
131/// is present and exceeded.
132fn format_char_str(
133 s: &str,
134 length: Option<CharLength>,
135 fail_on_len: bool,
136 white_space: CharWhiteSpace,
137) -> Result<String, anyhow::Error> {
138 Ok(match length {
139 // Note that length is 1-indexed, so finding `None` means the string's
140 // characters don't exceed the length, while finding `Some` means it
141 // does.
142 Some(l) => {
143 let l = usize::cast_from(l.into_u32());
144 // The number of chars in a string is always less or equal to the length of the string.
145 // Hence, if the string is shorter than the length, we do not have to check for
146 // the maximum length.
147 if s.len() < l {
148 return Ok(white_space.process_str(s, Some(l)));
149 }
150 match s.char_indices().nth(l) {
151 None => white_space.process_str(s, Some(l)),
152 Some((idx, _)) => {
153 if !fail_on_len || s[idx..].chars().all(|c| c.is_ascii_whitespace()) {
154 white_space.process_str(&s[..idx], Some(l))
155 } else {
156 bail!("{} exceeds maximum length of {}", s, l)
157 }
158 }
159 }
160 }
161 None => white_space.process_str(s, None),
162 })
163}
164
165/// Ensures that `s` has fewer than `length` characters, and returns a `String`
166/// version of it with all whitespace trimmed from the end.
167///
168/// The value returned is appropriate to store in `Datum::String`, but _is not_
169/// appropriate to return to clients.
170///
171/// This function should only fail when `fail_on_len` is `true` _and_ `length`
172/// is present and exceeded.
173pub fn format_str_trim(
174 s: &str,
175 length: Option<CharLength>,
176 fail_on_len: bool,
177) -> Result<String, anyhow::Error> {
178 format_char_str(s, length, fail_on_len, CharWhiteSpace::Trim)
179}
180
181/// Ensures that `s` has fewer than `length` characters, and returns a `String`
182/// version of it with blank padding so that its width is `length` characters.
183///
184/// The value returned is appropriate to return to clients, but _is not_
185/// appropriate to store in `Datum::String`.
186pub fn format_str_pad(s: &str, length: Option<CharLength>) -> String {
187 format_char_str(s, length, false, CharWhiteSpace::Pad).unwrap()
188}
189
190impl RustType<ProtoCharLength> for CharLength {
191 fn into_proto(&self) -> ProtoCharLength {
192 ProtoCharLength { value: self.0 }
193 }
194
195 fn from_proto(proto: ProtoCharLength) -> Result<Self, TryFromProtoError> {
196 Ok(CharLength(proto.value))
197 }
198}
199
#[cfg(test)]
mod tests {
    use mz_ore::assert_ok;
    use mz_proto::protobuf_roundtrip;
    use proptest::prelude::*;

    use super::*;

    proptest! {
        // Any generated `CharLength` must come back unchanged after being
        // converted to `ProtoCharLength` and decoded again.
        #[mz_ore::test]
        fn char_length_protobuf_roundtrip(expect in any::<CharLength>()) {
            let roundtripped = protobuf_roundtrip::<_, ProtoCharLength>(&expect);
            assert_ok!(roundtripped);
            assert_eq!(roundtripped.unwrap(), expect);
        }
    }
}
216}