mz_mysql_util/
decoding.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use std::str::FromStr;
11
12use itertools::{EitherOrBoth, Itertools};
13use mysql_common::value::convert::from_value_opt;
14use mysql_common::{Row as MySqlRow, Value};
15
16use mz_ore::cast::CastFrom;
17use mz_ore::error::ErrorExt;
18use mz_repr::adt::date::Date;
19use mz_repr::adt::jsonb::JsonbPacker;
20use mz_repr::adt::numeric::{NUMERIC_DATUM_MAX_PRECISION, Numeric, get_precision, get_scale};
21use mz_repr::adt::timestamp::CheckedTimestamp;
22use mz_repr::{Datum, Row, RowPacker, ScalarType};
23
24use crate::desc::MySqlColumnMeta;
25use crate::{MySqlColumnDesc, MySqlError, MySqlTableDesc};
26
27pub fn pack_mysql_row(
28    row_container: &mut Row,
29    row: MySqlRow,
30    table_desc: &MySqlTableDesc,
31) -> Result<Row, MySqlError> {
32    let mut packer = row_container.packer();
33    let row_values = row.unwrap();
34
35    for values in table_desc.columns.iter().zip_longest(row_values) {
36        let (col_desc, value) = match values {
37            EitherOrBoth::Both(col_desc, value) => (col_desc, value),
38            EitherOrBoth::Left(col_desc) => {
39                tracing::error!(
40                    "mysql: extra column description {col_desc:?} for table {}",
41                    table_desc.name
42                );
43                Err(MySqlError::ValueDecodeError {
44                    column_name: col_desc.name.clone(),
45                    qualified_table_name: format!("{}.{}", table_desc.schema_name, table_desc.name),
46                    error: "extra column description".to_string(),
47                })?
48            }
49            EitherOrBoth::Right(_) => {
50                // If there are extra columns on the upstream table we can safely ignore them
51                break;
52            }
53        };
54        if col_desc.column_type.is_none() {
55            // This column is ignored, so don't decode it.
56            continue;
57        }
58        match pack_val_as_datum(value, col_desc, &mut packer) {
59            Err(err) => Err(MySqlError::ValueDecodeError {
60                column_name: col_desc.name.clone(),
61                qualified_table_name: format!("{}.{}", table_desc.schema_name, table_desc.name),
62                error: err.to_string(),
63            })?,
64            Ok(()) => (),
65        };
66    }
67
68    Ok(row_container.clone())
69}
70
71// TODO(guswynn|roshan): This function has various `.to_string()` and `format!` calls that should
72// use a shared allocation if possible.
73fn pack_val_as_datum(
74    value: Value,
75    col_desc: &MySqlColumnDesc,
76    packer: &mut RowPacker,
77) -> Result<(), anyhow::Error> {
78    let column_type = match col_desc.column_type {
79        Some(ref column_type) => column_type,
80        None => anyhow::bail!("column type is not set for column: {}", col_desc.name),
81    };
82    match value {
83        Value::NULL => {
84            if column_type.nullable {
85                packer.push(Datum::Null);
86            } else {
87                Err(anyhow::anyhow!(
88                    "received a null value in a non-null column".to_string(),
89                ))?
90            }
91        }
92        value => match &column_type.scalar_type {
93            ScalarType::Bool => packer.push(Datum::from(from_value_opt::<bool>(value)?)),
94            ScalarType::UInt16 => packer.push(Datum::from(from_value_opt::<u16>(value)?)),
95            ScalarType::Int16 => packer.push(Datum::from(from_value_opt::<i16>(value)?)),
96            ScalarType::UInt32 => packer.push(Datum::from(from_value_opt::<u32>(value)?)),
97            ScalarType::Int32 => packer.push(Datum::from(from_value_opt::<i32>(value)?)),
98            ScalarType::UInt64 => {
99                if let Some(MySqlColumnMeta::Bit(precision)) = &col_desc.meta {
100                    let mut value = from_value_opt::<Vec<u8>>(value)?;
101
102                    // Ensure we have the correct number of bytes.
103                    let precision_bytes = (precision + 7) / 8;
104                    if value.len() != usize::cast_from(precision_bytes) {
105                        return Err(anyhow::anyhow!("'bit' column out of range!"));
106                    }
107                    // Be defensive and prune any bits that come over the wire and are
108                    // greater than our precision.
109                    let bit_index = precision % 8;
110                    if bit_index != 0 {
111                        let mask = !(u8::MAX << bit_index);
112                        if value.len() > 0 {
113                            value[0] &= mask;
114                        }
115                    }
116
117                    // Based on experimentation the value coming across the wire is
118                    // encoded in big-endian.
119                    let mut buf = [0u8; 8];
120                    buf[(8 - value.len())..].copy_from_slice(value.as_slice());
121                    let value = u64::from_be_bytes(buf);
122                    packer.push(Datum::from(value))
123                } else {
124                    packer.push(Datum::from(from_value_opt::<u64>(value)?))
125                }
126            }
127            ScalarType::Int64 => packer.push(Datum::from(from_value_opt::<i64>(value)?)),
128            ScalarType::Float32 => packer.push(Datum::from(from_value_opt::<f32>(value)?)),
129            ScalarType::Float64 => packer.push(Datum::from(from_value_opt::<f64>(value)?)),
130            ScalarType::Char { length } => {
131                let val = from_value_opt::<String>(value)?;
132                check_char_length(length.map(|l| l.into_u32()), &val, col_desc)?;
133                packer.push(Datum::String(&val));
134            }
135            ScalarType::VarChar { max_length } => {
136                let val = from_value_opt::<String>(value)?;
137                check_char_length(max_length.map(|l| l.into_u32()), &val, col_desc)?;
138                packer.push(Datum::String(&val));
139            }
140            ScalarType::String => {
141                // Special case for string types, since this is the scalar type used for a column
142                // specified as a 'TEXT COLUMN'. In some cases we need to check the column
143                // metadata to know if the upstream value needs special handling
144                match &col_desc.meta {
145                    Some(MySqlColumnMeta::Enum(e)) => {
146                        match value {
147                            Value::Bytes(data) => {
148                                let data = std::str::from_utf8(&data)?;
149                                packer.push(Datum::String(data));
150                            }
151                            Value::Int(val) => {
152                                // Enum types are provided as 1-indexed integers in the replication
153                                // stream, so we need to find the string value from the enum meta
154                                let enum_val =
155                                    e.values.get(usize::try_from(val)? - 1).ok_or_else(|| {
156                                        anyhow::anyhow!(
157                                            "received invalid enum value: {} for column {}",
158                                            val,
159                                            col_desc.name
160                                        )
161                                    })?;
162                                packer.push(Datum::String(enum_val));
163                            }
164                            _ => Err(anyhow::anyhow!(
165                                "received unexpected value for enum type: {:?}",
166                                value
167                            ))?,
168                        }
169                    }
170                    Some(MySqlColumnMeta::Json) => {
171                        // JSON types in a query response are encoded as a string with whitespace,
172                        // but when parsed from the binlog event by mysql-common they are provided
173                        // as an encoded string sans-whitespace.
174                        if let Value::Bytes(data) = value {
175                            let json = serde_json::from_slice::<serde_json::Value>(&data)?;
176                            packer.push(Datum::String(&json.to_string()));
177                        } else {
178                            Err(anyhow::anyhow!(
179                                "received unexpected value for json type: {:?}",
180                                value
181                            ))?;
182                        }
183                    }
184                    Some(MySqlColumnMeta::Year) => {
185                        let val = from_value_opt::<u16>(value)?;
186                        packer.push(Datum::String(&val.to_string()));
187                    }
188                    Some(MySqlColumnMeta::Date) => {
189                        // Some MySQL dates are invalid in chrono/NaiveDate (e.g. 0000-00-00), so
190                        // we need to handle them directly as strings
191                        if let Value::Date(y, m, d, 0, 0, 0, 0) = value {
192                            packer.push(Datum::String(&format!("{:04}-{:02}-{:02}", y, m, d)));
193                        } else {
194                            Err(anyhow::anyhow!(
195                                "received unexpected value for date type: {:?}",
196                                value
197                            ))?;
198                        }
199                    }
200                    Some(MySqlColumnMeta::Timestamp(precision)) => {
201                        // Some MySQL dates are invalid in chrono/NaiveDate (e.g. 0000-00-00), so
202                        // we need to handle them directly as strings
203                        if let Value::Date(y, m, d, h, mm, s, ms) = value {
204                            if *precision > 0 {
205                                let precision: usize = (*precision).try_into()?;
206                                packer.push(Datum::String(&format!(
207                                    "{:04}-{:02}-{:02} {:02}:{:02}:{:02}.{:0precision$}",
208                                    y,
209                                    m,
210                                    d,
211                                    h,
212                                    mm,
213                                    s,
214                                    ms,
215                                    precision = precision
216                                )));
217                            } else {
218                                packer.push(Datum::String(&format!(
219                                    "{:04}-{:02}-{:02} {:02}:{:02}:{:02}",
220                                    y, m, d, h, mm, s
221                                )));
222                            }
223                        } else {
224                            Err(anyhow::anyhow!(
225                                "received unexpected value for timestamp type: {:?}",
226                                value
227                            ))?;
228                        }
229                    }
230                    Some(MySqlColumnMeta::Bit(_)) => unreachable!("parsed as a u64"),
231                    None => {
232                        packer.push(Datum::String(&from_value_opt::<String>(value)?));
233                    }
234                }
235            }
236            ScalarType::Jsonb => {
237                if let Value::Bytes(data) = value {
238                    let packer = JsonbPacker::new(packer);
239                    // TODO(guswynn): This still produces and extract allocation (in the
240                    // `DeserializeSeed` impl used internally), which should be improved,
241                    // for all users of the APIs in that module.
242                    packer.pack_slice(&data).map_err(|e| {
243                        anyhow::anyhow!(
244                            "Failed to decode JSON: {}",
245                            // See if we can output the string that failed to be converted to JSON.
246                            match std::str::from_utf8(&data) {
247                                Ok(str) => str.to_string(),
248                                // Otherwise produce the nominally helpful error.
249                                Err(_) => e.display_with_causes().to_string(),
250                            }
251                        )
252                    })?;
253                } else {
254                    Err(anyhow::anyhow!(
255                        "received unexpected value for json type: {:?}",
256                        value
257                    ))?
258                }
259            }
260            ScalarType::Bytes => {
261                let data = from_value_opt::<Vec<u8>>(value)?;
262                packer.push(Datum::Bytes(&data));
263            }
264            ScalarType::Date => {
265                let date = Date::try_from(from_value_opt::<chrono::NaiveDate>(value)?)?;
266                packer.push(Datum::from(date));
267            }
268            ScalarType::Timestamp { precision: _ } => {
269                // Timestamps are encoded as different mysql_common::Value types depending on
270                // whether they are from a binlog event or a query, and depending on which
271                // mysql timestamp version is used. We handle those cases here
272                // https://github.com/blackbeam/rust_mysql_common/blob/v0.31.0/src/binlog/value.rs#L87-L155
273                // https://github.com/blackbeam/rust_mysql_common/blob/v0.31.0/src/value/mod.rs#L332
274                let chrono_timestamp = match value {
275                    Value::Date(..) => from_value_opt::<chrono::NaiveDateTime>(value)?,
276                    // old temporal format from before MySQL 5.6; didn't support fractional seconds
277                    Value::Int(val) => chrono::DateTime::from_timestamp(val, 0)
278                        .ok_or_else(|| {
279                            anyhow::anyhow!("received invalid timestamp value: {}", val)
280                        })?
281                        .naive_utc(),
282                    Value::Bytes(data) => {
283                        let data = std::str::from_utf8(&data)?;
284                        if data.contains('.') {
285                            chrono::NaiveDateTime::parse_from_str(data, "%s%.6f")?
286                        } else {
287                            chrono::NaiveDateTime::parse_from_str(data, "%s")?
288                        }
289                    }
290                    _ => Err(anyhow::anyhow!(
291                        "received unexpected value for timestamp type: {:?}",
292                        value
293                    ))?,
294                };
295                packer.push(Datum::try_from(CheckedTimestamp::try_from(
296                    chrono_timestamp,
297                )?)?);
298            }
299            ScalarType::Time => {
300                packer.push(Datum::from(from_value_opt::<chrono::NaiveTime>(value)?));
301            }
302            ScalarType::Numeric { max_scale } => {
303                // The wire-format of numeric types is a string when sent in a binary query
304                // response but is represented in a decimal binary format when sent in a binlog
305                // event. However the mysql-common crate abstracts this away and always returns
306                // a string. We parse the string into a numeric type here.
307                let val = from_value_opt::<String>(value)?;
308                let val = Numeric::from_str(&val)?;
309                if get_precision(&val) > NUMERIC_DATUM_MAX_PRECISION.into() {
310                    Err(anyhow::anyhow!(
311                        "received numeric value with precision {} for column {} which has a max precision of {}",
312                        get_precision(&val),
313                        col_desc.name,
314                        NUMERIC_DATUM_MAX_PRECISION
315                    ))?
316                }
317                if let Some(max_scale) = max_scale {
318                    if get_scale(&val) > max_scale.into_u8().into() {
319                        Err(anyhow::anyhow!(
320                            "received numeric value with scale {} for column {} which has a max scale of {}",
321                            get_scale(&val),
322                            col_desc.name,
323                            max_scale.into_u8()
324                        ))?
325                    }
326                }
327                packer.push(Datum::from(val));
328            }
329            // TODO(roshan): IMPLEMENT OTHER TYPES
330            data_type => Err(anyhow::anyhow!(
331                "received unexpected value for type: {:?}: {:?}",
332                data_type,
333                value
334            ))?,
335        },
336    }
337    Ok(())
338}
339
340fn check_char_length(
341    length: Option<u32>,
342    val: &str,
343    col_desc: &MySqlColumnDesc,
344) -> Result<(), anyhow::Error> {
345    if let Some(length) = length {
346        if let Some(_) = val.char_indices().nth(usize::cast_from(length)) {
347            Err(anyhow::anyhow!(
348                "received string value of length {} for column {} which has a max length of {}",
349                val.len(),
350                col_desc.name,
351                length
352            ))?
353        }
354    }
355    Ok(())
356}