mz_mysql_util/
decoding.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use std::str::FromStr;
11
12use itertools::{EitherOrBoth, Itertools};
13use mysql_common::value::convert::from_value_opt;
14use mysql_common::{Row as MySqlRow, Value};
15
16use mz_ore::cast::CastFrom;
17use mz_ore::error::ErrorExt;
18use mz_repr::adt::date::Date;
19use mz_repr::adt::jsonb::JsonbPacker;
20use mz_repr::adt::numeric::{NUMERIC_DATUM_MAX_PRECISION, Numeric, get_precision, get_scale};
21use mz_repr::adt::timestamp::CheckedTimestamp;
22use mz_repr::{Datum, Row, RowPacker, ScalarType};
23
24use crate::desc::MySqlColumnMeta;
25use crate::{MySqlColumnDesc, MySqlError, MySqlTableDesc};
26
27pub fn pack_mysql_row(
28    row_container: &mut Row,
29    row: MySqlRow,
30    table_desc: &MySqlTableDesc,
31) -> Result<Row, MySqlError> {
32    let mut packer = row_container.packer();
33    let row_values = row.unwrap();
34
35    for values in table_desc.columns.iter().zip_longest(row_values) {
36        let (col_desc, value) = match values {
37            EitherOrBoth::Both(col_desc, value) => (col_desc, value),
38            EitherOrBoth::Left(col_desc) => {
39                tracing::error!(
40                    "mysql: extra column description {col_desc:?} for table {}",
41                    table_desc.name
42                );
43                Err(MySqlError::ValueDecodeError {
44                    column_name: col_desc.name.clone(),
45                    qualified_table_name: format!("{}.{}", table_desc.schema_name, table_desc.name),
46                    error: "extra column description".to_string(),
47                })?
48            }
49            EitherOrBoth::Right(_) => {
50                // If there are extra columns on the upstream table we can safely ignore them
51                break;
52            }
53        };
54        if col_desc.column_type.is_none() {
55            // This column is ignored, so don't decode it.
56            continue;
57        }
58        match pack_val_as_datum(value, col_desc, &mut packer) {
59            Err(err) => Err(MySqlError::ValueDecodeError {
60                column_name: col_desc.name.clone(),
61                qualified_table_name: format!("{}.{}", table_desc.schema_name, table_desc.name),
62                error: err.to_string(),
63            })?,
64            Ok(()) => (),
65        };
66    }
67
68    Ok(row_container.clone())
69}
70
71// TODO(guswynn|roshan): This function has various `.to_string()` and `format!` calls that should
72// use a shared allocation if possible.
73fn pack_val_as_datum(
74    value: Value,
75    col_desc: &MySqlColumnDesc,
76    packer: &mut RowPacker,
77) -> Result<(), anyhow::Error> {
78    let column_type = match col_desc.column_type {
79        Some(ref column_type) => column_type,
80        None => anyhow::bail!("column type is not set for column: {}", col_desc.name),
81    };
82    match value {
83        Value::NULL => {
84            if column_type.nullable {
85                packer.push(Datum::Null);
86            } else {
87                Err(anyhow::anyhow!(
88                    "received a null value in a non-null column".to_string(),
89                ))?
90            }
91        }
92        value => match &column_type.scalar_type {
93            ScalarType::Bool => packer.push(Datum::from(from_value_opt::<bool>(value)?)),
94            ScalarType::UInt16 => packer.push(Datum::from(from_value_opt::<u16>(value)?)),
95            ScalarType::Int16 => packer.push(Datum::from(from_value_opt::<i16>(value)?)),
96            ScalarType::UInt32 => packer.push(Datum::from(from_value_opt::<u32>(value)?)),
97            ScalarType::Int32 => packer.push(Datum::from(from_value_opt::<i32>(value)?)),
98            ScalarType::UInt64 => {
99                if let Some(MySqlColumnMeta::Bit(precision)) = &col_desc.meta {
100                    let mut value = from_value_opt::<Vec<u8>>(value)?;
101
102                    // Ensure we have the correct number of bytes.
103                    let precision_bytes = (precision + 7) / 8;
104                    if value.len() != usize::cast_from(precision_bytes) {
105                        return Err(anyhow::anyhow!("'bit' column out of range!"));
106                    }
107                    // Be defensive and prune any bits that come over the wire and are
108                    // greater than our precision.
109                    let bit_index = precision % 8;
110                    if bit_index != 0 {
111                        let mask = !(u8::MAX << bit_index);
112                        if value.len() > 0 {
113                            value[0] &= mask;
114                        }
115                    }
116
117                    // Based on experimentation the value coming across the wire is
118                    // encoded in big-endian.
119                    let mut buf = [0u8; 8];
120                    buf[(8 - value.len())..].copy_from_slice(value.as_slice());
121                    let value = u64::from_be_bytes(buf);
122                    packer.push(Datum::from(value))
123                } else {
124                    packer.push(Datum::from(from_value_opt::<u64>(value)?))
125                }
126            }
127            ScalarType::Int64 => packer.push(Datum::from(from_value_opt::<i64>(value)?)),
128            ScalarType::Float32 => packer.push(Datum::from(from_value_opt::<f32>(value)?)),
129            ScalarType::Float64 => packer.push(Datum::from(from_value_opt::<f64>(value)?)),
130            ScalarType::Char { length } => {
131                let val = from_value_opt::<String>(value)?;
132                check_char_length(length.map(|l| l.into_u32()), &val, col_desc)?;
133                packer.push(Datum::String(&val));
134            }
135            ScalarType::VarChar { max_length } => {
136                let val = from_value_opt::<String>(value)?;
137                check_char_length(max_length.map(|l| l.into_u32()), &val, col_desc)?;
138                packer.push(Datum::String(&val));
139            }
140            ScalarType::String => {
141                // Special case for string types, since this is the scalar type used for a column
142                // specified as a 'TEXT COLUMN'. In some cases we need to check the column
143                // metadata to know if the upstream value needs special handling
144                match &col_desc.meta {
145                    Some(MySqlColumnMeta::Enum(e)) => {
146                        match value {
147                            Value::Bytes(data) => {
148                                let data = std::str::from_utf8(&data)?;
149                                packer.push(Datum::String(data));
150                            }
151                            Value::Int(val) => {
152                                // Enum types are provided as 1-indexed integers in the replication
153                                // stream, so we need to find the string value from the enum meta
154                                let enum_val = e.values.get(usize::try_from(val)? - 1).ok_or(
155                                    anyhow::anyhow!(
156                                        "received invalid enum value: {} for column {}",
157                                        val,
158                                        col_desc.name
159                                    ),
160                                )?;
161                                packer.push(Datum::String(enum_val));
162                            }
163                            _ => Err(anyhow::anyhow!(
164                                "received unexpected value for enum type: {:?}",
165                                value
166                            ))?,
167                        }
168                    }
169                    Some(MySqlColumnMeta::Json) => {
170                        // JSON types in a query response are encoded as a string with whitespace,
171                        // but when parsed from the binlog event by mysql-common they are provided
172                        // as an encoded string sans-whitespace.
173                        if let Value::Bytes(data) = value {
174                            let json = serde_json::from_slice::<serde_json::Value>(&data)?;
175                            packer.push(Datum::String(&json.to_string()));
176                        } else {
177                            Err(anyhow::anyhow!(
178                                "received unexpected value for json type: {:?}",
179                                value
180                            ))?;
181                        }
182                    }
183                    Some(MySqlColumnMeta::Year) => {
184                        let val = from_value_opt::<u16>(value)?;
185                        packer.push(Datum::String(&val.to_string()));
186                    }
187                    Some(MySqlColumnMeta::Date) => {
188                        // Some MySQL dates are invalid in chrono/NaiveDate (e.g. 0000-00-00), so
189                        // we need to handle them directly as strings
190                        if let Value::Date(y, m, d, 0, 0, 0, 0) = value {
191                            packer.push(Datum::String(&format!("{:04}-{:02}-{:02}", y, m, d)));
192                        } else {
193                            Err(anyhow::anyhow!(
194                                "received unexpected value for date type: {:?}",
195                                value
196                            ))?;
197                        }
198                    }
199                    Some(MySqlColumnMeta::Timestamp(precision)) => {
200                        // Some MySQL dates are invalid in chrono/NaiveDate (e.g. 0000-00-00), so
201                        // we need to handle them directly as strings
202                        if let Value::Date(y, m, d, h, mm, s, ms) = value {
203                            if *precision > 0 {
204                                let precision: usize = (*precision).try_into()?;
205                                packer.push(Datum::String(&format!(
206                                    "{:04}-{:02}-{:02} {:02}:{:02}:{:02}.{:0precision$}",
207                                    y,
208                                    m,
209                                    d,
210                                    h,
211                                    mm,
212                                    s,
213                                    ms,
214                                    precision = precision
215                                )));
216                            } else {
217                                packer.push(Datum::String(&format!(
218                                    "{:04}-{:02}-{:02} {:02}:{:02}:{:02}",
219                                    y, m, d, h, mm, s
220                                )));
221                            }
222                        } else {
223                            Err(anyhow::anyhow!(
224                                "received unexpected value for timestamp type: {:?}",
225                                value
226                            ))?;
227                        }
228                    }
229                    Some(MySqlColumnMeta::Bit(_)) => unreachable!("parsed as a u64"),
230                    None => {
231                        packer.push(Datum::String(&from_value_opt::<String>(value)?));
232                    }
233                }
234            }
235            ScalarType::Jsonb => {
236                if let Value::Bytes(data) = value {
237                    let packer = JsonbPacker::new(packer);
238                    // TODO(guswynn): This still produces and extract allocation (in the
239                    // `DeserializeSeed` impl used internally), which should be improved,
240                    // for all users of the APIs in that module.
241                    packer.pack_slice(&data).map_err(|e| {
242                        anyhow::anyhow!(
243                            "Failed to decode JSON: {}",
244                            // See if we can output the string that failed to be converted to JSON.
245                            match std::str::from_utf8(&data) {
246                                Ok(str) => str.to_string(),
247                                // Otherwise produce the nominally helpful error.
248                                Err(_) => e.display_with_causes().to_string(),
249                            }
250                        )
251                    })?;
252                } else {
253                    Err(anyhow::anyhow!(
254                        "received unexpected value for json type: {:?}",
255                        value
256                    ))?
257                }
258            }
259            ScalarType::Bytes => {
260                let data = from_value_opt::<Vec<u8>>(value)?;
261                packer.push(Datum::Bytes(&data));
262            }
263            ScalarType::Date => {
264                let date = Date::try_from(from_value_opt::<chrono::NaiveDate>(value)?)?;
265                packer.push(Datum::from(date));
266            }
267            ScalarType::Timestamp { precision: _ } => {
268                // Timestamps are encoded as different mysql_common::Value types depending on
269                // whether they are from a binlog event or a query, and depending on which
270                // mysql timestamp version is used. We handle those cases here
271                // https://github.com/blackbeam/rust_mysql_common/blob/v0.31.0/src/binlog/value.rs#L87-L155
272                // https://github.com/blackbeam/rust_mysql_common/blob/v0.31.0/src/value/mod.rs#L332
273                let chrono_timestamp = match value {
274                    Value::Date(..) => from_value_opt::<chrono::NaiveDateTime>(value)?,
275                    // old temporal format from before MySQL 5.6; didn't support fractional seconds
276                    Value::Int(val) => chrono::DateTime::from_timestamp(val, 0)
277                        .ok_or(anyhow::anyhow!("received invalid timestamp value: {}", val))?
278                        .naive_utc(),
279                    Value::Bytes(data) => {
280                        let data = std::str::from_utf8(&data)?;
281                        if data.contains('.') {
282                            chrono::NaiveDateTime::parse_from_str(data, "%s%.6f")?
283                        } else {
284                            chrono::NaiveDateTime::parse_from_str(data, "%s")?
285                        }
286                    }
287                    _ => Err(anyhow::anyhow!(
288                        "received unexpected value for timestamp type: {:?}",
289                        value
290                    ))?,
291                };
292                packer.push(Datum::try_from(CheckedTimestamp::try_from(
293                    chrono_timestamp,
294                )?)?);
295            }
296            ScalarType::Time => {
297                packer.push(Datum::from(from_value_opt::<chrono::NaiveTime>(value)?));
298            }
299            ScalarType::Numeric { max_scale } => {
300                // The wire-format of numeric types is a string when sent in a binary query
301                // response but is represented in a decimal binary format when sent in a binlog
302                // event. However the mysql-common crate abstracts this away and always returns
303                // a string. We parse the string into a numeric type here.
304                let val = from_value_opt::<String>(value)?;
305                let val = Numeric::from_str(&val)?;
306                if get_precision(&val) > NUMERIC_DATUM_MAX_PRECISION.into() {
307                    Err(anyhow::anyhow!(
308                        "received numeric value with precision {} for column {} which has a max precision of {}",
309                        get_precision(&val),
310                        col_desc.name,
311                        NUMERIC_DATUM_MAX_PRECISION
312                    ))?
313                }
314                if let Some(max_scale) = max_scale {
315                    if get_scale(&val) > max_scale.into_u8().into() {
316                        Err(anyhow::anyhow!(
317                            "received numeric value with scale {} for column {} which has a max scale of {}",
318                            get_scale(&val),
319                            col_desc.name,
320                            max_scale.into_u8()
321                        ))?
322                    }
323                }
324                packer.push(Datum::from(val));
325            }
326            // TODO(roshan): IMPLEMENT OTHER TYPES
327            data_type => Err(anyhow::anyhow!(
328                "received unexpected value for type: {:?}: {:?}",
329                data_type,
330                value
331            ))?,
332        },
333    }
334    Ok(())
335}
336
337fn check_char_length(
338    length: Option<u32>,
339    val: &str,
340    col_desc: &MySqlColumnDesc,
341) -> Result<(), anyhow::Error> {
342    if let Some(length) = length {
343        if let Some(_) = val.char_indices().nth(usize::cast_from(length)) {
344            Err(anyhow::anyhow!(
345                "received string value of length {} for column {} which has a max length of {}",
346                val.len(),
347                col_desc.name,
348                length
349            ))?
350        }
351    }
352    Ok(())
353}