mz_transform/
column_knowledge.rs

Help
1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10//! Transformations based on pulling information about individual columns from sources.
11
12use std::collections::BTreeMap;
13
14use itertools::{Itertools, zip_eq};
15use mz_expr::JoinImplementation::IndexedFilter;
16use mz_expr::visit::Visit;
17use mz_expr::{
18    EvalError, LetRecLimit, MirRelationExpr, MirScalarExpr, RECURSION_LIMIT, UnaryFunc, func,
19};
20use mz_ore::cast::CastFrom;
21use mz_ore::stack::{CheckedRecursion, RecursionGuard};
22use mz_ore::{assert_none, soft_panic_or_log};
23use mz_repr::{ColumnType, Datum, RelationType, Row, ScalarType};
24
25use crate::{TransformCtx, TransformError};
26
27/// Harvest and act upon per-column information.
28#[derive(Debug)]
29pub struct ColumnKnowledge {
30    recursion_guard: RecursionGuard,
31}
32
33impl Default for ColumnKnowledge {
34    fn default() -> ColumnKnowledge {
35        ColumnKnowledge {
36            recursion_guard: RecursionGuard::with_limit(RECURSION_LIMIT),
37        }
38    }
39}
40
41impl CheckedRecursion for ColumnKnowledge {
42    fn recursion_guard(&self) -> &RecursionGuard {
43        &self.recursion_guard
44    }
45}
46
47impl crate::Transform for ColumnKnowledge {
48    fn name(&self) -> &'static str {
49        "ColumnKnowledge"
50    }
51
52    /// Transforms an expression through accumulated knowledge.
53    #[mz_ore::instrument(
54        target = "optimizer",
55        level = "debug",
56        fields(path.segment = "column_knowledge")
57    )]
58    fn actually_perform_transform(
59        &self,
60        expr: &mut MirRelationExpr,
61        _: &mut TransformCtx,
62    ) -> Result<(), TransformError> {
63        let mut knowledge_stack = Vec::<DatumKnowledge>::new();
64        let result = self
65            .harvest(expr, &mut BTreeMap::new(), &mut knowledge_stack)
66            .map(|_| ());
67        mz_repr::explain::trace_plan(&*expr);
68        result
69    }
70}
71
72impl ColumnKnowledge {
73    /// Harvest per-column knowledge.
74    ///
75    /// `knowledge_stack` is a pre-allocated vector but is expected not to contain any elements.
76    fn harvest(
77        &self,
78        expr: &mut MirRelationExpr,
79        knowledge: &mut BTreeMap<mz_expr::Id, Vec<DatumKnowledge>>,
80        knowledge_stack: &mut Vec<DatumKnowledge>,
81    ) -> Result<Vec<DatumKnowledge>, TransformError> {
82        self.checked_recur(|_| {
83            let result = match expr {
84                MirRelationExpr::ArrangeBy { input, .. } => {
85                    self.harvest(input, knowledge, knowledge_stack)
86                }
87                MirRelationExpr::Get { id, typ, .. } => {
88                    Ok(knowledge.get(id).cloned().unwrap_or_else(|| {
89                        typ.column_types.iter().map(DatumKnowledge::from).collect()
90                    }))
91                }
92                MirRelationExpr::Constant { rows, typ } => {
93                    // TODO: handle multi-row cases with some constant columns.
94                    if let Ok([(row, _diff)]) = rows.as_deref() {
95                        let knowledge = std::iter::zip(row.iter(), typ.column_types.iter())
96                            .map(DatumKnowledge::from)
97                            .collect();
98                        Ok(knowledge)
99                    } else {
100                        Ok(typ.column_types.iter().map(DatumKnowledge::from).collect())
101                    }
102                }
103                MirRelationExpr::Let { id, value, body } => {
104                    let value_knowledge = self.harvest(value, knowledge, knowledge_stack)?;
105                    let prior_knowledge =
106                        knowledge.insert(mz_expr::Id::Local(id.clone()), value_knowledge);
107                    let body_knowledge = self.harvest(body, knowledge, knowledge_stack)?;
108                    knowledge.remove(&mz_expr::Id::Local(id.clone()));
109                    if let Some(prior_knowledge) = prior_knowledge {
110                        knowledge.insert(mz_expr::Id::Local(id.clone()), prior_knowledge);
111                    }
112                    Ok(body_knowledge)
113                }
114                MirRelationExpr::LetRec {
115                    ids,
116                    values,
117                    limits,
118                    body,
119                } => {
120                    // Set knowledge[i][j] = DatumKnowledge::bottom() for each
121                    // column j and CTE i. This corresponds to the normal
122                    // evaluation semantics where each recursive CTE is
123                    // initialized to the empty collection.
124                    for (id, value) in zip_eq(ids.iter(), values.iter()) {
125                        let id = mz_expr::Id::Local(id.clone());
126                        let knowledge_new = vec![DatumKnowledge::bottom(); value.arity()];
127                        let knowledge_old = knowledge.insert(id, knowledge_new);
128                        assert_none!(knowledge_old);
129                    }
130
131                    // Sum up the arity of all ids in the enclosing LetRec node.
132                    let let_rec_arity = ids.iter().fold(0, |acc, id| {
133                        let id = mz_expr::Id::Local(id.clone());
134                        acc + u64::cast_from(knowledge[&id].len())
135                    });
136
137                    // Sequentially union knowledge[i][j] with the result of
138                    // descending into a clone of values[i]. Repeat until one of
139                    // the following conditions is met:
140                    //
141                    // 1. The knowledge bindings have stabilized at a fixpoint.
142                    // 2. No fixpoint was found after `max_iterations`. If this
143                    //    is the case reset the knowledge vectors for all
144                    //    recursive CTEs to DatumKnowledge::top().
145                    // 3. We reach the user-specified recursion limit of any of the bindings.
146                    //    In this case, we also give up similarly to 2., because we don't want to
147                    //    complicate things with handling different limits per binding.
148                    let min_max_iter = LetRecLimit::min_max_iter(limits);
149                    let max_iterations = 100;
150                    let mut curr_iteration = 0;
151                    loop {
152                        // Check for conditions (2) and (3).
153                        if curr_iteration >= max_iterations
154                            || min_max_iter
155                                .map(|min_max_iter| curr_iteration >= min_max_iter)
156                                .unwrap_or(false)
157                        {
158                            if curr_iteration > 3 * let_rec_arity {
159                                soft_panic_or_log!(
160                                    "LetRec loop in ColumnKnowledge has not converged in 3 * |{}|",
161                                    let_rec_arity
162                                );
163                            }
164
165                            for (id, value) in zip_eq(ids.iter(), values.iter()) {
166                                let id = mz_expr::Id::Local(id.clone());
167                                let knowledge_new = vec![DatumKnowledge::top(); value.arity()];
168                                knowledge.insert(id, knowledge_new);
169                            }
170                            break;
171                        }
172
173                        // Check for condition (1).
174                        let mut change = false;
175                        for (id, mut value) in zip_eq(ids.iter(), values.iter().cloned()) {
176                            let id = mz_expr::Id::Local(id.clone());
177                            let value = &mut value;
178                            let next_knowledge = self.harvest(value, knowledge, knowledge_stack)?;
179                            let curr_knowledge = knowledge.get_mut(&id).unwrap();
180                            for (curr, next) in zip_eq(curr_knowledge.iter_mut(), next_knowledge) {
181                                let prev = curr.clone();
182                                curr.join_assign(&next);
183                                change |= prev != *curr;
184                            }
185                        }
186                        if !change {
187                            break;
188                        }
189
190                        curr_iteration += 1;
191                    }
192
193                    // Descend into the values with the inferred knowledge.
194                    for value in values.iter_mut() {
195                        self.harvest(value, knowledge, knowledge_stack)?;
196                    }
197
198                    // Descend and return the knowledge from the body.
199                    let body_knowledge = self.harvest(body, knowledge, knowledge_stack)?;
200
201                    // Remove shadowed bindings. This is good hygiene, as
202                    // otherwise with nested LetRec blocks the `loop { ... }`
203                    // above will carry inner LetRec IDs across outer LetRec
204                    // iterations. As a consequence, the "no shadowing"
205                    // assertion at the beginning of this block will fail at the
206                    // inner LetRec for the second outer LetRec iteration.
207                    for id in ids.iter() {
208                        let id = mz_expr::Id::Local(id.clone());
209                        knowledge.remove(&id);
210                    }
211
212                    Ok(body_knowledge)
213                }
214                MirRelationExpr::Project { input, outputs } => {
215                    let input_knowledge = self.harvest(input, knowledge, knowledge_stack)?;
216                    Ok(outputs
217                        .iter()
218                        .map(|i| input_knowledge[*i].clone())
219                        .collect())
220                }
221                MirRelationExpr::Map { input, scalars } => {
222                    let mut input_knowledge = self.harvest(input, knowledge, knowledge_stack)?;
223                    let mut column_types = input.typ().column_types;
224                    for scalar in scalars.iter_mut() {
225                        input_knowledge.push(optimize(
226                            scalar,
227                            &column_types,
228                            &input_knowledge[..],
229                            knowledge_stack,
230                        )?);
231                        column_types.push(scalar.typ(&column_types));
232                    }
233                    Ok(input_knowledge)
234                }
235                MirRelationExpr::FlatMap { input, func, exprs } => {
236                    let mut input_knowledge = self.harvest(input, knowledge, knowledge_stack)?;
237                    let input_typ = input.typ();
238                    for expr in exprs {
239                        optimize(
240                            expr,
241                            &input_typ.column_types,
242                            &input_knowledge[..],
243                            knowledge_stack,
244                        )?;
245                    }
246                    let func_typ = func.output_type();
247                    input_knowledge.extend(func_typ.column_types.iter().map(DatumKnowledge::from));
248                    Ok(input_knowledge)
249                }
250                MirRelationExpr::Filter { input, predicates } => {
251                    let mut input_knowledge = self.harvest(input, knowledge, knowledge_stack)?;
252                    let input_typ = input.typ();
253                    for predicate in predicates.iter_mut() {
254                        optimize(
255                            predicate,
256                            &input_typ.column_types,
257                            &input_knowledge[..],
258                            knowledge_stack,
259                        )?;
260                    }
261                    // If any predicate tests a column for equality, truth, or is_null, we learn stuff.
262                    for predicate in predicates.iter() {
263                        // Equality tests allow us to unify the column knowledge of each input.
264                        if let MirScalarExpr::CallBinary {
265                            func: mz_expr::BinaryFunc::Eq,
266                            expr1,
267                            expr2,
268                        } = predicate
269                        {
270                            // Collect knowledge about the inputs (for columns and literals).
271                            let mut knowledge = DatumKnowledge::top();
272                            if let MirScalarExpr::Column(c) = &**expr1 {
273                                knowledge.meet_assign(&input_knowledge[*c]);
274                            }
275                            if let MirScalarExpr::Column(c) = &**expr2 {
276                                knowledge.meet_assign(&input_knowledge[*c]);
277                            }
278
279                            // Absorb literal knowledge about columns.
280                            if let MirScalarExpr::Literal(..) = &**expr1 {
281                                knowledge.meet_assign(&DatumKnowledge::from(&**expr1));
282                            }
283                            if let MirScalarExpr::Literal(..) = &**expr2 {
284                                knowledge.meet_assign(&DatumKnowledge::from(&**expr2));
285                            }
286
287                            // Write back unified knowledge to each column.
288                            if let MirScalarExpr::Column(c) = &**expr1 {
289                                input_knowledge[*c].meet_assign(&knowledge);
290                            }
291                            if let MirScalarExpr::Column(c) = &**expr2 {
292                                input_knowledge[*c].meet_assign(&knowledge);
293                            }
294                        }
295                        if let MirScalarExpr::CallUnary {
296                            func: UnaryFunc::Not(func::Not),
297                            expr,
298                        } = predicate
299                        {
300                            if let MirScalarExpr::CallUnary {
301                                func: UnaryFunc::IsNull(func::IsNull),
302                                expr,
303                            } = &**expr
304                            {
305                                if let MirScalarExpr::Column(c) = &**expr {
306                                    input_knowledge[*c].meet_assign(&DatumKnowledge::any(false));
307                                }
308                            }
309                        }
310                    }
311
312                    Ok(input_knowledge)
313                }
314                MirRelationExpr::Join {
315                    inputs,
316                    equivalences,
317                    implementation,
318                    ..
319                } => {
320                    // Aggregate column knowledge from each input into one `Vec`.
321                    let mut knowledges = Vec::new();
322                    for input in inputs.iter_mut() {
323                        for mut knowledge in self.harvest(input, knowledge, knowledge_stack)? {
324                            // Do not propagate error literals beyond join inputs, since that may result
325                            // in them being propagated to other inputs of the join and evaluated when
326                            // they should not.
327                            if let DatumKnowledge::Lit { value: Err(_), .. } = knowledge {
328                                knowledge.join_assign(&DatumKnowledge::any(false));
329                            }
330                            knowledges.push(knowledge);
331                        }
332                    }
333
334                    // This only aggregates the column types of each input, not the
335                    // keys of the inputs. It is unnecessary to aggregate the keys
336                    // of the inputs since input keys are unnecessary for reducing
337                    // `MirScalarExpr`s.
338                    let folded_inputs_typ =
339                        inputs.iter().fold(RelationType::empty(), |mut typ, input| {
340                            typ.column_types.append(&mut input.typ().column_types);
341                            typ
342                        });
343
344                    for equivalence in equivalences.iter_mut() {
345                        let mut knowledge = DatumKnowledge::top();
346
347                        // We can produce composite knowledge for everything in the equivalence class.
348                        for expr in equivalence.iter_mut() {
349                            if !matches!(implementation, IndexedFilter(..)) {
350                                optimize(
351                                    expr,
352                                    &folded_inputs_typ.column_types,
353                                    &knowledges,
354                                    knowledge_stack,
355                                )?;
356                            }
357                            if let MirScalarExpr::Column(c) = expr {
358                                knowledge.meet_assign(&knowledges[*c]);
359                            }
360                            if let MirScalarExpr::Literal(..) = expr {
361                                knowledge.meet_assign(&DatumKnowledge::from(&*expr));
362                            }
363                        }
364                        for expr in equivalence.iter_mut() {
365                            if let MirScalarExpr::Column(c) = expr {
366                                knowledges[*c] = knowledge.clone();
367                            }
368                        }
369                    }
370
371                    Ok(knowledges)
372                }
373                MirRelationExpr::Reduce {
374                    input,
375                    group_key,
376                    aggregates,
377                    monotonic: _,
378                    expected_group_size: _,
379                } => {
380                    let input_knowledge = self.harvest(input, knowledge, knowledge_stack)?;
381                    let input_typ = input.typ();
382                    let mut output = group_key
383                        .iter_mut()
384                        .map(|k| {
385                            optimize(
386                                k,
387                                &input_typ.column_types,
388                                &input_knowledge[..],
389                                knowledge_stack,
390                            )
391                        })
392                        .collect::<Result<Vec<_>, _>>()?;
393                    for aggregate in aggregates.iter_mut() {
394                        use mz_expr::AggregateFunc;
395                        let knowledge = optimize(
396                            &mut aggregate.expr,
397                            &input_typ.column_types,
398                            &input_knowledge[..],
399                            knowledge_stack,
400                        )?;
401                        // This could be improved.
402                        let knowledge = match aggregate.func {
403                            AggregateFunc::MaxInt16
404                            | AggregateFunc::MaxInt32
405                            | AggregateFunc::MaxInt64
406                            | AggregateFunc::MaxUInt16
407                            | AggregateFunc::MaxUInt32
408                            | AggregateFunc::MaxUInt64
409                            | AggregateFunc::MaxMzTimestamp
410                            | AggregateFunc::MaxFloat32
411                            | AggregateFunc::MaxFloat64
412                            | AggregateFunc::MaxBool
413                            | AggregateFunc::MaxString
414                            | AggregateFunc::MaxDate
415                            | AggregateFunc::MaxTimestamp
416                            | AggregateFunc::MaxTimestampTz
417                            | AggregateFunc::MinInt16
418                            | AggregateFunc::MinInt32
419                            | AggregateFunc::MinInt64
420                            | AggregateFunc::MinUInt16
421                            | AggregateFunc::MinUInt32
422                            | AggregateFunc::MinUInt64
423                            | AggregateFunc::MinMzTimestamp
424                            | AggregateFunc::MinFloat32
425                            | AggregateFunc::MinFloat64
426                            | AggregateFunc::MinBool
427                            | AggregateFunc::MinString
428                            | AggregateFunc::MinDate
429                            | AggregateFunc::MinTimestamp
430                            | AggregateFunc::MinTimestampTz
431                            | AggregateFunc::Any
432                            | AggregateFunc::All => {
433                                // These methods propagate constant values exactly.
434                                knowledge
435                            }
436                            AggregateFunc::Count => DatumKnowledge::any(false),
437                            _ => {
438                                // The remaining aggregates are non-null if
439                                // their inputs are non-null. This is correct
440                                // because in Mir~ we reduce an empty collection
441                                // to an empty collection (in Hir~ the result
442                                // often is singleton null collection).
443                                DatumKnowledge::any(knowledge.nullable())
444                            }
445                        };
446                        output.push(knowledge);
447                    }
448                    Ok(output)
449                }
450                MirRelationExpr::TopK { input, limit, .. } => {
451                    let input_knowledge = self.harvest(input, knowledge, knowledge_stack)?;
452                    if let Some(limit) = limit.as_mut() {
453                        optimize(
454                            limit,
455                            &input.typ().column_types,
456                            &input_knowledge[..],
457                            knowledge_stack,
458                        )?;
459                    }
460                    Ok(input_knowledge)
461                }
462                MirRelationExpr::Negate { input } => {
463                    self.harvest(input, knowledge, knowledge_stack)
464                }
465                MirRelationExpr::Threshold { input } => {
466                    self.harvest(input, knowledge, knowledge_stack)
467                }
468                MirRelationExpr::Union { base, inputs } => {
469                    let mut know = self.harvest(base, knowledge, knowledge_stack)?;
470                    for input in inputs {
471                        know = know
472                            .into_iter()
473                            .zip_eq(self.harvest(input, knowledge, knowledge_stack)?)
474                            .map(|(mut k1, k2)| {
475                                k1.join_assign(&k2);
476                                k1
477                            })
478                            .collect();
479                    }
480                    Ok(know)
481                }
482            }?;
483
484            // println!("# Plan");
485            // println!("{}", expr.pretty());
486            // println!("# Knowledge");
487            // print_knowledge_vec(&result);
488            // println!("---");
489
490            Ok(result)
491        })
492    }
493}
494
495/// Information about a specific column.
496///
497/// The values should form a [complete lattice].
498///
499/// [complete lattice]: https://en.wikipedia.org/wiki/Complete_lattice
500#[derive(Clone, Debug, PartialEq, Eq)]
501enum DatumKnowledge {
502    // Any possible value, optionally known to be NOT NULL.
503    Any {
504        nullable: bool,
505    },
506    // A known literal value of a specific type.
507    Lit {
508        value: Result<mz_repr::Row, EvalError>,
509        typ: ScalarType,
510    },
511    // A value that cannot exist.
512    Nothing,
513}
514
515impl From<&MirScalarExpr> for DatumKnowledge {
516    fn from(expr: &MirScalarExpr) -> Self {
517        if let MirScalarExpr::Literal(l, t) = expr {
518            let value = l.clone();
519            let typ = t.scalar_type.clone();
520            Self::Lit { value, typ }
521        } else {
522            Self::top()
523        }
524    }
525}
526
527impl From<(Datum<'_>, &ColumnType)> for DatumKnowledge {
528    fn from((d, t): (Datum<'_>, &ColumnType)) -> Self {
529        let value = Ok(Row::pack_slice(&[d.clone()]));
530        let typ = t.scalar_type.clone();
531        Self::Lit { value, typ }
532    }
533}
534
535impl From<&ColumnType> for DatumKnowledge {
536    fn from(typ: &ColumnType) -> Self {
537        let nullable = typ.nullable;
538        Self::Any { nullable }
539    }
540}
541
542impl DatumKnowledge {
543    /// The most general possible knowledge (the top of the complete lattice).
544    fn top() -> Self {
545        Self::Any { nullable: true }
546    }
547
548    /// The strictest possible knowledge (the bottom of the complete lattice).
549    #[allow(dead_code)]
550    fn bottom() -> Self {
551        Self::Nothing
552    }
553
554    /// Create a [`DatumKnowledge::Any`] instance with the given nullable flag.
555    fn any(nullable: bool) -> Self {
556        DatumKnowledge::Any { nullable }
557    }
558
559    /// Unions (weakens) the possible states of a column.
560    fn join_assign(&mut self, other: &Self) {
561        use DatumKnowledge::*;
562
563        // Each of the following `if` statements handles the cases marked with
564        // `x` of the (self x other) cross product (depicted as a rows x cols
565        // table where Self::bottom() is the UL and Self::top() the LR corner).
566        // Cases that are already handled are marked with `+` and cases that are
567        // yet to be handled marked with `-`.
568        //
569        // The order of handling (crossing out) of cases ensures that
570        // `*.clone()` is not called unless necessary.
571        //
572        // The final case after the ifs can assert that both sides are Lit
573        // variants.
574
575        // x - - - : Nothing
576        // x - - - : Lit { _, _ }
577        // x - - - : Any { false }
578        // x x x x : Any { true }
579        if crate::any![
580            matches!(self, Any { nullable: true }),
581            matches!(other, Nothing),
582        ] {
583            // Nothing to do.
584        }
585        // + x x x : Nothing
586        // + - - x : Lit { _, _ }
587        // + - - x : Any { false }
588        // + + + + : Any { true }
589        else if crate::any![
590            matches!(self, Nothing),
591            matches!(other, Any { nullable: true }),
592        ] {
593            *self = other.clone();
594        }
595        // + + + + : Nothing
596        // + - - + : Lit { _, _ }
597        // + x x + : Any { false }
598        // + + + + : Any { true }
599        else if matches!(self, Any { nullable: false }) {
600            if !other.nullable() {
601                // Nothing to do.
602            } else {
603                *self = Self::top() // other: Lit { null, _ }
604            }
605            // Nothing to do.
606        }
607        // + + + + : Nothing
608        // + - x + : Lit { _, _ }
609        // + + + + : Any { false }
610        // + + + + : Any { true }
611        else if matches!(other, Any { nullable: false }) {
612            if !self.nullable() {
613                *self = other.clone();
614            } else {
615                *self = Self::top() // other: Lit { null, _ }
616            }
617        }
618        // + + + + : Nothing
619        // + x + + : Lit { _, _ }
620        // + + + + : Any { false }
621        // + + + + : Any { true }
622        else {
623            let Lit {
624                value: s_val,
625                typ: s_typ,
626            } = self
627            else {
628                unreachable!();
629            };
630            let Lit {
631                value: o_val,
632                typ: o_typ,
633            } = other
634            else {
635                unreachable!();
636            };
637
638            if !s_typ.base_eq(o_typ) {
639                ::tracing::error!("Undefined join of non-equal base types {s_typ:?} != {o_typ:?}");
640                *self = Self::top();
641            } else if s_val != o_val {
642                let nullable = self.nullable() || other.nullable();
643                *self = Any { nullable }
644            } else if s_typ != o_typ {
645                // Same value but different concrete types - strip all modifiers!
646                // This is identical to what ColumnType::union is doing.
647                *s_typ = s_typ.without_modifiers();
648            } else {
649                // Value and type coincide - do nothing!
650            }
651        }
652    }
653
654    /// Intersects (strengthens) the possible states of a column.
655    fn meet_assign(&mut self, other: &Self) {
656        use DatumKnowledge::*;
657
658        // Each of the following `if` statements handles the cases marked with
659        // `x` of the (self x other) cross product (depicted as a rows x cols
660        // table where Self::bottom() is the UL and Self::top() the LR corner).
661        // Cases that are already handled are marked with `+` and cases that are
662        // yet to be handled marked with `-`.
663        //
664        // The order of handling (crossing out) of cases ensures that
665        // `*.clone()` is not called unless necessary.
666        //
667        // The final case after the ifs can assert that both sides are Lit
668        // variants.
669
670        // x x x x : Nothing
671        // - - - x : Lit { _, _ }
672        // - - - x : Any { false }
673        // - - - x : Any { true }
674        if crate::any![
675            matches!(self, Nothing),
676            matches!(other, Any { nullable: true }),
677        ] {
678            // Nothing to do.
679        }
680        // + + + + : Nothing
681        // x - - + : Lit { _, _ }
682        // x - - + : Any { false }
683        // x x x + : Any { true }
684        else if crate::any![
685            matches!(self, Any { nullable: true }),
686            matches!(other, Nothing),
687        ] {
688            *self = other.clone();
689        }
690        // + + + + : Nothing
691        // + - - + : Lit { _, _ }
692        // + x x + : Any { false }
693        // + + + + : Any { true }
694        else if matches!(self, Any { nullable: false }) {
695            match other {
696                Any { .. } => {
697                    // Nothing to do.
698                }
699                Lit { .. } => {
700                    if other.nullable() {
701                        *self = Nothing; // other: Lit { null, _ }
702                    } else {
703                        *self = other.clone();
704                    }
705                }
706                Nothing => unreachable!(),
707            }
708        }
709        // + + + + : Nothing
710        // + - x + : Lit { _, _ }
711        // + + + + : Any { false }
712        // + + + + : Any { true }
713        else if matches!(other, Any { nullable: false }) {
714            if self.nullable() {
715                *self = Nothing // self: Lit { null, _ }
716            }
717        }
718        // + + + + : Nothing
719        // + x + + : Lit { _, _ }
720        // + + + + : Any { false }
721        // + + + + : Any { true }
722        else {
723            let Lit {
724                value: s_val,
725                typ: s_typ,
726            } = self
727            else {
728                unreachable!();
729            };
730            let Lit {
731                value: o_val,
732                typ: o_typ,
733            } = other
734            else {
735                unreachable!();
736            };
737
738            if !s_typ.base_eq(o_typ) {
739                soft_panic_or_log!("Undefined meet of non-equal base types {s_typ:?} != {o_typ:?}");
740                *self = Self::top(); // this really should be Nothing
741            } else if s_val != o_val {
742                *self = Nothing;
743            } else if s_typ != o_typ {
744                // Same value but different concrete types - strip all
745                // modifiers! We should probably pick the more specific of the
746                // two types if they are ordered or return Nothing otherwise.
747                *s_typ = s_typ.without_modifiers();
748            } else {
749                // Value and type coincide - do nothing!
750            }
751        }
752    }
753
754    fn nullable(&self) -> bool {
755        match self {
756            DatumKnowledge::Any { nullable } => *nullable,
757            DatumKnowledge::Lit { value, .. } => match value {
758                Ok(value) => value.iter().next().unwrap().is_null(),
759                Err(_) => false,
760            },
761            DatumKnowledge::Nothing => false,
762        }
763    }
764}
765
766/// Attempts to optimize
767///
768/// `knowledge_stack` is a pre-allocated vector but is expected not to contain any elements.
769fn optimize(
770    expr: &mut MirScalarExpr,
771    column_types: &[ColumnType],
772    column_knowledge: &[DatumKnowledge],
773    knowledge_stack: &mut Vec<DatumKnowledge>,
774) -> Result<DatumKnowledge, TransformError> {
775    // Storage for `DatumKnowledge` being propagated up through the
776    // `MirScalarExpr`. When a node is visited, pop off as many `DatumKnowledge`
777    // as the number of children the node has, and then push the
778    // `DatumKnowledge` corresponding to the node back onto the stack.
779    // Post-order traversal means that if a node has `n` children, the top `n`
780    // `DatumKnowledge` in the stack are the `DatumKnowledge` corresponding to
781    // the children.
782    assert!(knowledge_stack.is_empty());
783    #[allow(deprecated)]
784    expr.visit_mut_pre_post(
785        &mut |e| {
786            // We should not eagerly memoize `if` branches that might not be taken.
787            // TODO: Memoize expressions in the intersection of `then` and `els`.
788            if let MirScalarExpr::If { then, els, .. } = e {
789                Some(vec![then, els])
790            } else {
791                None
792            }
793        },
794        &mut |e| {
795            let result = match e {
796                MirScalarExpr::Column(index) => {
797                    let index = *index;
798                    if let DatumKnowledge::Lit { value, typ } = &column_knowledge[index] {
799                        let nullable = column_knowledge[index].nullable();
800                        *e = MirScalarExpr::Literal(value.clone(), typ.clone().nullable(nullable));
801                    }
802                    column_knowledge[index].clone()
803                }
804                MirScalarExpr::Literal(_, _) | MirScalarExpr::CallUnmaterializable(_) => {
805                    DatumKnowledge::from(&*e)
806                }
807                MirScalarExpr::CallUnary { func, expr: _ } => {
808                    let knowledge = knowledge_stack.pop().unwrap();
809                    if matches!(&knowledge, DatumKnowledge::Lit { .. }) {
810                        e.reduce(column_types);
811                    } else if func == &UnaryFunc::IsNull(func::IsNull) && !knowledge.nullable() {
812                        *e = MirScalarExpr::literal_false();
813                    };
814                    DatumKnowledge::from(&*e)
815                }
816                MirScalarExpr::CallBinary {
817                    func: _,
818                    expr1: _,
819                    expr2: _,
820                } => {
821                    let knowledge2 = knowledge_stack.pop().unwrap();
822                    let knowledge1 = knowledge_stack.pop().unwrap();
823                    if crate::any![
824                        matches!(knowledge1, DatumKnowledge::Lit { .. }),
825                        matches!(knowledge2, DatumKnowledge::Lit { .. }),
826                    ] {
827                        e.reduce(column_types);
828                    }
829                    DatumKnowledge::from(&*e)
830                }
831                MirScalarExpr::CallVariadic { func: _, exprs } => {
832                    // Drain the last `exprs.len()` knowledge, and reduce if any is `Lit`.
833                    assert!(knowledge_stack.len() >= exprs.len());
834                    if knowledge_stack
835                        .drain(knowledge_stack.len() - exprs.len()..)
836                        .any(|k| matches!(k, DatumKnowledge::Lit { .. }))
837                    {
838                        e.reduce(column_types);
839                    }
840                    DatumKnowledge::from(&*e)
841                }
842                MirScalarExpr::If {
843                    cond: _,
844                    then: _,
845                    els: _,
846                } => {
847                    // `cond` has been left un-optimized, as we should not remove the conditional
848                    // nature of the evaluation based on column knowledge: the resulting
849                    // expression could then move down past a filter or join that provided
850                    // the guarantees, and would become wrong.
851                    //
852                    // Instead, each of the branches have been optimized, and we
853                    // can union the states of their columns.
854                    let know2 = knowledge_stack.pop().unwrap();
855                    let mut know1 = knowledge_stack.pop().unwrap();
856                    know1.join_assign(&know2);
857                    know1
858                }
859            };
860            knowledge_stack.push(result);
861        },
862    )?;
863    let knowledge_datum = knowledge_stack.pop();
864    assert!(knowledge_stack.is_empty());
865    knowledge_datum.ok_or_else(|| {
866        TransformError::Internal(String::from("unexpectedly empty stack in optimize"))
867    })
868}
869
870#[allow(dead_code)] // keep debugging method around
871fn print_knowledge_map<'a>(
872    knowledge: &BTreeMap<mz_expr::Id, Vec<DatumKnowledge>>,
873    ids: impl Iterator<Item = &'a mz_expr::LocalId>,
874) {
875    for id in ids {
876        let id = mz_expr::Id::Local(id.clone());
877        for (i, k) in knowledge.get(&id).unwrap().iter().enumerate() {
878            println!("{id}.#{i}: {k:?}");
879        }
880    }
881    println!("");
882}
883
884#[allow(dead_code)] // keep debugging method around
885fn print_knowledge_vec(knowledge: &Vec<DatumKnowledge>) {
886    for (i, k) in knowledge.iter().enumerate() {
887        println!("#{i}: {k:?}");
888    }
889    println!("");
890}
mz_transform/column_knowledge.rs

mz_transform/
column_knowledge.rs