mz_transform/cse/
anf.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10//! Identifies common relation subexpressions and places them behind `Let`
11//! bindings.
12//!
13//! All structurally equivalent expressions, defined recursively as having
14//! structurally equivalent inputs, and identical parameters, will be placed
15//! behind `Let` bindings. The resulting expressions likely have an excess of
16//! `Let` expressions, and therefore this transform is usually followed by a
17//! `NormalizeLets` application.
18
19use std::collections::BTreeMap;
20
21use mz_expr::visit::VisitChildren;
22use mz_expr::{AccessStrategy, Id, LocalId, MirRelationExpr, RECURSION_LIMIT};
23use mz_ore::id_gen::IdGen;
24use mz_ore::stack::{CheckedRecursion, RecursionGuard};
25
26/// Transform an MirRelationExpr into an administrative normal form (ANF).
27#[derive(Default, Debug)]
28pub struct ANF;
29
30use crate::TransformCtx;
31
32impl crate::Transform for ANF {
33    fn name(&self) -> &'static str {
34        "ANF"
35    }
36
37    #[mz_ore::instrument(
38        target = "optimizer",
39        level = "debug",
40        fields(path.segment = "anf")
41    )]
42    fn actually_perform_transform(
43        &self,
44        relation: &mut MirRelationExpr,
45        _ctx: &mut TransformCtx,
46    ) -> Result<(), crate::TransformError> {
47        let result = self.transform_without_trace(relation);
48        mz_repr::explain::trace_plan(&*relation);
49        result
50    }
51}
52
53impl ANF {
54    /// Performs the `NormalizeLets` transformation without tracing the result.
55    pub fn transform_without_trace(
56        &self,
57        relation: &mut MirRelationExpr,
58    ) -> Result<(), crate::TransformError> {
59        let mut bindings = Bindings::default();
60        bindings.insert_expression(&mut IdGen::default(), relation)?;
61        bindings.populate_expression(relation);
62        Ok(())
63    }
64}
65
66/// Maintains `Let` bindings in a compact, explicit representation.
67///
68/// The `bindings` map contains neither `Let` bindings nor two structurally
69/// equivalent expressions.
70///
71/// The bindings can be interpreted as an ordered sequence of let bindings,
72/// ordered by their identifier, that should be applied in order before the
73/// use of the expression from which they have been extracted.
74#[derive(Clone, Debug)]
75struct Bindings {
76    /// A list of let-bound expressions and their order / identifier.
77    bindings: BTreeMap<MirRelationExpr, u64>,
78    /// Mapping from conventional local `Get` identifiers to new ones.
79    rebindings: BTreeMap<LocalId, LocalId>,
80    // A guard for tracking the maximum depth of recursive tree traversal.
81    recursion_guard: RecursionGuard,
82}
83
84impl CheckedRecursion for Bindings {
85    fn recursion_guard(&self) -> &RecursionGuard {
86        &self.recursion_guard
87    }
88}
89
90impl Default for Bindings {
91    fn default() -> Bindings {
92        Bindings {
93            bindings: BTreeMap::new(),
94            rebindings: BTreeMap::new(),
95            recursion_guard: RecursionGuard::with_limit(RECURSION_LIMIT),
96        }
97    }
98}
99
100impl Bindings {
101    fn new(rebindings: BTreeMap<LocalId, LocalId>) -> Bindings {
102        Bindings {
103            rebindings,
104            ..Bindings::default()
105        }
106    }
107}
108
109impl Bindings {
110    /// Ensures `self` contains bindings for all of `relation`'s subexpressions, including itself,
111    /// and replaces `relation` with a reference to its corresponding identifier.
112    ///
113    /// The algorithm performs a post-order traversal of the expression tree, binding each distinct
114    /// expression to a new local identifier and replacing each expression with a reference to the
115    /// identifier for the expression. It maintains the invariant that `bindings` contains no `Let`
116    /// expressions, nor any two structurally identical expressions.
117    ///
118    /// `LetRec` expressions are treated differently, as their expressions cannot simply be bound to
119    /// `Let` expressions. Each `LetRec` expression clones the current bindings `self`, and then goes
120    /// through its bindings in order, extending `self` with terms discovered in order. Importantly,
121    /// when a bound term is first visited we re-introduce its identifier with a fresh identifier,
122    /// to ensure that preceding references to the term are not equated with subsequent references.
123    /// The `LetRec::body` is treated differently, as it is "outside" the scope, and should not rely
124    /// on expressions from within the scope, other than those that it has coming in to the analysis.
125    /// This is a limitation of our optimization pipeline at the moment, that it breaks if `body`
126    /// gains new references to terms within the `LetRec` bindings (for example: `JoinImplementation`
127    /// can break if `body` acquires a reference to an arranged term, as that arrangement is not
128    /// available outside the loop).
129    fn insert_expression(
130        &mut self,
131        id_gen: &mut IdGen,
132        relation: &mut MirRelationExpr,
133    ) -> Result<(), crate::TransformError> {
134        self.checked_recur_mut(|this| {
135            match relation {
136                MirRelationExpr::LetRec {
137                    ids,
138                    values,
139                    body,
140                    limits,
141                } => {
142                    // Used for `zip_eq`.
143                    use itertools::Itertools;
144
145                    // Introduce a new copy of `self`, which will be specific to this scope.
146                    // This makes expressions used in the outer scope available for re-use.
147                    // We will discard `scoped_anf` once we have processed the `LetRec`.
148                    let mut scoped_anf = this.clone();
149
150                    // Used to distinguish new bindings from old bindings.
151                    // This is needed to extract from `scoped_anf` only the bindings added
152                    // in this block, and not those inherited from `self`.
153                    let id_boundary = id_gen.allocate_id();
154
155                    // Each identifier in `ids` will be given *two* new identifiers,
156                    // initially one "before" and then once bound another one "after".
157                    // The two identifiers are important to distinguish references to the
158                    // binding "before" it is refreshed, and "after" it is refreshed.
159                    // We can equate two "before" references and two "after" references,
160                    // but we must not equate a "before" and an "after" reference.
161
162                    // For each bound identifier from `ids`, a temporary identifier for the "before" version.
163                    let before_ids = ids
164                        .iter()
165                        .map(|_id| LocalId::new(id_gen.allocate_id()))
166                        .collect::<Vec<_>>();
167                    let mut after_ids = Vec::new();
168
169                    // Install the "before" rebindings to start.
170                    // These rebindings will be used for each binding until we process the binding.
171                    scoped_anf
172                        .rebindings
173                        .extend(ids.iter().zip(before_ids.iter()).map(|(x, y)| (*x, *y)));
174
175                    // Convert each bound expression into a sequence of let bindings, which are appended
176                    // to the sequence of let bindings from prior bound expressions.
177                    // After visiting the expression, we'll update the binding for the `id` to its "after"
178                    // identifier.
179                    for (index, value) in values.iter_mut().enumerate() {
180                        scoped_anf.insert_expression(id_gen, value)?;
181                        // Update the binding for `ids[index]` from its "before" id to a new "after" id.
182                        let new_id = id_gen.allocate_id();
183                        after_ids.push(new_id);
184                        scoped_anf
185                            .rebindings
186                            .insert(ids[index].clone(), LocalId::new(new_id));
187                    }
188
189                    // We handle `body` separately, as it is an error to rely on arrangements from within the `LetRec`.
190                    // Ideally we wouldn't need that complexity here, but this is called on arrangement-laden expressions
191                    // after join planning where we need to have locked in arrangements. Revisit if we correct that.
192                    // TODO: this logic does not find expressions shared between `body` and `values` that could be hoisted
193                    // out of the `LetRec`; for example terms that depend only on bindings from outside the `LetRec`.
194                    let mut body_anf = Bindings::new(this.rebindings.clone());
195                    for id in ids.iter() {
196                        body_anf
197                            .rebindings
198                            .insert(*id, scoped_anf.rebindings[id].clone());
199                    }
200                    body_anf.insert_expression(id_gen, body)?;
201                    body_anf.populate_expression(body);
202
203                    // Collect the bindings that are new to this `LetRec` scope (delineated by `id_boundary`).
204                    let mut bindings = scoped_anf
205                        .bindings
206                        .into_iter()
207                        .filter(|(_e, i)| i > &id_boundary)
208                        .map(|(e, i)| (i, e))
209                        .collect::<Vec<_>>();
210                    // Add bindings corresponding to `(ids, values)` using after identifiers.
211                    bindings.extend(after_ids.iter().cloned().zip_eq(values.drain(..)));
212                    bindings.sort();
213
214                    // Before continuing, we should rewrite each "before" id to its corresponding "after" id.
215                    let before_to_after: BTreeMap<_, _> = before_ids
216                        .into_iter()
217                        .zip_eq(after_ids)
218                        .map(|(b, a)| (b, LocalId::new(a)))
219                        .collect();
220                    // Perform the rewrite of  "before" ids into "after" ids.
221                    for (_id, expr) in bindings.iter_mut() {
222                        let mut todo = vec![&mut *expr];
223                        while let Some(e) = todo.pop() {
224                            if let MirRelationExpr::Get {
225                                id: Id::Local(i), ..
226                            } = e
227                            {
228                                if let Some(after) = before_to_after.get(i) {
229                                    i.clone_from(after);
230                                }
231                            }
232                            todo.extend(e.children_mut());
233                        }
234                    }
235
236                    // New ids and new values can be extracted from the bindings.
237                    let (new_ids, new_values): (Vec<_>, Vec<_>) = bindings
238                        .into_iter()
239                        .map(|(id_int, value)| (LocalId::new(id_int), value))
240                        .unzip();
241                    // New limits will all be `None`, except for any pre-existing limits.
242                    let mut new_limits: BTreeMap<LocalId, _> = BTreeMap::default();
243                    for (id, limit) in ids.iter().zip_eq(limits.iter()) {
244                        new_limits.insert(scoped_anf.rebindings[id], limit.clone());
245                    }
246                    for id in new_ids.iter() {
247                        if !new_limits.contains_key(id) {
248                            new_limits.insert(id.clone(), None);
249                        }
250                    }
251
252                    *ids = new_ids;
253                    *values = new_values;
254                    *limits = new_limits.into_values().collect();
255                }
256                MirRelationExpr::Let { id, value, body } => {
257                    this.insert_expression(id_gen, value)?;
258                    let new_id = if let MirRelationExpr::Get {
259                        id: Id::Local(x), ..
260                    } = **value
261                    {
262                        x
263                    } else {
264                        panic!("Invariant violated")
265                    };
266                    this.rebindings.insert(*id, new_id);
267                    this.insert_expression(id_gen, body)?;
268                    let body = body.take_dangerous();
269                    this.rebindings.remove(id);
270                    *relation = body;
271                }
272                MirRelationExpr::Get { id, .. } => {
273                    if let Id::Local(id) = id {
274                        if let Some(rebound) = this.rebindings.get(id) {
275                            *id = *rebound;
276                        } else {
277                            Err(crate::TransformError::Internal(format!(
278                                "Identifier missing: {:?}",
279                                id
280                            )))?;
281                        }
282                    }
283                }
284                _ => {
285                    // All other expressions just need to apply the logic recursively.
286                    relation.try_visit_mut_children(|expr| this.insert_expression(id_gen, expr))?;
287                }
288            };
289
290            // This should be fast, as it depends directly on only `Get` expressions.
291            let typ = relation.typ();
292            // We want to maintain the invariant that `relation` ends up as a local `Get`.
293            if let MirRelationExpr::Get {
294                id: Id::Local(_), ..
295            } = relation
296            {
297                // Do nothing, as the expression is already a local `Get` expression.
298            } else {
299                // Either find an instance of `relation` or insert this one.
300                let id = this
301                    .bindings
302                    .entry(relation.take_dangerous())
303                    .or_insert_with(|| id_gen.allocate_id());
304                *relation = MirRelationExpr::Get {
305                    id: Id::Local(LocalId::new(*id)),
306                    typ,
307                    access_strategy: AccessStrategy::UnknownOrLocal,
308                }
309            }
310
311            Ok(())
312        })
313    }
314
315    /// Populates `expression` with necessary `Let` bindings.
316    ///
317    /// This population may result in substantially more `Let` bindings that one
318    /// might expect. It is very appropriate to run the `NormalizeLets` transformation
319    /// afterwards to remove `Let` bindings that it deems unhelpful.
320    fn populate_expression(self, expression: &mut MirRelationExpr) {
321        // Convert the bindings in to a sequence, by the local identifier.
322        let mut bindings = self.bindings.into_iter().collect::<Vec<_>>();
323        bindings.sort_by_key(|(_, i)| *i);
324
325        for (value, index) in bindings.into_iter().rev() {
326            let new_expression = MirRelationExpr::Let {
327                id: LocalId::new(index),
328                value: Box::new(value),
329                body: Box::new(expression.take_dangerous()),
330            };
331            *expression = new_expression;
332        }
333    }
334}