mz_transform/cse/anf.rs
1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10//! Identifies common relation subexpressions and places them behind `Let`
11//! bindings.
12//!
13//! All structurally equivalent expressions, defined recursively as having
14//! structurally equivalent inputs, and identical parameters, will be placed
15//! behind `Let` bindings. The resulting expressions likely have an excess of
16//! `Let` expressions, and therefore this transform is usually followed by a
17//! `NormalizeLets` application.
18
19use std::collections::BTreeMap;
20
21use mz_expr::visit::VisitChildren;
22use mz_expr::{AccessStrategy, Id, LocalId, MirRelationExpr, RECURSION_LIMIT};
23use mz_ore::id_gen::IdGen;
24use mz_ore::stack::{CheckedRecursion, RecursionGuard};
25
26/// Transform an MirRelationExpr into an administrative normal form (ANF).
27#[derive(Default, Debug)]
28pub struct ANF;
29
30use crate::TransformCtx;
31
32impl crate::Transform for ANF {
33 fn name(&self) -> &'static str {
34 "ANF"
35 }
36
37 #[mz_ore::instrument(
38 target = "optimizer",
39 level = "debug",
40 fields(path.segment = "anf")
41 )]
42 fn actually_perform_transform(
43 &self,
44 relation: &mut MirRelationExpr,
45 _ctx: &mut TransformCtx,
46 ) -> Result<(), crate::TransformError> {
47 let result = self.transform_without_trace(relation);
48 mz_repr::explain::trace_plan(&*relation);
49 result
50 }
51}
52
53impl ANF {
54 /// Performs the `NormalizeLets` transformation without tracing the result.
55 pub fn transform_without_trace(
56 &self,
57 relation: &mut MirRelationExpr,
58 ) -> Result<(), crate::TransformError> {
59 let mut bindings = Bindings::default();
60 bindings.insert_expression(&mut IdGen::default(), relation)?;
61 bindings.populate_expression(relation);
62 Ok(())
63 }
64}
65
66/// Maintains `Let` bindings in a compact, explicit representation.
67///
68/// The `bindings` map contains neither `Let` bindings nor two structurally
69/// equivalent expressions.
70///
71/// The bindings can be interpreted as an ordered sequence of let bindings,
72/// ordered by their identifier, that should be applied in order before the
73/// use of the expression from which they have been extracted.
74#[derive(Clone, Debug)]
75struct Bindings {
76 /// A list of let-bound expressions and their order / identifier.
77 bindings: BTreeMap<MirRelationExpr, u64>,
78 /// Mapping from conventional local `Get` identifiers to new ones.
79 rebindings: BTreeMap<LocalId, LocalId>,
80 // A guard for tracking the maximum depth of recursive tree traversal.
81 recursion_guard: RecursionGuard,
82}
83
84impl CheckedRecursion for Bindings {
85 fn recursion_guard(&self) -> &RecursionGuard {
86 &self.recursion_guard
87 }
88}
89
90impl Default for Bindings {
91 fn default() -> Bindings {
92 Bindings {
93 bindings: BTreeMap::new(),
94 rebindings: BTreeMap::new(),
95 recursion_guard: RecursionGuard::with_limit(RECURSION_LIMIT),
96 }
97 }
98}
99
100impl Bindings {
101 fn new(rebindings: BTreeMap<LocalId, LocalId>) -> Bindings {
102 Bindings {
103 rebindings,
104 ..Bindings::default()
105 }
106 }
107}
108
109impl Bindings {
110 /// Ensures `self` contains bindings for all of `relation`'s subexpressions, including itself,
111 /// and replaces `relation` with a reference to its corresponding identifier.
112 ///
113 /// The algorithm performs a post-order traversal of the expression tree, binding each distinct
114 /// expression to a new local identifier and replacing each expression with a reference to the
115 /// identifier for the expression. It maintains the invariant that `bindings` contains no `Let`
116 /// expressions, nor any two structurally identical expressions.
117 ///
118 /// `LetRec` expressions are treated differently, as their expressions cannot simply be bound to
119 /// `Let` expressions. Each `LetRec` expression clones the current bindings `self`, and then goes
120 /// through its bindings in order, extending `self` with terms discovered in order. Importantly,
121 /// when a bound term is first visited we re-introduce its identifier with a fresh identifier,
122 /// to ensure that preceding references to the term are not equated with subsequent references.
123 /// The `LetRec::body` is treated differently, as it is "outside" the scope, and should not rely
124 /// on expressions from within the scope, other than those that it has coming in to the analysis.
125 /// This is a limitation of our optimization pipeline at the moment, that it breaks if `body`
126 /// gains new references to terms within the `LetRec` bindings (for example: `JoinImplementation`
127 /// can break if `body` acquires a reference to an arranged term, as that arrangement is not
128 /// available outside the loop).
129 fn insert_expression(
130 &mut self,
131 id_gen: &mut IdGen,
132 relation: &mut MirRelationExpr,
133 ) -> Result<(), crate::TransformError> {
134 self.checked_recur_mut(|this| {
135 match relation {
136 MirRelationExpr::LetRec {
137 ids,
138 values,
139 body,
140 limits,
141 } => {
142 // Used for `zip_eq`.
143 use itertools::Itertools;
144
145 // Introduce a new copy of `self`, which will be specific to this scope.
146 // This makes expressions used in the outer scope available for re-use.
147 // We will discard `scoped_anf` once we have processed the `LetRec`.
148 let mut scoped_anf = this.clone();
149
150 // Used to distinguish new bindings from old bindings.
151 // This is needed to extract from `scoped_anf` only the bindings added
152 // in this block, and not those inherited from `self`.
153 let id_boundary = id_gen.allocate_id();
154
155 // Each identifier in `ids` will be given *two* new identifiers,
156 // initially one "before" and then once bound another one "after".
157 // The two identifiers are important to distinguish references to the
158 // binding "before" it is refreshed, and "after" it is refreshed.
159 // We can equate two "before" references and two "after" references,
160 // but we must not equate a "before" and an "after" reference.
161
162 // For each bound identifier from `ids`, a temporary identifier for the "before" version.
163 let before_ids = ids
164 .iter()
165 .map(|_id| LocalId::new(id_gen.allocate_id()))
166 .collect::<Vec<_>>();
167 let mut after_ids = Vec::new();
168
169 // Install the "before" rebindings to start.
170 // These rebindings will be used for each binding until we process the binding.
171 scoped_anf
172 .rebindings
173 .extend(ids.iter().zip(before_ids.iter()).map(|(x, y)| (*x, *y)));
174
175 // Convert each bound expression into a sequence of let bindings, which are appended
176 // to the sequence of let bindings from prior bound expressions.
177 // After visiting the expression, we'll update the binding for the `id` to its "after"
178 // identifier.
179 for (index, value) in values.iter_mut().enumerate() {
180 scoped_anf.insert_expression(id_gen, value)?;
181 // Update the binding for `ids[index]` from its "before" id to a new "after" id.
182 let new_id = id_gen.allocate_id();
183 after_ids.push(new_id);
184 scoped_anf
185 .rebindings
186 .insert(ids[index].clone(), LocalId::new(new_id));
187 }
188
189 // We handle `body` separately, as it is an error to rely on arrangements from within the `LetRec`.
190 // Ideally we wouldn't need that complexity here, but this is called on arrangement-laden expressions
191 // after join planning where we need to have locked in arrangements. Revisit if we correct that.
192 // TODO: this logic does not find expressions shared between `body` and `values` that could be hoisted
193 // out of the `LetRec`; for example terms that depend only on bindings from outside the `LetRec`.
194 let mut body_anf = Bindings::new(this.rebindings.clone());
195 for id in ids.iter() {
196 body_anf
197 .rebindings
198 .insert(*id, scoped_anf.rebindings[id].clone());
199 }
200 body_anf.insert_expression(id_gen, body)?;
201 body_anf.populate_expression(body);
202
203 // Collect the bindings that are new to this `LetRec` scope (delineated by `id_boundary`).
204 let mut bindings = scoped_anf
205 .bindings
206 .into_iter()
207 .filter(|(_e, i)| i > &id_boundary)
208 .map(|(e, i)| (i, e))
209 .collect::<Vec<_>>();
210 // Add bindings corresponding to `(ids, values)` using after identifiers.
211 bindings.extend(after_ids.iter().cloned().zip_eq(values.drain(..)));
212 bindings.sort();
213
214 // Before continuing, we should rewrite each "before" id to its corresponding "after" id.
215 let before_to_after: BTreeMap<_, _> = before_ids
216 .into_iter()
217 .zip_eq(after_ids)
218 .map(|(b, a)| (b, LocalId::new(a)))
219 .collect();
220 // Perform the rewrite of "before" ids into "after" ids.
221 for (_id, expr) in bindings.iter_mut() {
222 let mut todo = vec![&mut *expr];
223 while let Some(e) = todo.pop() {
224 if let MirRelationExpr::Get {
225 id: Id::Local(i), ..
226 } = e
227 {
228 if let Some(after) = before_to_after.get(i) {
229 i.clone_from(after);
230 }
231 }
232 todo.extend(e.children_mut());
233 }
234 }
235
236 // New ids and new values can be extracted from the bindings.
237 let (new_ids, new_values): (Vec<_>, Vec<_>) = bindings
238 .into_iter()
239 .map(|(id_int, value)| (LocalId::new(id_int), value))
240 .unzip();
241 // New limits will all be `None`, except for any pre-existing limits.
242 let mut new_limits: BTreeMap<LocalId, _> = BTreeMap::default();
243 for (id, limit) in ids.iter().zip_eq(limits.iter()) {
244 new_limits.insert(scoped_anf.rebindings[id], limit.clone());
245 }
246 for id in new_ids.iter() {
247 if !new_limits.contains_key(id) {
248 new_limits.insert(id.clone(), None);
249 }
250 }
251
252 *ids = new_ids;
253 *values = new_values;
254 *limits = new_limits.into_values().collect();
255 }
256 MirRelationExpr::Let { id, value, body } => {
257 this.insert_expression(id_gen, value)?;
258 let new_id = if let MirRelationExpr::Get {
259 id: Id::Local(x), ..
260 } = **value
261 {
262 x
263 } else {
264 panic!("Invariant violated")
265 };
266 this.rebindings.insert(*id, new_id);
267 this.insert_expression(id_gen, body)?;
268 let body = body.take_dangerous();
269 this.rebindings.remove(id);
270 *relation = body;
271 }
272 MirRelationExpr::Get { id, .. } => {
273 if let Id::Local(id) = id {
274 if let Some(rebound) = this.rebindings.get(id) {
275 *id = *rebound;
276 } else {
277 Err(crate::TransformError::Internal(format!(
278 "Identifier missing: {:?}",
279 id
280 )))?;
281 }
282 }
283 }
284 _ => {
285 // All other expressions just need to apply the logic recursively.
286 relation.try_visit_mut_children(|expr| this.insert_expression(id_gen, expr))?;
287 }
288 };
289
290 // This should be fast, as it depends directly on only `Get` expressions.
291 let typ = relation.typ();
292 // We want to maintain the invariant that `relation` ends up as a local `Get`.
293 if let MirRelationExpr::Get {
294 id: Id::Local(_), ..
295 } = relation
296 {
297 // Do nothing, as the expression is already a local `Get` expression.
298 } else {
299 // Either find an instance of `relation` or insert this one.
300 let id = this
301 .bindings
302 .entry(relation.take_dangerous())
303 .or_insert_with(|| id_gen.allocate_id());
304 *relation = MirRelationExpr::Get {
305 id: Id::Local(LocalId::new(*id)),
306 typ,
307 access_strategy: AccessStrategy::UnknownOrLocal,
308 }
309 }
310
311 Ok(())
312 })
313 }
314
315 /// Populates `expression` with necessary `Let` bindings.
316 ///
317 /// This population may result in substantially more `Let` bindings that one
318 /// might expect. It is very appropriate to run the `NormalizeLets` transformation
319 /// afterwards to remove `Let` bindings that it deems unhelpful.
320 fn populate_expression(self, expression: &mut MirRelationExpr) {
321 // Convert the bindings in to a sequence, by the local identifier.
322 let mut bindings = self.bindings.into_iter().collect::<Vec<_>>();
323 bindings.sort_by_key(|(_, i)| *i);
324
325 for (value, index) in bindings.into_iter().rev() {
326 let new_expression = MirRelationExpr::Let {
327 id: LocalId::new(index),
328 value: Box::new(value),
329 body: Box::new(expression.take_dangerous()),
330 };
331 *expression = new_expression;
332 }
333 }
334}