mz_sql/plan/scope.rs
1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10//! Handles SQL's scoping rules.
11//!
12//! A scope spans a single SQL `Query`. Nested subqueries create new scopes.
13//! Names are resolved against the innermost scope first.
14//! * If a match is found, it is returned.
15//! * If no matches are found, the name is resolved against the parent scope.
16//! * If multiple matches are found, the name is ambiguous and we return an
17//!   error to the user.
18//!
19//! Matching rules:
20//! * `bar` will match any column in the scope named `bar`
21//! * `foo.bar` will match any column in the scope named `bar` that originated
22//!    from a table named `foo`.
23//! * Table aliases such as `foo AS quux` replace the old table name.
24//! * Functions create unnamed columns, which can be named with columns aliases
25//!   `(bar + 1) as more_bar`.
26//!
27//! Additionally, most databases fold some form of CSE into name resolution so
28//! that eg `SELECT sum(x) FROM foo GROUP BY sum(x)` would be treated something
29//! like `SELECT "sum(x)" FROM foo GROUP BY sum(x) AS "sum(x)"` rather than
30//! failing to resolve `x`. We handle this by including the underlying
31//! `sql_parser::ast::Expr` in cases where this is possible.
32//!
33//! Many SQL expressions do strange and arbitrary things to scopes. Rather than
34//! try to capture them all here, we just expose the internals of `Scope` and
35//! handle it in the appropriate place in `super::query`.
36//!
37//! NOTE(benesch): The above approach of exposing scope's internals to the
38//! entire planner has not aged well. SQL scopes are now full of undocumented
39//! assumptions and requirements, since various subcomponents of the planner
40//! shove data into scope items to communicate with subcomponents a mile away.
41//! I've tried to refactor this code several times to no avail. It works better
42//! than you might expect. But you have been warned. Tread carefully!
43
44use std::collections::BTreeSet;
45use std::iter;
46use std::sync::Arc;
47
48use mz_ore::iter::IteratorExt;
49use mz_repr::{ColumnName, UNKNOWN_COLUMN_NAME};
50
51use crate::ast::Expr;
52use crate::names::{Aug, PartialItemName};
53use crate::plan::error::PlanError;
54use crate::plan::hir::ColumnRef;
55use crate::plan::plan_utils::JoinSide;
56use crate::plan::query::NameManager;
57
58#[derive(Debug, Clone)]
59pub struct ScopeItem {
60    /// The name of the table that produced this scope item, if any.
61    pub table_name: Option<PartialItemName>,
62    /// The name of the column.
63    pub column_name: ColumnName,
64    /// The expressions from which this scope item is derived. Used by `GROUP
65    /// BY` and window functions.
66    pub exprs: BTreeSet<Expr<Aug>>,
67    /// Whether the column is the return value of a function that produces only
68    /// a single column. This accounts for a strange PostgreSQL special case
69    /// around whole-row expansion.
70    pub from_single_column_function: bool,
71    /// Controls whether the column is only accessible via a table-qualified
72    /// reference. When false, the scope item is also excluded from `SELECT *`.
73    ///
74    /// This should be true for almost all scope items. It is set to false for
75    /// join columns in USING constraints. For example, in `t1 FULL JOIN t2
76    /// USING a`, `t1.a` and `t2.a` are still available by fully-qualified
77    /// reference, but a bare `a` refers to a new column whose value is
78    /// `coalesce(t1.a, t2.a)`. This is a big special case because normally
79    /// having three columns in scope named `a` would result in "ambiguous
80    /// column reference" errors.
81    pub allow_unqualified_references: bool,
82    /// If set, any attempt to reference this item will return the error
83    /// produced by this function.
84    ///
85    /// The function is provided with the table and column name in the
86    /// reference. It should return a `PlanError` describing why the reference
87    /// is invalid.
88    ///
89    /// This is useful for preventing access to certain columns in specific
90    /// contexts, like columns that are on the wrong side of a `LATERAL` join.
91    pub error_if_referenced: Option<fn(Option<&PartialItemName>, &ColumnName) -> PlanError>,
92    /// For table functions in scalar positions, this flag is true for the
93    /// ordinality column. If true, then this column represents an "exists" flag
94    /// for the entire row of the table function. In that case, this column must
95    /// be excluded from `*` expansion. If the corresponding datum is `NULL`, then
96    /// `*` expansion should yield a single `NULL` instead of a record with various
97    /// datums.
98    pub is_exists_column_for_a_table_function_that_was_in_the_target_list: bool,
99    // Force use of the constructor methods.
100    _private: (),
101}
102
103/// An ungrouped column in a scope.
104///
105/// We can't simply drop these items from scope. These items need to *exist*
106/// because they might shadow variables in outer scopes that would otherwise be
107/// valid to reference, but accessing them needs to produce an error.
108#[derive(Debug, Clone)]
109pub struct ScopeUngroupedColumn {
110    /// The name of the table that produced this ungrouped column, if any.
111    pub table_name: Option<PartialItemName>,
112    /// The name of the ungrouped column.
113    pub column_name: ColumnName,
114    /// Whether the original scope item allowed unqualified references.
115    pub allow_unqualified_references: bool,
116}
117
118#[derive(Debug, Clone)]
119pub struct Scope {
120    /// The items in this scope.
121    pub items: Vec<ScopeItem>,
122    /// The ungrouped columns in the scope.
123    pub ungrouped_columns: Vec<ScopeUngroupedColumn>,
124    // Whether this scope starts a new chain of lateral outer scopes.
125    //
126    // It's easiest to understand with an example. Consider this query:
127    //
128    //     SELECT (SELECT * FROM tab1, tab2, (SELECT tab1.a, tab3.a)) FROM tab3, tab4
129    //     Scope 1:                          ------------------------
130    //     Scope 2:                    ----
131    //     Scope 3:              ----
132    //     Scope 4:                                                              ----
133    //     Scope 5:                                                        ----
134    //
135    // Note that the because the derived table is not marked `LATERAL`, its
136    // reference to `tab3.a` is valid but its reference to `tab1.a` is not.
137    //
138    // Scope 5 is the parent of scope 4, scope 4 is the parent of scope 3, and
139    // so on. The non-lateral derived table is not allowed to access scopes 2
140    // and 3, because they are part of the same lateral chain, but it *is*
141    // allowed to access scope 4 and 5. So, to capture this information, we set
142    // `lateral_barrier: true` for scope 4.
143    pub lateral_barrier: bool,
144}
145
146impl ScopeItem {
147    pub fn empty() -> ScopeItem {
148        ScopeItem {
149            table_name: None,
150            column_name: UNKNOWN_COLUMN_NAME.into(),
151            exprs: BTreeSet::new(),
152            from_single_column_function: false,
153            allow_unqualified_references: true,
154            error_if_referenced: None,
155            is_exists_column_for_a_table_function_that_was_in_the_target_list: false,
156            _private: (),
157        }
158    }
159
160    /// Constructs a new scope item from an unqualified column name.
161    pub fn from_column_name<N>(column_name: N) -> ScopeItem
162    where
163        N: Into<ColumnName>,
164    {
165        ScopeItem::from_name(None, column_name.into())
166    }
167
168    /// Constructs a new scope item from a name.
169    pub fn from_name<N>(table_name: Option<PartialItemName>, column_name: N) -> ScopeItem
170    where
171        N: Into<ColumnName>,
172    {
173        let mut item = ScopeItem::empty();
174        item.table_name = table_name;
175        item.column_name = column_name.into();
176        item
177    }
178
179    /// Constructs a new scope item with no name from an expression.
180    pub fn from_expr(expr: impl Into<Option<Expr<Aug>>>) -> ScopeItem {
181        let mut item = ScopeItem::empty();
182        if let Some(expr) = expr.into() {
183            item.exprs.insert(expr);
184        }
185        item
186    }
187
188    pub fn is_from_table(&self, table_name: &PartialItemName) -> bool {
189        match &self.table_name {
190            None => false,
191            Some(n) => n.matches(table_name),
192        }
193    }
194}
195
196impl Scope {
197    pub fn empty() -> Self {
198        Scope {
199            items: vec![],
200            ungrouped_columns: vec![],
201            lateral_barrier: false,
202        }
203    }
204
205    pub fn from_source<I, N>(table_name: Option<PartialItemName>, column_names: I) -> Self
206    where
207        I: IntoIterator<Item = N>,
208        N: Into<ColumnName>,
209    {
210        let mut scope = Scope::empty();
211        scope.items = column_names
212            .into_iter()
213            .map(|column_name| ScopeItem::from_name(table_name.clone(), column_name.into()))
214            .collect();
215        scope
216    }
217
218    /// Constructs an iterator over the canonical name for each column.
219    pub fn column_names(&self) -> impl Iterator<Item = &ColumnName> {
220        self.items.iter().map(|item| &item.column_name)
221    }
222
223    pub fn len(&self) -> usize {
224        self.items.len()
225    }
226
227    /// Iterates over all items in the scope.
228    ///
229    /// Items are returned in order of preference, where the innermost scope has
230    /// the highest preference. For example, given scopes `A(B(C))`, items are
231    /// presented in the order `C`, `B`, `A`.
232    ///
233    /// Items are returned alongside the column reference that targets that item
234    /// and the item's "lateral level". The latter bears explaining. The lateral
235    /// level is the number of lateral barriers between this scope and the item.
236    /// See `Scope::lateral_barrier` for a diagram. Roughly speaking, items from
237    /// different levels but the same lateral level are items from different
238    /// joins in the same subquery, while items in different lateral levels are
239    /// items from different queries entirely. Rules about ambiguity apply
240    /// within an entire lateral level, not just within a single scope level.
241    ///
242    /// NOTE(benesch): Scope` really shows its weaknesses here. Ideally we'd
243    /// have separate types like `QueryScope` and `JoinScope` that more
244    /// naturally encode the concept of a "lateral level", or at least something
245    /// along those lines.
246    fn all_items<'a>(
247        &'a self,
248        outer_scopes: &'a [Scope],
249    ) -> impl Iterator<Item = ScopeCursor<'a>> + 'a {
250        let mut lat_level = 0;
251        iter::once(self)
252            .chain(outer_scopes)
253            .enumerate()
254            .flat_map(move |(level, scope)| {
255                if scope.lateral_barrier {
256                    lat_level += 1;
257                }
258                let items = scope
259                    .items
260                    .iter()
261                    .enumerate()
262                    .map(move |(column, item)| ScopeCursor {
263                        lat_level,
264                        inner: ScopeCursorInner::Item {
265                            column: ColumnRef { level, column },
266                            item,
267                        },
268                    });
269                let ungrouped_columns = scope.ungrouped_columns.iter().map(move |uc| ScopeCursor {
270                    lat_level,
271                    inner: ScopeCursorInner::UngroupedColumn(uc),
272                });
273                items.chain(ungrouped_columns)
274            })
275    }
276
277    /// Returns all items from the given table name in the closest scope.
278    ///
279    /// If no tables with the given name are in scope, returns an empty
280    /// iterator.
281    ///
282    /// NOTE(benesch): This is wrong for zero-arity relations, because we can't
283    /// distinguish between "no such table" and a table that exists but has no
284    /// columns. The current design of scope makes this difficult to fix,
285    /// unfortunately.
286    pub fn items_from_table<'a>(
287        &'a self,
288        outer_scopes: &'a [Scope],
289        table: &PartialItemName,
290    ) -> Result<Vec<(ColumnRef, &'a ScopeItem)>, PlanError> {
291        let mut seen_level = None;
292        let items: Vec<_> = self
293            .all_items(outer_scopes)
294            .filter_map(move |c| match c.inner {
295                ScopeCursorInner::Item { column, item }
296                    if item.is_from_table(table)
297                        && *seen_level.get_or_insert(c.lat_level) == c.lat_level =>
298                {
299                    Some((column, item))
300                }
301                _ => None,
302            })
303            .collect();
304        if !items.iter().map(|(column, _)| column.level).all_equal() {
305            return Err(PlanError::AmbiguousTable(table.clone()));
306        }
307        Ok(items)
308    }
309
310    /// Returns a matching [`ColumnRef`] and interned name, if one exists.
311    ///
312    /// Filters all visible items against the provided `matches` closure, and then matches this
313    /// filtered set against the provided `column_name`.
314    fn resolve_internal<'a, M>(
315        &'a self,
316        outer_scopes: &[Scope],
317        mut matches: M,
318        table_name: Option<&PartialItemName>,
319        column_name: &ColumnName,
320        name_manager: &mut NameManager,
321    ) -> Result<(ColumnRef, Arc<str>), PlanError>
322    where
323        M: FnMut(&ScopeCursor) -> bool,
324    {
325        let mut results = self
326            .all_items(outer_scopes)
327            .filter(|c| (matches)(c) && c.column_name() == column_name);
328        match results.next() {
329            None => {
330                let similar = self
331                    .all_items(outer_scopes)
332                    .filter(|c| (matches)(c))
333                    .filter_map(|c| {
334                        c.column_name()
335                            .is_similar(column_name)
336                            .then(|| c.column_name().clone())
337                    })
338                    .collect();
339                Err(PlanError::UnknownColumn {
340                    table: table_name.cloned(),
341                    column: column_name.clone(),
342                    similar,
343                })
344            }
345            Some(c) => {
346                if let Some(ambiguous) = results.find(|c2| c.lat_level == c2.lat_level) {
347                    if let Some(table_name) = table_name {
348                        if let (
349                            ScopeCursorInner::Item {
350                                column: ColumnRef { level: c_level, .. },
351                                ..
352                            },
353                            ScopeCursorInner::Item {
354                                column:
355                                    ColumnRef {
356                                        level: ambiguous_level,
357                                        ..
358                                    },
359                                ..
360                            },
361                        ) = (c.inner, ambiguous.inner)
362                        {
363                            // ColumnRefs with identical levels indicate multiple columns of the
364                            // same name in relation. If the levels differ then it is instead two
365                            // tables with the same name, both having a column with this name.
366                            if c_level == ambiguous_level {
367                                return Err(PlanError::AmbiguousColumn(column_name.clone()));
368                            }
369                        }
370                        return Err(PlanError::AmbiguousTable(table_name.clone()));
371                    } else {
372                        return Err(PlanError::AmbiguousColumn(column_name.clone()));
373                    }
374                }
375
376                match c.inner {
377                    ScopeCursorInner::UngroupedColumn(uc) => Err(PlanError::UngroupedColumn {
378                        table: uc.table_name.clone(),
379                        column: uc.column_name.clone(),
380                    }),
381                    ScopeCursorInner::Item { column, item } => {
382                        if let Some(error_if_referenced) = item.error_if_referenced {
383                            return Err(error_if_referenced(table_name, column_name));
384                        }
385                        Ok((column, name_manager.intern_scope_item(item)))
386                    }
387                }
388            }
389        }
390    }
391
392    /// Resolves references to a column name to a single column, or errors if
393    /// multiple columns are equally valid references.
394    pub fn resolve_column<'a>(
395        &'a self,
396        outer_scopes: &[Scope],
397        column_name: &ColumnName,
398        name_manager: &mut NameManager,
399    ) -> Result<(ColumnRef, Arc<str>), PlanError> {
400        let table_name = None;
401        self.resolve_internal(
402            outer_scopes,
403            |c| c.allow_unqualified_references(),
404            table_name,
405            column_name,
406            name_manager,
407        )
408    }
409
410    /// Resolves a column name in a `USING` clause.
411    pub fn resolve_using_column(
412        &self,
413        column_name: &ColumnName,
414        join_side: JoinSide,
415        name_manager: &mut NameManager,
416    ) -> Result<(ColumnRef, Arc<str>), PlanError> {
417        self.resolve_column(&[], column_name, name_manager)
418            .map_err(|e| match e {
419                // Attach a bit more context to unknown and ambiguous column
420                // errors to match PostgreSQL.
421                PlanError::AmbiguousColumn(column) => {
422                    PlanError::AmbiguousColumnInUsingClause { column, join_side }
423                }
424                PlanError::UnknownColumn { column, .. } => {
425                    PlanError::UnknownColumnInUsingClause { column, join_side }
426                }
427                _ => e,
428            })
429    }
430
431    /// Resolves a named column reference in a given scope.
432    /// Returns the interned name for tracking purposes.
433    pub fn resolve_table_column<'a>(
434        &'a self,
435        outer_scopes: &[Scope],
436        table_name: &PartialItemName,
437        column_name: &ColumnName,
438        name_manager: &mut NameManager,
439    ) -> Result<(ColumnRef, Arc<str>), PlanError> {
440        let mut seen_at_level = None;
441        self.resolve_internal(
442            outer_scopes,
443            |c| {
444                // Once we've matched a table name at a lateral level, even if
445                // the column name did not match, we can never match an item
446                // from another lateral level.
447                if let Some(seen_at_level) = seen_at_level {
448                    if seen_at_level != c.lat_level {
449                        return false;
450                    }
451                }
452                if c.table_name().as_ref().map(|n| n.matches(table_name)) == Some(true) {
453                    seen_at_level = Some(c.lat_level);
454                    true
455                } else {
456                    false
457                }
458            },
459            Some(table_name),
460            column_name,
461            name_manager,
462        )
463    }
464
465    /// Look to see if there is an already-calculated instance of this expr.
466    /// Failing to find one is not an error, so this just returns an Option.
467    ///
468    /// We do, however, return the `ScopeItem`, so that we can resolve and
469    /// intern the column name.
470    pub fn resolve_expr<'a>(&'a self, expr: &Expr<Aug>) -> Option<(ColumnRef, &'a ScopeItem)> {
471        // Literal values should not be treated as "cached" because their types
472        // in scope will have already been determined, but the type of the
473        // reoccurence of the expr might want to have a different type.
474        //
475        // This is most evident in the case of literal `NULL` values. The first
476        // occurrence is likely to be cast as `SqlScalarType::String`, but
477        // subsequent `NULL` values should be untyped.
478        if matches!(expr, Expr::Value(_)) {
479            return None;
480        }
481
482        self.items
483            .iter()
484            .enumerate()
485            .find(|(_, item)| item.exprs.contains(expr))
486            .map(|(i, item)| {
487                (
488                    ColumnRef {
489                        level: 0,
490                        column: i,
491                    },
492                    item,
493                )
494            })
495    }
496
497    pub fn product(self, right: Self) -> Result<Self, PlanError> {
498        let mut l_tables = self.table_names().into_iter().collect::<Vec<_>>();
499        // Make ordering deterministic for testing
500        l_tables.sort_by(|l, r| l.item.cmp(&r.item));
501        let r_tables = right.table_names();
502        for l in l_tables {
503            for r in &r_tables {
504                if l.matches(r) {
505                    sql_bail!("table name \"{}\" specified more than once", l.item)
506                }
507            }
508        }
509        Ok(Scope {
510            items: self.items.into_iter().chain(right.items).collect(),
511            ungrouped_columns: vec![],
512            lateral_barrier: false,
513        })
514    }
515
516    pub fn project(&self, columns: &[usize]) -> Self {
517        Scope {
518            items: columns.iter().map(|&i| self.items[i].clone()).collect(),
519            ungrouped_columns: vec![],
520            lateral_barrier: false,
521        }
522    }
523
524    pub fn table_names(&self) -> BTreeSet<&PartialItemName> {
525        self.items
526            .iter()
527            .filter_map(|name| name.table_name.as_ref())
528            .collect::<BTreeSet<&PartialItemName>>()
529    }
530}
531
532/// A pointer to a scope item or an ungrouped column along side its lateral
533/// level. Used internally while iterating.
534#[derive(Debug, Clone)]
535struct ScopeCursor<'a> {
536    lat_level: usize,
537    inner: ScopeCursorInner<'a>,
538}
539
540#[derive(Debug, Clone)]
541enum ScopeCursorInner<'a> {
542    Item {
543        column: ColumnRef,
544        item: &'a ScopeItem,
545    },
546    UngroupedColumn(&'a ScopeUngroupedColumn),
547}
548
549impl ScopeCursor<'_> {
550    fn table_name(&self) -> Option<&PartialItemName> {
551        match &self.inner {
552            ScopeCursorInner::Item { item, .. } => item.table_name.as_ref(),
553            ScopeCursorInner::UngroupedColumn(uc) => uc.table_name.as_ref(),
554        }
555    }
556
557    fn column_name(&self) -> &ColumnName {
558        match &self.inner {
559            ScopeCursorInner::Item { item, .. } => &item.column_name,
560            ScopeCursorInner::UngroupedColumn(uc) => &uc.column_name,
561        }
562    }
563
564    fn allow_unqualified_references(&self) -> bool {
565        match &self.inner {
566            ScopeCursorInner::Item { item, .. } => item.allow_unqualified_references,
567            ScopeCursorInner::UngroupedColumn(uc) => uc.allow_unqualified_references,
568        }
569    }
570}