mz_sql/plan/scope.rs
1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10//! Handles SQL's scoping rules.
11//!
12//! A scope spans a single SQL `Query`. Nested subqueries create new scopes.
13//! Names are resolved against the innermost scope first.
14//! * If a match is found, it is returned.
15//! * If no matches are found, the name is resolved against the parent scope.
16//! * If multiple matches are found, the name is ambiguous and we return an
17//! error to the user.
18//!
19//! Matching rules:
20//! * `bar` will match any column in the scope named `bar`
21//! * `foo.bar` will match any column in the scope named `bar` that originated
22//! from a table named `foo`.
23//! * Table aliases such as `foo AS quux` replace the old table name.
24//! * Functions create unnamed columns, which can be named with columns aliases
25//! `(bar + 1) as more_bar`.
26//!
27//! Additionally, most databases fold some form of CSE into name resolution so
28//! that eg `SELECT sum(x) FROM foo GROUP BY sum(x)` would be treated something
29//! like `SELECT "sum(x)" FROM foo GROUP BY sum(x) AS "sum(x)"` rather than
30//! failing to resolve `x`. We handle this by including the underlying
31//! `sql_parser::ast::Expr` in cases where this is possible.
32//!
33//! Many SQL expressions do strange and arbitrary things to scopes. Rather than
34//! try to capture them all here, we just expose the internals of `Scope` and
35//! handle it in the appropriate place in `super::query`.
36//!
37//! NOTE(benesch): The above approach of exposing scope's internals to the
38//! entire planner has not aged well. SQL scopes are now full of undocumented
39//! assumptions and requirements, since various subcomponents of the planner
40//! shove data into scope items to communicate with subcomponents a mile away.
41//! I've tried to refactor this code several times to no avail. It works better
42//! than you might expect. But you have been warned. Tread carefully!
43
44use std::collections::BTreeSet;
45use std::iter;
46use std::sync::Arc;
47
48use mz_ore::iter::IteratorExt;
49use mz_repr::ColumnName;
50
51use crate::ast::Expr;
52use crate::names::{Aug, PartialItemName};
53use crate::plan::error::PlanError;
54use crate::plan::hir::ColumnRef;
55use crate::plan::plan_utils::JoinSide;
56use crate::plan::query::NameManager;
57
58#[derive(Debug, Clone)]
59pub struct ScopeItem {
60 /// The name of the table that produced this scope item, if any.
61 pub table_name: Option<PartialItemName>,
62 /// The name of the column.
63 pub column_name: ColumnName,
64 /// The expressions from which this scope item is derived. Used by `GROUP
65 /// BY` and window functions.
66 pub exprs: BTreeSet<Expr<Aug>>,
67 /// Whether the column is the return value of a function that produces only
68 /// a single column. This accounts for a strange PostgreSQL special case
69 /// around whole-row expansion.
70 pub from_single_column_function: bool,
71 /// Controls whether the column is only accessible via a table-qualified
72 /// reference. When false, the scope item is also excluded from `SELECT *`.
73 ///
74 /// This should be true for almost all scope items. It is set to false for
75 /// join columns in USING constraints. For example, in `t1 FULL JOIN t2
76 /// USING a`, `t1.a` and `t2.a` are still available by fully-qualified
77 /// reference, but a bare `a` refers to a new column whose value is
78 /// `coalesce(t1.a, t2.a)`. This is a big special case because normally
79 /// having three columns in scope named `a` would result in "ambiguous
80 /// column reference" errors.
81 pub allow_unqualified_references: bool,
82 /// If set, any attempt to reference this item will return the error
83 /// produced by this function.
84 ///
85 /// The function is provided with the table and column name in the
86 /// reference. It should return a `PlanError` describing why the reference
87 /// is invalid.
88 ///
89 /// This is useful for preventing access to certain columns in specific
90 /// contexts, like columns that are on the wrong side of a `LATERAL` join.
91 pub error_if_referenced: Option<fn(Option<&PartialItemName>, &ColumnName) -> PlanError>,
92 /// For table functions in scalar positions, this flag is true for the
93 /// ordinality column. If true, then this column represents an "exists" flag
94 /// for the entire row of the table function. In that case, this column must
95 /// be excluded from `*` expansion. If the corresponding datum is `NULL`, then
96 /// `*` expansion should yield a single `NULL` instead of a record with various
97 /// datums.
98 pub is_exists_column_for_a_table_function_that_was_in_the_target_list: bool,
99 // Force use of the constructor methods.
100 _private: (),
101}
102
103/// An ungrouped column in a scope.
104///
105/// We can't simply drop these items from scope. These items need to *exist*
106/// because they might shadow variables in outer scopes that would otherwise be
107/// valid to reference, but accessing them needs to produce an error.
108#[derive(Debug, Clone)]
109pub struct ScopeUngroupedColumn {
110 /// The name of the table that produced this ungrouped column, if any.
111 pub table_name: Option<PartialItemName>,
112 /// The name of the ungrouped column.
113 pub column_name: ColumnName,
114 /// Whether the original scope item allowed unqualified references.
115 pub allow_unqualified_references: bool,
116}
117
118#[derive(Debug, Clone)]
119pub struct Scope {
120 /// The items in this scope.
121 pub items: Vec<ScopeItem>,
122 /// The ungrouped columns in the scope.
123 pub ungrouped_columns: Vec<ScopeUngroupedColumn>,
124 // Whether this scope starts a new chain of lateral outer scopes.
125 //
126 // It's easiest to understand with an example. Consider this query:
127 //
128 // SELECT (SELECT * FROM tab1, tab2, (SELECT tab1.a, tab3.a)) FROM tab3, tab4
129 // Scope 1: ------------------------
130 // Scope 2: ----
131 // Scope 3: ----
132 // Scope 4: ----
133 // Scope 5: ----
134 //
135 // Note that the because the derived table is not marked `LATERAL`, its
136 // reference to `tab3.a` is valid but its reference to `tab1.a` is not.
137 //
138 // Scope 5 is the parent of scope 4, scope 4 is the parent of scope 3, and
139 // so on. The non-lateral derived table is not allowed to access scopes 2
140 // and 3, because they are part of the same lateral chain, but it *is*
141 // allowed to access scope 4 and 5. So, to capture this information, we set
142 // `lateral_barrier: true` for scope 4.
143 pub lateral_barrier: bool,
144}
145
146impl ScopeItem {
147 pub fn empty() -> ScopeItem {
148 ScopeItem {
149 table_name: None,
150 column_name: "?column?".into(),
151 exprs: BTreeSet::new(),
152 from_single_column_function: false,
153 allow_unqualified_references: true,
154 error_if_referenced: None,
155 is_exists_column_for_a_table_function_that_was_in_the_target_list: false,
156 _private: (),
157 }
158 }
159
160 /// Constructs a new scope item from an unqualified column name.
161 pub fn from_column_name<N>(column_name: N) -> ScopeItem
162 where
163 N: Into<ColumnName>,
164 {
165 ScopeItem::from_name(None, column_name.into())
166 }
167
168 /// Constructs a new scope item from a name.
169 pub fn from_name<N>(table_name: Option<PartialItemName>, column_name: N) -> ScopeItem
170 where
171 N: Into<ColumnName>,
172 {
173 let mut item = ScopeItem::empty();
174 item.table_name = table_name;
175 item.column_name = column_name.into();
176 item
177 }
178
179 /// Constructs a new scope item with no name from an expression.
180 pub fn from_expr(expr: impl Into<Option<Expr<Aug>>>) -> ScopeItem {
181 let mut item = ScopeItem::empty();
182 if let Some(expr) = expr.into() {
183 item.exprs.insert(expr);
184 }
185 item
186 }
187
188 pub fn is_from_table(&self, table_name: &PartialItemName) -> bool {
189 match &self.table_name {
190 None => false,
191 Some(n) => n.matches(table_name),
192 }
193 }
194}
195
196impl Scope {
197 pub fn empty() -> Self {
198 Scope {
199 items: vec![],
200 ungrouped_columns: vec![],
201 lateral_barrier: false,
202 }
203 }
204
205 pub fn from_source<I, N>(table_name: Option<PartialItemName>, column_names: I) -> Self
206 where
207 I: IntoIterator<Item = N>,
208 N: Into<ColumnName>,
209 {
210 let mut scope = Scope::empty();
211 scope.items = column_names
212 .into_iter()
213 .map(|column_name| ScopeItem::from_name(table_name.clone(), column_name.into()))
214 .collect();
215 scope
216 }
217
218 /// Constructs an iterator over the canonical name for each column.
219 pub fn column_names(&self) -> impl Iterator<Item = &ColumnName> {
220 self.items.iter().map(|item| &item.column_name)
221 }
222
223 pub fn len(&self) -> usize {
224 self.items.len()
225 }
226
227 /// Iterates over all items in the scope.
228 ///
229 /// Items are returned in order of preference, where the innermost scope has
230 /// the highest preference. For example, given scopes `A(B(C))`, items are
231 /// presented in the order `C`, `B`, `A`.
232 ///
233 /// Items are returned alongside the column reference that targets that item
234 /// and the item's "lateral level". The latter bears explaining. The lateral
235 /// level is the number of lateral barriers between this scope and the item.
236 /// See `Scope::lateral_barrier` for a diagram. Roughly speaking, items from
237 /// different levels but the same lateral level are items from different
238 /// joins in the same subquery, while items in different lateral levels are
239 /// items from different queries entirely. Rules about ambiguity apply
240 /// within an entire lateral level, not just within a single scope level.
241 ///
242 /// NOTE(benesch): Scope` really shows its weaknesses here. Ideally we'd
243 /// have separate types like `QueryScope` and `JoinScope` that more
244 /// naturally encode the concept of a "lateral level", or at least something
245 /// along those lines.
246 fn all_items<'a>(
247 &'a self,
248 outer_scopes: &'a [Scope],
249 ) -> impl Iterator<Item = ScopeCursor<'a>> + 'a {
250 let mut lat_level = 0;
251 iter::once(self)
252 .chain(outer_scopes)
253 .enumerate()
254 .flat_map(move |(level, scope)| {
255 if scope.lateral_barrier {
256 lat_level += 1;
257 }
258 let items = scope
259 .items
260 .iter()
261 .enumerate()
262 .map(move |(column, item)| ScopeCursor {
263 lat_level,
264 inner: ScopeCursorInner::Item {
265 column: ColumnRef { level, column },
266 item,
267 },
268 });
269 let ungrouped_columns = scope.ungrouped_columns.iter().map(move |uc| ScopeCursor {
270 lat_level,
271 inner: ScopeCursorInner::UngroupedColumn(uc),
272 });
273 items.chain(ungrouped_columns)
274 })
275 }
276
277 /// Returns all items from the given table name in the closest scope.
278 ///
279 /// If no tables with the given name are in scope, returns an empty
280 /// iterator.
281 ///
282 /// NOTE(benesch): This is wrong for zero-arity relations, because we can't
283 /// distinguish between "no such table" and a table that exists but has no
284 /// columns. The current design of scope makes this difficult to fix,
285 /// unfortunately.
286 pub fn items_from_table<'a>(
287 &'a self,
288 outer_scopes: &'a [Scope],
289 table: &PartialItemName,
290 ) -> Result<Vec<(ColumnRef, &'a ScopeItem)>, PlanError> {
291 let mut seen_level = None;
292 let items: Vec<_> = self
293 .all_items(outer_scopes)
294 .filter_map(move |c| match c.inner {
295 ScopeCursorInner::Item { column, item }
296 if item.is_from_table(table)
297 && *seen_level.get_or_insert(c.lat_level) == c.lat_level =>
298 {
299 Some((column, item))
300 }
301 _ => None,
302 })
303 .collect();
304 if !items.iter().map(|(column, _)| column.level).all_equal() {
305 return Err(PlanError::AmbiguousTable(table.clone()));
306 }
307 Ok(items)
308 }
309
310 /// Returns a matching [`ColumnRef`] and interned name, if one exists.
311 ///
312 /// Filters all visible items against the provided `matches` closure, and then matches this
313 /// filtered set against the provided `column_name`.
314 fn resolve_internal<'a, M>(
315 &'a self,
316 outer_scopes: &[Scope],
317 mut matches: M,
318 table_name: Option<&PartialItemName>,
319 column_name: &ColumnName,
320 name_manager: &mut NameManager,
321 ) -> Result<(ColumnRef, Arc<str>), PlanError>
322 where
323 M: FnMut(&ScopeCursor) -> bool,
324 {
325 let mut results = self
326 .all_items(outer_scopes)
327 .filter(|c| (matches)(c) && c.column_name() == column_name);
328 match results.next() {
329 None => {
330 let similar = self
331 .all_items(outer_scopes)
332 .filter(|c| (matches)(c))
333 .filter_map(|c| {
334 c.column_name()
335 .is_similar(column_name)
336 .then(|| c.column_name().clone())
337 })
338 .collect();
339 Err(PlanError::UnknownColumn {
340 table: table_name.cloned(),
341 column: column_name.clone(),
342 similar,
343 })
344 }
345 Some(c) => {
346 if let Some(ambiguous) = results.find(|c2| c.lat_level == c2.lat_level) {
347 if let Some(table_name) = table_name {
348 if let (
349 ScopeCursorInner::Item {
350 column: ColumnRef { level: c_level, .. },
351 ..
352 },
353 ScopeCursorInner::Item {
354 column:
355 ColumnRef {
356 level: ambiguous_level,
357 ..
358 },
359 ..
360 },
361 ) = (c.inner, ambiguous.inner)
362 {
363 // ColumnRefs with identical levels indicate multiple columns of the
364 // same name in relation. If the levels differ then it is instead two
365 // tables with the same name, both having a column with this name.
366 if c_level == ambiguous_level {
367 return Err(PlanError::AmbiguousColumn(column_name.clone()));
368 }
369 }
370 return Err(PlanError::AmbiguousTable(table_name.clone()));
371 } else {
372 return Err(PlanError::AmbiguousColumn(column_name.clone()));
373 }
374 }
375
376 match c.inner {
377 ScopeCursorInner::UngroupedColumn(uc) => Err(PlanError::UngroupedColumn {
378 table: uc.table_name.clone(),
379 column: uc.column_name.clone(),
380 }),
381 ScopeCursorInner::Item { column, item } => {
382 if let Some(error_if_referenced) = item.error_if_referenced {
383 return Err(error_if_referenced(table_name, column_name));
384 }
385 Ok((column, name_manager.intern_scope_item(item)))
386 }
387 }
388 }
389 }
390 }
391
392 /// Resolves references to a column name to a single column, or errors if
393 /// multiple columns are equally valid references.
394 pub fn resolve_column<'a>(
395 &'a self,
396 outer_scopes: &[Scope],
397 column_name: &ColumnName,
398 name_manager: &mut NameManager,
399 ) -> Result<(ColumnRef, Arc<str>), PlanError> {
400 let table_name = None;
401 self.resolve_internal(
402 outer_scopes,
403 |c| c.allow_unqualified_references(),
404 table_name,
405 column_name,
406 name_manager,
407 )
408 }
409
410 /// Resolves a column name in a `USING` clause.
411 pub fn resolve_using_column(
412 &self,
413 column_name: &ColumnName,
414 join_side: JoinSide,
415 name_manager: &mut NameManager,
416 ) -> Result<(ColumnRef, Arc<str>), PlanError> {
417 self.resolve_column(&[], column_name, name_manager)
418 .map_err(|e| match e {
419 // Attach a bit more context to unknown and ambiguous column
420 // errors to match PostgreSQL.
421 PlanError::AmbiguousColumn(column) => {
422 PlanError::AmbiguousColumnInUsingClause { column, join_side }
423 }
424 PlanError::UnknownColumn { column, .. } => {
425 PlanError::UnknownColumnInUsingClause { column, join_side }
426 }
427 _ => e,
428 })
429 }
430
431 /// Resolves a named column reference in a given scope.
432 /// Returns the interned name for tracking purposes.
433 pub fn resolve_table_column<'a>(
434 &'a self,
435 outer_scopes: &[Scope],
436 table_name: &PartialItemName,
437 column_name: &ColumnName,
438 name_manager: &mut NameManager,
439 ) -> Result<(ColumnRef, Arc<str>), PlanError> {
440 let mut seen_at_level = None;
441 self.resolve_internal(
442 outer_scopes,
443 |c| {
444 // Once we've matched a table name at a lateral level, even if
445 // the column name did not match, we can never match an item
446 // from another lateral level.
447 if let Some(seen_at_level) = seen_at_level {
448 if seen_at_level != c.lat_level {
449 return false;
450 }
451 }
452 if c.table_name().as_ref().map(|n| n.matches(table_name)) == Some(true) {
453 seen_at_level = Some(c.lat_level);
454 true
455 } else {
456 false
457 }
458 },
459 Some(table_name),
460 column_name,
461 name_manager,
462 )
463 }
464
465 /// Look to see if there is an already-calculated instance of this expr.
466 /// Failing to find one is not an error, so this just returns an Option.
467 ///
468 /// We do, however, return the `ScopeItem`, so that we can resolve and
469 /// intern the column name.
470 pub fn resolve_expr<'a>(&'a self, expr: &Expr<Aug>) -> Option<(ColumnRef, &'a ScopeItem)> {
471 // Literal values should not be treated as "cached" because their types
472 // in scope will have already been determined, but the type of the
473 // reoccurence of the expr might want to have a different type.
474 //
475 // This is most evident in the case of literal `NULL` values. The first
476 // occurrence is likely to be cast as `ScalarType::String`, but
477 // subsequent `NULL` values should be untyped.
478 if matches!(expr, Expr::Value(_)) {
479 return None;
480 }
481
482 self.items
483 .iter()
484 .enumerate()
485 .find(|(_, item)| item.exprs.contains(expr))
486 .map(|(i, item)| {
487 (
488 ColumnRef {
489 level: 0,
490 column: i,
491 },
492 item,
493 )
494 })
495 }
496
497 pub fn product(self, right: Self) -> Result<Self, PlanError> {
498 let mut l_tables = self.table_names().into_iter().collect::<Vec<_>>();
499 // Make ordering deterministic for testing
500 l_tables.sort_by(|l, r| l.item.cmp(&r.item));
501 let r_tables = right.table_names();
502 for l in l_tables {
503 for r in &r_tables {
504 if l.matches(r) {
505 sql_bail!("table name \"{}\" specified more than once", l.item)
506 }
507 }
508 }
509 Ok(Scope {
510 items: self.items.into_iter().chain(right.items).collect(),
511 ungrouped_columns: vec![],
512 lateral_barrier: false,
513 })
514 }
515
516 pub fn project(&self, columns: &[usize]) -> Self {
517 Scope {
518 items: columns.iter().map(|&i| self.items[i].clone()).collect(),
519 ungrouped_columns: vec![],
520 lateral_barrier: false,
521 }
522 }
523
524 pub fn table_names(&self) -> BTreeSet<&PartialItemName> {
525 self.items
526 .iter()
527 .filter_map(|name| name.table_name.as_ref())
528 .collect::<BTreeSet<&PartialItemName>>()
529 }
530}
531
532/// A pointer to a scope item or an ungrouped column along side its lateral
533/// level. Used internally while iterating.
534#[derive(Debug, Clone)]
535struct ScopeCursor<'a> {
536 lat_level: usize,
537 inner: ScopeCursorInner<'a>,
538}
539
540#[derive(Debug, Clone)]
541enum ScopeCursorInner<'a> {
542 Item {
543 column: ColumnRef,
544 item: &'a ScopeItem,
545 },
546 UngroupedColumn(&'a ScopeUngroupedColumn),
547}
548
549impl ScopeCursor<'_> {
550 fn table_name(&self) -> Option<&PartialItemName> {
551 match &self.inner {
552 ScopeCursorInner::Item { item, .. } => item.table_name.as_ref(),
553 ScopeCursorInner::UngroupedColumn(uc) => uc.table_name.as_ref(),
554 }
555 }
556
557 fn column_name(&self) -> &ColumnName {
558 match &self.inner {
559 ScopeCursorInner::Item { item, .. } => &item.column_name,
560 ScopeCursorInner::UngroupedColumn(uc) => &uc.column_name,
561 }
562 }
563
564 fn allow_unqualified_references(&self) -> bool {
565 match &self.inner {
566 ScopeCursorInner::Item { item, .. } => item.allow_unqualified_references,
567 ScopeCursorInner::UngroupedColumn(uc) => uc.allow_unqualified_references,
568 }
569 }
570}