Skip to main content

mz_adapter/
statement_logging.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use std::collections::BTreeSet;
11use std::sync::atomic::Ordering;
12use std::sync::{Arc, Mutex};
13
14use bytes::BytesMut;
15use mz_catalog::memory::objects::CatalogItem;
16use mz_controller_types::ClusterId;
17use mz_ore::cast::{CastFrom, CastInto};
18use mz_ore::now::{EpochMillis, NowFn, epoch_to_uuid_v7, to_datetime};
19use mz_ore::soft_panic_or_log;
20use mz_repr::adt::array::ArrayDimension;
21use mz_repr::adt::timestamp::TimestampLike;
22use mz_repr::{Datum, GlobalId, Row, RowIterator, RowPacker, Timestamp};
23use mz_sql::ast::display::AstDisplay;
24use mz_sql::ast::{AstInfo, Statement};
25use mz_sql::plan::Params;
26use mz_sql::session::metadata::SessionMetadata;
27use mz_sql::session::vars::SystemVars;
28use mz_sql_parser::ast::{StatementKind, statement_kind_label_value};
29use qcell::QCell;
30use rand::distr::{Bernoulli, Distribution};
31use sha2::{Digest, Sha256};
32use uuid::Uuid;
33
34use crate::catalog::CatalogState;
35use crate::session::{LifecycleTimestamps, Session, TransactionId};
36use crate::{AdapterError, CollectionIdBundle, ExecuteResponse};
37
38#[derive(Clone, Debug)]
39pub enum StatementLifecycleEvent {
40    ExecutionBegan,
41    OptimizationFinished,
42    StorageDependenciesFinished,
43    ComputeDependenciesFinished,
44    ExecutionFinished,
45}
46
47impl StatementLifecycleEvent {
48    pub fn as_str(&self) -> &str {
49        match self {
50            Self::ExecutionBegan => "execution-began",
51            Self::OptimizationFinished => "optimization-finished",
52            Self::StorageDependenciesFinished => "storage-dependencies-finished",
53            Self::ComputeDependenciesFinished => "compute-dependencies-finished",
54            Self::ExecutionFinished => "execution-finished",
55        }
56    }
57}
58
59/// Contains all the information necessary to generate the initial
60/// entry in `mz_statement_execution_history`. We need to keep this
61/// around in order to modify the entry later once the statement finishes executing.
62#[derive(Clone, Debug)]
63pub struct StatementBeganExecutionRecord {
64    pub id: Uuid,
65    pub prepared_statement_id: Uuid,
66    pub sample_rate: f64,
67    pub params: Vec<Option<String>>,
68    pub began_at: EpochMillis,
69    pub cluster_id: Option<ClusterId>,
70    pub cluster_name: Option<String>,
71    pub database_name: String,
72    pub search_path: Vec<String>,
73    pub application_name: String,
74    pub transaction_isolation: String,
75    pub execution_timestamp: Option<EpochMillis>,
76    pub transaction_id: TransactionId,
77    pub transient_index_id: Option<GlobalId>,
78    pub mz_version: String,
79}
80
81#[derive(Clone, Copy, Debug)]
82pub enum StatementExecutionStrategy {
83    /// The statement was executed by spinning up a dataflow.
84    Standard,
85    /// The statement was executed by reading from an existing
86    /// arrangement.
87    FastPath,
88    /// Experimental: The statement was executed by reading from an existing
89    /// persist collection.
90    PersistFastPath,
91    /// The statement was determined to be constant by
92    /// environmentd, and not sent to a cluster.
93    Constant,
94}
95
96impl StatementExecutionStrategy {
97    pub fn name(&self) -> &'static str {
98        match self {
99            Self::Standard => "standard",
100            Self::FastPath => "fast-path",
101            Self::PersistFastPath => "persist-fast-path",
102            Self::Constant => "constant",
103        }
104    }
105}
106
107#[derive(Clone, Debug)]
108pub enum StatementEndedExecutionReason {
109    Success {
110        result_size: Option<u64>,
111        rows_returned: Option<u64>,
112        execution_strategy: Option<StatementExecutionStrategy>,
113    },
114    Canceled,
115    Errored {
116        error: String,
117    },
118    /// Should only be emitted by `impl Drop for ExecuteContextGuard`.
119    /// Code paths that explicitly complete execution should use
120    /// `retire_execute` with `Success`, `Canceled`, or `Errored`.
121    Aborted,
122}
123
124#[derive(Clone, Debug)]
125pub struct StatementEndedExecutionRecord {
126    pub id: Uuid,
127    pub reason: StatementEndedExecutionReason,
128    pub ended_at: EpochMillis,
129}
130
131/// Contains all the information necessary to generate an entry in
132/// `mz_prepared_statement_history`
133#[derive(Clone, Debug)]
134pub(crate) struct StatementPreparedRecord {
135    pub id: Uuid,
136    pub sql_hash: [u8; 32],
137    pub name: String,
138    pub session_id: Uuid,
139    pub prepared_at: EpochMillis,
140    pub kind: Option<StatementKind>,
141}
142
143#[derive(Clone, Debug)]
144pub(crate) struct SessionHistoryEvent {
145    pub id: Uuid,
146    pub connected_at: EpochMillis,
147    pub application_name: String,
148    pub authenticated_user: String,
149}
150
151impl From<&Result<ExecuteResponse, AdapterError>> for StatementEndedExecutionReason {
152    fn from(value: &Result<ExecuteResponse, AdapterError>) -> StatementEndedExecutionReason {
153        match value {
154            Ok(resp) => resp.into(),
155            Err(e) => StatementEndedExecutionReason::Errored {
156                error: e.to_string(),
157            },
158        }
159    }
160}
161
162impl From<&ExecuteResponse> for StatementEndedExecutionReason {
163    fn from(value: &ExecuteResponse) -> StatementEndedExecutionReason {
164        match value {
165            ExecuteResponse::CopyTo { resp, .. } => match resp.as_ref() {
166                // NB [btv]: It's not clear that this combination
167                // can ever actually happen.
168                ExecuteResponse::SendingRowsImmediate { rows, .. } => {
169                    // Note(parkmycar): It potentially feels bad here to iterate over the entire
170                    // iterator _just_ to get the encoded result size. As noted above, it's not
171                    // entirely clear this case ever happens, so the simplicity is worth it.
172                    let result_size: usize = rows.box_clone().map(|row| row.byte_len()).sum();
173                    StatementEndedExecutionReason::Success {
174                        result_size: Some(u64::cast_from(result_size)),
175                        rows_returned: Some(u64::cast_from(rows.count())),
176                        execution_strategy: Some(StatementExecutionStrategy::Constant),
177                    }
178                }
179                ExecuteResponse::SendingRowsStreaming { .. } => {
180                    panic!("SELECTs terminate on peek finalization, not here.")
181                }
182                ExecuteResponse::Subscribing { .. } => {
183                    panic!("SUBSCRIBEs terminate in the protocol layer, not here.")
184                }
185                _ => panic!("Invalid COPY response type"),
186            },
187            ExecuteResponse::CopyFrom { .. } => {
188                panic!("COPY FROMs terminate in the protocol layer, not here.")
189            }
190            ExecuteResponse::Fetch { .. } => {
191                panic!("FETCHes terminate after a follow-up message is sent.")
192            }
193            ExecuteResponse::SendingRowsStreaming { .. } => {
194                panic!("SELECTs terminate on peek finalization, not here.")
195            }
196            ExecuteResponse::Subscribing { .. } => {
197                panic!("SUBSCRIBEs terminate in the protocol layer, not here.")
198            }
199
200            ExecuteResponse::SendingRowsImmediate { rows, .. } => {
201                // Note(parkmycar): It potentially feels bad here to iterate over the entire
202                // iterator _just_ to get the encoded result size, the number of Rows returned here
203                // shouldn't be too large though. An alternative is to pre-compute some of the
204                // result size, but that would require always decoding Rows to handle projecting
205                // away columns, which has a negative impact for much larger response sizes.
206                let result_size: usize = rows.box_clone().map(|row| row.byte_len()).sum();
207                StatementEndedExecutionReason::Success {
208                    result_size: Some(u64::cast_from(result_size)),
209                    rows_returned: Some(u64::cast_from(rows.count())),
210                    execution_strategy: Some(StatementExecutionStrategy::Constant),
211                }
212            }
213
214            ExecuteResponse::AlteredDefaultPrivileges
215            | ExecuteResponse::AlteredObject(_)
216            | ExecuteResponse::AlteredRole
217            | ExecuteResponse::AlteredSystemConfiguration
218            | ExecuteResponse::ClosedCursor
219            | ExecuteResponse::Comment
220            | ExecuteResponse::Copied(_)
221            | ExecuteResponse::CreatedConnection
222            | ExecuteResponse::CreatedDatabase
223            | ExecuteResponse::CreatedSchema
224            | ExecuteResponse::CreatedRole
225            | ExecuteResponse::CreatedCluster
226            | ExecuteResponse::CreatedClusterReplica
227            | ExecuteResponse::CreatedIndex
228            | ExecuteResponse::CreatedIntrospectionSubscribe
229            | ExecuteResponse::CreatedSecret
230            | ExecuteResponse::CreatedSink
231            | ExecuteResponse::CreatedSource
232            | ExecuteResponse::CreatedTable
233            | ExecuteResponse::CreatedView
234            | ExecuteResponse::CreatedViews
235            | ExecuteResponse::CreatedMaterializedView
236            | ExecuteResponse::CreatedType
237            | ExecuteResponse::CreatedNetworkPolicy
238            | ExecuteResponse::Deallocate { .. }
239            | ExecuteResponse::DeclaredCursor
240            | ExecuteResponse::Deleted(_)
241            | ExecuteResponse::DiscardedTemp
242            | ExecuteResponse::DiscardedAll
243            | ExecuteResponse::DroppedObject(_)
244            | ExecuteResponse::DroppedOwned
245            | ExecuteResponse::EmptyQuery
246            | ExecuteResponse::GrantedPrivilege
247            | ExecuteResponse::GrantedRole
248            | ExecuteResponse::Inserted(_)
249            | ExecuteResponse::Prepare
250            | ExecuteResponse::Raised
251            | ExecuteResponse::ReassignOwned
252            | ExecuteResponse::RevokedPrivilege
253            | ExecuteResponse::RevokedRole
254            | ExecuteResponse::SetVariable { .. }
255            | ExecuteResponse::StartedTransaction
256            | ExecuteResponse::TransactionCommitted { .. }
257            | ExecuteResponse::TransactionRolledBack { .. }
258            | ExecuteResponse::Updated(_)
259            | ExecuteResponse::ValidatedConnection { .. } => {
260                StatementEndedExecutionReason::Success {
261                    result_size: None,
262                    rows_returned: None,
263                    execution_strategy: None,
264                }
265            }
266        }
267    }
268}
269
270mod sealed {
271    /// A struct that is purposefully private so folks are forced to use the constructor of an
272    /// enum.
273    #[derive(Debug, Copy, Clone)]
274    pub struct Private;
275}
276
277/// Metadata required for logging a prepared statement.
278#[derive(Debug)]
279pub enum PreparedStatementLoggingInfo {
280    /// The statement has already been logged; we don't need to log it
281    /// again if a future execution hits the sampling rate; we merely
282    /// need to reference the corresponding UUID.
283    AlreadyLogged { uuid: Uuid },
284    /// The statement has not yet been logged; if a future execution
285    /// hits the sampling rate, we need to log it at that point.
286    StillToLog {
287        /// The SQL text of the statement.
288        sql: String,
289        /// The SQL text of the statement, redacted to follow our data management
290        /// policy
291        redacted_sql: String,
292        /// When the statement was prepared
293        prepared_at: EpochMillis,
294        /// The name with which the statement was prepared
295        name: String,
296        /// The ID of the session that prepared the statement
297        session_id: Uuid,
298        /// Whether we have already recorded this in the "would have logged" metric
299        accounted: bool,
300        /// The top-level kind of the statement (e.g., `Select`), or `None` for an empty statement
301        kind: Option<StatementKind>,
302
303        /// Private type that forces use of the [`PreparedStatementLoggingInfo::still_to_log`]
304        /// constructor.
305        _sealed: sealed::Private,
306    },
307}
308
309impl PreparedStatementLoggingInfo {
310    /// Constructor for the [`PreparedStatementLoggingInfo::StillToLog`] variant that ensures SQL
311    /// statements are properly redacted.
312    pub fn still_to_log<A: AstInfo>(
313        raw_sql: String,
314        stmt: Option<&Statement<A>>,
315        prepared_at: EpochMillis,
316        name: String,
317        session_id: Uuid,
318        accounted: bool,
319    ) -> Self {
320        let kind = stmt.map(StatementKind::from);
321        let sql = match kind {
322            // Always redact SQL statements that may contain sensitive information.
323            // CREATE SECRET and ALTER SECRET statements can contain secret values, so we redact them.
324            // INSERT, UPDATE, and EXECUTE statements can include large amounts of user data, so we redact them for both
325            // data privacy and to avoid logging excessive data.
326            Some(
327                StatementKind::CreateSecret
328                | StatementKind::AlterSecret
329                | StatementKind::Insert
330                | StatementKind::Update
331                | StatementKind::Execute,
332            ) => stmt.map(|s| s.to_ast_string_redacted()).unwrap_or_default(),
333            _ => raw_sql,
334        };
335
336        PreparedStatementLoggingInfo::StillToLog {
337            sql,
338            redacted_sql: stmt.map(|s| s.to_ast_string_redacted()).unwrap_or_default(),
339            prepared_at,
340            name,
341            session_id,
342            accounted,
343            kind,
344            _sealed: sealed::Private,
345        }
346    }
347}
348
349#[derive(Copy, Clone, Debug, Ord, Eq, PartialOrd, PartialEq)]
350pub struct StatementLoggingId(pub Uuid);
351
352/// Rows to be written to `mz_prepared_statement_history` and `mz_sql_text`, with the session id.
353#[derive(Debug, Clone)]
354pub struct PreparedStatementEvent {
355    pub prepared_statement: Row,
356    pub sql_text: Row,
357    pub session_id: Uuid,
358}
359
360/// Throttling state for statement logging, shared across multiple frontend tasks (and currently
361/// also shared with the old peek sequencing).
362#[derive(Debug)]
363pub struct ThrottlingState {
364    /// Inner state protected by a mutex for rate-limiting, because the two inner fields have to be
365    /// manipulated together atomically.
366    /// This mutex is locked once per unsampled query. (There is both sampling and throttling.
367    /// Sampling happens before throttling.) This should be ok for now: Our QPS will not be more
368    /// than 10000s for now, and a mutex should be able to do 100000s of lockings per second, even
369    /// with some contention. If this ever becomes an issue, then we could redesign throttling to be
370    /// per-session/per-tokio-worker-thread.
371    inner: Mutex<ThrottlingStateInner>,
372    /// The number of statements that have been throttled since the last successfully logged
373    /// statement. This is not needed for the throttling decision itself, so it can be a separate
374    /// atomic to allow reading/writing without acquiring the inner mutex.
375    throttled_count: std::sync::atomic::AtomicUsize,
376}
377
378#[derive(Debug)]
379struct ThrottlingStateInner {
380    /// The number of bytes that we are allowed to emit for statement logging without being throttled.
381    /// Increases at a rate of [`mz_sql::session::vars::STATEMENT_LOGGING_TARGET_DATA_RATE`] per second,
382    /// up to a max value of [`mz_sql::session::vars::STATEMENT_LOGGING_MAX_DATA_CREDIT`].
383    tokens: u64,
384    /// The last time at which a statement was logged.
385    last_logged_ts_seconds: u64,
386}
387
388impl ThrottlingState {
389    /// Create a new throttling state.
390    pub fn new(now: &NowFn) -> Self {
391        Self {
392            inner: Mutex::new(ThrottlingStateInner {
393                tokens: 0,
394                last_logged_ts_seconds: now() / 1000,
395            }),
396            throttled_count: std::sync::atomic::AtomicUsize::new(0),
397        }
398    }
399
400    /// Check if we need to drop a statement due to throttling, and update the number of available
401    /// tokens appropriately.
402    ///
403    /// Returns `false` if we must throttle this statement, and `true` otherwise.
404    /// Note: `throttled_count` is NOT modified by this method - callers are responsible
405    /// for incrementing it on throttle failure and resetting it when appropriate.
406    pub fn throttling_check(
407        &self,
408        cost: u64,
409        target_data_rate: u64,
410        max_data_credit: Option<u64>,
411        now: &NowFn,
412    ) -> bool {
413        let ts = now() / 1000;
414        let mut inner = self.inner.lock().expect("throttling state lock poisoned");
415        // We use saturating_sub here because system time isn't monotonic, causing cases
416        // when last_logged_ts_seconds is greater than ts.
417        let elapsed = ts.saturating_sub(inner.last_logged_ts_seconds);
418        inner.last_logged_ts_seconds = ts;
419        inner.tokens = inner
420            .tokens
421            .saturating_add(target_data_rate.saturating_mul(elapsed));
422        if let Some(max_data_credit) = max_data_credit {
423            inner.tokens = inner.tokens.min(max_data_credit);
424        }
425        if let Some(remaining) = inner.tokens.checked_sub(cost) {
426            tracing::debug!("throttling check passed. tokens remaining: {remaining}; cost: {cost}");
427            inner.tokens = remaining;
428            true
429        } else {
430            tracing::debug!(
431                "throttling check failed. tokens available: {}; cost: {cost}",
432                inner.tokens
433            );
434            false
435        }
436    }
437
438    pub fn get_throttled_count(&self) -> usize {
439        self.throttled_count.load(Ordering::Relaxed)
440    }
441
442    pub fn increment_throttled_count(&self) {
443        self.throttled_count.fetch_add(1, Ordering::Relaxed);
444    }
445
446    pub fn reset_throttled_count(&self) {
447        self.throttled_count.store(0, Ordering::Relaxed);
448    }
449}
450
451/// Encapsulates statement logging state needed by the frontend peek sequencing.
452///
453/// This struct bundles together all the statement logging-related state that
454/// the frontend peek sequencing needs to perform statement logging independently
455/// of the Coordinator's main task.
456#[derive(Debug, Clone)]
457pub struct StatementLoggingFrontend {
458    /// Shared throttling state for rate-limiting statement logging.
459    pub throttling_state: Arc<ThrottlingState>,
460    /// Reproducible RNG for statement sampling (only used in tests).
461    pub reproducible_rng: Arc<Mutex<rand_chacha::ChaCha8Rng>>,
462    /// Cached human version string from build info.
463    pub build_info_human_version: String,
464    /// Function to get current time for statement logging.
465    pub now: NowFn,
466}
467
468impl StatementLoggingFrontend {
469    /// Get prepared statement info for frontend peek sequencing.
470    ///
471    /// This function processes prepared statement logging info and builds the event rows.
472    /// It does NOT do throttling - that is handled externally by the caller in `begin_statement_execution`.
473    /// It DOES mutate the logging info to mark the statement as already logged.
474    ///
475    /// # Arguments
476    /// * `session` - The session executing the statement
477    /// * `logging` - Prepared statement logging info
478    ///
479    /// # Returns
480    /// A tuple containing:
481    /// - `Option<PreparedStatementEvent>`: If the prepared statement has not yet been logged,
482    ///   returns the packed rows for the prepared statement.
483    /// - `Uuid`: The UUID of the prepared statement.
484    fn get_prepared_statement_info(
485        &self,
486        session: &mut Session,
487        logging: &Arc<QCell<PreparedStatementLoggingInfo>>,
488    ) -> (Option<PreparedStatementEvent>, Uuid) {
489        let logging_ref = session.qcell_rw(&*logging);
490        let mut prepared_statement_event = None;
491
492        let ps_uuid = match logging_ref {
493            PreparedStatementLoggingInfo::AlreadyLogged { uuid } => *uuid,
494            PreparedStatementLoggingInfo::StillToLog {
495                sql,
496                redacted_sql,
497                prepared_at,
498                name,
499                session_id,
500                accounted,
501                kind,
502                _sealed: _,
503            } => {
504                assert!(
505                    *accounted,
506                    "accounting for logging should be done in `begin_statement_execution`"
507                );
508                let uuid = epoch_to_uuid_v7(prepared_at);
509                let sql = std::mem::take(sql);
510                let redacted_sql = std::mem::take(redacted_sql);
511                let sql_hash: [u8; 32] = Sha256::digest(sql.as_bytes()).into();
512
513                // Copy session_id before mutating logging_ref
514                let sid = *session_id;
515
516                let record = StatementPreparedRecord {
517                    id: uuid,
518                    sql_hash,
519                    name: std::mem::take(name),
520                    session_id: sid,
521                    prepared_at: *prepared_at,
522                    kind: *kind,
523                };
524
525                // `mz_prepared_statement_history`
526                let mut mpsh_row = Row::default();
527                let mut mpsh_packer = mpsh_row.packer();
528                pack_statement_prepared_update(&record, &mut mpsh_packer);
529
530                let sql_row = Row::pack([
531                    Datum::TimestampTz(
532                        to_datetime(*prepared_at)
533                            .truncate_day()
534                            .try_into()
535                            .expect("must fit"),
536                    ),
537                    Datum::Bytes(sql_hash.as_slice()),
538                    Datum::String(sql.as_str()),
539                    Datum::String(redacted_sql.as_str()),
540                ]);
541
542                // Read throttled_count from shared state
543                let throttled_count = self.throttling_state.get_throttled_count();
544
545                mpsh_packer.push(Datum::UInt64(CastFrom::cast_from(throttled_count)));
546
547                prepared_statement_event = Some(PreparedStatementEvent {
548                    prepared_statement: mpsh_row,
549                    sql_text: sql_row,
550                    session_id: sid,
551                });
552
553                *logging_ref = PreparedStatementLoggingInfo::AlreadyLogged { uuid };
554                uuid
555            }
556        };
557
558        (prepared_statement_event, ps_uuid)
559    }
560
561    /// Begin statement execution logging from the frontend. (Corresponds to
562    /// `Coordinator::begin_statement_execution`, which is used by the old peek sequencing.)
563    ///
564    /// This encapsulates all the statement logging setup:
565    /// - Retrieves system config values
566    /// - Performs sampling and throttling checks
567    /// - Creates statement logging records
568    /// - Attends to metrics.
569    ///
570    /// Returns None if the statement should not be logged (due to sampling or throttling), or the
571    /// info required to proceed with statement logging.
572    /// The `Row` is the pre-packed row for `mz_statement_execution_history`.
573    /// The `Option<PreparedStatementEvent>` is None when we have already logged the prepared
574    /// statement before, and this is just a subsequent execution.
575    pub fn begin_statement_execution(
576        &self,
577        session: &mut Session,
578        params: &Params,
579        logging: &Arc<QCell<PreparedStatementLoggingInfo>>,
580        system_config: &SystemVars,
581        lifecycle_timestamps: Option<LifecycleTimestamps>,
582    ) -> Option<(
583        StatementLoggingId,
584        StatementBeganExecutionRecord,
585        Row,
586        Option<PreparedStatementEvent>,
587    )> {
588        // Skip logging for internal users unless explicitly enabled
589        let enable_internal_statement_logging = system_config.enable_internal_statement_logging();
590        if session.user().is_internal() && !enable_internal_statement_logging {
591            return None;
592        }
593
594        let sample_rate = effective_sample_rate(session, system_config);
595
596        let use_reproducible_rng = system_config.statement_logging_use_reproducible_rng();
597        let target_data_rate: Option<u64> = system_config
598            .statement_logging_target_data_rate()
599            .map(|rate| rate.cast_into());
600        let max_data_credit: Option<u64> = system_config
601            .statement_logging_max_data_credit()
602            .map(|credit| credit.cast_into());
603
604        // Only lock the RNG when we actually need reproducible sampling (tests only)
605        let sample = if use_reproducible_rng {
606            let mut rng = self.reproducible_rng.lock().expect("rng lock poisoned");
607            should_sample_statement(sample_rate, Some(&mut *rng))
608        } else {
609            should_sample_statement(sample_rate, None)
610        };
611
612        let sampled_label = sample.then_some("true").unwrap_or("false");
613        session
614            .metrics()
615            .statement_logging_records(&[sampled_label])
616            .inc_by(1);
617
618        // Clone only the metrics needed below, before the mutable borrow of session.
619        let unsampled_bytes_metric = session
620            .metrics()
621            .statement_logging_unsampled_bytes()
622            .clone();
623        let actual_bytes_metric = session.metrics().statement_logging_actual_bytes().clone();
624
625        // Handle the accounted flag and record byte metrics
626        let is_new_prepared_statement = if let Some((sql, accounted)) =
627            match session.qcell_rw(logging) {
628                PreparedStatementLoggingInfo::AlreadyLogged { .. } => None,
629                PreparedStatementLoggingInfo::StillToLog { sql, accounted, .. } => {
630                    Some((sql, accounted))
631                }
632            } {
633            if !*accounted {
634                unsampled_bytes_metric.inc_by(u64::cast_from(sql.len()));
635                if sample {
636                    actual_bytes_metric.inc_by(u64::cast_from(sql.len()));
637                }
638                *accounted = true;
639            }
640            true
641        } else {
642            false
643        };
644
645        if !sample {
646            return None;
647        }
648
649        // Get prepared statement info (this also marks it as logged)
650        let (prepared_statement_event, ps_uuid) =
651            self.get_prepared_statement_info(session, logging);
652
653        let began_at = if let Some(lifecycle_timestamps) = lifecycle_timestamps {
654            lifecycle_timestamps.received
655        } else {
656            (self.now)()
657        };
658
659        let current_time = (self.now)();
660        let execution_uuid = epoch_to_uuid_v7(&current_time);
661
662        // Create the execution record
663        let began_execution = create_began_execution_record(
664            execution_uuid,
665            ps_uuid,
666            sample_rate,
667            params,
668            session,
669            began_at,
670            self.build_info_human_version.clone(),
671        );
672
673        // Build rows to calculate cost for throttling
674        let mseh_update = pack_statement_began_execution_update(&began_execution);
675        let maybe_ps_prepared_statement = prepared_statement_event
676            .as_ref()
677            .map(|e| &e.prepared_statement);
678        let maybe_ps_sql_text = prepared_statement_event.as_ref().map(|e| &e.sql_text);
679
680        // Calculate cost of all rows we intend to log
681        let cost: usize = [
682            Some(&mseh_update),
683            maybe_ps_prepared_statement,
684            maybe_ps_sql_text,
685        ]
686        .into_iter()
687        .filter_map(|row_opt| row_opt.map(|row| row.byte_len()))
688        .fold(0_usize, |acc, x| acc.saturating_add(x));
689
690        // Do throttling check
691        let passed = if let Some(target_data_rate) = target_data_rate {
692            self.throttling_state.throttling_check(
693                cost.cast_into(),
694                target_data_rate,
695                max_data_credit,
696                &self.now,
697            )
698        } else {
699            true // No throttling configured
700        };
701
702        if !passed {
703            // Increment throttled_count in shared state
704            self.throttling_state.increment_throttled_count();
705            return None;
706        }
707
708        // When we successfully log the first instance of a prepared statement
709        // (i.e., it is not throttled), reset the throttled count for future tracking.
710        if is_new_prepared_statement {
711            self.throttling_state.reset_throttled_count();
712        }
713
714        Some((
715            StatementLoggingId(execution_uuid),
716            began_execution,
717            mseh_update,
718            prepared_statement_event,
719        ))
720    }
721}
722
723/// The effective rate at which statement execution should be sampled.
724/// This is the value of the session var `statement_logging_sample_rate`,
725/// constrained by the system var `statement_logging_max_sample_rate`.
726pub(crate) fn effective_sample_rate(session: &Session, system_vars: &SystemVars) -> f64 {
727    let system_max: f64 = system_vars
728        .statement_logging_max_sample_rate()
729        .try_into()
730        .expect("value constrained to be convertible to f64");
731    let user_rate: f64 = session
732        .vars()
733        .get_statement_logging_sample_rate()
734        .try_into()
735        .expect("value constrained to be convertible to f64");
736    f64::min(system_max, user_rate)
737}
738
739/// Helper function to decide whether to sample a statement execution.
740/// Returns `true` if the statement should be sampled based on the sample rate.
741///
742/// If `reproducible_rng` is `Some`, uses the provided RNG for reproducible sampling (used in tests).
743/// If `reproducible_rng` is `None`, uses the thread-local RNG.
744pub(crate) fn should_sample_statement(
745    sample_rate: f64,
746    reproducible_rng: Option<&mut rand_chacha::ChaCha8Rng>,
747) -> bool {
748    let distribution = Bernoulli::new(sample_rate).unwrap_or_else(|_| {
749        soft_panic_or_log!("statement_logging_sample_rate is out of range [0, 1]");
750        Bernoulli::new(0.0).expect("0.0 is valid for Bernoulli")
751    });
752    if let Some(rng) = reproducible_rng {
753        distribution.sample(rng)
754    } else {
755        distribution.sample(&mut rand::rng())
756    }
757}
758
759/// Helper function to serialize statement parameters for logging.
760fn serialize_params(params: &Params) -> Vec<Option<String>> {
761    std::iter::zip(params.execute_types.iter(), params.datums.iter())
762        .map(|(r#type, datum)| {
763            mz_pgrepr::Value::from_datum(datum, r#type).map(|val| {
764                let mut buf = BytesMut::new();
765                val.encode_text(&mut buf);
766                String::from_utf8(Into::<Vec<u8>>::into(buf))
767                    .expect("Serialization shouldn't produce non-UTF-8 strings.")
768            })
769        })
770        .collect()
771}
772
773/// Helper function to create a `StatementBeganExecutionRecord`.
774pub(crate) fn create_began_execution_record(
775    execution_uuid: Uuid,
776    prepared_statement_uuid: Uuid,
777    sample_rate: f64,
778    params: &Params,
779    session: &Session,
780    began_at: EpochMillis,
781    build_info_version: String,
782) -> StatementBeganExecutionRecord {
783    let params = serialize_params(params);
784    StatementBeganExecutionRecord {
785        id: execution_uuid,
786        prepared_statement_id: prepared_statement_uuid,
787        sample_rate,
788        params,
789        began_at,
790        application_name: session.application_name().to_string(),
791        transaction_isolation: session.vars().transaction_isolation().to_string(),
792        transaction_id: session
793            .transaction()
794            .inner()
795            .map(|t| t.id)
796            .unwrap_or_else(|| {
797                // This should never happen because every statement runs in an explicit or implicit
798                // transaction.
799                soft_panic_or_log!(
800                    "Statement logging got a statement with no associated transaction"
801                );
802                9999999
803            }),
804        mz_version: build_info_version,
805        // These are not known yet; we'll fill them in later.
806        cluster_id: None,
807        cluster_name: None,
808        execution_timestamp: None,
809        transient_index_id: None,
810        database_name: session.vars().database().into(),
811        search_path: session
812            .vars()
813            .search_path()
814            .iter()
815            .map(|s| s.as_str().to_string())
816            .collect(),
817    }
818}
819
820/// Represents a single statement logging event that can be sent from the frontend
821/// peek sequencing to the Coordinator via an mpsc channel.
822#[derive(Debug, Clone)]
823pub enum FrontendStatementLoggingEvent {
824    /// Statement execution began, possibly with an associated prepared statement
825    /// if this is the first time the prepared statement is being logged
826    BeganExecution {
827        record: StatementBeganExecutionRecord,
828        /// `mz_statement_execution_history`
829        mseh_update: Row,
830        prepared_statement: Option<PreparedStatementEvent>,
831    },
832    /// Statement execution ended
833    EndedExecution(StatementEndedExecutionRecord),
834    /// Set the cluster for a statement execution
835    SetCluster {
836        id: StatementLoggingId,
837        cluster_id: ClusterId,
838    },
839    /// Set the execution timestamp for a statement
840    SetTimestamp {
841        id: StatementLoggingId,
842        timestamp: Timestamp,
843    },
844    /// Set the transient index ID for a statement
845    SetTransientIndex {
846        id: StatementLoggingId,
847        transient_index_id: GlobalId,
848    },
849    /// Record a statement lifecycle event
850    Lifecycle {
851        id: StatementLoggingId,
852        event: StatementLifecycleEvent,
853        when: EpochMillis,
854    },
855}
856
857pub(crate) fn pack_statement_execution_inner(
858    record: &StatementBeganExecutionRecord,
859    packer: &mut RowPacker,
860) {
861    let StatementBeganExecutionRecord {
862        id,
863        prepared_statement_id,
864        sample_rate,
865        params,
866        began_at,
867        cluster_id,
868        cluster_name,
869        database_name,
870        search_path,
871        application_name,
872        transaction_isolation,
873        execution_timestamp,
874        transaction_id,
875        transient_index_id,
876        mz_version,
877    } = record;
878
879    let cluster = cluster_id.map(|id| id.to_string());
880    let transient_index_id = transient_index_id.map(|id| id.to_string());
881    packer.extend([
882        Datum::Uuid(*id),
883        Datum::Uuid(*prepared_statement_id),
884        Datum::Float64((*sample_rate).into()),
885        match &cluster {
886            None => Datum::Null,
887            Some(cluster_id) => Datum::String(cluster_id),
888        },
889        Datum::String(&*application_name),
890        cluster_name.as_ref().map(String::as_str).into(),
891        Datum::String(database_name),
892    ]);
893    packer.push_list(search_path.iter().map(|s| Datum::String(s)));
894    packer.extend([
895        Datum::String(&*transaction_isolation),
896        (*execution_timestamp).into(),
897        Datum::UInt64(*transaction_id),
898        match &transient_index_id {
899            None => Datum::Null,
900            Some(transient_index_id) => Datum::String(transient_index_id),
901        },
902    ]);
903    packer
904        .try_push_array(
905            &[ArrayDimension {
906                lower_bound: 1,
907                length: params.len(),
908            }],
909            params
910                .iter()
911                .map(|p| Datum::from(p.as_ref().map(String::as_str))),
912        )
913        .expect("correct array dimensions");
914    packer.push(Datum::from(mz_version.as_str()));
915    packer.push(Datum::TimestampTz(
916        to_datetime(*began_at).try_into().expect("Sane system time"),
917    ));
918}
919
920pub(crate) fn pack_statement_began_execution_update(record: &StatementBeganExecutionRecord) -> Row {
921    let mut row = Row::default();
922    let mut packer = row.packer();
923    pack_statement_execution_inner(record, &mut packer);
924    packer.extend([
925        // finished_at
926        Datum::Null,
927        // finished_status
928        Datum::Null,
929        // error_message
930        Datum::Null,
931        // result_size
932        Datum::Null,
933        // rows_returned
934        Datum::Null,
935        // execution_status
936        Datum::Null,
937    ]);
938    row
939}
940
941pub(crate) fn pack_statement_prepared_update(
942    record: &StatementPreparedRecord,
943    packer: &mut RowPacker,
944) {
945    let StatementPreparedRecord {
946        id,
947        session_id,
948        name,
949        sql_hash,
950        prepared_at,
951        kind,
952    } = record;
953    packer.extend([
954        Datum::Uuid(*id),
955        Datum::Uuid(*session_id),
956        Datum::String(name.as_str()),
957        Datum::Bytes(sql_hash.as_slice()),
958        Datum::TimestampTz(to_datetime(*prepared_at).try_into().expect("must fit")),
959        kind.map(statement_kind_label_value).into(),
960    ]);
961}
962
963/// Bundles all information needed to install watch sets for statement lifecycle logging.
964/// This includes the statement logging ID and the transitive dependencies to watch.
965#[derive(Debug)]
966pub struct WatchSetCreation {
967    /// The statement logging ID for this execution.
968    pub logging_id: StatementLoggingId,
969    /// The timestamp at which to watch for dependencies becoming ready.
970    pub timestamp: Timestamp,
971    /// Transitive storage dependencies (tables, sources) to watch.
972    pub storage_ids: BTreeSet<GlobalId>,
973    /// Transitive compute dependencies (materialized views, indexes) to watch.
974    pub compute_ids: BTreeSet<GlobalId>,
975}
976
977impl WatchSetCreation {
978    /// Compute transitive dependencies for watch sets from an input ID bundle, categorized into
979    /// storage and compute IDs.
980    pub fn new(
981        logging_id: StatementLoggingId,
982        catalog_state: &CatalogState,
983        input_id_bundle: &CollectionIdBundle,
984        timestamp: Timestamp,
985    ) -> Self {
986        let mut storage_ids = BTreeSet::new();
987        let mut compute_ids = BTreeSet::new();
988
989        for item_id in input_id_bundle
990            .iter()
991            .map(|gid| catalog_state.get_entry_by_global_id(&gid).id())
992            .flat_map(|id| catalog_state.transitive_uses(id))
993        {
994            let entry = catalog_state.get_entry(&item_id);
995            match entry.item() {
996                // TODO(alter_table): Adding all of the GlobalIds for an object is incorrect.
997                // For example, this peek may depend on just a single version of a table, but
998                // we would add dependencies on all versions of said table. Doing this is okay
999                // for now since we can't yet version tables, but should get fixed.
1000                CatalogItem::Table(_) | CatalogItem::Source(_) => {
1001                    storage_ids.extend(entry.global_ids());
1002                }
1003                // Each catalog item is computed by at most one compute collection at a time,
1004                // which is also the most recent one.
1005                CatalogItem::MaterializedView(_) | CatalogItem::Index(_) => {
1006                    compute_ids.insert(entry.latest_global_id());
1007                }
1008                _ => {}
1009            }
1010        }
1011
1012        Self {
1013            logging_id,
1014            timestamp,
1015            storage_ids,
1016            compute_ids,
1017        }
1018    }
1019}