Struct Coordinator

Help

pub struct Coordinator {
    controller: Controller,
    catalog: Arc<Catalog>,
    internal_cmd_tx: UnboundedSender<Message>,
    group_commit_tx: GroupCommitNotifier,
    strict_serializable_reads_tx: UnboundedSender<(ConnectionId, PendingReadTxn)>,
    global_timelines: BTreeMap<Timeline, TimelineState<Timestamp>>,
    transient_id_gen: Arc<TransientIdGen>,
    active_conns: BTreeMap<ConnectionId, ConnMeta>,
    txn_read_holds: BTreeMap<ConnectionId, ReadHolds<Timestamp>>,
    pending_peeks: BTreeMap<Uuid, PendingPeek>,
    client_pending_peeks: BTreeMap<ConnectionId, BTreeMap<Uuid, ClusterId>>,
    pending_linearize_read_txns: BTreeMap<ConnectionId, PendingReadTxn>,
    active_compute_sinks: BTreeMap<GlobalId, ActiveComputeSink>,
    active_webhooks: BTreeMap<CatalogItemId, WebhookAppenderInvalidator>,
    active_copies: BTreeMap<ConnectionId, ActiveCopyFrom>,
    staged_cancellation: BTreeMap<ConnectionId, (Sender<bool>, Receiver<bool>)>,
    introspection_subscribes: BTreeMap<GlobalId, IntrospectionSubscribe>,
    write_locks: BTreeMap<CatalogItemId, Arc<Mutex<()>>>,
    deferred_write_ops: BTreeMap<ConnectionId, DeferredOp>,
    pending_writes: Vec<PendingWriteTxn>,
    advance_timelines_interval: Interval,
    serialized_ddl: LockedVecDeque<DeferredPlanStatement>,
    secrets_controller: Arc<dyn SecretsController>,
    caching_secrets_reader: CachingSecretsReader,
    cloud_resource_controller: Option<Arc<dyn CloudResourceController>>,
    transient_replica_metadata: BTreeMap<ReplicaId, Option<ReplicaMetadata>>,
    storage_usage_client: StorageUsageClient,
    storage_usage_collection_interval: Duration,
    segment_client: Option<Client>,
    metrics: Metrics,
    optimizer_metrics: OptimizerMetrics,
    tracing_handle: TracingHandle,
    statement_logging: StatementLogging,
    webhook_concurrency_limit: WebhookConcurrencyLimiter,
    pg_timestamp_oracle_config: Option<PostgresTimestampOracleConfig>,
    check_cluster_scheduling_policies_interval: Interval,
    cluster_scheduling_decisions: BTreeMap<ClusterId, BTreeMap<&'static str, SchedulingDecision>>,
    caught_up_check_interval: Interval,
    caught_up_check: Option<CaughtUpCheckContext>,
    installed_watch_sets: BTreeMap<WatchSetId, (ConnectionId, WatchSetResponse)>,
    connection_watch_sets: BTreeMap<ConnectionId, BTreeSet<WatchSetId>>,
    cluster_replica_statuses: ClusterReplicaStatuses,
    read_only_controllers: bool,
    buffered_builtin_table_updates: Option<Vec<BuiltinTableUpdate>>,
}

Expand description

Glues the external world to the Timely workers.

Fields§

§controller: Controller

The controller for the storage and compute layers.

§catalog: Arc<Catalog>

The catalog in an Arc suitable for readonly references. The Arc allows us to hand out cheap copies of the catalog to functions that can use it off of the main coordinator thread. If the coordinator needs to mutate the catalog, call Self::catalog_mut, which will clone this struct member, allowing it to be mutated here while the other off-thread references can read their catalog as long as needed. In the future we would like this to be a pTVC, but for now this is sufficient.

§internal_cmd_tx: UnboundedSender<Message>

Channel to manage internal commands from the coordinator to itself.

§group_commit_tx: GroupCommitNotifier

Notification that triggers a group commit.

§strict_serializable_reads_tx: UnboundedSender<(ConnectionId, PendingReadTxn)>

Channel for strict serializable reads ready to commit.

§global_timelines: BTreeMap<Timeline, TimelineState<Timestamp>>

Mechanism for totally ordering write and read timestamps, so that all reads reflect exactly the set of writes that precede them, and no writes that follow.

§transient_id_gen: Arc<TransientIdGen>

A generator for transient GlobalIds, shareable with other threads.

§active_conns: BTreeMap<ConnectionId, ConnMeta>

A map from connection ID to metadata about that connection for all active connections.

§txn_read_holds: BTreeMap<ConnectionId, ReadHolds<Timestamp>>

For each transaction, the read holds taken to support any performed reads.

Upon completing a transaction, these read holds should be dropped.

§pending_peeks: BTreeMap<Uuid, PendingPeek>

Access to the peek fields should be restricted to methods in the peek API. A map from pending peek ids to the queue into which responses are sent, and the connection id of the client that initiated the peek.

§client_pending_peeks: BTreeMap<ConnectionId, BTreeMap<Uuid, ClusterId>>

A map from client connection ids to a set of all pending peeks for that client.

§pending_linearize_read_txns: BTreeMap<ConnectionId, PendingReadTxn>

A map from client connection ids to the pending linearized read transaction for that client.

§active_compute_sinks: BTreeMap<GlobalId, ActiveComputeSink>

A map from the compute sink ID to its state description.

§active_webhooks: BTreeMap<CatalogItemId, WebhookAppenderInvalidator>

A map from active webhooks to their invalidation handle.

§active_copies: BTreeMap<ConnectionId, ActiveCopyFrom>

A map of active COPY FROM statements. The Coordinator waits for clusterd to stage Batches in Persist that we will then link into the shard.

§staged_cancellation: BTreeMap<ConnectionId, (Sender<bool>, Receiver<bool>)>

A map from connection ids to a watch channel that is set to true if the connection received a cancel request.

§introspection_subscribes: BTreeMap<GlobalId, IntrospectionSubscribe>

Active introspection subscribes.

§write_locks: BTreeMap<CatalogItemId, Arc<Mutex<()>>>

Locks that grant access to a specific object, populated lazily as objects are written to.

§deferred_write_ops: BTreeMap<ConnectionId, DeferredOp>

Plans that are currently deferred and waiting on a write lock.

§pending_writes: Vec<PendingWriteTxn>

Pending writes waiting for a group commit.

§advance_timelines_interval: Interval

For the realtime timeline, an explicit SELECT or INSERT on a table will bump the table’s timestamps, but there are cases where timestamps are not bumped but we expect the closed timestamps to advance (AS OF X, SUBSCRIBing views over RT sources and tables). To address these, spawn a task that forces table timestamps to close on a regular interval. This roughly tracks the behavior of realtime sources that close off timestamps on an interval.

For non-realtime timelines, nothing pushes the timestamps forward, so we must do it manually.

§serialized_ddl: LockedVecDeque<DeferredPlanStatement>

Serialized DDL. DDL must be serialized because:

- Many of them do off-thread work and need to verify the catalog is in a valid state, but PlanValidity does not currently support tracking all changes. Doing that correctly seems to be more difficult than it’s worth, so we would instead re-plan and re-sequence the statements.
- Re-planning a statement is hard because Coordinator and Session state is mutated at various points, and we would need to correctly reset those changes before re-planning and re-sequencing.

§secrets_controller: Arc<dyn SecretsController>

Handle to secret manager that can create and delete secrets from an arbitrary secret storage engine.

§caching_secrets_reader: CachingSecretsReader

A secrets reader that maintains an in-memory cache, where values have a set TTL.

§cloud_resource_controller: Option<Arc<dyn CloudResourceController>>

Handle to a manager that can create and delete Kubernetes resources (i.e., VpcEndpoint objects).

§transient_replica_metadata: BTreeMap<ReplicaId, Option<ReplicaMetadata>>

Metadata about replicas that doesn’t need to be persisted. Intended for inclusion in system tables.

None is used as a tombstone value for replicas that have been dropped and for which no further updates should be recorded.

§storage_usage_client: StorageUsageClient

Persist client for fetching storage metadata such as size metrics.

§storage_usage_collection_interval: Duration

The interval at which to collect storage usage information.

§segment_client: Option<Client>

Segment analytics client.

§metrics: Metrics

Coordinator metrics.

§optimizer_metrics: OptimizerMetrics

Optimizer metrics.

§tracing_handle: TracingHandle

Tracing handle.

§statement_logging: StatementLogging

Data used by the statement logging feature.

§webhook_concurrency_limit: WebhookConcurrencyLimiter

Limit for how many concurrent webhook requests we allow.

§pg_timestamp_oracle_config: Option<PostgresTimestampOracleConfig>

Optional config for the Postgres-backed timestamp oracle. This is required when postgres is configured using the timestamp_oracle system variable.

§check_cluster_scheduling_policies_interval: Interval

Periodically asks cluster scheduling policies to make their decisions.

§cluster_scheduling_decisions: BTreeMap<ClusterId, BTreeMap<&'static str, SchedulingDecision>>

This keeps the last On/Off decision for each cluster and each scheduling policy. (Clusters that have been dropped or are otherwise out of scope for automatic scheduling are periodically cleaned up from this Map.)

§caught_up_check_interval: Interval

When doing 0dt upgrades/in read-only mode, periodically ask all known clusters/collections whether they are caught up.

§caught_up_check: Option<CaughtUpCheckContext>

Context needed to check whether all clusters/collections have caught up. Only used during 0dt deployment, while in read-only mode.

§installed_watch_sets: BTreeMap<WatchSetId, (ConnectionId, WatchSetResponse)>

Tracks the state associated with the currently installed watchsets.

§connection_watch_sets: BTreeMap<ConnectionId, BTreeSet<WatchSetId>>

Tracks the currently installed watchsets for each connection.

§cluster_replica_statuses: ClusterReplicaStatuses

Tracks the statuses of all cluster replicas.

§read_only_controllers: bool

Whether or not to start controllers in read-only mode. This is only meant for use during development of read-only clusters and 0dt upgrades and should go away once we have proper orchestration during upgrades.

§buffered_builtin_table_updates: Option<Vec<BuiltinTableUpdate>>

Updates to builtin tables that are being buffered while we are in read-only mode. We apply these all at once when coming out of read-only mode.

This is a Some while in read-only mode and will be replaced by a None when we transition out of read-only mode and write out any buffered updates.

Struct Coordinator

Fields§

Implementations§

impl Coordinator

pub fn resolve_collection_id_bundle_names( &self, session: &Session, id_bundle: &CollectionIdBundle, ) -> Vec<String>

impl Coordinator

pub(crate) fn cancel_pending_peeks(&mut self, conn_id: &ConnectionId)

pub(crate) fn handle_peek_notification( &mut self, uuid: Uuid, notification: PeekNotification, otel_ctx: OpenTelemetryContext, )

pub(crate) fn remove_pending_peek(&mut self, uuid: &Uuid) -> Option<PendingPeek>

pub(crate) fn send_immediate_rows<I>(rows: I) -> ExecuteResponse
where I: IntoRowIterator, I::Iter: Send + Sync + 'static,

impl Coordinator

pub(crate) fn spawn_statement_logging_task(&self)

pub(crate) fn drain_statement_log(&mut self)

fn statement_logging_throttling_check(&mut self, cost: usize) -> Option<usize>

pub(crate) fn log_prepared_statement( &mut self, session: &mut Session, logging: &Arc<QCell<PreparedStatementLoggingInfo>>, ) -> Option<(Option<(StatementPreparedRecord, PreparedStatementEvent)>, Uuid)>

pub fn statement_execution_sample_rate(&self, session: &Session) -> f64

pub fn end_statement_execution( &mut self, id: StatementLoggingId, reason: StatementEndedExecutionReason, )

fn pack_statement_execution_inner( record: &StatementBeganExecutionRecord, packer: &mut RowPacker<'_>, )

fn pack_statement_began_execution_update( record: &StatementBeganExecutionRecord, ) -> Row

fn pack_statement_prepared_update( record: &StatementPreparedRecord, packer: &mut RowPacker<'_>, )

fn pack_session_history_update(event: &SessionHistoryEvent) -> Row

fn pack_statement_lifecycle_event( StatementLoggingId: &StatementLoggingId, event: &StatementLifecycleEvent, when: EpochMillis, ) -> Row

pub fn pack_full_statement_execution_update( began_record: &StatementBeganExecutionRecord, ended_record: &StatementEndedExecutionRecord, ) -> Row

pub fn pack_statement_ended_execution_updates( began_record: &StatementBeganExecutionRecord, ended_record: &StatementEndedExecutionRecord, ) -> [(Row, Diff); 2]

fn mutate_record<F: FnOnce(&mut StatementBeganExecutionRecord)>( &mut self, StatementLoggingId: StatementLoggingId, f: F, )

pub fn set_statement_execution_cluster( &mut self, id: StatementLoggingId, cluster_id: ClusterId, )

pub fn set_statement_execution_timestamp( &mut self, id: StatementLoggingId, timestamp: Timestamp, )

pub fn set_transient_index_id( &mut self, id: StatementLoggingId, transient_index_id: GlobalId, )

pub fn begin_statement_execution( &mut self, session: &mut Session, params: &Params, logging: &Arc<QCell<PreparedStatementLoggingInfo>>, ) -> Option<StatementLoggingId>

pub fn begin_session_for_statement_logging(&mut self, session: &ConnMeta)

pub fn end_session_for_statement_logging(&mut self, uuid: Uuid)

pub fn record_statement_lifecycle_event( &mut self, id: &StatementLoggingId, event: &StatementLifecycleEvent, when: EpochMillis, )

impl Coordinator

pub(crate) fn now(&self) -> EpochMillis

pub(crate) fn now_datetime(&self) -> DateTime<Utc>

pub(crate) fn get_timestamp_oracle( &self, timeline: &Timeline, ) -> Arc<dyn TimestampOracle<Timestamp> + Send + Sync>

pub(crate) fn get_local_timestamp_oracle( &self, ) -> Arc<dyn TimestampOracle<Timestamp> + Send + Sync>

pub(crate) async fn get_local_read_ts(&self) -> Timestamp

pub(crate) async fn get_local_write_ts(&mut self) -> WriteTimestamp

pub(crate) async fn peek_local_write_ts(&self) -> Timestamp

pub(crate) fn apply_local_write( &self, timestamp: Timestamp, ) -> impl Future<Output = ()> + Send + 'static

pub(crate) async fn get_catalog_write_ts(&mut self) -> Timestamp

pub(crate) async fn ensure_timeline_state<'a>( &'a mut self, timeline: &'a Timeline, ) -> &'a mut TimelineState<Timestamp>

pub(crate) fn build_collection_id_bundle( &self, storage_ids: impl IntoIterator<Item = GlobalId>, compute_ids: impl IntoIterator<Item = (ComputeInstanceId, GlobalId)>, clusters: impl IntoIterator<Item = ComputeInstanceId>, ) -> CollectionIdBundle

pub(crate) fn remove_resources_associated_with_timeline( &mut self, timeline: Timeline, ids: CollectionIdBundle, ) -> bool

pub(crate) fn remove_compute_ids_from_timeline<I>( &mut self, ids: I, ) -> Vec<Timeline>
where I: IntoIterator<Item = (ComputeInstanceId, GlobalId)>,

pub(crate) fn ids_in_timeline(&self, timeline: &Timeline) -> CollectionIdBundle

pub(crate) fn validate_timeline_context<I>( &self, ids: I, ) -> Result<TimelineContext, AdapterError>
where I: IntoIterator<Item = GlobalId>,

pub(crate) fn get_timeline_context(&self, id: CatalogItemId) -> TimelineContext

pub(crate) fn get_timeline_context_for_global_id( &self, id: GlobalId, ) -> TimelineContext

fn get_timeline_contexts<I>(&self, ids: I) -> BTreeSet<TimelineContext>
where I: IntoIterator<Item = CatalogItemId>,

pub fn partition_ids_by_timeline_context( &self, id_bundle: &CollectionIdBundle, ) -> impl Iterator<Item = (TimelineContext, CollectionIdBundle)>

pub(crate) fn timedomain_for<'a, I>( &self, uses_ids: I, timeline_context: &TimelineContext, conn_id: &ConnectionId, compute_instance: ComputeInstanceId, ) -> Result<CollectionIdBundle, AdapterError>
where I: IntoIterator<Item = &'a GlobalId>,

pub(crate) async fn advance_timelines(&mut self)

impl Coordinator

pub(crate) async fn oracle_read_ts( &self, session: &Session, timeline_ctx: &TimelineContext, when: &QueryWhen, ) -> Option<Timestamp>

pub(crate) fn largest_not_in_advance_of_upper( upper: &Antichain<Timestamp>, ) -> Timestamp

pub(crate) fn evaluate_when( catalog: &CatalogState, timestamp: MirScalarExpr, session: &Session, ) -> Result<Timestamp, AdapterError>

impl Coordinator

pub(crate) fn trigger_group_commit(&mut self)

pub(crate) async fn try_deferred( &mut self, conn_id: ConnectionId, acquired_lock: Option<(CatalogItemId, OwnedMutexGuard<()>)>, )

pub(crate) async fn try_group_commit( &mut self, permit: Option<GroupCommitPermit>, )

pub(crate) async fn group_commit( &mut self, permit: Option<GroupCommitPermit>, ) -> Timestamp

pub(crate) fn submit_write(&mut self, pending_write_txn: PendingWriteTxn)

pub(crate) fn builtin_table_update<'a>(&'a mut self) -> BuiltinTableAppend<'a>

pub(crate) fn defer_op<F>(&mut self, acquire_future: F, op: DeferredOp)
where F: Future<Output = Option<(CatalogItemId, OwnedMutexGuard<()>)>> + Send + 'static,

pub(crate) fn grant_object_write_lock( &mut self, object_id: CatalogItemId, ) -> impl Future<Output = (CatalogItemId, OwnedMutexGuard<()>)> + 'static

pub(crate) fn try_grant_object_write_lock( &mut self, object_id: CatalogItemId, ) -> Option<OwnedMutexGuard<()>>

impl Coordinator

pub async fn maybe_check_caught_up(&mut self)

async fn maybe_check_caught_up_new(&mut self)

async fn clusters_caught_up( &self, allowed_lag: Timestamp, cutoff: Timestamp, now: Timestamp, live_frontiers: &BTreeMap<GlobalId, Antichain<Timestamp>>, exclude_collections: &BTreeSet<GlobalId>, ) -> bool

async fn collections_caught_up( &self, cluster: &Cluster, allowed_lag: Timestamp, cutoff: Timestamp, now: Timestamp, live_frontiers: &BTreeMap<GlobalId, Antichain<Timestamp>>, exclude_collections: &BTreeSet<GlobalId>, ) -> Result<bool, Error>

async fn maybe_check_caught_up_legacy(&mut self)

impl Coordinator

pub(crate) async fn check_scheduling_policies(&mut self)

fn check_refresh_policy(&self)

pub(crate) async fn handle_scheduling_decisions( &mut self, decisions: Vec<(&'static str, Vec<(ClusterId, SchedulingDecision)>)>, )

fn get_managed_cluster_config( &self, cluster_id: ClusterId, ) -> Option<ClusterVariantManaged>

impl Coordinator

Struct Coordinator

pub(crate) fn send_immediate_rows<I>(rows: I) -> ExecuteResponse
where I: IntoRowIterator, I::Iter: Send + Sync + 'static,

pub(crate) fn remove_compute_ids_from_timeline<I>( &mut self, ids: I, ) -> Vec<Timeline>
where I: IntoIterator<Item = (ComputeInstanceId, GlobalId)>,

pub(crate) fn validate_timeline_context<I>( &self, ids: I, ) -> Result<TimelineContext, AdapterError>
where I: IntoIterator<Item = GlobalId>,

fn get_timeline_contexts<I>(&self, ids: I) -> BTreeSet<TimelineContext>
where I: IntoIterator<Item = CatalogItemId>,

pub(crate) fn timedomain_for<'a, I>( &self, uses_ids: I, timeline_context: &TimelineContext, conn_id: &ConnectionId, compute_instance: ComputeInstanceId, ) -> Result<CollectionIdBundle, AdapterError>
where I: IntoIterator<Item = &'a GlobalId>,

pub(crate) fn defer_op<F>(&mut self, acquire_future: F, op: DeferredOp)
where F: Future<Output = Option<(CatalogItemId, OwnedMutexGuard<()>)>> + Send + 'static,

pub(crate) async fn catalog_transact_with_side_effects<'c, F, Fut>( &'c mut self, session: Option<&Session>, ops: Vec<Op>, side_effect: F, ) -> Result<(), AdapterError>
where F: FnOnce(&'c mut Coordinator) -> Fut, Fut: Future<Output = ()>,