1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
// Copyright Materialize, Inc. and contributors. All rights reserved.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0.

//! Logging dataflows for events generated by differential dataflow.

use std::cell::RefCell;
use std::collections::BTreeMap;
use std::rc::Rc;
use std::time::Duration;

use differential_dataflow::consolidation::ConsolidatingContainerBuilder;
use differential_dataflow::logging::{
    BatchEvent, BatcherEvent, DifferentialEvent, DropEvent, MergeEvent, TraceShare,
};
use mz_ore::cast::CastFrom;
use mz_repr::{Datum, Diff, Timestamp};
use mz_timely_util::replay::MzReplay;
use timely::communication::Allocate;
use timely::container::CapacityContainerBuilder;
use timely::dataflow::channels::pact::Pipeline;
use timely::dataflow::channels::pushers::buffer::Session;
use timely::dataflow::channels::pushers::{Counter, Tee};
use timely::dataflow::operators::generic::builder_rc::OperatorBuilder;
use timely::dataflow::operators::Filter;
use timely::dataflow::Stream;
use timely::logging::WorkerIdentifier;

use crate::extensions::arrange::{MzArrange, MzArrangeCore};
use crate::logging::compute::ComputeEvent;
use crate::logging::{
    DifferentialLog, EventQueue, LogCollection, LogVariant, PermutedRowPacker, SharedLoggingState,
};
use crate::typedefs::{KeyValSpine, RowRowSpine};

/// Constructs the logging dataflow for differential logs.
///
/// Params
/// * `worker`: The Timely worker hosting the log analysis dataflow.
/// * `config`: Logging configuration
/// * `event_queue`: The source to read log events from.
pub(super) fn construct<A: Allocate>(
    worker: &mut timely::worker::Worker<A>,
    config: &mz_compute_client::logging::LoggingConfig,
    event_queue: EventQueue<Vec<(Duration, WorkerIdentifier, DifferentialEvent)>>,
    shared_state: Rc<RefCell<SharedLoggingState>>,
) -> BTreeMap<LogVariant, LogCollection> {
    let logging_interval_ms = std::cmp::max(1, config.interval.as_millis());
    let worker_id = worker.index();
    let dataflow_index = worker.next_dataflow_index();

    worker.dataflow_named("Dataflow: differential logging", move |scope| {
        let (mut logs, token) = Some(event_queue.link)
            .mz_replay::<_, CapacityContainerBuilder<_>, _>(
                scope,
                "differential logs",
                config.interval,
                event_queue.activator,
                |mut session, data| session.give_iterator(data.iter()),
            );

        // If logging is disabled, we still need to install the indexes, but we can leave them
        // empty. We do so by immediately filtering all logs events.
        if !config.enable_logging {
            logs = logs.filter(|_| false);
        }

        // Build a demux operator that splits the replayed event stream up into the separate
        // logging streams.
        let mut demux =
            OperatorBuilder::new("Differential Logging Demux".to_string(), scope.clone());
        let mut input = demux.new_input(&logs, Pipeline);
        let (mut batches_out, batches) = demux.new_output();
        let (mut records_out, records) = demux.new_output();
        let (mut sharing_out, sharing) = demux.new_output();
        let (mut batcher_records_out, batcher_records) = demux.new_output();
        let (mut batcher_size_out, batcher_size) = demux.new_output();
        let (mut batcher_capacity_out, batcher_capacity) = demux.new_output();
        let (mut batcher_allocations_out, batcher_allocations) = demux.new_output();

        let mut demux_state = Default::default();
        demux.build(move |_capability| {
            move |_frontiers| {
                let mut batches = batches_out.activate();
                let mut records = records_out.activate();
                let mut sharing = sharing_out.activate();
                let mut batcher_records = batcher_records_out.activate();
                let mut batcher_size = batcher_size_out.activate();
                let mut batcher_capacity = batcher_capacity_out.activate();
                let mut batcher_allocations = batcher_allocations_out.activate();

                input.for_each(|cap, data| {
                    let mut output_buffers = DemuxOutput {
                        batches: batches.session_with_builder(&cap),
                        records: records.session_with_builder(&cap),
                        sharing: sharing.session_with_builder(&cap),
                        batcher_records: batcher_records.session_with_builder(&cap),
                        batcher_size: batcher_size.session_with_builder(&cap),
                        batcher_capacity: batcher_capacity.session_with_builder(&cap),
                        batcher_allocations: batcher_allocations.session_with_builder(&cap),
                    };

                    for (time, logger_id, event) in data.drain(..) {
                        // We expect the logging infrastructure to not shuffle events between
                        // workers and this code relies on the assumption that each worker handles
                        // its own events.
                        assert_eq!(logger_id, worker_id);

                        DemuxHandler {
                            state: &mut demux_state,
                            output: &mut output_buffers,
                            logging_interval_ms,
                            time,
                            shared_state: &mut shared_state.borrow_mut(),
                        }
                        .handle(event);
                    }
                });
            }
        });

        let stream_to_collection = |input: Stream<_, ((usize, ()), Timestamp, Diff)>, log, name| {
            let packer = PermutedRowPacker::new(log);
            input
                .mz_arrange_core::<_, KeyValSpine<_, _, _, _>>(
                    Pipeline,
                    &format!("PreArrange Differential {name}"),
                )
                .as_collection(move |op, ()| {
                    packer.pack_slice(&[
                        Datum::UInt64(u64::cast_from(*op)),
                        Datum::UInt64(u64::cast_from(worker_id)),
                    ])
                })
        };

        // Encode the contents of each logging stream into its expected `Row` format.
        let arrangement_batches = stream_to_collection(batches, ArrangementBatches, "batches");
        let arrangement_records = stream_to_collection(records, ArrangementRecords, "records");
        let sharing = stream_to_collection(sharing, Sharing, "sharing");
        let batcher_records =
            stream_to_collection(batcher_records, BatcherRecords, "batcher records");
        let batcher_size = stream_to_collection(batcher_size, BatcherSize, "batcher size");
        let batcher_capacity =
            stream_to_collection(batcher_capacity, BatcherCapacity, "batcher capacity");
        let batcher_allocations = stream_to_collection(
            batcher_allocations,
            BatcherAllocations,
            "batcher allocations",
        );

        use DifferentialLog::*;
        let logs = [
            (ArrangementBatches, arrangement_batches),
            (ArrangementRecords, arrangement_records),
            (Sharing, sharing),
            (BatcherRecords, batcher_records),
            (BatcherSize, batcher_size),
            (BatcherCapacity, batcher_capacity),
            (BatcherAllocations, batcher_allocations),
        ];

        // Build the output arrangements.
        let mut result = BTreeMap::new();
        for (variant, collection) in logs {
            let variant = LogVariant::Differential(variant);
            if config.index_logs.contains_key(&variant) {
                let trace = collection
                    .mz_arrange::<RowRowSpine<_, _>>(&format!("Arrange {variant:?}"))
                    .trace;
                let collection = LogCollection {
                    trace,
                    token: Rc::clone(&token),
                    dataflow_index,
                };
                result.insert(variant, collection);
            }
        }

        result
    })
}

type Pusher<D> =
    Counter<Timestamp, Vec<(D, Timestamp, Diff)>, Tee<Timestamp, Vec<(D, Timestamp, Diff)>>>;
type OutputSession<'a, D> =
    Session<'a, Timestamp, ConsolidatingContainerBuilder<Vec<(D, Timestamp, Diff)>>, Pusher<D>>;

/// Bundled output buffers used by the demux operator.
struct DemuxOutput<'a> {
    batches: OutputSession<'a, (usize, ())>,
    records: OutputSession<'a, (usize, ())>,
    sharing: OutputSession<'a, (usize, ())>,
    batcher_records: OutputSession<'a, (usize, ())>,
    batcher_size: OutputSession<'a, (usize, ())>,
    batcher_capacity: OutputSession<'a, (usize, ())>,
    batcher_allocations: OutputSession<'a, (usize, ())>,
}

/// State maintained by the demux operator.
#[derive(Default)]
struct DemuxState {
    /// Arrangement trace sharing
    sharing: BTreeMap<usize, usize>,
}

/// Event handler of the demux operator.
struct DemuxHandler<'a, 'b> {
    /// State kept by the demux operator
    state: &'a mut DemuxState,
    /// Demux output buffers.
    output: &'a mut DemuxOutput<'b>,
    /// The logging interval specifying the time granularity for the updates.
    logging_interval_ms: u128,
    /// The current event time.
    time: Duration,
    /// State shared across log receivers.
    shared_state: &'a mut SharedLoggingState,
}

impl DemuxHandler<'_, '_> {
    /// Return the timestamp associated with the current event, based on the event time and the
    /// logging interval.
    fn ts(&self) -> Timestamp {
        let time_ms = self.time.as_millis();
        let interval = self.logging_interval_ms;
        let rounded = (time_ms / interval + 1) * interval;
        rounded.try_into().expect("must fit")
    }

    /// Handle the given differential event.
    fn handle(&mut self, event: DifferentialEvent) {
        use DifferentialEvent::*;

        match event {
            Batch(e) => self.handle_batch(e),
            Merge(e) => self.handle_merge(e),
            Drop(e) => self.handle_drop(e),
            TraceShare(e) => self.handle_trace_share(e),
            Batcher(e) => self.handle_batcher_event(e),
            _ => (),
        }
    }

    fn handle_batch(&mut self, event: BatchEvent) {
        let ts = self.ts();
        let op = event.operator;
        self.output.batches.give(((op, ()), ts, 1));

        let diff = Diff::try_from(event.length).expect("must fit");
        self.output.records.give(((op, ()), ts, diff));
        self.notify_arrangement_size(op);
    }

    fn handle_merge(&mut self, event: MergeEvent) {
        let Some(done) = event.complete else { return };

        let ts = self.ts();
        let op = event.operator;
        self.output.batches.give(((op, ()), ts, -1));

        let diff = Diff::try_from(done).expect("must fit")
            - Diff::try_from(event.length1 + event.length2).expect("must fit");
        if diff != 0 {
            self.output.records.give(((op, ()), ts, diff));
        }
        self.notify_arrangement_size(op);
    }

    fn handle_drop(&mut self, event: DropEvent) {
        let ts = self.ts();
        let op = event.operator;
        self.output.batches.give(((op, ()), ts, -1));

        let diff = -Diff::try_from(event.length).expect("must fit");
        if diff != 0 {
            self.output.records.give(((op, ()), ts, diff));
        }
        self.notify_arrangement_size(op);
    }

    fn handle_trace_share(&mut self, event: TraceShare) {
        let ts = self.ts();
        let op = event.operator;
        let diff = Diff::cast_from(event.diff);
        debug_assert_ne!(diff, 0);
        self.output.sharing.give(((op, ()), ts, diff));

        if let Some(logger) = &mut self.shared_state.compute_logger {
            let sharing = self.state.sharing.entry(op).or_default();
            *sharing = (i64::try_from(*sharing).expect("must fit") + diff)
                .try_into()
                .expect("under/overflow");
            if *sharing == 0 {
                self.state.sharing.remove(&op);
                logger.log(ComputeEvent::ArrangementHeapSizeOperatorDrop { operator: op });
            }
        }
    }

    fn handle_batcher_event(&mut self, event: BatcherEvent) {
        let ts = self.ts();
        let op = event.operator;
        let records_diff = Diff::cast_from(event.records_diff);
        let size_diff = Diff::cast_from(event.size_diff);
        let capacity_diff = Diff::cast_from(event.capacity_diff);
        let allocations_diff = Diff::cast_from(event.allocations_diff);
        self.output
            .batcher_records
            .give(((op, ()), ts, records_diff));
        self.output.batcher_size.give(((op, ()), ts, size_diff));
        self.output
            .batcher_capacity
            .give(((op, ()), ts, capacity_diff));
        self.output
            .batcher_allocations
            .give(((op, ()), ts, allocations_diff));
    }

    fn notify_arrangement_size(&self, operator: usize) {
        // While every arrangement should have a corresponding arrangement size operator,
        // we have no guarantee that it already/still exists. Otherwise we could print a warning
        // here, but it's difficult to implement without maintaining state for a longer period than
        // while the arrangement actually exists.
        if let Some(activator) = self.shared_state.arrangement_size_activators.get(&operator) {
            activator.activate();
        }
    }
}