Skip to main content

mz_environmentd/http/
mcp_metrics.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10//! Prometheus metrics for the MCP HTTP endpoints.
11//!
12//! Tracks request counts, tool call counts, and tool call durations,
13//! labeled by endpoint type (`agent` / `developer`) and either the
//! JSON-RPC method name or the MCP tool name. The status label is `ok`
//! for successful calls and the `McpRequestError` error type
//! (e.g. `ToolNotFound`, `DataProductNotFound`) for failures; a tool call
//! whose guard is dropped before a status is set is recorded as `cancelled`.
17
18use mz_ore::metric;
19use mz_ore::metrics::MetricsRegistry;
20use mz_ore::stats::histogram_seconds_buckets;
21use prometheus::{HistogramTimer, HistogramVec, IntCounterVec};
22
23/// Metrics emitted by the MCP HTTP handlers.
24///
25/// Cheaply `Clone`: Prometheus collector handles are `Arc`-shared internally,
26/// so the struct can be cloned freely and stored as an axum `Extension`.
27#[derive(Debug, Clone)]
28pub struct McpMetrics {
29    /// Total MCP requests by endpoint type, JSON-RPC method, and status.
30    pub requests: IntCounterVec,
31    /// Total MCP `tools/call` invocations by endpoint type, tool name, and status.
32    pub tool_calls: IntCounterVec,
33    /// Duration of MCP `tools/call` invocations by endpoint type and tool name.
34    pub tool_call_duration: HistogramVec,
35}
36
37/// RAII guard for a single `tools/call` invocation. On drop, increments
38/// `tool_calls_total` with the current status and observes
39/// `tool_call_duration_seconds` via the embedded [`HistogramTimer`]'s own
40/// drop. Designed so that if the surrounding future is dropped before
41/// completion (e.g. by `tokio::time::timeout`), the metric still records
42/// with the default `"cancelled"` status instead of being silently lost.
43pub struct ToolCallGuard<'a> {
44    metrics: &'a McpMetrics,
45    endpoint_label: &'static str,
46    tool_label: String,
47    status: &'static str,
48    /// `HistogramTimer::drop` observes the duration into the histogram, so
49    /// holding the timer here means we get the duration recorded for both
50    /// normal completion and early drop.
51    _timer: HistogramTimer,
52}
53
54impl<'a> ToolCallGuard<'a> {
55    /// Starts a new tool call: begins the duration timer and reserves the
56    /// counter increment that will happen on drop.
57    pub fn new(metrics: &'a McpMetrics, endpoint_label: &'static str, tool_label: String) -> Self {
58        let timer = metrics
59            .tool_call_duration
60            .with_label_values(&[endpoint_label, &tool_label])
61            .start_timer();
62        Self {
63            metrics,
64            endpoint_label,
65            tool_label,
66            status: "cancelled",
67            _timer: timer,
68        }
69    }
70
71    /// Records the outcome of the call. Callers should set this on the
72    /// normal completion path right before the guard is dropped.
73    pub fn set_status(&mut self, status: &'static str) {
74        self.status = status;
75    }
76}
77
78impl Drop for ToolCallGuard<'_> {
79    fn drop(&mut self) {
80        self.metrics
81            .tool_calls
82            .with_label_values(&[self.endpoint_label, &self.tool_label, self.status])
83            .inc();
84    }
85}
86
87impl McpMetrics {
88    pub fn register_into(registry: &MetricsRegistry) -> Self {
89        Self {
90            requests: registry.register(metric!(
91                name: "mz_mcp_requests_total",
92                help: "Total number of MCP requests received.",
93                var_labels: ["endpoint_type", "method", "status"],
94            )),
95            tool_calls: registry.register(metric!(
96                name: "mz_mcp_tool_calls_total",
97                help: "Total number of MCP tools/call invocations.",
98                var_labels: ["endpoint_type", "tool_name", "status"],
99            )),
100            tool_call_duration: registry.register(metric!(
101                name: "mz_mcp_tool_call_duration_seconds",
102                help: "Duration of MCP tools/call invocations in seconds.",
103                var_labels: ["endpoint_type", "tool_name"],
104                buckets: histogram_seconds_buckets(0.000_128, 8.0),
105            )),
106        }
107    }
108}
109
#[cfg(test)]
mod tests {
    use super::{McpMetrics, ToolCallGuard};
    use mz_ore::metrics::MetricsRegistry;

    /// All three metrics register cleanly and show up in the gathered output
    /// with the expected names. `IntCounterVec` / `HistogramVec` families
    /// only appear in `gather()` after at least one label combination has
    /// been observed, so each metric is touched once before gathering.
    #[mz_ore::test]
    fn test_register_into() {
        let registry = MetricsRegistry::new();
        let metrics = McpMetrics::register_into(&registry);

        metrics
            .requests
            .with_label_values(&["agent", "initialize", "ok"])
            .inc_by(0);
        metrics
            .tool_calls
            .with_label_values(&["agent", "read_data_product", "ok"])
            .inc_by(0);
        metrics
            .tool_call_duration
            .with_label_values(&["agent", "read_data_product"])
            .observe(0.0);

        let names: Vec<String> = registry
            .gather()
            .iter()
            .map(|m| m.name().to_string())
            .collect();

        assert!(
            names.iter().any(|n| n == "mz_mcp_requests_total"),
            "mz_mcp_requests_total should be registered, got: {names:?}",
        );
        assert!(
            names.iter().any(|n| n == "mz_mcp_tool_calls_total"),
            "mz_mcp_tool_calls_total should be registered, got: {names:?}",
        );
        assert!(
            names
                .iter()
                .any(|n| n == "mz_mcp_tool_call_duration_seconds"),
            "mz_mcp_tool_call_duration_seconds should be registered, got: {names:?}",
        );
    }

    /// Incrementing each counter with realistic label values produces the
    /// expected number of series in the gathered output and the expected
    /// value in each series.
    #[mz_ore::test]
    fn test_record_metrics() {
        let registry = MetricsRegistry::new();
        let metrics = McpMetrics::register_into(&registry);

        metrics
            .requests
            .with_label_values(&["agent", "tools/call", "ok"])
            .inc();
        metrics
            .requests
            .with_label_values(&["agent", "tools/call", "ok"])
            .inc();
        metrics
            .requests
            .with_label_values(&["developer", "initialize", "ok"])
            .inc();

        metrics
            .tool_calls
            .with_label_values(&["agent", "read_data_product", "ok"])
            .inc();
        metrics
            .tool_calls
            .with_label_values(&["agent", "read_data_product", "DataProductNotFound"])
            .inc();

        metrics
            .tool_call_duration
            .with_label_values(&["agent", "read_data_product"])
            .observe(0.123);

        let gathered = registry.gather();

        // requests_total: 3 increments produce 2 distinct label sets (the
        // first two share labels and so collapse into the same series).
        let requests = gathered
            .iter()
            .find(|m| m.name() == "mz_mcp_requests_total")
            .expect("requests metric present");
        assert_eq!(requests.get_metric().len(), 2);

        // tool_calls_total: 2 distinct label sets (one for each status).
        let tool_calls = gathered
            .iter()
            .find(|m| m.name() == "mz_mcp_tool_calls_total")
            .expect("tool_calls metric present");
        assert_eq!(tool_calls.get_metric().len(), 2);

        // tool_call_duration_seconds: one observation in one bucket set.
        let duration = gathered
            .iter()
            .find(|m| m.name() == "mz_mcp_tool_call_duration_seconds")
            .expect("tool_call_duration metric present");
        assert_eq!(
            duration.get_metric()[0].get_histogram().get_sample_count(),
            1
        );

        // Series cardinality alone would not catch a miscounted series, so
        // also pin the value of each one. Only label sets that already exist
        // are looked up here, so no new series are created.
        assert_eq!(
            metrics
                .requests
                .with_label_values(&["agent", "tools/call", "ok"])
                .get(),
            2
        );
        assert_eq!(
            metrics
                .requests
                .with_label_values(&["developer", "initialize", "ok"])
                .get(),
            1
        );
        assert_eq!(
            metrics
                .tool_calls
                .with_label_values(&["agent", "read_data_product", "ok"])
                .get(),
            1
        );
        assert_eq!(
            metrics
                .tool_calls
                .with_label_values(&["agent", "read_data_product", "DataProductNotFound"])
                .get(),
            1
        );
    }

    /// Dropping a `ToolCallGuard` counts the call under its current status
    /// (`cancelled` unless `set_status` was called) and observes one
    /// duration sample per guard.
    #[mz_ore::test]
    fn test_tool_call_guard_records_on_drop() {
        let registry = MetricsRegistry::new();
        let metrics = McpMetrics::register_into(&registry);

        // Dropped without a status being set: recorded as cancelled.
        {
            let _guard = ToolCallGuard::new(&metrics, "agent", "read_data_product".to_string());
        }

        // Normal completion path: status set right before the drop.
        {
            let mut guard = ToolCallGuard::new(&metrics, "agent", "read_data_product".to_string());
            guard.set_status("ok");
        }

        assert_eq!(
            metrics
                .tool_calls
                .with_label_values(&["agent", "read_data_product", "cancelled"])
                .get(),
            1
        );
        assert_eq!(
            metrics
                .tool_calls
                .with_label_values(&["agent", "read_data_product", "ok"])
                .get(),
            1
        );
        // Both guards share the same duration series, whatever their status.
        assert_eq!(
            metrics
                .tool_call_duration
                .with_label_values(&["agent", "read_data_product"])
                .get_sample_count(),
            2
        );
    }
}