mz_ore/panic.rs
1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License in the LICENSE file at the
6// root of this repository, or online at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16//! Panic utilities.
17
18use std::any::Any;
19use std::backtrace::Backtrace;
20use std::borrow::Cow;
21use std::cell::RefCell;
22use std::fmt;
23use std::fs::File;
24use std::io::{self, Write as _};
25use std::os::fd::FromRawFd;
26use std::panic::{self, UnwindSafe};
27use std::process;
28use std::sync::{Arc, Mutex};
29use std::time::Duration;
30use std::{env, thread};
31
32#[cfg(feature = "chrono")]
33use chrono::Utc;
34use itertools::Itertools;
35#[cfg(feature = "async")]
36use tokio::task_local;
37
38use crate::iter::IteratorExt;
39
thread_local! {
    /// Keeps track of how many `catch_unwind` calls we are inside.
    ///
    /// [`catch_unwind`] increments this counter before running the supplied
    /// closure and decrements it afterwards. While it is non-zero on the
    /// panicking thread, the enhanced panic handler returns early instead of
    /// reporting the panic and aborting the process.
    static CATCHING_UNWIND: RefCell<usize> = const { RefCell::new(0) };

    /// Keeps track of how many [`catch_unwind_with_details`] calls we are inside.
    ///
    /// When non-zero, the enhanced panic handler records details (the panic
    /// location and a backtrace) about caught panics into [`CAUGHT_PANIC_DETAILS`]
    /// before letting the unwind proceed.
    static CAPTURE_PANIC_DETAILS: RefCell<usize> = const { RefCell::new(0) };

    /// Details about the most recently caught panic, recorded by the enhanced
    /// panic handler when [`CAPTURE_PANIC_DETAILS`] is non-zero. Consumed by
    /// [`catch_unwind_with_details`].
    ///
    /// The handler overwrites any previous value, so at most one set of details
    /// is held per thread at any time.
    static CAUGHT_PANIC_DETAILS: RefCell<Option<PanicDetails>> = const { RefCell::new(None) };
}
56
/// Details about a caught panic, recorded at the panic site by the enhanced
/// panic handler and consumed by [`catch_unwind_with_details`]. Internal: the
/// public-facing type is [`CaughtPanic`].
///
/// The panic message is not stored here; it is recovered separately from the
/// panic payload once the unwind has been caught.
#[derive(Clone, Debug)]
struct PanicDetails {
    /// The source code location at which the panic occurred, if known.
    ///
    /// This is the rendered form of the location reported by the panic hook.
    location: Option<String>,
    /// A backtrace captured at the panic site. Always populated by the handler.
    ///
    /// Stored pre-rendered as a string.
    backtrace: String,
}
67
/// A panic recovered by [`catch_unwind_with_details`], bundling the panic
/// message with the location and backtrace captured at the panic site.
///
/// The [`fmt::Display`] implementation renders the message followed by the
/// location (when known); the backtrace is only available through the
/// [`backtrace`](CaughtPanic::backtrace) field.
#[derive(Clone, Debug)]
pub struct CaughtPanic {
    /// The panic message.
    ///
    /// Falls back to the placeholder `"Box<Any>"` when the panic payload is
    /// neither a `&'static str` nor a `String`.
    pub message: Cow<'static, str>,
    /// The source code location at which the panic occurred, if known.
    pub location: Option<String>,
    /// A backtrace captured at the panic site, if one was captured. Absent if
    /// the enhanced panic handler (see [`install_enhanced_handler`]) was not
    /// installed.
    pub backtrace: Option<String>,
}
81
82impl fmt::Display for CaughtPanic {
83 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
84 write!(f, "{}", self.message)?;
85 if let Some(location) = &self.location {
86 write!(f, " (at {location})")?;
87 }
88 Ok(())
89 }
90}
91
#[cfg(feature = "async")]
task_local! {
    /// Task-local counterpart of `CATCHING_UNWIND`.
    ///
    /// The enhanced panic handler reads this flag with `try_with`, treating an
    /// unset value as `false`. When it is `true` for the current task, the
    /// handler lets the unwind proceed instead of reporting the panic and
    /// aborting. The flag is not set anywhere in this file; presumably the
    /// crate's async catch-unwind support scopes it — TODO confirm.
    pub(crate) static CATCHING_UNWIND_ASYNC: bool;
}
96
97/// Overwrites the default panic handler with an enhanced panic handler.
98///
99/// The enhanced panic handler:
100///
101/// * Always emits a backtrace, regardless of how `RUST_BACKTRACE` was
102/// configured.
103///
104/// * Writes to stderr as atomically as possible, to minimize interleaving
105/// with concurrent log messages.
106///
107/// * Reports panics to Sentry.
108///
109/// Sentry installs its own panic hook by default that reports the panic and
110/// then forwards it to the previous panic hook. We can't use that hook
111/// because it would also report panics that we catch-unwind afterwards.
112/// Instead we are invoking the Sentry integration manually here, after the
113/// catch-unwind check.
114///
115/// * Instructs the entire process to abort if any thread panics.
116///
117/// By default, when a thread panics in Rust, only that thread is affected,
118/// and other threads continue running unaffected. This is a bad default. In
119/// almost all programs, thread panics are unexpected, unrecoverable, and
120/// leave the overall program in an invalid state. It is therefore typically
121/// less confusing to abort the entire program.
122///
123/// For example, consider a simple program with two threads communicating
124/// through a channel, where the first thread is waiting for the second
125/// thread to send a value over the channel. If the second thread panics,
126/// the first thread will block forever for a value that will never be
127/// produced. Blocking forever will be more confusing to the end user than
128/// aborting the program entirely.
129///
130/// Note that after calling this function, computations in which a panic is
131/// expected must use the special [`catch_unwind`] function in this module to
132/// recover. Note that the `catch_unwind` function in the standard library is
133/// **not** compatible with this improved panic handler.
134pub fn install_enhanced_handler() {
135 panic::set_hook(Box::new(move |panic_info| {
136 // If we're catching an unwind, do nothing to let the unwind handler
137 // run.
138 let catching_unwind = CATCHING_UNWIND.with(|v| *v.borrow());
139 #[cfg(feature = "async")]
140 let catching_unwind_async = CATCHING_UNWIND_ASYNC.try_with(|v| *v).unwrap_or(false);
141 #[cfg(not(feature = "async"))]
142 let catching_unwind_async = false;
143 if catching_unwind != 0 || catching_unwind_async {
144 // We're letting the unwind proceed without printing or reporting the
145 // panic. The location and backtrace would normally be lost here,
146 // because `catch_unwind` only recovers the panic payload (the
147 // message). If a caller has opted in via `catch_unwind_with_details`,
148 // stash those details now so they can be attached to the resulting
149 // error. We only pay the cost of capturing a backtrace when a panic
150 // actually occurs, which is the exceptional case.
151 let capture_details = CAPTURE_PANIC_DETAILS.with(|v| *v.borrow()) != 0;
152 if capture_details {
153 let location = panic_info.location().map(|loc| loc.to_string());
154 let backtrace = Backtrace::force_capture().to_string();
155 CAUGHT_PANIC_DETAILS.with(|details| {
156 *details.borrow_mut() = Some(PanicDetails {
157 location,
158 backtrace,
159 });
160 });
161 }
162 return;
163 }
164
165 // Report the panic to Sentry.
166 // Note that we can't use `sentry_panic::panic_handler` because that requires the panic
167 // integration to be enabled.
168 sentry::Hub::with_active(|hub| {
169 let event = sentry_panic::PanicIntegration::new().event_from_panic_info(panic_info);
170 hub.capture_event(event);
171 if let Some(client) = hub.client() {
172 client.flush(None);
173 }
174 });
175
176 // can't use if cfg!() here because that will require chrono::Utc import
177 #[cfg(feature = "chrono")]
178 let timestamp = Utc::now().format("%Y-%m-%dT%H:%M:%S%.6fZ ").to_string();
179 #[cfg(not(feature = "chrono"))]
180 let timestamp = String::new();
181
182 let thread = thread::current();
183 let thread_name = thread.name().unwrap_or("<unnamed>");
184
185 let msg = match panic_info.payload().downcast_ref::<&'static str>() {
186 Some(s) => *s,
187 None => match panic_info.payload().downcast_ref::<String>() {
188 Some(s) => &s[..],
189 None => "Box<Any>",
190 },
191 };
192
193 let location = if let Some(loc) = panic_info.location() {
194 loc.to_string()
195 } else {
196 "<unknown>".to_string()
197 };
198
199 // We unconditionally collect and display a short backtrace, as there's
200 // no practical situation where producing a backtrace in a panic message
201 // is undesirable. Panics are always unexpected, and we don't want to
202 // miss our chance to give ourselves as much context as possible.
203 //
204 // We do support `RUST_BACKTRACE=full` to display a full backtrace
205 // rather than a short backtrace that omits the frames from the runtime
206 // and the panic handler itslef.
207 let mut backtrace = Backtrace::force_capture().to_string();
208 if env::var("RUST_BACKTRACE").as_deref() != Ok("full") {
209 // Rust doesn't provide an API for generating a short backtrace, so
210 // we have to string munge it ourselves. The relevant frames are
211 // between the call to `backtrace::__rust_begin_short_backtrace` and
212 // `backtrace::__rust_end_short_backtrace`, which are easy to sniff
213 // out. To make this string munging as robust as possible, if we
214 // don't find the first marker frame, we leave the full backtrace
215 // in place.
216 let mut lines = backtrace.lines();
217 if lines
218 .find(|l| l.contains("backtrace::__rust_end_short_backtrace"))
219 .is_some()
220 {
221 lines.next();
222 backtrace = lines
223 .take_while(|l| !l.contains("backtrace::__rust_begin_short_backtrace"))
224 .chain_one("note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.\n")
225 .join("\n");
226 }
227 };
228
229 // Rust uses an unbuffered stderr stream. Build the message in a buffer
230 // first so that we minimize the number of calls to the underlying
231 // `write(2)` system call. This minimizes the chance of (but does not
232 // outright prevent) interleaving output with C or C++ libraries that
233 // may be writing to the stderr stream outside of the Rust runtime.
234 //
235 // See https://github.com/rust-lang/rust/issues/64413 for details.
236 let buf = format!(
237 "{timestamp}thread '{thread_name}' panicked at {location}:\n{msg}\n{backtrace}"
238 );
239
240 // Ideal path: spawn a thread that attempts to lock the Rust-managed
241 // stderr stream and write the panic message there. Acquiring the stderr
242 // lock prevents interleaving with concurrent output from the `tracing`
243 // crate, because `tracing` also acquires the lock before printing
244 // output.
245 //
246 // We put the ideal path in a thread because there is no guarantee that
247 // we'll be able to acquire the lock in a timely fashion, and there is
248 // no API to specify a time out for the lock call.
249 let buf = Arc::new(Mutex::new(Some(buf)));
250 thread::spawn({
251 let buf = Arc::clone(&buf);
252 move || {
253 let mut stderr = io::stderr().lock();
254 let mut buf = buf.lock().unwrap();
255 if let Some(buf) = buf.take() {
256 let _ = stderr.write_all(buf.as_bytes());
257 }
258
259 // Abort while still holding the stderr lock to ensure the panic
260 // is the last output printed to stderr.
261 process::abort();
262 }
263 });
264
265 // Backup path: wait one second for the ideal path to succeed, then
266 // write the panic message directly to the underlying stderr stream
267 // (file descriptor 2) if it wasn't already written by the ideal path.
268 // This ensures we eventually eke out a panic message, possibly
269 // interleaved with other output, even if another thread is wedged while
270 // holding the stderr lock.
271 thread::sleep(Duration::from_secs(1));
272 let mut buf = buf.lock().unwrap();
273 if let Some(buf) = buf.take() {
274 let mut stderr = unsafe { File::from_raw_fd(2) };
275 let _ = stderr.write_all(buf.as_bytes());
276 }
277
278 process::abort();
279 }))
280}
281
282/// Like [`std::panic::catch_unwind`], but can unwind panics even if
283/// [`install_enhanced_handler`] has been called.
284pub fn catch_unwind<F, R>(f: F) -> Result<R, Box<dyn Any + Send + 'static>>
285where
286 F: FnOnce() -> R + UnwindSafe,
287{
288 CATCHING_UNWIND.with(|catching_unwind| {
289 *catching_unwind.borrow_mut() += 1;
290 #[allow(clippy::disallowed_methods)]
291 let res = panic::catch_unwind(f);
292 *catching_unwind.borrow_mut() -= 1;
293 res
294 })
295}
296
/// Extracts the human-readable message from an opaque panic payload (as
/// returned by [`catch_unwind`]).
///
/// Payloads are almost always a `&'static str` or a `String`. The former is
/// returned borrowed, the latter is cloned into an owned value, and any other
/// payload type yields the placeholder `"Box<Any>"`.
///
/// See: <https://doc.rust-lang.org/stable/std/panic/struct.PanicHookInfo.html#method.payload>
fn downcast_panic_message(payload: &(dyn Any + Send)) -> Cow<'static, str> {
    if let Some(text) = payload.downcast_ref::<&'static str>() {
        return Cow::Borrowed(*text);
    }
    if let Some(text) = payload.downcast_ref::<String>() {
        return Cow::Owned(text.clone());
    }
    Cow::Borrowed("Box<Any>")
}
310
311/// Like [`crate::panic::catch_unwind`], but downcasts the returned `Box<dyn Any>` error to a
312/// string which is almost always is.
313pub fn catch_unwind_str<F, R>(f: F) -> Result<R, Cow<'static, str>>
314where
315 F: FnOnce() -> R + UnwindSafe,
316{
317 match crate::panic::catch_unwind(f) {
318 Ok(res) => Ok(res),
319 Err(opaque) => Err(downcast_panic_message(&*opaque)),
320 }
321}
322
323/// Like [`catch_unwind_str`], but on panic also recovers the panic's source
324/// location and a backtrace captured at the panic site, bundled together in a
325/// [`CaughtPanic`].
326///
327/// The standard catch-unwind machinery only recovers the panic payload (the
328/// message); the location and backtrace are otherwise only available inside the
329/// panic handler, which runs before the stack is unwound. This function opts in
330/// to having the enhanced panic handler (see [`install_enhanced_handler`]) stash
331/// those details so they can be attached to the returned error.
332///
333/// Capturing a backtrace is relatively expensive, but the cost is only paid when
334/// a panic actually occurs, so this is suitable for enriching internal errors
335/// with extra context. If [`install_enhanced_handler`] has not been installed,
336/// the `location` and `backtrace` fields will be absent, but the `message` is
337/// still recovered.
338///
339/// Note that the captured backtrace is always a full backtrace
340/// ([`Backtrace::force_capture`]); unlike the aborting path in
341/// [`install_enhanced_handler`], it does not honor `RUST_BACKTRACE`, since these
342/// caught panics are rare and the extra context is worth the verbosity.
343pub fn catch_unwind_with_details<F, R>(f: F) -> Result<R, CaughtPanic>
344where
345 F: FnOnce() -> R + UnwindSafe,
346{
347 CAPTURE_PANIC_DETAILS.with(|v| *v.borrow_mut() += 1);
348 let res = catch_unwind(f);
349 CAPTURE_PANIC_DETAILS.with(|v| *v.borrow_mut() -= 1);
350
351 match res {
352 Ok(res) => Ok(res),
353 Err(opaque) => {
354 let message = downcast_panic_message(&*opaque);
355 let details = CAUGHT_PANIC_DETAILS.with(|details| details.borrow_mut().take());
356 let (location, backtrace) = match details {
357 Some(details) => (details.location, Some(details.backtrace)),
358 None => (None, None),
359 };
360 Err(CaughtPanic {
361 message,
362 location,
363 backtrace,
364 })
365 }
366 }
367}