mz_ore/
panic.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License in the LICENSE file at the
6// root of this repository, or online at
7//
8//     http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16//! Panic utilities.
17
18use std::any::Any;
19use std::backtrace::Backtrace;
20use std::borrow::Cow;
21use std::cell::RefCell;
22use std::fs::File;
23use std::io::{self, Write as _};
24use std::os::fd::FromRawFd;
25use std::panic::{self, UnwindSafe};
26use std::process;
27use std::sync::{Arc, Mutex};
28use std::time::Duration;
29use std::{env, thread};
30
31#[cfg(feature = "chrono")]
32use chrono::Utc;
33use itertools::Itertools;
34#[cfg(feature = "async")]
35use tokio::task_local;
36
37use crate::iter::IteratorExt;
38
thread_local! {
    /// Keeps track of how many `catch_unwind` calls we are inside.
    ///
    /// This module's [`catch_unwind`] increments the counter before running
    /// the wrapped closure and decrements it afterwards. The panic hook
    /// installed by [`install_enhanced_handler`] returns early whenever the
    /// counter is nonzero, so that the unwind can be caught instead of
    /// aborting the process. The counter is thread-local, so it only describes
    /// the thread that is currently executing.
    static CATCHING_UNWIND: RefCell<usize> = const { RefCell::new(0) };
}
43
#[cfg(feature = "async")]
task_local! {
    /// Task-local flag read by the panic hook installed by
    /// [`install_enhanced_handler`].
    ///
    /// When the flag is `true` for the current task, the hook returns without
    /// reporting the panic or aborting the process. The hook treats a task
    /// with no value set as `false`.
    ///
    /// NOTE(review): nothing in this file sets the flag; presumably an async
    /// counterpart of [`catch_unwind`] elsewhere in the crate scopes it to
    /// `true` around the future it guards — confirm against the callers.
    pub(crate) static CATCHING_UNWIND_ASYNC: bool;
}
48
49/// Overwrites the default panic handler with an enhanced panic handler.
50///
51/// The enhanced panic handler:
52///
53///   * Always emits a backtrace, regardless of how `RUST_BACKTRACE` was
54///     configured.
55///
56///   * Writes to stderr as atomically as possible, to minimize interleaving
57///     with concurrent log messages.
58///
59///   * Reports panics to Sentry.
60///
61///     Sentry installs its own panic hook by default that reports the panic and
62///     then forwards it to the previous panic hook. We can't use that hook
63///     because it would also report panics that we catch-unwind afterwards.
64///     Instead we are invoking the Sentry integration manually here, after the
65///     catch-unwind check.
66///
67///   * Instructs the entire process to abort if any thread panics.
68///
69///     By default, when a thread panics in Rust, only that thread is affected,
70///     and other threads continue running unaffected. This is a bad default. In
71///     almost all programs, thread panics are unexpected, unrecoverable, and
72///     leave the overall program in an invalid state. It is therefore typically
73///     less confusing to abort the entire program.
74///
75///     For example, consider a simple program with two threads communicating
76///     through a channel, where the first thread is waiting for the second
77///     thread to send a value over the channel. If the second thread panics,
78///     the first thread will block forever for a value that will never be
79///     produced. Blocking forever will be more confusing to the end user than
80///     aborting the program entirely.
81///
82/// Note that after calling this function, computations in which a panic is
83/// expected must use the special [`catch_unwind`] function in this module to
84/// recover. Note that the `catch_unwind` function in the standard library is
85/// **not** compatible with this improved panic handler.
86pub fn install_enhanced_handler() {
87    panic::set_hook(Box::new(move |panic_info| {
88        // If we're catching an unwind, do nothing to let the unwind handler
89        // run.
90        let catching_unwind = CATCHING_UNWIND.with(|v| *v.borrow());
91        #[cfg(feature = "async")]
92        let catching_unwind_async = CATCHING_UNWIND_ASYNC.try_with(|v| *v).unwrap_or(false);
93        #[cfg(not(feature = "async"))]
94        let catching_unwind_async = false;
95        if catching_unwind != 0 || catching_unwind_async {
96            return;
97        }
98
99        // Report the panic to Sentry.
100        // Note that we can't use `sentry_panic::panic_handler` because that requires the panic
101        // integration to be enabled.
102        sentry::Hub::with_active(|hub| {
103            let event = sentry_panic::PanicIntegration::new().event_from_panic_info(panic_info);
104            hub.capture_event(event);
105            if let Some(client) = hub.client() {
106                client.flush(None);
107            }
108        });
109
110        // can't use if cfg!() here because that will require chrono::Utc import
111        #[cfg(feature = "chrono")]
112        let timestamp = Utc::now().format("%Y-%m-%dT%H:%M:%S%.6fZ  ").to_string();
113        #[cfg(not(feature = "chrono"))]
114        let timestamp = String::new();
115
116        let thread = thread::current();
117        let thread_name = thread.name().unwrap_or("<unnamed>");
118
119        let msg = match panic_info.payload().downcast_ref::<&'static str>() {
120            Some(s) => *s,
121            None => match panic_info.payload().downcast_ref::<String>() {
122                Some(s) => &s[..],
123                None => "Box<Any>",
124            },
125        };
126
127        let location = if let Some(loc) = panic_info.location() {
128            loc.to_string()
129        } else {
130            "<unknown>".to_string()
131        };
132
133        // We unconditionally collect and display a short backtrace, as there's
134        // no practical situation where producing a backtrace in a panic message
135        // is undesirable. Panics are always unexpected, and we don't want to
136        // miss our chance to give ourselves as much context as possible.
137        //
138        // We do support `RUST_BACKTRACE=full` to display a full backtrace
139        // rather than a short backtrace that omits the frames from the runtime
140        // and the panic handler itslef.
141        let mut backtrace = Backtrace::force_capture().to_string();
142        if env::var("RUST_BACKTRACE").as_deref() != Ok("full") {
143            // Rust doesn't provide an API for generating a short backtrace, so
144            // we have to string munge it ourselves. The relevant frames are
145            // between the call to `backtrace::__rust_begin_short_backtrace` and
146            // `backtrace::__rust_end_short_backtrace`, which are easy to sniff
147            // out. To make this string munging as robust as possible, if we
148            // don't find the first marker frame, we  leave the full backtrace
149            // in place.
150            let mut lines = backtrace.lines();
151            if lines
152                .find(|l| l.contains("backtrace::__rust_end_short_backtrace"))
153                .is_some()
154            {
155                lines.next();
156                backtrace = lines
157                    .take_while(|l| !l.contains("backtrace::__rust_begin_short_backtrace"))
158                    .chain_one("note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.\n")
159                    .join("\n");
160            }
161        };
162
163        // Rust uses an unbuffered stderr stream. Build the message in a buffer
164        // first so that we minimize the number of calls to the underlying
165        // `write(2)` system call. This minimizes the chance of (but does not
166        // outright prevent) interleaving output with C or C++ libraries that
167        // may be writing to the stderr stream outside of the Rust runtime.
168        //
169        // See https://github.com/rust-lang/rust/issues/64413 for details.
170        let buf = format!(
171            "{timestamp}thread '{thread_name}' panicked at {location}:\n{msg}\n{backtrace}"
172        );
173
174        // Ideal path: spawn a thread that attempts to lock the Rust-managed
175        // stderr stream and write the panic message there. Acquiring the stderr
176        // lock prevents interleaving with concurrent output from the `tracing`
177        // crate, because `tracing` also acquires the lock before printing
178        // output.
179        //
180        // We put the ideal path in a thread because there is no guarantee that
181        // we'll be able to acquire the lock in a timely fashion, and there is
182        // no API to specify a time out for the lock call.
183        let buf = Arc::new(Mutex::new(Some(buf)));
184        thread::spawn({
185            let buf = Arc::clone(&buf);
186            move || {
187                let mut stderr = io::stderr().lock();
188                let mut buf = buf.lock().unwrap();
189                if let Some(buf) = buf.take() {
190                    let _ = stderr.write_all(buf.as_bytes());
191                }
192
193                // Abort while still holding the stderr lock to ensure the panic
194                // is the last output printed to stderr.
195                process::abort();
196            }
197        });
198
199        // Backup path: wait one second for the ideal path to succeed, then
200        // write the panic message directly to the underlying stderr stream
201        // (file descriptor 2) if it wasn't already written by the ideal path.
202        // This ensures we eventually eke out a panic message, possibly
203        // interleaved with other output, even if another thread is wedged while
204        // holding the stderr lock.
205        thread::sleep(Duration::from_secs(1));
206        let mut buf = buf.lock().unwrap();
207        if let Some(buf) = buf.take() {
208            let mut stderr = unsafe { File::from_raw_fd(2) };
209            let _ = stderr.write_all(buf.as_bytes());
210        }
211
212        process::abort();
213    }))
214}
215
216/// Like [`std::panic::catch_unwind`], but can unwind panics even if
217/// [`install_enhanced_handler`] has been called.
218pub fn catch_unwind<F, R>(f: F) -> Result<R, Box<dyn Any + Send + 'static>>
219where
220    F: FnOnce() -> R + UnwindSafe,
221{
222    CATCHING_UNWIND.with(|catching_unwind| {
223        *catching_unwind.borrow_mut() += 1;
224        #[allow(clippy::disallowed_methods)]
225        let res = panic::catch_unwind(f);
226        *catching_unwind.borrow_mut() -= 1;
227        res
228    })
229}
230
231/// Like [`crate::panic::catch_unwind`], but downcasts the returned `Box<dyn Any>` error to a
232/// string which is almost always is.
233///
234/// See: <https://doc.rust-lang.org/stable/std/panic/struct.PanicHookInfo.html#method.payload>
235pub fn catch_unwind_str<F, R>(f: F) -> Result<R, Cow<'static, str>>
236where
237    F: FnOnce() -> R + UnwindSafe,
238{
239    match crate::panic::catch_unwind(f) {
240        Ok(res) => Ok(res),
241        Err(opaque) => match opaque.downcast_ref::<&'static str>() {
242            Some(s) => Err(Cow::Borrowed(*s)),
243            None => match opaque.downcast_ref::<String>() {
244                Some(s) => Err(Cow::Owned(s.to_owned())),
245                None => Err(Cow::Borrowed("Box<Any>")),
246            },
247        },
248    }
249}