mz_service/
boot.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10//! Service boot helpers.
11
/// Emits a tracing event with system diagnostic information, for use during
/// service boot.
///
/// Takes a single expression (optionally followed by a trailing comma) that
/// evaluates to the build information to report; its `version` and `sha`
/// fields are logged.
///
/// The event is emitted via `tracing::info!` with the message `"booting"` and
/// carries fields describing the operating system, the build, the CPUs,
/// memory and swap usage, the detected cgroup limits, and the current maximum
/// tracing level. Values that cannot be determined are logged as `<unknown>`
/// (or `0` for the frequency of the first CPU).
// NOTE(benesch): this is a macro because the tracing crate does not support
// dynamic targets, and we want the target of this event to be the name of
// the root module (e.g., "clusterd" or "environmentd"), not "mz_service".
#[macro_export]
macro_rules! emit_boot_diagnostics {
    ($build_info:expr $(,)?) => {{
        use $crate::boot::r#private::sysinfo::{System, SystemExt, CpuExt, CpuRefreshKind};
        use $crate::boot::r#private::tracing::info;
        use $crate::boot::r#private::tracing::level_filters::LevelFilter;
        use $crate::boot::r#private::cgroup;
        use $crate::boot::r#private::os_info;

        use $crate::boot::r#private::mz_build_info::BuildInfo;
        use $crate::boot::r#private::mz_ore::option::OptionExt;

        let build_info = $build_info;
        let os = os_info::get();
        let mut system = System::new();
        // Only refresh what is reported below: memory/swap figures and the
        // per-CPU frequency.
        system.refresh_memory();
        system.refresh_cpu_specifics(CpuRefreshKind::new().with_frequency());
        let cpus = system.cpus();
        // The CPU list may be empty if no CPU information could be gathered.
        // Never index into it directly: a diagnostics event must not be able
        // to panic the service during boot.
        let first_cpu = cpus.first();
        let limits = cgroup::detect_limits();
        info!(
            os.os_type = %os.os_type(),
            os.version = %os.version(),
            os.bitness = %os.bitness(),
            build.version = build_info.version,
            build.sha = build_info.sha,
            cpus.logical = cpus.len(),
            cpus.physical = %system.physical_core_count().display_or("<unknown>"),
            cpu0.brand = first_cpu.map_or("<unknown>", |cpu| cpu.brand()),
            cpu0.frequency = first_cpu.map_or(0, |cpu| cpu.frequency()),
            memory.total = system.total_memory(),
            memory.used = system.used_memory(),
            memory.limit = %limits.as_ref().and_then(|l| l.memory_max).display_or("<unknown>"),
            swap.total = system.total_swap(),
            swap.used = system.used_swap(),
            swap.limit = %limits.as_ref().and_then(|l| l.swap_max).display_or("<unknown>"),
            tracing.max_level = %LevelFilter::current(),
            "booting",
        );
    }};
}
57
// Implementation for the `emit_boot_diagnostics` macro.
59#[doc(hidden)]
60pub mod r#private {
61    pub use {mz_build_info, mz_ore, os_info, sysinfo, tracing};
62
    // NOTE(benesch): this module contains a lot of complexity just to detect
    // the cgroup memory limit. Is it worth it now that we're cloud native? It
    // was more obviously worthwhile in the days of the binary, when we needed
    // to put as much information about the user's system as possible into the
    // log files, to increase our ability to debug issues. Nowadays we can just
    // look in Kubernetes for the limit.
69    pub mod cgroup {
70        use std::fs::{self, File};
71        use std::io::{BufRead, BufReader};
72        use std::path::{Path, PathBuf};
73
74        /// An entry in /proc/self/cgroup.
75        #[derive(Debug, PartialEq)]
76        struct CgroupEntry {
77            subsystems: Vec<String>,
78            root: PathBuf,
79        }
80
81        impl CgroupEntry {
82            fn from_line(line: String) -> Option<CgroupEntry> {
83                let mut fields = line.split(':');
84                let subsystems = fields
85                    .nth(1)?
86                    .split(',')
87                    .filter(|s| !s.is_empty())
88                    .map(|s| s.to_owned())
89                    .collect();
90                let root = PathBuf::from(fields.next()?);
91                Some(CgroupEntry { subsystems, root })
92            }
93        }
94
95        /// Parses /proc/self/cgroup into a `Vec<CgroupEntry>`, if the file exists.
96        fn parse_proc_self_cgroup() -> Option<Vec<CgroupEntry>> {
97            let file = File::open("/proc/self/cgroup").ok()?;
98            let file = BufReader::new(file);
99            Some(
100                file.lines()
101                    .map_while(Result::ok)
102                    .filter_map(CgroupEntry::from_line)
103                    .collect(),
104            )
105        }
106
107        /// An entry in /proc/self/mountinfo.
108        #[derive(Debug, PartialEq)]
109        struct MountInfo {
110            root: PathBuf,
111            mount_point: PathBuf,
112            fs_type: String,
113            super_opts: Vec<String>,
114        }
115
116        impl MountInfo {
117            fn from_line(line: String) -> Option<MountInfo> {
118                // https://www.kernel.org/doc/Documentation/filesystems/proc.txt
119                let mut split = line.split(" - ");
120
121                let mut mount_fields = split.next()?.split(' ');
122                let root = PathBuf::from(mount_fields.nth(3)?);
123                let mount_point = PathBuf::from(mount_fields.next()?);
124
125                let mut fs_fields = split.next()?.split(' ');
126
127                let fs_type = fs_fields.next()?.split('.').next()?.to_owned();
128                let super_opts: Vec<String> = fs_fields
129                    .nth(1)?
130                    .split(',')
131                    .filter(|s| !s.is_empty())
132                    .map(|s| s.to_owned())
133                    .collect();
134
135                Some(MountInfo {
136                    root,
137                    mount_point,
138                    fs_type,
139                    super_opts,
140                })
141            }
142        }
143
144        /// Parses /proc/self/mountinfo into vectors of Mountinfo objects,
145        /// returning (v2_mounts, v1_mounts).
146        fn parse_proc_self_mountinfo() -> Option<(Vec<MountInfo>, Vec<MountInfo>)> {
147            let file = File::open("/proc/self/mountinfo").ok()?;
148            let file = BufReader::new(file);
149            Some(
150                file.lines()
151                    .map_while(Result::ok)
152                    .filter_map(MountInfo::from_line)
153                    .filter(|mi| mi.fs_type == "cgroup" || mi.fs_type == "cgroup2")
154                    .partition(|mi| mi.fs_type == "cgroup2"),
155            )
156        }
157
        /// Represents cgroup limits, with both memory and swap maximums if they
        /// exist.
        ///
        /// Fields will be `None` if a limit does not exist or when running on a
        /// platform without cgroup support (i.e., non-Linux platforms).
        #[derive(Debug)]
        pub struct Limits {
            /// Maximum memory limit, in bytes, if a limit exists.
            pub memory_max: Option<usize>,
            /// Maximum swap limit, in bytes, if a limit exists.
            pub swap_max: Option<usize>,
        }
170
171        fn parse_file<P>(path: P) -> Option<usize>
172        where
173            P: AsRef<Path>,
174        {
175            let s = fs::read_to_string(&path).ok()?;
176            s.trim().parse().ok()
177        }
178
179        /// Finds the mountpoint corresponding to the provided cgroup v2, and reads
180        /// the memory limits within.
181        fn read_v2_memory_limit(cgroups: &[CgroupEntry], mounts: &[MountInfo]) -> Option<Limits> {
182            // cgroups v2 only supports a single cgroup per process
183            let mount = mounts.first()?;
184            if mount.root != cgroups.first()?.root {
185                // We don't support mixed v2/v1.
186                return None;
187            }
188            let mount_point = &mount.mount_point;
189            let controllers = fs::read_to_string(mount_point.join("cgroup.controllers")).ok()?;
190            let mut controllers = controllers.trim().split(' ');
191            if controllers.any(|c| c == "memory") {
192                let memory_max = parse_file(mount_point.join("memory.max"));
193                // Unlike v1, this is only the swap, not swap + memory.
194                let swap_max = parse_file(mount_point.join("memory.swap.max"));
195                return Some(Limits {
196                    memory_max,
197                    swap_max,
198                });
199            }
200            None
201        }
202
203        /// Finds the cgroup v1 and mountpoint combination containing the memory
204        /// controller, and reads the memory limits within.
205        fn read_v1_memory_limit(cgroups: &[CgroupEntry], mounts: &[MountInfo]) -> Option<Limits> {
206            // https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt
207            let memory_cgroup = cgroups
208                .into_iter()
209                .find(|cgroup| cgroup.subsystems.iter().any(|s| s == "memory"))?;
210            let memory_mount = mounts.iter().find(|mi| {
211                mi.root == memory_cgroup.root && mi.super_opts.iter().any(|o| o == "memory")
212            })?;
213            let mount_point = &memory_mount.mount_point;
214            let memory_max = parse_file(mount_point.join("memory.limit_in_bytes"));
215            // This is memory + swap, not just swap.
216            let memsw_max = parse_file(mount_point.join("memory.memsw.limit_in_bytes"));
217            let swap_max = match (memory_max, memsw_max) {
218                (Some(max), Some(memsw_max)) => Some(memsw_max - max),
219                _ => None,
220            };
221            Some(Limits {
222                memory_max,
223                swap_max,
224            })
225        }
226
227        /// Returns the cgroup (v1 or v2) limits, if tjey exists.
228        pub fn detect_limits() -> Option<Limits> {
229            let (v2_mounts, v1_mounts) = parse_proc_self_mountinfo()?;
230            let cgroups = parse_proc_self_cgroup()?;
231            if !v2_mounts.is_empty() {
232                return read_v2_memory_limit(&cgroups, &v2_mounts);
233            }
234            read_v1_memory_limit(&cgroups, &v1_mounts)
235        }
236
237        #[cfg(test)]
238        mod tests {
239            use std::path::PathBuf;
240
241            use super::{CgroupEntry, MountInfo};
242
            /// Checks that `CgroupEntry::from_line` parses representative
            /// cgroups v2 and v1 lines into the expected subsystem list and
            /// root path.
            #[mz_ore::test]
            fn test_cgroup_from_line() {
                // cgroups v2
                // The controller list is empty, so no subsystems are expected.
                assert_eq!(
                    CgroupEntry::from_line("0::/".to_owned()),
                    Some(CgroupEntry {
                        subsystems: vec![],
                        root: PathBuf::from("/"),
                    })
                );

                // cgroups v1
                // A comma-separated controller list yields one subsystem per name.
                assert_eq!(
                    CgroupEntry::from_line("6:cpu,cpuacct:/kubepods/pod5b977639-f878-469b-94ee-47a4aa7e597a/dd55abbabd99bcb4d2ce17ffa77d6f811c90e09202f537c273962a8259cac8a0".to_owned()),
                    Some(CgroupEntry {
                        subsystems: vec!["cpu".to_owned(), "cpuacct".to_owned()],
                        root: PathBuf::from("/kubepods/pod5b977639-f878-469b-94ee-47a4aa7e597a/dd55abbabd99bcb4d2ce17ffa77d6f811c90e09202f537c273962a8259cac8a0"),
                    })
                );
                assert_eq!(
                    CgroupEntry::from_line("5:memory:/kubepods/pod5b977639-f878-469b-94ee-47a4aa7e597a/dd55abbabd99bcb4d2ce17ffa77d6f811c90e09202f537c273962a8259cac8a0".to_owned()),
                    Some(CgroupEntry {
                        subsystems: vec!["memory".to_owned()],
                        root: PathBuf::from("/kubepods/pod5b977639-f878-469b-94ee-47a4aa7e597a/dd55abbabd99bcb4d2ce17ffa77d6f811c90e09202f537c273962a8259cac8a0"),
                    })
                );
            }
270
            /// Checks that `MountInfo::from_line` extracts the root, mount
            /// point, filesystem type, and super options from representative
            /// overlay, cgroups v2, and cgroups v1 mountinfo lines.
            #[mz_ore::test]
            fn test_mountinfo_from_line() {
                // Mount with optional field (master:305)
                assert_eq!(MountInfo::from_line("863 758 0:63 / / rw,relatime master:305 - overlay overlay rw,seclabel,lowerdir=/var/lib/docker/overlay2/l/SUKWDHL7W7YZCJ6YI66I7Z5PR2:/var/lib/docker/overlay2/l/ORL2I23UNUGM7FYF4BSL5JUCAB:/var/lib/docker/overlay2/l/LLKK3J2EHGPF5IGGDSAQGRFHLV:/var/lib/docker/overlay2/l/JEQIUQIQTVNRBAGCU7SLV4KK4K:/var/lib/docker/overlay2/l/5DS7KSJCA7BHWAYWII7BI5DBC5:/var/lib/docker/overlay2/l/ZAGXZ62GNFPZFLNUDZ3JOZIMYR:/var/lib/docker/overlay2/l/6WVXMD372IA24ZXRWGGTIPEQPA,upperdir=/var/lib/docker/overlay2/5c7734eb769484f3469b234181365466eb30bcd7f31c912f4250c8d701637ee4/diff,workdir=/var/lib/docker/overlay2/5c7734eb769484f3469b234181365466eb30bcd7f31c912f4250c8d701637ee4/work".to_owned()),
                Some(MountInfo{
                    root: PathBuf::from("/"),
                    mount_point: PathBuf::from("/"),
                    fs_type: "overlay".to_owned(),
                    super_opts: vec![
                        "rw".to_owned(),
                        "seclabel".to_owned(),
                        "lowerdir=/var/lib/docker/overlay2/l/SUKWDHL7W7YZCJ6YI66I7Z5PR2:/var/lib/docker/overlay2/l/ORL2I23UNUGM7FYF4BSL5JUCAB:/var/lib/docker/overlay2/l/LLKK3J2EHGPF5IGGDSAQGRFHLV:/var/lib/docker/overlay2/l/JEQIUQIQTVNRBAGCU7SLV4KK4K:/var/lib/docker/overlay2/l/5DS7KSJCA7BHWAYWII7BI5DBC5:/var/lib/docker/overlay2/l/ZAGXZ62GNFPZFLNUDZ3JOZIMYR:/var/lib/docker/overlay2/l/6WVXMD372IA24ZXRWGGTIPEQPA".to_owned(),
                        "upperdir=/var/lib/docker/overlay2/5c7734eb769484f3469b234181365466eb30bcd7f31c912f4250c8d701637ee4/diff".to_owned(),
                        "workdir=/var/lib/docker/overlay2/5c7734eb769484f3469b234181365466eb30bcd7f31c912f4250c8d701637ee4/work".to_owned(),
                    ],
                })
                );

                // cgroups v2
                assert_eq!(MountInfo::from_line("868 867 0:27 / /sys/fs/cgroup ro,nosuid,nodev,noexec,relatime - cgroup2 cgroup rw,seclabel,nsdelegate,memory_recursiveprot".to_owned()), Some(MountInfo{

                    root: PathBuf::from("/"),
                    mount_point: PathBuf::from("/sys/fs/cgroup"),
                    fs_type: "cgroup2".to_owned(),
                    super_opts: vec![
                        "rw".to_owned(),
                        "seclabel".to_owned(),
                        "nsdelegate".to_owned(),
                        "memory_recursiveprot".to_owned(),
                    ],
                }));

                // cgroups v1
                assert_eq!(MountInfo::from_line("702 697 0:30 /kubepods/pod5b977639-f878-469b-94ee-47a4aa7e597a/dd55abbabd99bcb4d2ce17ffa77d6f811c90e09202f537c273962a8259cac8a0 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime master:13 - cgroup cgroup rw,memory".to_owned()), Some(MountInfo{

                    root: PathBuf::from("/kubepods/pod5b977639-f878-469b-94ee-47a4aa7e597a/dd55abbabd99bcb4d2ce17ffa77d6f811c90e09202f537c273962a8259cac8a0"),
                    mount_point: PathBuf::from("/sys/fs/cgroup/memory"),
                    fs_type: "cgroup".to_owned(),
                    super_opts: vec![
                        "rw".to_owned(),
                        "memory".to_owned(),
                    ],
                }));
            }
315        }
316    }
317}