mz_rocksdb_types/config.rs
1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10// This module is mostly boilerplate, with all relevant
11// documentation on `RocksDBTuningParameters`.
12#![allow(missing_docs)]
13
14//! This module offers a protobuf implementation (to be used
15//! with LaunchDarkly) `RocksDBTuningParameters` that can be used
16//! to tune a RocksDB instance. The supported options are carefully
17//! considered to be a minimal set required to tune RocksDB to perform
18//! well for the `UPSERT` usecase. This usecase is slightly odd:
19//! - Very high write rate (1:1 with reads)
20//! - No durability requirements
21//! - Minimal space amplification
22//! - Relatively relaxed read and write latency requirements
//!     - (note that `UPSERT` RocksDB instances are NOT in the
//!     critical path for any sort of query.)
25//!
//! The defaults (so, the values resulting from deserializing `{}`
//! into a `RocksDBTuningParameters`) should be reasonable defaults.
28//!
//! The documentation on each field in `RocksDBTuningParameters` has more
//! information.
31//!
32//! Note that the following documents are required reading to deeply understand
33//! this module:
34//! - <https://github.com/EighteenZi/rocksdb_wiki/blob/master/RocksDB-Tuning-Guide.md>
35//! - <https://github.com/EighteenZi/rocksdb_wiki/blob/master/Compression.md>
36//! - <https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning>
37//! - <https://www.eecg.toronto.edu/~stumm/Papers/Dong-CIDR-16.pdf>
38//! - <http://smalldatum.blogspot.com/2015/11/read-write-space-amplification-pick-2_23.html>
39
40use std::fmt::Debug;
41use std::str::FromStr;
42use std::time::Duration;
43
44use serde::{Deserialize, Serialize};
45use uncased::UncasedStr;
46
/// A set of parameters to tune RocksDB. This struct is plain-old-data, and is
/// used to update `RocksDBConfig`, which contains some dynamic value for some
/// parameters.
///
/// The default value of every field is defined by the `Default` impl, which draws
/// on the constants in the `defaults` module of this file.
#[derive(Serialize, Deserialize, PartialEq, Clone, Debug)]
pub struct RocksDBTuningParameters {
    /// RocksDB has 2 primary styles of compaction:
    /// - The default, usually referred to as "level" compaction
    /// - "universal"
    ///
    /// Universal is simpler and for some workloads could be
    /// better. Also, you can directly configure its space-amplification ratio
    /// (using `universal_compaction_target_ratio`). However, it's unclear
    /// if the `UPSERT` workload is a good workload for universal compaction,
    /// and it also might be the case that universal compaction uses significantly
    /// more space temporarily while performing compaction.
    ///
    /// For these reasons, the default is `CompactionStyle::Level`.
    pub compaction_style: CompactionStyle,
    /// The `RocksDB` api offers a single configuration method that sets some
    /// reasonable defaults for heavy-write workloads, either
    /// <https://docs.rs/rocksdb/latest/rocksdb/struct.Options.html#method.optimize_level_style_compaction>
    /// or
    /// <https://docs.rs/rocksdb/latest/rocksdb/struct.Options.html#method.optimize_universal_style_compaction>
    /// depending on `compaction_style`. We ALSO enable this configuration, which is tuned
    /// by the size of the memtable (basically the in-memory buffer used to avoid IO). The
    /// upstream value is ~512MB (see <https://github.com/facebook/rocksdb/blob/main/include/rocksdb/options.h#L102>),
    /// which is about twice the global RocksDB default. The default here
    /// (`defaults::DEFAULT_OPTIMIZE_COMPACTION_MEMTABLE_BUDGET`) is one third of that
    /// upstream value, in bytes.
    pub optimize_compaction_memtable_budget: usize,

    /// This option, when enabled, dynamically tunes
    /// the size of the various LSM levels to put a bound on space-amplification.
    /// With the default level-ratio of `10`, this means space-amplification is
    /// O(1.11 * the size of data). Note this is big-O notation, and the actual
    /// amplification factor depends on the workload.
    ///
    /// See <https://www.eecg.toronto.edu/~stumm/Papers/Dong-CIDR-16.pdf> for more details.
    ///
    /// This option defaults to true, as it's basically free saved-space, and only applies to
    /// `CompactionStyle::Level`.
    pub level_compaction_dynamic_level_bytes: bool,

    /// The additional space-amplification used with universal compaction.
    /// Only applies to `CompactionStyle::Universal`.
    ///
    /// `RocksDBTuningParameters::from_parameters` rejects values that are not
    /// strictly greater than 100. The default is
    /// `defaults::DEFAULT_UNIVERSAL_COMPACTION_RATIO` (200).
    ///
    /// See `compaction_style` for more information.
    pub universal_compaction_target_ratio: i32,

    /// By default, RocksDB uses only 1 thread to perform compaction and other background tasks.
    ///
    /// The default here is the number of cores, as mentioned by
    /// <https://docs.rs/rocksdb/latest/rocksdb/struct.Options.html#method.increase_parallelism>.
    ///
    /// NOTE(review): `defaults::DEFAULT_PARALLELISM` is `None`; how `None` is turned into a
    /// thread count is decided by the consumer of this struct, not in this file -- confirm.
    /// When a value is given, `RocksDBTuningParameters::from_parameters` requires it to be
    /// at least 1.
    ///
    /// Note that this option is shared across all RocksDB instances that share a `rocksdb::Env`.
    pub parallelism: Option<i32>,

    /// The most important way to reduce space amplification in RocksDB is compression.
    ///
    /// In RocksDB, data on disk is stored in an LSM tree. Because the higher layers (which are
    /// smaller) will need to be read during reads that aren't cached, we want a relatively
    /// lightweight compression scheme, choosing `Lz4` as the default, which is considered almost
    /// always better than `Snappy`.
    ///
    /// The meat of the data is stored in the largest, bottom layer, which can be configured
    /// (using `bottommost_compression_type`) to use a more expensive compression scheme to save
    /// more space. Many think `Zstd` has the best compression ratio, but note that the current
    /// default for the bottommost layer (`defaults::DEFAULT_BOTTOMMOST_COMPRESSION_TYPE`) is
    /// also `Lz4`. Tuning the bottommost layer separately only makes sense when you have free
    /// cpu, which we have in the case of the `UPSERT` usecase.
    pub compression_type: CompressionType,

    /// See `compression_type` for more information.
    pub bottommost_compression_type: CompressionType,

    /// The size of the `multi_get` and `multi_put` batches sent to RocksDB. The default is
    /// `defaults::DEFAULT_BATCH_SIZE` (20 * 1024).
    pub batch_size: usize,

    /// The maximum duration for the retries when performing rocksdb actions in case of
    /// retry-able errors. The default is `defaults::DEFAULT_RETRY_DURATION` (1 second).
    pub retry_max_duration: Duration,

    /// The interval to dump stats in `LOG`, in seconds. The default is
    /// `defaults::DEFAULT_STATS_LOG_INTERVAL_S` (600).
    pub stats_log_interval_seconds: u32,

    /// The interval to persist stats into rocksdb, in seconds. The default is
    /// `defaults::DEFAULT_STATS_PERSIST_INTERVAL_S` (600).
    pub stats_persist_interval_seconds: u32,

    /// The optional block cache size in MiB for optimizing rocksdb for point lookups.
    /// If not provided there will be no optimization.
    /// <https://github.com/facebook/rocksdb/blob/main/include/rocksdb/options.h#L82-L85>
    pub point_lookup_block_cache_size_mb: Option<u32>,

    /// The number of times by which unused buffers will be reduced.
    /// For example, if the number is 2, the buffers will be reduced to being twice as small,
    /// i.e. halved.
    /// Shrinking will be disabled if value is 0, which is the default.
    pub shrink_buffers_by_ratio: usize,

    /// Optional write buffer manager bytes. This needs to be set to enable write buffer manager
    /// across all rocksdb instances
    pub write_buffer_manager_memory_bytes: Option<usize>,
    /// Optional write buffer manager memory limit as a percentage of cluster limit
    pub write_buffer_manager_memory_fraction: Option<f64>,
    /// Config to enable stalls with write buffer manager. Defaults to `false`.
    pub write_buffer_manager_allow_stall: bool,
}
150
151impl Default for RocksDBTuningParameters {
152    fn default() -> Self {
153        Self {
154            compaction_style: defaults::DEFAULT_COMPACTION_STYLE,
155            optimize_compaction_memtable_budget:
156                defaults::DEFAULT_OPTIMIZE_COMPACTION_MEMTABLE_BUDGET,
157            level_compaction_dynamic_level_bytes:
158                defaults::DEFAULT_LEVEL_COMPACTION_DYNAMIC_LEVEL_BYTES,
159            universal_compaction_target_ratio: defaults::DEFAULT_UNIVERSAL_COMPACTION_RATIO,
160            parallelism: defaults::DEFAULT_PARALLELISM,
161            compression_type: defaults::DEFAULT_COMPRESSION_TYPE,
162            bottommost_compression_type: defaults::DEFAULT_BOTTOMMOST_COMPRESSION_TYPE,
163            batch_size: defaults::DEFAULT_BATCH_SIZE,
164            retry_max_duration: defaults::DEFAULT_RETRY_DURATION,
165            stats_log_interval_seconds: defaults::DEFAULT_STATS_LOG_INTERVAL_S,
166            stats_persist_interval_seconds: defaults::DEFAULT_STATS_PERSIST_INTERVAL_S,
167            point_lookup_block_cache_size_mb: None,
168            shrink_buffers_by_ratio: defaults::DEFAULT_SHRINK_BUFFERS_BY_RATIO,
169            write_buffer_manager_memory_bytes: None,
170            write_buffer_manager_memory_fraction: None,
171            write_buffer_manager_allow_stall: false,
172        }
173    }
174}
175
176impl RocksDBTuningParameters {
177    /// Build a `RocksDBTuningParameters` from strings and values from LD parameters.
178    pub fn from_parameters(
179        compaction_style: CompactionStyle,
180        optimize_compaction_memtable_budget: usize,
181        level_compaction_dynamic_level_bytes: bool,
182        universal_compaction_target_ratio: i32,
183        parallelism: Option<i32>,
184        compression_type: CompressionType,
185        bottommost_compression_type: CompressionType,
186        batch_size: usize,
187        retry_max_duration: Duration,
188        stats_log_interval_seconds: u32,
189        stats_persist_interval_seconds: u32,
190        point_lookup_block_cache_size_mb: Option<u32>,
191        shrink_buffers_by_ratio: usize,
192        write_buffer_manager_memory_bytes: Option<usize>,
193        write_buffer_manager_memory_fraction: Option<f64>,
194        write_buffer_manager_allow_stall: bool,
195    ) -> Result<Self, anyhow::Error> {
196        Ok(Self {
197            compaction_style,
198            optimize_compaction_memtable_budget,
199            level_compaction_dynamic_level_bytes,
200            universal_compaction_target_ratio: if universal_compaction_target_ratio > 100 {
201                universal_compaction_target_ratio
202            } else {
203                return Err(anyhow::anyhow!(
204                    "universal_compaction_target_ratio ({}) must be > 100",
205                    universal_compaction_target_ratio
206                ));
207            },
208            parallelism: match parallelism {
209                Some(parallelism) => {
210                    if parallelism < 1 {
211                        return Err(anyhow::anyhow!(
212                            "parallelism({}) must be > 1, or not specified",
213                            universal_compaction_target_ratio
214                        ));
215                    }
216                    Some(parallelism)
217                }
218                None => None,
219            },
220            compression_type,
221            bottommost_compression_type,
222            batch_size,
223            retry_max_duration,
224            stats_log_interval_seconds,
225            stats_persist_interval_seconds,
226            point_lookup_block_cache_size_mb,
227            shrink_buffers_by_ratio,
228            write_buffer_manager_memory_bytes,
229            write_buffer_manager_memory_fraction,
230            write_buffer_manager_allow_stall,
231        })
232    }
233}
234
/// The 2 primary compaction styles in RocksDB. See `RocksDBTuningParameters::compaction_style`
/// for more information.
#[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq, Debug)]
pub enum CompactionStyle {
    /// "Level" compaction. Parsed from (case-insensitively) and displayed as `level`.
    Level,
    /// "Universal" compaction. Parsed from (case-insensitively) and displayed as `universal`.
    Universal,
}
242
243impl FromStr for CompactionStyle {
244    type Err = anyhow::Error;
245
246    fn from_str(s: &str) -> Result<Self, Self::Err> {
247        let s = UncasedStr::new(s);
248        if s == "level" {
249            Ok(Self::Level)
250        } else if s == "universal" {
251            Ok(Self::Universal)
252        } else {
253            Err(anyhow::anyhow!("{} is not a supported compaction style", s))
254        }
255    }
256}
257
258impl std::fmt::Display for CompactionStyle {
259    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
260        match self {
261            CompactionStyle::Level => write!(f, "level"),
262            CompactionStyle::Universal => write!(f, "universal"),
263        }
264    }
265}
266
/// Mz-supported compression types in RocksDB. See `RocksDBTuningParameters::compression_type`
/// for more information.
#[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq, Debug)]
pub enum CompressionType {
    /// Parsed from (case-insensitively) and displayed as `zstd`.
    Zstd,
    /// Parsed from (case-insensitively) and displayed as `snappy`.
    Snappy,
    /// Parsed from (case-insensitively) and displayed as `lz4`.
    Lz4,
    /// No compression. Parsed from (case-insensitively) and displayed as `none`.
    None,
}
276
277impl FromStr for CompressionType {
278    type Err = anyhow::Error;
279
280    fn from_str(s: &str) -> Result<Self, Self::Err> {
281        let s = UncasedStr::new(s);
282        if s == "zstd" {
283            Ok(Self::Zstd)
284        } else if s == "snappy" {
285            Ok(Self::Snappy)
286        } else if s == "lz4" {
287            Ok(Self::Lz4)
288        } else if s == "none" {
289            Ok(Self::None)
290        } else {
291            Err(anyhow::anyhow!("{} is not a supported compression type", s))
292        }
293    }
294}
295
296impl std::fmt::Display for CompressionType {
297    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
298        match self {
299            CompressionType::Zstd => write!(f, "zstd"),
300            CompressionType::Snappy => write!(f, "snappy"),
301            CompressionType::Lz4 => write!(f, "lz4"),
302            CompressionType::None => write!(f, "none"),
303        }
304    }
305}
306
/// Settings for the RocksDB write buffer manager.
///
/// The first three fields carry the same names and meanings as the
/// `write_buffer_manager_*` fields on `RocksDBTuningParameters`;
/// `cluster_memory_limit` supplies the base that the fractional limit is
/// applied to.
#[derive(Clone, Debug)]
pub struct RocksDbWriteBufferManagerConfig {
    /// Optional write buffer manager bytes. This needs to be set to enable write buffer manager
    /// across all rocksdb instances
    pub write_buffer_manager_memory_bytes: Option<usize>,
    /// Optional write buffer manager memory limit as a percentage of cluster limit
    pub write_buffer_manager_memory_fraction: Option<f64>,
    /// Config to enable stalls with write buffer manager
    pub write_buffer_manager_allow_stall: bool,
    /// Cluster memory limit used to calculate write buffer manager limit
    /// if `write_buffer_manager_memory_fraction` is provided
    pub cluster_memory_limit: Option<usize>,
}
320
/// The following are defaults (and default strings for LD parameters)
/// for `RocksDBTuningParameters`.
pub mod defaults {
    use std::time::Duration;

    use super::*;

    /// Level compaction; see `RocksDBTuningParameters::compaction_style` for the rationale.
    pub const DEFAULT_COMPACTION_STYLE: CompactionStyle = CompactionStyle::Level;

    /// From here: <https://github.com/facebook/rocksdb/blob/main/include/rocksdb/options.h#L102>
    /// And then setting it to 1/3rd from our testing in production
    /// (i.e. one third of 512 MiB, expressed in bytes).
    pub const DEFAULT_OPTIMIZE_COMPACTION_MEMTABLE_BUDGET: usize = 512 * 1024 * 1024 / 3;

    /// Enabled by default; see
    /// `RocksDBTuningParameters::level_compaction_dynamic_level_bytes`.
    pub const DEFAULT_LEVEL_COMPACTION_DYNAMIC_LEVEL_BYTES: bool = true;

    /// From here: <https://docs.rs/rocksdb/latest/rocksdb/struct.UniversalCompactOptions.html>
    pub const DEFAULT_UNIVERSAL_COMPACTION_RATIO: i32 = 200;

    /// No explicit parallelism is configured by default.
    pub const DEFAULT_PARALLELISM: Option<i32> = None;

    /// `Lz4`; see `RocksDBTuningParameters::compression_type`.
    pub const DEFAULT_COMPRESSION_TYPE: CompressionType = CompressionType::Lz4;

    /// `Lz4`, i.e. the same as `DEFAULT_COMPRESSION_TYPE`; see
    /// `RocksDBTuningParameters::compression_type`.
    pub const DEFAULT_BOTTOMMOST_COMPRESSION_TYPE: CompressionType = CompressionType::Lz4;

    /// A reasonable default batch size for gets and puts in RocksDB. Based
    /// on advice here: <https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ>.
    /// Based on our testing we are using 20 times that.
    pub const DEFAULT_BATCH_SIZE: usize = 20 * 1024;

    /// The default max duration for retrying the retry-able errors in rocksdb.
    pub const DEFAULT_RETRY_DURATION: Duration = Duration::from_secs(1);

    /// Default is 10 minutes, from <https://docs.rs/rocksdb/latest/rocksdb/struct.Options.html#method.set_stats_dump_period_sec>
    pub const DEFAULT_STATS_LOG_INTERVAL_S: u32 = 600;

    /// Default is 10 minutes, from <https://docs.rs/rocksdb/latest/rocksdb/struct.Options.html#method.set_stats_persist_period_sec>
    pub const DEFAULT_STATS_PERSIST_INTERVAL_S: u32 = 600;

    /// Default is 0, i.e. shrinking will be disabled
    pub const DEFAULT_SHRINK_BUFFERS_BY_RATIO: usize = 0;

    /// Not allowing stalls for write buffer manager. Only applicable if write buffer manager is enabled by other flags.
    pub const DEFAULT_WRITE_BUFFER_MANAGER_ALLOW_STALL: bool = false;
}
365
#[cfg(test)]
mod tests {
    use super::*;

    /// Guards against drift between the `Default` impl and the `defaults` constants:
    /// building the parameters through `from_parameters` from those constants must
    /// pass validation and compare equal to `RocksDBTuningParameters::default()`.
    #[mz_ore::test]
    fn defaults_equality() {
        // Arguments are positional; keep them in the same order as the
        // `from_parameters` signature.
        let r = RocksDBTuningParameters::from_parameters(
            defaults::DEFAULT_COMPACTION_STYLE,
            defaults::DEFAULT_OPTIMIZE_COMPACTION_MEMTABLE_BUDGET,
            defaults::DEFAULT_LEVEL_COMPACTION_DYNAMIC_LEVEL_BYTES,
            defaults::DEFAULT_UNIVERSAL_COMPACTION_RATIO,
            defaults::DEFAULT_PARALLELISM,
            defaults::DEFAULT_COMPRESSION_TYPE,
            defaults::DEFAULT_BOTTOMMOST_COMPRESSION_TYPE,
            defaults::DEFAULT_BATCH_SIZE,
            defaults::DEFAULT_RETRY_DURATION,
            defaults::DEFAULT_STATS_LOG_INTERVAL_S,
            defaults::DEFAULT_STATS_PERSIST_INTERVAL_S,
            // point_lookup_block_cache_size_mb
            None,
            defaults::DEFAULT_SHRINK_BUFFERS_BY_RATIO,
            // write_buffer_manager_memory_bytes
            None,
            // write_buffer_manager_memory_fraction
            None,
            defaults::DEFAULT_WRITE_BUFFER_MANAGER_ALLOW_STALL,
        )
        .unwrap();

        assert_eq!(r, RocksDBTuningParameters::default());
    }
}
394}