mz_rocksdb_types/config.rs
1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10// This module is mostly boilerplate, with all relevant
11// documentation on `RocksDBTuningParameters`.
12#![allow(missing_docs)]
13
14//! This module offers a protobuf implementation (to be used
15//! with LaunchDarkly) `RocksDBTuningParameters` that can be used
16//! to tune a RocksDB instance. The supported options are carefully
17//! considered to be a minimal set required to tune RocksDB to perform
18//! well for the `UPSERT` usecase. This usecase is slightly odd:
19//! - Very high write rate (1:1 with reads)
20//! - No durability requirements
21//! - Minimal space amplification
22//! - Relatively relaxed read and write latency requirements
//! - (note that `UPSERT` RocksDB instances are NOT in the
//! critical path for any sort of query.)
25//!
//! The defaults (so, the values resulting from deserializing `{}`
27//! into a `RocksDBTuningParameters`) should be reasonable defaults.
28//!
//! The documentation on each field in `RocksDBTuningParameters` has more
//! information.
31//!
32//! Note that the following documents are required reading to deeply understand
33//! this module:
34//! - <https://github.com/EighteenZi/rocksdb_wiki/blob/master/RocksDB-Tuning-Guide.md>
35//! - <https://github.com/EighteenZi/rocksdb_wiki/blob/master/Compression.md>
36//! - <https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning>
37//! - <https://www.eecg.toronto.edu/~stumm/Papers/Dong-CIDR-16.pdf>
38//! - <http://smalldatum.blogspot.com/2015/11/read-write-space-amplification-pick-2_23.html>
39
40use std::fmt::Debug;
41use std::str::FromStr;
42use std::time::Duration;
43
44use serde::{Deserialize, Serialize};
45use uncased::UncasedStr;
46
/// A set of parameters to tune RocksDB. This struct is plain-old-data, and is
/// used to update `RocksDBConfig`, which contains some dynamic value for some
/// parameters.
///
/// The value of every field produced by `Default::default()` comes from the
/// `defaults` module (or is `None` for the optional settings).
#[derive(Serialize, Deserialize, PartialEq, Clone, Debug)]
pub struct RocksDBTuningParameters {
    /// RocksDB has 2 primary styles of compaction:
    /// - The default, usually referred to as "level" compaction
    /// - "universal"
    ///
    /// Universal is simpler and for some workloads could be
    /// better. Also, you can directly configure its space-amplification ratio
    /// (using `universal_compaction_target_ratio`). However, it's unclear
    /// if the `UPSERT` workload is a good workload for universal compaction,
    /// and it also might be the case that universal compaction uses significantly
    /// more space temporarily while performing compaction.
    ///
    /// For these reasons, the default is `CompactionStyle::Level`.
    pub compaction_style: CompactionStyle,
    /// The `RocksDB` api offers a single configuration method that sets some
    /// reasonable defaults for heavy-write workloads, either
    /// <https://docs.rs/rocksdb/latest/rocksdb/struct.Options.html#method.optimize_level_style_compaction>
    /// or
    /// <https://docs.rs/rocksdb/latest/rocksdb/struct.Options.html#method.optimize_universal_style_compaction>
    /// depending on `compaction_style`. We ALSO enable this configuration, which is tuned
    /// by the size of the memtable (basically the in-memory buffer used to avoid IO). The upstream
    /// value of ~512MB comes from <https://github.com/facebook/rocksdb/blob/main/include/rocksdb/options.h#L102>
    /// (about twice the global RocksDB default); the default here is one third of that, see
    /// `defaults::DEFAULT_OPTIMIZE_COMPACTION_MEMTABLE_BUDGET`.
    pub optimize_compaction_memtable_budget: usize,

    /// This option, when enabled, dynamically tunes
    /// the size of the various LSM levels to put a bound on space-amplification.
    /// With the default level-ratio of `10`, this means space-amplification is
    /// O(1.11 * the size of data). Note this is big-O notation, and the actual
    /// amplification factor depends on the workload.
    ///
    /// See <https://www.eecg.toronto.edu/~stumm/Papers/Dong-CIDR-16.pdf> for more details.
    ///
    /// This option defaults to true, as it's basically free saved-space, and only applies to
    /// `CompactionStyle::Level`.
    pub level_compaction_dynamic_level_bytes: bool,

    /// The additional space-amplification used with universal compaction.
    /// Only applies to `CompactionStyle::Universal`.
    ///
    /// `RocksDBTuningParameters::from_parameters` rejects values that are not `> 100`.
    ///
    /// See `compaction_style` for more information.
    pub universal_compaction_target_ratio: i32,

    /// By default, RocksDB uses only 1 thread to perform compaction and other background tasks.
    ///
    /// The default here is the number of cores, as mentioned by
    /// <https://docs.rs/rocksdb/latest/rocksdb/struct.Options.html#method.increase_parallelism>.
    /// NOTE(review): the default stored in this struct is `None`
    /// (`defaults::DEFAULT_PARALLELISM`); presumably the consumer maps `None` to the
    /// number of cores -- confirm against the code that applies this setting.
    ///
    /// `RocksDBTuningParameters::from_parameters` rejects `Some(n)` with `n < 1`.
    ///
    /// Note that this option is shared across all RocksDB instances that share a `rocksdb::Env`.
    pub parallelism: Option<i32>,

    /// The most important way to reduce space amplification in RocksDB is compression.
    ///
    /// In RocksDB, data on disk is stored in an LSM tree. Because the higher layers (which are
    /// smaller) will need to be read during reads that aren't cached, we want a relatively
    /// lightweight compression scheme, choosing `Lz4` as the default, which is considered almost
    /// always better than `Snappy`.
    ///
    /// The meat of the data is stored in the largest, bottom layer, which can be configured
    /// (using `bottommost_compression_type`) to use a more expensive compression scheme to save
    /// more space, such as `Zstd`, which many think has the best compression ratio. The default
    /// for the bottom layer is nevertheless `Lz4`
    /// (`defaults::DEFAULT_BOTTOMMOST_COMPRESSION_TYPE`). Note
    /// that tuning the bottommost layer separately only makes sense when you have free cpu,
    /// which we have in the case of the `UPSERT` usecase.
    pub compression_type: CompressionType,

    /// See `compression_type` for more information.
    pub bottommost_compression_type: CompressionType,

    /// The size of the `multi_get` and `multi_put` batches sent to RocksDB. The default is
    /// `20 * 1024` (`defaults::DEFAULT_BATCH_SIZE`).
    pub batch_size: usize,

    /// The maximum duration for the retries when performing rocksdb actions in case of retry-able errors.
    pub retry_max_duration: Duration,

    /// The interval to dump stats in `LOG`.
    pub stats_log_interval_seconds: u32,

    /// The interval to persist stats into rocksdb.
    pub stats_persist_interval_seconds: u32,

    /// The optional block cache size in MiB for optimizing rocksdb for point lookups.
    /// If not provided there will be no optimization.
    /// <https://github.com/facebook/rocksdb/blob/main/include/rocksdb/options.h#L82-L85>
    pub point_lookup_block_cache_size_mb: Option<u32>,

    /// The number of times by which unused buffers will be reduced.
    /// For example, if the number is 2, the buffers will be reduced to being twice as small,
    /// i.e. halved.
    /// Shrinking will be disabled if value is 0.
    pub shrink_buffers_by_ratio: usize,

    /// Optional write buffer manager bytes. This needs to be set to enable write buffer manager
    /// across all rocksdb instances
    pub write_buffer_manager_memory_bytes: Option<usize>,
    /// Optional write buffer manager memory limit as a percentage of cluster limit
    ///
    /// NOTE(review): the field name says "fraction" while this text says "percentage";
    /// confirm whether the expected range is 0.0-1.0 or 0-100.
    pub write_buffer_manager_memory_fraction: Option<f64>,
    /// Config to enable stalls with write buffer manager
    pub write_buffer_manager_allow_stall: bool,
}
150
151impl Default for RocksDBTuningParameters {
152 fn default() -> Self {
153 Self {
154 compaction_style: defaults::DEFAULT_COMPACTION_STYLE,
155 optimize_compaction_memtable_budget:
156 defaults::DEFAULT_OPTIMIZE_COMPACTION_MEMTABLE_BUDGET,
157 level_compaction_dynamic_level_bytes:
158 defaults::DEFAULT_LEVEL_COMPACTION_DYNAMIC_LEVEL_BYTES,
159 universal_compaction_target_ratio: defaults::DEFAULT_UNIVERSAL_COMPACTION_RATIO,
160 parallelism: defaults::DEFAULT_PARALLELISM,
161 compression_type: defaults::DEFAULT_COMPRESSION_TYPE,
162 bottommost_compression_type: defaults::DEFAULT_BOTTOMMOST_COMPRESSION_TYPE,
163 batch_size: defaults::DEFAULT_BATCH_SIZE,
164 retry_max_duration: defaults::DEFAULT_RETRY_DURATION,
165 stats_log_interval_seconds: defaults::DEFAULT_STATS_LOG_INTERVAL_S,
166 stats_persist_interval_seconds: defaults::DEFAULT_STATS_PERSIST_INTERVAL_S,
167 point_lookup_block_cache_size_mb: None,
168 shrink_buffers_by_ratio: defaults::DEFAULT_SHRINK_BUFFERS_BY_RATIO,
169 write_buffer_manager_memory_bytes: None,
170 write_buffer_manager_memory_fraction: None,
171 write_buffer_manager_allow_stall: false,
172 }
173 }
174}
175
176impl RocksDBTuningParameters {
177 /// Build a `RocksDBTuningParameters` from strings and values from LD parameters.
178 pub fn from_parameters(
179 compaction_style: CompactionStyle,
180 optimize_compaction_memtable_budget: usize,
181 level_compaction_dynamic_level_bytes: bool,
182 universal_compaction_target_ratio: i32,
183 parallelism: Option<i32>,
184 compression_type: CompressionType,
185 bottommost_compression_type: CompressionType,
186 batch_size: usize,
187 retry_max_duration: Duration,
188 stats_log_interval_seconds: u32,
189 stats_persist_interval_seconds: u32,
190 point_lookup_block_cache_size_mb: Option<u32>,
191 shrink_buffers_by_ratio: usize,
192 write_buffer_manager_memory_bytes: Option<usize>,
193 write_buffer_manager_memory_fraction: Option<f64>,
194 write_buffer_manager_allow_stall: bool,
195 ) -> Result<Self, anyhow::Error> {
196 Ok(Self {
197 compaction_style,
198 optimize_compaction_memtable_budget,
199 level_compaction_dynamic_level_bytes,
200 universal_compaction_target_ratio: if universal_compaction_target_ratio > 100 {
201 universal_compaction_target_ratio
202 } else {
203 return Err(anyhow::anyhow!(
204 "universal_compaction_target_ratio ({}) must be > 100",
205 universal_compaction_target_ratio
206 ));
207 },
208 parallelism: match parallelism {
209 Some(parallelism) => {
210 if parallelism < 1 {
211 return Err(anyhow::anyhow!(
212 "parallelism({}) must be > 1, or not specified",
213 universal_compaction_target_ratio
214 ));
215 }
216 Some(parallelism)
217 }
218 None => None,
219 },
220 compression_type,
221 bottommost_compression_type,
222 batch_size,
223 retry_max_duration,
224 stats_log_interval_seconds,
225 stats_persist_interval_seconds,
226 point_lookup_block_cache_size_mb,
227 shrink_buffers_by_ratio,
228 write_buffer_manager_memory_bytes,
229 write_buffer_manager_memory_fraction,
230 write_buffer_manager_allow_stall,
231 })
232 }
233}
234
/// The 2 primary compaction styles in `RocksDB`. See `RocksDBTuningParameters::compaction_style`
/// for more information.
#[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq, Debug)]
pub enum CompactionStyle {
    /// "Level" compaction; parsed from and displayed as `level`.
    Level,
    /// "Universal" compaction; parsed from and displayed as `universal`.
    Universal,
}
242
243impl FromStr for CompactionStyle {
244 type Err = anyhow::Error;
245
246 fn from_str(s: &str) -> Result<Self, Self::Err> {
247 let s = UncasedStr::new(s);
248 if s == "level" {
249 Ok(Self::Level)
250 } else if s == "universal" {
251 Ok(Self::Universal)
252 } else {
253 Err(anyhow::anyhow!("{} is not a supported compaction style", s))
254 }
255 }
256}
257
258impl std::fmt::Display for CompactionStyle {
259 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
260 match self {
261 CompactionStyle::Level => write!(f, "level"),
262 CompactionStyle::Universal => write!(f, "universal"),
263 }
264 }
265}
266
/// Mz-supported compression types in `RocksDB`. See `RocksDBTuningParameters::compression_type`
/// for more information.
#[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq, Debug)]
pub enum CompressionType {
    /// Parsed from and displayed as `zstd`.
    Zstd,
    /// Parsed from and displayed as `snappy`.
    Snappy,
    /// Parsed from and displayed as `lz4`.
    Lz4,
    /// Parsed from and displayed as `none`.
    None,
}
276
277impl FromStr for CompressionType {
278 type Err = anyhow::Error;
279
280 fn from_str(s: &str) -> Result<Self, Self::Err> {
281 let s = UncasedStr::new(s);
282 if s == "zstd" {
283 Ok(Self::Zstd)
284 } else if s == "snappy" {
285 Ok(Self::Snappy)
286 } else if s == "lz4" {
287 Ok(Self::Lz4)
288 } else if s == "none" {
289 Ok(Self::None)
290 } else {
291 Err(anyhow::anyhow!("{} is not a supported compression type", s))
292 }
293 }
294}
295
296impl std::fmt::Display for CompressionType {
297 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
298 match self {
299 CompressionType::Zstd => write!(f, "zstd"),
300 CompressionType::Snappy => write!(f, "snappy"),
301 CompressionType::Lz4 => write!(f, "lz4"),
302 CompressionType::None => write!(f, "none"),
303 }
304 }
305}
306
/// Write buffer manager settings, bundled together with the cluster memory
/// limit. The three `write_buffer_manager_*` fields have the same names and
/// types as the corresponding fields on `RocksDBTuningParameters`.
#[derive(Clone, Debug)]
pub struct RocksDbWriteBufferManagerConfig {
    /// Optional write buffer manager bytes. This needs to be set to enable write buffer manager
    /// across all rocksdb instances
    pub write_buffer_manager_memory_bytes: Option<usize>,
    /// Optional write buffer manager memory limit as a percentage of cluster limit
    ///
    /// NOTE(review): the field name says "fraction" while this text says "percentage";
    /// confirm whether the expected range is 0.0-1.0 or 0-100.
    pub write_buffer_manager_memory_fraction: Option<f64>,
    /// Config to enable stalls with write buffer manager
    pub write_buffer_manager_allow_stall: bool,
    /// Cluster memory limit used to calculate write buffer manager limit
    /// if `write_buffer_manager_memory_fraction` is provided
    pub cluster_memory_limit: Option<usize>,
}
320
/// The following are defaults (and default strings for LD parameters)
/// for `RocksDBTuningParameters`.
pub mod defaults {
    use std::time::Duration;

    use super::*;

    /// Level compaction; see `RocksDBTuningParameters::compaction_style` for the reasoning.
    pub const DEFAULT_COMPACTION_STYLE: CompactionStyle = CompactionStyle::Level;

    /// From here: <https://github.com/facebook/rocksdb/blob/main/include/rocksdb/options.h#L102>
    /// And then setting it to 1/3rd from our testing in production
    /// (i.e. a third of 512 MiB, in bytes, using integer division).
    pub const DEFAULT_OPTIMIZE_COMPACTION_MEMTABLE_BUDGET: usize = 512 * 1024 * 1024 / 3;

    /// Enabled by default; see
    /// `RocksDBTuningParameters::level_compaction_dynamic_level_bytes`.
    pub const DEFAULT_LEVEL_COMPACTION_DYNAMIC_LEVEL_BYTES: bool = true;

    /// From here: <https://docs.rs/rocksdb/latest/rocksdb/struct.UniversalCompactOptions.html>
    pub const DEFAULT_UNIVERSAL_COMPACTION_RATIO: i32 = 200;

    /// No explicit thread count by default; see `RocksDBTuningParameters::parallelism`.
    pub const DEFAULT_PARALLELISM: Option<i32> = None;

    /// `Lz4` by default; see `RocksDBTuningParameters::compression_type`.
    pub const DEFAULT_COMPRESSION_TYPE: CompressionType = CompressionType::Lz4;

    /// `Lz4` by default, the same as `DEFAULT_COMPRESSION_TYPE`.
    pub const DEFAULT_BOTTOMMOST_COMPRESSION_TYPE: CompressionType = CompressionType::Lz4;

    /// A reasonable default batch size for gets and puts in RocksDB. Based
    /// on advice here: <https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ>.
    /// Based on our testing we are using 20 times that.
    pub const DEFAULT_BATCH_SIZE: usize = 20 * 1024;

    /// The default max duration for retrying the retry-able errors in rocksdb.
    pub const DEFAULT_RETRY_DURATION: Duration = Duration::from_secs(1);

    /// Default is 10 minutes, from <https://docs.rs/rocksdb/latest/rocksdb/struct.Options.html#method.set_stats_dump_period_sec>
    pub const DEFAULT_STATS_LOG_INTERVAL_S: u32 = 600;

    /// Default is 10 minutes, from <https://docs.rs/rocksdb/latest/rocksdb/struct.Options.html#method.set_stats_persist_period_sec>
    pub const DEFAULT_STATS_PERSIST_INTERVAL_S: u32 = 600;

    /// Default is 0, i.e. shrinking will be disabled
    pub const DEFAULT_SHRINK_BUFFERS_BY_RATIO: usize = 0;

    /// Not allowing stalls for write buffer manager. Only applicable if write buffer manager is enabled by other flags.
    pub const DEFAULT_WRITE_BUFFER_MANAGER_ALLOW_STALL: bool = false;
}
365
#[cfg(test)]
mod tests {
    use super::*;

    /// Passing every default through `from_parameters` must succeed and yield
    /// exactly `RocksDBTuningParameters::default()`.
    #[mz_ore::test]
    fn defaults_equality() {
        // These optional settings have no constant in `defaults`; naming them
        // keeps the long positional argument list readable.
        let point_lookup_block_cache_size_mb = None;
        let write_buffer_manager_memory_bytes = None;
        let write_buffer_manager_memory_fraction = None;

        let built = RocksDBTuningParameters::from_parameters(
            defaults::DEFAULT_COMPACTION_STYLE,
            defaults::DEFAULT_OPTIMIZE_COMPACTION_MEMTABLE_BUDGET,
            defaults::DEFAULT_LEVEL_COMPACTION_DYNAMIC_LEVEL_BYTES,
            defaults::DEFAULT_UNIVERSAL_COMPACTION_RATIO,
            defaults::DEFAULT_PARALLELISM,
            defaults::DEFAULT_COMPRESSION_TYPE,
            defaults::DEFAULT_BOTTOMMOST_COMPRESSION_TYPE,
            defaults::DEFAULT_BATCH_SIZE,
            defaults::DEFAULT_RETRY_DURATION,
            defaults::DEFAULT_STATS_LOG_INTERVAL_S,
            defaults::DEFAULT_STATS_PERSIST_INTERVAL_S,
            point_lookup_block_cache_size_mb,
            defaults::DEFAULT_SHRINK_BUFFERS_BY_RATIO,
            write_buffer_manager_memory_bytes,
            write_buffer_manager_memory_fraction,
            defaults::DEFAULT_WRITE_BUFFER_MANAGER_ALLOW_STALL,
        );
        let built = built.unwrap();

        assert_eq!(built, RocksDBTuningParameters::default());
    }
}
394}