diff --git a/documentation/src/app/configuration/page.mdx b/documentation/src/app/configuration/page.mdx index af69a12..98c3e6d 100644 --- a/documentation/src/app/configuration/page.mdx +++ b/documentation/src/app/configuration/page.mdx @@ -195,6 +195,8 @@ l0_max_ssts = 16 # Max SST files in L0 before compaction max_unflushed_gb = 1.0 # Max unflushed data before forcing flush (in GB) max_concurrent_compactions = 8 # Max concurrent compaction operations flush_interval_secs = 30 # Interval between periodic flushes (in seconds) +wal_enabled = false # Whether the write-ahead log is enabled +sync_writes = false # Make every write durable on return (HA mode) ``` ### Parameters @@ -209,6 +211,8 @@ flush_interval_secs = 30 # Interval between periodic flushes (in seconds - **`wal_enabled`** (default: `false`): Controls the write-ahead log (WAL). Can be changed at any time without data loss. Without WAL, each `fsync` flushes the memtable and produces a small SST, leading to compaction churn. With WAL, fsyncs write to the WAL instead, so the memtable keeps growing and flushes produce properly-sized SSTs. Enable WAL for fsync-heavy workloads to reduce compaction overhead. +- **`sync_writes`** (default: `false`): When enabled, every write is durably flushed before returning success, eliminating the "in-memory unflushed" window between fsync calls. This does **not** change POSIX fsync semantics: with the default `false`, explicit `fsync` from clients is still honored and waits for durable persistence. The flag only governs writes *between* fsync calls. Trades per-op latency (one round-trip per concurrent batch, since concurrent writes still coalesce) for zero unflushed data on crash. Pair with `wal_enabled = true`; without the WAL, every write blocks on a memtable->L0 SST flush. See the [Eliminating the Durability Window](/durability#eliminating-the-durability-window) section for guidance on when to enable. + These are advanced settings. 
The defaults are optimized for most workloads. Only modify these if you understand LSM tree behavior and are experiencing specific performance issues. diff --git a/documentation/src/app/durability/page.mdx b/documentation/src/app/durability/page.mdx index 997f88c..e7849e2 100644 --- a/documentation/src/app/durability/page.mdx +++ b/documentation/src/app/durability/page.mdx @@ -216,6 +216,24 @@ ZeroFS follows standard POSIX durability semantics. Write operations are buffere +## Eliminating the Durability Window + +By default, writes between fsync calls live in the memtable and become durable only when the next periodic flush, capacity threshold, or explicit fsync persists them. For workloads that need every write durable on return, set `sync_writes = true` in the `[lsm]` section. + +When enabled, the commit coordinator forces a flush after every coalesced write batch before returning success to the caller. Concurrent writes still merge into a single batch and a single flush. + + +This flag does not change POSIX semantics. With `sync_writes = false` (the default), explicit `fsync` from clients is always honored and waits for durable persistence — the same behavior all major filesystems implement. The flag only governs what happens to writes *between* fsync calls. + + +The trade-off is per-operation latency. Without the WAL, each batch waits for a memtable->L0 SST flush, which is unsuitable for chatty workloads. With the WAL enabled, each batch waits for a WAL append, which is much cheaper, especially when paired with a [separate WAL store](/separate-wal) on lower-latency storage. **Always pair `sync_writes = true` with `wal_enabled = true`.** + +```toml
[lsm]
sync_writes = true
wal_enabled = true
``` + ## Protocol Considerations The durability guarantees available to applications depend on the access protocol. 
diff --git a/zerofs/Cargo.lock b/zerofs/Cargo.lock index bfae330..ba8de61 100644 --- a/zerofs/Cargo.lock +++ b/zerofs/Cargo.lock @@ -5451,7 +5451,7 @@ dependencies = [ [[package]] name = "zerofs" -version = "1.1.0" +version = "1.1.1" dependencies = [ "anyhow", "arc-swap", diff --git a/zerofs/Cargo.toml b/zerofs/Cargo.toml index b837dd4..307683b 100644 --- a/zerofs/Cargo.toml +++ b/zerofs/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "zerofs" -version = "1.1.0" +version = "1.1.1" edition = "2024" description = "A high-performance filesystem that makes S3 your primary storage with NFS, 9P, and NBD support" license = "AGPL-3.0" diff --git a/zerofs/src/cli/server.rs b/zerofs/src/cli/server.rs index bcd9ddc..fba1ac3 100644 --- a/zerofs/src/cli/server.rs +++ b/zerofs/src/cli/server.rs @@ -691,8 +691,14 @@ async fn initialize_filesystem( .await?; let db_handle = slatedb.clone(); - let fs = - ZeroFS::new_with_slatedb(slatedb, settings.max_bytes(), metrics_recorder.clone()).await?; + let sync_writes = settings.lsm.map(|c| c.sync_writes()).unwrap_or(false); + let fs = ZeroFS::new_with_slatedb( + slatedb, + settings.max_bytes(), + metrics_recorder.clone(), + sync_writes, + ) + .await?; Ok(InitResult { fs: Arc::new(fs), diff --git a/zerofs/src/config.rs b/zerofs/src/config.rs index c7e3baf..4647f4e 100644 --- a/zerofs/src/config.rs +++ b/zerofs/src/config.rs @@ -196,6 +196,18 @@ pub struct LsmConfig { /// Whether the write-ahead log (WAL) is enabled #[serde(skip_serializing_if = "Option::is_none", default)] pub wal_enabled: Option, + /// When true, every committed write is durably flushed to object storage + /// before returning success. Trades per-op latency for zero unflushed data + /// in case of a crash. Pair with `wal_enabled = true` for acceptable + /// performance. + /// + /// This does NOT change POSIX semantics: with `sync_writes = false` (the + /// default), explicit fsync from clients is still honored and + /// waits for durable persistence. 
The flag only changes what happens to + /// writes between fsync calls, making them durable on return rather than + /// buffered until the next flush. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub sync_writes: Option, } impl LsmConfig { @@ -246,6 +258,10 @@ impl LsmConfig { pub fn wal_enabled(&self) -> bool { self.wal_enabled.unwrap_or(false) } + + pub fn sync_writes(&self) -> bool { + self.sync_writes.unwrap_or(false) + } } #[derive(Debug, Deserialize, Serialize, Clone)] @@ -634,6 +650,11 @@ impl Settings { toml_string.push_str("# max_concurrent_compactions = 8 # Max concurrent compaction operations (default: 8, min: 1)\n"); toml_string.push_str("# flush_interval_secs = 30 # Interval between periodic flushes in seconds (default: 30, min: 5)\n"); toml_string.push_str("# wal_enabled = false # Whether the write-ahead log (WAL) is enabled (default: false)\n"); + toml_string.push_str("# sync_writes = false # Flush every write to object storage before returning success (default: false).\n"); + toml_string.push_str(" # Does NOT affect POSIX fsync semantics: explicit fsync from clients\n"); + toml_string.push_str(" # is always honored. This flag only governs writes between fsync calls. 
When on,\n"); + toml_string.push_str(" # they become durable on return instead of buffered until the next periodic flush.\n"); + toml_string.push_str(" # wal_enabled = true is strongly recommended when this is on.\n"); toml_string.push_str("\n# Optional separate WAL (Write-Ahead Log) object store\n"); toml_string.push_str("# Use a faster/closer store for WAL to improve fsync latency\n"); diff --git a/zerofs/src/fs/mod.rs b/zerofs/src/fs/mod.rs index 5b7612c..2b11717 100644 --- a/zerofs/src/fs/mod.rs +++ b/zerofs/src/fs/mod.rs @@ -108,6 +108,7 @@ impl ZeroFS { slatedb: SlateDbHandle, max_bytes: u64, metrics_recorder: Option>, + sync_writes: bool, ) -> anyhow::Result { let lock_manager = Arc::new(KeyedLockManager::new()); @@ -176,7 +177,12 @@ impl ZeroFS { let directory_store = DirectoryStore::new(db.clone()); let inode_store = InodeStore::new(db.clone(), next_inode_id); let tombstone_store = TombstoneStore::new(db.clone()); - let write_coordinator = WriteCoordinator::new(db.clone(), inode_store.clone()); + let write_coordinator = WriteCoordinator::new( + db.clone(), + inode_store.clone(), + flush_coordinator.clone(), + sync_writes, + ); let fs = Self { db: db.clone(), @@ -198,6 +204,11 @@ impl ZeroFS { #[cfg(test)] pub async fn new_in_memory() -> anyhow::Result { + Self::new_in_memory_with_sync_writes(false).await + } + + #[cfg(test)] + pub async fn new_in_memory_with_sync_writes(sync_writes: bool) -> anyhow::Result { use crate::block_transformer::ZeroFsBlockTransformer; use crate::config::CompressionConfig; use slatedb::BlockTransformer; @@ -219,7 +230,13 @@ impl ZeroFS { .await?, ); - Self::new_with_slatedb(SlateDbHandle::ReadWrite(slatedb), u64::MAX, None).await + Self::new_with_slatedb( + SlateDbHandle::ReadWrite(slatedb), + u64::MAX, + None, + sync_writes, + ) + .await } #[cfg(test)] @@ -249,6 +266,7 @@ impl ZeroFS { SlateDbHandle::ReadOnly(ArcSwap::new(reader)), u64::MAX, None, + false, ) .await } @@ -2767,9 +2785,10 @@ mod tests { .unwrap(), ); - let fs_rw = 
ZeroFS::new_with_slatedb(SlateDbHandle::ReadWrite(slatedb), u64::MAX, None) - .await - .unwrap(); + let fs_rw = + ZeroFS::new_with_slatedb(SlateDbHandle::ReadWrite(slatedb), u64::MAX, None, false) + .await + .unwrap(); let test_inode_id = fs_rw.inode_store.allocate(); let file_inode = FileInode { diff --git a/zerofs/src/fs/write_coordinator.rs b/zerofs/src/fs/write_coordinator.rs index 88c2b81..136118c 100644 --- a/zerofs/src/fs/write_coordinator.rs +++ b/zerofs/src/fs/write_coordinator.rs @@ -8,6 +8,7 @@ use crate::db::{Db, Transaction}; use crate::fs::errors::FsError; +use crate::fs::flush_coordinator::FlushCoordinator; use crate::fs::key_codec::KeyCodec; use crate::fs::store::InodeStore; use crate::task::spawn_named; @@ -24,7 +25,12 @@ pub struct WriteCoordinator { } impl WriteCoordinator { - pub fn new(db: Arc, inode_store: InodeStore) -> Self { + pub fn new( + db: Arc, + inode_store: InodeStore, + flush_coordinator: FlushCoordinator, + sync_writes: bool, + ) -> Self { // Capture synchronously: the spawned task starts later, by which point // callers may already have bumped `next_id`. If we captured inside the // task we'd over-shoot and skip the first emit. 
@@ -32,7 +38,14 @@ impl WriteCoordinator { let (sender, receiver) = mpsc::unbounded_channel(); spawn_named( "commit-worker", - worker_loop(db, inode_store, receiver, initial_counter), + worker_loop( + db, + inode_store, + flush_coordinator, + sync_writes, + receiver, + initial_counter, + ), ); Self { sender } } @@ -49,6 +62,8 @@ impl WriteCoordinator { async fn worker_loop( db: Arc, inode_store: InodeStore, + flush_coordinator: FlushCoordinator, + sync_writes: bool, mut rx: mpsc::UnboundedReceiver<(Transaction, Reply)>, initial_counter: u64, ) { @@ -79,7 +94,7 @@ async fn worker_loop( last_emitted_counter = current; } - let result = db + let mut result = db .write_with_options( merged, &WriteOptions { @@ -88,6 +103,15 @@ async fn worker_loop( ) .await .map_err(|_| FsError::IoError); + + // In sync_writes mode, force a flush after the batch is committed so + // every caller in the batch only sees Ok once their data is durable in + // object storage. FlushCoordinator coalesces concurrent flush requests + // into a single db.flush() + if sync_writes && result.is_ok() { + result = flush_coordinator.flush().await; + } + for reply in replies { let _ = reply.send(result); } @@ -144,6 +168,32 @@ mod tests { } } + #[tokio::test] + async fn sync_writes_commits_persist() { + let fs = ZeroFS::new_in_memory_with_sync_writes(true).await.unwrap(); + let coord = fs.write_coordinator.clone(); + let mut handles = Vec::new(); + for i in 0u64..16 { + let c = coord.clone(); + handles.push(tokio::spawn(async move { + let mut txn = Transaction::new(); + txn.put_bytes(&Bytes::from(format!("sync/{i}")), Bytes::from(vec![2u8; 8])); + c.commit(txn).await + })); + } + for h in handles { + h.await.unwrap().unwrap(); + } + for i in 0u64..16 { + let v = fs + .db + .get_bytes(&Bytes::from(format!("sync/{i}"))) + .await + .unwrap(); + assert!(v.is_some(), "sync/{i} not durable after sync_writes commit"); + } + } + #[tokio::test] async fn counter_emitted_only_when_advanced() { let fs = 
make_fs().await; diff --git a/zerofs/src/posix_tests.rs b/zerofs/src/posix_tests.rs index 0103c81..f761e26 100644 --- a/zerofs/src/posix_tests.rs +++ b/zerofs/src/posix_tests.rs @@ -1340,6 +1340,7 @@ mod tests { )), 1_000_000, None, + false, ) .await .unwrap(), @@ -1399,6 +1400,7 @@ mod tests { )), 1_000_000, None, + false, ) .await .unwrap(), @@ -1455,6 +1457,7 @@ mod tests { )), 1_000_000, None, + false, ) .await .unwrap(), diff --git a/zerofs/tests/failpoints/mod.rs b/zerofs/tests/failpoints/mod.rs index a617656..e049dce 100644 --- a/zerofs/tests/failpoints/mod.rs +++ b/zerofs/tests/failpoints/mod.rs @@ -53,7 +53,7 @@ impl CrashTestContext { ); Arc::new( - ZeroFS::new_with_slatedb(SlateDbHandle::ReadWrite(slatedb), u64::MAX, None) + ZeroFS::new_with_slatedb(SlateDbHandle::ReadWrite(slatedb), u64::MAX, None, false) .await .unwrap(), )