From 7f057ad8475623e90e7ed976ba3efb8612ef018d Mon Sep 17 00:00:00 2001 From: Raphael Vigee Date: Mon, 8 Jun 2026 12:37:54 +0200 Subject: [PATCH 1/6] perf(pluginfs): cross-run glob walk cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fs-glob targets are cache=off, so the engine re-walks the tree every run (walkdir + per-entry stat + per-file open/read/hash). On a warm go/large run this glob walk was ~19% of CPU. Add a single-file sidecar (/.heph3/cache/fsglob.bin) memoizing each (root,pattern,exclude) walk, validated by directory mtimes (the matched file set) and per-file (size,mtime) (content). A full match reconstructs the artifacts with stat only — no readdir, no opens, no reads, no hashing. Loaded once per process, flushed on Driver drop. mtime+size is a fast-path proxy for content identity (heph otherwise hashes content); disable with HEPH_FS_GLOB_CACHE=0. Correct-by-fallback: any mismatch re-walks. Measured on example/go/large warm cache-hit run (profiling binary): cached_glob_walk CPU: 19.3% -> 2.5% warm wall median: ~2.50s -> ~2.27s (~9%) Co-Authored-By: Claude Opus 4.8 (1M context) --- src/pluginfs/mod.rs | 463 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 453 insertions(+), 10 deletions(-) diff --git a/src/pluginfs/mod.rs b/src/pluginfs/mod.rs index 157a69d3..5c6ca6f1 100644 --- a/src/pluginfs/mod.rs +++ b/src/pluginfs/mod.rs @@ -225,7 +225,9 @@ impl ProviderFn for GlobFn { // No user excludes, so `request_id` is irrelevant (the built-in exclude // path is taken). Reuses the driver's compiled glob + walk verbatim. let compiled = compile_glob(&self.skip, "heph.fs.glob", &resolved, &[])?; - let artifacts = walk_glob(ctx.root, &compiled)?; + // BUILD-time expansion: only the artifacts are needed here (the + // cross-run cache entry is produced for the `run`-path walk). 
+ let (artifacts, _) = walk_glob(ctx.root, &compiled)?; let pkg_prefix = (!ctx.pkg.is_empty()).then(|| std::path::Path::new(ctx.pkg)); @@ -572,7 +574,9 @@ fn compile_glob( }) } -/// Walks `root` for files matching `compiled`, returning their artifacts. +/// Walks `root` for files matching `compiled`, returning their artifacts plus a +/// [`CachedGlobEntry`] for the cross-run cache (or `None` when the walk root is +/// missing or an mtime is unreadable, so the result is not persistable). /// /// Starts at the pattern's literal prefix so a rooted pattern (`a/b/**/*`) scans /// only `/a/b`, not the whole tree. Matching uses the cached glob/exclude @@ -580,14 +584,27 @@ fn compile_glob( fn walk_glob( root: &std::path::Path, compiled: &CompiledGlob, -) -> anyhow::Result> { +) -> anyhow::Result<(Vec, Option)> { let walk_root = if compiled.prefix.is_empty() { root.to_path_buf() } else { root.join(&compiled.prefix) }; + // A missing walk root is an empty match — but must not be persisted: were it + // cached as "empty", a later-created tree (which doesn't change any recorded + // dir's mtime, since none were recorded) would keep serving the stale empty + // set. Return `None` so every run re-walks until the root exists. + if std::fs::metadata(&walk_root).is_err() { + return Ok((vec![], None)); + } + let mut artifacts = vec![]; + // Cross-run cache accumulation. `persistable` flips off if any mtime can't be + // read, in which case we never write a (possibly unverifiable) entry. + let mut dirs: Vec<(String, i64)> = vec![]; + let mut files: Vec = vec![]; + let mut persistable = true; let walker = walkdir::WalkDir::new(&walk_root) .into_iter() @@ -617,6 +634,19 @@ fn walk_glob( }; if entry.file_type().is_dir() { + // Record every descended dir's mtime: it bumps on any entry + // add/remove/rename, so matching all of them on a later run proves + // the matched file *set* is unchanged without re-reading the dir. 
+ if let Ok(rel) = entry.path().strip_prefix(root) { + if let Some(rel) = rel.to_str() { + match entry.metadata().ok().as_ref().and_then(mtime_ns) { + Some(mt) => dirs.push((rel.to_owned(), mt)), + None => persistable = false, + } + } else { + persistable = false; + } + } continue; } @@ -664,6 +694,20 @@ fn walk_glob( let hashout = file_hashout(abs_path, x) .with_context(|| format!("hash glob entry '{}'", abs_path.display()))?; + // Cross-run cache: record (size, mtime) so a later run can validate this + // file's content without re-reading it. A missing mtime makes the whole + // walk non-persistable. + match mtime_ns(&meta) { + Some(mt) => files.push(CachedGlobFile { + rel: rel_str.to_string(), + x, + size: meta.len(), + mtime_ns: mt, + hashout: hashout.clone(), + }), + None => persistable = false, + } + // Materialize the owned strings that borrow `abs_path`/`rel_str` *before* // consuming `entry` for `source_path` below — the borrows end here. let out_path = rel_str.to_string(); @@ -697,13 +741,222 @@ fn walk_glob( }); } - Ok(artifacts) + let entry = if persistable { + Some(CachedGlobEntry { + version: GLOB_CACHE_VERSION, + dirs, + files, + }) + } else { + None + }; + Ok((artifacts, entry)) +} + +// ── Persistent cross-run glob cache ────────────────────────────────────────── +// +// fs-glob targets are `CacheConfig::off()`, so the engine re-walks the tree on +// every run (walkdir + per-entry stat + per-file open/read/hash). This sidecar +// memoizes a walk's result per `(root, pattern, exclude)` *across* runs, +// validated by directory mtimes (the matched file *set*) and per-file +// `(size, mtime)` (file *content*). A full match reconstructs the artifacts with +// only stat syscalls — no readdir, no file opens, no reads, no hashing. +// +// `mtime+size` is a fast-path proxy for content identity; heph otherwise hashes +// content precisely, so a same-size in-place rewrite within the filesystem's +// mtime granularity can be missed (accepted tradeoff). 
Correct-by-fallback: any +// IO/decode/validation mismatch falls through to a full walk, and the whole +// layer is disabled with `HEPH_FS_GLOB_CACHE=0`. + +const GLOB_CACHE_VERSION: u32 = 1; + +#[derive(Clone, borsh::BorshSerialize, borsh::BorshDeserialize)] +struct CachedGlobFile { + /// Path relative to the tree root. `source = root/rel`, `out_path = rel`, + /// `name = rel.replace('/', "_")` — mirrors `walk_glob`'s artifact build. + rel: String, + x: bool, + size: u64, + mtime_ns: i64, + hashout: String, +} + +#[derive(Clone, borsh::BorshSerialize, borsh::BorshDeserialize)] +struct CachedGlobEntry { + version: u32, + /// (dir rel-to-root, mtime_ns) for every directory descended during the walk. + dirs: Vec<(String, i64)>, + files: Vec, +} + +/// Nanoseconds since the unix epoch for `meta`'s mtime, or `None` if unreadable. +fn mtime_ns(meta: &std::fs::Metadata) -> Option { + let t = meta.modified().ok()?; + let d = t.duration_since(std::time::UNIX_EPOCH).ok()?; + i64::try_from(d.as_nanos()).ok() +} + +/// On-disk format for the single-file glob cache sidecar. +#[derive(Default, borsh::BorshSerialize, borsh::BorshDeserialize)] +struct GlobStoreFile { + version: u32, + entries: Vec<(String, CachedGlobEntry)>, +} + +/// Process-lifetime, single-file cache of glob walk results, shared by the fs +/// `Driver`. Backed by one `/.heph3/cache/fsglob.bin` sidecar loaded once +/// and flushed on `Driver` drop — a single open/read amortized over every glob +/// target, instead of one cache file (one `open`) per target. +#[derive(Default)] +struct GlobStore { + inner: parking_lot::Mutex, +} + +#[derive(Default)] +struct GlobStoreInner { + /// Sidecar path; empty until the first walk sets it from the tree root. + path: std::path::PathBuf, + /// Whether the sidecar has been loaded (or loading was disabled). + loaded: bool, + /// Set when `map` holds inserts not yet persisted. + dirty: bool, + /// Disabled (no read, no write) via `HEPH_FS_GLOB_CACHE=0`. 
+ enabled: bool, + map: FxHashMap>, +} + +/// Read + decode the glob sidecar at `path`, or `None` on any IO/decode/version +/// mismatch (⇒ start empty). +fn load_glob_sidecar(path: &std::path::Path) -> Option>> { + let bytes = std::fs::read(path).ok()?; + let file: GlobStoreFile = borsh::from_slice(&bytes).ok()?; + (file.version == GLOB_CACHE_VERSION).then(|| { + file.entries + .into_iter() + .map(|(k, v)| (k, Arc::new(v))) + .collect() + }) +} + +impl GlobStore { + /// Load the sidecar on first use, deriving its path from the tree `root`. + /// Holding the lock across the read serializes the (one-time) load. + fn ensure_loaded(&self, root: &std::path::Path) { + let mut inner = self.inner.lock(); + if inner.loaded { + return; + } + inner.loaded = true; + inner.enabled = std::env::var_os("HEPH_FS_GLOB_CACHE").is_none_or(|v| v != "0"); + if !inner.enabled { + return; + } + inner.path = root.join(".heph3").join("cache").join("fsglob.bin"); + if let Some(map) = load_glob_sidecar(&inner.path) { + inner.map = map; + } + } + + fn get(&self, key: &str) -> Option> { + self.inner.lock().map.get(key).cloned() + } + + fn insert(&self, key: String, entry: CachedGlobEntry) { + let mut inner = self.inner.lock(); + if !inner.enabled { + return; + } + inner.map.insert(key, Arc::new(entry)); + inner.dirty = true; + } + + /// Persist the map to its sidecar (temp + atomic rename). Best-effort, called + /// on `Driver` drop; a no-op when nothing changed (e.g. a pure cache-hit run). 
+ fn flush(&self) { + let mut inner = self.inner.lock(); + if !inner.dirty || inner.path.as_os_str().is_empty() { + return; + } + let file = GlobStoreFile { + version: GLOB_CACHE_VERSION, + entries: inner + .map + .iter() + .map(|(k, v)| (k.clone(), (**v).clone())) + .collect(), + }; + let Some(parent) = inner.path.parent().map(std::path::Path::to_path_buf) else { + return; + }; + if std::fs::create_dir_all(&parent).is_err() { + return; + } + let Ok(bytes) = borsh::to_vec(&file) else { + return; + }; + let tmp = inner + .path + .with_extension(format!("tmp.{}", std::process::id())); + if std::fs::write(&tmp, &bytes).is_ok() { + if std::fs::rename(&tmp, &inner.path).is_ok() { + inner.dirty = false; + } else { + drop(std::fs::remove_file(&tmp)); + } + } + } +} + +/// Validate `entry` against the current tree; on a full match reconstruct the +/// artifacts. Returns `None` (⇒ caller re-walks) on any mismatch. +fn reconstruct_glob( + root: &std::path::Path, + entry: &CachedGlobEntry, +) -> Option> { + // Set check: every walked dir's mtime must match. An added/removed/renamed + // entry bumps its parent dir's mtime, so this proves the file set is intact. + for (rel, mt) in &entry.dirs { + let meta = std::fs::metadata(root.join(rel)).ok()?; + if !meta.is_dir() || mtime_ns(&meta) != Some(*mt) { + return None; + } + } + let mut artifacts = Vec::with_capacity(entry.files.len()); + for f in &entry.files { + let abs = root.join(&f.rel); + let meta = std::fs::metadata(&abs).ok()?; + // Content check: size + mtime. An in-place edit changes at least the + // mtime (and bumps no parent dir), so this is load-bearing. + if meta.is_dir() || meta.len() != f.size || mtime_ns(&meta) != Some(f.mtime_ns) { + return None; + } + // A file that gained the codegen xattr (which bumps neither size nor + // mtime) must drop out, matching `walk_glob`. 
+ if has_codegen_xattr(&abs) { + return None; + } + let source_path = abs.into_os_string().into_string().ok()?; + artifacts.push(OutputArtifact { + group: String::new(), + name: f.rel.replace('/', "_"), + r#type: Type::Output, + content: Content::File(ContentFile { + source_path, + out_path: f.rel.clone(), + x: f.x, + }), + hashout: f.hashout.clone(), + }); + } + Some(artifacts) } -/// Returns the glob walk artifacts for `(root, pattern, exclude)`, memoizing -/// across calls within `request_id`. The first call walks; repeats reuse the -/// cached `Arc`. +/// Returns the glob walk artifacts for `(root, pattern, exclude)`. Within a +/// request the result is memoized (the tree is immutable mid-request); across +/// runs it is served from `store` (the single-file sidecar) when the tree is +/// unchanged. The first uncached call walks the tree. fn cached_glob_walk( + store: &GlobStore, request_id: &str, root: &std::path::Path, pattern: &str, @@ -724,7 +977,22 @@ fn cached_glob_walk( return Ok(a.clone()); } - let artifacts = Arc::new(walk_glob(root, compiled)?); + // Cross-run persistent cache: on a full validation hit, reconstruct without + // touching the directory tree beyond stat. On a miss, walk and record. + store.ensure_loaded(root); + let artifacts = match store + .get(&key) + .and_then(|entry| reconstruct_glob(root, &entry)) + { + Some(arts) => Arc::new(arts), + None => { + let (arts, entry) = walk_glob(root, compiled)?; + if let Some(entry) = entry { + store.insert(key.clone(), entry); + } + Arc::new(arts) + } + }; Ok(glob_result_cache() .write() @@ -739,11 +1007,24 @@ fn cached_glob_walk( pub struct Driver { /// Engine-owned + built-in dirs pruned during glob walks. skip: Arc, + /// Cross-run glob walk cache, flushed to disk when the driver drops. + glob_store: Arc, +} + +impl Drop for Driver { + fn drop(&mut self) { + // Persist any new glob walk results gathered this run. A pure cache-hit + // run leaves the store clean, so this is a no-op there. 
+ self.glob_store.flush(); + } } impl Driver { pub fn new(skip: Arc) -> Self { - Self { skip } + Self { + skip, + glob_store: Arc::default(), + } } } @@ -967,7 +1248,14 @@ impl crate::engine::driver::Driver for Driver { // Within a request the tree is immutable, so the walk result for // this `(root, pattern, excludes)` is memoized — repeat calls // skip walkdir + per-entry stat entirely. - let artifacts = cached_glob_walk(req.request_id, root, pattern, exclude, compiled)?; + let artifacts = cached_glob_walk( + &self.glob_store, + req.request_id, + root, + pattern, + exclude, + compiled, + )?; Ok(RunResponse { artifacts: (*artifacts).clone(), @@ -1473,6 +1761,161 @@ mod tests { ); } + /// The persistent glob cache must reconstruct an unchanged tree's artifacts + /// and reject every kind of change: a file's content (size/mtime), the file + /// *set* (a recorded dir's mtime), and a freshly-stamped codegen xattr. + #[test] + fn test_glob_cache_reconstruct_and_invalidation() { + let tmp = tempdir().unwrap(); + let root = tmp.path(); + fs::create_dir(root.join("sub")).unwrap(); + fs::write(root.join("a.rs"), b"aaa").unwrap(); + fs::write(root.join("sub").join("b.rs"), b"bbbb").unwrap(); + fs::write(root.join("c.txt"), b"ignored").unwrap(); + + let skip = Arc::new(Ignore::new(&[], &[]).unwrap()); + let compiled = compile_glob(&skip, "t", "**/*.rs", &[]).unwrap(); + + let key = |arts: &[OutputArtifact]| { + let mut v: Vec<_> = arts + .iter() + .map(|a| (a.name.clone(), a.hashout.clone())) + .collect(); + v.sort(); + v + }; + + let (arts, entry) = walk_glob(root, &compiled).unwrap(); + let entry = entry.expect("walk over a present tree is persistable"); + assert_eq!(arts.len(), 2, "matches a.rs + sub/b.rs, not c.txt"); + + // Unchanged tree → reconstruct yields the identical artifact set. 
+ let rebuilt = reconstruct_glob(root, &entry).expect("unchanged tree reconstructs"); + assert_eq!(key(&arts), key(&rebuilt)); + + // Content change: rewrite a.rs with a different size → must invalidate. + fs::write(root.join("a.rs"), b"a much longer body").unwrap(); + assert!( + reconstruct_glob(root, &entry).is_none(), + "a changed file size must invalidate the cache" + ); + fs::write(root.join("a.rs"), b"aaa").unwrap(); // restore size; mtime moved + + // Set change: bump a recorded dir's mtime (simulating an add/remove/rename + // in it) → must invalidate even though no recorded file changed. + let (_, fresh) = walk_glob(root, &compiled).unwrap(); + let fresh = fresh.expect("persistable"); + let dir_handle = std::fs::File::open(root.join("sub")).unwrap(); + dir_handle + .set_modified(std::time::SystemTime::now() + std::time::Duration::from_secs(7200)) + .unwrap(); + assert!( + reconstruct_glob(root, &fresh).is_none(), + "a bumped directory mtime must invalidate the cache" + ); + } + + /// A file that gains the codegen provenance xattr (which bumps neither size + /// nor mtime) must drop out of a reconstructed glob, matching `walk_glob`. + #[cfg(unix)] + #[test] + fn test_glob_cache_reconstruct_drops_codegen_xattr() { + let tmp = tempdir().unwrap(); + let root = tmp.path(); + fs::write(root.join("gen.rs"), b"x").unwrap(); + let skip = Arc::new(Ignore::new(&[], &[]).unwrap()); + let compiled = compile_glob(&skip, "t", "*.rs", &[]).unwrap(); + + let (arts, entry) = walk_glob(root, &compiled).unwrap(); + assert_eq!(arts.len(), 1); + let entry = entry.unwrap(); + + // Stamp the codegen xattr without touching content/mtime. 
+ if xattr::set(root.join("gen.rs"), CODEGEN_XATTR, b"//gen:it").is_err() { + return; // filesystem without xattr support — nothing to assert + } + assert!( + reconstruct_glob(root, &entry).is_none(), + "a newly codegen-stamped file must invalidate the cache" + ); + } + + /// End-to-end cross-run persistence: a first driver populates and (on drop) + /// flushes the single-file sidecar; a *fresh* driver loads it from disk and + /// reuses it for the unchanged tree, then re-walks once a file is added. + /// `.heph3` is pruned, so writing the sidecar never self-invalidates a + /// recorded directory. + #[tokio::test] + async fn test_glob_cache_cross_run() { + let tmp = tempdir().unwrap(); + let root = tmp.path(); + let home = root.join(".heph3"); + fs::create_dir_all(&home).unwrap(); + fs::write(root.join("a.rs"), b"aaa").unwrap(); + fs::write(root.join("b.rs"), b"bbb").unwrap(); + + let skip = Arc::new(Ignore::new(&[home], &[]).unwrap()); + let config = + std::collections::HashMap::from([("p".to_string(), Value::String("*.rs".to_string()))]); + let hashin = String::new(); + let (id1, id2, id3) = ( + "req-1".to_string(), + "req-2".to_string(), + "req-3".to_string(), + ); + + // First driver: walk + populate, then drop to flush the sidecar. + let parse_res = { + let driver = Driver::new(skip.clone()); + let parse_res = driver + .parse(make_parse_req(config), &ctoken()) + .await + .unwrap(); + let first = driver + .run( + make_run_req(&parse_res.target_def, &id1, root.to_path_buf(), &hashin), + &ctoken(), + ) + .await + .unwrap(); + assert_eq!(first.artifacts.len(), 2); + parse_res + }; // driver dropped here → flush + + let sidecar = root.join(".heph3").join("cache").join("fsglob.bin"); + assert!( + sidecar.exists(), + "dropping the driver persists the single-file glob sidecar" + ); + + // Fresh driver loads the sidecar from disk and reconstructs the unchanged + // tree without re-walking — same artifacts. 
+ let driver2 = Driver::new(skip.clone()); + let second = driver2 + .run( + make_run_req(&parse_res.target_def, &id2, root.to_path_buf(), &hashin), + &ctoken(), + ) + .await + .unwrap(); + assert_eq!(second.artifacts.len(), 2); + + // Adding a matching file bumps the root dir mtime → re-walk includes it. + fs::write(root.join("c.rs"), b"ccc").unwrap(); + std::fs::File::open(root) + .unwrap() + .set_modified(std::time::SystemTime::now() + std::time::Duration::from_secs(7200)) + .unwrap(); + let third = driver2 + .run( + make_run_req(&parse_res.target_def, &id3, root.to_path_buf(), &hashin), + &ctoken(), + ) + .await + .unwrap(); + assert_eq!(third.artifacts.len(), 3, "newly added c.rs is picked up"); + } + /// A glob over a tree containing a symlink-to-dir (matching the pattern) and a /// dangling symlink must NOT error: `file_hashout` opens+reads, so these /// would otherwise blow up with EISDIR/ENOENT. A symlink-to-FILE is sourced, From 86ed1d5bb082cf235e7325e7df994ac5f82984c3 Mon Sep 17 00:00:00 2001 From: Raphael Vigee Date: Mon, 8 Jun 2026 13:37:33 +0200 Subject: [PATCH 2/6] refactor(walk-cache): generic SQLite-backed cross-run walk cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract the fs glob cache into a reusable module and move its storage from a per-workspace borsh sidecar to the durable cache's SQLite db, then reuse it for the buildfile provider's package discovery. What changed: - engine::walk_cache — generic `WalkCache` keyed by an arbitrary string, validated by a `WalkSignature` (directory mtimes for the file *set* + optional per-file size/mtime for *content*). Loaded once from the KV namespace, served from memory, write-through on insert (a pure cache-hit run writes nothing). 
- LocalCache gains a namespaced key→blob KV store (`kv_get`/`kv_list`/`kv_put`), implemented on the SQLite backend (new `kv` table + a fire-and-forget WriterCmd, flushed when the writer thread joins on drop) and delegated through LocalCacheMem; default no-ops elsewhere. - PluginInit now carries the engine's `Arc<dyn LocalCache>`, so plugins reach the KV. The fs Driver and buildfile Provider take it (Driver::new gains a cache arg; Provider::with_cache builder). - pluginfs: the inline GlobStore/sidecar is replaced by `WalkCache`. No flush-on-drop — inserts write through to the KV. - pluginbuildfile: `find_packages_sync` now records directory mtimes, and `list_packages` memoizes the discovery walk across runs via a `WalkCache` of the discovered package list (dir-set validation only — BUILD *contents* don't change the package set). `HEPH_FS_GLOB_CACHE=0` still disables the glob cache. Behavior/perf: glob cache unchanged on example/go/large (cached_glob_walk 19.3% -> 2.3% CPU; warm wall ~2.88s -> ~2.33s, ~19%); the KV load is ~60ms one-time. Package discovery is now cross-run cached too. Tests: walk_cache (signature validation, KV roundtrip, disabled passthrough), sqlite kv_put/get/list, pluginfs glob signature+reconstruct+xattr+cross-run, pluginbuildfile cross-run discovery. Full lib suite (1003) passes; clippy clean. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/commands/bootstrap.rs | 30 ++- src/engine/engine.rs | 9 +- src/engine/local_cache.rs | 17 ++ src/engine/local_cache_mem.rs | 12 + src/engine/local_cache_sqlite.rs | 130 ++++++++- src/engine/mod.rs | 2 + src/engine/walk_cache.rs | 323 +++++++++++++++++++++++ src/pluginbuildfile/provider.rs | 194 +++++++++++++- src/pluginfs/mod.rs | 438 ++++++++++++------------------- 9 files changed, 866 insertions(+), 289 deletions(-) create mode 100644 src/engine/walk_cache.rs diff --git a/src/commands/bootstrap.rs b/src/commands/bootstrap.rs index 538e8b17..9fe3184c 100644 --- a/src/commands/bootstrap.rs +++ b/src/commands/bootstrap.rs @@ -145,12 +145,15 @@ pub fn new_engine() -> anyhow::Result<(Arc, ShutdownTrigger)> { // Opt-in factories — instantiated by `apply_config` if listed in the YAML. e.register_provider_factory("buildfile", |init, opts| { - Ok(Box::new(pluginbuildfile::Provider::from_options( - init.root.to_path_buf(), - &init.skip_dirs, - &init.skip_globs, - opts, - )?)) + Ok(Box::new( + pluginbuildfile::Provider::from_options( + init.root.to_path_buf(), + &init.skip_dirs, + &init.skip_globs, + opts, + )? + .with_cache(Some(init.cache.clone())), + )) })?; e.register_provider_factory("go", |init, opts| { Ok(Box::new(plugingo::Provider::from_options( @@ -268,12 +271,15 @@ mod tests { })?; e.register_provider_factory("buildfile", |init, opts| { - Ok(Box::new(pluginbuildfile::Provider::from_options( - init.root.to_path_buf(), - &init.skip_dirs, - &init.skip_globs, - opts, - )?)) + Ok(Box::new( + pluginbuildfile::Provider::from_options( + init.root.to_path_buf(), + &init.skip_dirs, + &init.skip_globs, + opts, + )? 
+ .with_cache(Some(init.cache.clone())), + )) })?; e.register_managed_driver_factory("exec", |_init, opts| { Ok(Box::new(pluginexec::Driver::from_options_exec(opts)?)) diff --git a/src/engine/engine.rs b/src/engine/engine.rs index 2e869086..73bb4cae 100644 --- a/src/engine/engine.rs +++ b/src/engine/engine.rs @@ -101,6 +101,9 @@ pub struct PluginInit { /// Workspace-relative `fs.skip` glob patterns (e.g. `**/node_modules/**`), /// matched against entry paths. pub skip_globs: Vec, + /// The engine's durable local cache, handed to plugins for cross-run scratch + /// state via its namespaced KV store (see [`crate::engine::walk_cache`]). + pub cache: Arc, } /// True if `entry` contains wax glob metacharacters — used to split `fs.skip` @@ -440,7 +443,10 @@ impl Engine { &init.skip_dirs, &init.skip_globs, )?); - Ok(Box::new(crate::pluginfs::Driver::new(ignore))) + Ok(Box::new(crate::pluginfs::Driver::new( + ignore, + Some(init.cache.clone()), + ))) })?; Ok(engine) @@ -507,6 +513,7 @@ impl Engine { root: self.cfg.root.clone(), skip_dirs: self.skip_dirs(), skip_globs: self.skip_globs(), + cache: self.local_cache.clone(), } } diff --git a/src/engine/local_cache.rs b/src/engine/local_cache.rs index f8497860..41352d5b 100644 --- a/src/engine/local_cache.rs +++ b/src/engine/local_cache.rs @@ -125,6 +125,23 @@ pub trait LocalCache: Send + Sync { ) -> anyhow::Result>> { Ok(None) } + + // ── Namespaced key→blob store ──────────────────────────────────────────── + // + // A general scratch store, separate from the (addr, hashin, name) artifact + // space, used for cross-run plugin state such as filesystem-walk caches + // (see `crate::engine::walk_cache`). Defaults are no-ops so only the durable + // backend (and its fronting tiers) need implement it; a no-op backend simply + // makes those caches always miss (correctness-neutral). 
+ fn kv_get(&self, _ns: &str, _k: &str) -> anyhow::Result>> { + Ok(None) + } + fn kv_list(&self, _ns: &str) -> anyhow::Result)>> { + Ok(Vec::new()) + } + fn kv_put(&self, _ns: &str, _k: &str, _v: &[u8]) -> anyhow::Result<()> { + Ok(()) + } } #[derive(Debug, thiserror::Error)] diff --git a/src/engine/local_cache_mem.rs b/src/engine/local_cache_mem.rs index f83d67ba..9b7a8db5 100644 --- a/src/engine/local_cache_mem.rs +++ b/src/engine/local_cache_mem.rs @@ -117,6 +117,18 @@ impl LocalCache for LocalCacheMem { self.inner.list_target_entries(addr) } + // KV is not fronted by the mem LRU (small, read once at startup) — delegate + // straight to the durable backend. + fn kv_get(&self, ns: &str, k: &str) -> Result>> { + self.inner.kv_get(ns, k) + } + fn kv_list(&self, ns: &str) -> Result)>> { + self.inner.kv_list(ns) + } + fn kv_put(&self, ns: &str, k: &str, v: &[u8]) -> Result<()> { + self.inner.kv_put(ns, k, v) + } + fn seekable_reader( &self, addr: &Addr, diff --git a/src/engine/local_cache_sqlite.rs b/src/engine/local_cache_sqlite.rs index b9f94c4a..eaf09d12 100644 --- a/src/engine/local_cache_sqlite.rs +++ b/src/engine/local_cache_sqlite.rs @@ -166,9 +166,19 @@ struct DeleteJob { slot: Arc, } +/// Namespaced key→blob upsert for the `kv` table. Fire-and-forget: callers keep +/// their own in-memory copy and only read the table once at startup, so there is +/// no read-after-write race to track with a [`PendingSlot`]. 
+struct KvPutJob { + ns: String, + k: String, + v: Vec, +} + enum WriterCmd { Write(WriteJob), Delete(DeleteJob), + KvPut(KvPutJob), } pub struct LocalCacheSQLite { @@ -214,7 +224,13 @@ impl LocalCacheSQLite { data BLOB NOT NULL, PRIMARY KEY (addr, hashin, name) ); - CREATE INDEX IF NOT EXISTS idx_artifacts_addr_hashin ON artifacts (addr, hashin);", + CREATE INDEX IF NOT EXISTS idx_artifacts_addr_hashin ON artifacts (addr, hashin); + CREATE TABLE IF NOT EXISTS kv ( + ns TEXT NOT NULL, + k TEXT NOT NULL, + v BLOB NOT NULL, + PRIMARY KEY (ns, k) + );", ) .context("initialising sqlite cache schema")?; @@ -303,6 +319,8 @@ fn writer_loop(conn: &mut Connection, rx: &mpsc::Receiver, pending: & match cmd { WriterCmd::Write(j) => pending.complete(&j.key, &j.slot), WriterCmd::Delete(j) => pending.complete(&j.key, &j.slot), + // KvPut is fire-and-forget — no pending slot to release. + WriterCmd::KvPut(_) => {} } } } @@ -359,6 +377,13 @@ fn process_batch(conn: &mut Connection, batch: &mut [WriterCmd]) -> Result<()> { ) })?; } + WriterCmd::KvPut(job) => { + tx.execute( + "INSERT OR REPLACE INTO kv (ns, k, v) VALUES (?1, ?2, ?3)", + rusqlite::params![job.ns, job.k, job.v.as_slice()], + ) + .with_context(|| format!("kv put {}/{}", job.ns, job.k))?; + } } } @@ -495,6 +520,52 @@ impl LocalCache for LocalCacheSQLite { Ok(found) } + fn kv_get(&self, ns: &str, k: &str) -> Result>> { + let conn = self + .read_pool + .get() + .context("acquiring read connection from pool")?; + let mut stmt = conn + .prepare_cached("SELECT v FROM kv WHERE ns=?1 AND k=?2") + .context("preparing kv get")?; + match stmt.query_row(rusqlite::params![ns, k], |row| row.get::<_, Vec>(0)) { + Ok(v) => Ok(Some(v)), + Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None), + Err(e) => Err(e).context("reading kv value"), + } + } + + fn kv_list(&self, ns: &str) -> Result)>> { + let conn = self + .read_pool + .get() + .context("acquiring read connection from pool")?; + let mut stmt = conn + .prepare_cached("SELECT k, v 
FROM kv WHERE ns=?1") + .context("preparing kv list")?; + let rows = stmt + .query_map(rusqlite::params![ns], |row| { + Ok((row.get::<_, String>(0)?, row.get::<_, Vec>(1)?)) + }) + .context("querying kv namespace")?; + rows.collect::>>() + .context("reading kv rows") + } + + fn kv_put(&self, ns: &str, k: &str, v: &[u8]) -> Result<()> { + // Fire-and-forget through the writer thread (batched with artifact + // writes). The `Drop` impl joins the writer, so enqueued puts flush + // before the process exits. + self.writer_tx()? + .send(WriterCmd::KvPut(KvPutJob { + ns: ns.to_owned(), + k: k.to_owned(), + v: v.to_vec(), + })) + .context("enqueuing kv put")?; + Ok(()) + } + fn list_targets(&self) -> Result { // Stream distinct addrs over a bounded channel: the producer holds one // pooled connection and a `SELECT DISTINCT addr` cursor on a dedicated @@ -754,6 +825,63 @@ mod tests { Ok(()) } + #[test] + fn test_kv_put_get_list() -> Result<()> { + let dir = tempdir()?; + let cache = LocalCacheSQLite::with_pipe_limit( + dir.path().join("cache.db"), + 16 * 1024, + DEFAULT_MAX_CONCURRENT_PIPES, + )?; + + assert!(cache.kv_get("ns", "missing")?.is_none()); + + cache.kv_put("ns", "a", b"alpha")?; + cache.kv_put("ns", "b", b"beta")?; + cache.kv_put("other", "a", b"zzz")?; + // kv_put is async through the writer thread; block until it lands by + // dropping into a short spin on kv_get. + let mut tries = 0; + while cache.kv_get("ns", "a")?.is_none() && tries < 1000 { + std::thread::yield_now(); + tries += 1; + } + + assert_eq!( + cache.kv_get("ns", "a")?.as_deref(), + Some(b"alpha".as_slice()) + ); + assert_eq!( + cache.kv_get("ns", "b")?.as_deref(), + Some(b"beta".as_slice()) + ); + + // kv_list is scoped to the namespace. + let mut listed = cache.kv_list("ns")?; + listed.sort(); + assert_eq!( + listed, + vec![ + ("a".to_string(), b"alpha".to_vec()), + ("b".to_string(), b"beta".to_vec()), + ] + ); + + // Overwrite replaces. 
+ cache.kv_put("ns", "a", b"alpha2")?; + let mut tries = 0; + while cache.kv_get("ns", "a")?.as_deref() != Some(b"alpha2".as_slice()) && tries < 1000 { + std::thread::yield_now(); + tries += 1; + } + assert_eq!( + cache.kv_get("ns", "a")?.as_deref(), + Some(b"alpha2".as_slice()) + ); + + Ok(()) + } + #[test] fn test_seekable_reader_pread_in_middle() -> Result<()> { use io::{Read, Seek, SeekFrom}; diff --git a/src/engine/mod.rs b/src/engine/mod.rs index 0efb22cd..811b1299 100644 --- a/src/engine/mod.rs +++ b/src/engine/mod.rs @@ -20,6 +20,7 @@ pub mod driver; pub mod error; pub mod event; mod local_cache; +pub use local_cache::{LocalCache, SizedReader}; #[cfg(test)] mod local_cache_fs; mod local_cache_mem; @@ -46,6 +47,7 @@ pub use result_lock::{LockBackend, ResultLock}; mod expand; pub mod fanout; mod gc; +pub mod walk_cache; pub use gc::GcStats; pub mod gitignore; mod grow_stack; diff --git a/src/engine/walk_cache.rs b/src/engine/walk_cache.rs new file mode 100644 index 00000000..49957257 --- /dev/null +++ b/src/engine/walk_cache.rs @@ -0,0 +1,323 @@ +//! Cross-run cache of filesystem-walk results. +//! +//! Several plugins re-walk the workspace tree on every run because their targets +//! are intentionally uncacheable: the fs `Driver` re-globs source files, the +//! buildfile `Provider` re-discovers packages. The tree rarely changes between +//! runs, so these walks are repeated work. +//! +//! [`WalkCache`] memoizes a walk's result across runs in the durable cache's +//! namespaced KV store (see [`LocalCache::kv_get`]). Each entry pairs a +//! [`WalkSignature`] — directory mtimes (the matched *set*) plus optional +//! per-file `(size, mtime)` (file *content*) — with a borsh value. A lookup +//! returns the value only when the signature still validates against the live +//! tree; otherwise the caller walks and re-inserts. +//! +//! `mtime+size` is a fast-path proxy for content identity (heph otherwise hashes +//! 
content precisely); a same-size in-place rewrite within the filesystem's mtime +//! granularity can be missed — an accepted tradeoff. Everything is +//! correct-by-fallback: a missing/disabled store, a decode error, or any +//! validation mismatch simply makes the caller re-walk. + +use crate::engine::local_cache::LocalCache; +use borsh::{BorshDeserialize, BorshSerialize}; +use parking_lot::Mutex; +use rustc_hash::FxHashMap; +use std::path::Path; +use std::sync::Arc; + +/// Nanoseconds since the unix epoch for `meta`'s mtime, or `None` if unreadable +/// (pre-epoch or unsupported) — a walk with any unreadable mtime is not cached. +pub fn mtime_ns(meta: &std::fs::Metadata) -> Option { + let t = meta.modified().ok()?; + let d = t.duration_since(std::time::UNIX_EPOCH).ok()?; + i64::try_from(d.as_nanos()).ok() +} + +/// Validation fingerprint for a filesystem walk: the directories it descended +/// (by mtime) and, optionally, the files it read (by size + mtime). +#[derive(Clone, Default, Debug, BorshSerialize, BorshDeserialize)] +pub struct WalkSignature { + /// `(path relative to root, mtime_ns)` for every directory descended. A + /// directory's mtime bumps on any entry add/remove/rename, so matching all of + /// them proves the matched file *set* is unchanged without re-reading them. + pub dirs: Vec<(String, i64)>, + /// `(path relative to root, size, mtime_ns)` for content-sensitive walks. + /// Empty when only the directory *set* matters (e.g. discovering which dirs + /// contain a marker file). + pub files: Vec<(String, u64, i64)>, +} + +impl WalkSignature { + /// Record a directory's mtime under `root`. Returns `false` if the mtime is + /// unreadable (⇒ the caller should mark the walk non-persistable). + pub fn push_dir(&mut self, rel: impl Into, meta: &std::fs::Metadata) -> bool { + match mtime_ns(meta) { + Some(mt) => { + self.dirs.push((rel.into(), mt)); + true + } + None => false, + } + } + + /// Record a file's `(size, mtime)` under `root`. 
Returns `false` if the mtime + /// is unreadable. + pub fn push_file(&mut self, rel: impl Into, meta: &std::fs::Metadata) -> bool { + match mtime_ns(meta) { + Some(mt) => { + self.files.push((rel.into(), meta.len(), mt)); + true + } + None => false, + } + } + + /// True iff the tree under `root` still matches: every recorded directory + /// mtime and every recorded file `(size, mtime)` is unchanged. + pub fn is_valid(&self, root: &Path) -> bool { + for (rel, mt) in &self.dirs { + match std::fs::metadata(root.join(rel)) { + Ok(m) if m.is_dir() && mtime_ns(&m) == Some(*mt) => {} + _ => return false, + } + } + for (rel, size, mt) in &self.files { + match std::fs::metadata(root.join(rel)) { + Ok(m) if !m.is_dir() && m.len() == *size && mtime_ns(&m) == Some(*mt) => {} + _ => return false, + } + } + true + } +} + +const WALK_CACHE_VERSION: u32 = 1; + +#[derive(BorshSerialize, BorshDeserialize)] +struct StoredEntry { + version: u32, + sig: WalkSignature, + value: T, +} + +/// Cross-run, in-memory-fronted cache of walk results keyed by an arbitrary +/// string, backed by a [`LocalCache`] KV namespace. +/// +/// The KV namespace is scanned once (lazily, on first access) into an in-memory +/// map; lookups then serve from memory. Inserts write-through to the KV +/// incrementally, so a pure cache-hit run performs no writes. Constructed with +/// `None` (or a backend whose KV is a no-op) it degrades to always-miss. +pub struct WalkCache { + cache: Option>, + ns: String, + inner: Mutex>, +} + +struct Inner { + loaded: bool, + map: FxHashMap>>, +} + +impl WalkCache +where + T: BorshSerialize + BorshDeserialize + Clone, +{ + /// A cache backed by `cache`'s KV namespace `ns`. `None` disables it + /// (always-miss, no writes). 
+ pub fn new(cache: Option>, ns: impl Into) -> Self { + Self { + cache, + ns: ns.into(), + inner: Mutex::new(Inner { + loaded: false, + map: FxHashMap::default(), + }), + } + } + + fn ensure_loaded(&self, cache: &dyn LocalCache, inner: &mut Inner) { + if inner.loaded { + return; + } + inner.loaded = true; + let Ok(rows) = cache.kv_list(&self.ns) else { + return; + }; + for (k, bytes) in rows { + if let Ok(entry) = borsh::from_slice::>(&bytes) + && entry.version == WALK_CACHE_VERSION + { + inner.map.insert(k, Arc::new(entry)); + } + } + } + + /// Returns the cached value for `key` if its [`WalkSignature`] still validates + /// against the tree at `root`; otherwise `None` (the caller should walk and + /// [`insert`](Self::insert)). + pub fn get(&self, key: &str, root: &Path) -> Option { + let cache = self.cache.as_deref()?; + let entry = { + let mut inner = self.inner.lock(); + self.ensure_loaded(cache, &mut inner); + inner.map.get(key).cloned() + }?; + entry.sig.is_valid(root).then(|| entry.value.clone()) + } + + /// Records a fresh walk result: updates the in-memory map and write-through + /// to the KV (best-effort). No-op when the cache is disabled. + pub fn insert(&self, key: impl Into, sig: WalkSignature, value: T) { + let Some(cache) = self.cache.as_deref() else { + return; + }; + let key = key.into(); + let entry = Arc::new(StoredEntry { + version: WALK_CACHE_VERSION, + sig, + value, + }); + if let Ok(bytes) = borsh::to_vec(&*entry) { + drop(cache.kv_put(&self.ns, &key, &bytes)); + } + self.inner.lock().map.insert(key, entry); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashMap; + + /// Minimal `LocalCache` exposing only the KV methods over an in-memory map. 
+ #[derive(Default)] + struct KvMock { + kv: Mutex>>, + } + impl LocalCache for KvMock { + fn reader( + &self, + _a: &crate::htaddr::Addr, + _h: &str, + _n: &str, + ) -> anyhow::Result { + unimplemented!() + } + fn writer( + &self, + _a: &crate::htaddr::Addr, + _h: &str, + _n: &str, + ) -> anyhow::Result> { + unimplemented!() + } + fn exists(&self, _a: &crate::htaddr::Addr, _h: &str, _n: &str) -> anyhow::Result { + Ok(false) + } + fn delete(&self, _a: &crate::htaddr::Addr, _h: &str, _n: &str) -> anyhow::Result<()> { + Ok(()) + } + fn kv_get(&self, ns: &str, k: &str) -> anyhow::Result>> { + Ok(self.kv.lock().get(&(ns.to_owned(), k.to_owned())).cloned()) + } + fn kv_list(&self, ns: &str) -> anyhow::Result)>> { + Ok(self + .kv + .lock() + .iter() + .filter(|((n, _), _)| n == ns) + .map(|((_, k), v)| (k.clone(), v.clone())) + .collect()) + } + fn kv_put(&self, ns: &str, k: &str, v: &[u8]) -> anyhow::Result<()> { + self.kv + .lock() + .insert((ns.to_owned(), k.to_owned()), v.to_vec()); + Ok(()) + } + } + + #[test] + fn signature_validates_and_detects_changes() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::create_dir(root.join("d")).unwrap(); + std::fs::write(root.join("d/f"), b"abc").unwrap(); + + let mut sig = WalkSignature::default(); + sig.push_dir("d", &std::fs::metadata(root.join("d")).unwrap()); + sig.push_file("d/f", &std::fs::metadata(root.join("d/f")).unwrap()); + assert!(sig.is_valid(root), "unchanged tree validates"); + + // Content change (different size) invalidates. 
+ std::fs::write(root.join("d/f"), b"a longer body").unwrap(); + assert!(!sig.is_valid(root), "changed file size invalidates"); + } + + #[test] + fn signature_detects_dir_mtime_change() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::create_dir(root.join("d")).unwrap(); + let mut sig = WalkSignature::default(); + sig.push_dir("d", &std::fs::metadata(root.join("d")).unwrap()); + assert!(sig.is_valid(root)); + + std::fs::File::open(root.join("d")) + .unwrap() + .set_modified(std::time::SystemTime::now() + std::time::Duration::from_secs(7200)) + .unwrap(); + assert!(!sig.is_valid(root), "bumped dir mtime invalidates"); + } + + #[test] + fn cache_roundtrips_through_kv_and_validates() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::create_dir(root.join("d")).unwrap(); + + let backend: Arc = Arc::new(KvMock::default()); + let mut sig = WalkSignature::default(); + sig.push_dir("d", &std::fs::metadata(root.join("d")).unwrap()); + + // First WalkCache populates the KV. + { + let wc: WalkCache> = WalkCache::new(Some(backend.clone()), "test"); + assert!(wc.get("k", root).is_none(), "cold miss"); + wc.insert( + "k", + sig.clone(), + vec!["pkg/a".to_string(), "pkg/b".to_string()], + ); + assert_eq!( + wc.get("k", root).unwrap().len(), + 2, + "warm hit, same process" + ); + } + + // A fresh WalkCache loads from the shared KV (simulates a new run). + let wc2: WalkCache> = WalkCache::new(Some(backend.clone()), "test"); + assert_eq!( + wc2.get("k", root).unwrap(), + vec!["pkg/a".to_string(), "pkg/b".to_string()], + "fresh cache reloads from KV and validates" + ); + + // After a dir mtime bump the entry no longer validates. 
+ std::fs::File::open(root.join("d")) + .unwrap() + .set_modified(std::time::SystemTime::now() + std::time::Duration::from_secs(7200)) + .unwrap(); + let wc3: WalkCache> = WalkCache::new(Some(backend), "test"); + assert!(wc3.get("k", root).is_none(), "stale entry invalidates"); + } + + #[test] + fn disabled_cache_always_misses() { + let dir = tempfile::tempdir().unwrap(); + let wc: WalkCache> = WalkCache::new(None, "test"); + wc.insert("k", WalkSignature::default(), vec!["x".to_string()]); + assert!(wc.get("k", dir.path()).is_none()); + } +} diff --git a/src/pluginbuildfile/provider.rs b/src/pluginbuildfile/provider.rs index c91169b0..e2b6d42a 100644 --- a/src/pluginbuildfile/provider.rs +++ b/src/pluginbuildfile/provider.rs @@ -1,9 +1,11 @@ +use crate::engine::LocalCache; use crate::engine::provider::GetError::NotFound; use crate::engine::provider::{ ConfigRequest, ConfigResponse, GetError, GetRequest, GetResponse, ListPackageResponse, ListPackagesRequest, ListRequest, ListResponse, ProbeRequest, ProbeResponse, Provider as EProvider, ProviderFunctionRegistry, State, TargetSpec, }; +use crate::engine::walk_cache::{WalkCache, WalkSignature}; use crate::hasync::Cancellable; use crate::hmemoizer::Memoizer; use crate::htaddr::Addr; @@ -55,6 +57,12 @@ pub struct Provider { /// Lazily-built Starlark globals (built from `function_registry` on first eval), /// shared with every `BuildFileLoader` so the namespace is built at most once. pub(crate) globals: Arc>, + /// Cross-run cache of the package-discovery walk, validated by directory + /// mtimes only (a BUILD file's *contents* don't change the package set — + /// that's handled by `pkg_cache`). Disabled until [`with_cache`] is called. 
+ /// + /// [`with_cache`]: Provider::with_cache + pub(crate) packages_walk_cache: Arc>>, } impl Default for Provider { @@ -71,10 +79,14 @@ impl Default for Provider { dir_cache: Arc::new(Mutex::new(HashMap::new())), function_registry: OnceLock::new(), globals: Arc::new(OnceLock::new()), + packages_walk_cache: Arc::new(WalkCache::new(None, PACKAGES_CACHE_NS)), } } } +/// KV namespace for the buildfile package-discovery walk cache. +const PACKAGES_CACHE_NS: &str = "pluginbuildfile.packages"; + impl Provider { pub fn new(root: std::path::PathBuf) -> Self { Self { @@ -83,6 +95,14 @@ impl Provider { } } + /// Enable the cross-run package-discovery cache backed by `cache`'s KV store. + /// Without this the provider re-walks the tree to discover packages on every + /// run (the in-process `packages_cache` only dedupes within one run). + pub fn with_cache(mut self, cache: Option>) -> Self { + self.packages_walk_cache = Arc::new(WalkCache::new(cache, PACKAGES_CACHE_NS)); + self + } + pub fn from_options( root: std::path::PathBuf, skip_dirs: &[std::path::PathBuf], @@ -123,13 +143,41 @@ impl Provider { } } +/// Stable cross-run cache key for a package-discovery walk: the (sorted) build +/// file patterns and skip globs that determine which dirs are visited and which +/// count as packages. `root` is not included — it's the validation reference. 
+fn packages_cache_key(patterns: &[glob::Pattern], skip: &Ignore) -> String { + let mut pats: Vec<&str> = patterns.iter().map(glob::Pattern::as_str).collect(); + pats.sort_unstable(); + let mut globs: Vec<&str> = skip.globs().iter().map(String::as_str).collect(); + globs.sort_unstable(); + format!("{}\u{1}{}", pats.join(","), globs.join(",")) +} + fn find_packages_sync( path: &std::path::Path, root: &std::path::Path, patterns: &[glob::Pattern], skip: &Ignore, packages: &mut std::collections::HashSet, + sig: &mut WalkSignature, + persistable: &mut bool, ) -> anyhow::Result<()> { + // Record this directory's mtime for the cross-run cache: it bumps on any + // entry add/remove/rename, so matching every descended dir proves the package + // set is unchanged without re-reading the tree. + match ( + path.strip_prefix(root).ok().and_then(|r| r.to_str()), + std::fs::metadata(path).ok(), + ) { + (Some(rel), Some(meta)) if meta.is_dir() => { + if !sig.push_dir(rel, &meta) { + *persistable = false; + } + } + _ => *persistable = false, + } + let mut has_build_file = false; for entry in std::fs::read_dir(path).with_context(|| format!("reading {}", path.display()))? { let entry = entry?; @@ -150,7 +198,15 @@ fn find_packages_sync( if skip.prune_dir(&entry_path, rel) { continue; } - find_packages_sync(&entry_path, root, patterns, skip, packages)?; + find_packages_sync( + &entry_path, + root, + patterns, + skip, + packages, + sig, + persistable, + )?; } } @@ -228,11 +284,25 @@ impl EProvider for Provider { .packages_cache .once( (), - enclose!((self.root => root, self.build_file_patterns => patterns, self.skip => skip) move || async move { + enclose!((self.root => root, self.build_file_patterns => patterns, self.skip => skip, self.packages_walk_cache => walk_cache) move || async move { let packages = crate::process_supervisor::block_or_inline(move || { + let key = packages_cache_key(&patterns, &skip); + // Cross-run cache hit: the directory set is unchanged. 
+ if let Some(pkgs) = walk_cache.get(&key, &root) { + return Ok::<_, anyhow::Error>(pkgs); + } let mut packages = std::collections::HashSet::new(); - find_packages_sync(&root, &root, &patterns, &skip, &mut packages)?; - Ok::<_, anyhow::Error>(packages.into_iter().collect::>()) + let mut sig = WalkSignature::default(); + let mut persistable = true; + find_packages_sync( + &root, &root, &patterns, &skip, + &mut packages, &mut sig, &mut persistable, + )?; + let pkgs: Vec = packages.into_iter().collect(); + if persistable { + walk_cache.insert(key, sig, pkgs.clone()); + } + Ok(pkgs) })?; Ok(Arc::new(packages)) }), @@ -341,6 +411,122 @@ mod tests { use std::fs; use tempfile::tempdir; + /// Minimal `LocalCache` exposing only the KV methods over an in-memory map. + #[derive(Default)] + struct KvMock { + kv: Mutex>>, + } + impl LocalCache for KvMock { + fn reader( + &self, + _a: &Addr, + _h: &str, + _n: &str, + ) -> anyhow::Result { + unimplemented!() + } + fn writer(&self, _a: &Addr, _h: &str, _n: &str) -> anyhow::Result> { + unimplemented!() + } + fn exists(&self, _a: &Addr, _h: &str, _n: &str) -> anyhow::Result { + Ok(false) + } + fn delete(&self, _a: &Addr, _h: &str, _n: &str) -> anyhow::Result<()> { + Ok(()) + } + fn kv_get(&self, ns: &str, k: &str) -> anyhow::Result>> { + Ok(self + .kv + .lock() + .unwrap() + .get(&(ns.to_owned(), k.to_owned())) + .cloned()) + } + fn kv_list(&self, ns: &str) -> anyhow::Result)>> { + Ok(self + .kv + .lock() + .unwrap() + .iter() + .filter(|((n, _), _)| n == ns) + .map(|((_, k), v)| (k.clone(), v.clone())) + .collect()) + } + fn kv_put(&self, ns: &str, k: &str, v: &[u8]) -> anyhow::Result<()> { + self.kv + .lock() + .unwrap() + .insert((ns.to_owned(), k.to_owned()), v.to_vec()); + Ok(()) + } + } + + /// Package discovery is cached across runs: a fresh provider sharing the KV + /// reuses the discovered set for an unchanged tree, and a newly-added package + /// (which bumps a recorded dir's mtime) is re-discovered. 
+ #[tokio::test] + async fn test_list_packages_cross_run_cache() { + let tmp = tempdir().unwrap(); + let root = tmp.path(); + fs::write(root.join("BUILD"), "").unwrap(); + let a = root.join("a"); + fs::create_dir_all(&a).unwrap(); + fs::write(a.join("BUILD"), "").unwrap(); + + let backend: Arc = Arc::new(KvMock::default()); + let list = |p: Provider| async move { + let ctoken = StdCancellationToken::new(); + let res = p + .list_packages( + ListPackagesRequest { + prefix: PkgBuf::from(""), + }, + &ctoken, + ) + .await + .unwrap(); + let mut v: Vec = res.map(|r| r.unwrap().pkg.to_string()).collect(); + v.sort(); + v + }; + + let p1 = Provider { + root: root.to_path_buf(), + ..Provider::default() + } + .with_cache(Some(backend.clone())); + assert_eq!(list(p1).await, vec!["".to_string(), "a".to_string()]); + + // Fresh provider sharing the KV (simulates a new run) → same set, served + // from the cross-run cache for the unchanged tree. + let p2 = Provider { + root: root.to_path_buf(), + ..Provider::default() + } + .with_cache(Some(backend.clone())); + assert_eq!(list(p2).await, vec!["".to_string(), "a".to_string()]); + + // Add a new package; bump root mtime so the recorded dir invalidates. 
+ let b = root.join("b"); + fs::create_dir_all(&b).unwrap(); + fs::write(b.join("BUILD"), "").unwrap(); + std::fs::File::open(root) + .unwrap() + .set_modified(std::time::SystemTime::now() + std::time::Duration::from_secs(7200)) + .unwrap(); + + let p3 = Provider { + root: root.to_path_buf(), + ..Provider::default() + } + .with_cache(Some(backend.clone())); + assert_eq!( + list(p3).await, + vec!["".to_string(), "a".to_string(), "b".to_string()], + "a newly-added package is re-discovered" + ); + } + #[test] fn from_options_defaults_to_build() { let dir = tempdir().expect("tempdir"); diff --git a/src/pluginfs/mod.rs b/src/pluginfs/mod.rs index 5c6ca6f1..0a120fb9 100644 --- a/src/pluginfs/mod.rs +++ b/src/pluginfs/mod.rs @@ -1,3 +1,4 @@ +use crate::engine::LocalCache; use crate::engine::driver::{ ApplyTransitiveRequest, ApplyTransitiveResponse, ConfigRequest, ConfigResponse, ParseRequest, ParseResponse, RunRequest, RunResponse, @@ -13,6 +14,7 @@ use crate::engine::provider::{ ListRequest, ListResponse, ProbeRequest, ProbeResponse, Provider as EProvider, ProviderFn, ProviderFunctionDef, TargetSpec, }; +use crate::engine::walk_cache::{WalkCache, WalkSignature}; use crate::hasync::Cancellable; use crate::htaddr::Addr; use crate::htpkg::PkgBuf; @@ -581,18 +583,15 @@ fn compile_glob( /// Starts at the pattern's literal prefix so a rooted pattern (`a/b/**/*`) scans /// only `/a/b`, not the whole tree. Matching uses the cached glob/exclude /// NFAs directly — no per-run regex compilation. 
-fn walk_glob( - root: &std::path::Path, - compiled: &CompiledGlob, -) -> anyhow::Result<(Vec, Option)> { +fn walk_glob(root: &std::path::Path, compiled: &CompiledGlob) -> anyhow::Result { let walk_root = if compiled.prefix.is_empty() { root.to_path_buf() } else { root.join(&compiled.prefix) }; - // A missing walk root is an empty match — but must not be persisted: were it - // cached as "empty", a later-created tree (which doesn't change any recorded + // A missing walk root is an empty match — but must not be cached: were it + // stored as "empty", a later-created tree (which doesn't change any recorded // dir's mtime, since none were recorded) would keep serving the stale empty // set. Return `None` so every run re-walks until the root exists. if std::fs::metadata(&walk_root).is_err() { @@ -601,9 +600,9 @@ fn walk_glob( let mut artifacts = vec![]; // Cross-run cache accumulation. `persistable` flips off if any mtime can't be - // read, in which case we never write a (possibly unverifiable) entry. - let mut dirs: Vec<(String, i64)> = vec![]; - let mut files: Vec = vec![]; + // read, in which case we never store a (possibly unverifiable) entry. + let mut sig = WalkSignature::default(); + let mut value = GlobValue::default(); let mut persistable = true; let walker = walkdir::WalkDir::new(&walk_root) @@ -637,15 +636,20 @@ fn walk_glob( // Record every descended dir's mtime: it bumps on any entry // add/remove/rename, so matching all of them on a later run proves // the matched file *set* is unchanged without re-reading the dir. 
- if let Ok(rel) = entry.path().strip_prefix(root) { - if let Some(rel) = rel.to_str() { - match entry.metadata().ok().as_ref().and_then(mtime_ns) { - Some(mt) => dirs.push((rel.to_owned(), mt)), - None => persistable = false, + match ( + entry + .path() + .strip_prefix(root) + .ok() + .and_then(|r| r.to_str()), + entry.metadata().ok(), + ) { + (Some(rel), Some(meta)) => { + if !sig.push_dir(rel, &meta) { + persistable = false; } - } else { - persistable = false; } + _ => persistable = false, } continue; } @@ -694,18 +698,18 @@ fn walk_glob( let hashout = file_hashout(abs_path, x) .with_context(|| format!("hash glob entry '{}'", abs_path.display()))?; - // Cross-run cache: record (size, mtime) so a later run can validate this - // file's content without re-reading it. A missing mtime makes the whole - // walk non-persistable. - match mtime_ns(&meta) { - Some(mt) => files.push(CachedGlobFile { + // Cross-run cache: record (size, mtime) in the signature so a later run + // can validate this file's content without re-reading it, and the + // (rel, x, hashout) needed to rebuild the artifact. A missing mtime makes + // the whole walk non-persistable. + if sig.push_file(rel_str, &meta) { + value.files.push(GlobFile { rel: rel_str.to_string(), x, - size: meta.len(), - mtime_ns: mt, hashout: hashout.clone(), - }), - None => persistable = false, + }); + } else { + persistable = false; } // Materialize the owned strings that borrow `abs_path`/`rel_str` *before* @@ -742,196 +746,54 @@ fn walk_glob( } let entry = if persistable { - Some(CachedGlobEntry { - version: GLOB_CACHE_VERSION, - dirs, - files, - }) + Some((sig, value)) } else { None }; Ok((artifacts, entry)) } -// ── Persistent cross-run glob cache ────────────────────────────────────────── +// ── Cross-run glob cache ───────────────────────────────────────────────────── // // fs-glob targets are `CacheConfig::off()`, so the engine re-walks the tree on -// every run (walkdir + per-entry stat + per-file open/read/hash). 
This sidecar -// memoizes a walk's result per `(root, pattern, exclude)` *across* runs, -// validated by directory mtimes (the matched file *set*) and per-file -// `(size, mtime)` (file *content*). A full match reconstructs the artifacts with -// only stat syscalls — no readdir, no file opens, no reads, no hashing. -// -// `mtime+size` is a fast-path proxy for content identity; heph otherwise hashes -// content precisely, so a same-size in-place rewrite within the filesystem's -// mtime granularity can be missed (accepted tradeoff). Correct-by-fallback: any -// IO/decode/validation mismatch falls through to a full walk, and the whole -// layer is disabled with `HEPH_FS_GLOB_CACHE=0`. - -const GLOB_CACHE_VERSION: u32 = 1; - +// every run (walkdir + per-entry stat + per-file open/read/hash). The generic +// `walk_cache` memoizes a walk's result per `(root, pattern, exclude)` across +// runs in the durable cache's KV store, validated by directory mtimes (the +// matched file *set*) and per-file `(size, mtime)` (file *content*). A full +// match reconstructs the artifacts with stat only — no readdir, opens, reads, or +// hashing. Disable with `HEPH_FS_GLOB_CACHE=0`. + +/// KV namespace for the fs glob walk cache. +const GLOB_CACHE_NS: &str = "pluginfs.glob"; + +/// Per-matched-file reconstruction data stored in the [`WalkCache`]. The +/// validating `(size, mtime)` live in the [`WalkSignature`]; this carries only +/// what's needed to rebuild an [`OutputArtifact`]. `source = root/rel`, +/// `out_path = rel`, `name = rel.replace('/', "_")` — mirrors `walk_glob`. #[derive(Clone, borsh::BorshSerialize, borsh::BorshDeserialize)] -struct CachedGlobFile { - /// Path relative to the tree root. `source = root/rel`, `out_path = rel`, - /// `name = rel.replace('/', "_")` — mirrors `walk_glob`'s artifact build. 
+struct GlobFile { rel: String, x: bool, - size: u64, - mtime_ns: i64, hashout: String, } -#[derive(Clone, borsh::BorshSerialize, borsh::BorshDeserialize)] -struct CachedGlobEntry { - version: u32, - /// (dir rel-to-root, mtime_ns) for every directory descended during the walk. - dirs: Vec<(String, i64)>, - files: Vec, -} - -/// Nanoseconds since the unix epoch for `meta`'s mtime, or `None` if unreadable. -fn mtime_ns(meta: &std::fs::Metadata) -> Option { - let t = meta.modified().ok()?; - let d = t.duration_since(std::time::UNIX_EPOCH).ok()?; - i64::try_from(d.as_nanos()).ok() -} - -/// On-disk format for the single-file glob cache sidecar. -#[derive(Default, borsh::BorshSerialize, borsh::BorshDeserialize)] -struct GlobStoreFile { - version: u32, - entries: Vec<(String, CachedGlobEntry)>, -} - -/// Process-lifetime, single-file cache of glob walk results, shared by the fs -/// `Driver`. Backed by one `/.heph3/cache/fsglob.bin` sidecar loaded once -/// and flushed on `Driver` drop — a single open/read amortized over every glob -/// target, instead of one cache file (one `open`) per target. -#[derive(Default)] -struct GlobStore { - inner: parking_lot::Mutex, -} - -#[derive(Default)] -struct GlobStoreInner { - /// Sidecar path; empty until the first walk sets it from the tree root. - path: std::path::PathBuf, - /// Whether the sidecar has been loaded (or loading was disabled). - loaded: bool, - /// Set when `map` holds inserts not yet persisted. - dirty: bool, - /// Disabled (no read, no write) via `HEPH_FS_GLOB_CACHE=0`. - enabled: bool, - map: FxHashMap>, -} - -/// Read + decode the glob sidecar at `path`, or `None` on any IO/decode/version -/// mismatch (⇒ start empty). 
-fn load_glob_sidecar(path: &std::path::Path) -> Option>> { - let bytes = std::fs::read(path).ok()?; - let file: GlobStoreFile = borsh::from_slice(&bytes).ok()?; - (file.version == GLOB_CACHE_VERSION).then(|| { - file.entries - .into_iter() - .map(|(k, v)| (k, Arc::new(v))) - .collect() - }) -} - -impl GlobStore { - /// Load the sidecar on first use, deriving its path from the tree `root`. - /// Holding the lock across the read serializes the (one-time) load. - fn ensure_loaded(&self, root: &std::path::Path) { - let mut inner = self.inner.lock(); - if inner.loaded { - return; - } - inner.loaded = true; - inner.enabled = std::env::var_os("HEPH_FS_GLOB_CACHE").is_none_or(|v| v != "0"); - if !inner.enabled { - return; - } - inner.path = root.join(".heph3").join("cache").join("fsglob.bin"); - if let Some(map) = load_glob_sidecar(&inner.path) { - inner.map = map; - } - } - - fn get(&self, key: &str) -> Option> { - self.inner.lock().map.get(key).cloned() - } - - fn insert(&self, key: String, entry: CachedGlobEntry) { - let mut inner = self.inner.lock(); - if !inner.enabled { - return; - } - inner.map.insert(key, Arc::new(entry)); - inner.dirty = true; - } - - /// Persist the map to its sidecar (temp + atomic rename). Best-effort, called - /// on `Driver` drop; a no-op when nothing changed (e.g. a pure cache-hit run). 
- fn flush(&self) { - let mut inner = self.inner.lock(); - if !inner.dirty || inner.path.as_os_str().is_empty() { - return; - } - let file = GlobStoreFile { - version: GLOB_CACHE_VERSION, - entries: inner - .map - .iter() - .map(|(k, v)| (k.clone(), (**v).clone())) - .collect(), - }; - let Some(parent) = inner.path.parent().map(std::path::Path::to_path_buf) else { - return; - }; - if std::fs::create_dir_all(&parent).is_err() { - return; - } - let Ok(bytes) = borsh::to_vec(&file) else { - return; - }; - let tmp = inner - .path - .with_extension(format!("tmp.{}", std::process::id())); - if std::fs::write(&tmp, &bytes).is_ok() { - if std::fs::rename(&tmp, &inner.path).is_ok() { - inner.dirty = false; - } else { - drop(std::fs::remove_file(&tmp)); - } - } - } +#[derive(Default, Clone, borsh::BorshSerialize, borsh::BorshDeserialize)] +struct GlobValue { + files: Vec, } -/// Validate `entry` against the current tree; on a full match reconstruct the -/// artifacts. Returns `None` (⇒ caller re-walks) on any mismatch. -fn reconstruct_glob( - root: &std::path::Path, - entry: &CachedGlobEntry, -) -> Option> { - // Set check: every walked dir's mtime must match. An added/removed/renamed - // entry bumps its parent dir's mtime, so this proves the file set is intact. - for (rel, mt) in &entry.dirs { - let meta = std::fs::metadata(root.join(rel)).ok()?; - if !meta.is_dir() || mtime_ns(&meta) != Some(*mt) { - return None; - } - } - let mut artifacts = Vec::with_capacity(entry.files.len()); - for f in &entry.files { +/// `walk_glob`'s result: the artifacts plus an optional `(signature, value)` to +/// store in the [`WalkCache`] (`None` ⇒ the walk is not cacheable this run). +type GlobWalk = (Vec, Option<(WalkSignature, GlobValue)>); + +/// Rebuild the glob artifacts from a cache-validated [`GlobValue`]. The file +/// `(size, mtime)` were already checked by [`WalkCache::get`]; only the codegen +/// xattr (which bumps neither) is re-checked here, matching `walk_glob`. 
Returns +/// `None` (⇒ caller re-walks) if any file has since been codegen-stamped. +fn reconstruct_glob(root: &std::path::Path, value: &GlobValue) -> Option> { + let mut artifacts = Vec::with_capacity(value.files.len()); + for f in &value.files { let abs = root.join(&f.rel); - let meta = std::fs::metadata(&abs).ok()?; - // Content check: size + mtime. An in-place edit changes at least the - // mtime (and bumps no parent dir), so this is load-bearing. - if meta.is_dir() || meta.len() != f.size || mtime_ns(&meta) != Some(f.mtime_ns) { - return None; - } - // A file that gained the codegen xattr (which bumps neither size nor - // mtime) must drop out, matching `walk_glob`. if has_codegen_xattr(&abs) { return None; } @@ -956,7 +818,7 @@ fn reconstruct_glob( /// runs it is served from `store` (the single-file sidecar) when the tree is /// unchanged. The first uncached call walks the tree. fn cached_glob_walk( - store: &GlobStore, + glob_cache: &WalkCache, request_id: &str, root: &std::path::Path, pattern: &str, @@ -977,18 +839,17 @@ fn cached_glob_walk( return Ok(a.clone()); } - // Cross-run persistent cache: on a full validation hit, reconstruct without - // touching the directory tree beyond stat. On a miss, walk and record. - store.ensure_loaded(root); - let artifacts = match store - .get(&key) - .and_then(|entry| reconstruct_glob(root, &entry)) + // Cross-run cache: on a signature-validated hit, reconstruct with stat only. + // On a miss, walk and record (write-through to the KV). 
+ let artifacts = match glob_cache + .get(&key, root) + .and_then(|value| reconstruct_glob(root, &value)) { Some(arts) => Arc::new(arts), None => { let (arts, entry) = walk_glob(root, compiled)?; - if let Some(entry) = entry { - store.insert(key.clone(), entry); + if let Some((sig, value)) = entry { + glob_cache.insert(key.clone(), sig, value); } Arc::new(arts) } @@ -1003,27 +864,33 @@ fn cached_glob_walk( .clone()) } -#[derive(Default)] pub struct Driver { /// Engine-owned + built-in dirs pruned during glob walks. skip: Arc, - /// Cross-run glob walk cache, flushed to disk when the driver drops. - glob_store: Arc, + /// Cross-run glob walk cache (KV-backed; write-through, no flush needed). + glob_cache: Arc>, } -impl Drop for Driver { - fn drop(&mut self) { - // Persist any new glob walk results gathered this run. A pure cache-hit - // run leaves the store clean, so this is a no-op there. - self.glob_store.flush(); +impl Default for Driver { + fn default() -> Self { + Self { + skip: Arc::default(), + glob_cache: Arc::new(WalkCache::new(None, GLOB_CACHE_NS)), + } } } impl Driver { - pub fn new(skip: Arc) -> Self { + pub fn new(skip: Arc, cache: Option>) -> Self { + // `HEPH_FS_GLOB_CACHE=0` is a kill-switch: disable the cache entirely. + let cache = if std::env::var_os("HEPH_FS_GLOB_CACHE").is_some_and(|v| v == "0") { + None + } else { + cache + }; Self { skip, - glob_store: Arc::default(), + glob_cache: Arc::new(WalkCache::new(cache, GLOB_CACHE_NS)), } } } @@ -1249,7 +1116,7 @@ impl crate::engine::driver::Driver for Driver { // this `(root, pattern, excludes)` is memoized — repeat calls // skip walkdir + per-entry stat entirely. 
let artifacts = cached_glob_walk( - &self.glob_store, + &self.glob_cache, req.request_id, root, pattern, @@ -1761,11 +1628,12 @@ mod tests { ); } - /// The persistent glob cache must reconstruct an unchanged tree's artifacts - /// and reject every kind of change: a file's content (size/mtime), the file - /// *set* (a recorded dir's mtime), and a freshly-stamped codegen xattr. + /// `walk_glob` returns a [`WalkSignature`] that validates the unchanged tree + /// (and invalidates on a content change) plus a value that reconstructs the + /// identical artifact set. (Signature-validation edge cases live in + /// `engine::walk_cache`.) #[test] - fn test_glob_cache_reconstruct_and_invalidation() { + fn test_walk_glob_signature_and_reconstruct() { let tmp = tempdir().unwrap(); let root = tmp.path(); fs::create_dir(root.join("sub")).unwrap(); @@ -1786,40 +1654,26 @@ mod tests { }; let (arts, entry) = walk_glob(root, &compiled).unwrap(); - let entry = entry.expect("walk over a present tree is persistable"); + let (sig, value) = entry.expect("walk over a present tree is persistable"); assert_eq!(arts.len(), 2, "matches a.rs + sub/b.rs, not c.txt"); - // Unchanged tree → reconstruct yields the identical artifact set. - let rebuilt = reconstruct_glob(root, &entry).expect("unchanged tree reconstructs"); + // The value rebuilds the identical artifact set. + let rebuilt = reconstruct_glob(root, &value).expect("reconstructs"); assert_eq!(key(&arts), key(&rebuilt)); - // Content change: rewrite a.rs with a different size → must invalidate. + // The signature validates the unchanged tree, and a content change (size) + // invalidates it. 
+ assert!(sig.is_valid(root), "unchanged tree validates"); fs::write(root.join("a.rs"), b"a much longer body").unwrap(); - assert!( - reconstruct_glob(root, &entry).is_none(), - "a changed file size must invalidate the cache" - ); - fs::write(root.join("a.rs"), b"aaa").unwrap(); // restore size; mtime moved - - // Set change: bump a recorded dir's mtime (simulating an add/remove/rename - // in it) → must invalidate even though no recorded file changed. - let (_, fresh) = walk_glob(root, &compiled).unwrap(); - let fresh = fresh.expect("persistable"); - let dir_handle = std::fs::File::open(root.join("sub")).unwrap(); - dir_handle - .set_modified(std::time::SystemTime::now() + std::time::Duration::from_secs(7200)) - .unwrap(); - assert!( - reconstruct_glob(root, &fresh).is_none(), - "a bumped directory mtime must invalidate the cache" - ); + assert!(!sig.is_valid(root), "changed file size invalidates"); } /// A file that gains the codegen provenance xattr (which bumps neither size - /// nor mtime) must drop out of a reconstructed glob, matching `walk_glob`. + /// nor mtime, so the signature still validates) must drop out of a + /// reconstructed glob, matching `walk_glob`. #[cfg(unix)] #[test] - fn test_glob_cache_reconstruct_drops_codegen_xattr() { + fn test_reconstruct_glob_drops_codegen_xattr() { let tmp = tempdir().unwrap(); let root = tmp.path(); fs::write(root.join("gen.rs"), b"x").unwrap(); @@ -1828,33 +1682,81 @@ mod tests { let (arts, entry) = walk_glob(root, &compiled).unwrap(); assert_eq!(arts.len(), 1); - let entry = entry.unwrap(); + let (_, value) = entry.unwrap(); // Stamp the codegen xattr without touching content/mtime. 
if xattr::set(root.join("gen.rs"), CODEGEN_XATTR, b"//gen:it").is_err() { return; // filesystem without xattr support — nothing to assert } assert!( - reconstruct_glob(root, &entry).is_none(), + reconstruct_glob(root, &value).is_none(), "a newly codegen-stamped file must invalidate the cache" ); } - /// End-to-end cross-run persistence: a first driver populates and (on drop) - /// flushes the single-file sidecar; a *fresh* driver loads it from disk and - /// reuses it for the unchanged tree, then re-walks once a file is added. - /// `.heph3` is pruned, so writing the sidecar never self-invalidates a - /// recorded directory. + /// Minimal `LocalCache` exposing only the KV methods over an in-memory map, + /// so the cross-run test exercises the real `WalkCache` path without SQLite. + #[derive(Default)] + struct KvMock { + kv: std::sync::Mutex>>, + } + impl LocalCache for KvMock { + fn reader( + &self, + _a: &Addr, + _h: &str, + _n: &str, + ) -> anyhow::Result { + unimplemented!() + } + fn writer(&self, _a: &Addr, _h: &str, _n: &str) -> anyhow::Result> { + unimplemented!() + } + fn exists(&self, _a: &Addr, _h: &str, _n: &str) -> anyhow::Result { + Ok(false) + } + fn delete(&self, _a: &Addr, _h: &str, _n: &str) -> anyhow::Result<()> { + Ok(()) + } + fn kv_get(&self, ns: &str, k: &str) -> anyhow::Result>> { + Ok(self + .kv + .lock() + .unwrap() + .get(&(ns.to_owned(), k.to_owned())) + .cloned()) + } + fn kv_list(&self, ns: &str) -> anyhow::Result)>> { + Ok(self + .kv + .lock() + .unwrap() + .iter() + .filter(|((n, _), _)| n == ns) + .map(|((_, k), v)| (k.clone(), v.clone())) + .collect()) + } + fn kv_put(&self, ns: &str, k: &str, v: &[u8]) -> anyhow::Result<()> { + self.kv + .lock() + .unwrap() + .insert((ns.to_owned(), k.to_owned()), v.to_vec()); + Ok(()) + } + } + + /// End-to-end cross-run: a first driver populates the KV-backed walk cache; a + /// *fresh* driver sharing the same backend (simulating a new process) reuses + /// it for the unchanged tree, then 
re-walks once a file is added. #[tokio::test] async fn test_glob_cache_cross_run() { let tmp = tempdir().unwrap(); let root = tmp.path(); - let home = root.join(".heph3"); - fs::create_dir_all(&home).unwrap(); fs::write(root.join("a.rs"), b"aaa").unwrap(); fs::write(root.join("b.rs"), b"bbb").unwrap(); - let skip = Arc::new(Ignore::new(&[home], &[]).unwrap()); + let skip = Arc::new(Ignore::new(&[], &[]).unwrap()); + let backend: Arc = Arc::new(KvMock::default()); let config = std::collections::HashMap::from([("p".to_string(), Value::String("*.rs".to_string()))]); let hashin = String::new(); @@ -1864,9 +1766,9 @@ mod tests { "req-3".to_string(), ); - // First driver: walk + populate, then drop to flush the sidecar. + // First driver populates the shared KV. let parse_res = { - let driver = Driver::new(skip.clone()); + let driver = Driver::new(skip.clone(), Some(backend.clone())); let parse_res = driver .parse(make_parse_req(config), &ctoken()) .await @@ -1880,17 +1782,11 @@ mod tests { .unwrap(); assert_eq!(first.artifacts.len(), 2); parse_res - }; // driver dropped here → flush - - let sidecar = root.join(".heph3").join("cache").join("fsglob.bin"); - assert!( - sidecar.exists(), - "dropping the driver persists the single-file glob sidecar" - ); + }; - // Fresh driver loads the sidecar from disk and reconstructs the unchanged - // tree without re-walking — same artifacts. - let driver2 = Driver::new(skip.clone()); + // Fresh driver sharing the backend loads the entry from the KV and + // reconstructs the unchanged tree — same artifacts. + let driver2 = Driver::new(skip.clone(), Some(backend.clone())); let second = driver2 .run( make_run_req(&parse_res.target_def, &id2, root.to_path_buf(), &hashin), @@ -2017,7 +1913,7 @@ mod tests { // The engine hands the fs plugin its skip dirs (the heph home); the walk // must prune that subtree. 
let skip = Arc::new(Ignore::new(&[home], &[]).unwrap()); - let driver = Driver::new(skip); + let driver = Driver::new(skip, None); let config = std::collections::HashMap::from([("p".to_string(), Value::String("**/*".to_string()))]); @@ -2054,7 +1950,7 @@ mod tests { // A `fs.skip` dir from the config file (resolved to an absolute path) is // pruned just like the engine home. let skip = Arc::new(Ignore::new(&[tmp.path().join("vendor")], &[]).unwrap()); - let driver = Driver::new(skip); + let driver = Driver::new(skip, None); let config = std::collections::HashMap::from([("p".to_string(), Value::String("**/*".to_string()))]); @@ -2091,7 +1987,7 @@ mod tests { // A `fs.skip` glob (`**/node_modules/**`) excludes the whole subtree at // any depth — and prunes the dir so the walk never descends into it. let skip = Arc::new(Ignore::new(&[], &["**/node_modules/**".to_string()]).unwrap()); - let driver = Driver::new(skip); + let driver = Driver::new(skip, None); let config = std::collections::HashMap::from([("p".to_string(), Value::String("**/*".to_string()))]); From 3f2e29d8d424c99a8ec4794d1499f40cb7f403d7 Mon Sep 17 00:00:00 2001 From: Raphael Vigee Date: Mon, 8 Jun 2026 14:51:28 +0200 Subject: [PATCH 3/6] refactor(htwalk): shared on-demand cached filesystem walker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the per-consumer WalkCache (and the cache.db KV it rode on) with a shared, path-keyed, on-demand cached walker in htwalk, used by every tree-walking plugin irrespective of who asks. 
htwalk::CachedWalker exposes two consumer-agnostic primitives: - read_dir(dir) → cached directory listing, validated by dir mtime - file_hash(file)→ cached content hash + exec bit, validated by (size, mtime) Filtering (globs, excludes, skip dirs, codegen xattr) and the decision to recurse belong to the consumer, so a requester that stops shallow and one that recurses deep reuse the dirs they share and independently cache the ones they don't. Each explored directory is cached on its own. Backed by a dedicated fswalk.db (separate from the artifact cache.db) so it can be GC'd independently: a read pool + single write connection; rows carry a last-access stamp; `heph tool gc` prunes rows past a 14-day TTL and orphaned rows (path no longer exists). In-process front + write-through; a pure cache-hit run performs no writes. Correct-by-fallback: any db/decode/validation failure re-reads from disk. Consumers rewired: - pluginfs glob (`walk_glob`) + `file()` targets + the `heph.fs.glob` BUILD function now recurse via the walker; `file_hashout` moved into htwalk. - pluginbuildfile package discovery (`find_packages_sync`) reads dirs via the walker (dir-set only — BUILD contents don't change the package set). The walker is handed to plugins through PluginInit; the engine owns it. Removed: engine::walk_cache, the LocalCache kv_get/kv_list/kv_put + the sqlite `kv` table, and PluginInit.cache. Measured on example/go/large warm cache-hit run (profiling binary): warm (walker cache) ~2.44s vs cold (db wiped) ~2.74s — ~11%. Full lib suite (1003) passes; clippy clean; `heph tool gc` prunes fswalk rows. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/commands/bootstrap.rs | 4 +- src/engine/engine.rs | 24 +- src/engine/gc.rs | 14 + src/engine/local_cache.rs | 17 - src/engine/local_cache_mem.rs | 12 - src/engine/local_cache_sqlite.rs | 130 +------ src/engine/mod.rs | 2 - src/engine/walk_cache.rs | 323 ---------------- src/htwalk/cached_walker.rs | 635 +++++++++++++++++++++++++++++++ src/htwalk/mod.rs | 3 + src/pluginbuildfile/provider.rs | 225 +++-------- src/pluginfs/mod.rs | 584 ++++++++-------------------- 12 files changed, 889 insertions(+), 1084 deletions(-) delete mode 100644 src/engine/walk_cache.rs create mode 100644 src/htwalk/cached_walker.rs diff --git a/src/commands/bootstrap.rs b/src/commands/bootstrap.rs index 9fe3184c..61ecbbff 100644 --- a/src/commands/bootstrap.rs +++ b/src/commands/bootstrap.rs @@ -152,7 +152,7 @@ pub fn new_engine() -> anyhow::Result<(Arc, ShutdownTrigger)> { &init.skip_globs, opts, )? - .with_cache(Some(init.cache.clone())), + .with_walker(init.walker.clone()), )) })?; e.register_provider_factory("go", |init, opts| { @@ -278,7 +278,7 @@ mod tests { &init.skip_globs, opts, )? - .with_cache(Some(init.cache.clone())), + .with_walker(init.walker.clone()), )) })?; e.register_managed_driver_factory("exec", |_init, opts| { diff --git a/src/engine/engine.rs b/src/engine/engine.rs index 73bb4cae..5904728f 100644 --- a/src/engine/engine.rs +++ b/src/engine/engine.rs @@ -101,9 +101,8 @@ pub struct PluginInit { /// Workspace-relative `fs.skip` glob patterns (e.g. `**/node_modules/**`), /// matched against entry paths. pub skip_globs: Vec, - /// The engine's durable local cache, handed to plugins for cross-run scratch - /// state via its namespaced KV store (see [`crate::engine::walk_cache`]). - pub cache: Arc, + /// Shared cross-run filesystem-walk cache for tree-walking plugins. 
+ pub walker: Arc, } /// True if `entry` contains wax glob metacharacters — used to split `fs.skip` @@ -135,6 +134,9 @@ pub struct Engine { /// memory and never touch the SQLite WAL; entries over the per-entry cap /// spill to `local_cache`. See [`LocalCacheTmp`]. pub(crate) local_cache_tmp: Arc, + /// Shared cross-run filesystem-walk cache (separate `fswalk.db`), handed to + /// tree-walking plugins via [`PluginInit`]. + pub(crate) walker: Arc, pub(crate) providers: Vec>, pub(crate) providers_by_name: HashMap>, @@ -403,6 +405,12 @@ impl Engine { .with_context(|| format!("create lock dir {lock_dir:?}"))?; let result_lock = ResultLock::new(cfg.lock_backend, lock_dir); + // Shared cross-run filesystem-walk cache, handed to tree-walking plugins + // via `PluginInit`. Its own sqlite db so it can be pruned independently. + let walker = Arc::new(crate::htwalk::CachedWalker::open( + &home.join("cache").join("fswalk.db"), + )); + let max_workers = 2 * parallelism; let mut engine = Engine { @@ -410,6 +418,7 @@ impl Engine { home: home.clone(), local_cache, local_cache_tmp, + walker, providers: vec![], providers_by_name: HashMap::new(), drivers: vec![], @@ -436,7 +445,10 @@ impl Engine { &init.skip_dirs, &init.skip_globs, )?); - Ok(Box::new(crate::pluginfs::Provider::new(ignore))) + Ok(Box::new(crate::pluginfs::Provider::new( + ignore, + init.walker.clone(), + ))) })?; engine.try_register_driver(|init| { let ignore = Arc::new(crate::htwalk::Ignore::new( @@ -445,7 +457,7 @@ impl Engine { )?); Ok(Box::new(crate::pluginfs::Driver::new( ignore, - Some(init.cache.clone()), + init.walker.clone(), ))) })?; @@ -513,7 +525,7 @@ impl Engine { root: self.cfg.root.clone(), skip_dirs: self.skip_dirs(), skip_globs: self.skip_globs(), - cache: self.local_cache.clone(), + walker: self.walker.clone(), } } diff --git a/src/engine/gc.rs b/src/engine/gc.rs index 6439f56a..32d9ad36 100644 --- a/src/engine/gc.rs +++ b/src/engine/gc.rs @@ -41,6 +41,9 @@ pub struct GcStats { /// Targets that could 
not be processed (resolve/delete failed). GC logs each /// and keeps going — a single bad target never aborts the sweep. pub errored: usize, + /// Rows pruned from the shared filesystem-walk cache (stale past the TTL or + /// orphaned because their path no longer exists). + pub fswalk_rows_removed: usize, } /// Per-target result of a GC pass, accumulated into [`GcStats`]. @@ -226,6 +229,17 @@ impl Engine { Self::drain_one(&mut set, &rs, &mut stats).await; } + // Prune the shared filesystem-walk cache: drop rows untouched past the + // TTL and rows whose path no longer exists. Best-effort — a prune failure + // never fails the artifact GC. + let walker = self.walker.clone(); + match crate::process_supervisor::block_or_inline(move || { + walker.prune(crate::htwalk::cached_walker::DEFAULT_TTL, true) + }) { + Ok(n) => stats.fswalk_rows_removed = n, + Err(e) => tracing::warn!(error = %format!("{e:#}"), "fswalk prune failed"), + } + Ok(stats) } diff --git a/src/engine/local_cache.rs b/src/engine/local_cache.rs index 41352d5b..f8497860 100644 --- a/src/engine/local_cache.rs +++ b/src/engine/local_cache.rs @@ -125,23 +125,6 @@ pub trait LocalCache: Send + Sync { ) -> anyhow::Result>> { Ok(None) } - - // ── Namespaced key→blob store ──────────────────────────────────────────── - // - // A general scratch store, separate from the (addr, hashin, name) artifact - // space, used for cross-run plugin state such as filesystem-walk caches - // (see `crate::engine::walk_cache`). Defaults are no-ops so only the durable - // backend (and its fronting tiers) need implement it; a no-op backend simply - // makes those caches always miss (correctness-neutral). 
- fn kv_get(&self, _ns: &str, _k: &str) -> anyhow::Result>> { - Ok(None) - } - fn kv_list(&self, _ns: &str) -> anyhow::Result)>> { - Ok(Vec::new()) - } - fn kv_put(&self, _ns: &str, _k: &str, _v: &[u8]) -> anyhow::Result<()> { - Ok(()) - } } #[derive(Debug, thiserror::Error)] diff --git a/src/engine/local_cache_mem.rs b/src/engine/local_cache_mem.rs index 9b7a8db5..f83d67ba 100644 --- a/src/engine/local_cache_mem.rs +++ b/src/engine/local_cache_mem.rs @@ -117,18 +117,6 @@ impl LocalCache for LocalCacheMem { self.inner.list_target_entries(addr) } - // KV is not fronted by the mem LRU (small, read once at startup) — delegate - // straight to the durable backend. - fn kv_get(&self, ns: &str, k: &str) -> Result>> { - self.inner.kv_get(ns, k) - } - fn kv_list(&self, ns: &str) -> Result)>> { - self.inner.kv_list(ns) - } - fn kv_put(&self, ns: &str, k: &str, v: &[u8]) -> Result<()> { - self.inner.kv_put(ns, k, v) - } - fn seekable_reader( &self, addr: &Addr, diff --git a/src/engine/local_cache_sqlite.rs b/src/engine/local_cache_sqlite.rs index eaf09d12..b9f94c4a 100644 --- a/src/engine/local_cache_sqlite.rs +++ b/src/engine/local_cache_sqlite.rs @@ -166,19 +166,9 @@ struct DeleteJob { slot: Arc, } -/// Namespaced key→blob upsert for the `kv` table. Fire-and-forget: callers keep -/// their own in-memory copy and only read the table once at startup, so there is -/// no read-after-write race to track with a [`PendingSlot`]. 
-struct KvPutJob { - ns: String, - k: String, - v: Vec, -} - enum WriterCmd { Write(WriteJob), Delete(DeleteJob), - KvPut(KvPutJob), } pub struct LocalCacheSQLite { @@ -224,13 +214,7 @@ impl LocalCacheSQLite { data BLOB NOT NULL, PRIMARY KEY (addr, hashin, name) ); - CREATE INDEX IF NOT EXISTS idx_artifacts_addr_hashin ON artifacts (addr, hashin); - CREATE TABLE IF NOT EXISTS kv ( - ns TEXT NOT NULL, - k TEXT NOT NULL, - v BLOB NOT NULL, - PRIMARY KEY (ns, k) - );", + CREATE INDEX IF NOT EXISTS idx_artifacts_addr_hashin ON artifacts (addr, hashin);", ) .context("initialising sqlite cache schema")?; @@ -319,8 +303,6 @@ fn writer_loop(conn: &mut Connection, rx: &mpsc::Receiver, pending: & match cmd { WriterCmd::Write(j) => pending.complete(&j.key, &j.slot), WriterCmd::Delete(j) => pending.complete(&j.key, &j.slot), - // KvPut is fire-and-forget — no pending slot to release. - WriterCmd::KvPut(_) => {} } } } @@ -377,13 +359,6 @@ fn process_batch(conn: &mut Connection, batch: &mut [WriterCmd]) -> Result<()> { ) })?; } - WriterCmd::KvPut(job) => { - tx.execute( - "INSERT OR REPLACE INTO kv (ns, k, v) VALUES (?1, ?2, ?3)", - rusqlite::params![job.ns, job.k, job.v.as_slice()], - ) - .with_context(|| format!("kv put {}/{}", job.ns, job.k))?; - } } } @@ -520,52 +495,6 @@ impl LocalCache for LocalCacheSQLite { Ok(found) } - fn kv_get(&self, ns: &str, k: &str) -> Result>> { - let conn = self - .read_pool - .get() - .context("acquiring read connection from pool")?; - let mut stmt = conn - .prepare_cached("SELECT v FROM kv WHERE ns=?1 AND k=?2") - .context("preparing kv get")?; - match stmt.query_row(rusqlite::params![ns, k], |row| row.get::<_, Vec>(0)) { - Ok(v) => Ok(Some(v)), - Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None), - Err(e) => Err(e).context("reading kv value"), - } - } - - fn kv_list(&self, ns: &str) -> Result)>> { - let conn = self - .read_pool - .get() - .context("acquiring read connection from pool")?; - let mut stmt = conn - .prepare_cached("SELECT k, v 
FROM kv WHERE ns=?1") - .context("preparing kv list")?; - let rows = stmt - .query_map(rusqlite::params![ns], |row| { - Ok((row.get::<_, String>(0)?, row.get::<_, Vec>(1)?)) - }) - .context("querying kv namespace")?; - rows.collect::>>() - .context("reading kv rows") - } - - fn kv_put(&self, ns: &str, k: &str, v: &[u8]) -> Result<()> { - // Fire-and-forget through the writer thread (batched with artifact - // writes). The `Drop` impl joins the writer, so enqueued puts flush - // before the process exits. - self.writer_tx()? - .send(WriterCmd::KvPut(KvPutJob { - ns: ns.to_owned(), - k: k.to_owned(), - v: v.to_vec(), - })) - .context("enqueuing kv put")?; - Ok(()) - } - fn list_targets(&self) -> Result { // Stream distinct addrs over a bounded channel: the producer holds one // pooled connection and a `SELECT DISTINCT addr` cursor on a dedicated @@ -825,63 +754,6 @@ mod tests { Ok(()) } - #[test] - fn test_kv_put_get_list() -> Result<()> { - let dir = tempdir()?; - let cache = LocalCacheSQLite::with_pipe_limit( - dir.path().join("cache.db"), - 16 * 1024, - DEFAULT_MAX_CONCURRENT_PIPES, - )?; - - assert!(cache.kv_get("ns", "missing")?.is_none()); - - cache.kv_put("ns", "a", b"alpha")?; - cache.kv_put("ns", "b", b"beta")?; - cache.kv_put("other", "a", b"zzz")?; - // kv_put is async through the writer thread; block until it lands by - // dropping into a short spin on kv_get. - let mut tries = 0; - while cache.kv_get("ns", "a")?.is_none() && tries < 1000 { - std::thread::yield_now(); - tries += 1; - } - - assert_eq!( - cache.kv_get("ns", "a")?.as_deref(), - Some(b"alpha".as_slice()) - ); - assert_eq!( - cache.kv_get("ns", "b")?.as_deref(), - Some(b"beta".as_slice()) - ); - - // kv_list is scoped to the namespace. - let mut listed = cache.kv_list("ns")?; - listed.sort(); - assert_eq!( - listed, - vec![ - ("a".to_string(), b"alpha".to_vec()), - ("b".to_string(), b"beta".to_vec()), - ] - ); - - // Overwrite replaces. 
- cache.kv_put("ns", "a", b"alpha2")?; - let mut tries = 0; - while cache.kv_get("ns", "a")?.as_deref() != Some(b"alpha2".as_slice()) && tries < 1000 { - std::thread::yield_now(); - tries += 1; - } - assert_eq!( - cache.kv_get("ns", "a")?.as_deref(), - Some(b"alpha2".as_slice()) - ); - - Ok(()) - } - #[test] fn test_seekable_reader_pread_in_middle() -> Result<()> { use io::{Read, Seek, SeekFrom}; diff --git a/src/engine/mod.rs b/src/engine/mod.rs index 811b1299..0efb22cd 100644 --- a/src/engine/mod.rs +++ b/src/engine/mod.rs @@ -20,7 +20,6 @@ pub mod driver; pub mod error; pub mod event; mod local_cache; -pub use local_cache::{LocalCache, SizedReader}; #[cfg(test)] mod local_cache_fs; mod local_cache_mem; @@ -47,7 +46,6 @@ pub use result_lock::{LockBackend, ResultLock}; mod expand; pub mod fanout; mod gc; -pub mod walk_cache; pub use gc::GcStats; pub mod gitignore; mod grow_stack; diff --git a/src/engine/walk_cache.rs b/src/engine/walk_cache.rs deleted file mode 100644 index 49957257..00000000 --- a/src/engine/walk_cache.rs +++ /dev/null @@ -1,323 +0,0 @@ -//! Cross-run cache of filesystem-walk results. -//! -//! Several plugins re-walk the workspace tree on every run because their targets -//! are intentionally uncacheable: the fs `Driver` re-globs source files, the -//! buildfile `Provider` re-discovers packages. The tree rarely changes between -//! runs, so these walks are repeated work. -//! -//! [`WalkCache`] memoizes a walk's result across runs in the durable cache's -//! namespaced KV store (see [`LocalCache::kv_get`]). Each entry pairs a -//! [`WalkSignature`] — directory mtimes (the matched *set*) plus optional -//! per-file `(size, mtime)` (file *content*) — with a borsh value. A lookup -//! returns the value only when the signature still validates against the live -//! tree; otherwise the caller walks and re-inserts. -//! -//! `mtime+size` is a fast-path proxy for content identity (heph otherwise hashes -//! 
content precisely); a same-size in-place rewrite within the filesystem's mtime -//! granularity can be missed — an accepted tradeoff. Everything is -//! correct-by-fallback: a missing/disabled store, a decode error, or any -//! validation mismatch simply makes the caller re-walk. - -use crate::engine::local_cache::LocalCache; -use borsh::{BorshDeserialize, BorshSerialize}; -use parking_lot::Mutex; -use rustc_hash::FxHashMap; -use std::path::Path; -use std::sync::Arc; - -/// Nanoseconds since the unix epoch for `meta`'s mtime, or `None` if unreadable -/// (pre-epoch or unsupported) — a walk with any unreadable mtime is not cached. -pub fn mtime_ns(meta: &std::fs::Metadata) -> Option { - let t = meta.modified().ok()?; - let d = t.duration_since(std::time::UNIX_EPOCH).ok()?; - i64::try_from(d.as_nanos()).ok() -} - -/// Validation fingerprint for a filesystem walk: the directories it descended -/// (by mtime) and, optionally, the files it read (by size + mtime). -#[derive(Clone, Default, Debug, BorshSerialize, BorshDeserialize)] -pub struct WalkSignature { - /// `(path relative to root, mtime_ns)` for every directory descended. A - /// directory's mtime bumps on any entry add/remove/rename, so matching all of - /// them proves the matched file *set* is unchanged without re-reading them. - pub dirs: Vec<(String, i64)>, - /// `(path relative to root, size, mtime_ns)` for content-sensitive walks. - /// Empty when only the directory *set* matters (e.g. discovering which dirs - /// contain a marker file). - pub files: Vec<(String, u64, i64)>, -} - -impl WalkSignature { - /// Record a directory's mtime under `root`. Returns `false` if the mtime is - /// unreadable (⇒ the caller should mark the walk non-persistable). - pub fn push_dir(&mut self, rel: impl Into, meta: &std::fs::Metadata) -> bool { - match mtime_ns(meta) { - Some(mt) => { - self.dirs.push((rel.into(), mt)); - true - } - None => false, - } - } - - /// Record a file's `(size, mtime)` under `root`. 
Returns `false` if the mtime - /// is unreadable. - pub fn push_file(&mut self, rel: impl Into, meta: &std::fs::Metadata) -> bool { - match mtime_ns(meta) { - Some(mt) => { - self.files.push((rel.into(), meta.len(), mt)); - true - } - None => false, - } - } - - /// True iff the tree under `root` still matches: every recorded directory - /// mtime and every recorded file `(size, mtime)` is unchanged. - pub fn is_valid(&self, root: &Path) -> bool { - for (rel, mt) in &self.dirs { - match std::fs::metadata(root.join(rel)) { - Ok(m) if m.is_dir() && mtime_ns(&m) == Some(*mt) => {} - _ => return false, - } - } - for (rel, size, mt) in &self.files { - match std::fs::metadata(root.join(rel)) { - Ok(m) if !m.is_dir() && m.len() == *size && mtime_ns(&m) == Some(*mt) => {} - _ => return false, - } - } - true - } -} - -const WALK_CACHE_VERSION: u32 = 1; - -#[derive(BorshSerialize, BorshDeserialize)] -struct StoredEntry { - version: u32, - sig: WalkSignature, - value: T, -} - -/// Cross-run, in-memory-fronted cache of walk results keyed by an arbitrary -/// string, backed by a [`LocalCache`] KV namespace. -/// -/// The KV namespace is scanned once (lazily, on first access) into an in-memory -/// map; lookups then serve from memory. Inserts write-through to the KV -/// incrementally, so a pure cache-hit run performs no writes. Constructed with -/// `None` (or a backend whose KV is a no-op) it degrades to always-miss. -pub struct WalkCache { - cache: Option>, - ns: String, - inner: Mutex>, -} - -struct Inner { - loaded: bool, - map: FxHashMap>>, -} - -impl WalkCache -where - T: BorshSerialize + BorshDeserialize + Clone, -{ - /// A cache backed by `cache`'s KV namespace `ns`. `None` disables it - /// (always-miss, no writes). 
- pub fn new(cache: Option>, ns: impl Into) -> Self { - Self { - cache, - ns: ns.into(), - inner: Mutex::new(Inner { - loaded: false, - map: FxHashMap::default(), - }), - } - } - - fn ensure_loaded(&self, cache: &dyn LocalCache, inner: &mut Inner) { - if inner.loaded { - return; - } - inner.loaded = true; - let Ok(rows) = cache.kv_list(&self.ns) else { - return; - }; - for (k, bytes) in rows { - if let Ok(entry) = borsh::from_slice::>(&bytes) - && entry.version == WALK_CACHE_VERSION - { - inner.map.insert(k, Arc::new(entry)); - } - } - } - - /// Returns the cached value for `key` if its [`WalkSignature`] still validates - /// against the tree at `root`; otherwise `None` (the caller should walk and - /// [`insert`](Self::insert)). - pub fn get(&self, key: &str, root: &Path) -> Option { - let cache = self.cache.as_deref()?; - let entry = { - let mut inner = self.inner.lock(); - self.ensure_loaded(cache, &mut inner); - inner.map.get(key).cloned() - }?; - entry.sig.is_valid(root).then(|| entry.value.clone()) - } - - /// Records a fresh walk result: updates the in-memory map and write-through - /// to the KV (best-effort). No-op when the cache is disabled. - pub fn insert(&self, key: impl Into, sig: WalkSignature, value: T) { - let Some(cache) = self.cache.as_deref() else { - return; - }; - let key = key.into(); - let entry = Arc::new(StoredEntry { - version: WALK_CACHE_VERSION, - sig, - value, - }); - if let Ok(bytes) = borsh::to_vec(&*entry) { - drop(cache.kv_put(&self.ns, &key, &bytes)); - } - self.inner.lock().map.insert(key, entry); - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::collections::HashMap; - - /// Minimal `LocalCache` exposing only the KV methods over an in-memory map. 
- #[derive(Default)] - struct KvMock { - kv: Mutex>>, - } - impl LocalCache for KvMock { - fn reader( - &self, - _a: &crate::htaddr::Addr, - _h: &str, - _n: &str, - ) -> anyhow::Result { - unimplemented!() - } - fn writer( - &self, - _a: &crate::htaddr::Addr, - _h: &str, - _n: &str, - ) -> anyhow::Result> { - unimplemented!() - } - fn exists(&self, _a: &crate::htaddr::Addr, _h: &str, _n: &str) -> anyhow::Result { - Ok(false) - } - fn delete(&self, _a: &crate::htaddr::Addr, _h: &str, _n: &str) -> anyhow::Result<()> { - Ok(()) - } - fn kv_get(&self, ns: &str, k: &str) -> anyhow::Result>> { - Ok(self.kv.lock().get(&(ns.to_owned(), k.to_owned())).cloned()) - } - fn kv_list(&self, ns: &str) -> anyhow::Result)>> { - Ok(self - .kv - .lock() - .iter() - .filter(|((n, _), _)| n == ns) - .map(|((_, k), v)| (k.clone(), v.clone())) - .collect()) - } - fn kv_put(&self, ns: &str, k: &str, v: &[u8]) -> anyhow::Result<()> { - self.kv - .lock() - .insert((ns.to_owned(), k.to_owned()), v.to_vec()); - Ok(()) - } - } - - #[test] - fn signature_validates_and_detects_changes() { - let dir = tempfile::tempdir().unwrap(); - let root = dir.path(); - std::fs::create_dir(root.join("d")).unwrap(); - std::fs::write(root.join("d/f"), b"abc").unwrap(); - - let mut sig = WalkSignature::default(); - sig.push_dir("d", &std::fs::metadata(root.join("d")).unwrap()); - sig.push_file("d/f", &std::fs::metadata(root.join("d/f")).unwrap()); - assert!(sig.is_valid(root), "unchanged tree validates"); - - // Content change (different size) invalidates. 
- std::fs::write(root.join("d/f"), b"a longer body").unwrap(); - assert!(!sig.is_valid(root), "changed file size invalidates"); - } - - #[test] - fn signature_detects_dir_mtime_change() { - let dir = tempfile::tempdir().unwrap(); - let root = dir.path(); - std::fs::create_dir(root.join("d")).unwrap(); - let mut sig = WalkSignature::default(); - sig.push_dir("d", &std::fs::metadata(root.join("d")).unwrap()); - assert!(sig.is_valid(root)); - - std::fs::File::open(root.join("d")) - .unwrap() - .set_modified(std::time::SystemTime::now() + std::time::Duration::from_secs(7200)) - .unwrap(); - assert!(!sig.is_valid(root), "bumped dir mtime invalidates"); - } - - #[test] - fn cache_roundtrips_through_kv_and_validates() { - let dir = tempfile::tempdir().unwrap(); - let root = dir.path(); - std::fs::create_dir(root.join("d")).unwrap(); - - let backend: Arc = Arc::new(KvMock::default()); - let mut sig = WalkSignature::default(); - sig.push_dir("d", &std::fs::metadata(root.join("d")).unwrap()); - - // First WalkCache populates the KV. - { - let wc: WalkCache> = WalkCache::new(Some(backend.clone()), "test"); - assert!(wc.get("k", root).is_none(), "cold miss"); - wc.insert( - "k", - sig.clone(), - vec!["pkg/a".to_string(), "pkg/b".to_string()], - ); - assert_eq!( - wc.get("k", root).unwrap().len(), - 2, - "warm hit, same process" - ); - } - - // A fresh WalkCache loads from the shared KV (simulates a new run). - let wc2: WalkCache> = WalkCache::new(Some(backend.clone()), "test"); - assert_eq!( - wc2.get("k", root).unwrap(), - vec!["pkg/a".to_string(), "pkg/b".to_string()], - "fresh cache reloads from KV and validates" - ); - - // After a dir mtime bump the entry no longer validates. 
- std::fs::File::open(root.join("d")) - .unwrap() - .set_modified(std::time::SystemTime::now() + std::time::Duration::from_secs(7200)) - .unwrap(); - let wc3: WalkCache> = WalkCache::new(Some(backend), "test"); - assert!(wc3.get("k", root).is_none(), "stale entry invalidates"); - } - - #[test] - fn disabled_cache_always_misses() { - let dir = tempfile::tempdir().unwrap(); - let wc: WalkCache> = WalkCache::new(None, "test"); - wc.insert("k", WalkSignature::default(), vec!["x".to_string()]); - assert!(wc.get("k", dir.path()).is_none()); - } -} diff --git a/src/htwalk/cached_walker.rs b/src/htwalk/cached_walker.rs new file mode 100644 index 00000000..4223328b --- /dev/null +++ b/src/htwalk/cached_walker.rs @@ -0,0 +1,635 @@ +//! Shared, on-demand, cross-run cached filesystem walker. +//! +//! Every tree-walking plugin (the `fs` glob driver, the buildfile package +//! discovery) repeats the same `readdir` + `stat` + content-hash work each run +//! over a tree that rarely changes. This walker caches that work *per path*, +//! independent of who asks: two consumers reading the same directory share one +//! cached listing, and one consumer hashing a file shares it with another. +//! +//! It is **consumer-agnostic** and **on-demand**: the walker only answers +//! "what's in this directory?" ([`read_dir`]) and "what's this file's content +//! hash?" ([`file_hash`]). Filtering (globs, excludes, skip dirs) and the +//! decision of whether to recurse belong to the *consumer* — so a requester that +//! stops shallow and one that recurses deep both reuse the dirs they share and +//! independently cache the ones they don't. +//! +//! Validation is mtime/size based (a fast-path proxy for content identity, the +//! same tradeoff accepted elsewhere): a directory's listing is reused while its +//! mtime is unchanged (mtime bumps on any entry add/remove/rename); a file's +//! hash is reused while its `(size, mtime)` is unchanged. Everything is +//! 
correct-by-fallback — a missing/locked db, a decode error, or any mismatch +//! just re-reads from disk. +//! +//! Backed by a dedicated `fswalk.db` (separate from the artifact cache so it can +//! be pruned independently). Rows carry a last-access stamp; [`prune`] drops +//! stale and orphaned rows so the db cannot grow without bound. +//! +//! [`read_dir`]: CachedWalker::read_dir +//! [`file_hash`]: CachedWalker::file_hash +//! [`prune`]: CachedWalker::prune + +use anyhow::{Context, Result}; +use borsh::{BorshDeserialize, BorshSerialize}; +use parking_lot::Mutex; +use r2d2_sqlite::SqliteConnectionManager; +use rusqlite::{Connection, OpenFlags}; +use rustc_hash::FxHashMap; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::{SystemTime, UNIX_EPOCH}; +use xxhash_rust::xxh3::Xxh3; + +/// Default time-to-live for fswalk rows: entries untouched for this long are +/// dropped by [`CachedWalker::prune`]. +pub const DEFAULT_TTL: std::time::Duration = std::time::Duration::from_secs(14 * 24 * 60 * 60); + +/// The kind of a directory entry, from the platform `d_type` (no extra stat). +#[derive(Clone, Copy, Debug, PartialEq, Eq, BorshSerialize, BorshDeserialize)] +pub enum EntryKind { + File, + Dir, + Symlink, + Other, +} + +/// One entry in a directory listing: its file name (not a full path) and kind. +#[derive(Clone, Debug, BorshSerialize, BorshDeserialize)] +pub struct DirEntry { + pub name: String, + pub kind: EntryKind, +} + +/// A cached directory listing. Entries are sorted by name for determinism. +#[derive(Clone, Debug, Default, BorshSerialize, BorshDeserialize)] +pub struct DirListing { + pub entries: Vec, +} + +/// A cached file content hash plus the stat fields that validate it. +#[derive(Clone, Debug)] +pub struct FileHash { + pub size: u64, + pub mtime_ns: i64, + pub exec: bool, + /// xxh3 of the file content with the exec bit folded in (see [`file_hashout`]). 
+ pub hashout: String, +} + +/// Content identity for a sourced file: a hash of its bytes plus the executable +/// marker. Deliberately ignores size and mtime — only the content and the `x` +/// bit determine the artifact, so a file rewritten with identical bytes (new +/// mtime, same content) hashes the same and stays a cache hit. (mtime/size are +/// used only as a *cache validation* fast-path, never as the identity.) +pub fn file_hashout(path: &Path, x: bool) -> Result { + use std::io::Read as _; + let file = std::fs::File::open(path) + .with_context(|| format!("open file for hashing '{}'", path.display()))?; + let mut reader = std::io::BufReader::new(file); + let mut h = Xxh3::new(); + // Stream in chunks so large inputs never load wholesale into memory. + let mut buf = [0u8; 64 * 1024]; + loop { + let n = reader + .read(&mut buf) + .with_context(|| format!("read file for hashing '{}'", path.display()))?; + if n == 0 { + break; + } + if let Some(chunk) = buf.get(..n) { + h.update(chunk); + } + } + h.update(&[x as u8]); + Ok(format!("{:x}", h.digest())) +} + +#[cfg(unix)] +fn is_exec(meta: &std::fs::Metadata) -> bool { + use std::os::unix::fs::PermissionsExt; + meta.permissions().mode() & 0o111 != 0 +} + +#[cfg(not(unix))] +fn is_exec(_meta: &std::fs::Metadata) -> bool { + false +} + +fn mtime_ns(meta: &std::fs::Metadata) -> Option { + let d = meta.modified().ok()?.duration_since(UNIX_EPOCH).ok()?; + i64::try_from(d.as_nanos()).ok() +} + +fn now_ns() -> i64 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .ok() + .and_then(|d| i64::try_from(d.as_nanos()).ok()) + .unwrap_or(0) +} + +fn entry_kind(ft: std::fs::FileType) -> EntryKind { + if ft.is_dir() { + EntryKind::Dir + } else if ft.is_file() { + EntryKind::File + } else if ft.is_symlink() { + EntryKind::Symlink + } else { + EntryKind::Other + } +} + +/// Shared cached filesystem walker. Cheap to clone (`Arc` the whole thing). 
+pub struct CachedWalker {
+    /// Durable cross-run store. `None` for a disabled walker or when the db
+    /// could not be opened; every lookup then falls through to the filesystem.
+    store: Option<FsWalkStore>,
+    /// In-process front for directory listings, validated against live mtime.
+    dirs: Mutex<FxHashMap<PathBuf, (i64, Arc<DirListing>)>>,
+    /// In-process front for file hashes, validated against live
+    /// (size, mtime, exec).
+    files: Mutex<FxHashMap<PathBuf, Arc<FileHash>>>,
+    /// Paths read-served from the durable store this process; their last-access
+    /// stamps are refreshed in one batch on drop (so reads stay write-free).
+    touched_dirs: Mutex<Vec<String>>,
+    touched_files: Mutex<Vec<String>>,
+}
+
+impl CachedWalker {
+    /// Open (or create) the walker backed by the sqlite db at `db_path`. On any
+    /// db-open failure the walker still works — it just degrades to always
+    /// re-reading from disk (no cross-run cache).
+    pub fn open(db_path: &Path) -> Self {
+        let store = match FsWalkStore::open(db_path) {
+            Ok(s) => Some(s),
+            Err(e) => {
+                // Best-effort: a broken cache must never fail the walk itself.
+                tracing::warn!(error = %format!("{e:#}"), "fswalk cache disabled");
+                None
+            }
+        };
+        Self {
+            store,
+            dirs: Mutex::new(FxHashMap::default()),
+            files: Mutex::new(FxHashMap::default()),
+            touched_dirs: Mutex::new(Vec::new()),
+            touched_files: Mutex::new(Vec::new()),
+        }
+    }
+
+    /// A disabled walker (no caching) for tests / contexts without a db.
+    pub fn disabled() -> Self {
+        Self {
+            store: None,
+            dirs: Mutex::new(FxHashMap::default()),
+            files: Mutex::new(FxHashMap::default()),
+            touched_dirs: Mutex::new(Vec::new()),
+            touched_files: Mutex::new(Vec::new()),
+        }
+    }
+
+    /// The cached listing of directory `dir` (absolute). A missing directory
+    /// lists empty. The caller filters entries and decides whether to recurse.
+    ///
+    /// Returns an error when `dir` cannot be stat'ed or read for any reason
+    /// other than not existing.
+    pub fn read_dir(&self, dir: &Path) -> Result<Arc<DirListing>> {
+        let meta = match std::fs::metadata(dir) {
+            Ok(m) => m,
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+                // Deliberately not cached: nothing is recorded for a missing dir.
+                return Ok(Arc::new(DirListing::default()));
+            }
+            Err(e) => return Err(e).with_context(|| format!("stat dir '{}'", dir.display())),
+        };
+        if !meta.is_dir() {
+            return Ok(Arc::new(DirListing::default()));
+        }
+        // `None` when the mtime is unreadable; every cache path is then skipped.
+        let live_mtime = mtime_ns(&meta);
+
+        // In-process front: reuse while the live mtime matches.
+        if let Some(mt) = live_mtime
+            && let Some((cached_mt, listing)) = self.dirs.lock().get(dir).cloned()
+            && cached_mt == mt
+        {
+            return Ok(listing);
+        }
+
+        // Durable store: reuse while the recorded mtime matches. An undecodable
+        // blob is treated as a miss.
+        if let Some(mt) = live_mtime
+            && let Some(store) = self.store.as_ref()
+            && let Some((stored_mtime, blob)) = store.get_dir(dir)
+            && stored_mtime == mt
+            && let Ok(listing) = borsh::from_slice::<DirListing>(&blob)
+        {
+            let listing = Arc::new(listing);
+            self.dirs
+                .lock()
+                .insert(dir.to_path_buf(), (mt, listing.clone()));
+            // Remember the hit so its last-access stamp is refreshed on drop.
+            self.touched_dirs.lock().push(path_key(dir));
+            return Ok(listing);
+        }
+
+        // Miss: read the directory and cache it. Nothing is cached without a
+        // readable mtime or without a store (a disabled walker always re-reads).
+        let listing = Arc::new(read_dir_uncached(dir)?);
+        if let (Some(mt), Some(store)) = (live_mtime, self.store.as_ref()) {
+            if let Ok(blob) = borsh::to_vec(&*listing) {
+                store.put_dir(dir, mt, &blob, now_ns());
+            }
+            self.dirs
+                .lock()
+                .insert(dir.to_path_buf(), (mt, listing.clone()));
+        }
+        Ok(listing)
+    }
+
+    /// The cached content hash (and exec bit) of file `file` (absolute), following
+    /// symlinks. Validated by `(size, mtime, exec)`; re-hashed on a mismatch.
+    ///
+    /// The exec bit is part of the validation because `hashout` folds it in and a
+    /// permission change (`chmod`) alters neither size nor mtime: without the
+    /// check a file flipped to/from executable would keep serving its old hash.
+    ///
+    /// Returns an error when `file` cannot be stat'ed, is a directory, or cannot
+    /// be read for hashing.
+    pub fn file_hash(&self, file: &Path) -> Result<Arc<FileHash>> {
+        let meta =
+            std::fs::metadata(file).with_context(|| format!("stat file '{}'", file.display()))?;
+        // A directory (e.g. a symlink resolving to one) is not hashable — error so
+        // the caller skips it rather than trying to read it.
+        anyhow::ensure!(!meta.is_dir(), "'{}' is a directory", file.display());
+        let size = meta.len();
+        let live_mtime = mtime_ns(&meta);
+        let exec = is_exec(&meta);
+
+        // In-process front.
+        if let Some(mt) = live_mtime
+            && let Some(found) = self.files.lock().get(file).cloned()
+            && found.size == size
+            && found.mtime_ns == mt
+            && found.exec == exec
+        {
+            return Ok(found);
+        }
+
+        // Durable store. A row recorded under a different exec bit is stale: fall
+        // through to the miss path, which re-hashes and replaces the row.
+        if let Some(mt) = live_mtime
+            && let Some(store) = self.store.as_ref()
+            && let Some(fh) = store.get_file(file)
+            && fh.size == size
+            && fh.mtime_ns == mt
+            && fh.exec == exec
+        {
+            let fh = Arc::new(fh);
+            self.files.lock().insert(file.to_path_buf(), fh.clone());
+            // Remember the hit so its last-access stamp is refreshed on drop.
+            self.touched_files.lock().push(path_key(file));
+            return Ok(fh);
+        }
+
+        // Miss: hash the file and cache it. `-1` only marks an unreadable mtime on
+        // the returned value; such a result is never stored (see the guard below).
+        let hashout = file_hashout(file, exec)?;
+        let fh = Arc::new(FileHash {
+            size,
+            mtime_ns: live_mtime.unwrap_or(-1),
+            exec,
+            hashout,
+        });
+        if let (Some(_mt), Some(store)) = (live_mtime, self.store.as_ref()) {
+            store.put_file(file, &fh, now_ns());
+            self.files.lock().insert(file.to_path_buf(), fh.clone());
+        }
+        Ok(fh)
+    }
+
+    /// Drop rows untouched for longer than `ttl`, and (when `check_orphans`)
+    /// rows whose path no longer exists on disk. Returns the number removed
+    /// (always `0` for a walker without a store).
+    pub fn prune(&self, ttl: std::time::Duration, check_orphans: bool) -> Result<usize> {
+        match self.store.as_ref() {
+            Some(store) => store.prune(ttl, check_orphans),
+            None => Ok(0),
+        }
+    }
+
+    /// Flush this process's accumulated last-access stamps. Called on drop;
+    /// exposed for tests.
+    fn flush_touches(&self) {
+        let Some(store) = self.store.as_ref() else {
+            return;
+        };
+        let now = now_ns();
+        // Take the lists so a second flush does not re-touch the same paths.
+        let dirs = std::mem::take(&mut *self.touched_dirs.lock());
+        let files = std::mem::take(&mut *self.touched_files.lock());
+        if !dirs.is_empty() {
+            store.touch("dirs", &dirs, now);
+        }
+        if !files.is_empty() {
+            store.touch("files", &files, now);
+        }
+    }
+}
+
+impl Default for CachedWalker {
+    /// The default walker is the disabled one (no db, no caching).
+    fn default() -> Self {
+        Self::disabled()
+    }
+}
+
+impl Drop for CachedWalker {
+    /// Persist the last-access stamps gathered while serving reads.
+    fn drop(&mut self) {
+        self.flush_touches();
+    }
+}
+
+/// Read a directory directly (no cache), returning a sorted [`DirListing`].
+fn read_dir_uncached(dir: &Path) -> Result { + let rd = match std::fs::read_dir(dir) { + Ok(rd) => rd, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(DirListing::default()), + Err(e) => return Err(e).with_context(|| format!("read dir '{}'", dir.display())), + }; + let mut entries = Vec::new(); + for entry in rd { + let entry = entry.with_context(|| format!("read dir entry in '{}'", dir.display()))?; + let Ok(ft) = entry.file_type() else { continue }; + let Ok(name) = entry.file_name().into_string() else { + continue; + }; + entries.push(DirEntry { + name, + kind: entry_kind(ft), + }); + } + entries.sort_by(|a, b| a.name.cmp(&b.name)); + Ok(DirListing { entries }) +} + +/// A path's db key. Lossy is fine — a non-UTF-8 path just gets a stable lossy +/// key; the worst case is a cache miss, never an incorrect hit (the on-disk +/// mtime/size still gate every reuse). +fn path_key(p: &Path) -> String { + p.to_string_lossy().into_owned() +} + +// ── Durable store ──────────────────────────────────────────────────────────── + +struct FsWalkStore { + read_pool: r2d2::Pool, + write: Mutex, +} + +impl FsWalkStore { + fn open(db_path: &Path) -> Result { + if let Some(parent) = db_path.parent() { + std::fs::create_dir_all(parent) + .with_context(|| format!("creating fswalk dir {parent:?}"))?; + } + let write = Connection::open(db_path) + .with_context(|| format!("opening fswalk db at {db_path:?}"))?; + write + .execute_batch( + "PRAGMA journal_mode = WAL; + PRAGMA busy_timeout = 10000; + PRAGMA synchronous = NORMAL; + PRAGMA temp_store = MEMORY; + PRAGMA mmap_size = 268435456; + CREATE TABLE IF NOT EXISTS dirs ( + path TEXT PRIMARY KEY, + mtime_ns INTEGER NOT NULL, + entries BLOB NOT NULL, + accessed_ns INTEGER NOT NULL + ); + CREATE TABLE IF NOT EXISTS files ( + path TEXT PRIMARY KEY, + size INTEGER NOT NULL, + mtime_ns INTEGER NOT NULL, + exec INTEGER NOT NULL, + hashout TEXT NOT NULL, + accessed_ns INTEGER NOT NULL + );", + ) + .context("initialising 
fswalk schema")?; + + let manager = SqliteConnectionManager::file(db_path) + .with_flags(OpenFlags::SQLITE_OPEN_READ_ONLY) + .with_init(|c| { + c.execute_batch( + "PRAGMA busy_timeout = 10000; + PRAGMA temp_store = MEMORY; + PRAGMA mmap_size = 268435456;", + ) + }); + let read_pool = r2d2::Pool::builder() + .max_size(16) + .min_idle(Some(1)) + .build(manager) + .context("building fswalk read pool")?; + + Ok(Self { + read_pool, + write: Mutex::new(write), + }) + } + + fn get_dir(&self, path: &Path) -> Option<(i64, Vec)> { + let conn = self.read_pool.get().ok()?; + let mut stmt = conn + .prepare_cached("SELECT mtime_ns, entries FROM dirs WHERE path = ?1") + .ok()?; + stmt.query_row([path_key(path)], |r| Ok((r.get(0)?, r.get(1)?))) + .ok() + } + + fn put_dir(&self, path: &Path, mtime_ns: i64, entries: &[u8], now: i64) { + let conn = self.write.lock(); + drop(conn.execute( + "INSERT OR REPLACE INTO dirs (path, mtime_ns, entries, accessed_ns) \ + VALUES (?1, ?2, ?3, ?4)", + rusqlite::params![path_key(path), mtime_ns, entries, now], + )); + } + + fn get_file(&self, path: &Path) -> Option { + let conn = self.read_pool.get().ok()?; + let mut stmt = conn + .prepare_cached("SELECT size, mtime_ns, exec, hashout FROM files WHERE path = ?1") + .ok()?; + stmt.query_row([path_key(path)], |r| { + Ok(FileHash { + size: u64::try_from(r.get::<_, i64>(0)?).unwrap_or(0), + mtime_ns: r.get(1)?, + exec: r.get::<_, i64>(2)? != 0, + hashout: r.get(3)?, + }) + }) + .ok() + } + + fn put_file(&self, path: &Path, fh: &FileHash, now: i64) { + let conn = self.write.lock(); + drop(conn.execute( + "INSERT OR REPLACE INTO files (path, size, mtime_ns, exec, hashout, accessed_ns) \ + VALUES (?1, ?2, ?3, ?4, ?5, ?6)", + rusqlite::params![ + path_key(path), + i64::try_from(fh.size).unwrap_or(i64::MAX), + fh.mtime_ns, + fh.exec as i64, + fh.hashout, + now + ], + )); + } + + /// Refresh last-access for a batch of paths in `table` (chunked under the + /// sqlite variable limit). Best-effort. 
+ fn touch(&self, table: &str, paths: &[String], now: i64) { + let conn = self.write.lock(); + for chunk in paths.chunks(400) { + let placeholders = std::iter::repeat_n("?", chunk.len()) + .collect::>() + .join(","); + let sql = format!("UPDATE {table} SET accessed_ns = ? WHERE path IN ({placeholders})"); + let Ok(mut stmt) = conn.prepare(&sql) else { + continue; + }; + let mut params: Vec<&dyn rusqlite::ToSql> = Vec::with_capacity(chunk.len() + 1); + params.push(&now); + for p in chunk { + params.push(p); + } + drop(stmt.execute(params.as_slice())); + } + } + + fn prune(&self, ttl: std::time::Duration, check_orphans: bool) -> Result { + let cutoff = now_ns().saturating_sub(i64::try_from(ttl.as_nanos()).unwrap_or(i64::MAX)); + let conn = self.write.lock(); + let mut removed = 0usize; + removed += conn + .execute("DELETE FROM dirs WHERE accessed_ns < ?1", [cutoff]) + .context("pruning stale fswalk dirs")?; + removed += conn + .execute("DELETE FROM files WHERE accessed_ns < ?1", [cutoff]) + .context("pruning stale fswalk files")?; + + if check_orphans { + for table in ["dirs", "files"] { + let paths: Vec = { + let mut stmt = conn.prepare(&format!("SELECT path FROM {table}"))?; + let rows = stmt.query_map([], |r| r.get::<_, String>(0))?; + rows.filter_map(|r| r.ok()) + .filter(|p| !Path::new(p).exists()) + .collect() + }; + for chunk in paths.chunks(400) { + let placeholders = std::iter::repeat_n("?", chunk.len()) + .collect::>() + .join(","); + let sql = format!("DELETE FROM {table} WHERE path IN ({placeholders})"); + let params: Vec<&dyn rusqlite::ToSql> = + chunk.iter().map(|p| p as &dyn rusqlite::ToSql).collect(); + removed += conn.execute(&sql, params.as_slice()).unwrap_or(0); + } + } + } + Ok(removed) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + fn walker(dir: &Path) -> CachedWalker { + CachedWalker::open(&dir.join("fswalk.db")) + } + + #[test] + fn read_dir_caches_and_revalidates() { + let tmp = tempdir().unwrap(); + let root 
= tmp.path(); + std::fs::create_dir(root.join("pkg")).unwrap(); + std::fs::write(root.join("pkg").join("a.txt"), b"a").unwrap(); + + let w = walker(root); + let l1 = w.read_dir(&root.join("pkg")).unwrap(); + assert_eq!(l1.entries.len(), 1); + assert_eq!(l1.entries[0].name, "a.txt"); + assert_eq!(l1.entries[0].kind, EntryKind::File); + + // A fresh walker sharing the db reloads the listing without re-reading + // (we can't observe the readdir directly, but the content must match). + drop(w); + let w2 = walker(root); + let l2 = w2.read_dir(&root.join("pkg")).unwrap(); + assert_eq!(l2.entries.len(), 1); + + // Add a file → dir mtime bumps → listing refreshes. + std::fs::write(root.join("pkg").join("b.txt"), b"b").unwrap(); + std::fs::File::open(root.join("pkg")) + .unwrap() + .set_modified(SystemTime::now() + std::time::Duration::from_secs(7200)) + .unwrap(); + let l3 = w2.read_dir(&root.join("pkg")).unwrap(); + assert_eq!(l3.entries.len(), 2); + } + + #[test] + fn missing_dir_lists_empty() { + let tmp = tempdir().unwrap(); + let w = walker(tmp.path()); + let l = w.read_dir(&tmp.path().join("nope")).unwrap(); + assert!(l.entries.is_empty()); + } + + #[test] + fn file_hash_caches_and_revalidates() { + let tmp = tempdir().unwrap(); + let root = tmp.path(); + let f = root.join("f"); + std::fs::write(&f, b"hello").unwrap(); + + let w = walker(root); + let h1 = w.file_hash(&f).unwrap(); + // Matches the direct hash. + assert_eq!(h1.hashout, file_hashout(&f, h1.exec).unwrap()); + + // Fresh walker reloads the same hash from the db. + drop(w); + let w2 = walker(root); + let h2 = w2.file_hash(&f).unwrap(); + assert_eq!(h1.hashout, h2.hashout); + + // Content change (different size) → re-hash, different value. 
+ std::fs::write(&f, b"a different, longer body").unwrap(); + let h3 = w2.file_hash(&f).unwrap(); + assert_ne!(h1.hashout, h3.hashout); + } + + #[test] + fn prune_drops_stale_rows() { + let tmp = tempdir().unwrap(); + let root = tmp.path(); + std::fs::create_dir(root.join("d")).unwrap(); + let w = walker(root); + w.read_dir(&root.join("d")).unwrap(); + w.flush_touches(); + + // TTL of zero ⇒ everything is stale ⇒ pruned. + let removed = w.prune(std::time::Duration::from_secs(0), false).unwrap(); + assert!(removed >= 1, "stale dir row pruned"); + } + + #[test] + fn prune_drops_orphaned_rows() { + let tmp = tempdir().unwrap(); + let dbdir = tempdir().unwrap(); + let gone = tmp.path().join("gone"); + std::fs::create_dir(&gone).unwrap(); + + let w = walker(dbdir.path()); + w.read_dir(&gone).unwrap(); + + // Delete the directory, then prune with a long TTL: the row survives TTL + // (just written) but is removed as an orphan (its path no longer exists). + std::fs::remove_dir_all(&gone).unwrap(); + let removed = w.prune(DEFAULT_TTL, true).unwrap(); + assert!(removed >= 1, "orphaned dir row pruned"); + // And a second prune finds nothing. + assert_eq!(w.prune(DEFAULT_TTL, true).unwrap(), 0); + } + + #[test] + fn disabled_walker_still_reads() { + let tmp = tempdir().unwrap(); + std::fs::create_dir(tmp.path().join("d")).unwrap(); + std::fs::write(tmp.path().join("d").join("x"), b"").unwrap(); + let w = CachedWalker::disabled(); + let l = w.read_dir(&tmp.path().join("d")).unwrap(); + assert_eq!(l.entries.len(), 1); + } +} diff --git a/src/htwalk/mod.rs b/src/htwalk/mod.rs index cf002f86..1b1b4ee2 100644 --- a/src/htwalk/mod.rs +++ b/src/htwalk/mod.rs @@ -19,6 +19,9 @@ use std::path::{Path, PathBuf}; use std::sync::Arc; use wax::{Any, Glob, Program as _}; +pub mod cached_walker; +pub use cached_walker::{CachedWalker, DirEntry, DirListing, EntryKind, FileHash, file_hashout}; + /// Directory + glob ignore rules for a filesystem walk. See the module docs. 
#[derive(Debug, Clone)] pub struct Ignore { diff --git a/src/pluginbuildfile/provider.rs b/src/pluginbuildfile/provider.rs index e2b6d42a..b9e08fba 100644 --- a/src/pluginbuildfile/provider.rs +++ b/src/pluginbuildfile/provider.rs @@ -1,16 +1,14 @@ -use crate::engine::LocalCache; use crate::engine::provider::GetError::NotFound; use crate::engine::provider::{ ConfigRequest, ConfigResponse, GetError, GetRequest, GetResponse, ListPackageResponse, ListPackagesRequest, ListRequest, ListResponse, ProbeRequest, ProbeResponse, Provider as EProvider, ProviderFunctionRegistry, State, TargetSpec, }; -use crate::engine::walk_cache::{WalkCache, WalkSignature}; use crate::hasync::Cancellable; use crate::hmemoizer::Memoizer; use crate::htaddr::Addr; use crate::htpkg::PkgBuf; -use crate::htwalk::Ignore; +use crate::htwalk::{CachedWalker, Ignore}; use crate::pluginbuildfile::run_file::RunResult; use anyhow::Context; use enclose::enclose; @@ -57,12 +55,13 @@ pub struct Provider { /// Lazily-built Starlark globals (built from `function_registry` on first eval), /// shared with every `BuildFileLoader` so the namespace is built at most once. pub(crate) globals: Arc>, - /// Cross-run cache of the package-discovery walk, validated by directory - /// mtimes only (a BUILD file's *contents* don't change the package set — - /// that's handled by `pkg_cache`). Disabled until [`with_cache`] is called. + /// Shared cross-run filesystem-walk cache. The package-discovery walk reads + /// directories through it, so an unchanged tree skips `readdir` entirely (a + /// BUILD file's *contents* don't change the package set — that's handled by + /// `pkg_cache`). Disabled until [`with_walker`] is called. 
/// - /// [`with_cache`]: Provider::with_cache - pub(crate) packages_walk_cache: Arc>>, + /// [`with_walker`]: Provider::with_walker + pub(crate) walker: Arc, } impl Default for Provider { @@ -79,14 +78,11 @@ impl Default for Provider { dir_cache: Arc::new(Mutex::new(HashMap::new())), function_registry: OnceLock::new(), globals: Arc::new(OnceLock::new()), - packages_walk_cache: Arc::new(WalkCache::new(None, PACKAGES_CACHE_NS)), + walker: Arc::new(CachedWalker::disabled()), } } } -/// KV namespace for the buildfile package-discovery walk cache. -const PACKAGES_CACHE_NS: &str = "pluginbuildfile.packages"; - impl Provider { pub fn new(root: std::path::PathBuf) -> Self { Self { @@ -95,11 +91,11 @@ impl Provider { } } - /// Enable the cross-run package-discovery cache backed by `cache`'s KV store. - /// Without this the provider re-walks the tree to discover packages on every - /// run (the in-process `packages_cache` only dedupes within one run). - pub fn with_cache(mut self, cache: Option>) -> Self { - self.packages_walk_cache = Arc::new(WalkCache::new(cache, PACKAGES_CACHE_NS)); + /// Use `walker` (the shared cross-run fs-walk cache) for package discovery, so + /// an unchanged tree skips `readdir`. Without it the provider walks the tree + /// live every run (the in-process `packages_cache` only dedupes within a run). + pub fn with_walker(mut self, walker: Arc) -> Self { + self.walker = walker; self } @@ -143,70 +139,35 @@ impl Provider { } } -/// Stable cross-run cache key for a package-discovery walk: the (sorted) build -/// file patterns and skip globs that determine which dirs are visited and which -/// count as packages. `root` is not included — it's the validation reference. 
-fn packages_cache_key(patterns: &[glob::Pattern], skip: &Ignore) -> String { - let mut pats: Vec<&str> = patterns.iter().map(glob::Pattern::as_str).collect(); - pats.sort_unstable(); - let mut globs: Vec<&str> = skip.globs().iter().map(String::as_str).collect(); - globs.sort_unstable(); - format!("{}\u{1}{}", pats.join(","), globs.join(",")) -} - +/// Recursively discover packages under `path`, reading each directory through +/// the shared [`CachedWalker`] (so an unchanged tree skips `readdir`). Filtering +/// (build-file pattern, skip-dir pruning) is applied here. fn find_packages_sync( + walker: &CachedWalker, path: &std::path::Path, root: &std::path::Path, patterns: &[glob::Pattern], skip: &Ignore, packages: &mut std::collections::HashSet, - sig: &mut WalkSignature, - persistable: &mut bool, ) -> anyhow::Result<()> { - // Record this directory's mtime for the cross-run cache: it bumps on any - // entry add/remove/rename, so matching every descended dir proves the package - // set is unchanged without re-reading the tree. - match ( - path.strip_prefix(root).ok().and_then(|r| r.to_str()), - std::fs::metadata(path).ok(), - ) { - (Some(rel), Some(meta)) if meta.is_dir() => { - if !sig.push_dir(rel, &meta) { - *persistable = false; - } - } - _ => *persistable = false, - } - + let listing = walker.read_dir(path)?; let mut has_build_file = false; - for entry in std::fs::read_dir(path).with_context(|| format!("reading {}", path.display()))? 
{ - let entry = entry?; - let Ok(ft) = entry.file_type() else { continue }; - let entry_path = entry.path(); - - if ft.is_file() { - if entry_path - .file_name() - .and_then(|n| n.to_str()) - .map(|n| patterns.iter().any(|p| p.matches(n))) - .unwrap_or(false) - { - has_build_file = true; + for entry in &listing.entries { + match entry.kind { + crate::htwalk::EntryKind::File | crate::htwalk::EntryKind::Symlink => { + if patterns.iter().any(|p| p.matches(&entry.name)) { + has_build_file = true; + } } - } else if ft.is_dir() { - let rel = entry_path.strip_prefix(root).unwrap_or(&entry_path); - if skip.prune_dir(&entry_path, rel) { - continue; + crate::htwalk::EntryKind::Dir => { + let entry_path = path.join(&entry.name); + let rel = entry_path.strip_prefix(root).unwrap_or(&entry_path); + if skip.prune_dir(&entry_path, rel) { + continue; + } + find_packages_sync(walker, &entry_path, root, patterns, skip, packages)?; } - find_packages_sync( - &entry_path, - root, - patterns, - skip, - packages, - sig, - persistable, - )?; + crate::htwalk::EntryKind::Other => {} } } @@ -284,25 +245,13 @@ impl EProvider for Provider { .packages_cache .once( (), - enclose!((self.root => root, self.build_file_patterns => patterns, self.skip => skip, self.packages_walk_cache => walk_cache) move || async move { + enclose!((self.root => root, self.build_file_patterns => patterns, self.skip => skip, self.walker => walker) move || async move { let packages = crate::process_supervisor::block_or_inline(move || { - let key = packages_cache_key(&patterns, &skip); - // Cross-run cache hit: the directory set is unchanged. - if let Some(pkgs) = walk_cache.get(&key, &root) { - return Ok::<_, anyhow::Error>(pkgs); - } + // Recursion reads dirs through the shared walker, so an + // unchanged tree is served from the cross-run fswalk cache. 
let mut packages = std::collections::HashSet::new(); - let mut sig = WalkSignature::default(); - let mut persistable = true; - find_packages_sync( - &root, &root, &patterns, &skip, - &mut packages, &mut sig, &mut persistable, - )?; - let pkgs: Vec = packages.into_iter().collect(); - if persistable { - walk_cache.insert(key, sig, pkgs.clone()); - } - Ok(pkgs) + find_packages_sync(&walker, &root, &root, &patterns, &skip, &mut packages)?; + Ok::<_, anyhow::Error>(packages.into_iter().collect::>()) })?; Ok(Arc::new(packages)) }), @@ -411,69 +360,23 @@ mod tests { use std::fs; use tempfile::tempdir; - /// Minimal `LocalCache` exposing only the KV methods over an in-memory map. - #[derive(Default)] - struct KvMock { - kv: Mutex>>, - } - impl LocalCache for KvMock { - fn reader( - &self, - _a: &Addr, - _h: &str, - _n: &str, - ) -> anyhow::Result { - unimplemented!() - } - fn writer(&self, _a: &Addr, _h: &str, _n: &str) -> anyhow::Result> { - unimplemented!() - } - fn exists(&self, _a: &Addr, _h: &str, _n: &str) -> anyhow::Result { - Ok(false) - } - fn delete(&self, _a: &Addr, _h: &str, _n: &str) -> anyhow::Result<()> { - Ok(()) - } - fn kv_get(&self, ns: &str, k: &str) -> anyhow::Result>> { - Ok(self - .kv - .lock() - .unwrap() - .get(&(ns.to_owned(), k.to_owned())) - .cloned()) - } - fn kv_list(&self, ns: &str) -> anyhow::Result)>> { - Ok(self - .kv - .lock() - .unwrap() - .iter() - .filter(|((n, _), _)| n == ns) - .map(|((_, k), v)| (k.clone(), v.clone())) - .collect()) - } - fn kv_put(&self, ns: &str, k: &str, v: &[u8]) -> anyhow::Result<()> { - self.kv - .lock() - .unwrap() - .insert((ns.to_owned(), k.to_owned()), v.to_vec()); - Ok(()) - } - } - - /// Package discovery is cached across runs: a fresh provider sharing the KV - /// reuses the discovered set for an unchanged tree, and a newly-added package - /// (which bumps a recorded dir's mtime) is re-discovered. 
+ /// Package discovery is cached across runs through the shared walker: a fresh + /// provider sharing the fswalk db reuses the discovered set for an unchanged + /// tree, and a newly-added package (which bumps a recorded dir's mtime) is + /// re-discovered. #[tokio::test] async fn test_list_packages_cross_run_cache() { let tmp = tempdir().unwrap(); let root = tmp.path(); + // fswalk db outside the walked tree (in production it's under pruned + // `.heph3`), so its writes don't bump the discovered dirs' mtimes. + let dbdir = tempdir().unwrap(); + let db = dbdir.path().join("fswalk.db"); fs::write(root.join("BUILD"), "").unwrap(); let a = root.join("a"); fs::create_dir_all(&a).unwrap(); fs::write(a.join("BUILD"), "").unwrap(); - let backend: Arc = Arc::new(KvMock::default()); let list = |p: Provider| async move { let ctoken = StdCancellationToken::new(); let res = p @@ -489,22 +392,25 @@ mod tests { v.sort(); v }; + let provider = || { + Provider { + root: root.to_path_buf(), + ..Provider::default() + } + .with_walker(Arc::new(CachedWalker::open(&db))) + }; - let p1 = Provider { - root: root.to_path_buf(), - ..Provider::default() - } - .with_cache(Some(backend.clone())); - assert_eq!(list(p1).await, vec!["".to_string(), "a".to_string()]); + assert_eq!( + list(provider()).await, + vec!["".to_string(), "a".to_string()] + ); - // Fresh provider sharing the KV (simulates a new run) → same set, served - // from the cross-run cache for the unchanged tree. - let p2 = Provider { - root: root.to_path_buf(), - ..Provider::default() - } - .with_cache(Some(backend.clone())); - assert_eq!(list(p2).await, vec!["".to_string(), "a".to_string()]); + // Fresh provider sharing the walker db (new run) → same set, served from + // the cross-run readdir cache for the unchanged tree. + assert_eq!( + list(provider()).await, + vec!["".to_string(), "a".to_string()] + ); // Add a new package; bump root mtime so the recorded dir invalidates. 
let b = root.join("b"); @@ -515,13 +421,8 @@ mod tests { .set_modified(std::time::SystemTime::now() + std::time::Duration::from_secs(7200)) .unwrap(); - let p3 = Provider { - root: root.to_path_buf(), - ..Provider::default() - } - .with_cache(Some(backend.clone())); assert_eq!( - list(p3).await, + list(provider()).await, vec!["".to_string(), "a".to_string(), "b".to_string()], "a newly-added package is re-discovered" ); diff --git a/src/pluginfs/mod.rs b/src/pluginfs/mod.rs index 0a120fb9..8ee8f172 100644 --- a/src/pluginfs/mod.rs +++ b/src/pluginfs/mod.rs @@ -1,4 +1,3 @@ -use crate::engine::LocalCache; use crate::engine::driver::{ ApplyTransitiveRequest, ApplyTransitiveResponse, ConfigRequest, ConfigResponse, ParseRequest, ParseResponse, RunRequest, RunResponse, @@ -14,12 +13,11 @@ use crate::engine::provider::{ ListRequest, ListResponse, ProbeRequest, ProbeResponse, Provider as EProvider, ProviderFn, ProviderFunctionDef, TargetSpec, }; -use crate::engine::walk_cache::{WalkCache, WalkSignature}; use crate::hasync::Cancellable; use crate::htaddr::Addr; use crate::htpkg::PkgBuf; use crate::htvalue::Value; -use crate::htwalk::Ignore; +use crate::htwalk::{CachedWalker, Ignore}; use anyhow::Context; use async_trait::async_trait; use futures::future::BoxFuture; @@ -81,11 +79,13 @@ pub fn is_glob_addr(addr: &Addr) -> bool { pub struct Provider { /// Dirs the `glob` provider function must prune, shared with the driver. skip: Arc, + /// Shared cross-run filesystem-walk cache, used by the `glob` function. + walker: Arc, } impl Provider { - pub fn new(skip: Arc) -> Self { - Self { skip } + pub fn new(skip: Arc, walker: Arc) -> Self { + Self { skip, walker } } } @@ -178,6 +178,7 @@ impl EProvider for Provider { name: "glob".to_string(), func: Arc::new(GlobFn { skip: self.skip.clone(), + walker: self.walker.clone(), }), }, ProviderFunctionDef { @@ -206,6 +207,7 @@ impl EProvider for Provider { /// the engine's skip dirs/globs. Result is sorted. 
struct GlobFn { skip: Arc, + walker: Arc, } #[async_trait] @@ -227,9 +229,7 @@ impl ProviderFn for GlobFn { // No user excludes, so `request_id` is irrelevant (the built-in exclude // path is taken). Reuses the driver's compiled glob + walk verbatim. let compiled = compile_glob(&self.skip, "heph.fs.glob", &resolved, &[])?; - // BUILD-time expansion: only the artifacts are needed here (the - // cross-run cache entry is produced for the `run`-path walk). - let (artifacts, _) = walk_glob(ctx.root, &compiled)?; + let artifacts = walk_glob(&self.walker, ctx.root, &compiled)?; let pkg_prefix = (!ctx.pkg.is_empty()).then(|| std::path::Path::new(ctx.pkg)); @@ -576,249 +576,114 @@ fn compile_glob( }) } -/// Walks `root` for files matching `compiled`, returning their artifacts plus a -/// [`CachedGlobEntry`] for the cross-run cache (or `None` when the walk root is -/// missing or an mtime is unreadable, so the result is not persistable). +/// Walks `root` for files matching `compiled`, returning their artifacts. /// -/// Starts at the pattern's literal prefix so a rooted pattern (`a/b/**/*`) scans -/// only `/a/b`, not the whole tree. Matching uses the cached glob/exclude -/// NFAs directly — no per-run regex compilation. -fn walk_glob(root: &std::path::Path, compiled: &CompiledGlob) -> anyhow::Result { +/// Recursion runs through the shared [`CachedWalker`]: every `readdir` and every +/// file content hash is served from the cross-run fswalk cache when the tree is +/// unchanged. Filtering (glob match, excludes, skip-dir pruning, codegen xattr) +/// is applied here — the walker itself is consumer-agnostic. Starts at the +/// pattern's literal prefix so a rooted pattern (`a/b/**/*`) scans only +/// `/a/b`, not the whole tree. 
+fn walk_glob( + walker: &CachedWalker, + root: &std::path::Path, + compiled: &CompiledGlob, +) -> anyhow::Result> { let walk_root = if compiled.prefix.is_empty() { root.to_path_buf() } else { root.join(&compiled.prefix) }; - // A missing walk root is an empty match — but must not be cached: were it - // stored as "empty", a later-created tree (which doesn't change any recorded - // dir's mtime, since none were recorded) would keep serving the stale empty - // set. Return `None` so every run re-walks until the root exists. - if std::fs::metadata(&walk_root).is_err() { - return Ok((vec![], None)); - } - let mut artifacts = vec![]; - // Cross-run cache accumulation. `persistable` flips off if any mtime can't be - // read, in which case we never store a (possibly unverifiable) entry. - let mut sig = WalkSignature::default(); - let mut value = GlobValue::default(); - let mut persistable = true; - - let walker = walkdir::WalkDir::new(&walk_root) - .into_iter() - .filter_entry(|entry| { - // Never descend into the engine's skip dirs (heph home + literal - // `fs.skip` dirs) or skip-glob subtrees (e.g. `**/node_modules/**`). - if !entry.file_type().is_dir() { - return true; - } - let rel = entry.path().strip_prefix(root).unwrap_or(entry.path()); - !compiled.skip.prune_dir(entry.path(), rel) - }); - - for entry in walker { - let entry = match entry { - Ok(e) => e, - Err(e) => { - // A missing walk root (or a path that vanished mid-walk) is an - // empty match, not an error. - if e.io_error() - .is_some_and(|io| io.kind() == std::io::ErrorKind::NotFound) - { - continue; - } - return Err(anyhow::Error::from(e)).context("walking glob entries"); - } - }; - - if entry.file_type().is_dir() { - // Record every descended dir's mtime: it bumps on any entry - // add/remove/rename, so matching all of them on a later run proves - // the matched file *set* is unchanged without re-reading the dir. 
- match ( - entry - .path() - .strip_prefix(root) - .ok() - .and_then(|r| r.to_str()), - entry.metadata().ok(), - ) { - (Some(rel), Some(meta)) => { - if !sig.push_dir(rel, &meta) { - persistable = false; + // The literal prefix can name a file directly (a fully-literal pattern), in + // which case the walk root is that single file, not a directory to descend. + match std::fs::metadata(&walk_root) { + Ok(m) if m.is_dir() => { + let mut stack = vec![walk_root]; + while let Some(dir) = stack.pop() { + let listing = walker.read_dir(&dir)?; + for entry in &listing.entries { + let abs = dir.join(&entry.name); + if entry.kind == crate::htwalk::EntryKind::Dir { + // Never descend into skip dirs or skip-glob subtrees. + let rel = abs.strip_prefix(root).unwrap_or(&abs); + if !compiled.skip.prune_dir(&abs, rel) { + stack.push(abs); + } + } else if matches!( + entry.kind, + crate::htwalk::EntryKind::File | crate::htwalk::EntryKind::Symlink + ) { + // A symlink-to-dir is rejected by `file_hash` (which follows + // and errors on a dir), matching the old walk. + emit_glob_file(walker, root, compiled, &abs, &mut artifacts)?; } } - _ => persistable = false, } - continue; - } - - let abs_path = entry.path(); - let rel = abs_path - .strip_prefix(root) - .with_context(|| "strip root prefix from glob entry")?; - - // Include on a pattern match, drop on an exclude match. - use wax::Program as _; - if !compiled.glob.is_match(rel) || compiled.not.is_match(rel) { - continue; } - - // Skip net-new codegen outputs stamped back into the tree — sourcing - // them here would double-source the generated content. - if has_codegen_xattr(abs_path) { - continue; - } - - let rel_str = rel.to_str().ok_or_else(|| { - anyhow::anyhow!("glob entry path is not valid UTF-8: {}", rel.display()) - })?; - - // Resolve through symlinks. The walker has follow_links off (so it never - // descends INTO a symlinked dir), but an individual symlink entry that - // points at a regular file is still a valid source. 
Stat the target so - // the exec marker and the hashed content both come from what - // `file_hashout` actually opens — and so a symlink-to-dir (or a - // dangling/vanished link) is skipped instead of erroring on read. - let meta = match std::fs::metadata(abs_path) { - Ok(m) => m, - // A dangling/vanished symlink resolves to nothing — skip, don't fail. - // Other stat errors (e.g. permission) still surface with context. - Err(e) if e.kind() == std::io::ErrorKind::NotFound => continue, - Err(e) => { - return Err(e).with_context(|| format!("stat glob entry '{}'", abs_path.display())); - } - }; - if meta.is_dir() { - continue; - } - - let x = is_exec(&meta); - let hashout = file_hashout(abs_path, x) - .with_context(|| format!("hash glob entry '{}'", abs_path.display()))?; - - // Cross-run cache: record (size, mtime) in the signature so a later run - // can validate this file's content without re-reading it, and the - // (rel, x, hashout) needed to rebuild the artifact. A missing mtime makes - // the whole walk non-persistable. - if sig.push_file(rel_str, &meta) { - value.files.push(GlobFile { - rel: rel_str.to_string(), - x, - hashout: hashout.clone(), - }); - } else { - persistable = false; - } - - // Materialize the owned strings that borrow `abs_path`/`rel_str` *before* - // consuming `entry` for `source_path` below — the borrows end here. - let out_path = rel_str.to_string(); - let name = rel_str.replace('/', "_"); - - // `source_path` is `abs_path` owned. Reuse walkdir's already-allocated - // `PathBuf` instead of copying its bytes into a fresh String: on Unix - // `OsString -> String` reuses the buffer (validate-in-place, no realloc), - // so this drops one allocation per matched file off the glob hot path. 
- let source_path = entry - .into_path() - .into_os_string() - .into_string() - .map_err(|os| { - anyhow::anyhow!( - "glob entry path is not valid UTF-8: {}", - os.to_string_lossy() - ) - })?; - - artifacts.push(OutputArtifact { - group: "".to_string(), - name, - r#type: Type::Output, - content: Content::File(ContentFile { - source_path, - out_path, - x, - }), - hashout, - }); + Ok(_) => emit_glob_file(walker, root, compiled, &walk_root, &mut artifacts)?, + Err(_) => {} // missing walk root ⇒ empty match } - let entry = if persistable { - Some((sig, value)) - } else { - None - }; - Ok((artifacts, entry)) -} - -// ── Cross-run glob cache ───────────────────────────────────────────────────── -// -// fs-glob targets are `CacheConfig::off()`, so the engine re-walks the tree on -// every run (walkdir + per-entry stat + per-file open/read/hash). The generic -// `walk_cache` memoizes a walk's result per `(root, pattern, exclude)` across -// runs in the durable cache's KV store, validated by directory mtimes (the -// matched file *set*) and per-file `(size, mtime)` (file *content*). A full -// match reconstructs the artifacts with stat only — no readdir, opens, reads, or -// hashing. Disable with `HEPH_FS_GLOB_CACHE=0`. - -/// KV namespace for the fs glob walk cache. -const GLOB_CACHE_NS: &str = "pluginfs.glob"; - -/// Per-matched-file reconstruction data stored in the [`WalkCache`]. The -/// validating `(size, mtime)` live in the [`WalkSignature`]; this carries only -/// what's needed to rebuild an [`OutputArtifact`]. `source = root/rel`, -/// `out_path = rel`, `name = rel.replace('/', "_")` — mirrors `walk_glob`. 
-#[derive(Clone, borsh::BorshSerialize, borsh::BorshDeserialize)] -struct GlobFile { - rel: String, - x: bool, - hashout: String, + Ok(artifacts) } -#[derive(Default, Clone, borsh::BorshSerialize, borsh::BorshDeserialize)] -struct GlobValue { - files: Vec, -} +/// If `abs` matches the glob (and isn't excluded or a codegen output), hash it +/// via the walker and push its artifact onto `out`. +fn emit_glob_file( + walker: &CachedWalker, + root: &std::path::Path, + compiled: &CompiledGlob, + abs: &std::path::Path, + out: &mut Vec, +) -> anyhow::Result<()> { + use wax::Program as _; -/// `walk_glob`'s result: the artifacts plus an optional `(signature, value)` to -/// store in the [`WalkCache`] (`None` ⇒ the walk is not cacheable this run). -type GlobWalk = (Vec, Option<(WalkSignature, GlobValue)>); - -/// Rebuild the glob artifacts from a cache-validated [`GlobValue`]. The file -/// `(size, mtime)` were already checked by [`WalkCache::get`]; only the codegen -/// xattr (which bumps neither) is re-checked here, matching `walk_glob`. Returns -/// `None` (⇒ caller re-walks) if any file has since been codegen-stamped. -fn reconstruct_glob(root: &std::path::Path, value: &GlobValue) -> Option> { - let mut artifacts = Vec::with_capacity(value.files.len()); - for f in &value.files { - let abs = root.join(&f.rel); - if has_codegen_xattr(&abs) { - return None; - } - let source_path = abs.into_os_string().into_string().ok()?; - artifacts.push(OutputArtifact { - group: String::new(), - name: f.rel.replace('/', "_"), - r#type: Type::Output, - content: Content::File(ContentFile { - source_path, - out_path: f.rel.clone(), - x: f.x, - }), - hashout: f.hashout.clone(), - }); + let Ok(rel) = abs.strip_prefix(root) else { + return Ok(()); + }; + // Include on a pattern match, drop on an exclude match. 
+ if !compiled.glob.is_match(rel) || compiled.not.is_match(rel) { + return Ok(()); + } + // Skip net-new codegen outputs stamped back into the tree — sourcing them + // here would double-source the generated content. + if has_codegen_xattr(abs) { + return Ok(()); } - Some(artifacts) + let Some(rel_str) = rel.to_str() else { + anyhow::bail!("glob entry path is not valid UTF-8: {}", rel.display()); + }; + // Hash via the walker (follows symlinks). A symlink-to-dir or a + // dangling/vanished link errors here → skip. + let fh = match walker.file_hash(abs) { + Ok(fh) => fh, + Err(_) => return Ok(()), + }; + let Some(source_path) = abs.to_str().map(str::to_owned) else { + anyhow::bail!("glob entry path is not valid UTF-8: {}", abs.display()); + }; + out.push(OutputArtifact { + group: String::new(), + name: rel_str.replace('/', "_"), + r#type: Type::Output, + content: Content::File(ContentFile { + source_path, + out_path: rel_str.to_string(), + x: fh.exec, + }), + hashout: fh.hashout.clone(), + }); + Ok(()) } -/// Returns the glob walk artifacts for `(root, pattern, exclude)`. Within a -/// request the result is memoized (the tree is immutable mid-request); across -/// runs it is served from `store` (the single-file sidecar) when the tree is -/// unchanged. The first uncached call walks the tree. +/// Returns the glob walk artifacts for `(root, pattern, exclude)`, memoized +/// within `request_id` (the tree is immutable mid-request, so repeat calls reuse +/// the assembled list). Cross-run reuse of readdir + file hashes happens inside +/// the shared [`CachedWalker`]. fn cached_glob_walk( - glob_cache: &WalkCache, + walker: &CachedWalker, request_id: &str, root: &std::path::Path, pattern: &str, @@ -839,21 +704,7 @@ fn cached_glob_walk( return Ok(a.clone()); } - // Cross-run cache: on a signature-validated hit, reconstruct with stat only. - // On a miss, walk and record (write-through to the KV). 
- let artifacts = match glob_cache - .get(&key, root) - .and_then(|value| reconstruct_glob(root, &value)) - { - Some(arts) => Arc::new(arts), - None => { - let (arts, entry) = walk_glob(root, compiled)?; - if let Some((sig, value)) = entry { - glob_cache.insert(key.clone(), sig, value); - } - Arc::new(arts) - } - }; + let artifacts = Arc::new(walk_glob(walker, root, compiled)?); Ok(glob_result_cache() .write() @@ -867,72 +718,25 @@ fn cached_glob_walk( pub struct Driver { /// Engine-owned + built-in dirs pruned during glob walks. skip: Arc, - /// Cross-run glob walk cache (KV-backed; write-through, no flush needed). - glob_cache: Arc>, + /// Shared cross-run filesystem-walk cache (readdir + file hashes). + walker: Arc, } impl Default for Driver { fn default() -> Self { Self { skip: Arc::default(), - glob_cache: Arc::new(WalkCache::new(None, GLOB_CACHE_NS)), + walker: Arc::new(CachedWalker::disabled()), } } } impl Driver { - pub fn new(skip: Arc, cache: Option>) -> Self { - // `HEPH_FS_GLOB_CACHE=0` is a kill-switch: disable the cache entirely. - let cache = if std::env::var_os("HEPH_FS_GLOB_CACHE").is_some_and(|v| v == "0") { - None - } else { - cache - }; - Self { - skip, - glob_cache: Arc::new(WalkCache::new(cache, GLOB_CACHE_NS)), - } + pub fn new(skip: Arc, walker: Arc) -> Self { + Self { skip, walker } } } -#[cfg(unix)] -fn is_exec(meta: &std::fs::Metadata) -> bool { - use std::os::unix::fs::PermissionsExt; - meta.permissions().mode() & 0o111 != 0 -} - -#[cfg(not(unix))] -fn is_exec(_meta: &std::fs::Metadata) -> bool { - false -} - -/// Content identity for a sourced file: a hash of its bytes plus the executable -/// marker. Deliberately ignores size and mtime — only the content and the `x` -/// bit determine the artifact, so a file rewritten with identical bytes (new -/// mtime, same content) hashes the same and stays a cache hit. 
-fn file_hashout(path: &std::path::Path, x: bool) -> anyhow::Result { - use std::io::Read as _; - let file = std::fs::File::open(path) - .with_context(|| format!("open file for hashing '{}'", path.display()))?; - let mut reader = std::io::BufReader::new(file); - let mut h = Xxh3::new(); - // Stream in chunks so large inputs never load wholesale into memory. - let mut buf = [0u8; 64 * 1024]; - loop { - let n = reader - .read(&mut buf) - .with_context(|| format!("read file for hashing '{}'", path.display()))?; - if n == 0 { - break; - } - if let Some(chunk) = buf.get(..n) { - h.update(chunk); - } - } - h.update(&[x as u8]); - Ok(format!("{:x}", h.digest())) -} - #[async_trait] impl crate::engine::driver::Driver for Driver { fn config(&self, _req: ConfigRequest) -> anyhow::Result { @@ -1072,10 +876,10 @@ impl crate::engine::driver::Driver for Driver { }); } - let meta = std::fs::metadata(&abs) - .with_context(|| format!("stat file '{}'", abs.display()))?; - let x = is_exec(&meta); - let hashout = file_hashout(&abs, x) + // Content hash + exec bit via the shared walker (cross-run cached). + let fh = self + .walker + .file_hash(&abs) .with_context(|| format!("hash file '{}'", abs.display()))?; let source_path = abs @@ -1099,9 +903,9 @@ impl crate::engine::driver::Driver for Driver { content: Content::File(ContentFile { source_path, out_path: path.clone(), - x, + x: fh.exec, }), - hashout, + hashout: fh.hashout.clone(), }], ..Default::default() }) @@ -1114,9 +918,9 @@ impl crate::engine::driver::Driver for Driver { } => { // Within a request the tree is immutable, so the walk result for // this `(root, pattern, excludes)` is memoized — repeat calls - // skip walkdir + per-entry stat entirely. + // skip even the walker round-trips. 
let artifacts = cached_glob_walk( - &self.glob_cache, + &self.walker, req.request_id, root, pattern, @@ -1150,6 +954,7 @@ mod tests { use crate::engine::provider::Provider as EProvider; use crate::hasync::StdCancellationToken; use crate::htaddr::parse_addr; + use crate::htwalk::file_hashout; use std::fs; use tempfile::tempdir; @@ -1202,7 +1007,14 @@ mod tests { positional: vec![Value::String(pattern.to_string())], named: Default::default(), }; - let v = futures::executor::block_on(GlobFn { skip }.call(&ctx, args)).unwrap(); + let v = futures::executor::block_on( + GlobFn { + skip, + walker: Arc::new(CachedWalker::disabled()), + } + .call(&ctx, args), + ) + .unwrap(); match v { Value::List(l) => l .into_iter() @@ -1628,12 +1440,11 @@ mod tests { ); } - /// `walk_glob` returns a [`WalkSignature`] that validates the unchanged tree - /// (and invalidates on a content change) plus a value that reconstructs the - /// identical artifact set. (Signature-validation edge cases live in - /// `engine::walk_cache`.) + /// `walk_glob` matches the right files (recursively), hashes them, and skips a + /// codegen-stamped file. Cross-run cache mechanics are tested in + /// `htwalk::cached_walker`. 
#[test] - fn test_walk_glob_signature_and_reconstruct() { + fn test_walk_glob_matches_and_skips_codegen() { let tmp = tempdir().unwrap(); let root = tmp.path(); fs::create_dir(root.join("sub")).unwrap(); @@ -1643,150 +1454,61 @@ mod tests { let skip = Arc::new(Ignore::new(&[], &[]).unwrap()); let compiled = compile_glob(&skip, "t", "**/*.rs", &[]).unwrap(); + let walker = CachedWalker::disabled(); - let key = |arts: &[OutputArtifact]| { - let mut v: Vec<_> = arts - .iter() - .map(|a| (a.name.clone(), a.hashout.clone())) - .collect(); - v.sort(); - v - }; - - let (arts, entry) = walk_glob(root, &compiled).unwrap(); - let (sig, value) = entry.expect("walk over a present tree is persistable"); - assert_eq!(arts.len(), 2, "matches a.rs + sub/b.rs, not c.txt"); - - // The value rebuilds the identical artifact set. - let rebuilt = reconstruct_glob(root, &value).expect("reconstructs"); - assert_eq!(key(&arts), key(&rebuilt)); - - // The signature validates the unchanged tree, and a content change (size) - // invalidates it. - assert!(sig.is_valid(root), "unchanged tree validates"); - fs::write(root.join("a.rs"), b"a much longer body").unwrap(); - assert!(!sig.is_valid(root), "changed file size invalidates"); - } - - /// A file that gains the codegen provenance xattr (which bumps neither size - /// nor mtime, so the signature still validates) must drop out of a - /// reconstructed glob, matching `walk_glob`. - #[cfg(unix)] - #[test] - fn test_reconstruct_glob_drops_codegen_xattr() { - let tmp = tempdir().unwrap(); - let root = tmp.path(); - fs::write(root.join("gen.rs"), b"x").unwrap(); - let skip = Arc::new(Ignore::new(&[], &[]).unwrap()); - let compiled = compile_glob(&skip, "t", "*.rs", &[]).unwrap(); - - let (arts, entry) = walk_glob(root, &compiled).unwrap(); - assert_eq!(arts.len(), 1); - let (_, value) = entry.unwrap(); - - // Stamp the codegen xattr without touching content/mtime. 
- if xattr::set(root.join("gen.rs"), CODEGEN_XATTR, b"//gen:it").is_err() { - return; // filesystem without xattr support — nothing to assert - } - assert!( - reconstruct_glob(root, &value).is_none(), - "a newly codegen-stamped file must invalidate the cache" - ); - } - - /// Minimal `LocalCache` exposing only the KV methods over an in-memory map, - /// so the cross-run test exercises the real `WalkCache` path without SQLite. - #[derive(Default)] - struct KvMock { - kv: std::sync::Mutex>>, - } - impl LocalCache for KvMock { - fn reader( - &self, - _a: &Addr, - _h: &str, - _n: &str, - ) -> anyhow::Result { - unimplemented!() - } - fn writer(&self, _a: &Addr, _h: &str, _n: &str) -> anyhow::Result> { - unimplemented!() - } - fn exists(&self, _a: &Addr, _h: &str, _n: &str) -> anyhow::Result { - Ok(false) - } - fn delete(&self, _a: &Addr, _h: &str, _n: &str) -> anyhow::Result<()> { - Ok(()) - } - fn kv_get(&self, ns: &str, k: &str) -> anyhow::Result>> { - Ok(self - .kv - .lock() - .unwrap() - .get(&(ns.to_owned(), k.to_owned())) - .cloned()) - } - fn kv_list(&self, ns: &str) -> anyhow::Result)>> { - Ok(self - .kv - .lock() - .unwrap() - .iter() - .filter(|((n, _), _)| n == ns) - .map(|((_, k), v)| (k.clone(), v.clone())) - .collect()) - } - fn kv_put(&self, ns: &str, k: &str, v: &[u8]) -> anyhow::Result<()> { - self.kv - .lock() - .unwrap() - .insert((ns.to_owned(), k.to_owned()), v.to_vec()); - Ok(()) + let arts = walk_glob(&walker, root, &compiled).unwrap(); + let mut names: Vec<_> = arts.iter().map(|a| a.name.clone()).collect(); + names.sort(); + assert_eq!(names, vec!["a.rs".to_string(), "sub_b.rs".to_string()]); + assert!(arts.iter().all(|a| !a.hashout.is_empty())); + + // A codegen-stamped file drops out of a re-walk. 
+ #[cfg(unix)] + if xattr::set(root.join("a.rs"), CODEGEN_XATTR, b"//gen:it").is_ok() { + let arts = walk_glob(&walker, root, &compiled).unwrap(); + assert_eq!(arts.len(), 1, "codegen-stamped a.rs is excluded"); } } - /// End-to-end cross-run: a first driver populates the KV-backed walk cache; a - /// *fresh* driver sharing the same backend (simulating a new process) reuses - /// it for the unchanged tree, then re-walks once a file is added. + /// Cross-run via the shared walker: a first driver populates the fswalk db; a + /// fresh driver (new process) serves the unchanged tree from it, and re-walks + /// after a file is added. #[tokio::test] - async fn test_glob_cache_cross_run() { + async fn test_glob_cross_run_via_walker() { let tmp = tempdir().unwrap(); let root = tmp.path(); + // Keep the fswalk db out of the globbed tree so its writes don't bump the + // walked dirs' mtimes (in production it lives under the pruned `.heph3`). + let dbdir = tempdir().unwrap(); + let db = dbdir.path().join("fswalk.db"); fs::write(root.join("a.rs"), b"aaa").unwrap(); fs::write(root.join("b.rs"), b"bbb").unwrap(); let skip = Arc::new(Ignore::new(&[], &[]).unwrap()); - let backend: Arc = Arc::new(KvMock::default()); let config = std::collections::HashMap::from([("p".to_string(), Value::String("*.rs".to_string()))]); let hashin = String::new(); - let (id1, id2, id3) = ( - "req-1".to_string(), - "req-2".to_string(), - "req-3".to_string(), - ); + let (id1, id2, id3) = ("r1".to_string(), "r2".to_string(), "r3".to_string()); - // First driver populates the shared KV. 
let parse_res = { - let driver = Driver::new(skip.clone(), Some(backend.clone())); - let parse_res = driver + let driver = Driver::new(skip.clone(), Arc::new(CachedWalker::open(&db))); + let pr = driver .parse(make_parse_req(config), &ctoken()) .await .unwrap(); let first = driver .run( - make_run_req(&parse_res.target_def, &id1, root.to_path_buf(), &hashin), + make_run_req(&pr.target_def, &id1, root.to_path_buf(), &hashin), &ctoken(), ) .await .unwrap(); assert_eq!(first.artifacts.len(), 2); - parse_res + pr }; - // Fresh driver sharing the backend loads the entry from the KV and - // reconstructs the unchanged tree — same artifacts. - let driver2 = Driver::new(skip.clone(), Some(backend.clone())); + // A fresh walker reads the populated db (unchanged tree). + let driver2 = Driver::new(skip.clone(), Arc::new(CachedWalker::open(&db))); let second = driver2 .run( make_run_req(&parse_res.target_def, &id2, root.to_path_buf(), &hashin), @@ -1796,7 +1518,7 @@ mod tests { .unwrap(); assert_eq!(second.artifacts.len(), 2); - // Adding a matching file bumps the root dir mtime → re-walk includes it. + // Add a file → dir mtime bumps → re-walk picks it up. fs::write(root.join("c.rs"), b"ccc").unwrap(); std::fs::File::open(root) .unwrap() @@ -1913,7 +1635,7 @@ mod tests { // The engine hands the fs plugin its skip dirs (the heph home); the walk // must prune that subtree. let skip = Arc::new(Ignore::new(&[home], &[]).unwrap()); - let driver = Driver::new(skip, None); + let driver = Driver::new(skip, Arc::new(CachedWalker::disabled())); let config = std::collections::HashMap::from([("p".to_string(), Value::String("**/*".to_string()))]); @@ -1950,7 +1672,7 @@ mod tests { // A `fs.skip` dir from the config file (resolved to an absolute path) is // pruned just like the engine home. 
let skip = Arc::new(Ignore::new(&[tmp.path().join("vendor")], &[]).unwrap()); - let driver = Driver::new(skip, None); + let driver = Driver::new(skip, Arc::new(CachedWalker::disabled())); let config = std::collections::HashMap::from([("p".to_string(), Value::String("**/*".to_string()))]); @@ -1987,7 +1709,7 @@ mod tests { // A `fs.skip` glob (`**/node_modules/**`) excludes the whole subtree at // any depth — and prunes the dir so the walk never descends into it. let skip = Arc::new(Ignore::new(&[], &["**/node_modules/**".to_string()]).unwrap()); - let driver = Driver::new(skip, None); + let driver = Driver::new(skip, Arc::new(CachedWalker::disabled())); let config = std::collections::HashMap::from([("p".to_string(), Value::String("**/*".to_string()))]); From 85c6112209b711fd007541f53967a1a8712edb59 Mon Sep 17 00:00:00 2001 From: Raphael Vigee Date: Wed, 10 Jun 2026 12:17:49 +0200 Subject: [PATCH 4/6] feat(htwalk): HEPH_DEBUG_CACHED_WALKER=0 bypasses the walker cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a transparent escape hatch: set HEPH_DEBUG_CACHED_WALKER=0 and CachedWalker::open returns a fully bypassing walker — no in-process front, no durable store, every read_dir/file_hash goes straight to disk. The consumer-facing API is unchanged, so plugins are unaware. Distinct from disabled(), which still keeps the in-process map; the new bypassing() mode does no caching at all, for isolating cache bugs from correctness bugs. --- src/htwalk/cached_walker.rs | 72 ++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/src/htwalk/cached_walker.rs b/src/htwalk/cached_walker.rs index 4223328b..da9c34c6 100644 --- a/src/htwalk/cached_walker.rs +++ b/src/htwalk/cached_walker.rs @@ -43,6 +43,16 @@ use xxhash_rust::xxh3::Xxh3; /// dropped by [`CachedWalker::prune`]. 
pub const DEFAULT_TTL: std::time::Duration = std::time::Duration::from_secs(14 * 24 * 60 * 60); +/// Escape hatch: set `HEPH_DEBUG_CACHED_WALKER=0` to bypass caching entirely and +/// fall back to reading every directory listing and file hash straight from disk +/// (no in-process front, no durable store). Any other value (or unset) keeps the +/// cache on. Cached in a `OnceLock` because `std::env::var` takes a global libc +/// mutex; the value can't change within a run. +fn cache_bypassed() -> bool { + static FLAG: std::sync::OnceLock = std::sync::OnceLock::new(); + *FLAG.get_or_init(|| matches!(std::env::var("HEPH_DEBUG_CACHED_WALKER").as_deref(), Ok("0"))) +} + /// The kind of a directory entry, from the platform `d_type` (no extra stat). #[derive(Clone, Copy, Debug, PartialEq, Eq, BorshSerialize, BorshDeserialize)] pub enum EntryKind { @@ -141,6 +151,9 @@ fn entry_kind(ft: std::fs::FileType) -> EntryKind { /// Shared cached filesystem walker. Cheap to clone (`Arc` the whole thing). pub struct CachedWalker { + /// When set, all caching is skipped: every call reads straight from disk. + /// Transparent to consumers — the API is identical, only slower. + bypass: bool, store: Option, /// In-process front for directory listings, validated against live mtime. dirs: Mutex)>>, @@ -157,6 +170,10 @@ impl CachedWalker { /// db-open failure the walker still works — it just degrades to always /// re-reading from disk (no cross-run cache). pub fn open(db_path: &Path) -> Self { + if cache_bypassed() { + tracing::warn!("cached walker bypassed (HEPH_DEBUG_CACHED_WALKER=0): reading from disk"); + return Self::bypassing(); + } let store = match FsWalkStore::open(db_path) { Ok(s) => Some(s), Err(e) => { @@ -165,6 +182,7 @@ impl CachedWalker { } }; Self { + bypass: false, store, dirs: Mutex::new(FxHashMap::default()), files: Mutex::new(FxHashMap::default()), @@ -173,9 +191,24 @@ impl CachedWalker { } } - /// A disabled walker (no caching) for tests / contexts without a db. 
+ /// A disabled walker (no durable store, in-process front only) for tests / + /// contexts without a db. pub fn disabled() -> Self { Self { + bypass: false, + store: None, + dirs: Mutex::new(FxHashMap::default()), + files: Mutex::new(FxHashMap::default()), + touched_dirs: Mutex::new(Vec::new()), + touched_files: Mutex::new(Vec::new()), + } + } + + /// A fully bypassing walker: no in-process front, no durable store — every + /// call reads straight from disk. Used when `HEPH_DEBUG_CACHED_WALKER=0`. + pub fn bypassing() -> Self { + Self { + bypass: true, store: None, dirs: Mutex::new(FxHashMap::default()), files: Mutex::new(FxHashMap::default()), @@ -187,6 +220,9 @@ impl CachedWalker { /// The cached listing of directory `dir` (absolute). A missing directory /// lists empty. The caller filters entries and decides whether to recurse. pub fn read_dir(&self, dir: &Path) -> Result> { + if self.bypass { + return Ok(Arc::new(read_dir_uncached(dir)?)); + } let meta = match std::fs::metadata(dir) { Ok(m) => m, Err(e) if e.kind() == std::io::ErrorKind::NotFound => { @@ -247,6 +283,15 @@ impl CachedWalker { let live_mtime = mtime_ns(&meta); let exec = is_exec(&meta); + if self.bypass { + return Ok(Arc::new(FileHash { + size, + mtime_ns: live_mtime.unwrap_or(-1), + exec, + hashout: file_hashout(file, exec)?, + })); + } + // In-process front. if let Some(mt) = live_mtime && let Some(found) = self.files.lock().get(file).cloned() @@ -632,4 +677,29 @@ mod tests { let l = w.read_dir(&tmp.path().join("d")).unwrap(); assert_eq!(l.entries.len(), 1); } + + #[test] + fn bypassing_walker_reads_disk_without_caching() { + let tmp = tempdir().unwrap(); + let root = tmp.path(); + std::fs::create_dir(root.join("d")).unwrap(); + let f = root.join("d").join("x"); + std::fs::write(&f, b"hello").unwrap(); + + let w = CachedWalker::bypassing(); + // Reads and hashes correctly, matching the uncached primitives. 
+ let l = w.read_dir(&root.join("d")).unwrap(); + assert_eq!(l.entries.len(), 1); + let h = w.file_hash(&f).unwrap(); + assert_eq!(h.hashout, file_hashout(&f, h.exec).unwrap()); + + // Bypass keeps no in-process front: a content change is seen immediately, + // with no stale cached value (and the dir's mtime is never consulted). + std::fs::write(&f, b"a different, longer body").unwrap(); + let h2 = w.file_hash(&f).unwrap(); + assert_ne!(h.hashout, h2.hashout); + + // No durable store ⇒ nothing to prune. + assert_eq!(w.prune(std::time::Duration::from_secs(0), false).unwrap(), 0); + } } From 2901657473b35d63bae4c926443e33ad66805afc Mon Sep 17 00:00:00 2001 From: Raphael Vigee Date: Wed, 10 Jun 2026 12:21:31 +0200 Subject: [PATCH 5/6] perf(htwalk): synchronous = OFF on the fswalk cache db MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fswalk db is a pure optimization cache — every row is rebuildable from disk — so crash durability buys us nothing. Drop synchronous from NORMAL to OFF to skip the remaining checkpoint fsyncs. Trade-off: an OS crash or power loss can now corrupt the db (an app crash is still safe — the OS flushes the pages). That's acceptable: a corrupt cache just re-reads from disk and rebuilds. WAL and busy_timeout are kept — they govern concurrent multi-process access, not durability, so cross-process behavior is unchanged. --- src/htwalk/cached_walker.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/htwalk/cached_walker.rs b/src/htwalk/cached_walker.rs index da9c34c6..1ac7e794 100644 --- a/src/htwalk/cached_walker.rs +++ b/src/htwalk/cached_walker.rs @@ -414,10 +414,21 @@ impl FsWalkStore { let write = Connection::open(db_path) .with_context(|| format!("opening fswalk db at {db_path:?}"))?; write + // This db is a pure optimization cache: every row is reconstructable + // from disk, so corruption is tolerable (a bad db just re-reads and + // rebuilds). 
We trade crash durability for speed. + // - WAL stays: required so multiple processes read concurrently + // while one writes. busy_timeout handles cross-process write + // contention. Neither is a durability knob, so both are kept. + // - synchronous = OFF (vs NORMAL): drops the remaining checkpoint + // fsyncs. An app crash is still safe (the OS flushes the pages); + // only an OS crash / power loss can corrupt — which we accept. + // synchronous affects only crash durability, never concurrent-access + // correctness, so multi-process behavior is unchanged. .execute_batch( "PRAGMA journal_mode = WAL; PRAGMA busy_timeout = 10000; - PRAGMA synchronous = NORMAL; + PRAGMA synchronous = OFF; PRAGMA temp_store = MEMORY; PRAGMA mmap_size = 268435456; CREATE TABLE IF NOT EXISTS dirs ( From 5a6dcf9300091c89e1d94c14fcbd990f4b743de0 Mon Sep 17 00:00:00 2001 From: Raphael Vigee Date: Wed, 10 Jun 2026 12:30:40 +0200 Subject: [PATCH 6/6] style: cargo fmt Co-Authored-By: Claude Opus 4.8 (1M context) --- src/htwalk/cached_walker.rs | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/htwalk/cached_walker.rs b/src/htwalk/cached_walker.rs index 1ac7e794..5aa44918 100644 --- a/src/htwalk/cached_walker.rs +++ b/src/htwalk/cached_walker.rs @@ -50,7 +50,12 @@ pub const DEFAULT_TTL: std::time::Duration = std::time::Duration::from_secs(14 * /// mutex; the value can't change within a run. fn cache_bypassed() -> bool { static FLAG: std::sync::OnceLock<bool> = std::sync::OnceLock::new(); - *FLAG.get_or_init(|| matches!(std::env::var("HEPH_DEBUG_CACHED_WALKER").as_deref(), Ok("0"))) + *FLAG.get_or_init(|| { + matches!( + std::env::var("HEPH_DEBUG_CACHED_WALKER").as_deref(), + Ok("0") + ) + }) } /// The kind of a directory entry, from the platform `d_type` (no extra stat). @@ -171,7 +176,9 @@ impl CachedWalker { /// re-reading from disk (no cross-run cache).
pub fn open(db_path: &Path) -> Self { if cache_bypassed() { - tracing::warn!("cached walker bypassed (HEPH_DEBUG_CACHED_WALKER=0): reading from disk"); + tracing::warn!( + "cached walker bypassed (HEPH_DEBUG_CACHED_WALKER=0): reading from disk" + ); return Self::bypassing(); } let store = match FsWalkStore::open(db_path) { @@ -711,6 +718,9 @@ mod tests { assert_ne!(h.hashout, h2.hashout); // No durable store ⇒ nothing to prune. - assert_eq!(w.prune(std::time::Duration::from_secs(0), false).unwrap(), 0); + assert_eq!( + w.prune(std::time::Duration::from_secs(0), false).unwrap(), + 0 + ); } }