diff --git a/docs/notes/2.33.x.md b/docs/notes/2.33.x.md index af3a975cede..688a4850000 100644 --- a/docs/notes/2.33.x.md +++ b/docs/notes/2.33.x.md @@ -87,6 +87,8 @@ Fixes JavaScript workspace builds where dependencies are installed under members Third-party module analysis is now deduplicated across `go.mod` files. Previously, a module required by `N` `go.mod` files was downloaded and analyzed `N` times, which caused significant memory and time overhead in monorepos with many overlapping `go.mod` files. On a 3-`go.mod` reproducer, `pants list ::` peak memory dropped from 91 GB to 32 GB (-65%). This is a no-op for repos with a single `go.mod`. See [#20274](https://github.com/pantsbuild/pants/issues/20274). +Third-party Go module downloads now use a persistent named cache (`go_mod_cache`) for `GOMODCACHE`. Previously every sandbox got a fresh module cache, so any `go.mod`/`go.sum` change caused all modules to be re-downloaded from the network. With the named cache, subsequent builds reuse locally cached module zips even when the engine's content-addressed store is cold. Captured digests remain the source of truth: modules are copied out of the shared cache into the sandbox before capture, so results are byte-identical regardless of whether the cache is warm or cold. Remote-execution users: the named cache is only effective when `--remote-execution-append-only-caches-base-path` is configured; without it the process falls back silently to a per-sandbox download (today's behavior). Note: this change invalidates all previously cached Go process results (one-time invalidation). See [#13390](https://github.com/pantsbuild/pants/issues/13390). + ### Plugin API changes `FrozenOrderedSet` is now backed by a native Rust implementation and is built in `__new__` rather than `__init__`. Subclasses that customized the set's contents by overriding `__init__` (for example, to transform the input iterable before constructing) must move that logic to `__new__`, since the set is already built and immutable by the time `__init__` runs. diff --git a/src/python/pants/backend/go/util_rules/sdk.py b/src/python/pants/backend/go/util_rules/sdk.py index 392553130e0..7758e085319 100644 --- a/src/python/pants/backend/go/util_rules/sdk.py +++ b/src/python/pants/backend/go/util_rules/sdk.py @@ -38,6 +38,8 @@ class GoSdkProcess: output_directories: tuple[str, ...] replace_sandbox_root_in_args: bool + use_module_cache: bool + def __init__( self, command: Iterable[str], @@ -50,6 +52,7 @@ def __init__( output_directories: Iterable[str] = (), allow_downloads: bool = False, replace_sandbox_root_in_args: bool = False, + use_module_cache: bool = False, ) -> None: object.__setattr__(self, "command", tuple(command)) object.__setattr__(self, "description", description) @@ -67,6 +70,7 @@ def __init__( object.__setattr__(self, "output_files", tuple(output_files)) object.__setattr__(self, "output_directories", tuple(output_directories)) object.__setattr__(self, "replace_sandbox_root_in_args", replace_sandbox_root_in_args) + object.__setattr__(self, "use_module_cache", use_module_cache) @dataclass(frozen=True) @@ -76,6 +80,7 @@ class GoSdkRunSetup: CHDIR_ENV = "__PANTS_CHDIR_TO" SANDBOX_ROOT_ENV = "__PANTS_REPLACE_SANDBOX_ROOT" + FETCH_MODULE_ENV = "__PANTS_GO_FETCH_MODULE" @rule @@ -92,6 +97,43 @@ async def go_sdk_invoke_setup(goroot: GoRoot) -> GoSdkRunSetup: export GOPATH="${{sandbox_root}}/gopath" export GOCACHE="${{sandbox_root}}/cache" /bin/mkdir -p "$GOPATH" "$GOCACHE" + if [ -n "$__PANTS_GO_MODCACHE" ]; then + export GOMODCACHE="${{sandbox_root}}/__gomodcache" + export GOFLAGS="${{GOFLAGS:+${{GOFLAGS}} }}-modcacherw" + /bin/mkdir -p "$GOMODCACHE" + fi + if [ -n "$__PANTS_GO_FETCH_MODULE" ]; then + module_version="$__PANTS_GO_FETCH_MODULE" + "{goroot.path}/bin/go" mod download -json "$module_version" > __module_metadata.json + download_status=$? + /bin/cat __module_metadata.json + if [ "$download_status" -ne 0 ]; then + exit "$download_status" + fi + # Parse the Dir/GoMod paths from the metadata with shell string operations only: + # the sandbox PATH cannot be assumed to provide grep/sed. + dir="" + gomod="" + while IFS= read -r line; do + case "$line" in + *'"Dir": "'*) if [ -z "$dir" ]; then dir=${{line#*'"Dir": "'}}; dir=${{dir%'"'*}}; fi ;; + *'"GoMod": "'*) if [ -z "$gomod" ]; then gomod=${{line#*'"GoMod": "'}}; gomod=${{gomod%'"'*}}; fi ;; + esac + done < __module_metadata.json + marker="__gomodcache/" + dir_rel="${{dir#*${{marker}}}}" + gomod_rel="${{gomod#*${{marker}}}}" + if [ -z "$dir" ] || [ -z "$gomod" ] || [ "$dir_rel" = "$dir" ] || [ "$gomod_rel" = "$gomod" ]; then + echo "Failed to locate module $module_version under GOMODCACHE after download." 1>&2 + exit 1 + fi + dest_dir="$GOPATH/pkg/mod/$dir_rel" + dest_gomod="$GOPATH/pkg/mod/$gomod_rel" + /bin/mkdir -p "$dest_dir" "${{dest_gomod%/*}}" || exit 1 + /bin/cp -Rp "$dir/." "$dest_dir/" || exit 1 + /bin/cp -p "$gomod" "$dest_gomod" || exit 1 + exit 0 + fi if [ -n "${GoSdkRunSetup.CHDIR_ENV}" ]; then cd "${GoSdkRunSetup.CHDIR_ENV}" fi @@ -158,6 +200,11 @@ async def setup_go_sdk_process( if request.replace_sandbox_root_in_args: env[GoSdkRunSetup.SANDBOX_ROOT_ENV] = "1" + append_only_caches: dict[str, str] = {} + if request.use_module_cache: + env["__PANTS_GO_MODCACHE"] = "1" + append_only_caches["go_mod_cache"] = "__gomodcache" + # Disable the "coverage redesign" experiment on Go v1.20+ for now since Pants does not yet support it. if goroot.is_compatible_version("1.20") and not goroot.is_compatible_version("1.25"): exp_str = env.get("GOEXPERIMENT", "") @@ -175,6 +222,7 @@ async def setup_go_sdk_process( description=request.description, output_files=request.output_files, output_directories=request.output_directories, + append_only_caches=append_only_caches, level=LogLevel.DEBUG, ) diff --git a/src/python/pants/backend/go/util_rules/third_party_pkg.py b/src/python/pants/backend/go/util_rules/third_party_pkg.py index fd95ece6773..86de168d47a 100644 --- a/src/python/pants/backend/go/util_rules/third_party_pkg.py +++ b/src/python/pants/backend/go/util_rules/third_party_pkg.py @@ -114,11 +114,12 @@ def debug_hint(self) -> str: @dataclass(frozen=True) class AllThirdPartyPackages: - """All the packages downloaded from a go.mod, along with a digest of the downloaded files. + """All the packages downloaded from a go.mod. - The digest has files in the format `gopath/pkg/mod`, which is what `GoSdkProcess` sets `GOPATH` - to. This means that you can include the digest in a process and Go will properly consume it as - the `GOPATH`. + Each ``ThirdPartyPkgAnalysis`` carries its own per-package digest with files + in ``gopath/pkg/mod//...`` layout (the captured subset from the + download sandbox). Consumers merge whichever package digests they need into + their own input digest. """ digest: Digest @@ -151,7 +152,6 @@ class ModuleDescriptor: @dataclass(frozen=True) class ModuleDescriptors: modules: FrozenOrderedSet[ModuleDescriptor] - go_mods_digest: Digest @dataclass(frozen=True) @@ -232,17 +232,17 @@ async def analyze_module_dependencies(request: ModuleDescriptorsRequest) -> Modu GoSdkProcess( command=["list", "-mod=readonly", "-e", "-m", "-json", "all"], input_digest=request.digest, - output_directories=("gopath",), working_dir=request.path if request.path else None, # Allow downloads of the module metadata (i.e., go.mod files). allow_downloads=True, + use_module_cache=True, description="Analyze Go module dependencies.", ) ) ) if len(mod_list_result.stdout) == 0: - return ModuleDescriptors(FrozenOrderedSet(), EMPTY_DIGEST) + return ModuleDescriptors(FrozenOrderedSet()) descriptors: dict[tuple[str, str], ModuleDescriptor] = {} @@ -276,7 +276,7 @@ async def analyze_module_dependencies(request: ModuleDescriptorsRequest) -> Modu # Gazelle does this, mainly to store the sum on the go_repository rule. We could store it (or its # absence) to be able to download sums automatically. - return ModuleDescriptors(FrozenOrderedSet(descriptors.values()), mod_list_result.output_digest) + return ModuleDescriptors(FrozenOrderedSet(descriptors.values())) def strip_sandbox_prefix(path: str, marker: str) -> str: @@ -529,12 +529,20 @@ async def download_and_analyze_module( synthetic_files.append(FileContent("go.sum", synthetic_go_sum.encode())) synthetic_digest = await create_digest(CreateDigest(synthetic_files)) + # Fetch the module into the named cache and copy to gopath/pkg/mod in one process. + # The __PANTS_GO_FETCH_MODULE mode in __run_go.sh handles both steps atomically: + # it runs `go mod download -json`, emits the metadata on stdout, then copies the + # extracted tree from __gomodcache into gopath/pkg/mod. This avoids any race + # between the download (process 1) and copy (process 2) that would violate the + # self-healing invariant if the named cache were wiped between steps. download_result = await fallible_to_exec_result_or_raise( **implicitly( GoSdkProcess( - ("mod", "download", "-json", f"{request.name}@{request.version}"), + (), + env={"__PANTS_GO_FETCH_MODULE": f"{request.name}@{request.version}"}, input_digest=synthetic_digest, allow_downloads=True, + use_module_cache=True, output_directories=("gopath",), description=f"Download Go module {request.name}@{request.version}.", ) @@ -543,12 +551,20 @@ async def download_and_analyze_module( if len(download_result.stdout) == 0: raise AssertionError( - f"Expected output from `go mod download` for {request.name}@{request.version}." + f"Expected metadata output from module fetch for {request.name}@{request.version}." ) - - module_metadata = json.loads(download_result.stdout) - module_sources_relpath = strip_sandbox_prefix(module_metadata["Dir"], "gopath/") - go_mod_relpath = strip_sandbox_prefix(module_metadata["GoMod"], "gopath/") + module_metadata_json = json.loads(download_result.stdout) + + # Dir/GoMod point into __gomodcache (absolute sandbox paths). Extract the + # portion after __gomodcache/ to get the relative path within the cache tree, + # then re-prefix with gopath/pkg/mod/ to match where the copy step placed them. + # strip_sandbox_prefix returns path[marker_pos:] (includes the marker), so we + # strip the marker length to get just the relative tail. + _modcache_marker = "__gomodcache/" + _dir_from_marker = strip_sandbox_prefix(module_metadata_json["Dir"], _modcache_marker) + _gomod_from_marker = strip_sandbox_prefix(module_metadata_json["GoMod"], _modcache_marker) + module_sources_relpath = "gopath/pkg/mod/" + _dir_from_marker[len(_modcache_marker) :] + go_mod_relpath = "gopath/pkg/mod/" + _gomod_from_marker[len(_modcache_marker) :] module_sources_snapshot = await digest_to_snapshot( **implicitly( diff --git a/src/python/pants/backend/go/util_rules/third_party_pkg_test.py b/src/python/pants/backend/go/util_rules/third_party_pkg_test.py index 21321edc7b7..e5123764e14 100644 --- a/src/python/pants/backend/go/util_rules/third_party_pkg_test.py +++ b/src/python/pants/backend/go/util_rules/third_party_pkg_test.py @@ -3,7 +3,10 @@ from __future__ import annotations +import os import os.path +import pathlib +import tempfile from textwrap import dedent import pytest @@ -738,3 +741,189 @@ def test_parse_go_sum() -> None: crlf_parsed = _parse_go_sum(crlf_sum) assert ("mod", "v1.0.0") in crlf_parsed assert len(crlf_parsed[("mod", "v1.0.0")]) == 2 + + +def _make_rule_runner(named_caches_dir: str) -> RuleRunner: + """Create a RuleRunner with a fixed named-caches-dir (for named-cache tests).""" + runner = RuleRunner( + rules=[ + *sdk.rules(), + *third_party_pkg.rules(), + *first_party_pkg.rules(), + *load_go_binary.rules(), + *build_pkg.rules(), + *import_analysis.rules(), + *link.rules(), + *assembly.rules(), + *target_type_rules.rules(), + *go_mod.rules(), + QueryRule(AllThirdPartyPackages, [AllThirdPartyPackagesRequest]), + QueryRule(ThirdPartyPkgAnalysis, [ThirdPartyPkgAnalysisRequest]), + QueryRule(ModuleDescriptors, [ModuleDescriptorsRequest]), + ], + target_types=[GoModTarget], + bootstrap_args=[f"--named-caches-dir={named_caches_dir}"], + ) + runner.set_options(["--golang-cgo-enabled"], env_inherit={"PATH"}) + return runner + + +def test_named_cache_digest_equality() -> None: + """Digest produced by download_and_analyze_module is identical across cold and warm cache. + + This test is the absolute-path-leak detector: if any sandbox-absolute path leaks + into a captured digest, the two digests will differ because the sandbox roots differ. + """ + go_mod_single = dedent( + """\ + module example.com/cache-test + go 1.16 + require github.com/google/uuid v1.3.0 + """ + ) + with tempfile.TemporaryDirectory() as named_caches_dir: + # Run 1: cold cache -- populates go_mod_cache named cache + runner1 = _make_rule_runner(named_caches_dir) + digest1 = runner1.make_snapshot({"go.mod": go_mod_single, "go.sum": UUID_GO_SUM}).digest + result1 = runner1.request( + AllThirdPartyPackages, + [ + AllThirdPartyPackagesRequest( + Address("cache-test", target_name="mod"), + digest1, + "go.mod", + build_opts=GoBuildOptions(), + ) + ], + ) + uuid_pkg1 = result1.import_paths_to_pkg_info["github.com/google/uuid"] + + # Run 2: warm cache, fresh LMDB store (new RuleRunner = new in-memory store) + runner2 = _make_rule_runner(named_caches_dir) + digest2 = runner2.make_snapshot({"go.mod": go_mod_single, "go.sum": UUID_GO_SUM}).digest + result2 = runner2.request( + AllThirdPartyPackages, + [ + AllThirdPartyPackagesRequest( + Address("cache-test", target_name="mod"), + digest2, + "go.mod", + build_opts=GoBuildOptions(), + ) + ], + ) + uuid_pkg2 = result2.import_paths_to_pkg_info["github.com/google/uuid"] + + # Digests must be byte-identical: cold vs warm cache produces the same sources. + assert uuid_pkg1.digest == uuid_pkg2.digest, ( + "Digest differs between cold and warm named cache runs -- " + "likely an absolute sandbox path leaked into the captured digest." + ) + assert uuid_pkg1.dir_path == uuid_pkg2.dir_path + + # Run 3: independent cache dir -- must still produce equal digest + with tempfile.TemporaryDirectory() as named_caches_dir2: + runner3 = _make_rule_runner(named_caches_dir2) + digest3 = runner3.make_snapshot({"go.mod": go_mod_single, "go.sum": UUID_GO_SUM}).digest + result3 = runner3.request( + AllThirdPartyPackages, + [ + AllThirdPartyPackagesRequest( + Address("cache-test", target_name="mod"), + digest3, + "go.mod", + build_opts=GoBuildOptions(), + ) + ], + ) + uuid_pkg3 = result3.import_paths_to_pkg_info["github.com/google/uuid"] + assert uuid_pkg1.digest == uuid_pkg3.digest, ( + "Digest differs across different named-cache directories -- " + "likely an absolute sandbox path leaked into the captured digest." + ) + + +def test_named_cache_populated_and_writable() -> None: + """After a download, the go_mod_cache named cache is populated and owner-writable. + + The -modcacherw flag ensures Go writes the cache with writable permissions so + Pants can prune or clear the named cache without permission errors. + """ + go_mod_single = dedent( + """\ + module example.com/cache-writability-test + go 1.16 + require rsc.io/quote v1.5.2 + """ + ) + rsc_quote_go_sum = dedent( + """\ + golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c h1:qgOY6WgZOaTkIIMiVjBQcw93ERBE4m30iBm00nkL0i8= + golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= + rsc.io/quote v1.5.2 h1:w5fcysjrx7yqtD/aO+QwRjYZOKnaM9Uh2b40tElTs3Y= + rsc.io/quote v1.5.2/go.mod h1:LzX7hefJvL54yjefDEDHNONDjII0t9xZLPXsUe+TKr0= + rsc.io/sampler v1.3.0 h1:7uVkIFmeBqHfdjD+gZwtXXI+RODJ2Wc4O7MPEh/QiW4= + rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= + """ + ) + with tempfile.TemporaryDirectory() as named_caches_dir: + runner = _make_rule_runner(named_caches_dir) + digest = runner.make_snapshot({"go.mod": go_mod_single, "go.sum": rsc_quote_go_sum}).digest + runner.request( + AllThirdPartyPackages, + [ + AllThirdPartyPackagesRequest( + Address("cache-writability-test", target_name="mod"), + digest, + "go.mod", + build_opts=GoBuildOptions(), + ) + ], + ) + cache_dir = pathlib.Path(named_caches_dir) / "go_mod_cache" + assert cache_dir.exists(), ( + f"go_mod_cache named cache directory not created at {cache_dir}; " + "the named cache may not be mounted or GOMODCACHE not set correctly." + ) + # Find at least one file and verify it is owner-writable (proof of -modcacherw). + found_writable = False + for root, _dirs, files in os.walk(str(cache_dir)): + for fname in files: + fpath = os.path.join(root, fname) + if os.access(fpath, os.W_OK): + found_writable = True + break + if found_writable: + break + assert found_writable, ( + "No owner-writable files found in go_mod_cache; " + "-modcacherw flag may not be reaching the Go process." + ) + + +def test_analyze_module_dependencies_uses_named_cache() -> None: + """analyze_module_dependencies uses the named cache (no output_directories capture). + + Verifies that after running analyze_module_dependencies, the go_mod_cache is + populated (named cache is mounted and used), and the returned ModuleDescriptors + has the expected modules without a go_mods_digest field. + """ + go_mod_content = dedent( + """\ + module example.com/dep-analysis-test + go 1.16 + require github.com/google/uuid v1.3.0 + """ + ) + with tempfile.TemporaryDirectory() as named_caches_dir: + runner = _make_rule_runner(named_caches_dir) + digest = runner.make_snapshot({"go.mod": go_mod_content, "go.sum": UUID_GO_SUM}).digest + result = runner.request( + ModuleDescriptors, + [ModuleDescriptorsRequest(digest=digest, path="")], + ) + assert any(mod.name == "github.com/google/uuid" for mod in result.modules) + # go_mods_digest was removed; verify ModuleDescriptors has only 'modules'. + assert not hasattr(result, "go_mods_digest"), ( + "ModuleDescriptors.go_mods_digest should have been removed in this PR." + )