Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/notes/2.33.x.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ Fixes JavaScript workspace builds where dependencies are installed under members

Third-party module analysis is now deduplicated across `go.mod` files. Previously, a module required by `N` `go.mod` files was downloaded and analyzed `N` times, which caused significant memory and time overhead in monorepos with many overlapping `go.mod` files. On a 3-`go.mod` reproducer, `pants list ::` peak memory dropped from 91 GB to 32 GB (-65%). This is a no-op for repos with a single `go.mod`. See [#20274](https://github.com/pantsbuild/pants/issues/20274).

Third-party Go module downloads now use a persistent named cache (`go_mod_cache`) for `GOMODCACHE`. Previously every sandbox got a fresh module cache, so any `go.mod`/`go.sum` change caused all modules to be re-downloaded from the network. With the named cache, subsequent builds reuse locally cached module zips even when the engine's content-addressed store is cold. Captured digests remain the source of truth: modules are copied out of the shared cache into the sandbox before capture, so results are byte-identical regardless of whether the cache is warm or cold. Remote-execution users: the named cache is only effective when `--remote-execution-append-only-caches-base-path` is configured; without it the process falls back silently to a per-sandbox download (today's behavior). Note: this change invalidates all previously cached Go process results (one-time invalidation). See [#13390](https://github.com/pantsbuild/pants/issues/13390).

### Plugin API changes

`FrozenOrderedSet` is now backed by a native Rust implementation and is built in `__new__` rather than `__init__`. Subclasses that customized the set's contents by overriding `__init__` (for example, to transform the input iterable before constructing) must move that logic to `__new__`, since the set is already built and immutable by the time `__init__` runs.
Expand Down
48 changes: 48 additions & 0 deletions src/python/pants/backend/go/util_rules/sdk.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ class GoSdkProcess:
output_directories: tuple[str, ...]
replace_sandbox_root_in_args: bool

use_module_cache: bool

def __init__(
self,
command: Iterable[str],
Expand All @@ -50,6 +52,7 @@ def __init__(
output_directories: Iterable[str] = (),
allow_downloads: bool = False,
replace_sandbox_root_in_args: bool = False,
use_module_cache: bool = False,
) -> None:
object.__setattr__(self, "command", tuple(command))
object.__setattr__(self, "description", description)
Expand All @@ -67,6 +70,7 @@ def __init__(
object.__setattr__(self, "output_files", tuple(output_files))
object.__setattr__(self, "output_directories", tuple(output_directories))
object.__setattr__(self, "replace_sandbox_root_in_args", replace_sandbox_root_in_args)
object.__setattr__(self, "use_module_cache", use_module_cache)


@dataclass(frozen=True)
Expand All @@ -76,6 +80,7 @@ class GoSdkRunSetup:

CHDIR_ENV = "__PANTS_CHDIR_TO"
SANDBOX_ROOT_ENV = "__PANTS_REPLACE_SANDBOX_ROOT"
FETCH_MODULE_ENV = "__PANTS_GO_FETCH_MODULE"


@rule
Expand All @@ -92,6 +97,43 @@ async def go_sdk_invoke_setup(goroot: GoRoot) -> GoSdkRunSetup:
export GOPATH="${{sandbox_root}}/gopath"
export GOCACHE="${{sandbox_root}}/cache"
/bin/mkdir -p "$GOPATH" "$GOCACHE"
if [ -n "$__PANTS_GO_MODCACHE" ]; then
export GOMODCACHE="${{sandbox_root}}/__gomodcache"
export GOFLAGS="${{GOFLAGS:+${{GOFLAGS}} }}-modcacherw"
/bin/mkdir -p "$GOMODCACHE"
fi
if [ -n "$__PANTS_GO_FETCH_MODULE" ]; then
module_version="$__PANTS_GO_FETCH_MODULE"
"{goroot.path}/bin/go" mod download -json "$module_version" > __module_metadata.json
download_status=$?
/bin/cat __module_metadata.json
if [ "$download_status" -ne 0 ]; then
exit "$download_status"
fi
# Parse the Dir/GoMod paths from the metadata with shell string operations only:
# the sandbox PATH cannot be assumed to provide grep/sed.
dir=""
gomod=""
while IFS= read -r line; do
case "$line" in
*'"Dir": "'*) if [ -z "$dir" ]; then dir=${{line#*'"Dir": "'}}; dir=${{dir%'"'*}}; fi ;;
*'"GoMod": "'*) if [ -z "$gomod" ]; then gomod=${{line#*'"GoMod": "'}}; gomod=${{gomod%'"'*}}; fi ;;
esac
done < __module_metadata.json
marker="__gomodcache/"
dir_rel="${{dir#*${{marker}}}}"
gomod_rel="${{gomod#*${{marker}}}}"
if [ -z "$dir" ] || [ -z "$gomod" ] || [ "$dir_rel" = "$dir" ] || [ "$gomod_rel" = "$gomod" ]; then
echo "Failed to locate module $module_version under GOMODCACHE after download." 1>&2
exit 1
fi
dest_dir="$GOPATH/pkg/mod/$dir_rel"
dest_gomod="$GOPATH/pkg/mod/$gomod_rel"
/bin/mkdir -p "$dest_dir" "${{dest_gomod%/*}}" || exit 1
/bin/cp -Rp "$dir/." "$dest_dir/" || exit 1
/bin/cp -p "$gomod" "$dest_gomod" || exit 1
exit 0
fi
if [ -n "${GoSdkRunSetup.CHDIR_ENV}" ]; then
cd "${GoSdkRunSetup.CHDIR_ENV}"
fi
Expand Down Expand Up @@ -158,6 +200,11 @@ async def setup_go_sdk_process(
if request.replace_sandbox_root_in_args:
env[GoSdkRunSetup.SANDBOX_ROOT_ENV] = "1"

append_only_caches: dict[str, str] = {}
if request.use_module_cache:
env["__PANTS_GO_MODCACHE"] = "1"
append_only_caches["go_mod_cache"] = "__gomodcache"

# Disable the "coverage redesign" experiment on Go v1.20+ for now since Pants does not yet support it.
if goroot.is_compatible_version("1.20") and not goroot.is_compatible_version("1.25"):
exp_str = env.get("GOEXPERIMENT", "")
Expand All @@ -175,6 +222,7 @@ async def setup_go_sdk_process(
description=request.description,
output_files=request.output_files,
output_directories=request.output_directories,
append_only_caches=append_only_caches,
level=LogLevel.DEBUG,
)

Expand Down
44 changes: 30 additions & 14 deletions src/python/pants/backend/go/util_rules/third_party_pkg.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,12 @@ def debug_hint(self) -> str:

@dataclass(frozen=True)
class AllThirdPartyPackages:
"""All the packages downloaded from a go.mod, along with a digest of the downloaded files.
"""All the packages downloaded from a go.mod.

The digest has files in the format `gopath/pkg/mod`, which is what `GoSdkProcess` sets `GOPATH`
to. This means that you can include the digest in a process and Go will properly consume it as
the `GOPATH`.
Each ``ThirdPartyPkgAnalysis`` carries its own per-package digest with files
in ``gopath/pkg/mod/<module@version>/...`` layout (the captured subset from the
download sandbox). Consumers merge whichever package digests they need into
their own input digest.
"""

digest: Digest
Expand Down Expand Up @@ -151,7 +152,6 @@ class ModuleDescriptor:
@dataclass(frozen=True)
class ModuleDescriptors:
modules: FrozenOrderedSet[ModuleDescriptor]
go_mods_digest: Digest


@dataclass(frozen=True)
Expand Down Expand Up @@ -232,17 +232,17 @@ async def analyze_module_dependencies(request: ModuleDescriptorsRequest) -> Modu
GoSdkProcess(
command=["list", "-mod=readonly", "-e", "-m", "-json", "all"],
input_digest=request.digest,
output_directories=("gopath",),
working_dir=request.path if request.path else None,
# Allow downloads of the module metadata (i.e., go.mod files).
allow_downloads=True,
use_module_cache=True,
description="Analyze Go module dependencies.",
)
)
)

if len(mod_list_result.stdout) == 0:
return ModuleDescriptors(FrozenOrderedSet(), EMPTY_DIGEST)
return ModuleDescriptors(FrozenOrderedSet())

descriptors: dict[tuple[str, str], ModuleDescriptor] = {}

Expand Down Expand Up @@ -276,7 +276,7 @@ async def analyze_module_dependencies(request: ModuleDescriptorsRequest) -> Modu
# Gazelle does this, mainly to store the sum on the go_repository rule. We could store it (or its
# absence) to be able to download sums automatically.

return ModuleDescriptors(FrozenOrderedSet(descriptors.values()), mod_list_result.output_digest)
return ModuleDescriptors(FrozenOrderedSet(descriptors.values()))


def strip_sandbox_prefix(path: str, marker: str) -> str:
Expand Down Expand Up @@ -529,12 +529,20 @@ async def download_and_analyze_module(
synthetic_files.append(FileContent("go.sum", synthetic_go_sum.encode()))
synthetic_digest = await create_digest(CreateDigest(synthetic_files))

# Fetch the module into the named cache and copy to gopath/pkg/mod in one process.
# The __PANTS_GO_FETCH_MODULE mode in __run_go.sh handles both steps atomically:
# it runs `go mod download -json`, emits the metadata on stdout, then copies the
# extracted tree from __gomodcache into gopath/pkg/mod. This avoids any race
# between the download (process 1) and copy (process 2) that would violate the
# self-healing invariant if the named cache were wiped between steps.
download_result = await fallible_to_exec_result_or_raise(
**implicitly(
GoSdkProcess(
("mod", "download", "-json", f"{request.name}@{request.version}"),
(),
env={"__PANTS_GO_FETCH_MODULE": f"{request.name}@{request.version}"},
input_digest=synthetic_digest,
allow_downloads=True,
use_module_cache=True,
output_directories=("gopath",),
description=f"Download Go module {request.name}@{request.version}.",
)
Expand All @@ -543,12 +551,20 @@ async def download_and_analyze_module(

if len(download_result.stdout) == 0:
raise AssertionError(
f"Expected output from `go mod download` for {request.name}@{request.version}."
f"Expected metadata output from module fetch for {request.name}@{request.version}."
)

module_metadata = json.loads(download_result.stdout)
module_sources_relpath = strip_sandbox_prefix(module_metadata["Dir"], "gopath/")
go_mod_relpath = strip_sandbox_prefix(module_metadata["GoMod"], "gopath/")
module_metadata_json = json.loads(download_result.stdout)

# Dir/GoMod point into __gomodcache (absolute sandbox paths). Extract the
# portion after __gomodcache/ to get the relative path within the cache tree,
# then re-prefix with gopath/pkg/mod/ to match where the copy step placed them.
# strip_sandbox_prefix returns path[marker_pos:] (includes the marker), so we
# strip the marker length to get just the relative tail.
_modcache_marker = "__gomodcache/"
_dir_from_marker = strip_sandbox_prefix(module_metadata_json["Dir"], _modcache_marker)
_gomod_from_marker = strip_sandbox_prefix(module_metadata_json["GoMod"], _modcache_marker)
module_sources_relpath = "gopath/pkg/mod/" + _dir_from_marker[len(_modcache_marker) :]
go_mod_relpath = "gopath/pkg/mod/" + _gomod_from_marker[len(_modcache_marker) :]

module_sources_snapshot = await digest_to_snapshot(
**implicitly(
Expand Down
Loading
Loading