From 4f98c715868410c7839953049cfc3ff8c56c4a2f Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Thu, 21 May 2026 15:10:01 -0500 Subject: [PATCH 1/2] Run pytest --collect-only in parallel batches in split_tests cProfile showed 99.6% of split_tests.py wall time was spent in the single pytest --collect-only subprocess. Fan out the collection across ``os.cpu_count()`` workers; round-robin chunking keeps each batch roughly equal, and tests/components is expanded one level deeper so the ~1000 integration subdirectories distribute evenly. Local wall time dropped from ~132s to ~11s on an 18-core box. Bucket output is unchanged because we still parse the same pytest -qq output, just aggregated from multiple invocations. --- script/split_tests.py | 81 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 65 insertions(+), 16 deletions(-) diff --git a/script/split_tests.py b/script/split_tests.py index 6888372d947b2f..6d60b654a1b67a 100755 --- a/script/split_tests.py +++ b/script/split_tests.py @@ -2,13 +2,19 @@ """Helper script to split test into n buckets.""" import argparse +from concurrent.futures import ProcessPoolExecutor from dataclasses import dataclass, field from math import ceil +import os from pathlib import Path import subprocess import sys from typing import Final +# tests/components has ~1000 sub-directories, which makes it the natural +# place to subdivide to keep each pytest invocation roughly equal in size. 
+_FAN_OUT_DIRS: Final = frozenset({"components"}) + class Bucket: """Class to hold bucket.""" @@ -164,33 +170,76 @@ def get_all_flatten(self) -> list[TestFolder | TestFile]: return result -def collect_tests(path: Path) -> TestFolder: - """Collect all tests.""" +def _collect_batch(paths: list[Path]) -> tuple[str, str, int]: + """Run pytest --collect-only on a batch of paths.""" result = subprocess.run( - ["pytest", "--collect-only", "-qq", "-p", "no:warnings", path], + ["pytest", "--collect-only", "-qq", "-p", "no:warnings", *map(str, paths)], check=False, capture_output=True, text=True, ) + return result.stdout, result.stderr, result.returncode - if result.returncode != 0: - print("Failed to collect tests:") - print(result.stderr) - print(result.stdout) - sys.exit(1) - folder = TestFolder(path) +def _enumerate_batch_paths(path: Path) -> list[Path]: + """Return the child paths to run pytest --collect-only over. - for line in result.stdout.splitlines(): - if not line.strip(): + Files are returned as-is. Directories are expanded one level deep, with + a second level of expansion for entries named in ``_FAN_OUT_DIRS`` so the + enormous ``tests/components`` tree fans out into per-integration paths. 
+ """ + if path.is_file(): + return [path] + + paths: list[Path] = [] + for entry in sorted(path.iterdir()): + if entry.name.startswith((".", "_")): continue - file_path, _, total_tests = line.partition(": ") - if not path or not total_tests: - print(f"Unexpected line: {line}") + if entry.is_dir(): + if entry.name in _FAN_OUT_DIRS: + paths.extend( + sub + for sub in sorted(entry.iterdir()) + if not sub.name.startswith((".", "_")) + ) + else: + paths.append(entry) + elif entry.suffix == ".py" and entry.name.startswith("test_"): + paths.append(entry) + return paths + + +def collect_tests(path: Path) -> TestFolder: + """Collect all tests.""" + batch_paths = _enumerate_batch_paths(path) + workers = min(len(batch_paths), os.cpu_count() or 1) or 1 + # Round-robin chunking keeps batches roughly balanced when path + # ordering correlates with test size. + batches = [batch_paths[i::workers] for i in range(workers)] + + if workers == 1: + results = [_collect_batch(batches[0])] + else: + with ProcessPoolExecutor(max_workers=workers) as executor: + results = list(executor.map(_collect_batch, batches)) + + folder = TestFolder(path) + for stdout, stderr, returncode in results: + if returncode != 0: + print("Failed to collect tests:") + print(stderr) + print(stdout) sys.exit(1) + for line in stdout.splitlines(): + if not line.strip(): + continue + file_path, _, total_tests = line.partition(": ") + if not file_path or not total_tests: + print(f"Unexpected line: {line}") + sys.exit(1) - file = TestFile(int(total_tests), Path(file_path)) - folder.add_test_file(file) + file = TestFile(int(total_tests), Path(file_path)) + folder.add_test_file(file) return folder From 8dadaa2f9e0eb404b63c7406b6c8c418dcbe1c48 Mon Sep 17 00:00:00 2001 From: "J. 
Nick Koston" Date: Thu, 21 May 2026 15:17:42 -0500 Subject: [PATCH 2/2] Filter fan-out children and fail fast on empty batch list Only pass directories and test_*.py files to pytest --collect-only so helpers like tests/components/conftest.py and tests/components/common.py are not treated as explicit collection targets, and bail out with a clear error if no eligible paths are found instead of running pytest with no arguments. --- script/split_tests.py | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/script/split_tests.py b/script/split_tests.py index 6d60b654a1b67a..5b770fc913a31d 100755 --- a/script/split_tests.py +++ b/script/split_tests.py @@ -181,6 +181,22 @@ def _collect_batch(paths: list[Path]) -> tuple[str, str, int]: return result.stdout, result.stderr, result.returncode +def _iter_eligible_children(path: Path) -> list[Path]: + """Return immediate children of ``path`` that pytest should collect. + + Skips entries whose names start with ``.`` or ``_`` (hidden files, + ``__pycache__``, ``__init__.py``) and non-``test_*.py`` files, so helper + modules like ``conftest.py`` and ``common.py`` are not collected explicitly. + """ + children: list[Path] = [] + for entry in sorted(path.iterdir()): + if entry.name.startswith((".", "_")): + continue + if entry.is_dir() or (entry.suffix == ".py" and entry.name.startswith("test_")): + children.append(entry) + return children + + def _enumerate_batch_paths(path: Path) -> list[Path]: """Return the child paths to run pytest --collect-only over. 
@@ -192,19 +208,10 @@ def _enumerate_batch_paths(path: Path) -> list[Path]: return [path] paths: list[Path] = [] - for entry in sorted(path.iterdir()): - if entry.name.startswith((".", "_")): - continue - if entry.is_dir(): - if entry.name in _FAN_OUT_DIRS: - paths.extend( - sub - for sub in sorted(entry.iterdir()) - if not sub.name.startswith((".", "_")) - ) - else: - paths.append(entry) - elif entry.suffix == ".py" and entry.name.startswith("test_"): + for entry in _iter_eligible_children(path): + if entry.is_dir() and entry.name in _FAN_OUT_DIRS: + paths.extend(_iter_eligible_children(entry)) + else: paths.append(entry) return paths @@ -212,6 +219,9 @@ def _enumerate_batch_paths(path: Path) -> list[Path]: def collect_tests(path: Path) -> TestFolder: """Collect all tests.""" batch_paths = _enumerate_batch_paths(path) + if not batch_paths: + print(f"No eligible test paths found under {path}") + sys.exit(1) workers = min(len(batch_paths), os.cpu_count() or 1) or 1 # Round-robin chunking keeps batches roughly balanced when path # ordering correlates with test size.