From 4f98c715868410c7839953049cfc3ff8c56c4a2f Mon Sep 17 00:00:00 2001
From: "J. Nick Koston" <nick@home-assistant.io>
Date: Thu, 21 May 2026 15:10:01 -0500
Subject: [PATCH 1/2] Run pytest --collect-only in parallel batches in
 split_tests

cProfile showed 99.6% of split_tests.py wall time was spent in the
single pytest --collect-only subprocess.  Fan out the collection across
``os.cpu_count()`` workers; round-robin chunking keeps each batch
roughly equal, and tests/components is expanded one level deeper so
the ~1000 integration subdirectories distribute evenly.  Local wall
time dropped from ~132s to ~11s on an 18-core box.  Bucket output is
unchanged because we still parse the same pytest -qq output, just
aggregated from multiple invocations.
---
 script/split_tests.py | 81 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 65 insertions(+), 16 deletions(-)

diff --git a/script/split_tests.py b/script/split_tests.py
index 6888372d947b2f..6d60b654a1b67a 100755
--- a/script/split_tests.py
+++ b/script/split_tests.py
@@ -2,13 +2,19 @@
 """Helper script to split test into n buckets."""
 
 import argparse
+from concurrent.futures import ProcessPoolExecutor
 from dataclasses import dataclass, field
 from math import ceil
+import os
 from pathlib import Path
 import subprocess
 import sys
 from typing import Final
 
+# tests/components has ~1000 sub-directories, which makes it the natural
+# place to subdivide to keep each pytest invocation roughly equal in size.
+_FAN_OUT_DIRS: Final = frozenset({"components"})
+
 
 class Bucket:
     """Class to hold bucket."""
@@ -164,33 +170,76 @@ def get_all_flatten(self) -> list[TestFolder | TestFile]:
         return result
 
 
-def collect_tests(path: Path) -> TestFolder:
-    """Collect all tests."""
+def _collect_batch(paths: list[Path]) -> tuple[str, str, int]:
+    """Run pytest --collect-only on a batch of paths."""
     result = subprocess.run(
-        ["pytest", "--collect-only", "-qq", "-p", "no:warnings", path],
+        ["pytest", "--collect-only", "-qq", "-p", "no:warnings", *map(str, paths)],
         check=False,
         capture_output=True,
         text=True,
     )
+    return result.stdout, result.stderr, result.returncode
 
-    if result.returncode != 0:
-        print("Failed to collect tests:")
-        print(result.stderr)
-        print(result.stdout)
-        sys.exit(1)
 
-    folder = TestFolder(path)
+def _enumerate_batch_paths(path: Path) -> list[Path]:
+    """Return the child paths to run pytest --collect-only over.
 
-    for line in result.stdout.splitlines():
-        if not line.strip():
+    Files are returned as-is.  Directories are expanded one level deep, with
+    a second level of expansion for entries named in ``_FAN_OUT_DIRS`` so the
+    enormous ``tests/components`` tree fans out into per-integration paths.
+    """
+    if path.is_file():
+        return [path]
+
+    paths: list[Path] = []
+    for entry in sorted(path.iterdir()):
+        if entry.name.startswith((".", "_")):
             continue
-        file_path, _, total_tests = line.partition(": ")
-        if not path or not total_tests:
-            print(f"Unexpected line: {line}")
+        if entry.is_dir():
+            if entry.name in _FAN_OUT_DIRS:
+                paths.extend(
+                    sub
+                    for sub in sorted(entry.iterdir())
+                    if not sub.name.startswith((".", "_"))
+                )
+            else:
+                paths.append(entry)
+        elif entry.suffix == ".py" and entry.name.startswith("test_"):
+            paths.append(entry)
+    return paths
+
+
+def collect_tests(path: Path) -> TestFolder:
+    """Collect all tests."""
+    batch_paths = _enumerate_batch_paths(path)
+    workers = min(len(batch_paths), os.cpu_count() or 1) or 1
+    # Round-robin chunking keeps batches roughly balanced when path
+    # ordering correlates with test size.
+    batches = [batch_paths[i::workers] for i in range(workers)]
+
+    if workers == 1:
+        results = [_collect_batch(batches[0])]
+    else:
+        with ProcessPoolExecutor(max_workers=workers) as executor:
+            results = list(executor.map(_collect_batch, batches))
+
+    folder = TestFolder(path)
+    for stdout, stderr, returncode in results:
+        if returncode != 0:
+            print("Failed to collect tests:")
+            print(stderr)
+            print(stdout)
             sys.exit(1)
+        for line in stdout.splitlines():
+            if not line.strip():
+                continue
+            file_path, _, total_tests = line.partition(": ")
+            if not file_path or not total_tests:
+                print(f"Unexpected line: {line}")
+                sys.exit(1)
 
-        file = TestFile(int(total_tests), Path(file_path))
-        folder.add_test_file(file)
+            file = TestFile(int(total_tests), Path(file_path))
+            folder.add_test_file(file)
 
     return folder
 

From 8dadaa2f9e0eb404b63c7406b6c8c418dcbe1c48 Mon Sep 17 00:00:00 2001
From: "J. Nick Koston" <nick@home-assistant.io>
Date: Thu, 21 May 2026 15:17:42 -0500
Subject: [PATCH 2/2] Filter fan-out children and fail fast on empty batch list

Only pass directories and test_*.py files to pytest --collect-only so
helpers like tests/components/conftest.py and tests/components/common.py
are not treated as explicit collection targets, and bail out with a
clear error if no eligible paths are found instead of running pytest
with no arguments.
---
 script/split_tests.py | 36 +++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/script/split_tests.py b/script/split_tests.py
index 6d60b654a1b67a..5b770fc913a31d 100755
--- a/script/split_tests.py
+++ b/script/split_tests.py
@@ -181,6 +181,22 @@ def _collect_batch(paths: list[Path]) -> tuple[str, str, int]:
     return result.stdout, result.stderr, result.returncode
 
 
+def _iter_eligible_children(path: Path) -> list[Path]:
+    """Return immediate children of ``path`` that pytest should collect.
+
+    Skips entries whose name starts with ``.`` or ``_`` (hidden files,
+    ``__pycache__``, ``__init__.py``) and non-``test_*.py`` files, so helpers
+    like ``conftest.py`` and ``common.py`` are not explicit collection targets.
+    """
+    children: list[Path] = []
+    for entry in sorted(path.iterdir()):
+        if entry.name.startswith((".", "_")):
+            continue
+        if entry.is_dir() or (entry.suffix == ".py" and entry.name.startswith("test_")):
+            children.append(entry)
+    return children
+
+
 def _enumerate_batch_paths(path: Path) -> list[Path]:
     """Return the child paths to run pytest --collect-only over.
 
@@ -192,19 +208,10 @@ def _enumerate_batch_paths(path: Path) -> list[Path]:
         return [path]
 
     paths: list[Path] = []
-    for entry in sorted(path.iterdir()):
-        if entry.name.startswith((".", "_")):
-            continue
-        if entry.is_dir():
-            if entry.name in _FAN_OUT_DIRS:
-                paths.extend(
-                    sub
-                    for sub in sorted(entry.iterdir())
-                    if not sub.name.startswith((".", "_"))
-                )
-            else:
-                paths.append(entry)
-        elif entry.suffix == ".py" and entry.name.startswith("test_"):
+    for entry in _iter_eligible_children(path):
+        if entry.is_dir() and entry.name in _FAN_OUT_DIRS:
+            paths.extend(_iter_eligible_children(entry))
+        else:
             paths.append(entry)
     return paths
 
@@ -212,6 +219,9 @@ def _enumerate_batch_paths(path: Path) -> list[Path]:
 def collect_tests(path: Path) -> TestFolder:
     """Collect all tests."""
     batch_paths = _enumerate_batch_paths(path)
+    if not batch_paths:
+        print(f"No eligible test paths found under {path}")
+        sys.exit(1)
     workers = min(len(batch_paths), os.cpu_count() or 1) or 1
     # Round-robin chunking keeps batches roughly balanced when path
     # ordering correlates with test size.