awslabs
diff --git a/‎.github/dependabot.yml‎
Lines changed: 54 additions & 0 deletions b/‎.github/dependabot.yml‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎.github/workflows/security-scanners.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/security-scanners.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎docs/ADMINISTRATIVE_GUIDE.md‎
Lines changed: 11 additions & 3 deletions b/‎docs/ADMINISTRATIVE_GUIDE.md‎
Lines changed: 11 additions & 3 deletions
diff --git a/‎scripts/aidlc-evaluator/docker/sandbox/Dockerfile‎
Lines changed: 2 additions & 3 deletions b/‎scripts/aidlc-evaluator/docker/sandbox/Dockerfile‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/__init__.py‎
Lines changed: 4 additions & 0 deletions b/‎scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/__init__.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/__main__.py‎
Lines changed: 5 additions & 0 deletions b/‎scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/__main__.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/collector.py‎
Lines changed: 91 additions & 12 deletions b/‎scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/collector.py‎
Lines changed: 91 additions & 12 deletions
@@ -0,0 +1,54 @@
+# Dependabot version updates
+# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates
+#
+# Note: The ClamAV service container image in security-scanners.yml is pinned
+# by sha256 digest and is NOT covered by any Dependabot ecosystem. That digest
+# must be updated manually (monthly cadence). See:
+# docs/ADMINISTRATIVE_GUIDE.md → Updating Pinned Versions → How to update the ClamAV image digest
+
+version: 2
+updates:
+  # GitHub Actions — SHA-pinned actions across all workflows
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+    commit-message:
+      prefix: "chore(deps)"
+      include: "scope"
+    groups:
+      github-actions:
+        patterns:
+          - "*"
+
+  # Python (uv) — evaluator framework dependencies
+  - package-ecosystem: "uv"
+    directory: "/scripts/aidlc-evaluator"
+    schedule:
+      interval: "weekly"
+    commit-message:
+      prefix: "chore(deps)"
+      prefix-development: "chore(deps-dev)"
+      include: "scope"
+    groups:
+      evaluator-deps:
+        patterns:
+          - "*"
+
+  # Pre-commit hooks — .pre-commit-config.yaml rev pins
+  - package-ecosystem: "pre-commit"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+    commit-message:
+      prefix: "chore(deps)"
+      include: "scope"
+
+  # Docker — sandbox Dockerfile base image
+  - package-ecosystem: "docker"
+    directory: "/scripts/aidlc-evaluator/docker/sandbox"
+    schedule:
+      interval: "weekly"
+    commit-message:
+      prefix: "chore(deps)"
+      include: "scope"
@@ -304,7 +304,7 @@ jobs:
     runs-on: ubuntu-latest
     services:
       clamav:
-        image: clamav/clamav@sha256:bf876a415b7ff77b9305b1de087e6d16833d170931581b01404e8761cb0dc87c
+        image: clamav/clamav@sha256:60ef5fee072ff46f91ca63ba09f36597e41693977600902d21df9d0d97f640e4
         ports:
           - 127.0.0.1:3310:3310
         options: >-
@@ -319,7 +319,7 @@ jobs:
       - name: Install clamdscan client
         run: |
           sudo apt-get update || true
-          sudo rm -f /var/lib/man-db/auto-update
+          sudo rm -f /var/lib/man-db/auto-update  # prevent man-db auto-update from blocking apt-get
           sudo apt-get install -y --no-install-recommends clamdscan
           sudo mkdir -p /etc/clamav
           cat << EOF | sudo tee /etc/clamav/clamd.conf
 
@@ -246,7 +246,7 @@ flowchart TD
 | **Triggers**    | `push` to `main`, `push` tags `v*`, `pull_request` to `main` (label-gated, path-filtered), `workflow_dispatch` (dispatched by `tag-on-merge.yml` or manual — select a tag in the UI to trigger a release build) |
 | **Environment** | `codebuild` (protected, manual approval)                                                                                                                                                                        |
 | **Runner**      | `ubuntu-latest`                                                                                                                                                                                                 |
-| **Concurrency** | Groups by `{workflow}-{ref}`, cancels in-progress                                                                                                                                                               |
+| **Concurrency** | Groups by `{workflow}-{event_name}-{ref}`, cancels in-progress                                                                                                                                                  |
 
 **Purpose:** Runs an AWS CodeBuild project, downloads primary and secondary artifacts from S3, caches them in GitHub Actions cache, uploads them as workflow artifacts, and (when triggered from a `v*` tag) attaches them to the GitHub Release.
 
@@ -351,7 +351,7 @@ This job runs when the `rules` label is applied, immediately removing the remind
 | **Triggers**    | `pull_request_target` to `main` (edited, labeled, opened, ready_for_review, reopened, synchronize, unlabeled); `merge_group` (checks_requested) |
 | **Environment** | *(none)*                                                                                                                                        |
 | **Runner**      | `ubuntu-latest`                                                                                                                                 |
-| **Concurrency** | Groups by `{workflow}-{ref}`, cancels in-progress                                                                                               |
+| **Concurrency** | Groups by `{workflow}-{event_name}-{ref}`, cancels in-progress                                                                                  |
 
 **Purpose:** Validates pull requests before merge. Enforces conventional commit PR titles, the contributor acknowledgment statement, merge-halt controls, and a do-not-merge label gate. Also runs as a merge queue check.
 
@@ -422,7 +422,7 @@ Only runs for `pull_request` and `pull_request_target` events. Skipped for bot a
 | **Triggers**    | `push` to `main`, `pull_request` to `main`, `schedule` (daily 03:47 UTC), `workflow_dispatch`  |
 | **Environment** | *(none)*                                                                                       |
 | **Runner**      | `ubuntu-latest`                                                                                |
-| **Concurrency** | Groups by `{workflow}-{ref}`, cancels in-progress                                              |
+| **Concurrency** | Groups by `{workflow}-{event_name}-{ref}`, cancels in-progress                                 |
 
 **Purpose:** Runs six independent security scanners in parallel to detect secrets, vulnerabilities, misconfigurations, and malware. All HIGH and CRITICAL findings must be remediated or have a documented risk acceptance before merge (see [Security Finding Requirements](#security-finding-requirements)).
 
@@ -675,3 +675,11 @@ Pinned versions should be reviewed and updated **at least quarterly**.
   - How to handle breaking changes in scanner tool upgrades
   - Consider automating this with Dependabot or Renovate
 -->
+
+Agent pre-commit checklist (recommended):
+
+- `npx markdownlint-cli2 --fix "**/*.md"` — auto-fix markdown lint issues
+- `npx markdownlint-cli2 "**/*.md"` — verify no lint errors remain
+- `uv run pytest` — run tests via the uv wrapper
+
+Agents must run the checklist above and ensure all checks pass before committing and pushing changes.
@@ -1,16 +1,15 @@
 # Multi-language sandbox image for running AI-generated code in isolation.
 #
-# Includes Python 3.13 + uv, Node.js 22 + npm, and common build tools.
+# Includes Python 3.14 + uv, Node.js 22 + npm, and common build tools.
 # Runs as a non-root user with no credentials or host tools.
 #
 # Security notes:
-# - Base image is intentionally not pinned to a hash to receive security updates
 # - HEALTHCHECK is omitted as this is an ephemeral test sandbox, not a service
 # - RUN commands use pipes without pipefail, acceptable for dependency installation
 
 # checkov:skip=CKV_DOCKER_2:HEALTHCHECK not needed for ephemeral test sandbox
 # nosemgrep: dockerfile-source-not-pinned
-FROM public.ecr.aws/docker/library/python:3.13-slim AS base
+FROM public.ecr.aws/docker/library/python:3.14-slim@sha256:3989a23fd2c28a34c7be819e488b958a10601d421ac25bea1e7a5d757365e2d5 AS base
 
 # Install system dependencies and Node.js 22
 # nosemgrep: set-pipefail
 
@@ -19,6 +19,8 @@
 from trend_reports.models import (
     BaselineMetrics,
     GateResult,
+    InfraFailure,
+    InfraFailureReason,
     RunData,
     RunType,
     SemVer,
@@ -33,6 +35,8 @@
 __all__ = [
     "BaselineMetrics",
     "GateResult",
+    "InfraFailure",
+    "InfraFailureReason",
     "RunData",
     "RunType",
     "SemVer",
 
@@ -217,6 +217,11 @@ def cmd_trend(
         # 5. Gate
         if gate:
             result = check_regressions(trend)
+            if result.infra_failure_detected:
+                print(
+                    f"Gate WARNING: {result.infra_failure_summary}",
+                    file=sys.stderr,
+                )
             if result.passed:
                 print(
                     f"Gate PASSED: {result.latest_label} vs {result.comparison_label} "
 
@@ -18,6 +18,8 @@
     ContractTestResults,
     DocumentScore,
     HandoffMetrics,
+    InfraFailure,
+    InfraFailureReason,
     QualitativeComparison,
     RunConfig,
     RunData,
@@ -147,14 +149,20 @@ def parse_run_metrics(yaml_path: Path) -> RunMetrics:
 
     hp = raw.get("handoff_patterns", {})
     errors = raw.get("errors", {})
+    throttle_events = errors.get("throttle_events", 0)
+    timeout_events = errors.get("timeout_events", 0)
+    failed_tool_calls = errors.get("failed_tool_calls", 0)
+    model_error_events = errors.get("model_error_events", 0)
+    service_unavailable_events = errors.get("service_unavailable_events", 0)
+    validation_error_events = errors.get("validation_error_events", 0)
     error_count = sum(
         [
-            errors.get("throttle_events", 0),
-            errors.get("timeout_events", 0),
-            errors.get("failed_tool_calls", 0),
-            errors.get("model_error_events", 0),
-            errors.get("service_unavailable_events", 0),
-            errors.get("validation_error_events", 0),
+            throttle_events,
+            timeout_events,
+            failed_tool_calls,
+            model_error_events,
+            service_unavailable_events,
+            validation_error_events,
         ]
     )
 
@@ -175,6 +183,12 @@ def parse_run_metrics(yaml_path: Path) -> RunMetrics:
         handoffs=handoffs,
         server_startup_success=True,
         error_count=error_count,
+        throttle_events=throttle_events,
+        service_unavailable_events=service_unavailable_events,
+        model_error_events=model_error_events,
+        timeout_events=timeout_events,
+        failed_tool_calls=failed_tool_calls,
+        validation_error_events=validation_error_events,
     )
 
 
@@ -215,12 +229,17 @@ def parse_contract_tests(yaml_path: Path) -> ContractTestResults:
                 )
             )
 
+    server_started = raw.get("server_started", True)
+    server_error = raw.get("server_error") or ""
+
     return ContractTestResults(
         total=total,
         passed=passed,
         failed=failed,
         pass_rate=pass_rate,
         failures=failures,
+        server_started=server_started,
+        server_error=server_error,
     )
 
 
@@ -301,6 +320,59 @@ def classify_run(rules_ref: str) -> tuple[RunType, str, SemVer | None, int | Non
         return RunType.RELEASE, rules_ref, None, None
 
 
+# ---------------------------------------------------------------------------
+# Infrastructure failure detection
+# ---------------------------------------------------------------------------
+
+
+def detect_infra_failure(
+    meta: RunMeta,
+    metrics: RunMetrics,
+    contract_tests: ContractTestResults,
+    has_metrics_file: bool,
+) -> InfraFailure:
+    """Detect infrastructure failures from run signals.
+
+    Conservative: only flags clear infra issues, not ambiguous cases.
+
+    Four independent signals are checked, and every signal that fires adds
+    one reason, so a single run can report several reasons at once. The
+    timeout, failed-tool-call and validation-error counters on ``metrics``
+    are not consulted here.
+
+    Args:
+        meta: Parsed run metadata; only ``meta.status`` is read.
+        metrics: Parsed run metrics; only ``throttle_events``,
+            ``service_unavailable_events`` and ``model_error_events`` are read.
+        contract_tests: Parsed contract-test results; only
+            ``server_started`` is read.
+        has_metrics_file: Whether the caller found a run-metrics file for
+            this run. ``False`` is itself reported as a reason.
+
+    Returns:
+        ``InfraFailure(is_infra_failure=False)`` when no signal fired.
+        Otherwise an ``InfraFailure`` with ``is_infra_failure=True``, the
+        collected ``reasons`` in check order, and a ``summary`` string that
+        lists the reason values separated by commas.
+    """
+    reasons: list[InfraFailureReason] = []
+
+    # Signal 1: Bedrock infra errors in run-metrics.yaml
+    # Any non-zero count is enough to flag the corresponding reason.
+    if metrics.throttle_events > 0:
+        reasons.append(InfraFailureReason.THROTTLED)
+    if metrics.service_unavailable_events > 0:
+        reasons.append(InfraFailureReason.SERVICE_UNAVAILABLE)
+    if metrics.model_error_events > 0:
+        reasons.append(InfraFailureReason.MODEL_ERROR)
+
+    # Signal 2: run-meta.yaml status indicates failure/crash
+    # A falsy status (None or "") is normalised to "" so .lower() is never
+    # called on a missing value.
+    status_lower = meta.status.lower() if meta.status else ""
+    # Case-insensitive substring match: any status containing "failed" counts.
+    if "failed" in status_lower:
+        reasons.append(InfraFailureReason.RUN_FAILED)
+    # A missing, empty or whitespace-only status is reported as a crash.
+    elif not meta.status or meta.status.strip() == "":
+        reasons.append(InfraFailureReason.RUN_CRASHED)
+
+    # Signal 3: run-metrics.yaml missing entirely (swarm crashed before writing)
+    if not has_metrics_file:
+        reasons.append(InfraFailureReason.METRICS_MISSING)
+
+    # Signal 4: Server failed to start (from contract-test-results.yaml)
+    if not contract_tests.server_started:
+        reasons.append(InfraFailureReason.SERVER_START_FAILED)
+
+    # No signal fired: return the explicit "not an infra failure" result.
+    if not reasons:
+        return InfraFailure(is_infra_failure=False)
+
+    # NOTE(review): assumes each reason's .value is a str, since str.join
+    # rejects non-string items — confirm against InfraFailureReason.
+    reason_strs = [r.value for r in reasons]
+    summary = f"Infrastructure failure detected: {', '.join(reason_strs)}"
+
+    return InfraFailure(
+        is_infra_failure=True,
+        reasons=reasons,
+        summary=summary,
+    )
+
+
 # ---------------------------------------------------------------------------
 # Collection pipeline
 # ---------------------------------------------------------------------------
@@ -319,11 +391,8 @@ def _collect_from_run_dir(run_dir: Path, source_label: str) -> RunData:
     meta = parse_run_meta(yaml_files["run-meta"])
     run_type, label, semver, pr_number = classify_run(meta.config.rules_ref)
 
-    metrics = (
-        parse_run_metrics(yaml_files["run-metrics"])
-        if "run-metrics" in yaml_files
-        else RunMetrics()
-    )
+    has_metrics_file = "run-metrics" in yaml_files
+    metrics = parse_run_metrics(yaml_files["run-metrics"]) if has_metrics_file else RunMetrics()
     unit_tests = (
         parse_test_results(yaml_files["test-results"])
         if "test-results" in yaml_files
@@ -334,6 +403,10 @@ def _collect_from_run_dir(run_dir: Path, source_label: str) -> RunData:
         if "contract-test-results" in yaml_files
         else ContractTestResults()
     )
+
+    # Propagate actual server_started to metrics
+    metrics.server_startup_success = contract_tests.server_started
+
     code_quality = (
         parse_quality_report(yaml_files["quality-report"])
         if "quality-report" in yaml_files
@@ -346,13 +419,18 @@ def _collect_from_run_dir(run_dir: Path, source_label: str) -> RunData:
     )
 
     # Backfill artifact counts from run-metrics if available
-    if "run-metrics" in yaml_files:
+    if has_metrics_file:
         raw_metrics = _load_yaml(yaml_files["run-metrics"])
         workspace = raw_metrics.get("artifacts", {}).get("workspace", {})
         code_quality.source_file_count = workspace.get("source_files", 0)
         code_quality.test_file_count = workspace.get("test_files", 0)
         code_quality.total_lines_of_code = workspace.get("total_lines_of_code", 0)
 
+    # Detect infrastructure failures
+    infra_failure = detect_infra_failure(meta, metrics, contract_tests, has_metrics_file)
+    if infra_failure.is_infra_failure:
+        logger.warning("Infra failure detected in %s: %s", source_label, infra_failure.summary)
+
     return RunData(
         label=label,
         run_type=run_type,
@@ -364,6 +442,7 @@ def _collect_from_run_dir(run_dir: Path, source_label: str) -> RunData:
         contract_tests=contract_tests,
         code_quality=code_quality,
         qualitative=qualitative,
+        infra_failure=infra_failure,
     )