Skip to content

Commit 9635515

Browse files
Merge branch 'main' into add-agents-md
2 parents 6bc5e17 + 0de057d commit 9635515

20 files changed

Lines changed: 848 additions & 101 deletions

.github/dependabot.yml

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Dependabot version updates
2+
# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates
3+
#
4+
# Note: The ClamAV service container image in security-scanners.yml is pinned
5+
# by sha256 digest and is NOT covered by any Dependabot ecosystem. That digest
6+
# must be updated manually (monthly cadence). See:
7+
# docs/ADMINISTRATIVE_GUIDE.md → Updating Pinned Versions → How to update the ClamAV image digest
8+
9+
version: 2
10+
updates:
11+
# GitHub Actions — SHA-pinned actions across all workflows
12+
- package-ecosystem: "github-actions"
13+
directory: "/"
14+
schedule:
15+
interval: "weekly"
16+
commit-message:
17+
prefix: "chore(deps)"
18+
include: "scope"
19+
groups:
20+
github-actions:
21+
patterns:
22+
- "*"
23+
24+
# Python (uv) — evaluator framework dependencies
25+
- package-ecosystem: "uv"
26+
directory: "/scripts/aidlc-evaluator"
27+
schedule:
28+
interval: "weekly"
29+
commit-message:
30+
prefix: "chore(deps)"
31+
prefix-development: "chore(deps-dev)"
32+
include: "scope"
33+
groups:
34+
evaluator-deps:
35+
patterns:
36+
- "*"
37+
38+
# Pre-commit hooks — .pre-commit-config.yaml rev pins
39+
- package-ecosystem: "pre-commit"
40+
directory: "/"
41+
schedule:
42+
interval: "weekly"
43+
commit-message:
44+
prefix: "chore(deps)"
45+
include: "scope"
46+
47+
# Docker — sandbox Dockerfile base image
48+
- package-ecosystem: "docker"
49+
directory: "/scripts/aidlc-evaluator/docker/sandbox"
50+
schedule:
51+
interval: "weekly"
52+
commit-message:
53+
prefix: "chore(deps)"
54+
include: "scope"

.github/workflows/security-scanners.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,7 @@ jobs:
304304
runs-on: ubuntu-latest
305305
services:
306306
clamav:
307-
image: clamav/clamav@sha256:bf876a415b7ff77b9305b1de087e6d16833d170931581b01404e8761cb0dc87c
307+
image: clamav/clamav@sha256:60ef5fee072ff46f91ca63ba09f36597e41693977600902d21df9d0d97f640e4
308308
ports:
309309
- 127.0.0.1:3310:3310
310310
options: >-
@@ -319,7 +319,7 @@ jobs:
319319
- name: Install clamdscan client
320320
run: |
321321
sudo apt-get update || true
322-
sudo rm -f /var/lib/man-db/auto-update
322+
sudo rm -f /var/lib/man-db/auto-update # prevent man-db auto-update from blocking apt-get
323323
sudo apt-get install -y --no-install-recommends clamdscan
324324
sudo mkdir -p /etc/clamav
325325
cat << EOF | sudo tee /etc/clamav/clamd.conf

docs/ADMINISTRATIVE_GUIDE.md

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ flowchart TD
246246
| **Triggers** | `push` to `main`, `push` tags `v*`, `pull_request` to `main` (label-gated, path-filtered), `workflow_dispatch` (dispatched by `tag-on-merge.yml` or manual — select a tag in the UI to trigger a release build) |
247247
| **Environment** | `codebuild` (protected, manual approval) |
248248
| **Runner** | `ubuntu-latest` |
249-
| **Concurrency** | Groups by `{workflow}-{ref}`, cancels in-progress |
249+
| **Concurrency** | Groups by `{workflow}-{event_name}-{ref}`, cancels in-progress |
250250

251251
**Purpose:** Runs an AWS CodeBuild project, downloads primary and secondary artifacts from S3, caches them in GitHub Actions cache, uploads them as workflow artifacts, and (when triggered from a `v*` tag) attaches them to the GitHub Release.
252252

@@ -351,7 +351,7 @@ This job runs when the `rules` label is applied, immediately removing the remind
351351
| **Triggers** | `pull_request_target` to `main` (edited, labeled, opened, ready_for_review, reopened, synchronize, unlabeled); `merge_group` (checks_requested) |
352352
| **Environment** | *(none)* |
353353
| **Runner** | `ubuntu-latest` |
354-
| **Concurrency** | Groups by `{workflow}-{ref}`, cancels in-progress |
354+
| **Concurrency** | Groups by `{workflow}-{event_name}-{ref}`, cancels in-progress |
355355

356356
**Purpose:** Validates pull requests before merge. Enforces conventional commit PR titles, the contributor acknowledgment statement, merge-halt controls, and a do-not-merge label gate. Also runs as a merge queue check.
357357

@@ -422,7 +422,7 @@ Only runs for `pull_request` and `pull_request_target` events. Skipped for bot a
422422
| **Triggers** | `push` to `main`, `pull_request` to `main`, `schedule` (daily 03:47 UTC), `workflow_dispatch` |
423423
| **Environment** | *(none)* |
424424
| **Runner** | `ubuntu-latest` |
425-
| **Concurrency** | Groups by `{workflow}-{ref}`, cancels in-progress |
425+
| **Concurrency** | Groups by `{workflow}-{event_name}-{ref}`, cancels in-progress |
426426

427427
**Purpose:** Runs six independent security scanners in parallel to detect secrets, vulnerabilities, misconfigurations, and malware. All HIGH and CRITICAL findings must be remediated or have a documented risk acceptance before merge (see [Security Finding Requirements](#security-finding-requirements)).
428428

@@ -675,3 +675,11 @@ Pinned versions should be reviewed and updated **at least quarterly**.
675675
- How to handle breaking changes in scanner tool upgrades
676676
- Consider automating this with Dependabot or Renovate
677677
-->
678+
679+
Agent pre-commit checklist (required):
680+
681+
- `npx markdownlint-cli2 --fix "**/*.md"` — auto-fix Markdown lint issues
682+
- `npx markdownlint-cli2 "**/*.md"` — verify no lint errors remain
683+
- `uv run pytest` — run the test suite via the uv wrapper
684+
685+
Agents must run the checklist above and ensure all checks pass before committing and pushing changes.

scripts/aidlc-evaluator/docker/sandbox/Dockerfile

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,15 @@
11
# Multi-language sandbox image for running AI-generated code in isolation.
22
#
3-
# Includes Python 3.13 + uv, Node.js 22 + npm, and common build tools.
3+
# Includes Python 3.14 + uv, Node.js 22 + npm, and common build tools.
44
# Runs as a non-root user with no credentials or host tools.
55
#
66
# Security notes:
7-
# - Base image is intentionally not pinned to a hash to receive security updates
87
# - HEALTHCHECK is omitted as this is an ephemeral test sandbox, not a service
98
# - RUN commands use pipes without pipefail, acceptable for dependency installation
109

1110
# checkov:skip=CKV_DOCKER_2:HEALTHCHECK not needed for ephemeral test sandbox
1211
# nosemgrep: dockerfile-source-not-pinned
13-
FROM public.ecr.aws/docker/library/python:3.13-slim AS base
12+
FROM public.ecr.aws/docker/library/python:3.14-slim@sha256:3989a23fd2c28a34c7be819e488b958a10601d421ac25bea1e7a5d757365e2d5 AS base
1413

1514
# Install system dependencies and Node.js 22
1615
# nosemgrep: set-pipefail

scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
from trend_reports.models import (
2020
BaselineMetrics,
2121
GateResult,
22+
InfraFailure,
23+
InfraFailureReason,
2224
RunData,
2325
RunType,
2426
SemVer,
@@ -33,6 +35,8 @@
3335
__all__ = [
3436
"BaselineMetrics",
3537
"GateResult",
38+
"InfraFailure",
39+
"InfraFailureReason",
3640
"RunData",
3741
"RunType",
3842
"SemVer",

scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/__main__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,11 @@ def cmd_trend(
217217
# 5. Gate
218218
if gate:
219219
result = check_regressions(trend)
220+
if result.infra_failure_detected:
221+
print(
222+
f"Gate WARNING: {result.infra_failure_summary}",
223+
file=sys.stderr,
224+
)
220225
if result.passed:
221226
print(
222227
f"Gate PASSED: {result.latest_label} vs {result.comparison_label} "

scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/collector.py

Lines changed: 91 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
ContractTestResults,
1919
DocumentScore,
2020
HandoffMetrics,
21+
InfraFailure,
22+
InfraFailureReason,
2123
QualitativeComparison,
2224
RunConfig,
2325
RunData,
@@ -147,14 +149,20 @@ def parse_run_metrics(yaml_path: Path) -> RunMetrics:
147149

148150
hp = raw.get("handoff_patterns", {})
149151
errors = raw.get("errors", {})
152+
throttle_events = errors.get("throttle_events", 0)
153+
timeout_events = errors.get("timeout_events", 0)
154+
failed_tool_calls = errors.get("failed_tool_calls", 0)
155+
model_error_events = errors.get("model_error_events", 0)
156+
service_unavailable_events = errors.get("service_unavailable_events", 0)
157+
validation_error_events = errors.get("validation_error_events", 0)
150158
error_count = sum(
151159
[
152-
errors.get("throttle_events", 0),
153-
errors.get("timeout_events", 0),
154-
errors.get("failed_tool_calls", 0),
155-
errors.get("model_error_events", 0),
156-
errors.get("service_unavailable_events", 0),
157-
errors.get("validation_error_events", 0),
160+
throttle_events,
161+
timeout_events,
162+
failed_tool_calls,
163+
model_error_events,
164+
service_unavailable_events,
165+
validation_error_events,
158166
]
159167
)
160168

@@ -175,6 +183,12 @@ def parse_run_metrics(yaml_path: Path) -> RunMetrics:
175183
handoffs=handoffs,
176184
server_startup_success=True,
177185
error_count=error_count,
186+
throttle_events=throttle_events,
187+
service_unavailable_events=service_unavailable_events,
188+
model_error_events=model_error_events,
189+
timeout_events=timeout_events,
190+
failed_tool_calls=failed_tool_calls,
191+
validation_error_events=validation_error_events,
178192
)
179193

180194

@@ -215,12 +229,17 @@ def parse_contract_tests(yaml_path: Path) -> ContractTestResults:
215229
)
216230
)
217231

232+
server_started = raw.get("server_started", True)
233+
server_error = raw.get("server_error") or ""
234+
218235
return ContractTestResults(
219236
total=total,
220237
passed=passed,
221238
failed=failed,
222239
pass_rate=pass_rate,
223240
failures=failures,
241+
server_started=server_started,
242+
server_error=server_error,
224243
)
225244

226245

@@ -301,6 +320,59 @@ def classify_run(rules_ref: str) -> tuple[RunType, str, SemVer | None, int | Non
301320
return RunType.RELEASE, rules_ref, None, None
302321

303322

323+
# ---------------------------------------------------------------------------
324+
# Infrastructure failure detection
325+
# ---------------------------------------------------------------------------
326+
327+
328+
def detect_infra_failure(
    meta: RunMeta,
    metrics: RunMetrics,
    contract_tests: ContractTestResults,
    has_metrics_file: bool,
) -> InfraFailure:
    """Classify a run as an infrastructure failure from its recorded signals.

    Four signals are inspected in a fixed order: the infra error counters on
    ``metrics``, the status string on ``meta``, whether the run-metrics file
    was present, and whether the server started according to
    ``contract_tests``. Each signal that fires contributes one reason.

    Returns an ``InfraFailure`` with ``is_infra_failure=False`` when no signal
    fires; otherwise one carrying the collected reasons and a summary that
    lists the reason values in the order they were detected.
    """
    found: list[InfraFailureReason] = []

    # Signal 1: Bedrock infra error counters from run-metrics.yaml.
    counter_signals = (
        (metrics.throttle_events, InfraFailureReason.THROTTLED),
        (metrics.service_unavailable_events, InfraFailureReason.SERVICE_UNAVAILABLE),
        (metrics.model_error_events, InfraFailureReason.MODEL_ERROR),
    )
    found.extend(reason for count, reason in counter_signals if count > 0)

    # Signal 2: run-meta.yaml status. A status containing "failed" is a failed
    # run; a missing or whitespace-only status is treated as a crash.
    status = meta.status
    if status and "failed" in status.lower():
        found.append(InfraFailureReason.RUN_FAILED)
    elif not status or not status.strip():
        found.append(InfraFailureReason.RUN_CRASHED)

    # Signal 3: run-metrics.yaml was never written (swarm crashed before writing).
    if not has_metrics_file:
        found.append(InfraFailureReason.METRICS_MISSING)

    # Signal 4: contract-test-results.yaml reports the server did not start.
    if not contract_tests.server_started:
        found.append(InfraFailureReason.SERVER_START_FAILED)

    if found:
        joined = ", ".join(reason.value for reason in found)
        return InfraFailure(
            is_infra_failure=True,
            reasons=found,
            summary=f"Infrastructure failure detected: {joined}",
        )

    return InfraFailure(is_infra_failure=False)
374+
375+
304376
# ---------------------------------------------------------------------------
305377
# Collection pipeline
306378
# ---------------------------------------------------------------------------
@@ -319,11 +391,8 @@ def _collect_from_run_dir(run_dir: Path, source_label: str) -> RunData:
319391
meta = parse_run_meta(yaml_files["run-meta"])
320392
run_type, label, semver, pr_number = classify_run(meta.config.rules_ref)
321393

322-
metrics = (
323-
parse_run_metrics(yaml_files["run-metrics"])
324-
if "run-metrics" in yaml_files
325-
else RunMetrics()
326-
)
394+
has_metrics_file = "run-metrics" in yaml_files
395+
metrics = parse_run_metrics(yaml_files["run-metrics"]) if has_metrics_file else RunMetrics()
327396
unit_tests = (
328397
parse_test_results(yaml_files["test-results"])
329398
if "test-results" in yaml_files
@@ -334,6 +403,10 @@ def _collect_from_run_dir(run_dir: Path, source_label: str) -> RunData:
334403
if "contract-test-results" in yaml_files
335404
else ContractTestResults()
336405
)
406+
407+
# Propagate actual server_started to metrics
408+
metrics.server_startup_success = contract_tests.server_started
409+
337410
code_quality = (
338411
parse_quality_report(yaml_files["quality-report"])
339412
if "quality-report" in yaml_files
@@ -346,13 +419,18 @@ def _collect_from_run_dir(run_dir: Path, source_label: str) -> RunData:
346419
)
347420

348421
# Backfill artifact counts from run-metrics if available
349-
if "run-metrics" in yaml_files:
422+
if has_metrics_file:
350423
raw_metrics = _load_yaml(yaml_files["run-metrics"])
351424
workspace = raw_metrics.get("artifacts", {}).get("workspace", {})
352425
code_quality.source_file_count = workspace.get("source_files", 0)
353426
code_quality.test_file_count = workspace.get("test_files", 0)
354427
code_quality.total_lines_of_code = workspace.get("total_lines_of_code", 0)
355428

429+
# Detect infrastructure failures
430+
infra_failure = detect_infra_failure(meta, metrics, contract_tests, has_metrics_file)
431+
if infra_failure.is_infra_failure:
432+
logger.warning("Infra failure detected in %s: %s", source_label, infra_failure.summary)
433+
356434
return RunData(
357435
label=label,
358436
run_type=run_type,
@@ -364,6 +442,7 @@ def _collect_from_run_dir(run_dir: Path, source_label: str) -> RunData:
364442
contract_tests=contract_tests,
365443
code_quality=code_quality,
366444
qualitative=qualitative,
445+
infra_failure=infra_failure,
367446
)
368447

369448

0 commit comments

Comments
 (0)