From bf877b7f0de3e78f8bbaf3746495ad8dbe90827d Mon Sep 17 00:00:00 2001 From: tegwick Date: Fri, 26 Jun 2026 18:18:37 +0200 Subject: [PATCH] test(ACTIVITY-WP-0016-T05): regression coverage incl. real 06-26 payload + over-depth Add a test driving the actual captured 2026-06-26 failure payload (tests/fixtures/wp0016/...partial.json): it now recovers 6+ valid recommendations and quarantines the truncated tail, where before WP-0016 it discarded the whole run. Add an over-depth guardrail test. Together with T03/T04 the regression set now covers truncation, one-bad-item, oversized-string, over-depth, allow-list/injection-shaped, and happy-path count cap. In-repo portion of T05 complete; the live railiance01 graceful-degradation smoke is operator-owned cluster work (deploy-coupled with the T02 bundle changes) and remains outstanding. Hand-back notes posted to WP-0006-T03 and WP-0010-T04. Full suite: 220 passed, 1 skipped. Co-Authored-By: Claude Opus 4.8 --- tests/rules/test_executor.py | 42 +++++++++++++++++++ ...16-llm-output-robustness-trust-boundary.md | 19 +++++++++ 2 files changed, 61 insertions(+) diff --git a/tests/rules/test_executor.py b/tests/rules/test_executor.py index d7c4837..23bcad6 100644 --- a/tests/rules/test_executor.py +++ b/tests/rules/test_executor.py @@ -12,6 +12,7 @@ Covers: from __future__ import annotations import json +from pathlib import Path from types import SimpleNamespace from typing import Any @@ -531,6 +532,47 @@ def test_guardrail_allow_list_rejects_unknown_candidate(): assert result.report["quarantined_items"][0]["reason"] == "allow_list" +def _nested(depth: int) -> dict[str, Any]: + node: dict[str, Any] = {"leaf": 1} + for _ in range(depth): + node = {"a": node} + return node + + +def test_guardrail_over_depth_quarantined(): + deep = _valid_rec(2) + deep["extra"] = _nested(12) # well past _MAX_DEPTH + raw = json.dumps({"summary": "Triage.", "recommendations": [_valid_rec(1), deep]}) + llm = _CountingLLM([raw]) + + result = 
execute_instruction_with_audit(_triage_instr(), _Event(), {}, llm) + + assert len(result.report["recommendations"]) == 1 + assert result.report["quarantined_count"] == 1 + assert result.report["quarantined_items"][0]["reason"] == "guardrail" + assert "depth" in result.report["quarantined_items"][0]["error"] + + +def test_resilient_recovery_against_real_2026_06_26_fixture(): + # The actual captured failure payload (4000-char preview, truncated at the 7th + # recommendation) — the run that reset the WP-0006-T03 streak. Before WP-0016 + # this discarded the whole report; now it must recover the valid prefix. + fixture = json.loads( + Path("tests/fixtures/wp0016/daily_triage_2026-06-26_validation_failure.partial.json") + .read_text(encoding="utf-8") + ) + raw = fixture["raw_output_preview"] + llm = _CountingLLM([raw, raw]) + + result = execute_instruction_with_audit(_triage_instr(), _Event(), {}, llm) + + assert result.output_validated is True + assert result.report["partial"] is True + # Six recommendations are fully intact before the truncation point. + assert len(result.report["recommendations"]) >= 6 + assert all("rank" in rec and "candidate" in rec for rec in result.report["recommendations"]) + + def test_execute_instruction_with_audit_preserves_invalid_report_with_sinks( tmp_path, monkeypatch, diff --git a/workplans/ACTIVITY-WP-0016-llm-output-robustness-trust-boundary.md b/workplans/ACTIVITY-WP-0016-llm-output-robustness-trust-boundary.md index cda146d..aca1d6e 100644 --- a/workplans/ACTIVITY-WP-0016-llm-output-robustness-trust-boundary.md +++ b/workplans/ACTIVITY-WP-0016-llm-output-robustness-trust-boundary.md @@ -349,6 +349,25 @@ Done when: that the output-robustness blocker is cleared so the three-clean-run gate can resume on its own. 
+2026-06-26 progress (in-repo portion complete): + +- **Regression coverage complete.** Across T03/T04/T05: truncated-mid-list, + one-bad-item-among-good (quarantine + partial), oversized-string and over-depth + guardrail rejection, allow-list (injection-shaped) rejection, happy-path count + cap, and a test driving the **actual captured 2026-06-26 payload** + (`tests/fixtures/wp0016/daily_triage_2026-06-26_validation_failure.partial.json`) + — it now recovers 6+ valid recommendations and quarantines the truncated tail, + where before it discarded the whole run. +- **Full suite green:** 220 passed, 1 skipped as of this commit (218 recorded at + T04, plus the two T05 tests: real-fixture recovery and the over-depth guardrail). +- **Hand-back notes posted** to `ACTIVITY-WP-0006-T03` (State Hub event + `b6b8c2b8`) and `ACTIVITY-WP-0010-T04` (`b813f0dc`). +- **Remaining (remote, operator-owned):** the live daily-triage smoke on + `railiance01` proving end-to-end graceful degradation. It depends on deploying + the T02 bundle prompt/`max_tokens`/NDJSON changes together with this code, which + is cluster/operator work outside this repo's SCOPE. T05 therefore stays + `progress` until that live run exists; the in-repo deliverables are done. + ## Relationships - **Blocks / feeds:** `ACTIVITY-WP-0006-T03` (three clean scheduled runs) and