generated from coulomb/repo-seed
test(ACTIVITY-WP-0016-T05): regression coverage incl. real 06-26 payload + over-depth
Add a test driving the actual captured 2026-06-26 failure payload (tests/fixtures/wp0016/...partial.json): it now recovers 6+ valid recommendations and quarantines the truncated tail, where before WP-0016 it discarded the whole run. Add an over-depth guardrail test. Together with T03/T04 the regression set now covers truncation, one-bad-item, oversized-string, over-depth, allow-list/injection-shaped, and happy-path count cap. In-repo portion of T05 complete; the live railiance01 graceful-degradation smoke is operator-owned cluster work (deploy-coupled with the T02 bundle changes) and remains outstanding. Hand-back notes posted to WP-0006-T03 and WP-0010-T04. Full suite: 220 passed, 1 skipped. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -12,6 +12,7 @@ Covers:
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from typing import Any
|
||||
|
||||
@@ -531,6 +532,47 @@ def test_guardrail_allow_list_rejects_unknown_candidate():
|
||||
assert result.report["quarantined_items"][0]["reason"] == "allow_list"
|
||||
|
||||
|
||||
def _nested(depth: int) -> dict[str, Any]:
|
||||
node: dict[str, Any] = {"leaf": 1}
|
||||
for _ in range(depth):
|
||||
node = {"a": node}
|
||||
return node
|
||||
|
||||
|
||||
def test_guardrail_over_depth_quarantined():
    """An over-nested recommendation is quarantined while its valid sibling survives."""
    too_deep = _valid_rec(2)
    # Twelve wrapping levels; intended to be well past _MAX_DEPTH (defined outside this view).
    too_deep["extra"] = _nested(12)
    payload = {"summary": "Triage.", "recommendations": [_valid_rec(1), too_deep]}
    llm = _CountingLLM([json.dumps(payload)])

    result = execute_instruction_with_audit(_triage_instr(), _Event(), {}, llm)
    report = result.report

    # Only the well-formed recommendation is kept ...
    assert len(report["recommendations"]) == 1
    # ... and the deep one is reported as quarantined by the guardrail, citing depth.
    assert report["quarantined_count"] == 1
    first_quarantined = report["quarantined_items"][0]
    assert first_quarantined["reason"] == "guardrail"
    assert "depth" in first_quarantined["error"]
|
||||
|
||||
|
||||
def test_resilient_recovery_against_real_2026_06_26_fixture():
    """Replay the captured 2026-06-26 failure payload and expect partial recovery.

    The fixture holds the actual captured failure payload (4000-char preview,
    truncated at the 7th recommendation) -- the run that reset the WP-0006-T03
    streak. Before WP-0016 this discarded the whole report; now it must recover
    the valid prefix.
    """
    relative = Path(
        "tests/fixtures/wp0016/daily_triage_2026-06-26_validation_failure.partial.json"
    )
    # Look for the fixture under each ancestor of this test file so the test does
    # not depend on pytest's working directory. If none matches, fall back to the
    # original CWD-relative path so a missing fixture still fails loudly.
    fixture_path = next(
        (
            base / relative
            for base in Path(__file__).resolve().parents
            if (base / relative).is_file()
        ),
        relative,
    )
    fixture = json.loads(fixture_path.read_text(encoding="utf-8"))
    raw = fixture["raw_output_preview"]
    # The same truncated payload is queued twice, so a second LLM call (if one is
    # made) sees identical output.
    llm = _CountingLLM([raw, raw])

    result = execute_instruction_with_audit(_triage_instr(), _Event(), {}, llm)

    assert result.output_validated is True
    assert result.report["partial"] is True
    # Six recommendations are fully intact before the truncation point.
    assert len(result.report["recommendations"]) >= 6
    assert all("rank" in rec and "candidate" in rec for rec in result.report["recommendations"])
|
||||
|
||||
|
||||
def test_execute_instruction_with_audit_preserves_invalid_report_with_sinks(
|
||||
tmp_path,
|
||||
monkeypatch,
|
||||
|
||||
Reference in New Issue
Block a user