feat(ACTIVITY-WP-0016-T03): resilient per-item report recovery with quarantine lane

When the whole-document parse and its one retry both fail, report instructions now run _resilient_report before the total-loss path. A brace/quote-aware scanner (_extract_object_spans) recovers each recommendation object whether pretty-printed across many lines or NDJSON one-per-line; a truncated tail gets a best-effort _try_repair; _partition_items validates each recovered object against the T02 item schema. Valid items survive (output_validated=True, partial=True); malformed or over-maxItems items are quarantined with provenance (index, error, raw, reason), capped at 20. Error locality now matches the unit of work: one bad item costs one item, not the whole report. Verified against the real 06-26 shape: 7 valid recommendations + a truncated tail now recovers all 7 and quarantines the broken tail (previously the whole run was discarded). Happy-path maxItems top-N enforcement is deferred to T04 (count caps). Full suite: 215 passed, 1 skipped. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-26 17:56:28 +02:00
parent b41b6034ee
commit a70c00a789
2 changed files with 305 additions and 0 deletions
--- a/tests/rules/test_executor.py
+++ b/tests/rules/test_executor.py
@@ -403,6 +403,78 @@ def test_execute_instruction_with_audit_rejects_invalid_report_schema():
    assert llm.call_count == 2


+# ── WP-0016-T03 resilient report recovery ─────────────────────────────────────
+
+def _valid_rec(rank: int) -> dict[str, Any]:  # fixture: one complete recommendation dict, varied only by rank
+    return {
+        "rank": rank,
+        "candidate": f"WS-{rank}",  # candidate id derived from rank, e.g. "WS-3"
+        "action": "work-next",
+        "why": f"reason {rank}",
+        "wsjf": {"score": 5.0},  # same constant score for every rank
+    }
+
+
+def _pretty_triage_with_truncated_tail(num_valid: int) -> str:  # report text: num_valid complete items, then one cut-off item
+    body = ",\n".join("    " + json.dumps(_valid_rec(i)) for i in range(1, num_valid + 1))  # one compact object per line, ranks 1..num_valid
+    # Trailing object is cut off mid-string — the whole document is invalid JSON,
+    # reproducing the 2026-06-26 failure shape (valid prefix, broken tail).
+    return (
+        '{\n  "summary": "Daily triage.",\n  "recommendations": [\n'
+        + body
+        + ',\n    {\n      "rank": '
+        + str(num_valid + 1)  # the truncated item takes the next rank after the complete ones
+        + ',\n      "candidate": "WS-X",\n      "action": "work-'  # ends inside the "action" string; no string, object or array is closed
+    )
+
+
+def test_resilient_report_recovers_valid_prefix_and_quarantines_truncated_tail():  # 7 complete items + truncated tail -> partial report keeping all 7
+    raw = _pretty_triage_with_truncated_tail(7)
+    llm = _CountingLLM([raw, raw])  # same broken payload twice; presumably one per LLM call (first attempt + retry) — confirm against _CountingLLM
+    instr = _instr(
+        id="daily-triage-report",
+        prompt="Report.",
+        trusted_fields=[],
+        output_schema="schemas/daily-triage-report.json",
+        report_sinks=[{"type": "working-memory"}],
+    )
+
+    result = execute_instruction_with_audit(instr, _Event(), {}, llm)
+
+    assert result.output_validated is True
+    assert result.review_required is True
+    assert result.report is not None
+    assert result.report["partial"] is True
+    assert len(result.report["recommendations"]) == 7  # every complete item is kept
+    assert result.report["summary"] == "Daily triage."  # top-level summary is carried into the recovered report
+    assert result.report["quarantined_count"] >= 1  # lower bound only; see the comment below for why the exact shape is left open
+    # The broken tail is dropped — either as an unparseable/truncated span or,
+    # if _try_repair salvages its structure, as a schema-invalid item. Either way
+    # it carries a diagnostic error and never pollutes the surviving report.
+    assert result.report["quarantined_items"][0]["error"]  # first quarantined entry has a non-empty error
+
+
+def test_resilient_report_quarantines_one_bad_item_among_valid():  # one defective item between two complete ones costs only that item
+    recs = [_valid_rec(1), {"candidate": "WS-2", "action": "x", "why": "no rank"}, _valid_rec(3)]  # middle item has no "rank" key
+    raw = json.dumps({"summary": "Triage.", "recommendations": recs})  # document is well-formed JSON; only the middle item is defective
+    llm = _CountingLLM([raw, raw])  # same payload twice; presumably first attempt + retry — confirm against _CountingLLM
+    instr = _instr(
+        id="daily-triage-report",
+        prompt="Report.",
+        trusted_fields=[],
+        output_schema="schemas/daily-triage-report.json",
+        report_sinks=[{"type": "working-memory"}],
+    )
+
+    result = execute_instruction_with_audit(instr, _Event(), {}, llm)
+
+    assert result.output_validated is True
+    assert result.report["partial"] is True
+    assert len(result.report["recommendations"]) == 2  # the two items built by _valid_rec remain
+    assert result.report["quarantined_count"] == 1  # exactly the rank-less item
+    assert "rank" in result.report["quarantined_items"][0]["error"]  # error text mentions "rank"
+
+
 def test_execute_instruction_with_audit_preserves_invalid_report_with_sinks(
    tmp_path,
    monkeypatch,