feat(ACTIVITY-WP-0016-T04): producer trust-boundary guardrails + ADR-004

Add ADR-004 documenting the producer trust boundary: untrusted producers (LLM, agent, human; erroneous and malicious), the trust-but-handle vs verify-and-mitigate postures, error-locality and quarantine-with-provenance principles, and the concrete activity-core mechanisms. Implement producer-agnostic guardrails in executor.py, applied uniformly on the happy path and the recovery path via _partition_items: structural-type -> schema -> structural caps (_MAX_DEPTH, _MAX_STRING_LEN) -> reference allow-list -> count cap. Each quarantine carries a reason. Closes the happy-path maxItems count cap deferred from T03 (valid 9-item report keeps 7, quarantines 2). Reference allow-list reads context["known_candidates"] via _allow_list_from_context; inert until a resolver populates it. SCOPE.md updated (executor bullet + ADR list); no INTENT drift. New tests: happy-path count cap, oversized-string guardrail, allow-list rejection. Full suite: 218 passed, 1 skipped. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-26 18:10:17 +02:00
parent c5440e8429
commit 9be4ddbdb7
5 changed files with 373 additions and 12 deletions
--- a/tests/rules/test_executor.py
+++ b/tests/rules/test_executor.py
@@ -475,6 +475,62 @@ def test_resilient_report_quarantines_one_bad_item_among_valid():
    assert "rank" in result.report["quarantined_items"][0]["error"]


+# ── WP-0016-T04 producer guardrails ───────────────────────────────────────────
+
+def _triage_instr() -> SimpleNamespace:  # shared instruction fixture for the guardrail tests
+    return _instr(
+        id="daily-triage-report",
+        prompt="Report.",
+        trusted_fields=[],
+        output_schema="schemas/daily-triage-report.json",  # presumably the schema that declares maxItems=7 — confirm
+        report_sinks=[{"type": "working-memory"}],
+    )
+
+
+def test_guardrail_count_cap_on_valid_happy_path():
+    # Nine fully valid recommendations in a syntactically valid document: the
+    # test expects the maxItems=7 count cap to keep 7 and quarantine the other 2.
+    recs = [_valid_rec(i) for i in range(1, 10)]  # i = 1..9, i.e. nine items
+    raw = json.dumps({"summary": "Triage.", "recommendations": recs})
+    llm = _CountingLLM([raw])
+
+    result = execute_instruction_with_audit(_triage_instr(), _Event(), {}, llm)  # empty context
+
+    assert llm.call_count == 1  # no retry — the document was valid
+    assert result.report["partial"] is True  # expected: dropping items marks the report partial
+    assert len(result.report["recommendations"]) == 7
+    assert result.report["quarantined_count"] == 2
+    assert all(q["reason"] == "over_limit" for q in result.report["quarantined_items"])  # count-cap reason
+
+
+def test_guardrail_oversized_string_quarantined():
+    big = _valid_rec(2)  # start from a valid item, then oversize a single field
+    big["why"] = "x" * 5000  # exceeds _MAX_STRING_LEN
+    raw = json.dumps({"summary": "Triage.", "recommendations": [_valid_rec(1), big]})
+    llm = _CountingLLM([raw])
+
+    result = execute_instruction_with_audit(_triage_instr(), _Event(), {}, llm)  # empty context
+
+    assert len(result.report["recommendations"]) == 1  # only the untouched item is kept
+    assert result.report["quarantined_count"] == 1
+    assert result.report["quarantined_items"][0]["reason"] == "guardrail"  # reason expected for the oversized item
+
+
+def test_guardrail_allow_list_rejects_unknown_candidate():
+    raw = json.dumps({
+        "summary": "Triage.",
+        "recommendations": [_valid_rec(1), _valid_rec(2)],  # candidates WS-1, WS-2
+    })
+    llm = _CountingLLM([raw])
+    context = {"known_candidates": ["WS-1"]}  # allow-list admits WS-1 only
+
+    result = execute_instruction_with_audit(_triage_instr(), _Event(), context, llm)
+
+    assert len(result.report["recommendations"]) == 1
+    assert result.report["recommendations"][0]["candidate"] == "WS-1"  # the allow-listed item is kept
+    assert result.report["quarantined_items"][0]["reason"] == "allow_list"  # WS-2 is not in known_candidates
+
+
 def test_execute_instruction_with_audit_preserves_invalid_report_with_sinks(
    tmp_path,
    monkeypatch,