feat(ACTIVITY-WP-0016-T04): producer trust-boundary guardrails + ADR-004

Add ADR-004 documenting the producer trust boundary: untrusted producers (LLM,
agent, human; erroneous and malicious), the trust-but-handle vs verify-and-mitigate
postures, error-locality and quarantine-with-provenance principles, and the concrete
activity-core mechanisms.

Implement producer-agnostic guardrails in executor.py, applied uniformly on the
happy path and the recovery path via _partition_items: structural-type -> schema ->
structural caps (_MAX_DEPTH, _MAX_STRING_LEN) -> reference allow-list -> count cap.
Each quarantine carries a reason. Closes the happy-path maxItems count cap deferred
from T03 (valid 9-item report keeps 7, quarantines 2). Reference allow-list reads
context["known_candidates"] via _allow_list_from_context; inert until a resolver
populates it. SCOPE.md updated (executor bullet + ADR list); no INTENT drift.

New tests: happy-path count cap, oversized-string guardrail, allow-list rejection.
Full suite: 218 passed, 1 skipped.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-26 18:10:17 +02:00
parent c5440e8429
commit 9be4ddbdb7
5 changed files with 373 additions and 12 deletions

View File

@@ -475,6 +475,62 @@ def test_resilient_report_quarantines_one_bad_item_among_valid():
assert "rank" in result.report["quarantined_items"][0]["error"]
# ── WP-0016-T04 producer guardrails ───────────────────────────────────────────
def _triage_instr() -> SimpleNamespace:
    """Build the daily-triage-report instruction used by the guardrail tests.

    The instruction has no trusted fields, validates against
    ``schemas/daily-triage-report.json`` and routes its report to a single
    ``working-memory`` sink. Construction is delegated to ``_instr``.
    """
    spec = {
        "id": "daily-triage-report",
        "prompt": "Report.",
        "trusted_fields": [],
        "output_schema": "schemas/daily-triage-report.json",
        "report_sinks": [{"type": "working-memory"}],
    }
    return _instr(**spec)
def test_guardrail_count_cap_on_valid_happy_path():
    """A valid but over-long report keeps 7 recommendations and quarantines 2."""
    # Nine fully-valid recommendations inside a syntactically valid document:
    # schema validation passes, so only the maxItems=7 count cap can trim it.
    payload = {
        "summary": "Triage.",
        "recommendations": [_valid_rec(n) for n in range(1, 10)],
    }
    producer = _CountingLLM([json.dumps(payload)])

    outcome = execute_instruction_with_audit(_triage_instr(), _Event(), {}, producer)
    report = outcome.report

    # Exactly one call: the document was valid, so no retry was triggered.
    assert producer.call_count == 1
    assert report["partial"] is True
    assert len(report["recommendations"]) == 7
    assert report["quarantined_count"] == 2
    # Every overflow item must be tagged with the count-cap reason.
    for entry in report["quarantined_items"]:
        assert entry["reason"] == "over_limit"
def test_guardrail_oversized_string_quarantined():
    """An item carrying an over-long string is quarantined; its sibling is kept."""
    oversized = _valid_rec(2)
    # 5000 characters is meant to exceed _MAX_STRING_LEN (defined in the
    # executor, not visible from this test module).
    oversized["why"] = "x" * 5000
    document = {"summary": "Triage.", "recommendations": [_valid_rec(1), oversized]}
    producer = _CountingLLM([json.dumps(document)])

    outcome = execute_instruction_with_audit(_triage_instr(), _Event(), {}, producer)
    report = outcome.report

    assert len(report["recommendations"]) == 1
    assert report["quarantined_count"] == 1
    first_quarantined = report["quarantined_items"][0]
    assert first_quarantined["reason"] == "guardrail"
def test_guardrail_allow_list_rejects_unknown_candidate():
    """A recommendation whose candidate is not allow-listed is quarantined.

    The context supplies ``known_candidates`` containing only ``WS-1``; of the
    two otherwise-valid recommendations, only the ``WS-1`` one may survive and
    the other must be quarantined with reason ``allow_list``.
    """
    raw = json.dumps({
        "summary": "Triage.",
        "recommendations": [_valid_rec(1), _valid_rec(2)],  # candidates WS-1, WS-2
    })
    llm = _CountingLLM([raw])
    context = {"known_candidates": ["WS-1"]}

    result = execute_instruction_with_audit(_triage_instr(), _Event(), context, llm)
    report = result.report

    assert len(report["recommendations"]) == 1
    assert report["recommendations"][0]["candidate"] == "WS-1"
    # Pin the quarantine count as the sibling guardrail tests do, so a rejected
    # item that is dropped or double-counted cannot go unnoticed.
    assert report["quarantined_count"] == 1
    assert report["quarantined_items"][0]["reason"] == "allow_list"
def test_execute_instruction_with_audit_preserves_invalid_report_with_sinks(
tmp_path,
monkeypatch,