diff --git a/schemas/daily-triage-report.json b/schemas/daily-triage-report.json index a1ff71d..31e533a 100644 --- a/schemas/daily-triage-report.json +++ b/schemas/daily-triage-report.json @@ -1,4 +1,5 @@ { + "$comment": "ACTIVITY-WP-0016-T02. Strict, bounded contract for the daily WSJF triage report. The per-item 'recommendations' schema is intentionally strict on STRUCTURE (types + required keys) so the T03 boundary parser can validate each recommendation independently and quarantine only the malformed ones. 'maxItems' is a producer hint (honoured by llm-connect constrained decoding and by the prompt); it is deliberately NOT hard-enforced by the in-repo validator, because rejecting a whole report for having too many items would reproduce the monolithic-failure bug WP-0016 exists to remove. Over-count is mitigated in T03 (keep top-N by rank, quarantine the rest). Value-domain vocabularies (action/confidence) are documented in the prompt and enforced by T04 guardrails with mitigation, not as brittle hard-fail enums here.", "type": "object", "required": ["summary", "recommendations"], "properties": { @@ -7,8 +8,28 @@ }, "recommendations": { "type": "array", + "maxItems": 7, "items": { - "type": "object" + "type": "object", + "required": ["rank", "candidate", "action", "why"], + "properties": { + "rank": { "type": "integer" }, + "candidate": { "type": "string" }, + "action": { "type": "string" }, + "why": { "type": "string" }, + "confidence": { "type": "string" }, + "wsjf": { + "type": "object", + "properties": { + "score": { "type": "number" }, + "strategic_value": { "type": "number" }, + "time_criticality": { "type": "number" }, + "risk_reduction": { "type": "number" }, + "opportunity_enablement": { "type": "number" }, + "job_size": { "type": "number" } + } + } + } } } } diff --git a/tests/rules/test_executor.py b/tests/rules/test_executor.py index 70c9052..45b661e 100644 --- a/tests/rules/test_executor.py +++ b/tests/rules/test_executor.py @@ -333,7 +333,14 @@ def 
test_execute_instruction_forwards_output_schema_to_llm_connect(tmp_path, mon def test_execute_instruction_with_audit_accepts_report_payload(): report_data = { "summary": "State Hub has loose ends.", - "recommendations": [{"action": "revisit", "candidate": "CUST-WP-0045"}], + "recommendations": [ + { + "rank": 1, + "action": "revisit", + "candidate": "CUST-WP-0045", + "why": "Loose ends need attention.", + } + ], } llm = _CountingLLM([json.dumps(report_data)]) instr = _instr( @@ -353,7 +360,14 @@ def test_execute_instruction_with_audit_accepts_report_payload(): def test_execute_instruction_with_audit_accepts_fenced_report_payload(): report_data = { "summary": "State Hub has loose ends.", - "recommendations": [{"action": "revisit", "candidate": "CUST-WP-0045"}], + "recommendations": [ + { + "rank": 1, + "action": "revisit", + "candidate": "CUST-WP-0045", + "why": "Loose ends need attention.", + } + ], } llm = _CountingLLM([f"```json\n{json.dumps(report_data)}\n```"]) instr = _instr( diff --git a/tests/test_instruction_evaluation.py b/tests/test_instruction_evaluation.py index 4dee59c..c4b12e9 100644 --- a/tests/test_instruction_evaluation.py +++ b/tests/test_instruction_evaluation.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +from pathlib import Path import pytest @@ -70,7 +71,14 @@ async def test_evaluate_instructions_returns_task_specs_with_audit(monkeypatch) async def test_evaluate_instructions_returns_report_payload(monkeypatch) -> None: llm = FakeLLMClient(json.dumps({ "summary": "State Hub has open loose ends.", - "recommendations": [{"candidate": "CUST-WP-0045", "action": "work-next"}], + "recommendations": [ + { + "rank": 1, + "candidate": "CUST-WP-0045", + "action": "work-next", + "why": "Open loose ends.", + } + ], })) monkeypatch.setattr(activities, "get_llm_client", lambda: llm) @@ -209,6 +217,12 @@ async def test_evaluate_instructions_forwards_llm_connect_depth_config(monkeypat "context": {}, }) + # Read the live schema file rather 
than hard-coding it, so the forwarded + # json_schema assertion tracks schemas/daily-triage-report.json as the + # contract evolves (ACTIVITY-WP-0016-T02). + expected_schema = json.loads( + Path("schemas/daily-triage-report.json").read_text(encoding="utf-8") + ) assert llm.calls[0][2] == { "model_name": "custodian-triage-balanced", "temperature": 0.2, @@ -216,16 +230,6 @@ async def test_evaluate_instructions_forwards_llm_connect_depth_config(monkeypat "max_depth": 2, "model_params": { "reasoning_effort": "medium", - "json_schema": { - "type": "object", - "required": ["summary", "recommendations"], - "properties": { - "summary": {"type": "string"}, - "recommendations": { - "type": "array", - "items": {"type": "object"}, - }, - }, - }, + "json_schema": expected_schema, }, } diff --git a/workplans/ACTIVITY-WP-0016-llm-output-robustness-trust-boundary.md b/workplans/ACTIVITY-WP-0016-llm-output-robustness-trust-boundary.md index 844c333..7d56e33 100644 --- a/workplans/ACTIVITY-WP-0016-llm-output-robustness-trust-boundary.md +++ b/workplans/ACTIVITY-WP-0016-llm-output-robustness-trust-boundary.md @@ -170,6 +170,45 @@ Done when: occur at the expected size; - the output schema file (`_load_output_schema` target) is updated to match. +2026-06-26 progress (in-repo portion): + +- **Strict, bounded schema written** — `schemas/daily-triage-report.json` went + from `recommendations.items: {type: object}` (accept-anything) to a strict + per-item contract: `required [rank, candidate, action, why]` with typed + `wsjf` sub-fields, plus `maxItems: 7`. The strict item shape is what lets the + T03 boundary parser validate each recommendation independently. +- **`maxItems` is a hint, not a hard reject** — the in-repo validator + (`_validate_schema_node`) only enforces `type`/`required`/`properties`/`items` + and ignores `maxItems`/`enum`. That is deliberate: a hard `maxItems` reject + would discard a whole 16-item report — the exact blast-radius bug WP-0016 + removes. 
The bound is enforced via the prompt + the llm-connect `json_schema` + constraint hint + T03 mitigation (keep top-N by rank, quarantine extras). +- **DEPLOY COUPLING (important):** this schema file is consumed *both* as the + llm-connect hint *and* by the current whole-document validator. Tightening + per-item `required` fields makes the existing whole-doc validation hard-fail + **more** until T03 replaces it with per-item quarantine. Therefore the schema + change MUST ship together with T03 — do not deploy the strict schema to the + runtime bundle ahead of the T03 parser. Four executor/instruction tests that + asserted the old loose contract were updated to the strict contract; the + forwarded-schema test now reads the live file instead of hard-coding it. +- **Truncation hypothesis corroborated** — the instruction config carries + `max_tokens` of roughly 1200 (per the wiring test fixture). 5268 chars ≈ + 1300–1500 tokens, so a 1200-token cap would truncate a 16-item list right at + the observed break. This strengthens T01's leading hypothesis and makes the + `max_tokens` headroom change below concrete. + +**Bundle handoff (NOT in this repo — runtime-projected definition).** The triage +prompt and `max_tokens` live in the Railiance runtime bundle, not in repo files. +Apply there: +1. Instruct a **bounded top-N** (≤ 7) list of ranked recommendations: "if + uncertain, emit fewer well-formed items rather than more." +2. Specify the **per-item framing** the T03 parser will consume (NDJSON: a + leading summary object, then one recommendation JSON object per line). +3. Raise **`max_tokens`** to give clear headroom for 7 framed items (eliminate + truncation at the expected size). +4. State the value vocabularies (`action`, `confidence`) the T04 guardrails will + check. + ## Boundary Parser — Verify & Mitigate (Posture B) ```task