generated from coulomb/repo-seed
feat(ACTIVITY-WP-0016-T02): strict bounded daily-triage output schema
Replace the accept-anything recommendations.items ({type: object}) with a strict
per-item contract (required [rank, candidate, action, why] + typed wsjf) and a
maxItems:7 hint. Strict item structure is what lets the T03 boundary parser
validate each recommendation independently and quarantine only malformed ones.
maxItems is a producer hint (prompt + llm-connect json_schema + T03 mitigation),
NOT a hard reject — a hard maxItems reject would discard a whole 16-item report,
the blast-radius bug WP-0016 removes. DEPLOY COUPLING: the strict schema is also
consumed by the current whole-doc validator, so it must ship with T03's per-item
quarantine parser; until then it increases whole-doc hard-fails. Prompt + max_tokens
headroom + NDJSON framing are documented as a runtime-bundle handoff.
Updated four tests to the strict contract; the forwarded-schema test now reads the
live schema file instead of hard-coding it. Full suite: 213 passed, 1 skipped.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
{
|
||||
"$comment": "ACTIVITY-WP-0016-T02. Strict, bounded contract for the daily WSJF triage report. The per-item 'recommendations' schema is intentionally strict on STRUCTURE (types + required keys) so the T03 boundary parser can validate each recommendation independently and quarantine only the malformed ones. 'maxItems' is a producer hint (honoured by llm-connect constrained decoding and by the prompt); it is deliberately NOT hard-enforced by the in-repo validator, because rejecting a whole report for having too many items would reproduce the monolithic-failure bug WP-0016 exists to remove. Over-count is mitigated in T03 (keep top-N by rank, quarantine the rest). Value-domain vocabularies (action/confidence) are documented in the prompt and enforced by T04 guardrails with mitigation, not as brittle hard-fail enums here.",
|
||||
"type": "object",
|
||||
"required": ["summary", "recommendations"],
|
||||
"properties": {
|
||||
@@ -7,8 +8,28 @@
|
||||
},
|
||||
"recommendations": {
|
||||
"type": "array",
|
||||
"maxItems": 7,
|
||||
"items": {
|
||||
"type": "object"
|
||||
"type": "object",
|
||||
"required": ["rank", "candidate", "action", "why"],
|
||||
"properties": {
|
||||
"rank": { "type": "integer" },
|
||||
"candidate": { "type": "string" },
|
||||
"action": { "type": "string" },
|
||||
"why": { "type": "string" },
|
||||
"confidence": { "type": "string" },
|
||||
"wsjf": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"score": { "type": "number" },
|
||||
"strategic_value": { "type": "number" },
|
||||
"time_criticality": { "type": "number" },
|
||||
"risk_reduction": { "type": "number" },
|
||||
"opportunity_enablement": { "type": "number" },
|
||||
"job_size": { "type": "number" }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -333,7 +333,14 @@ def test_execute_instruction_forwards_output_schema_to_llm_connect(tmp_path, mon
|
||||
def test_execute_instruction_with_audit_accepts_report_payload():
|
||||
report_data = {
|
||||
"summary": "State Hub has loose ends.",
|
||||
"recommendations": [{"action": "revisit", "candidate": "CUST-WP-0045"}],
|
||||
"recommendations": [
|
||||
{
|
||||
"rank": 1,
|
||||
"action": "revisit",
|
||||
"candidate": "CUST-WP-0045",
|
||||
"why": "Loose ends need attention.",
|
||||
}
|
||||
],
|
||||
}
|
||||
llm = _CountingLLM([json.dumps(report_data)])
|
||||
instr = _instr(
|
||||
@@ -353,7 +360,14 @@ def test_execute_instruction_with_audit_accepts_report_payload():
|
||||
def test_execute_instruction_with_audit_accepts_fenced_report_payload():
|
||||
report_data = {
|
||||
"summary": "State Hub has loose ends.",
|
||||
"recommendations": [{"action": "revisit", "candidate": "CUST-WP-0045"}],
|
||||
"recommendations": [
|
||||
{
|
||||
"rank": 1,
|
||||
"action": "revisit",
|
||||
"candidate": "CUST-WP-0045",
|
||||
"why": "Loose ends need attention.",
|
||||
}
|
||||
],
|
||||
}
|
||||
llm = _CountingLLM([f"```json\n{json.dumps(report_data)}\n```"])
|
||||
instr = _instr(
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
@@ -70,7 +71,14 @@ async def test_evaluate_instructions_returns_task_specs_with_audit(monkeypatch)
|
||||
async def test_evaluate_instructions_returns_report_payload(monkeypatch) -> None:
|
||||
llm = FakeLLMClient(json.dumps({
|
||||
"summary": "State Hub has open loose ends.",
|
||||
"recommendations": [{"candidate": "CUST-WP-0045", "action": "work-next"}],
|
||||
"recommendations": [
|
||||
{
|
||||
"rank": 1,
|
||||
"candidate": "CUST-WP-0045",
|
||||
"action": "work-next",
|
||||
"why": "Open loose ends.",
|
||||
}
|
||||
],
|
||||
}))
|
||||
monkeypatch.setattr(activities, "get_llm_client", lambda: llm)
|
||||
|
||||
@@ -209,6 +217,12 @@ async def test_evaluate_instructions_forwards_llm_connect_depth_config(monkeypat
|
||||
"context": {},
|
||||
})
|
||||
|
||||
# Read the live schema file rather than hard-coding it, so the forwarded
|
||||
# json_schema assertion tracks schemas/daily-triage-report.json as the
|
||||
# contract evolves (ACTIVITY-WP-0016-T02).
|
||||
expected_schema = json.loads(
|
||||
Path("schemas/daily-triage-report.json").read_text(encoding="utf-8")
|
||||
)
|
||||
assert llm.calls[0][2] == {
|
||||
"model_name": "custodian-triage-balanced",
|
||||
"temperature": 0.2,
|
||||
@@ -216,16 +230,6 @@ async def test_evaluate_instructions_forwards_llm_connect_depth_config(monkeypat
|
||||
"max_depth": 2,
|
||||
"model_params": {
|
||||
"reasoning_effort": "medium",
|
||||
"json_schema": {
|
||||
"type": "object",
|
||||
"required": ["summary", "recommendations"],
|
||||
"properties": {
|
||||
"summary": {"type": "string"},
|
||||
"recommendations": {
|
||||
"type": "array",
|
||||
"items": {"type": "object"},
|
||||
},
|
||||
},
|
||||
},
|
||||
"json_schema": expected_schema,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -170,6 +170,45 @@ Done when:
|
||||
occur at the expected size;
|
||||
- the output schema file (`_load_output_schema` target) is updated to match.
|
||||
|
||||
2026-06-26 progress (in-repo portion):
|
||||
|
||||
- **Strict, bounded schema written** — `schemas/daily-triage-report.json` went
|
||||
from `recommendations.items: {type: object}` (accept-anything) to a strict
|
||||
per-item contract: `required [rank, candidate, action, why]` with typed
|
||||
`wsjf` sub-fields, plus `maxItems: 7`. The strict item shape is what lets the
|
||||
T03 boundary parser validate each recommendation independently.
|
||||
- **`maxItems` is a hint, not a hard reject** — the in-repo validator
|
||||
(`_validate_schema_node`) only enforces `type`/`required`/`properties`/`items`
|
||||
and ignores `maxItems`/`enum`. That is deliberate: a hard `maxItems` reject
|
||||
would discard a whole 16-item report — the exact blast-radius bug WP-0016
|
||||
removes. The bound is enforced via the prompt + the llm-connect `json_schema`
|
||||
constraint hint + T03 mitigation (keep top-N by rank, quarantine extras).
|
||||
- **DEPLOY COUPLING (important):** this schema file is consumed *both* as the
|
||||
llm-connect hint *and* by the current whole-document validator. Tightening
|
||||
per-item `required` fields makes the existing whole-doc validation hard-fail
|
||||
**more** until T03 replaces it with per-item quarantine. Therefore the schema
|
||||
change MUST ship together with T03 — do not deploy the strict schema to the
|
||||
runtime bundle ahead of the T03 parser. Four executor/instruction tests that
|
||||
asserted the old loose contract were updated to the strict contract; the
|
||||
forwarded-schema test now reads the live file instead of hard-coding it.
|
||||
- **Truncation hypothesis corroborated** — the instruction config carries
|
||||
`max_tokens` on the order of 1200 (per the wiring test fixture). 5268 chars ≈
|
||||
1300–1500 tokens, so a ~1200-token cap would truncate a 16-item list right at
|
||||
the observed break. This strengthens T01's leading hypothesis and makes the
|
||||
`max_tokens` headroom change below concrete.
|
||||
|
||||
**Bundle handoff (NOT in this repo — runtime-projected definition).** The triage
|
||||
prompt and `max_tokens` live in the Railiance runtime bundle, not in repo files.
|
||||
Apply there:
|
||||
1. Instruct a **bounded top-N** (≤ 7) list of ranked recommendations: "if uncertain, emit
|
||||
fewer well-formed items rather than more."
|
||||
2. Specify the **per-item framing** the T03 parser will consume (NDJSON: a
|
||||
leading summary object, then one recommendation JSON object per line).
|
||||
3. Raise **`max_tokens`** to give clear headroom for 7 framed items (eliminate
|
||||
truncation at the expected size).
|
||||
4. State the value vocabularies (`action`, `confidence`) the T04 guardrails will
|
||||
check.
|
||||
|
||||
## Boundary Parser — Verify & Mitigate (Posture B)
|
||||
|
||||
```task
|
||||
|
||||
Reference in New Issue
Block a user