Wire instruction report execution

2026-05-19 18:28:23 +02:00
parent 0e7084207e
commit 0dc342eb1b
7 changed files with 513 additions and 16 deletions
--- a/schemas/daily-triage-report.json
+++ b/schemas/daily-triage-report.json
@@ -0,0 +1,15 @@
 {
  "type": "object",
  "required": ["summary", "recommendations"],
  "properties": {
    "summary": {
      "type": "string"
    },
    "recommendations": {
      "type": "array",
      "items": {
        "type": "object"
      }
    }
  }
 }
--- a/src/activity_core/activities.py
+++ b/src/activity_core/activities.py
@@ -25,6 +25,9 @@ from activity_core.issue_sink import get_issue_sink
 from activity_core.orm import ActivityDefinition as ActivityDefinitionRow
 from activity_core.orm import ActivityRun, TaskInstance, TaskSpawnLog
 from activity_core.rules import evaluate_condition
 from activity_core.llm_client import get_llm_client
 from activity_core.models import InstructionDef
 from activity_core.rules.executor import execute_instruction_with_audit
 _session_factory: async_sessionmaker[AsyncSession] | None = None
@@ -267,6 +270,75 @@ async def evaluate_rules(payload: dict) -> list[dict]:
    return matched
@activity.defn
 async def evaluate_instructions(payload: dict) -> dict:
    """Evaluate instruction blocks and return task specs/reports with audit fields.
    Expected keys in payload:
        instructions list[dict] — InstructionDef serialised dicts
        event        dict       — EventEnvelope attributes (or empty for cron)
        context      dict       — context snapshot from resolve_context
    """
    instructions = payload.get("instructions", [])
    event_attrs = payload.get("event", {})
    context = payload.get("context", {})
    llm_client = get_llm_client()
    class _Env:
        def __init__(self, attrs: dict) -> None:
            self.attributes = _DictObj(attrs)
    class _DictObj:
        def __init__(self, d: dict) -> None:
            self.__dict__.update(d)
    event_obj = _Env(event_attrs)
    task_specs: list[dict] = []
    reports: list[dict] = []
    for raw_instruction in instructions:
        try:
            instruction = InstructionDef.model_validate(raw_instruction)
        except Exception as exc:
            activity.logger.warning("instruction definition invalid — %s", exc)
            continue
        result = execute_instruction_with_audit(
            instruction,
            event_obj,
            context,
            llm_client,
        )
        if result.report is not None:
            reports.append({
                "instruction_id": instruction.id,
                "report": result.report,
                "condition": result.condition_matched,
                "prompt_hash": result.prompt_hash,
                "model": result.model,
                "output_validated": result.output_validated,
                "review_required": result.review_required,
            })
        for spec in result.tasks:
            task_specs.append({
                "title": spec.title,
                "description": spec.description,
                "target_repo": spec.target_repo,
                "priority": spec.priority,
                "labels": spec.labels,
                "due_in_days": spec.due_in_days,
                "source_type": "instruction",
                "source_id": instruction.id,
                "condition": result.condition_matched,
                "prompt_hash": result.prompt_hash,
                "model": result.model,
                "output_validated": result.output_validated,
                "review_required": result.review_required,
            })
    return {"task_specs": task_specs, "reports": reports}
@activity.defn
 async def emit_tasks(payload: dict) -> list[str]:
    """Emit TaskSpecs to IssueSink and write task_spawn_log rows.
@@ -316,6 +388,10 @@ async def emit_tasks(payload: dict) -> list[str]:
                        triggering_event_id=triggering_event_id,
                        task_ref=ref.external_id,
                        condition_matched=spec_dict.get("condition"),
                        prompt_hash=spec_dict.get("prompt_hash"),
                        model=spec_dict.get("model"),
                        output_validated=spec_dict.get("output_validated"),
                        review_required=spec_dict.get("review_required"),
                    )
                    session.add(log_row)
                except Exception as exc:
--- a/src/activity_core/llm_client.py
+++ b/src/activity_core/llm_client.py
@@ -0,0 +1,57 @@
 """llm-connect adapter for instruction execution.
 activity-core deliberately talks to llm-connect over its small HTTP surface
 instead of importing provider-specific SDKs. This keeps the activity worker on
 owned infrastructure while leaving provider selection, API keys, and model
 routing behind the existing llm-connect boundary.
 """
 from __future__ import annotations
 import os
 from typing import Any
 import httpx
 class DisabledLLMClient:
    """LLM client used when no llm-connect endpoint is configured."""
    def complete(self, prompt: str, model: str = "") -> str:  # noqa: ARG002
        raise RuntimeError("LLM_CONNECT_URL is not configured")
 class LLMConnectClient:
    """Small synchronous client for llm-connect server mode."""
    def __init__(self, base_url: str, timeout_seconds: float = 300.0) -> None:
        self.base_url = base_url.rstrip("/")
        self.timeout_seconds = timeout_seconds
    def complete(self, prompt: str, model: str = "") -> str:
        payload: dict[str, Any] = {
            "prompt": prompt,
            "config": {
                "model_name": model,
                "timeout_seconds": int(self.timeout_seconds),
            },
        }
        resp = httpx.post(
            f"{self.base_url}/execute",
            json=payload,
            timeout=self.timeout_seconds,
        )
        resp.raise_for_status()
        data = resp.json()
        content = data.get("content")
        if not isinstance(content, str):
            raise ValueError("llm-connect response missing string content")
        return content
 def get_llm_client() -> DisabledLLMClient | LLMConnectClient:
    base_url = os.environ.get("LLM_CONNECT_URL", "").strip()
    if not base_url:
        return DisabledLLMClient()
    timeout = float(os.environ.get("LLM_CONNECT_TIMEOUT_SECONDS", "300"))
    return LLMConnectClient(base_url, timeout)
--- a/src/activity_core/rules/executor.py
+++ b/src/activity_core/rules/executor.py
@@ -11,6 +11,8 @@ import hashlib
 import json
 import logging
 import re
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 from activity_core.rules.evaluator import UnsafeExpression, evaluate_condition
@@ -26,6 +28,19 @@ class UntrustedFieldError(ValueError):
    """Raised when a prompt placeholder references a field not in trusted_fields."""
@dataclass
 class InstructionResult:
    """Instruction output plus audit metadata for workflow integration."""
    tasks: list[TaskSpec]
    report: dict[str, Any] | None = None
    prompt_hash: str | None = None
    model: str | None = None
    output_validated: bool = False
    review_required: bool = False
    condition_matched: str | None = None
 def _resolve_path(obj: Any, path: str) -> Any:
    """Walk a dot-separated path on obj or dict. Returns None if not found."""
    parts = path.split(".")
@@ -92,14 +107,24 @@ def execute_instruction(
      4. Validate response against instr.output_schema (JSON Schema). Retry once.
      5. Return list[TaskSpec].
    """
    return execute_instruction_with_audit(instr, event, context, llm_client).tasks
 def execute_instruction_with_audit(
    instr: Any,
    event: Any,
    context: dict,
    llm_client: Any,
 ) -> InstructionResult:
    """Evaluate an Instruction and return task specs plus audit metadata."""
    try:
        return _execute(instr, event, context, llm_client)
    except UntrustedFieldError as exc:
        logger.warning("instruction %r rejected — %s", instr.id, exc)
-        return []
+        return _empty_result(instr)
    except Exception as exc:
        logger.warning("instruction %r failed — %s", instr.id, exc)
-        return []
+        return _empty_result(instr)
 def _execute(
@@ -107,14 +132,14 @@ def _execute(
    event: Any,
    context: dict,
    llm_client: Any,
-) -> list[TaskSpec]:
+) -> InstructionResult:
    # Step 1 — pre-filter
    try:
        if instr.condition and not evaluate_condition(instr.condition, event, context):
-            return []
+            return _empty_result(instr)
    except UnsafeExpression as exc:
        logger.warning("instruction %r condition is unsafe — %s", instr.id, exc)
-        return []
+        return _empty_result(instr)
    # Step 2 — render prompt (raises UntrustedFieldError on policy violation)
    rendered = _render_prompt(instr.prompt, instr.trusted_fields, event, context)
@@ -124,34 +149,87 @@ def _execute(
    raw_output = llm_client.complete(rendered, model=instr.model)
    # Step 4 — validate and optionally retry
-    task_specs, error = _validate_output(raw_output, instr)
+    task_specs, report, error = _validate_output(raw_output, instr)
    if error:
        retry_prompt = rendered + f"\n\nPrevious output was invalid: {error}\nPlease fix."
        raw_output = llm_client.complete(retry_prompt, model=instr.model)
-        task_specs, error = _validate_output(raw_output, instr)
+        task_specs, report, error = _validate_output(raw_output, instr)
        if error:
            logger.warning(
                "instruction_output_error: instruction=%r, prompt_hash=%s, error=%s",
                instr.id, prompt_hash, error,
            )
-            return []
+            return _empty_result(instr, prompt_hash=prompt_hash)
-    return task_specs
+    return InstructionResult(
        tasks=task_specs,
        report=report,
        prompt_hash=prompt_hash,
        model=instr.model,
        output_validated=True,
        review_required=bool(getattr(instr, "review_required", False)),
        condition_matched=instr.condition or None,
    )
-def _validate_output(raw_output: Any, instr: Any) -> tuple[list[TaskSpec], str | None]:
+def _empty_result(instr: Any, prompt_hash: str | None = None) -> InstructionResult:
-    """Parse raw LLM output into TaskSpec list. Returns (specs, error_message)."""
+    return InstructionResult(
        tasks=[],
        prompt_hash=prompt_hash,
        model=getattr(instr, "model", None),
        output_validated=False,
        review_required=bool(getattr(instr, "review_required", False)),
        condition_matched=getattr(instr, "condition", "") or None,
    )
 def _validate_output(
    raw_output: Any,
    instr: Any,
 ) -> tuple[list[TaskSpec], dict[str, Any] | None, str | None]:
    """Parse raw LLM output into TaskSpecs and optional report payload.
    Accepted shapes:
      - list[task]
      - single task dict with title/description/etc.
      - {"tasks": [...], "report": {...}}
      - report-only dict, such as {"summary": "...", "recommendations": [...]}
    Returns (specs, report, error_message).
    """
    try:
        if isinstance(raw_output, str):
            data = json.loads(raw_output)
        else:
            data = raw_output
-        if not isinstance(data, list):
+        schema_error = _validate_against_schema(data, getattr(instr, "output_schema", ""))
-            data = [data]
+        if schema_error:
            return [], None, schema_error
        report: dict[str, Any] | None = None
        task_items: list[Any]
        if isinstance(data, dict) and ("tasks" in data or "report" in data):
            maybe_report = data.get("report")
            if maybe_report is not None and not isinstance(maybe_report, dict):
                return [], None, "report must be a JSON object"
            report = maybe_report
            tasks = data.get("tasks", [])
            if not isinstance(tasks, list):
                return [], None, "tasks must be a JSON array"
            task_items = tasks
        elif isinstance(data, dict) and "title" not in data:
            report = data
            task_items = []
        elif isinstance(data, list):
            task_items = data
        else:
            task_items = [data]
        specs = []
-        for item in data:
+        for item in task_items:
            if not isinstance(item, dict):
                return [], None, "each task must be a JSON object"
            specs.append(TaskSpec(
                title=item.get("title", ""),
                description=item.get("description", ""),
@@ -162,6 +240,70 @@ def _validate_output(raw_output: Any, instr: Any) -> tuple[list[TaskSpec], str |
                source_type="instruction",
                source_id=instr.id,
            ))
-        return specs, None
+        return specs, report, None
    except (json.JSONDecodeError, AttributeError, KeyError, TypeError) as exc:
-        return [], str(exc)
+        return [], None, str(exc)
 def _validate_against_schema(data: Any, schema_path: str) -> str | None:
    if not schema_path:
        return None
    path = Path(schema_path)
    if not path.exists():
        return None
    try:
        schema = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError) as exc:
        return f"could not read output schema: {exc}"
    return _validate_schema_node(data, schema, "$")
 def _validate_schema_node(data: Any, schema: dict[str, Any], path: str) -> str | None:
    expected_type = schema.get("type")
    if expected_type and not _matches_type(data, expected_type):
        return f"{path}: expected {expected_type}"
    if expected_type == "object":
        required = schema.get("required", [])
        if isinstance(required, list):
            for key in required:
                if isinstance(key, str) and key not in data:
                    return f"{path}: missing required property {key!r}"
        properties = schema.get("properties", {})
        if isinstance(properties, dict):
            for key, child_schema in properties.items():
                if key in data and isinstance(child_schema, dict):
                    error = _validate_schema_node(data[key], child_schema, f"{path}.{key}")
                    if error:
                        return error
    if expected_type == "array":
        item_schema = schema.get("items")
        if isinstance(item_schema, dict):
            for index, item in enumerate(data):
                error = _validate_schema_node(item, item_schema, f"{path}[{index}]")
                if error:
                    return error
    return None
 def _matches_type(data: Any, expected_type: str) -> bool:
    if expected_type == "object":
        return isinstance(data, dict)
    if expected_type == "array":
        return isinstance(data, list)
    if expected_type == "string":
        return isinstance(data, str)
    if expected_type == "integer":
        return isinstance(data, int) and not isinstance(data, bool)
    if expected_type == "number":
        return isinstance(data, (int, float)) and not isinstance(data, bool)
    if expected_type == "boolean":
        return isinstance(data, bool)
    if expected_type == "null":
        return data is None
    return True
--- a/src/activity_core/workflows.py
+++ b/src/activity_core/workflows.py
@@ -21,6 +21,7 @@ with workflow.unsafe.imports_passed_through():
    from activity_core.activities import (
        emit_tasks,
        evaluate_rules,
        evaluate_instructions,
        load_activity_definition,
        log_run,
        persist_task_instance,
@@ -136,6 +137,19 @@ class RunActivityWorkflow:
                "condition": rule.get("condition", ""),
            })
        if defn.get("instructions"):
            instruction_result: dict = await workflow.execute_activity(
                evaluate_instructions,
                {
                    "instructions": defn.get("instructions", []),
                    "event": event_attrs,
                    "context": context_snapshot,
                },
                start_to_close_timeout=_ACTIVITY_TIMEOUT,
                retry_policy=_RETRY_POLICY,
            )
            task_spec_dicts.extend(instruction_result.get("task_specs", []))
        # ── 4. Emit tasks via IssueSink ───────────────────────────────────────
        if trigger_key == SCHEDULED_TRIGGER_KEY:
            dedup_source = workflow.info().workflow_id
--- a/tests/rules/test_executor.py
+++ b/tests/rules/test_executor.py
@@ -21,6 +21,7 @@ from activity_core.rules.executor import (
    UntrustedFieldError,
    _render_prompt,
    execute_instruction,
    execute_instruction_with_audit,
 )
@@ -201,6 +202,82 @@ def test_valid_llm_output_returns_task_spec():
    assert result[0].source_type == "instruction"
 def test_execute_instruction_with_audit_returns_metadata():
    task_data = [{"title": "Run triage", "priority": "high"}]
    llm = _CountingLLM([json.dumps(task_data)])
    instr = _instr(
        id="daily-triage",
        condition="",
        prompt="Check State Hub.",
        trusted_fields=[],
        model="test-model",
        review_required=True,
    )
    result = execute_instruction_with_audit(instr, _Event(), {}, llm)
    assert len(result.tasks) == 1
    assert result.tasks[0].source_id == "daily-triage"
    assert result.prompt_hash is not None
    assert len(result.prompt_hash) == 64
    assert result.model == "test-model"
    assert result.output_validated is True
    assert result.review_required is True
 def test_execute_instruction_with_audit_accepts_report_payload():
    report_data = {
        "summary": "State Hub has loose ends.",
        "recommendations": [{"action": "revisit", "candidate": "CUST-WP-0045"}],
    }
    llm = _CountingLLM([json.dumps(report_data)])
    instr = _instr(
        id="daily-triage-report",
        prompt="Report.",
        trusted_fields=[],
        output_schema="schemas/daily-triage-report.json",
    )
    result = execute_instruction_with_audit(instr, _Event(), {}, llm)
    assert result.tasks == []
    assert result.report == report_data
    assert result.output_validated is True
 def test_execute_instruction_with_audit_rejects_invalid_report_schema():
    report_data = {"summary": "Missing recommendations."}
    llm = _CountingLLM([json.dumps(report_data), json.dumps(report_data)])
    instr = _instr(
        id="daily-triage-report",
        prompt="Report.",
        trusted_fields=[],
        output_schema="schemas/daily-triage-report.json",
    )
    result = execute_instruction_with_audit(instr, _Event(), {}, llm)
    assert result.tasks == []
    assert result.report is None
    assert result.output_validated is False
    assert llm.call_count == 2
 def test_execute_instruction_with_audit_accepts_report_and_tasks_envelope():
    envelope = {
        "report": {"summary": "Review needed."},
        "tasks": [{"title": "Inspect CUST-WP-0045"}],
    }
    llm = _CountingLLM([json.dumps(envelope)])
    instr = _instr(id="daily-triage-report", prompt="Report.", trusted_fields=[])
    result = execute_instruction_with_audit(instr, _Event(), {}, llm)
    assert result.report == {"summary": "Review needed."}
    assert len(result.tasks) == 1
    assert result.tasks[0].title == "Inspect CUST-WP-0045"
 # ── Condition pre-filter ───────────────────────────────────────────────────────
 def test_condition_false_skips_llm():
--- a/tests/test_instruction_evaluation.py
+++ b/tests/test_instruction_evaluation.py
@@ -0,0 +1,116 @@
 from __future__ import annotations
 import json
 import pytest
 from activity_core import activities
 class FakeLLMClient:
    def __init__(self, response: str) -> None:
        self.response = response
        self.calls: list[tuple[str, str]] = []
    def complete(self, prompt: str, model: str = "") -> str:
        self.calls.append((prompt, model))
        return self.response
@pytest.mark.asyncio
 async def test_evaluate_instructions_returns_task_specs_with_audit(monkeypatch) -> None:
    llm = FakeLLMClient(json.dumps([
        {
            "title": "Run daily triage",
            "description": "Review State Hub loose ends.",
            "priority": "high",
            "labels": ["triage"],
        }
    ]))
    monkeypatch.setattr(activities, "get_llm_client", lambda: llm)
    result = await activities.evaluate_instructions({
        "instructions": [
            {
                "id": "daily-triage",
                "trusted_fields": ["context.summary.open_tasks"],
                "model": "test-model",
                "prompt": "Open tasks: {context.summary.open_tasks}",
                "output_schema": "",
                "review_required": False,
            }
        ],
        "event": {},
        "context": {"summary": {"open_tasks": 3}},
    })
    task_specs = result["task_specs"]
    assert len(task_specs) == 1
    spec = task_specs[0]
    assert spec["title"] == "Run daily triage"
    assert spec["source_type"] == "instruction"
    assert spec["source_id"] == "daily-triage"
    assert spec["model"] == "test-model"
    assert spec["output_validated"] is True
    assert spec["review_required"] is False
    assert spec["prompt_hash"] is not None
    assert len(spec["prompt_hash"]) == 64
    assert result["reports"] == []
    assert llm.calls == [("Open tasks: 3", "test-model")]
@pytest.mark.asyncio
 async def test_evaluate_instructions_returns_report_payload(monkeypatch) -> None:
    llm = FakeLLMClient(json.dumps({
        "summary": "State Hub has open loose ends.",
        "recommendations": [{"candidate": "CUST-WP-0045", "action": "work-next"}],
    }))
    monkeypatch.setattr(activities, "get_llm_client", lambda: llm)
    result = await activities.evaluate_instructions({
        "instructions": [
            {
                "id": "daily-triage-report",
                "trusted_fields": [],
                "model": "test-model",
                "prompt": "Run report.",
                "output_schema": "schemas/daily-triage-report.json",
                "review_required": False,
            }
        ],
        "event": {},
        "context": {},
    })
    assert result["task_specs"] == []
    assert len(result["reports"]) == 1
    report = result["reports"][0]
    assert report["instruction_id"] == "daily-triage-report"
    assert report["report"]["summary"] == "State Hub has open loose ends."
    assert report["output_validated"] is True
    assert report["prompt_hash"] is not None
@pytest.mark.asyncio
 async def test_evaluate_instructions_without_llm_client_returns_no_tasks(monkeypatch) -> None:
    class RaisingClient:
        def complete(self, prompt: str, model: str = "") -> str:  # noqa: ARG002
            raise RuntimeError("not configured")
    monkeypatch.setattr(activities, "get_llm_client", lambda: RaisingClient())
    result = await activities.evaluate_instructions({
        "instructions": [
            {
                "id": "daily-triage",
                "trusted_fields": [],
                "model": "test-model",
                "prompt": "Run triage.",
                "output_schema": "schemas/daily-triage-report.json",
            }
        ],
        "event": {},
        "context": {},
    })
    assert result == {"task_specs": [], "reports": []}