From 206bb336d214ed6668f56f597312e664da096c75 Mon Sep 17 00:00:00 2001 From: tegwick Date: Thu, 18 Jun 2026 15:12:31 +0200 Subject: [PATCH] Wire llm-connect runtime for daily triage --- k8s/railiance/20-runtime.yaml | 2 +- k8s/railiance/README.md | 6 +- tests/test_railiance_ops_inventory_wiring.py | 4 +- uv.lock | 2 + ...-0006-post-triage-operational-hardening.md | 26 ++- ...WP-0010-daily-triage-llm-reconciliation.md | 172 ++++++++++++++++++ 6 files changed, 207 insertions(+), 5 deletions(-) create mode 100644 workplans/ACTIVITY-WP-0010-daily-triage-llm-reconciliation.md diff --git a/k8s/railiance/20-runtime.yaml b/k8s/railiance/20-runtime.yaml index 3de07c0..afc8b87 100644 --- a/k8s/railiance/20-runtime.yaml +++ b/k8s/railiance/20-runtime.yaml @@ -11,7 +11,7 @@ data: TEMPORAL_NAMESPACE: default NATS_URL: nats://actcore-nats:4222 STATE_HUB_URL: http://actcore-state-hub-bridge:8000 - LLM_CONNECT_URL: "" + LLM_CONNECT_URL: http://llm-connect.activity-core.svc.cluster.local:8080 LLM_CONNECT_TIMEOUT_SECONDS: "300" REPO_SCOPING_URL: http://repo-scoping.repo-scoping.svc.cluster.local:8020 ISSUE_CORE_URL: http://issue-core.issue-core.svc.cluster.local:8010 diff --git a/k8s/railiance/README.md b/k8s/railiance/README.md index 4576cfd..1ff3df0 100644 --- a/k8s/railiance/README.md +++ b/k8s/railiance/README.md @@ -32,8 +32,10 @@ Europe/Berlin schedule, verify both runtime dependencies: - `actcore-state-hub-bridge` can reach the State Hub API through the node-local tunnel expected at `127.0.0.1:18000`. -- `LLM_CONNECT_URL` is set to an operator-approved llm-connect endpoint that can - serve the `custodian-triage-balanced` profile. +- `LLM_CONNECT_URL` points at the verified in-namespace llm-connect Service, + `http://llm-connect.activity-core.svc.cluster.local:8080`, and the + operator-owned provider Secret lets that Service serve the + `custodian-triage-balanced` profile. 
If `LLM_CONNECT_URL` is missing or broken, report-sink instructions write a visible `execution_failed` diagnostic instead of silently producing no report. diff --git a/tests/test_railiance_ops_inventory_wiring.py b/tests/test_railiance_ops_inventory_wiring.py index dc7f323..4db9103 100644 --- a/tests/test_railiance_ops_inventory_wiring.py +++ b/tests/test_railiance_ops_inventory_wiring.py @@ -33,7 +33,9 @@ def _by_kind_name(kind: str, name: str) -> dict[str, Any]: def test_runtime_config_has_ops_inventory_placeholders() -> None: config = _by_kind_name("ConfigMap", "actcore-runtime-config") - assert config["data"]["LLM_CONNECT_URL"] == "" + assert config["data"]["LLM_CONNECT_URL"] == ( + "http://llm-connect.activity-core.svc.cluster.local:8080" + ) assert config["data"]["LLM_CONNECT_TIMEOUT_SECONDS"] == "300" assert config["data"]["OPS_INVENTORY_PATH"] == ( "/etc/activity-core/ops/service-inventory.yml" diff --git a/uv.lock b/uv.lock index 49f8811..a93be59 100644 --- a/uv.lock +++ b/uv.lock @@ -12,6 +12,7 @@ dependencies = [ { name = "httpx" }, { name = "nats-py" }, { name = "pydantic" }, + { name = "pyyaml" }, { name = "sqlalchemy", extra = ["asyncio"] }, { name = "temporalio" }, { name = "uvicorn", extra = ["standard"] }, @@ -34,6 +35,7 @@ requires-dist = [ { name = "pydantic", specifier = ">=2.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24" }, + { name = "pyyaml", specifier = ">=6.0" }, { name = "sqlalchemy", extras = ["asyncio"], specifier = ">=2.0" }, { name = "temporalio", specifier = ">=1.7" }, { name = "temporalio", extras = ["testing"], marker = "extra == 'dev'", specifier = ">=1.7" }, diff --git a/workplans/ACTIVITY-WP-0006-post-triage-operational-hardening.md b/workplans/ACTIVITY-WP-0006-post-triage-operational-hardening.md index 753237c..f109f1d 100644 --- a/workplans/ACTIVITY-WP-0006-post-triage-operational-hardening.md +++ 
b/workplans/ACTIVITY-WP-0006-post-triage-operational-hardening.md @@ -8,7 +8,7 @@ status: active owner: codex topic_slug: custodian created: "2026-06-03" -updated: "2026-06-07" +updated: "2026-06-18" state_hub_workstream_id: "5646e13a-13af-4724-bca6-3c0d86f96733" --- @@ -150,6 +150,30 @@ State Hub to `state-hub` (`dc10704f`), `railiance-cluster` (`53e78702`), activity-core runner plus three clean scheduled daily runs and calibration feedback. +2026-06-16: Rechecked State Hub and the configured working-memory sink. State +Hub `/progress/?event_type=daily_triage` still only shows activity-core +`daily_triage` progress through 2026-06-06, and +`/home/worsch/the-custodian/memory/working` only has `daily-triage-*` notes +for 2026-06-02 through 2026-06-06. There is still no evidence of three clean +consecutive scheduled runs after the June 7 runtime projection failure, so +T03 remains `wait`. + +2026-06-18: Consumed the verified in-cluster llm-connect Service URL in the +Railiance runtime projection. `actcore-runtime-config` now sets +`LLM_CONNECT_URL=http://llm-connect.activity-core.svc.cluster.local:8080` and +keeps `LLM_CONNECT_TIMEOUT_SECONDS=300`. The remaining live gate is no longer +the URL slot itself; it is operator-owned provider credential custody for +`activity-core/llm-connect-provider-secrets`, a schema-valid fixture smoke, and +then three clean scheduled daily triage runs. + +2026-06-18 follow-up: `llm-connect` reported State Hub message +`6a098e1e-65de-4309-ab4a-446aba2f3587`: the provider Secret now has a populated +key count and the in-namespace fixture smoke passed on the llm-connect side. +The remaining activity-core gate is to reconcile the live Railiance runtime so +the worker consumes the configured URL, then produce schema-valid daily triage +evidence and three clean scheduled runs. This narrower path is tracked in +`ACTIVITY-WP-0010`. 
+ ## Rule Action Contract Documentation ```task diff --git a/workplans/ACTIVITY-WP-0010-daily-triage-llm-reconciliation.md b/workplans/ACTIVITY-WP-0010-daily-triage-llm-reconciliation.md new file mode 100644 index 0000000..5d14d40 --- /dev/null +++ b/workplans/ACTIVITY-WP-0010-daily-triage-llm-reconciliation.md @@ -0,0 +1,172 @@ +--- +id: ACTIVITY-WP-0010 +type: workplan +title: "Daily Triage LLM Reconciliation And Evidence" +domain: custodian +repo: activity-core +status: blocked +owner: codex +topic_slug: custodian +created: "2026-06-18" +updated: "2026-06-18" +state_hub_workstream_id: "f2c73ac6-13f0-4005-82cc-76c7c9f9c8b9" +--- + +# ACTIVITY-WP-0010 - Daily Triage LLM Reconciliation And Evidence + +## Context + +This workplan implements the in-scope portion of the latest activity-core +suggestion review against `INTENT.md` and `SCOPE.md`. + +Relevant accepted suggestion: + +- State Hub message `6a098e1e-65de-4309-ab4a-446aba2f3587` from + `llm-connect` says `LLM-WP-0006` is complete on the llm-connect side. The + stable Service URL is + `http://llm-connect.activity-core.svc.cluster.local:8080`, timeout remains + `300`, the provider Secret reports populated key count, and the in-namespace + fixture smoke passed with schema-valid endpoint behavior. + +Why this belongs in activity-core: + +- `INTENT.md` says activity-core owns the **when/what/where** loop for + scheduled coordination work. +- `SCOPE.md` keeps LLM instruction execution in scope through the llm-connect + boundary, while keeping provider credentials and cluster reconciliation out of + scope. +- `ACTIVITY-WP-0006-T03` and `ACTIVITY-WP-0009-T01` remain open because daily + State Hub WSJF triage has not yet produced three clean scheduled runs after + the June 7 runtime projection failure. 
+ +Suggestions reviewed but not accepted as product/runtime implementation work: + +- `coding_retro` activity-core suggestions for Bash tool thrash, schema thrash, + and read-before-edit hygiene are agent workflow advice. They are useful for + Codex operating style, but they do not change activity-core's Event Bridge + product surface and should not become runtime code. +- The earlier local-kubectl / cluster-owned evidence suggestion for + `ACTIVITY-WP-0007` has already been handled by moving live evidence ownership + to Railiance and closing the workplan from cluster-owned proof. + +Latest evidence before this workplan: + +- State Hub `daily_triage` progress on 2026-06-18 still shows + `LLM_CONNECT_URL is not configured`, which means the live activity-core + runtime has not yet consumed the repo-side URL update. +- `k8s/railiance/20-runtime.yaml` now sets the verified llm-connect Service URL + and `LLM_CONNECT_TIMEOUT_SECONDS=300`. + +## Confirm Repo-Side Runtime Contract + +```task +id: ACTIVITY-WP-0010-T01 +status: done +priority: high +state_hub_task_id: "dd52ce21-23b8-4e46-b3af-cb7bf486e40f" +``` + +Update activity-core's Railiance runtime projection so the daily triage worker +consumes the verified llm-connect Service URL by default. + +Done when: + +- `k8s/railiance/20-runtime.yaml` sets + `LLM_CONNECT_URL=http://llm-connect.activity-core.svc.cluster.local:8080`. +- `LLM_CONNECT_TIMEOUT_SECONDS=300` remains configured. +- Wiring tests assert the URL and timeout. +- The Railiance README states that provider credentials remain operator-owned + and outside Git / State Hub. + +2026-06-18: Completed. Updated the runtime ConfigMap, README, and +`tests/test_railiance_ops_inventory_wiring.py`. Focused tests passed: +`tests/test_railiance_ops_inventory_wiring.py tests/test_llm_client.py` +reported 9 passed. 
+ +## Reconcile Live Railiance Runtime + +```task +id: ACTIVITY-WP-0010-T02 +status: wait +priority: high +state_hub_task_id: "23545ddc-926b-485a-8535-5cc11e01134a" +``` + +Apply or reconcile the updated activity-core Railiance runtime through the +cluster-owned deployment path, not through ad hoc local kubectl from this repo. + +Done when non-secret evidence shows: + +- live `actcore-runtime-config` has the verified `LLM_CONNECT_URL` and timeout; +- the activity-core worker has restarted or otherwise consumed the new config; +- `activity-core/llm-connect-provider-secrets` remains present with a populated + key count only, without printing or storing secret values; +- the State Hub bridge remains reachable from the activity-core runtime. + +Current wait reason: this is Railiance/operator-owned live cluster work. State +Hub handoff message `9a074b7c-4b87-4e3c-a6bf-e1fe5580daa8` asks +`railiance-cluster` to reconcile the updated config and smoke it. + +## Run Daily Triage Fixture Smoke + +```task +id: ACTIVITY-WP-0010-T03 +status: wait +priority: high +state_hub_task_id: "10e0df77-c230-4a82-b720-23c66bd17c0a" +``` + +After T02, run a manual or smoke execution of +`daily-statehub-wsjf-triage` against the live activity-core runtime. + +Done when: + +- the run calls llm-connect through the configured Service URL; +- llm-connect returns content accepted as schema-valid daily-triage JSON; +- State Hub receives a `daily_triage` progress item with `output_validated=true`; +- the working-memory daily-triage note exists at the path recorded in State Hub + detail; +- `scripts/verify_daily_triage.py` reports the smoke/manual run as present. + +## Collect Three Clean Scheduled Runs + +```task +id: ACTIVITY-WP-0010-T04 +status: wait +priority: high +state_hub_task_id: "dc6b9482-cf43-4fc5-994b-dcd7dea47db7" +``` + +Let the normal 07:20 Europe/Berlin schedule produce three consecutive clean +daily triage runs after the live config reconciliation. 
+ +Done when: + +- three consecutive scheduled runs have Temporal workflow evidence, + `activity_runs` rows, State Hub `daily_triage` progress, and working-memory + notes; +- none of the three runs are merely manual smoke tests or `execution_failed` + diagnostics; +- calibration feedback is recorded in State Hub; +- `ACTIVITY-WP-0006-T03` and `ACTIVITY-WP-0009-T01` can move from `wait` to + `done`. + +## Close Handoff State + +```task +id: ACTIVITY-WP-0010-T05 +status: wait +priority: medium +state_hub_task_id: "ecc57e21-1716-4daa-aba6-d8a6d824e4ed" +``` + +Update the surrounding workplans and State Hub once the live daily triage gate +passes. + +Done when: + +- `ACTIVITY-WP-0006` records the three-run calibration evidence; +- `ACTIVITY-WP-0009` records the scheduled-run trust gap closure; +- any temporary `needs_human` flags created for the llm-connect provider/config + handoff are cleared or replaced by a narrower follow-up; +- this workplan is marked `finished`.