Files
reuse-surface/reuse_surface/registry_update.py
tegwick b24ec507aa
Some checks failed
ci / validate-registry (push) Has been cancelled
WP-0016 finished: interactive registry maintain with llm-connect automation
Closes the registry maintenance loop from inside each domain repo:
interactive prompting for judgment calls, full automation for safe and
high-confidence changes, both backed by the llm-connect HTTP bridge.

- New modules: maintain.py, maintain_llm.py, patches.py, interactive.py
- Schema: schemas/registry-patch.schema.json
- CLI: reuse-surface maintain; establish --scaffold --hook
- Sibling templates: Makefile fragment, pre-commit hook
- Deterministic signal collectors extended; validate cwd auto-detect
- Docs, gap priority 28, SCOPE update
- Tests: test_maintain.py, test_interactive.py (59 pytest total)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-18 04:00:39 +02:00

417 lines
15 KiB
Python

from __future__ import annotations
import json
import subprocess
import textwrap
from datetime import date
from pathlib import Path
from typing import Any
import yaml
from reuse_surface.llm_bridge import request_json_object
from reuse_surface.registry import (
entry_vector,
load_index_at,
parse_front_matter,
registry_paths,
vectors_match,
)
# Repository-relative path prefixes ("tests/" and ".gitea/workflows/") for evidence files.
# Safe to apply without interactive review (see patches.SAFE_DETERMINISTIC_KINDS).
# NOTE(review): this constant is not read anywhere in the visible part of this
# module (the collectors hard-code the same two prefixes); presumably it is
# consumed by other modules -- confirm before changing it.
SAFE_EVIDENCE_PREFIXES = ("tests/", ".gitea/workflows/")
def git_changed_files(repo_root: Path, since_ref: str) -> list[str]:
    """Return the paths that differ between ``since_ref`` and ``HEAD``.

    Runs ``git diff --name-only since_ref HEAD`` inside ``repo_root`` and
    returns one entry per non-blank output line, with surrounding whitespace
    removed.

    Raises:
        ValueError: if git exits with a non-zero status. The message is git's
            stderr, or a generic fallback naming ``since_ref`` when stderr is
            empty.
    """
    command = ["git", "-C", str(repo_root), "diff", "--name-only", since_ref, "HEAD"]
    completed = subprocess.run(command, capture_output=True, text=True, check=False)
    if completed.returncode != 0:
        message = completed.stderr.strip() or f"git diff failed for {since_ref}"
        raise ValueError(message)
    stripped_names = (raw.strip() for raw in completed.stdout.splitlines())
    return [name for name in stripped_names if name]
def collect_deterministic_suggestions(
    repo_root: Path,
    *,
    capability_id: str | None = None,
    git_since: str | None = None,
) -> list[dict[str, Any]]:
    """Gather rule-based registry maintenance suggestions for ``repo_root``.

    The result starts with the index-level suggestions produced by
    ``_collect_index_orphans`` (always computed against the full index), then
    adds, per selected index row:

    * ``missing_entry`` when the row's entry file does not exist;
    * ``vector_drift`` when the index vector disagrees with the entry front
      matter (carries an ``apply_patch`` for ``index.vector``);
    * whatever ``_collect_changed_file_suggestions`` reports for the files
      changed since ``git_since``.

    Args:
        repo_root: Repository root containing the registry.
        capability_id: When given, only the index row with this id is checked.
        git_since: Git ref to diff against ``HEAD``; when omitted no changed
            files are considered.

    Raises:
        ValueError: if the index file is missing, or ``capability_id`` is
            given but not present in the index.
    """
    index_file = registry_paths(repo_root)["index"]
    if not index_file.exists():
        raise ValueError("registry index missing; run establish --scaffold first")
    index = load_index_at(index_file)

    selected = index.get("capabilities", [])
    if capability_id:
        selected = [cap for cap in selected if cap["id"] == capability_id]
        if not selected:
            raise ValueError(f"capability not in index: {capability_id}")

    touched = git_changed_files(repo_root, git_since) if git_since else []

    found: list[dict[str, Any]] = list(_collect_index_orphans(repo_root, index, touched))
    for cap in selected:
        entry_file = repo_root / cap["path"]
        if not entry_file.exists():
            # Nothing more can be checked without the entry file.
            found.append(
                {
                    "capability_id": cap["id"],
                    "kind": "missing_entry",
                    "detail": f"missing file {cap['path']}",
                }
            )
            continue

        front_matter = parse_front_matter(entry_file)
        if not vectors_match(cap["vector"], front_matter):
            # entry_vector() is called once per field on purpose so the two
            # values stay independent objects.
            drift = {
                "capability_id": cap["id"],
                "kind": "vector_drift",
                "detail": "index vector differs from entry front matter",
                "index_vector": cap["vector"],
                "entry_vector": entry_vector(front_matter),
                "apply_patch": {
                    "field": "index.vector",
                    "value": entry_vector(front_matter),
                },
            }
            found.append(drift)

        found += _collect_changed_file_suggestions(cap["id"], front_matter, touched, repo_root)
    return found
def _collect_index_orphans(
    repo_root: Path,
    index: dict[str, Any],
    changed_files: list[str],
) -> list[dict[str, Any]]:
    """Suggest index rows for unindexed capability files and a stale-date bump.

    Two kinds of suggestion are produced:

    * ``index_row_add`` for every ``*.md`` file in the capabilities directory
      whose repo-relative path is not referenced by an index row. Files whose
      front matter cannot be parsed (``ValueError``) are skipped.
    * ``index_updated_stale`` when any changed file lives under ``registry/``
      and the index ``updated`` value is not today's date.

    Returns an empty list when the capabilities directory does not exist.

    Args:
        repo_root: Repository root; entry paths are made relative to it.
        index: Loaded registry index mapping.
        changed_files: Repo-relative paths changed since the comparison ref.
    """
    suggestions: list[dict[str, Any]] = []
    capabilities = index.get("capabilities") or []
    indexed_paths = {row["path"] for row in capabilities}
    cap_dir = registry_paths(repo_root)["capabilities"]
    if not cap_dir.exists():
        return suggestions

    # The "*.md" glob already excludes placeholder files such as ".gitkeep".
    for entry_file in sorted(cap_dir.glob("*.md")):
        # Forward slashes on every platform, so the comparison with the paths
        # stored in the index (and the path written into a new row) does not
        # depend on the OS path separator.
        rel = entry_file.relative_to(repo_root).as_posix()
        if rel in indexed_paths:
            continue
        try:
            front_matter = parse_front_matter(entry_file)
        except ValueError:
            # Unparseable entry: nothing reliable to derive a row from.
            continue
        cap_id = front_matter.get("id", entry_file.stem.replace("-", "."))
        suggestions.append(
            {
                "capability_id": cap_id,
                "kind": "index_row_add",
                "detail": f"capability file not in index: {rel}",
                "apply_patch": {
                    "field": "index.capabilities",
                    "index_row": {
                        "id": cap_id,
                        "name": front_matter.get("name", cap_id),
                        "summary": front_matter.get("summary", ""),
                        "vector": entry_vector(front_matter),
                        "domain": front_matter.get("domain", index.get("domain", "helix_forge")),
                        "status": front_matter.get("status", "draft"),
                        "owner": front_matter.get("owner", repo_root.name),
                        "path": rel,
                        "tags": front_matter.get("tags", []),
                        "consumption_modes": front_matter.get("availability", {}).get(
                            "consumption_modes", ["informational"]
                        ),
                    },
                },
            }
        )

    today = date.today().isoformat()
    registry_touched = any(path.startswith("registry/") for path in changed_files)
    # str() so a value that the YAML loader produced as a ``date`` object still
    # compares equal to today's ISO string (assumes the index loader does not
    # already normalise it -- harmless if it does).
    if registry_touched and str(index.get("updated")) != today:
        # An index with an empty capability list has no first row; fall back to
        # the generic id instead of raising IndexError.
        first_id = capabilities[0].get("id", "registry") if capabilities else "registry"
        suggestions.append(
            {
                "capability_id": first_id,
                "kind": "index_updated_stale",
                "detail": "registry/ changed; bump index updated date",
                "apply_patch": {"field": "index.updated", "value": today},
            }
        )
    return suggestions
def _pyproject_script_artifacts(repo_root: Path) -> list[str]:
pyproject = repo_root / "pyproject.toml"
if not pyproject.exists():
return []
try:
import tomllib
data = tomllib.loads(pyproject.read_text(encoding="utf-8"))
except (OSError, ValueError):
return []
scripts = data.get("project", {}).get("scripts", {})
return [f"pyproject.toml:[project.scripts].{name}" for name in sorted(scripts)]
def _collect_changed_file_suggestions(
    cap_id: str,
    front_matter: dict[str, Any],
    changed_files: list[str],
    repo_root: Path,
) -> list[dict[str, Any]]:
    """Suggest evidence/artifact citations for changed files not yet cited.

    Evidence suggestions come first, in ``changed_files`` order:

    * ``tests/...`` -> ``evidence_test`` (appends to ``evidence.tests``);
    * ``.gitea/workflows/*.yml|*.yaml`` -> ``evidence_workflow`` (appends to
      ``evidence.tests`` when the path contains "test", case-insensitively,
      otherwise to ``evidence.documentation``);
    * ``docs/...`` -> ``evidence_documentation``.

    They are followed by ``availability_artifact`` suggestions for changed
    ``.py`` files inside a top-level package of ``repo_root`` (a directory with
    an ``__init__.py``) and, when ``pyproject.toml`` itself changed, for each
    declared console script.

    ``front_matter`` is only read, never modified. Sections or lists that are
    missing or null are treated as empty.

    Args:
        cap_id: Capability id recorded on every suggestion.
        front_matter: Parsed entry front matter of that capability.
        changed_files: Repo-relative paths changed since the comparison ref.
        repo_root: Repository root, scanned for top-level packages.
    """
    suggestions: list[dict[str, Any]] = []
    if not changed_files:
        # Nothing to inspect; skip the directory scan as well.
        return suggestions

    # ``.get(...) or default`` instead of ``setdefault`` so the caller's front
    # matter is left untouched and explicit nulls do not break ``in`` checks.
    evidence = front_matter.get("evidence") or {}
    evidence_tests = evidence.get("tests") or []
    evidence_docs = evidence.get("documentation") or []
    artifacts = (front_matter.get("availability") or {}).get("current_artifacts") or []

    pkg_prefixes = tuple(
        p.name + "/"
        for p in repo_root.iterdir()
        if p.is_dir() and (p / "__init__.py").exists()
    )

    for changed in changed_files:
        if changed.startswith("tests/") and changed not in evidence_tests:
            suggestions.append(
                {
                    "capability_id": cap_id,
                    "kind": "evidence_test",
                    "detail": f"new test file not cited: {changed}",
                    "apply_patch": {"field": "evidence.tests", "append": changed},
                }
            )
        if changed.startswith(".gitea/workflows/") and changed.endswith((".yml", ".yaml")):
            # Workflows that look test-related count as test evidence, the
            # rest as documentation evidence.
            field = "evidence.tests" if "test" in changed.lower() else "evidence.documentation"
            existing = evidence_tests if field == "evidence.tests" else evidence_docs
            if changed not in existing:
                suggestions.append(
                    {
                        "capability_id": cap_id,
                        "kind": "evidence_workflow",
                        "detail": f"workflow changed not cited: {changed}",
                        "apply_patch": {"field": field, "append": changed},
                    }
                )
        if changed.startswith("docs/") and changed not in evidence_docs:
            suggestions.append(
                {
                    "capability_id": cap_id,
                    "kind": "evidence_documentation",
                    "detail": f"doc changed not cited: {changed}",
                    "apply_patch": {"field": "evidence.documentation", "append": changed},
                }
            )

    for changed in changed_files:
        # str.startswith(()) is False, so a repo without packages yields none.
        if changed.endswith(".py") and changed.startswith(pkg_prefixes):
            if changed not in artifacts:
                suggestions.append(
                    {
                        "capability_id": cap_id,
                        "kind": "availability_artifact",
                        "detail": f"changed module not cited: {changed}",
                        "apply_patch": {
                            "field": "availability.current_artifacts",
                            "append": changed,
                        },
                    }
                )
        if changed == "pyproject.toml":
            for script_ref in _pyproject_script_artifacts(repo_root):
                if script_ref not in artifacts:
                    suggestions.append(
                        {
                            "capability_id": cap_id,
                            "kind": "availability_artifact",
                            "detail": f"CLI script not cited: {script_ref}",
                            "apply_patch": {
                                "field": "availability.current_artifacts",
                                "append": script_ref,
                            },
                        }
                    )
    return suggestions
def apply_deterministic_suggestions(
    repo_root: Path,
    suggestions: list[dict[str, Any]],
) -> list[str]:
    """Apply the ``apply_patch`` payloads of ``suggestions`` to the registry.

    Supported patch fields:

    * ``index.vector`` -- overwrite the vector of the matching index row;
    * ``evidence.tests`` / ``evidence.documentation`` /
      ``availability.current_artifacts`` -- append the value to that list in
      the entry front matter unless it is already present;
    * ``index_row_add`` suggestions -- append the supplied row to the index
      when no row with that id exists;
    * ``index_updated_stale`` suggestions -- bump the index ``updated`` date.

    Suggestions without ``apply_patch`` are ignored, and entry-level patches
    for capabilities that are not in the index are skipped.

    When at least one change was made, the index is rewritten with today's
    ``updated`` date and every entry file whose front matter actually changed
    is rewritten. Entry files that were not modified are left untouched.

    Returns:
        Human-readable descriptions of the applied changes, in application
        order; empty when nothing changed (and then nothing is written).
    """
    paths = registry_paths(repo_root)
    index = load_index_at(paths["index"])
    index_by_id = {row["id"]: row for row in index.get("capabilities", [])}
    changed: list[str] = []
    entry_cache: dict[str, dict[str, Any]] = {}
    entry_paths: dict[str, Path] = {}
    modified_entries: set[str] = set()
    entry_fields = {
        "evidence.tests",
        "evidence.documentation",
        "availability.current_artifacts",
    }

    # Pass 1: patches against existing index rows and their entry files.
    for suggestion in suggestions:
        patch = suggestion.get("apply_patch")
        if not patch:
            continue
        cap_id = suggestion["capability_id"]
        field = patch["field"]
        row = index_by_id.get(cap_id)
        if field == "index.vector" and row is not None:
            row["vector"] = patch["value"]
            changed.append(f"index vector for {cap_id}")
        if not row or field not in entry_fields:
            # Index-only patches must not load (and later rewrite) the entry.
            continue
        if cap_id not in entry_cache:
            entry_path = repo_root / row["path"]
            entry_cache[cap_id] = parse_front_matter(entry_path)
            entry_paths[cap_id] = entry_path
        # Every entry-level field has the shape "<section>.<list name>".
        section, bucket = field.split(".", 1)
        items = entry_cache[cap_id].setdefault(section, {}).setdefault(bucket, [])
        if patch["append"] not in items:
            items.append(patch["append"])
            changed.append(f"{cap_id} {field} += {patch['append']}")
            modified_entries.add(cap_id)

    # Pass 2: index-level additions and the stale-date bump.
    for suggestion in suggestions:
        patch = suggestion.get("apply_patch")
        if not patch:
            continue
        kind = suggestion.get("kind")
        if kind == "index_row_add":
            cap_id = suggestion["capability_id"]
            row = patch.get("index_row")
            if row and cap_id not in index_by_id:
                index.setdefault("capabilities", []).append(row)
                # Remember the new row so a repeated suggestion for the same
                # id does not append a duplicate.
                index_by_id[cap_id] = row
                changed.append(f"index row added for {cap_id}")
        if kind == "index_updated_stale":
            index["updated"] = patch.get("value", date.today().isoformat())
            changed.append("index.updated bumped")

    if changed:
        index["updated"] = date.today().isoformat()
        paths["index"].write_text(
            yaml.safe_dump(index, sort_keys=False, allow_unicode=True),
            encoding="utf-8",
        )
        for cap_id, front_matter in entry_cache.items():
            if cap_id in modified_entries:
                _write_front_matter(entry_paths[cap_id], front_matter)
    return changed
def _write_front_matter(path: Path, front_matter: dict[str, Any]) -> None:
    """Rewrite the YAML front matter of ``path`` in place, keeping its body.

    The closing marker is the first ``"\\n---"`` found at or after character
    offset 4 of the existing text; everything following it is carried over
    unchanged. If no such marker is found, the rewritten file consists of the
    new front matter followed by a single newline.
    """
    original = path.read_text(encoding="utf-8")
    closing = original.find("\n---", 4)
    if closing == -1:
        remainder = "\n"
    else:
        # Skip the four characters of the "\n---" marker itself.
        remainder = original[closing + 4 :]
    header = yaml.safe_dump(front_matter, sort_keys=False, allow_unicode=True)
    path.write_text(f"---\n{header}---{remainder}", encoding="utf-8")
def build_update_prompt(
    repo_root: Path,
    capability_id: str,
    *,
    git_since: str | None = None,
) -> str:
    """Build the LLM prompt asking for update suggestions for one capability.

    The prompt contains the expected JSON reply shape, the capability's
    current front matter as YAML and, when ``git_since`` is given, the first
    12000 characters of ``git diff git_since HEAD`` limited to ``registry/``,
    ``reuse_surface/`` and ``tests/``.

    The diff is best effort: git's return code is not inspected, so whatever
    it wrote to stdout is used and an empty result is rendered as "(none)".

    Raises:
        ValueError: if ``capability_id`` has no row in the index.
    """
    paths = registry_paths(repo_root)
    index = load_index_at(paths["index"])
    row = next(
        (item for item in index.get("capabilities", []) if item["id"] == capability_id),
        None,
    )
    if not row:
        raise ValueError(f"capability not in index: {capability_id}")
    entry = parse_front_matter(repo_root / row["path"])
    diff = ""
    if git_since:
        proc = subprocess.run(
            [
                "git",
                "-C",
                str(repo_root),
                "diff",
                git_since,
                "HEAD",
                "--",
                "registry/",
                "reuse_surface/",
                "tests/",
            ],
            capture_output=True,
            text=True,
            check=False,
        )
        diff = proc.stdout[:12000]
    # Dedent the static template BEFORE substituting values. Dedenting after
    # interpolation would let the multi-line YAML/diff text decide how much is
    # stripped and would blank out whitespace-only lines inside the diff.
    template = textwrap.dedent(
        """
        Suggest registry entry updates for capability `{capability_id}`.
        Return ONLY JSON:
        {{
        "promotion_history": [
        {{"date": "YYYY-MM-DD", "dimension": "availability", "from": "A3", "to": "A4", "rationale": "..."}}
        ],
        "consumer_feedback": ["optional string notes"],
        "notes": ["human review items"]
        }}
        Current entry YAML:
        {entry_yaml}
        Git diff since {since}:
        {diff}
        """
    )
    return template.format(
        capability_id=capability_id,
        entry_yaml=yaml.safe_dump(entry, sort_keys=False),
        since=git_since or "N/A",
        diff=diff or "(none)",
    ).strip()
def suggest_llm_updates(
    repo_root: Path,
    capability_id: str,
    *,
    git_since: str | None = None,
    llm_url: str | None = None,
) -> dict[str, Any]:
    """Ask the LLM bridge for update suggestions for ``capability_id``.

    Builds the prompt with ``build_update_prompt`` and forwards it to
    ``request_json_object`` with ``llm_url`` as the base URL, a temperature of
    0.2 and a 2000-token limit. The bridge's result is returned unchanged.
    """
    generation_config = {"temperature": 0.2, "max_tokens": 2000}
    return request_json_object(
        build_update_prompt(repo_root, capability_id, git_since=git_since),
        base_url=llm_url,
        config=generation_config,
    )
def format_suggestions_markdown(suggestions: list[dict[str, Any]]) -> str:
    """Render ``suggestions`` as a Markdown report.

    The report is a heading, one bullet per suggestion (capability id, kind
    and detail), a blank line, and a footer with the suggestion count. An
    empty input yields the heading followed by a "no suggestions" note.
    """
    heading = "# Registry update suggestions"
    if not suggestions:
        return f"{heading}\n\n_No suggestions._\n"
    bullets = [
        f"- `{entry['capability_id']}` **{entry['kind']}**: {entry['detail']}"
        for entry in suggestions
    ]
    footer = f"**{len(suggestions)}** suggestion(s). Use `--apply` to apply safe patches."
    return "\n".join([heading, "", *bullets, "", footer]) + "\n"
def format_suggestions_json(suggestions: list[dict[str, Any]]) -> str:
    """Serialise ``suggestions`` and their count as 2-space-indented JSON."""
    payload = {"count": len(suggestions), "suggestions": suggestions}
    return json.dumps(payload, indent=2)