structured logging around key workflows and docs for operational readiness

2026-04-26 17:02:24 +02:00
parent 2902e362df
commit 99bb851ca8
8 changed files with 288 additions and 5 deletions
--- a/docs/operations.md
+++ b/docs/operations.md
@@ -0,0 +1,93 @@
+# Operational Readiness
+
+This note captures the runtime knobs and baseline operating procedures for the
+Repository Ability Registry service.
+
+## Configuration
+
+Configuration is read from environment variables with the `REPO_REGISTRY_`
+prefix.
+
+| Variable | Default | Purpose |
+| --- | --- | --- |
+| `REPO_REGISTRY_DATABASE_PATH` | `var/repo-registry.sqlite3` | SQLite database file used by the default store. |
+| `REPO_REGISTRY_CHECKOUT_ROOT` | `var/checkouts` | Local checkout cache used during repository ingestion. |
+| `REPO_REGISTRY_LLM_PROVIDER` | unset | Optional LLM provider name for candidate extraction. |
+| `REPO_REGISTRY_LLM_MODEL` | unset | Optional model name passed to the configured LLM provider. |
+| `REPO_REGISTRY_EMBEDDING_PROVIDER` | unset | Set to `hashing` to enable deterministic local hybrid search scoring. |
+| `REPO_REGISTRY_LOG_LEVEL` | `INFO` | Log level for the `repo_registry.operations` structured event logger. |
+
+## Health Checks
+
+`GET /health` returns service status plus the operational dependencies that can
+be checked locally:
+
+```json
+{
+  "status": "ok",
+  "database": {
+    "path": "var/repo-registry.sqlite3",
+    "reachable": true,
+    "error": null
+  },
+  "checkout_root": {
+    "path": "var/checkouts",
+    "exists": true
+  }
+}
+```
+
+`status` is `degraded` when the database cannot be initialized or queried. The
+checkout root is reported as metadata only and does not affect `status`,
+because it may be created lazily by the ingestion path.
+
+## Structured Logs
+
+Operational events are emitted through the `repo_registry.operations` logger as
+single-line JSON messages. Current events include repository registration,
+analysis start/completion/failure, LLM extraction usage/failure, and review
+decisions.
+
+Configure the Python or ASGI server logging stack to route this logger to the
+same sink as application logs. `REPO_REGISTRY_LOG_LEVEL` controls the logger
+level used by API-created service instances; unknown names fall back to `INFO`.
+
+## SQLite Backup And Restore
+
+For single-node SQLite deployments, prefer the SQLite backup API so readers can
+continue while the backup is created:
+
+```bash
+mkdir -p backups
+sqlite3 var/repo-registry.sqlite3 ".backup 'backups/repo-registry-$(date +%F).sqlite3'"
+```
+
+For the most conservative backup window, stop writes first, run the backup, then
+resume the service. Verify a backup with:
+
+```bash
+sqlite3 backups/repo-registry-YYYY-MM-DD.sqlite3 "PRAGMA integrity_check;"
+```
+
+To restore, stop the service, move the current database aside, copy the backup to
+`REPO_REGISTRY_DATABASE_PATH`, start the service, and verify `GET /health`.
+
+## PostgreSQL Migration Notes
+
+The storage interface is intentionally kept behind `RegistryStore` so a
+PostgreSQL-backed implementation can be introduced alongside SQLite before
+cutover. A production migration should:
+
+1. Add a PostgreSQL store that preserves the current repository, analysis,
+   observed fact, content chunk, candidate, approved registry, and review
+   decision contracts.
+2. Manage schema changes with explicit migrations rather than implicit table
+   creation.
+3. Export from SQLite and import into PostgreSQL in a repeatable script, then
+   compare repository counts, approved ability maps, search results, and recent
+   review decisions.
+4. Keep vector search optional. If pgvector is enabled, follow the plan in
+   `docs/semantic-retrieval.md` and validate hybrid ranking before making it the
+   default.
+5. Take a final SQLite backup immediately before cutover and retain it until the
+   PostgreSQL deployment has passed health and smoke tests.
--- a/src/repo_registry/core/logging.py
+++ b/src/repo_registry/core/logging.py
@@ -0,0 +1,15 @@
+from __future__ import annotations
+
+import json
+import logging
+from typing import Any
+
+
+LOGGER_NAME = "repo_registry.operations"
+
+
+def log_operation(event: str, **fields: Any) -> None:
+    """Emit one operational event as a single-line JSON log message.
+
+    The payload is ``{"event": event, **fields}``, serialized with sorted keys
+    and logged at INFO level on the ``LOGGER_NAME`` logger.
+
+    Args:
+        event: Event name stored under the ``"event"`` key.
+        **fields: Additional key/value pairs merged into the payload.
+    """
+    payload = {"event": event, **fields}
+    logging.getLogger(LOGGER_NAME).info(
+        # default=str stringifies any value json cannot encode natively
+        # instead of raising.
+        json.dumps(payload, sort_keys=True, default=str)
+    )
--- a/src/repo_registry/core/service.py
+++ b/src/repo_registry/core/service.py
@@ -25,6 +25,7 @@ from repo_registry.core.models import (
 )
 from repo_registry.candidate_graph.generator import CandidateGraphGenerator
 from repo_registry.content_indexing.extractor import ContentExtractor
+from repo_registry.core.logging import log_operation
 from repo_registry.llm_extraction.extractor import LLMCandidateExtractor
 from repo_registry.llm_extraction.mapper import LLMExtractionMapper
 from repo_registry.repo_ingestion.git import GitIngestionService
@@ -67,13 +68,21 @@ class RegistryService:
            metadata = self.metadata_extractor.extract(checkout.source_path, url)
        else:
            metadata = None
-        return self.store.create_repository(
+        repository = self.store.create_repository(
            name=name or (metadata.name if metadata is not None else "repository"),
            url=url,
            description=description
            or (metadata.description if metadata is not None else None),
            branch=branch,
        )
+        log_operation(
+            "repository_registered",
+            repository_id=repository.id,
+            repository_name=repository.name,
+            branch=repository.branch,
+            metadata_imported=metadata is not None,
+        )
+        return repository

    def list_repositories(self) -> list[Repository]:
        return self.store.list_repositories()
@@ -108,6 +117,12 @@ class RegistryService:
        repository = self.store.get_repository(repository_id)
        run = self.store.create_analysis_run(repository_id)
        self.store.update_repository_status(repository_id, "analyzing")
+        log_operation(
+            "analysis_started",
+            repository_id=repository_id,
+            analysis_run_id=run.id,
+            source_override=source_path is not None,
+        )
        try:
            if source_path is None:
                checkout = self.ingestion.resolve(repository.url, branch=repository.branch)
@@ -117,6 +132,12 @@ class RegistryService:
            scan_result = self.scanner.scan(scan_source)
        except Exception as exc:
            failed_run = self.store.fail_analysis_run(repository_id, run.id, str(exc))
+            log_operation(
+                "analysis_failed",
+                repository_id=repository_id,
+                analysis_run_id=run.id,
+                error=str(exc),
+            )
            return ScanSummary(analysis_run=failed_run, snapshot=None, facts=[])

        completed_run = self.store.complete_analysis_run(
@@ -145,6 +166,12 @@ class RegistryService:
                stored_chunks,
            )
        except Exception as exc:
+            log_operation(
+                "llm_extraction_failed",
+                repository_id=repository_id,
+                analysis_run_id=completed_run.id,
+                error=str(exc),
+            )
            self.store.create_review_decision(
                repository_id,
                completed_run.id,
@@ -159,12 +186,27 @@ class RegistryService:
            candidate_source = "deterministic"
        self.store.replace_candidate_graph(repository_id, completed_run.id, candidates)
        if candidate_source == "llm":
+            log_operation(
+                "llm_extraction_used",
+                repository_id=repository_id,
+                analysis_run_id=completed_run.id,
+                candidate_count=len(candidates),
+            )
            self.store.create_review_decision(
                repository_id,
                completed_run.id,
                action="llm_extraction_used",
                notes=f"Generated {len(candidates)} candidate ability draft(s).",
            )
+        log_operation(
+            "analysis_completed",
+            repository_id=repository_id,
+            analysis_run_id=completed_run.id,
+            fact_count=len(facts),
+            content_chunk_count=len(stored_chunks),
+            candidate_count=len(candidates),
+            candidate_source=candidate_source,
+        )
        return ScanSummary(
            analysis_run=completed_run,
            snapshot=snapshot,
--- a/src/repo_registry/storage/sqlite.py
+++ b/src/repo_registry/storage/sqlite.py
@@ -27,6 +27,7 @@ from repo_registry.core.models import (
    SourceReference,
    confidence_label,
 )
+from repo_registry.core.logging import log_operation
 from repo_registry.content_indexing.extractor import ContentChunkCandidate
 from repo_registry.candidate_graph.generator import CandidateAbilityDraft
 from repo_registry.repo_scanning.scanner import FactCandidate, ScanResult
@@ -1005,7 +1006,15 @@ class RegistryStore:
                """,
                (repository_id, analysis_run_id, action, notes),
            )
-            return int(cursor.lastrowid)
+            decision_id = int(cursor.lastrowid)
+        log_operation(
+            "review_decision_recorded",
+            repository_id=repository_id,
+            analysis_run_id=analysis_run_id,
+            review_decision_id=decision_id,
+            action=action,
+        )
+        return decision_id

    def list_review_decisions(
        self,
--- a/src/repo_registry/web_api/app.py
+++ b/src/repo_registry/web_api/app.py
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import logging
 from dataclasses import asdict
 from pathlib import Path

@@ -62,6 +63,7 @@ class Settings(BaseSettings):
    llm_provider: str | None = Field(default=None)
    llm_model: str | None = Field(default=None)
    embedding_provider: str | None = Field(default=None)
+    log_level: str = Field(default="INFO")


 def get_settings() -> Settings:
@@ -69,6 +71,9 @@ def get_settings() -> Settings:


 def get_service(settings: Settings = Depends(get_settings)) -> RegistryService:
+    logging.getLogger("repo_registry.operations").setLevel(
+        getattr(logging, settings.log_level.upper(), logging.INFO)
+    )
    database_path = Path(settings.database_path)
    database_path.parent.mkdir(parents=True, exist_ok=True)
    store = RegistryStore(database_path)
@@ -120,8 +125,33 @@ app.include_router(ui_router)


@app.get("/health", tags=["health"])
-def health() -> dict[str, str]:
-    return {"status": "ok"}
+def health(settings: Settings = Depends(get_settings)) -> dict[str, object]:
+    """Report service status plus database and checkout-root details.
+
+    The database is probed by creating the parent directory of the configured
+    path, initializing a ``RegistryStore`` on it, and running ``SELECT 1``.
+
+    Args:
+        settings: Settings supplying ``database_path`` and ``checkout_root``.
+
+    Returns:
+        A dict whose ``status`` is ``"ok"`` when the probe succeeds and
+        ``"degraded"`` otherwise, with ``database`` (path, reachable, error)
+        and ``checkout_root`` (path, exists) sections.
+    """
+    database_path = Path(settings.database_path)
+    checkout_root = Path(settings.checkout_root)
+    database_reachable = False
+    database_error = None
+    try:
+        database_path.parent.mkdir(parents=True, exist_ok=True)
+        store = RegistryStore(database_path)
+        store.initialize()
+        with store.connect() as connection:
+            connection.execute("SELECT 1").fetchone()
+        database_reachable = True
+    except Exception as exc:
+        # Any failure in the probe is reported in the response body rather
+        # than raised, so the endpoint still answers with a degraded status.
+        database_error = str(exc)
+
+    return {
+        # Only the database probe drives the overall status.
+        "status": "ok" if database_reachable else "degraded",
+        "database": {
+            "path": str(database_path),
+            "reachable": database_reachable,
+            "error": database_error,
+        },
+        # The checkout root is informational: its existence is reported but
+        # does not change "status".
+        "checkout_root": {
+            "path": str(checkout_root),
+            "exists": checkout_root.exists(),
+        },
+    }


@app.post(
--- a/tests/test_registry_service.py
+++ b/tests/test_registry_service.py
@@ -1,5 +1,8 @@
+import json
+import logging
 import subprocess

+from repo_registry.core.logging import LOGGER_NAME
 from repo_registry.core.service import RegistryService
 from repo_registry.llm_extraction import ExtractedAbility, ExtractedCapability
 from repo_registry.repo_ingestion.git import GitIngestionService
@@ -459,6 +462,32 @@ def test_register_repository_imports_metadata_when_name_is_omitted(tmp_path):
    assert repository.description == "Imported description."


+def test_operational_logging_records_core_workflow_events(tmp_path, caplog):
+    """Check that register, analyze and approve emit the expected events.
+
+    The name is unique within the module: a second test function with the same
+    name would rebind it and pytest would silently stop collecting this one.
+    """
+    source = tmp_path / "repo"
+    source.mkdir()
+    (source / "README.md").write_text("# Logged\n", encoding="utf-8")
+    (source / "app.py").write_text(
+        "from fastapi import FastAPI\n"
+        "app = FastAPI()\n"
+        '@app.get("/health")\n'
+        "def health():\n"
+        "    return {}\n",
+        encoding="utf-8",
+    )
+    service = make_service(tmp_path)
+
+    with caplog.at_level(logging.INFO, logger=LOGGER_NAME):
+        repository = service.register_repository(name="Logged", url=str(source))
+        summary = service.analyze_repository(repository.id)
+        service.approve_candidate_graph(repository.id, summary.analysis_run.id)
+
+    # Only parse records from the operations logger; records captured from
+    # other loggers are not guaranteed to carry JSON messages.
+    events = [
+        json.loads(record.message)["event"]
+        for record in caplog.records
+        if record.name == LOGGER_NAME
+    ]
+    assert "repository_registered" in events
+    assert "analysis_started" in events
+    assert "analysis_completed" in events
+    assert "review_decision_recorded" in events
+
+
 def test_capability_must_belong_to_repository(tmp_path):
    service = make_service(tmp_path)
    first = service.register_repository(
@@ -1193,3 +1222,42 @@ def test_analyze_repository_clones_git_url_before_scanning(tmp_path):
    fact_names = {(fact.kind, fact.name, fact.path) for fact in summary.facts}
    assert ("documentation", "README", "README.md") in fact_names
    assert ("framework", "pytest", "requirements.txt") in fact_names
+
+
+def test_operational_logging_records_analysis_and_review_events(caplog, tmp_path):
+    """Check operations-logger events for a register/analyze/approve flow.
+
+    Asserts that the expected event names are emitted and that every payload
+    from the operations logger carries the registered repository's id.
+    """
+    source = tmp_path / "repo"
+    source.mkdir()
+    (source / "README.md").write_text("# Logged Service\n", encoding="utf-8")
+    (source / "requirements.txt").write_text("fastapi\n", encoding="utf-8")
+    (source / "app.py").write_text(
+        "from fastapi import FastAPI\n"
+        "app = FastAPI()\n"
+        '@app.get("/health")\n'
+        "def health():\n"
+        "    return {}\n",
+        encoding="utf-8",
+    )
+
+    service = make_service(tmp_path)
+    caplog.set_level(logging.INFO, logger=LOGGER_NAME)
+
+    repository = service.register_repository(name="Logged", url=str(source))
+    summary = service.analyze_repository(repository.id)
+    service.approve_candidate_graph(
+        repository.id,
+        summary.analysis_run.id,
+        notes="Logged approval.",
+    )
+
+    # Keep only records from the operations logger before decoding them as
+    # JSON.
+    payloads = [
+        json.loads(record.message)
+        for record in caplog.records
+        if record.name == LOGGER_NAME
+    ]
+    events = {payload["event"] for payload in payloads}
+
+    assert "repository_registered" in events
+    assert "analysis_started" in events
+    assert "analysis_completed" in events
+    assert "review_decision_recorded" in events
+    assert all(payload["repository_id"] == repository.id for payload in payloads)
--- a/tests/test_web_api.py
+++ b/tests/test_web_api.py
@@ -64,6 +64,30 @@ def test_docs_endpoint_is_available():
    assert "openapi.json" in response.text


+def test_health_reports_database_and_checkout_root(tmp_path):
+    """Check the /health payload against settings pointing into tmp_path."""
+    def override_settings():
+        return Settings(
+            database_path=str(tmp_path / "health.sqlite3"),
+            checkout_root=str(tmp_path / "checkouts"),
+        )
+
+    app.dependency_overrides[get_settings] = override_settings
+    client = TestClient(app)
+    try:
+        response = client.get("/health")
+
+        assert response.status_code == 200
+        body = response.json()
+        assert body["status"] == "ok"
+        assert body["database"]["reachable"] is True
+        assert body["database"]["error"] is None
+        assert body["database"]["path"].endswith("health.sqlite3")
+        assert body["checkout_root"]["path"].endswith("checkouts")
+        # The test never creates the checkout directory, so it is expected to
+        # be reported as missing while status is still "ok".
+        assert body["checkout_root"]["exists"] is False
+    finally:
+        # Remove the settings override so it does not leak into other tests.
+        app.dependency_overrides.clear()
+
+
+
 def test_api_manual_registry_loop(tmp_path):
    def override_settings():
        return Settings(
@@ -429,6 +453,7 @@ def test_settings_can_load_from_environment(monkeypatch):
    monkeypatch.setenv("REPO_REGISTRY_LLM_PROVIDER", "mock")
    monkeypatch.setenv("REPO_REGISTRY_LLM_MODEL", "demo-model")
    monkeypatch.setenv("REPO_REGISTRY_EMBEDDING_PROVIDER", "hashing")
+    monkeypatch.setenv("REPO_REGISTRY_LOG_LEVEL", "DEBUG")

    settings = Settings()

@@ -437,6 +462,7 @@ def test_settings_can_load_from_environment(monkeypatch):
    assert settings.llm_provider == "mock"
    assert settings.llm_model == "demo-model"
    assert settings.embedding_provider == "hashing"
+    assert settings.log_level == "DEBUG"


 def test_api_analysis_run_loop(tmp_path):
--- a/workplans/RREG-WP-0002-production-hardening.md
+++ b/workplans/RREG-WP-0002-production-hardening.md
@@ -84,7 +84,7 @@ do not become approved truth without review.

 ```task
 id: RREG-WP-0002-T05
-status: todo
+status: done
 priority: low
 state_hub_task_id: "44b10491-f1f2-4e2e-9a8e-e8bd59cbf892"
 ```