From 99bb851ca841a76a360d2af75aea5e2ee4880882 Mon Sep 17 00:00:00 2001 From: tegwick Date: Sun, 26 Apr 2026 17:02:24 +0200 Subject: [PATCH] structured logging around key workflows and docs for operational readiness --- docs/operations.md | 93 +++++++++++++++++++ src/repo_registry/core/logging.py | 15 +++ src/repo_registry/core/service.py | 44 ++++++++- src/repo_registry/storage/sqlite.py | 11 ++- src/repo_registry/web_api/app.py | 34 ++++++- tests/test_registry_service.py | 68 ++++++++++++++ tests/test_web_api.py | 26 ++++++ .../RREG-WP-0002-production-hardening.md | 2 +- 8 files changed, 288 insertions(+), 5 deletions(-) create mode 100644 docs/operations.md create mode 100644 src/repo_registry/core/logging.py diff --git a/docs/operations.md b/docs/operations.md new file mode 100644 index 0000000..30d5b23 --- /dev/null +++ b/docs/operations.md @@ -0,0 +1,93 @@ +# Operational Readiness + +This note captures the runtime knobs and baseline operating procedures for the +Repository Ability Registry service. + +## Configuration + +Configuration is read from environment variables with the `REPO_REGISTRY_` +prefix. + +| Variable | Default | Purpose | +| --- | --- | --- | +| `REPO_REGISTRY_DATABASE_PATH` | `var/repo-registry.sqlite3` | SQLite database file used by the default store. | +| `REPO_REGISTRY_CHECKOUT_ROOT` | `var/checkouts` | Local checkout cache used during repository ingestion. | +| `REPO_REGISTRY_LLM_PROVIDER` | unset | Optional LLM provider name for candidate extraction. | +| `REPO_REGISTRY_LLM_MODEL` | unset | Optional model name passed to the configured LLM provider. | +| `REPO_REGISTRY_EMBEDDING_PROVIDER` | unset | Set to `hashing` to enable deterministic local hybrid search scoring. | +| `REPO_REGISTRY_LOG_LEVEL` | `INFO` | Log level for the `repo_registry.operations` structured event logger. 
| + +## Health Checks + +`GET /health` returns service status plus the operational dependencies that can +be checked locally: + +```json +{ + "status": "ok", + "database": { + "path": "var/repo-registry.sqlite3", + "reachable": true, + "error": null + }, + "checkout_root": { + "path": "var/checkouts", + "exists": true + } +} +``` + +`status` is `degraded` when the database cannot be initialized or queried. The +checkout root is reported as metadata because it may be created lazily by the +ingestion path. + +## Structured Logs + +Operational events are emitted through the `repo_registry.operations` logger as +single-line JSON messages. Current events include repository registration, +analysis start/completion/failure, LLM extraction usage/failure, and review +decisions. + +Configure the Python or ASGI server logging stack to route this logger to the +same sink as application logs. `REPO_REGISTRY_LOG_LEVEL` controls the logger +level used by API-created service instances. + +## SQLite Backup And Restore + +For single-node SQLite deployments, prefer the SQLite backup API so readers can +continue while the backup is created: + +```bash +mkdir -p backups +sqlite3 var/repo-registry.sqlite3 ".backup 'backups/repo-registry-$(date +%F).sqlite3'" +``` + +For the most conservative backup window, stop writes first, run the backup, then +resume the service. Verify a backup with: + +```bash +sqlite3 backups/repo-registry-YYYY-MM-DD.sqlite3 "PRAGMA integrity_check;" +``` + +To restore, stop the service, move the current database aside, copy the backup to +`REPO_REGISTRY_DATABASE_PATH`, start the service, and verify `GET /health`. + +## PostgreSQL Migration Notes + +The storage interface is intentionally kept behind `RegistryStore` so a +PostgreSQL-backed implementation can be introduced alongside SQLite before +cutover. A production migration should: + +1. 
LOGGER_NAME = "repo_registry.operations"

# Resolved once at import time; ``logging.getLogger`` hands back the same
# logger object for a given name, so level changes made elsewhere still apply.
_logger = logging.getLogger(LOGGER_NAME)


def log_operation(event: str, **fields: Any) -> None:
    """Emit one operational event as a single-line JSON message at INFO level.

    The message is a JSON object with sorted keys, written to the
    ``LOGGER_NAME`` logger.

    Args:
        event: Event name, stored under the ``"event"`` key of the payload.
        **fields: Additional key/value pairs merged into the payload. Values
            the ``json`` module cannot encode natively are rendered with
            ``str``.
    """
    # Build and serialize the payload only when the record would actually be
    # emitted: a filtered-out event then costs nothing, and a field that JSON
    # cannot encode cannot raise inside the calling workflow while operational
    # logging is turned down.
    if not _logger.isEnabledFor(logging.INFO):
        return
    payload = {"event": event, **fields}
    _logger.info(json.dumps(payload, sort_keys=True, default=str))
self.metadata_extractor.extract(checkout.source_path, url) else: metadata = None - return self.store.create_repository( + repository = self.store.create_repository( name=name or (metadata.name if metadata is not None else "repository"), url=url, description=description or (metadata.description if metadata is not None else None), branch=branch, ) + log_operation( + "repository_registered", + repository_id=repository.id, + repository_name=repository.name, + branch=repository.branch, + metadata_imported=metadata is not None, + ) + return repository def list_repositories(self) -> list[Repository]: return self.store.list_repositories() @@ -108,6 +117,12 @@ class RegistryService: repository = self.store.get_repository(repository_id) run = self.store.create_analysis_run(repository_id) self.store.update_repository_status(repository_id, "analyzing") + log_operation( + "analysis_started", + repository_id=repository_id, + analysis_run_id=run.id, + source_override=source_path is not None, + ) try: if source_path is None: checkout = self.ingestion.resolve(repository.url, branch=repository.branch) @@ -117,6 +132,12 @@ class RegistryService: scan_result = self.scanner.scan(scan_source) except Exception as exc: failed_run = self.store.fail_analysis_run(repository_id, run.id, str(exc)) + log_operation( + "analysis_failed", + repository_id=repository_id, + analysis_run_id=run.id, + error=str(exc), + ) return ScanSummary(analysis_run=failed_run, snapshot=None, facts=[]) completed_run = self.store.complete_analysis_run( @@ -145,6 +166,12 @@ class RegistryService: stored_chunks, ) except Exception as exc: + log_operation( + "llm_extraction_failed", + repository_id=repository_id, + analysis_run_id=completed_run.id, + error=str(exc), + ) self.store.create_review_decision( repository_id, completed_run.id, @@ -159,12 +186,27 @@ class RegistryService: candidate_source = "deterministic" self.store.replace_candidate_graph(repository_id, completed_run.id, candidates) if candidate_source == 
"llm": + log_operation( + "llm_extraction_used", + repository_id=repository_id, + analysis_run_id=completed_run.id, + candidate_count=len(candidates), + ) self.store.create_review_decision( repository_id, completed_run.id, action="llm_extraction_used", notes=f"Generated {len(candidates)} candidate ability draft(s).", ) + log_operation( + "analysis_completed", + repository_id=repository_id, + analysis_run_id=completed_run.id, + fact_count=len(facts), + content_chunk_count=len(stored_chunks), + candidate_count=len(candidates), + candidate_source=candidate_source, + ) return ScanSummary( analysis_run=completed_run, snapshot=snapshot, diff --git a/src/repo_registry/storage/sqlite.py b/src/repo_registry/storage/sqlite.py index 5fdf6e8..28b62a9 100644 --- a/src/repo_registry/storage/sqlite.py +++ b/src/repo_registry/storage/sqlite.py @@ -27,6 +27,7 @@ from repo_registry.core.models import ( SourceReference, confidence_label, ) +from repo_registry.core.logging import log_operation from repo_registry.content_indexing.extractor import ContentChunkCandidate from repo_registry.candidate_graph.generator import CandidateAbilityDraft from repo_registry.repo_scanning.scanner import FactCandidate, ScanResult @@ -1005,7 +1006,15 @@ class RegistryStore: """, (repository_id, analysis_run_id, action, notes), ) - return int(cursor.lastrowid) + decision_id = int(cursor.lastrowid) + log_operation( + "review_decision_recorded", + repository_id=repository_id, + analysis_run_id=analysis_run_id, + review_decision_id=decision_id, + action=action, + ) + return decision_id def list_review_decisions( self, diff --git a/src/repo_registry/web_api/app.py b/src/repo_registry/web_api/app.py index 2772552..9e8c9da 100644 --- a/src/repo_registry/web_api/app.py +++ b/src/repo_registry/web_api/app.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging from dataclasses import asdict from pathlib import Path @@ -62,6 +63,7 @@ class Settings(BaseSettings): llm_provider: str | None = 
def _probe_database(database_path: Path) -> str | None:
    """Return ``None`` when the database answers a trivial query, else the error text."""
    try:
        database_path.parent.mkdir(parents=True, exist_ok=True)
        probe_store = RegistryStore(database_path)
        probe_store.initialize()
        with probe_store.connect() as connection:
            connection.execute("SELECT 1").fetchone()
    except Exception as exc:
        # Any failure is reported in the response body instead of being raised.
        return str(exc)
    return None


# Health endpoint: reports "ok" when the database probe succeeds and
# "degraded" otherwise, together with the database path/error and whether the
# checkout root currently exists on disk.
# Documented with comments rather than a docstring so the route's generated
# OpenAPI description stays exactly as it was.
@app.get("/health", tags=["health"])
def health(settings: Settings = Depends(get_settings)) -> dict[str, object]:
    database_path = Path(settings.database_path)
    checkout_root = Path(settings.checkout_root)

    failure = _probe_database(database_path)
    reachable = failure is None

    database_report = {
        "path": str(database_path),
        "reachable": reachable,
        "error": failure,
    }
    checkout_report = {
        "path": str(checkout_root),
        "exists": checkout_root.exists(),
    }
    return {
        "status": "ok" if reachable else "degraded",
        "database": database_report,
        "checkout_root": checkout_report,
    }
def test_operational_logging_emits_core_workflow_events(tmp_path, caplog):
    """Register, analyze and approve a repository; expect the core events to be logged.

    Named differently from ``test_operational_logging_records_analysis_and_review_events``
    defined later in this module: two module-level functions with the same
    name would leave only the later one bound, so this test would never be
    collected.
    """
    source = tmp_path / "repo"
    source.mkdir()
    (source / "README.md").write_text("# Logged\n", encoding="utf-8")
    (source / "app.py").write_text(
        "from fastapi import FastAPI\n"
        "app = FastAPI()\n"
        '@app.get("/health")\n'
        "def health():\n"
        " return {}\n",
        encoding="utf-8",
    )
    service = make_service(tmp_path)

    with caplog.at_level(logging.INFO, logger=LOGGER_NAME):
        repository = service.register_repository(name="Logged", url=str(source))
        summary = service.analyze_repository(repository.id)
        service.approve_candidate_graph(repository.id, summary.analysis_run.id)

    # Only records from the operations logger are JSON; anything another logger
    # emitted during the run must not be fed to ``json.loads``.
    events = [
        json.loads(record.message)["event"]
        for record in caplog.records
        if record.name == LOGGER_NAME
    ]
    assert "repository_registered" in events
    assert "analysis_started" in events
    assert "analysis_completed" in events
    assert "review_decision_recorded" in events
def test_health_reports_database_and_checkout_root(tmp_path):
    """GET /health reports a reachable database and a checkout root that does not exist yet."""
    database_file = tmp_path / "health.sqlite3"
    checkout_dir = tmp_path / "checkouts"

    def settings_for_test():
        return Settings(
            database_path=str(database_file),
            checkout_root=str(checkout_dir),
        )

    app.dependency_overrides[get_settings] = settings_for_test
    client = TestClient(app)
    try:
        response = client.get("/health")

        assert response.status_code == 200
        body = response.json()
        assert body["status"] == "ok"

        database = body["database"]
        assert database["reachable"] is True
        assert database["error"] is None
        assert database["path"].endswith("health.sqlite3")

        checkout = body["checkout_root"]
        assert checkout["path"].endswith("checkouts")
        assert checkout["exists"] is False
    finally:
        # Always drop the override so later tests see the real settings.
        app.dependency_overrides.clear()
monkeypatch.setenv("REPO_REGISTRY_LLM_MODEL", "demo-model") monkeypatch.setenv("REPO_REGISTRY_EMBEDDING_PROVIDER", "hashing") + monkeypatch.setenv("REPO_REGISTRY_LOG_LEVEL", "DEBUG") settings = Settings() @@ -437,6 +462,7 @@ def test_settings_can_load_from_environment(monkeypatch): assert settings.llm_provider == "mock" assert settings.llm_model == "demo-model" assert settings.embedding_provider == "hashing" + assert settings.log_level == "DEBUG" def test_api_analysis_run_loop(tmp_path): diff --git a/workplans/RREG-WP-0002-production-hardening.md b/workplans/RREG-WP-0002-production-hardening.md index 142320a..395c2fe 100644 --- a/workplans/RREG-WP-0002-production-hardening.md +++ b/workplans/RREG-WP-0002-production-hardening.md @@ -84,7 +84,7 @@ do not become approved truth without review. ```task id: RREG-WP-0002-T05 -status: todo +status: done priority: low state_hub_task_id: "44b10491-f1f2-4e2e-9a8e-e8bd59cbf892" ```