structured logging around key workflows and docs for operational readiness

This commit is contained in:
2026-04-26 17:02:24 +02:00
parent 2902e362df
commit 99bb851ca8
8 changed files with 288 additions and 5 deletions

93
docs/operations.md Normal file
View File

@@ -0,0 +1,93 @@
# Operational Readiness
This note captures the runtime knobs and baseline operating procedures for the
Repository Ability Registry service.
## Configuration
Configuration is read from environment variables with the `REPO_REGISTRY_`
prefix.
| Variable | Default | Purpose |
| --- | --- | --- |
| `REPO_REGISTRY_DATABASE_PATH` | `var/repo-registry.sqlite3` | SQLite database file used by the default store. |
| `REPO_REGISTRY_CHECKOUT_ROOT` | `var/checkouts` | Local checkout cache used during repository ingestion. |
| `REPO_REGISTRY_LLM_PROVIDER` | unset | Optional LLM provider name for candidate extraction. |
| `REPO_REGISTRY_LLM_MODEL` | unset | Optional model name passed to the configured LLM provider. |
| `REPO_REGISTRY_EMBEDDING_PROVIDER` | unset | Set to `hashing` to enable deterministic local hybrid search scoring. |
| `REPO_REGISTRY_LOG_LEVEL` | `INFO` | Log level for the `repo_registry.operations` structured event logger. |
## Health Checks
`GET /health` returns service status plus the operational dependencies that can
be checked locally:
```json
{
"status": "ok",
"database": {
"path": "var/repo-registry.sqlite3",
"reachable": true,
"error": null
},
"checkout_root": {
"path": "var/checkouts",
"exists": true
}
}
```
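`status` is `degraded` when the database cannot be initialized or queried; in
that case `database.error` carries the exception message. The endpoint still
answers with HTTP 200 when degraded, so probes must inspect the `status` field
rather than rely on the HTTP status code. The checkout root is reported as
metadata only and does not affect `status`, because it may be created lazily by
the ingestion path.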
## Structured Logs
Operational events are emitted through the `repo_registry.operations` logger as
single-line JSON messages. Current events include repository registration,
analysis start/completion/failure, LLM extraction usage/failure, and review
decisions.
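Configure the Python or ASGI server logging stack to route this logger to the
same sink as application logs. `REPO_REGISTRY_LOG_LEVEL` controls the logger
level used by API-created service instances. All current events are emitted at
`INFO`, so a level above `INFO` (for example `WARNING`) suppresses them, and an
unrecognized level name falls back to `INFO`.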
## SQLite Backup And Restore
For single-node SQLite deployments, prefer the SQLite backup API so readers can
continue while the backup is created:
```bash
mkdir -p backups
sqlite3 var/repo-registry.sqlite3 ".backup 'backups/repo-registry-$(date +%F).sqlite3'"
```
For the most conservative backup window, stop writes first, run the backup, then
resume the service. Verify a backup with the command below, which prints `ok`
when the file is intact:
```bash
sqlite3 backups/repo-registry-YYYY-MM-DD.sqlite3 "PRAGMA integrity_check;"
```
To restore, stop the service, move the current database aside (together with any
`-wal` and `-shm` files next to it), copy the backup to
`REPO_REGISTRY_DATABASE_PATH`, start the service, and verify that `GET /health`
reports `"status": "ok"`.
## PostgreSQL Migration Notes
The storage interface is intentionally kept behind `RegistryStore` so a
PostgreSQL-backed implementation can be introduced alongside SQLite before
cutover. A production migration should:
1. Add a PostgreSQL store that preserves the current repository, analysis,
observed fact, content chunk, candidate, approved registry, and review
decision contracts.
2. Manage schema changes with explicit migrations rather than implicit table
creation.
3. Export from SQLite and import into PostgreSQL in a repeatable script, then
compare repository counts, approved ability maps, search results, and recent
review decisions.
4. Keep vector search optional. If pgvector is enabled, follow the plan in
`docs/semantic-retrieval.md` and validate hybrid ranking before making it the
default.
5. Take a final SQLite backup immediately before cutover and retain it until the
PostgreSQL deployment has passed health and smoke tests.

View File

@@ -0,0 +1,15 @@
from __future__ import annotations
import json
import logging
from typing import Any
LOGGER_NAME = "repo_registry.operations"


def log_operation(event: str, **fields: Any) -> None:
    """Emit one operational event as a single-line JSON log record.

    The record is written at INFO level to the ``LOGGER_NAME`` logger. The
    payload is ``{"event": event, **fields}`` serialized with sorted keys;
    values that ``json`` cannot encode natively are rendered with ``str``.

    Args:
        event: Event name stored under the ``"event"`` key of the payload.
        **fields: Additional key/value pairs merged into the payload.
    """
    logger = logging.getLogger(LOGGER_NAME)
    # Skip building and serializing the payload when the record would be
    # dropped anyway, so a silenced logger adds no cost (and no serialization
    # failure) to the workflows that call this helper.
    if not logger.isEnabledFor(logging.INFO):
        return
    payload = {"event": event, **fields}
    logger.info(json.dumps(payload, sort_keys=True, default=str))

View File

@@ -25,6 +25,7 @@ from repo_registry.core.models import (
)
from repo_registry.candidate_graph.generator import CandidateGraphGenerator
from repo_registry.content_indexing.extractor import ContentExtractor
from repo_registry.core.logging import log_operation
from repo_registry.llm_extraction.extractor import LLMCandidateExtractor
from repo_registry.llm_extraction.mapper import LLMExtractionMapper
from repo_registry.repo_ingestion.git import GitIngestionService
@@ -67,13 +68,21 @@ class RegistryService:
metadata = self.metadata_extractor.extract(checkout.source_path, url)
else:
metadata = None
return self.store.create_repository(
repository = self.store.create_repository(
name=name or (metadata.name if metadata is not None else "repository"),
url=url,
description=description
or (metadata.description if metadata is not None else None),
branch=branch,
)
log_operation(
"repository_registered",
repository_id=repository.id,
repository_name=repository.name,
branch=repository.branch,
metadata_imported=metadata is not None,
)
return repository
def list_repositories(self) -> list[Repository]:
return self.store.list_repositories()
@@ -108,6 +117,12 @@ class RegistryService:
repository = self.store.get_repository(repository_id)
run = self.store.create_analysis_run(repository_id)
self.store.update_repository_status(repository_id, "analyzing")
log_operation(
"analysis_started",
repository_id=repository_id,
analysis_run_id=run.id,
source_override=source_path is not None,
)
try:
if source_path is None:
checkout = self.ingestion.resolve(repository.url, branch=repository.branch)
@@ -117,6 +132,12 @@ class RegistryService:
scan_result = self.scanner.scan(scan_source)
except Exception as exc:
failed_run = self.store.fail_analysis_run(repository_id, run.id, str(exc))
log_operation(
"analysis_failed",
repository_id=repository_id,
analysis_run_id=run.id,
error=str(exc),
)
return ScanSummary(analysis_run=failed_run, snapshot=None, facts=[])
completed_run = self.store.complete_analysis_run(
@@ -145,6 +166,12 @@ class RegistryService:
stored_chunks,
)
except Exception as exc:
log_operation(
"llm_extraction_failed",
repository_id=repository_id,
analysis_run_id=completed_run.id,
error=str(exc),
)
self.store.create_review_decision(
repository_id,
completed_run.id,
@@ -159,12 +186,27 @@ class RegistryService:
candidate_source = "deterministic"
self.store.replace_candidate_graph(repository_id, completed_run.id, candidates)
if candidate_source == "llm":
log_operation(
"llm_extraction_used",
repository_id=repository_id,
analysis_run_id=completed_run.id,
candidate_count=len(candidates),
)
self.store.create_review_decision(
repository_id,
completed_run.id,
action="llm_extraction_used",
notes=f"Generated {len(candidates)} candidate ability draft(s).",
)
log_operation(
"analysis_completed",
repository_id=repository_id,
analysis_run_id=completed_run.id,
fact_count=len(facts),
content_chunk_count=len(stored_chunks),
candidate_count=len(candidates),
candidate_source=candidate_source,
)
return ScanSummary(
analysis_run=completed_run,
snapshot=snapshot,

View File

@@ -27,6 +27,7 @@ from repo_registry.core.models import (
SourceReference,
confidence_label,
)
from repo_registry.core.logging import log_operation
from repo_registry.content_indexing.extractor import ContentChunkCandidate
from repo_registry.candidate_graph.generator import CandidateAbilityDraft
from repo_registry.repo_scanning.scanner import FactCandidate, ScanResult
@@ -1005,7 +1006,15 @@ class RegistryStore:
""",
(repository_id, analysis_run_id, action, notes),
)
return int(cursor.lastrowid)
decision_id = int(cursor.lastrowid)
log_operation(
"review_decision_recorded",
repository_id=repository_id,
analysis_run_id=analysis_run_id,
review_decision_id=decision_id,
action=action,
)
return decision_id
def list_review_decisions(
self,

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
import logging
from dataclasses import asdict
from pathlib import Path
@@ -62,6 +63,7 @@ class Settings(BaseSettings):
llm_provider: str | None = Field(default=None)
llm_model: str | None = Field(default=None)
embedding_provider: str | None = Field(default=None)
log_level: str = Field(default="INFO")
def get_settings() -> Settings:
@@ -69,6 +71,9 @@ def get_settings() -> Settings:
def get_service(settings: Settings = Depends(get_settings)) -> RegistryService:
logging.getLogger("repo_registry.operations").setLevel(
getattr(logging, settings.log_level.upper(), logging.INFO)
)
database_path = Path(settings.database_path)
database_path.parent.mkdir(parents=True, exist_ok=True)
store = RegistryStore(database_path)
@@ -120,8 +125,33 @@ app.include_router(ui_router)
@app.get("/health", tags=["health"])
def health() -> dict[str, str]:
return {"status": "ok"}
def health(settings: Settings = Depends(get_settings)) -> dict[str, object]:
    """Report service status together with database and checkout-root details.

    The database probe creates the parent directory of the configured database
    path, initializes a ``RegistryStore`` on it, and runs ``SELECT 1``. Any
    exception raised along the way marks the service as ``degraded`` and is
    reported as text under ``database.error``. The checkout root is only
    described (path and existence); it does not influence ``status``.
    """
    db_file = Path(settings.database_path)
    checkouts_dir = Path(settings.checkout_root)

    try:
        db_file.parent.mkdir(parents=True, exist_ok=True)
        store = RegistryStore(db_file)
        store.initialize()
        with store.connect() as connection:
            connection.execute("SELECT 1").fetchone()
    except Exception as exc:
        reachable, failure = False, str(exc)
    else:
        reachable, failure = True, None

    database_report = {
        "path": str(db_file),
        "reachable": reachable,
        "error": failure,
    }
    checkout_report = {
        "path": str(checkouts_dir),
        "exists": checkouts_dir.exists(),
    }
    return {
        "status": "ok" if reachable else "degraded",
        "database": database_report,
        "checkout_root": checkout_report,
    }
@app.post(

View File

@@ -1,5 +1,8 @@
import json
import logging
import subprocess
from repo_registry.core.logging import LOGGER_NAME
from repo_registry.core.service import RegistryService
from repo_registry.llm_extraction import ExtractedAbility, ExtractedCapability
from repo_registry.repo_ingestion.git import GitIngestionService
@@ -459,6 +462,32 @@ def test_register_repository_imports_metadata_when_name_is_omitted(tmp_path):
assert repository.description == "Imported description."
def test_operational_logging_emits_json_events(tmp_path, caplog):
    """Registering, analyzing, and approving emit the expected JSON events.

    This test has its own name because the module defines
    ``test_operational_logging_records_analysis_and_review_events`` again
    further down; with two identical names the later definition replaces this
    one at import time and this test would never be collected.
    """
    source = tmp_path / "repo"
    source.mkdir()
    (source / "README.md").write_text("# Logged\n", encoding="utf-8")
    (source / "app.py").write_text(
        "from fastapi import FastAPI\n"
        "app = FastAPI()\n"
        '@app.get("/health")\n'
        "def health():\n"
        " return {}\n",
        encoding="utf-8",
    )
    service = make_service(tmp_path)

    with caplog.at_level(logging.INFO, logger=LOGGER_NAME):
        repository = service.register_repository(name="Logged", url=str(source))
        summary = service.analyze_repository(repository.id)
        service.approve_candidate_graph(repository.id, summary.analysis_run.id)

    # Parse only records from the operations logger: caplog also captures
    # records from other loggers, and their messages are not JSON.
    events = [
        json.loads(record.message)["event"]
        for record in caplog.records
        if record.name == LOGGER_NAME
    ]
    assert "repository_registered" in events
    assert "analysis_started" in events
    assert "analysis_completed" in events
    assert "review_decision_recorded" in events
def test_capability_must_belong_to_repository(tmp_path):
service = make_service(tmp_path)
first = service.register_repository(
@@ -1193,3 +1222,42 @@ def test_analyze_repository_clones_git_url_before_scanning(tmp_path):
fact_names = {(fact.kind, fact.name, fact.path) for fact in summary.facts}
assert ("documentation", "README", "README.md") in fact_names
assert ("framework", "pytest", "requirements.txt") in fact_names
def test_operational_logging_records_analysis_and_review_events(caplog, tmp_path):
    """The register/analyze/approve workflow logs structured operation events.

    Every payload emitted on the operations logger must be valid JSON, cover
    the four expected event names, and reference the registered repository.
    """
    repo_dir = tmp_path / "repo"
    repo_dir.mkdir()
    fixture_files = {
        "README.md": "# Logged Service\n",
        "requirements.txt": "fastapi\n",
        "app.py": (
            "from fastapi import FastAPI\n"
            "app = FastAPI()\n"
            '@app.get("/health")\n'
            "def health():\n"
            " return {}\n"
        ),
    }
    for file_name, content in fixture_files.items():
        (repo_dir / file_name).write_text(content, encoding="utf-8")

    service = make_service(tmp_path)
    caplog.set_level(logging.INFO, logger=LOGGER_NAME)

    registered = service.register_repository(name="Logged", url=str(repo_dir))
    scan = service.analyze_repository(registered.id)
    service.approve_candidate_graph(
        registered.id,
        scan.analysis_run.id,
        notes="Logged approval.",
    )

    # Keep only the records that came from the operations logger.
    operation_payloads = []
    for record in caplog.records:
        if record.name != LOGGER_NAME:
            continue
        operation_payloads.append(json.loads(record.message))
    seen_events = {entry["event"] for entry in operation_payloads}

    for expected_event in (
        "repository_registered",
        "analysis_started",
        "analysis_completed",
        "review_decision_recorded",
    ):
        assert expected_event in seen_events
    for entry in operation_payloads:
        assert entry["repository_id"] == registered.id

View File

@@ -64,6 +64,30 @@ def test_docs_endpoint_is_available():
assert "openapi.json" in response.text
def test_health_reports_database_and_checkout_root(tmp_path):
    """``GET /health`` reports a reachable database and a missing checkout root.

    Settings are overridden so the database and checkout root both live under
    ``tmp_path``; the checkout directory is never created by this test.
    """
    db_file = tmp_path / "health.sqlite3"
    checkouts_dir = tmp_path / "checkouts"

    def override_settings():
        return Settings(
            database_path=str(db_file),
            checkout_root=str(checkouts_dir),
        )

    app.dependency_overrides[get_settings] = override_settings
    client = TestClient(app)
    try:
        response = client.get("/health")
        assert response.status_code == 200

        body = response.json()
        assert body["status"] == "ok"

        database = body["database"]
        assert database["reachable"] is True
        assert database["error"] is None
        assert database["path"].endswith("health.sqlite3")

        checkout_root = body["checkout_root"]
        assert checkout_root["path"].endswith("checkouts")
        assert checkout_root["exists"] is False
    finally:
        # Always drop the override so later tests see the real settings.
        app.dependency_overrides.clear()
def test_api_manual_registry_loop(tmp_path):
def override_settings():
return Settings(
@@ -429,6 +453,7 @@ def test_settings_can_load_from_environment(monkeypatch):
monkeypatch.setenv("REPO_REGISTRY_LLM_PROVIDER", "mock")
monkeypatch.setenv("REPO_REGISTRY_LLM_MODEL", "demo-model")
monkeypatch.setenv("REPO_REGISTRY_EMBEDDING_PROVIDER", "hashing")
monkeypatch.setenv("REPO_REGISTRY_LOG_LEVEL", "DEBUG")
settings = Settings()
@@ -437,6 +462,7 @@ def test_settings_can_load_from_environment(monkeypatch):
assert settings.llm_provider == "mock"
assert settings.llm_model == "demo-model"
assert settings.embedding_provider == "hashing"
assert settings.log_level == "DEBUG"
def test_api_analysis_run_loop(tmp_path):

View File

@@ -84,7 +84,7 @@ do not become approved truth without review.
```task
id: RREG-WP-0002-T05
status: todo
status: done
priority: low
state_hub_task_id: "44b10491-f1f2-4e2e-9a8e-e8bd59cbf892"
```