"""SQLite-backed persistence layer for the repository registry."""

from __future__ import annotations

import json
import sqlite3
from collections.abc import Iterator
from contextlib import contextmanager
from pathlib import Path

from repo_registry.core.models import (
    Ability,
    AnalysisRun,
    Capability,
    Evidence,
    Feature,
    ObservedFact,
    Repository,
    RepositoryAbilityMap,
    RepositorySnapshot,
    SearchResult,
)
from repo_registry.repo_scanning.scanner import FactCandidate, ScanResult


class NotFoundError(ValueError):
    """Raised when a requested registry record does not exist."""


class RegistryStore:
    """Read/write access to the registry's SQLite database.

    Every public method opens its own short-lived connection, runs inside a
    single transaction, and closes the connection before returning.
    """

    def __init__(self, database_path: str | Path) -> None:
        """Remember the database location; no connection is opened here."""
        self.database_path = str(database_path)

    def initialize(self) -> None:
        """Apply the initial migration script to the database."""
        migration_path = Path(__file__).parents[3] / "migrations" / "0001_initial.sql"
        with self._session() as connection:
            connection.executescript(migration_path.read_text(encoding="utf-8"))

    def connect(self) -> sqlite3.Connection:
        """Open a new connection with row access by name and foreign keys on.

        The caller owns the returned connection and must close it. Using the
        connection as a context manager only commits or rolls back the
        transaction; it does not close the connection.
        """
        connection = sqlite3.connect(self.database_path)
        connection.row_factory = sqlite3.Row
        connection.execute("PRAGMA foreign_keys = ON")
        return connection

    @contextmanager
    def _session(self) -> Iterator[sqlite3.Connection]:
        """Yield a connection inside one transaction, then always close it.

        The transaction is committed when the block exits normally and rolled
        back when it raises.
        """
        connection = self.connect()
        try:
            with connection:
                yield connection
        finally:
            connection.close()

    def create_repository(
        self,
        *,
        name: str,
        url: str,
        description: str | None,
        branch: str,
    ) -> Repository:
        """Insert a repository row and return the stored record."""
        with self._session() as connection:
            cursor = connection.execute(
                """
                INSERT INTO repositories (name, url, description, branch)
                VALUES (?, ?, ?, ?)
                """,
                (name, url, description, branch),
            )
            repository_id = int(cursor.lastrowid)
        # Read back only after the insert has been committed: the lookup uses
        # a separate connection.
        return self.get_repository(repository_id)

    def update_repository_status(self, repository_id: int, status: str) -> None:
        """Set a repository's status and refresh its ``updated_at`` timestamp.

        Raises:
            NotFoundError: if no repository has the given id.
        """
        with self._session() as connection:
            cursor = connection.execute(
                """
                UPDATE repositories
                SET status = ?, updated_at = CURRENT_TIMESTAMP
                WHERE id = ?
                """,
                (status, repository_id),
            )
            if cursor.rowcount == 0:
                raise NotFoundError(f"repository {repository_id} was not found")

    def list_repositories(self) -> list[Repository]:
        """Return all repositories, newest first."""
        with self._session() as connection:
            rows = connection.execute(
                """
                SELECT id, name, url, description, branch, status
                FROM repositories
                ORDER BY created_at DESC, id DESC
                """
            ).fetchall()
        return [self._repository_from_row(row) for row in rows]

    def get_repository(self, repository_id: int) -> Repository:
        """Return the repository with the given id.

        Raises:
            NotFoundError: if no repository has the given id.
        """
        with self._session() as connection:
            row = connection.execute(
                """
                SELECT id, name, url, description, branch, status
                FROM repositories
                WHERE id = ?
                """,
                (repository_id,),
            ).fetchone()
        if row is None:
            raise NotFoundError(f"repository {repository_id} was not found")
        return self._repository_from_row(row)

    def create_analysis_run(self, repository_id: int) -> AnalysisRun:
        """Start a new analysis run in the ``running`` state.

        Raises:
            NotFoundError: if the repository does not exist.
        """
        self.get_repository(repository_id)
        with self._session() as connection:
            cursor = connection.execute(
                """
                INSERT INTO analysis_runs (repository_id, status)
                VALUES (?, 'running')
                """,
                (repository_id,),
            )
            analysis_run_id = int(cursor.lastrowid)
        return self.get_analysis_run(repository_id, analysis_run_id)

    def complete_analysis_run(
        self,
        repository_id: int,
        analysis_run_id: int,
        scan_result: ScanResult,
    ) -> AnalysisRun:
        """Record a successful scan and return the updated analysis run.

        In one transaction this stores a snapshot, stores the scan's facts,
        marks the run ``completed`` and marks the repository ``analyzed``.

        Raises:
            NotFoundError: if the run does not exist for this repository; in
                that case nothing is persisted.
        """
        with self._session() as connection:
            snapshot_cursor = connection.execute(
                """
                INSERT INTO repository_snapshots
                    (repository_id, commit_hash, branch, source_path, file_count)
                VALUES (?, ?, ?, ?, ?)
                """,
                (
                    repository_id,
                    scan_result.commit_hash,
                    scan_result.branch,
                    scan_result.source_path,
                    scan_result.file_count,
                ),
            )
            snapshot_id = int(snapshot_cursor.lastrowid)
            self._insert_facts(
                connection,
                repository_id=repository_id,
                analysis_run_id=analysis_run_id,
                snapshot_id=snapshot_id,
                facts=scan_result.facts,
            )
            run_cursor = connection.execute(
                """
                UPDATE analysis_runs
                SET status = 'completed',
                    snapshot_id = ?,
                    completed_at = CURRENT_TIMESTAMP,
                    error_message = NULL
                WHERE id = ? AND repository_id = ?
                """,
                (snapshot_id, analysis_run_id, repository_id),
            )
            if run_cursor.rowcount == 0:
                # Raising inside the transaction rolls back the snapshot and
                # facts instead of committing them for a run that is missing.
                raise NotFoundError(
                    f"analysis run {analysis_run_id} was not found for repository {repository_id}"
                )
            connection.execute(
                """
                UPDATE repositories
                SET status = 'analyzed', updated_at = CURRENT_TIMESTAMP
                WHERE id = ?
                """,
                (repository_id,),
            )
        return self.get_analysis_run(repository_id, analysis_run_id)

    def fail_analysis_run(
        self,
        repository_id: int,
        analysis_run_id: int,
        error_message: str,
    ) -> AnalysisRun:
        """Mark a run ``failed`` and the repository ``analysis_failed``.

        Raises:
            NotFoundError: if the run does not exist for this repository; the
                repository status is left untouched in that case.
        """
        with self._session() as connection:
            cursor = connection.execute(
                """
                UPDATE analysis_runs
                SET status = 'failed',
                    completed_at = CURRENT_TIMESTAMP,
                    error_message = ?
                WHERE id = ? AND repository_id = ?
                """,
                (error_message, analysis_run_id, repository_id),
            )
            # Verify the run matched before touching the repository row.
            if cursor.rowcount == 0:
                raise NotFoundError(
                    f"analysis run {analysis_run_id} was not found for repository {repository_id}"
                )
            connection.execute(
                """
                UPDATE repositories
                SET status = 'analysis_failed', updated_at = CURRENT_TIMESTAMP
                WHERE id = ?
                """,
                (repository_id,),
            )
        return self.get_analysis_run(repository_id, analysis_run_id)

    def get_analysis_run(self, repository_id: int, analysis_run_id: int) -> AnalysisRun:
        """Return one analysis run belonging to the given repository.

        Raises:
            NotFoundError: if no such run exists for the repository.
        """
        with self._session() as connection:
            row = connection.execute(
                """
                SELECT id, repository_id, snapshot_id, status, started_at,
                       completed_at, error_message, scanner_version
                FROM analysis_runs
                WHERE id = ? AND repository_id = ?
                """,
                (analysis_run_id, repository_id),
            ).fetchone()
        if row is None:
            raise NotFoundError(
                f"analysis run {analysis_run_id} was not found for repository {repository_id}"
            )
        return self._analysis_run_from_row(row)

    def list_analysis_runs(self, repository_id: int) -> list[AnalysisRun]:
        """Return a repository's analysis runs, most recently started first.

        Raises:
            NotFoundError: if the repository does not exist.
        """
        self.get_repository(repository_id)
        with self._session() as connection:
            rows = connection.execute(
                """
                SELECT id, repository_id, snapshot_id, status, started_at,
                       completed_at, error_message, scanner_version
                FROM analysis_runs
                WHERE repository_id = ?
                ORDER BY started_at DESC, id DESC
                """,
                (repository_id,),
            ).fetchall()
        return [self._analysis_run_from_row(row) for row in rows]

    def get_snapshot(self, snapshot_id: int) -> RepositorySnapshot:
        """Return the snapshot with the given id.

        Raises:
            NotFoundError: if no snapshot has the given id.
        """
        with self._session() as connection:
            row = connection.execute(
                """
                SELECT id, repository_id, commit_hash, branch, source_path, file_count
                FROM repository_snapshots
                WHERE id = ?
                """,
                (snapshot_id,),
            ).fetchone()
        if row is None:
            raise NotFoundError(f"snapshot {snapshot_id} was not found")
        return self._snapshot_from_row(row)

    def list_observed_facts(
        self,
        repository_id: int,
        analysis_run_id: int | None = None,
    ) -> list[ObservedFact]:
        """Return a repository's observed facts, optionally for a single run.

        Raises:
            NotFoundError: if the repository does not exist.
        """
        self.get_repository(repository_id)
        # The WHERE clause is assembled from fixed fragments only; all values
        # are passed as bound parameters.
        where = "WHERE repository_id = ?"
        params: tuple[int, ...] = (repository_id,)
        if analysis_run_id is not None:
            where += " AND analysis_run_id = ?"
            params = (repository_id, analysis_run_id)
        with self._session() as connection:
            rows = connection.execute(
                f"""
                SELECT id, repository_id, analysis_run_id, snapshot_id,
                       kind, path, name, value, metadata
                FROM observed_facts
                {where}
                ORDER BY kind ASC, path ASC, name ASC, id ASC
                """,
                params,
            ).fetchall()
        return [self._observed_fact_from_row(row) for row in rows]

    def create_ability(
        self,
        repository_id: int,
        *,
        name: str,
        description: str,
        confidence: float,
    ) -> int:
        """Insert an approved ability and return its new row id."""
        with self._session() as connection:
            cursor = connection.execute(
                """
                INSERT INTO approved_abilities (repository_id, name, description, confidence)
                VALUES (?, ?, ?, ?)
                """,
                (repository_id, name, description, confidence),
            )
            return int(cursor.lastrowid)

    def ensure_ability(self, repository_id: int, ability_id: int) -> None:
        """Check that the ability exists for the repository.

        Raises:
            NotFoundError: if it does not.
        """
        with self._session() as connection:
            row = connection.execute(
                """
                SELECT id
                FROM approved_abilities
                WHERE id = ? AND repository_id = ?
                """,
                (ability_id, repository_id),
            ).fetchone()
        if row is None:
            raise NotFoundError(
                f"ability {ability_id} was not found for repository {repository_id}"
            )

    def create_capability(
        self,
        repository_id: int,
        ability_id: int,
        *,
        name: str,
        description: str,
        inputs: list[str],
        outputs: list[str],
        confidence: float,
    ) -> int:
        """Insert an approved capability and return its new row id.

        ``inputs`` and ``outputs`` are stored as JSON-encoded text.
        """
        with self._session() as connection:
            cursor = connection.execute(
                """
                INSERT INTO approved_capabilities
                    (repository_id, ability_id, name, description, inputs, outputs, confidence)
                VALUES (?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    repository_id,
                    ability_id,
                    name,
                    description,
                    json.dumps(inputs),
                    json.dumps(outputs),
                    confidence,
                ),
            )
            return int(cursor.lastrowid)

    def ensure_capability(self, repository_id: int, capability_id: int) -> None:
        """Check that the capability exists for the repository.

        Raises:
            NotFoundError: if it does not.
        """
        with self._session() as connection:
            row = connection.execute(
                """
                SELECT id
                FROM approved_capabilities
                WHERE id = ? AND repository_id = ?
                """,
                (capability_id, repository_id),
            ).fetchone()
        if row is None:
            raise NotFoundError(
                f"capability {capability_id} was not found for repository {repository_id}"
            )

    def create_feature(
        self,
        repository_id: int,
        capability_id: int,
        *,
        name: str,
        type: str,
        location: str,
        confidence: float,
    ) -> int:
        """Insert an approved feature and return its new row id."""
        with self._session() as connection:
            cursor = connection.execute(
                """
                INSERT INTO approved_features
                    (repository_id, capability_id, name, type, location, confidence)
                VALUES (?, ?, ?, ?, ?, ?)
                """,
                (repository_id, capability_id, name, type, location, confidence),
            )
            return int(cursor.lastrowid)

    def create_evidence(
        self,
        repository_id: int,
        capability_id: int,
        *,
        type: str,
        reference: str,
        strength: str,
    ) -> int:
        """Insert an approved evidence record and return its new row id."""
        with self._session() as connection:
            cursor = connection.execute(
                """
                INSERT INTO approved_evidence
                    (repository_id, capability_id, type, reference, strength)
                VALUES (?, ?, ?, ?, ?)
                """,
                (repository_id, capability_id, type, reference, strength),
            )
            return int(cursor.lastrowid)

    def get_ability_map(self, repository_id: int) -> RepositoryAbilityMap:
        """Build the ability -> capability -> feature/evidence tree.

        Raises:
            NotFoundError: if the repository does not exist.
        """
        repository = self.get_repository(repository_id)
        # All four tables are read through the same connection.
        with self._session() as connection:
            ability_rows = connection.execute(
                """
                SELECT id, name, description, confidence
                FROM approved_abilities
                WHERE repository_id = ?
                ORDER BY id
                """,
                (repository_id,),
            ).fetchall()
            capability_rows = connection.execute(
                """
                SELECT id, ability_id, name, description, inputs, outputs, confidence
                FROM approved_capabilities
                WHERE repository_id = ?
                ORDER BY id
                """,
                (repository_id,),
            ).fetchall()
            feature_rows = connection.execute(
                """
                SELECT id, capability_id, name, type, location, confidence
                FROM approved_features
                WHERE repository_id = ?
                ORDER BY id
                """,
                (repository_id,),
            ).fetchall()
            evidence_rows = connection.execute(
                """
                SELECT id, capability_id, type, reference, strength
                FROM approved_evidence
                WHERE repository_id = ?
                ORDER BY id
                """,
                (repository_id,),
            ).fetchall()

        # Group leaf records by their parent capability.
        features_by_capability: dict[int, list[Feature]] = {}
        for row in feature_rows:
            features_by_capability.setdefault(row["capability_id"], []).append(
                Feature(
                    id=row["id"],
                    name=row["name"],
                    type=row["type"],
                    location=row["location"],
                    confidence=row["confidence"],
                )
            )

        evidence_by_capability: dict[int, list[Evidence]] = {}
        for row in evidence_rows:
            evidence_by_capability.setdefault(row["capability_id"], []).append(
                Evidence(
                    id=row["id"],
                    type=row["type"],
                    reference=row["reference"],
                    strength=row["strength"],
                )
            )

        # Group capabilities by their parent ability, attaching the leaves.
        capabilities_by_ability: dict[int, list[Capability]] = {}
        for row in capability_rows:
            capabilities_by_ability.setdefault(row["ability_id"], []).append(
                Capability(
                    id=row["id"],
                    name=row["name"],
                    description=row["description"],
                    inputs=json.loads(row["inputs"]),
                    outputs=json.loads(row["outputs"]),
                    confidence=row["confidence"],
                    features=features_by_capability.get(row["id"], []),
                    evidence=evidence_by_capability.get(row["id"], []),
                )
            )

        abilities = [
            Ability(
                id=row["id"],
                name=row["name"],
                description=row["description"],
                confidence=row["confidence"],
                capabilities=capabilities_by_ability.get(row["id"], []),
            )
            for row in ability_rows
        ]
        return RepositoryAbilityMap(repository=repository, abilities=abilities)

    def search(self, query: str) -> list[SearchResult]:
        """Substring-search repositories, abilities and capabilities.

        Names and descriptions are matched with SQL ``LIKE``. A query that is
        empty after stripping whitespace returns no results. ``%`` and ``_``
        inside the query are not escaped, so ``LIKE`` treats them as wildcards.
        Results are ordered by confidence (highest first), then repository
        name, then match name; repository matches have confidence 1.0.
        """
        term = query.strip()
        if not term:
            return []
        needle = f"%{term}%"
        with self._session() as connection:
            rows = connection.execute(
                """
                SELECT r.id AS repository_id,
                       r.name AS repository_name,
                       'repository' AS match_type,
                       r.name AS match_name,
                       1.0 AS confidence
                FROM repositories r
                WHERE r.name LIKE ? OR COALESCE(r.description, '') LIKE ?
                UNION ALL
                SELECT r.id, r.name, 'ability', a.name, a.confidence
                FROM approved_abilities a
                JOIN repositories r ON r.id = a.repository_id
                WHERE a.name LIKE ? OR a.description LIKE ?
                UNION ALL
                SELECT r.id, r.name, 'capability', c.name, c.confidence
                FROM approved_capabilities c
                JOIN repositories r ON r.id = c.repository_id
                WHERE c.name LIKE ? OR c.description LIKE ?
                ORDER BY confidence DESC, repository_name ASC, match_name ASC
                """,
                (needle, needle, needle, needle, needle, needle),
            ).fetchall()
        return [
            SearchResult(
                repository_id=row["repository_id"],
                repository_name=row["repository_name"],
                match_type=row["match_type"],
                match_name=row["match_name"],
                confidence=row["confidence"],
            )
            for row in rows
        ]

    def _insert_facts(
        self,
        connection: sqlite3.Connection,
        *,
        repository_id: int,
        analysis_run_id: int,
        snapshot_id: int,
        facts: list[FactCandidate],
    ) -> None:
        """Bulk-insert scan facts using the caller's connection.

        Each fact's ``metadata`` is stored as JSON-encoded text. The caller's
        transaction is used; nothing is committed here.
        """
        connection.executemany(
            """
            INSERT INTO observed_facts
                (repository_id, analysis_run_id, snapshot_id, kind, path, name, value, metadata)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (
                    repository_id,
                    analysis_run_id,
                    snapshot_id,
                    fact.kind,
                    fact.path,
                    fact.name,
                    fact.value,
                    json.dumps(fact.metadata),
                )
                for fact in facts
            ],
        )

    @staticmethod
    def _repository_from_row(row: sqlite3.Row) -> Repository:
        """Convert a ``repositories`` row into a ``Repository``."""
        return Repository(
            id=row["id"],
            name=row["name"],
            url=row["url"],
            description=row["description"],
            branch=row["branch"],
            status=row["status"],
        )

    @staticmethod
    def _snapshot_from_row(row: sqlite3.Row) -> RepositorySnapshot:
        """Convert a ``repository_snapshots`` row into a ``RepositorySnapshot``."""
        return RepositorySnapshot(
            id=row["id"],
            repository_id=row["repository_id"],
            commit_hash=row["commit_hash"],
            branch=row["branch"],
            source_path=row["source_path"],
            file_count=row["file_count"],
        )

    @staticmethod
    def _analysis_run_from_row(row: sqlite3.Row) -> AnalysisRun:
        """Convert an ``analysis_runs`` row into an ``AnalysisRun``."""
        return AnalysisRun(
            id=row["id"],
            repository_id=row["repository_id"],
            snapshot_id=row["snapshot_id"],
            status=row["status"],
            started_at=row["started_at"],
            completed_at=row["completed_at"],
            error_message=row["error_message"],
            scanner_version=row["scanner_version"],
        )

    @staticmethod
    def _observed_fact_from_row(row: sqlite3.Row) -> ObservedFact:
        """Convert an ``observed_facts`` row, decoding its JSON metadata."""
        return ObservedFact(
            id=row["id"],
            repository_id=row["repository_id"],
            analysis_run_id=row["analysis_run_id"],
            snapshot_id=row["snapshot_id"],
            kind=row["kind"],
            path=row["path"],
            name=row["name"],
            value=row["value"],
            metadata=json.loads(row["metadata"]),
        )