# repo-scoping/src/repo_registry/storage/sqlite.py

from __future__ import annotations

import json
import sqlite3
from contextlib import closing
from pathlib import Path

from repo_registry.core.models import (
    Ability,
    AnalysisRun,
    Capability,
    Evidence,
    Feature,
    ObservedFact,
    Repository,
    RepositoryAbilityMap,
    RepositorySnapshot,
    SearchResult,
)
from repo_registry.repo_scanning.scanner import FactCandidate, ScanResult


class NotFoundError(ValueError):
    """Raised when a requested registry record does not exist."""


class RegistryStore:
    def __init__(self, database_path: str | Path) -> None:
        self.database_path = str(database_path)

    def initialize(self) -> None:
        migration_path = Path(__file__).parents[3] / "migrations" / "0001_initial.sql"
        with self.connect() as connection:
            connection.executescript(migration_path.read_text(encoding="utf-8"))

    def connect(self) -> sqlite3.Connection:
        connection = sqlite3.connect(self.database_path)
        connection.row_factory = sqlite3.Row
        connection.execute("PRAGMA foreign_keys = ON")
        return connection

    def create_repository(
        self,
        *,
        name: str,
        url: str,
        description: str | None,
        branch: str,
    ) -> Repository:
        with self.connect() as connection:
            cursor = connection.execute(
                """
                INSERT INTO repositories (name, url, description, branch)
                VALUES (?, ?, ?, ?)
                """,
                (name, url, description, branch),
            )
            repository_id = int(cursor.lastrowid)
        return self.get_repository(repository_id)

    def update_repository_status(self, repository_id: int, status: str) -> None:
        with self.connect() as connection:
            cursor = connection.execute(
                """
                UPDATE repositories
                SET status = ?, updated_at = CURRENT_TIMESTAMP
                WHERE id = ?
                """,
                (status, repository_id),
            )
        if cursor.rowcount == 0:
            raise NotFoundError(f"repository {repository_id} was not found")

    def list_repositories(self) -> list[Repository]:
        with self.connect() as connection:
            rows = connection.execute(
                """
                SELECT id, name, url, description, branch, status
                FROM repositories
                ORDER BY created_at DESC, id DESC
                """
            ).fetchall()
        return [self._repository_from_row(row) for row in rows]

    def get_repository(self, repository_id: int) -> Repository:
        with self.connect() as connection:
            row = connection.execute(
                """
                SELECT id, name, url, description, branch, status
                FROM repositories
                WHERE id = ?
                """,
                (repository_id,),
            ).fetchone()
        if row is None:
            raise NotFoundError(f"repository {repository_id} was not found")
        return self._repository_from_row(row)

    def create_analysis_run(self, repository_id: int) -> AnalysisRun:
        self.get_repository(repository_id)
        with self.connect() as connection:
            cursor = connection.execute(
                """
                INSERT INTO analysis_runs (repository_id, status)
                VALUES (?, 'running')
                """,
                (repository_id,),
            )
            analysis_run_id = int(cursor.lastrowid)
        return self.get_analysis_run(repository_id, analysis_run_id)

    def complete_analysis_run(
        self,
        repository_id: int,
        analysis_run_id: int,
        scan_result: ScanResult,
    ) -> AnalysisRun:
        with self.connect() as connection:
            snapshot_cursor = connection.execute(
                """
                INSERT INTO repository_snapshots
                  (repository_id, commit_hash, branch, source_path, file_count)
                VALUES (?, ?, ?, ?, ?)
                """,
                (
                    repository_id,
                    scan_result.commit_hash,
                    scan_result.branch,
                    scan_result.source_path,
                    scan_result.file_count,
                ),
            )
            snapshot_id = int(snapshot_cursor.lastrowid)
            self._insert_facts(
                connection,
                repository_id=repository_id,
                analysis_run_id=analysis_run_id,
                snapshot_id=snapshot_id,
                facts=scan_result.facts,
            )
            connection.execute(
                """
                UPDATE analysis_runs
                SET status = 'completed',
                    snapshot_id = ?,
                    completed_at = CURRENT_TIMESTAMP,
                    error_message = NULL
                WHERE id = ? AND repository_id = ?
                """,
                (snapshot_id, analysis_run_id, repository_id),
            )
            connection.execute(
                """
                UPDATE repositories
                SET status = 'analyzed', updated_at = CURRENT_TIMESTAMP
                WHERE id = ?
                """,
                (repository_id,),
            )
        return self.get_analysis_run(repository_id, analysis_run_id)

    def fail_analysis_run(
        self,
        repository_id: int,
        analysis_run_id: int,
        error_message: str,
    ) -> AnalysisRun:
        with self.connect() as connection:
            cursor = connection.execute(
                """
                UPDATE analysis_runs
                SET status = 'failed',
                    completed_at = CURRENT_TIMESTAMP,
                    error_message = ?
                WHERE id = ? AND repository_id = ?
                """,
                (error_message, analysis_run_id, repository_id),
            )
            connection.execute(
                """
                UPDATE repositories
                SET status = 'analysis_failed', updated_at = CURRENT_TIMESTAMP
                WHERE id = ?
                """,
                (repository_id,),
            )
        if cursor.rowcount == 0:
            raise NotFoundError(
                f"analysis run {analysis_run_id} was not found for repository {repository_id}"
            )
        return self.get_analysis_run(repository_id, analysis_run_id)

    def get_analysis_run(self, repository_id: int, analysis_run_id: int) -> AnalysisRun:
        with self.connect() as connection:
            row = connection.execute(
                """
                SELECT id, repository_id, snapshot_id, status, started_at,
                       completed_at, error_message, scanner_version
                FROM analysis_runs
                WHERE id = ? AND repository_id = ?
                """,
                (analysis_run_id, repository_id),
            ).fetchone()
        if row is None:
            raise NotFoundError(
                f"analysis run {analysis_run_id} was not found for repository {repository_id}"
            )
        return self._analysis_run_from_row(row)

    def list_analysis_runs(self, repository_id: int) -> list[AnalysisRun]:
        self.get_repository(repository_id)
        with self.connect() as connection:
            rows = connection.execute(
                """
                SELECT id, repository_id, snapshot_id, status, started_at,
                       completed_at, error_message, scanner_version
                FROM analysis_runs
                WHERE repository_id = ?
                ORDER BY started_at DESC, id DESC
                """,
                (repository_id,),
            ).fetchall()
        return [self._analysis_run_from_row(row) for row in rows]

    def get_snapshot(self, snapshot_id: int) -> RepositorySnapshot:
        with self.connect() as connection:
            row = connection.execute(
                """
                SELECT id, repository_id, commit_hash, branch, source_path, file_count
                FROM repository_snapshots
                WHERE id = ?
                """,
                (snapshot_id,),
            ).fetchone()
        if row is None:
            raise NotFoundError(f"snapshot {snapshot_id} was not found")
        return self._snapshot_from_row(row)

    def list_observed_facts(
        self,
        repository_id: int,
        analysis_run_id: int | None = None,
    ) -> list[ObservedFact]:
        self.get_repository(repository_id)
        params: tuple[int, ...]
        where = "WHERE repository_id = ?"
        params = (repository_id,)
        if analysis_run_id is not None:
            where += " AND analysis_run_id = ?"
            params = (repository_id, analysis_run_id)

        with self.connect() as connection:
            rows = connection.execute(
                f"""
                SELECT id, repository_id, analysis_run_id, snapshot_id, kind,
                       path, name, value, metadata
                FROM observed_facts
                {where}
                ORDER BY kind ASC, path ASC, name ASC, id ASC
                """,
                params,
            ).fetchall()
        return [self._observed_fact_from_row(row) for row in rows]

    def create_ability(
        self,
        repository_id: int,
        *,
        name: str,
        description: str,
        confidence: float,
    ) -> int:
        with self.connect() as connection:
            cursor = connection.execute(
                """
                INSERT INTO approved_abilities
                  (repository_id, name, description, confidence)
                VALUES (?, ?, ?, ?)
                """,
                (repository_id, name, description, confidence),
            )
            return int(cursor.lastrowid)

    def ensure_ability(self, repository_id: int, ability_id: int) -> None:
        with self.connect() as connection:
            row = connection.execute(
                """
                SELECT id FROM approved_abilities
                WHERE id = ? AND repository_id = ?
                """,
                (ability_id, repository_id),
            ).fetchone()
        if row is None:
            raise NotFoundError(
                f"ability {ability_id} was not found for repository {repository_id}"
            )

    def create_capability(
        self,
        repository_id: int,
        ability_id: int,
        *,
        name: str,
        description: str,
        inputs: list[str],
        outputs: list[str],
        confidence: float,
    ) -> int:
        with self.connect() as connection:
            cursor = connection.execute(
                """
                INSERT INTO approved_capabilities
                  (repository_id, ability_id, name, description, inputs, outputs, confidence)
                VALUES (?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    repository_id,
                    ability_id,
                    name,
                    description,
                    json.dumps(inputs),
                    json.dumps(outputs),
                    confidence,
                ),
            )
            return int(cursor.lastrowid)

    def ensure_capability(self, repository_id: int, capability_id: int) -> None:
        with self.connect() as connection:
            row = connection.execute(
                """
                SELECT id FROM approved_capabilities
                WHERE id = ? AND repository_id = ?
                """,
                (capability_id, repository_id),
            ).fetchone()
        if row is None:
            raise NotFoundError(
                f"capability {capability_id} was not found for repository {repository_id}"
            )

    def create_feature(
        self,
        repository_id: int,
        capability_id: int,
        *,
        name: str,
        type: str,
        location: str,
        confidence: float,
    ) -> int:
        with self.connect() as connection:
            cursor = connection.execute(
                """
                INSERT INTO approved_features
                  (repository_id, capability_id, name, type, location, confidence)
                VALUES (?, ?, ?, ?, ?, ?)
                """,
                (repository_id, capability_id, name, type, location, confidence),
            )
            return int(cursor.lastrowid)

    def create_evidence(
        self,
        repository_id: int,
        capability_id: int,
        *,
        type: str,
        reference: str,
        strength: str,
    ) -> int:
        with self.connect() as connection:
            cursor = connection.execute(
                """
                INSERT INTO approved_evidence
                  (repository_id, capability_id, type, reference, strength)
                VALUES (?, ?, ?, ?, ?)
                """,
                (repository_id, capability_id, type, reference, strength),
            )
            return int(cursor.lastrowid)

    def get_ability_map(self, repository_id: int) -> RepositoryAbilityMap:
        repository = self.get_repository(repository_id)
        with self.connect() as connection:
            ability_rows = connection.execute(
                """
                SELECT id, name, description, confidence
                FROM approved_abilities
                WHERE repository_id = ?
                ORDER BY id
                """,
                (repository_id,),
            ).fetchall()
            capability_rows = connection.execute(
                """
                SELECT id, ability_id, name, description, inputs, outputs, confidence
                FROM approved_capabilities
                WHERE repository_id = ?
                ORDER BY id
                """,
                (repository_id,),
            ).fetchall()
            feature_rows = connection.execute(
                """
                SELECT id, capability_id, name, type, location, confidence
                FROM approved_features
                WHERE repository_id = ?
                ORDER BY id
                """,
                (repository_id,),
            ).fetchall()
            evidence_rows = connection.execute(
                """
                SELECT id, capability_id, type, reference, strength
                FROM approved_evidence
                WHERE repository_id = ?
                ORDER BY id
                """,
                (repository_id,),
            ).fetchall()

        features_by_capability: dict[int, list[Feature]] = {}
        for row in feature_rows:
            features_by_capability.setdefault(row["capability_id"], []).append(
                Feature(
                    id=row["id"],
                    name=row["name"],
                    type=row["type"],
                    location=row["location"],
                    confidence=row["confidence"],
                )
            )

        evidence_by_capability: dict[int, list[Evidence]] = {}
        for row in evidence_rows:
            evidence_by_capability.setdefault(row["capability_id"], []).append(
                Evidence(
                    id=row["id"],
                    type=row["type"],
                    reference=row["reference"],
                    strength=row["strength"],
                )
            )

        capabilities_by_ability: dict[int, list[Capability]] = {}
        for row in capability_rows:
            capabilities_by_ability.setdefault(row["ability_id"], []).append(
                Capability(
                    id=row["id"],
                    name=row["name"],
                    description=row["description"],
                    inputs=json.loads(row["inputs"]),
                    outputs=json.loads(row["outputs"]),
                    confidence=row["confidence"],
                    features=features_by_capability.get(row["id"], []),
                    evidence=evidence_by_capability.get(row["id"], []),
                )
            )

        abilities = [
            Ability(
                id=row["id"],
                name=row["name"],
                description=row["description"],
                confidence=row["confidence"],
                capabilities=capabilities_by_ability.get(row["id"], []),
            )
            for row in ability_rows
        ]
        return RepositoryAbilityMap(repository=repository, abilities=abilities)

    def search(self, query: str) -> list[SearchResult]:
        needle = f"%{query.strip()}%"
        if needle == "%%":
            return []

        with self.connect() as connection:
            rows = connection.execute(
                """
                SELECT r.id AS repository_id, r.name AS repository_name,
                       'repository' AS match_type, r.name AS match_name,
                       1.0 AS confidence
                FROM repositories r
                WHERE r.name LIKE ? OR COALESCE(r.description, '') LIKE ?
                UNION ALL
                SELECT r.id, r.name, 'ability', a.name, a.confidence
                FROM approved_abilities a
                JOIN repositories r ON r.id = a.repository_id
                WHERE a.name LIKE ? OR a.description LIKE ?
                UNION ALL
                SELECT r.id, r.name, 'capability', c.name, c.confidence
                FROM approved_capabilities c
                JOIN repositories r ON r.id = c.repository_id
                WHERE c.name LIKE ? OR c.description LIKE ?
                ORDER BY confidence DESC, repository_name ASC, match_name ASC
                """,
                (needle, needle, needle, needle, needle, needle),
            ).fetchall()

        return [
            SearchResult(
                repository_id=row["repository_id"],
                repository_name=row["repository_name"],
                match_type=row["match_type"],
                match_name=row["match_name"],
                confidence=row["confidence"],
            )
            for row in rows
        ]

    def _insert_facts(
        self,
        connection: sqlite3.Connection,
        *,
        repository_id: int,
        analysis_run_id: int,
        snapshot_id: int,
        facts: list[FactCandidate],
    ) -> None:
        connection.executemany(
            """
            INSERT INTO observed_facts
              (repository_id, analysis_run_id, snapshot_id, kind, path, name, value, metadata)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (
                    repository_id,
                    analysis_run_id,
                    snapshot_id,
                    fact.kind,
                    fact.path,
                    fact.name,
                    fact.value,
                    json.dumps(fact.metadata),
                )
                for fact in facts
            ],
        )

    @staticmethod
    def _repository_from_row(row: sqlite3.Row) -> Repository:
        """Build a ``Repository`` from a row of the ``repositories`` query.

        Args:
            row: Row exposing ``id``, ``name``, ``url``, ``description``,
                ``branch`` and ``status``.

        Returns:
            The repository model populated from those columns.
        """
        columns = ("id", "name", "url", "description", "branch", "status")
        # Every model field maps to the column of the same name.
        values = {column: row[column] for column in columns}
        return Repository(**values)

    @staticmethod
    def _snapshot_from_row(row: sqlite3.Row) -> RepositorySnapshot:
        """Build a ``RepositorySnapshot`` from a snapshot query row.

        Args:
            row: Row exposing ``id``, ``repository_id``, ``commit_hash``,
                ``branch``, ``source_path`` and ``file_count``.

        Returns:
            The snapshot model populated from those columns.
        """
        snapshot_fields = {}
        # Every model field maps to the column of the same name.
        for column in (
            "id",
            "repository_id",
            "commit_hash",
            "branch",
            "source_path",
            "file_count",
        ):
            snapshot_fields[column] = row[column]
        return RepositorySnapshot(**snapshot_fields)

    @staticmethod
    def _analysis_run_from_row(row: sqlite3.Row) -> AnalysisRun:
        """Build an ``AnalysisRun`` from an ``analysis_runs`` query row.

        Args:
            row: Row exposing ``id``, ``repository_id``, ``snapshot_id``,
                ``status``, ``started_at``, ``completed_at``,
                ``error_message`` and ``scanner_version``.

        Returns:
            The run model populated from those columns.
        """
        columns = (
            "id",
            "repository_id",
            "snapshot_id",
            "status",
            "started_at",
            "completed_at",
            "error_message",
            "scanner_version",
        )
        # Every model field maps to the column of the same name.
        return AnalysisRun(**{column: row[column] for column in columns})

    @staticmethod
    def _observed_fact_from_row(row: sqlite3.Row) -> ObservedFact:
        """Build an ``ObservedFact`` from an ``observed_facts`` query row.

        Args:
            row: Row exposing ``id``, ``repository_id``, ``analysis_run_id``,
                ``snapshot_id``, ``kind``, ``path``, ``name``, ``value`` and
                ``metadata`` (JSON text).

        Returns:
            The fact model, with ``metadata`` decoded from JSON.
        """
        plain_columns = (
            "id",
            "repository_id",
            "analysis_run_id",
            "snapshot_id",
            "kind",
            "path",
            "name",
            "value",
        )
        fact_fields = {column: row[column] for column in plain_columns}
        # metadata is the only column that needs decoding before use.
        fact_fields["metadata"] = json.loads(row["metadata"])
        return ObservedFact(**fact_fields)