Files
repo-scoping/src/repo_registry/storage/sqlite.py

589 lines
20 KiB
Python

from __future__ import annotations

import json
import sqlite3
from collections.abc import Iterator
from contextlib import contextmanager
from pathlib import Path

from repo_registry.core.models import (
    Ability,
    AnalysisRun,
    Capability,
    Evidence,
    Feature,
    ObservedFact,
    Repository,
    RepositoryAbilityMap,
    RepositorySnapshot,
    SearchResult,
)
from repo_registry.repo_scanning.scanner import FactCandidate, ScanResult


class NotFoundError(ValueError):
    """Raised when a requested registry row does not exist.

    Subclasses ``ValueError`` so callers that already handle ``ValueError``
    also handle a missing record.
    """


class RegistryStore:
    """SQLite-backed persistence for repositories, analysis runs and ability maps.

    Each public method opens its own connection through ``_session``, performs
    its statements inside a single transaction, and closes the connection
    before returning. The store therefore keeps no database handle open
    between calls.
    """

    def __init__(self, database_path: str | Path) -> None:
        """Remember the database location; no connection is opened here."""
        self.database_path = str(database_path)

    def initialize(self) -> None:
        """Run the initial schema migration script against the database.

        The script is ``migrations/0001_initial.sql``, looked up three
        directory levels above the directory that contains this module.
        """
        migration_path = Path(__file__).parents[3] / "migrations" / "0001_initial.sql"
        script = migration_path.read_text(encoding="utf-8")
        with self._session() as connection:
            connection.executescript(script)

    def connect(self) -> sqlite3.Connection:
        """Open a new connection with row access by name and foreign keys on.

        The caller owns the returned connection and must close it. Using the
        connection itself as a context manager only commits or rolls back; it
        does not close the connection.
        """
        connection = sqlite3.connect(self.database_path)
        connection.row_factory = sqlite3.Row
        connection.execute("PRAGMA foreign_keys = ON")
        return connection

    @contextmanager
    def _session(self) -> Iterator[sqlite3.Connection]:
        """Yield a connection that is committed or rolled back, then closed.

        The transaction is committed when the ``with`` body finishes normally
        and rolled back when it raises. The connection is closed in both
        cases so that handles are not left to the garbage collector.
        """
        connection = self.connect()
        try:
            with connection:
                yield connection
        finally:
            connection.close()

    def create_repository(
        self,
        *,
        name: str,
        url: str,
        description: str | None,
        branch: str,
    ) -> Repository:
        """Insert a repository row and return it as stored.

        The row is re-read after the insert, so the returned object carries
        whatever ``status`` the database assigned.
        """
        with self._session() as connection:
            cursor = connection.execute(
                """
                INSERT INTO repositories (name, url, description, branch)
                VALUES (?, ?, ?, ?)
                """,
                (name, url, description, branch),
            )
            repository_id = int(cursor.lastrowid)
        return self.get_repository(repository_id)

    def update_repository_status(self, repository_id: int, status: str) -> None:
        """Set a repository's status and refresh its ``updated_at`` timestamp.

        Raises:
            NotFoundError: if no repository has the given id.
        """
        with self._session() as connection:
            updated = self._set_repository_status(connection, repository_id, status)
            if updated == 0:
                raise NotFoundError(f"repository {repository_id} was not found")

    def list_repositories(self) -> list[Repository]:
        """Return all repositories, most recently created first."""
        with self._session() as connection:
            rows = connection.execute(
                """
                SELECT id, name, url, description, branch, status
                FROM repositories
                ORDER BY created_at DESC, id DESC
                """
            ).fetchall()
        return [self._repository_from_row(row) for row in rows]

    def get_repository(self, repository_id: int) -> Repository:
        """Return the repository with the given id.

        Raises:
            NotFoundError: if no repository has the given id.
        """
        with self._session() as connection:
            row = connection.execute(
                """
                SELECT id, name, url, description, branch, status
                FROM repositories
                WHERE id = ?
                """,
                (repository_id,),
            ).fetchone()
        if row is None:
            raise NotFoundError(f"repository {repository_id} was not found")
        return self._repository_from_row(row)

    def create_analysis_run(self, repository_id: int) -> AnalysisRun:
        """Start a new analysis run in the ``'running'`` state.

        Raises:
            NotFoundError: if the repository does not exist.
        """
        # Validate the repository first so a missing one is reported as
        # NotFoundError rather than as a database error from the insert.
        self.get_repository(repository_id)
        with self._session() as connection:
            cursor = connection.execute(
                """
                INSERT INTO analysis_runs (repository_id, status)
                VALUES (?, 'running')
                """,
                (repository_id,),
            )
            analysis_run_id = int(cursor.lastrowid)
        return self.get_analysis_run(repository_id, analysis_run_id)

    def complete_analysis_run(
        self,
        repository_id: int,
        analysis_run_id: int,
        scan_result: ScanResult,
    ) -> AnalysisRun:
        """Persist a scan result and mark the run as ``'completed'``.

        In one transaction this stores a snapshot row, stores every scanned
        fact against that snapshot, marks the run completed (clearing any
        previous error message) and sets the repository status to
        ``'analyzed'``.

        Raises:
            NotFoundError: if the run does not exist for this repository. The
                transaction is rolled back, so no snapshot or facts are kept
                and the repository status is left unchanged.
        """
        with self._session() as connection:
            snapshot_cursor = connection.execute(
                """
                INSERT INTO repository_snapshots
                (repository_id, commit_hash, branch, source_path, file_count)
                VALUES (?, ?, ?, ?, ?)
                """,
                (
                    repository_id,
                    scan_result.commit_hash,
                    scan_result.branch,
                    scan_result.source_path,
                    scan_result.file_count,
                ),
            )
            snapshot_id = int(snapshot_cursor.lastrowid)
            self._insert_facts(
                connection,
                repository_id=repository_id,
                analysis_run_id=analysis_run_id,
                snapshot_id=snapshot_id,
                facts=scan_result.facts,
            )
            run_cursor = connection.execute(
                """
                UPDATE analysis_runs
                SET status = 'completed',
                    snapshot_id = ?,
                    completed_at = CURRENT_TIMESTAMP,
                    error_message = NULL
                WHERE id = ? AND repository_id = ?
                """,
                (snapshot_id, analysis_run_id, repository_id),
            )
            if run_cursor.rowcount == 0:
                # Raising inside the transaction discards the snapshot and
                # facts written above instead of leaving them orphaned.
                raise self._analysis_run_not_found(repository_id, analysis_run_id)
            self._set_repository_status(connection, repository_id, "analyzed")
        return self.get_analysis_run(repository_id, analysis_run_id)

    def fail_analysis_run(
        self,
        repository_id: int,
        analysis_run_id: int,
        error_message: str,
    ) -> AnalysisRun:
        """Mark a run as ``'failed'`` and the repository as ``'analysis_failed'``.

        Raises:
            NotFoundError: if the run does not exist for this repository; the
                repository status is then left unchanged.
        """
        with self._session() as connection:
            cursor = connection.execute(
                """
                UPDATE analysis_runs
                SET status = 'failed',
                    completed_at = CURRENT_TIMESTAMP,
                    error_message = ?
                WHERE id = ? AND repository_id = ?
                """,
                (error_message, analysis_run_id, repository_id),
            )
            # Check before touching the repository so an unknown run can
            # never flip the repository into the failed state.
            if cursor.rowcount == 0:
                raise self._analysis_run_not_found(repository_id, analysis_run_id)
            self._set_repository_status(connection, repository_id, "analysis_failed")
        return self.get_analysis_run(repository_id, analysis_run_id)

    def get_analysis_run(self, repository_id: int, analysis_run_id: int) -> AnalysisRun:
        """Return one analysis run, scoped to its repository.

        Raises:
            NotFoundError: if no such run exists for the repository.
        """
        with self._session() as connection:
            row = connection.execute(
                """
                SELECT id, repository_id, snapshot_id, status, started_at,
                       completed_at, error_message, scanner_version
                FROM analysis_runs
                WHERE id = ? AND repository_id = ?
                """,
                (analysis_run_id, repository_id),
            ).fetchone()
        if row is None:
            raise self._analysis_run_not_found(repository_id, analysis_run_id)
        return self._analysis_run_from_row(row)

    def list_analysis_runs(self, repository_id: int) -> list[AnalysisRun]:
        """Return a repository's analysis runs, most recently started first.

        Raises:
            NotFoundError: if the repository does not exist.
        """
        self.get_repository(repository_id)
        with self._session() as connection:
            rows = connection.execute(
                """
                SELECT id, repository_id, snapshot_id, status, started_at,
                       completed_at, error_message, scanner_version
                FROM analysis_runs
                WHERE repository_id = ?
                ORDER BY started_at DESC, id DESC
                """,
                (repository_id,),
            ).fetchall()
        return [self._analysis_run_from_row(row) for row in rows]

    def get_snapshot(self, snapshot_id: int) -> RepositorySnapshot:
        """Return the snapshot with the given id.

        Raises:
            NotFoundError: if no snapshot has the given id.
        """
        with self._session() as connection:
            row = connection.execute(
                """
                SELECT id, repository_id, commit_hash, branch, source_path, file_count
                FROM repository_snapshots
                WHERE id = ?
                """,
                (snapshot_id,),
            ).fetchone()
        if row is None:
            raise NotFoundError(f"snapshot {snapshot_id} was not found")
        return self._snapshot_from_row(row)

    def list_observed_facts(
        self,
        repository_id: int,
        analysis_run_id: int | None = None,
    ) -> list[ObservedFact]:
        """Return observed facts for a repository, optionally for one run.

        Results are ordered by kind, path, name and id.

        Raises:
            NotFoundError: if the repository does not exist.
        """
        self.get_repository(repository_id)
        # Only fixed SQL fragments are interpolated into the statement; every
        # value is still passed as a bound parameter.
        where = "WHERE repository_id = ?"
        params: tuple[int, ...] = (repository_id,)
        if analysis_run_id is not None:
            where += " AND analysis_run_id = ?"
            params = (repository_id, analysis_run_id)
        with self._session() as connection:
            rows = connection.execute(
                f"""
                SELECT id, repository_id, analysis_run_id, snapshot_id, kind,
                       path, name, value, metadata
                FROM observed_facts
                {where}
                ORDER BY kind ASC, path ASC, name ASC, id ASC
                """,
                params,
            ).fetchall()
        return [self._observed_fact_from_row(row) for row in rows]

    def create_ability(
        self,
        repository_id: int,
        *,
        name: str,
        description: str,
        confidence: float,
    ) -> int:
        """Insert an approved ability and return its new row id."""
        with self._session() as connection:
            cursor = connection.execute(
                """
                INSERT INTO approved_abilities
                (repository_id, name, description, confidence)
                VALUES (?, ?, ?, ?)
                """,
                (repository_id, name, description, confidence),
            )
            return int(cursor.lastrowid)

    def ensure_ability(self, repository_id: int, ability_id: int) -> None:
        """Verify that an ability exists and belongs to the repository.

        Raises:
            NotFoundError: if no such ability exists for the repository.
        """
        with self._session() as connection:
            row = connection.execute(
                """
                SELECT id FROM approved_abilities
                WHERE id = ? AND repository_id = ?
                """,
                (ability_id, repository_id),
            ).fetchone()
        if row is None:
            raise NotFoundError(
                f"ability {ability_id} was not found for repository {repository_id}"
            )

    def create_capability(
        self,
        repository_id: int,
        ability_id: int,
        *,
        name: str,
        description: str,
        inputs: list[str],
        outputs: list[str],
        confidence: float,
    ) -> int:
        """Insert an approved capability and return its new row id.

        ``inputs`` and ``outputs`` are stored as JSON-encoded text.
        """
        with self._session() as connection:
            cursor = connection.execute(
                """
                INSERT INTO approved_capabilities
                (repository_id, ability_id, name, description, inputs, outputs, confidence)
                VALUES (?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    repository_id,
                    ability_id,
                    name,
                    description,
                    json.dumps(inputs),
                    json.dumps(outputs),
                    confidence,
                ),
            )
            return int(cursor.lastrowid)

    def ensure_capability(self, repository_id: int, capability_id: int) -> None:
        """Verify that a capability exists and belongs to the repository.

        Raises:
            NotFoundError: if no such capability exists for the repository.
        """
        with self._session() as connection:
            row = connection.execute(
                """
                SELECT id FROM approved_capabilities
                WHERE id = ? AND repository_id = ?
                """,
                (capability_id, repository_id),
            ).fetchone()
        if row is None:
            raise NotFoundError(
                f"capability {capability_id} was not found for repository {repository_id}"
            )

    def create_feature(
        self,
        repository_id: int,
        capability_id: int,
        *,
        name: str,
        type: str,
        location: str,
        confidence: float,
    ) -> int:
        """Insert an approved feature under a capability and return its row id."""
        # ``type`` shadows the builtin, but it is a keyword-only name that
        # callers already pass, so it has to stay.
        with self._session() as connection:
            cursor = connection.execute(
                """
                INSERT INTO approved_features
                (repository_id, capability_id, name, type, location, confidence)
                VALUES (?, ?, ?, ?, ?, ?)
                """,
                (repository_id, capability_id, name, type, location, confidence),
            )
            return int(cursor.lastrowid)

    def create_evidence(
        self,
        repository_id: int,
        capability_id: int,
        *,
        type: str,
        reference: str,
        strength: str,
    ) -> int:
        """Insert approved evidence under a capability and return its row id."""
        with self._session() as connection:
            cursor = connection.execute(
                """
                INSERT INTO approved_evidence
                (repository_id, capability_id, type, reference, strength)
                VALUES (?, ?, ?, ?, ?)
                """,
                (repository_id, capability_id, type, reference, strength),
            )
            return int(cursor.lastrowid)

    def get_ability_map(self, repository_id: int) -> RepositoryAbilityMap:
        """Assemble the ability -> capability -> feature/evidence tree.

        Rows at every level are returned in ascending id order.

        Raises:
            NotFoundError: if the repository does not exist.
        """
        repository = self.get_repository(repository_id)
        with self._session() as connection:
            ability_rows = connection.execute(
                """
                SELECT id, name, description, confidence
                FROM approved_abilities
                WHERE repository_id = ?
                ORDER BY id
                """,
                (repository_id,),
            ).fetchall()
            capability_rows = connection.execute(
                """
                SELECT id, ability_id, name, description, inputs, outputs, confidence
                FROM approved_capabilities
                WHERE repository_id = ?
                ORDER BY id
                """,
                (repository_id,),
            ).fetchall()
            feature_rows = connection.execute(
                """
                SELECT id, capability_id, name, type, location, confidence
                FROM approved_features
                WHERE repository_id = ?
                ORDER BY id
                """,
                (repository_id,),
            ).fetchall()
            evidence_rows = connection.execute(
                """
                SELECT id, capability_id, type, reference, strength
                FROM approved_evidence
                WHERE repository_id = ?
                ORDER BY id
                """,
                (repository_id,),
            ).fetchall()

        # Build the tree bottom-up: group the leaves by their parent id, then
        # attach each group while constructing the parent objects.
        features_by_capability: dict[int, list[Feature]] = {}
        for row in feature_rows:
            features_by_capability.setdefault(row["capability_id"], []).append(
                Feature(
                    id=row["id"],
                    name=row["name"],
                    type=row["type"],
                    location=row["location"],
                    confidence=row["confidence"],
                )
            )

        evidence_by_capability: dict[int, list[Evidence]] = {}
        for row in evidence_rows:
            evidence_by_capability.setdefault(row["capability_id"], []).append(
                Evidence(
                    id=row["id"],
                    type=row["type"],
                    reference=row["reference"],
                    strength=row["strength"],
                )
            )

        capabilities_by_ability: dict[int, list[Capability]] = {}
        for row in capability_rows:
            capabilities_by_ability.setdefault(row["ability_id"], []).append(
                Capability(
                    id=row["id"],
                    name=row["name"],
                    description=row["description"],
                    inputs=json.loads(row["inputs"]),
                    outputs=json.loads(row["outputs"]),
                    confidence=row["confidence"],
                    features=features_by_capability.get(row["id"], []),
                    evidence=evidence_by_capability.get(row["id"], []),
                )
            )

        abilities = [
            Ability(
                id=row["id"],
                name=row["name"],
                description=row["description"],
                confidence=row["confidence"],
                capabilities=capabilities_by_ability.get(row["id"], []),
            )
            for row in ability_rows
        ]
        return RepositoryAbilityMap(repository=repository, abilities=abilities)

    def search(self, query: str) -> list[SearchResult]:
        """Find repositories, abilities and capabilities containing ``query``.

        The query is matched as a literal substring of the name or
        description columns; ``%`` and ``_`` in the query are not treated as
        wildcards. A blank query returns no results. Matches are ordered by
        confidence (highest first), then repository name, then match name.
        """
        term = query.strip()
        if not term:
            return []
        # Escape LIKE metacharacters so user input cannot act as a pattern
        # (for example a query of "%" would otherwise match every row).
        escaped = term.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        needle = f"%{escaped}%"
        with self._session() as connection:
            rows = connection.execute(
                """
                SELECT r.id AS repository_id, r.name AS repository_name,
                       'repository' AS match_type, r.name AS match_name,
                       1.0 AS confidence
                FROM repositories r
                WHERE r.name LIKE ? ESCAPE '\\'
                   OR COALESCE(r.description, '') LIKE ? ESCAPE '\\'
                UNION ALL
                SELECT r.id, r.name, 'ability', a.name, a.confidence
                FROM approved_abilities a
                JOIN repositories r ON r.id = a.repository_id
                WHERE a.name LIKE ? ESCAPE '\\' OR a.description LIKE ? ESCAPE '\\'
                UNION ALL
                SELECT r.id, r.name, 'capability', c.name, c.confidence
                FROM approved_capabilities c
                JOIN repositories r ON r.id = c.repository_id
                WHERE c.name LIKE ? ESCAPE '\\' OR c.description LIKE ? ESCAPE '\\'
                ORDER BY confidence DESC, repository_name ASC, match_name ASC
                """,
                (needle,) * 6,
            ).fetchall()
        return [
            SearchResult(
                repository_id=row["repository_id"],
                repository_name=row["repository_name"],
                match_type=row["match_type"],
                match_name=row["match_name"],
                confidence=row["confidence"],
            )
            for row in rows
        ]

    def _insert_facts(
        self,
        connection: sqlite3.Connection,
        *,
        repository_id: int,
        analysis_run_id: int,
        snapshot_id: int,
        facts: list[FactCandidate],
    ) -> None:
        """Bulk-insert scanned facts on the caller's connection.

        The caller owns the transaction; nothing is committed here. Each
        fact's ``metadata`` is stored as JSON-encoded text.
        """
        connection.executemany(
            """
            INSERT INTO observed_facts
            (repository_id, analysis_run_id, snapshot_id, kind, path, name, value, metadata)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (
                    repository_id,
                    analysis_run_id,
                    snapshot_id,
                    fact.kind,
                    fact.path,
                    fact.name,
                    fact.value,
                    json.dumps(fact.metadata),
                )
                for fact in facts
            ],
        )

    @staticmethod
    def _set_repository_status(
        connection: sqlite3.Connection,
        repository_id: int,
        status: str,
    ) -> int:
        """Update a repository's status on the caller's connection.

        Returns the number of rows changed (0 when the repository is missing).
        """
        cursor = connection.execute(
            """
            UPDATE repositories
            SET status = ?, updated_at = CURRENT_TIMESTAMP
            WHERE id = ?
            """,
            (status, repository_id),
        )
        return cursor.rowcount

    @staticmethod
    def _analysis_run_not_found(repository_id: int, analysis_run_id: int) -> NotFoundError:
        """Build the error used whenever a run is missing for a repository."""
        return NotFoundError(
            f"analysis run {analysis_run_id} was not found for repository {repository_id}"
        )

    @staticmethod
    def _repository_from_row(row: sqlite3.Row) -> Repository:
        """Convert a ``repositories`` row into a ``Repository``."""
        return Repository(
            id=row["id"],
            name=row["name"],
            url=row["url"],
            description=row["description"],
            branch=row["branch"],
            status=row["status"],
        )

    @staticmethod
    def _snapshot_from_row(row: sqlite3.Row) -> RepositorySnapshot:
        """Convert a ``repository_snapshots`` row into a ``RepositorySnapshot``."""
        return RepositorySnapshot(
            id=row["id"],
            repository_id=row["repository_id"],
            commit_hash=row["commit_hash"],
            branch=row["branch"],
            source_path=row["source_path"],
            file_count=row["file_count"],
        )

    @staticmethod
    def _analysis_run_from_row(row: sqlite3.Row) -> AnalysisRun:
        """Convert an ``analysis_runs`` row into an ``AnalysisRun``."""
        return AnalysisRun(
            id=row["id"],
            repository_id=row["repository_id"],
            snapshot_id=row["snapshot_id"],
            status=row["status"],
            started_at=row["started_at"],
            completed_at=row["completed_at"],
            error_message=row["error_message"],
            scanner_version=row["scanner_version"],
        )

    @staticmethod
    def _observed_fact_from_row(row: sqlite3.Row) -> ObservedFact:
        """Convert an ``observed_facts`` row, decoding its JSON ``metadata``."""
        return ObservedFact(
            id=row["id"],
            repository_id=row["repository_id"],
            analysis_run_id=row["analysis_run_id"],
            snapshot_id=row["snapshot_id"],
            kind=row["kind"],
            path=row["path"],
            name=row["name"],
            value=row["value"],
            metadata=json.loads(row["metadata"]),
        )