generated from coulomb/repo-seed
589 lines
20 KiB
Python
589 lines
20 KiB
Python
from __future__ import annotations

import json
import sqlite3
from collections.abc import Iterator
from contextlib import contextmanager
from pathlib import Path

from repo_registry.core.models import (
    Ability,
    AnalysisRun,
    Capability,
    Evidence,
    Feature,
    ObservedFact,
    Repository,
    RepositoryAbilityMap,
    RepositorySnapshot,
    SearchResult,
)
from repo_registry.repo_scanning.scanner import FactCandidate, ScanResult
|
|
|
|
|
|
class NotFoundError(ValueError):
    """Raised when a requested registry record does not exist.

    Subclasses ``ValueError`` so callers that already catch ``ValueError``
    keep working.
    """
|
|
|
|
|
|
class RegistryStore:
    """SQLite-backed store for repositories, analysis runs, facts and ability maps.

    Every public operation runs inside its own short-lived connection obtained
    from :meth:`_session`: the work is committed as a unit, rolled back if an
    exception escapes, and the connection is always closed afterwards.
    """

    def __init__(self, database_path: str | Path) -> None:
        """Remember where the SQLite database lives; no connection is opened."""
        self.database_path = str(database_path)

    def initialize(self) -> None:
        """Apply the ``0001_initial.sql`` migration script to the database."""
        migration_path = Path(__file__).parents[3] / "migrations" / "0001_initial.sql"
        script = migration_path.read_text(encoding="utf-8")
        with self._session() as connection:
            connection.executescript(script)

    def connect(self) -> sqlite3.Connection:
        """Open a new connection with name-based row access and foreign keys on.

        The caller owns the returned connection and must close it. Using the
        connection as a context manager only commits or rolls back; it does
        not close it.
        """
        connection = sqlite3.connect(self.database_path)
        connection.row_factory = sqlite3.Row
        connection.execute("PRAGMA foreign_keys = ON")
        return connection

    @contextmanager
    def _session(self) -> Iterator[sqlite3.Connection]:
        """Yield a connection that is committed or rolled back, then closed.

        ``sqlite3.Connection`` used as a context manager finishes the
        transaction but leaves the connection open, so the close is done
        explicitly here to avoid leaking one connection per call.
        """
        connection = self.connect()
        try:
            with connection:
                yield connection
        finally:
            connection.close()

    def create_repository(
        self,
        *,
        name: str,
        url: str,
        description: str | None,
        branch: str,
    ) -> Repository:
        """Insert a repository row and return it as stored."""
        with self._session() as connection:
            cursor = connection.execute(
                """
                INSERT INTO repositories (name, url, description, branch)
                VALUES (?, ?, ?, ?)
                """,
                (name, url, description, branch),
            )
            repository_id = int(cursor.lastrowid)
        # Re-read after the commit so the returned object includes the
        # columns the insert did not set explicitly (e.g. status).
        return self.get_repository(repository_id)

    def update_repository_status(self, repository_id: int, status: str) -> None:
        """Set a repository's status and refresh its ``updated_at`` timestamp.

        Raises:
            NotFoundError: if no repository has ``repository_id``.
        """
        with self._session() as connection:
            cursor = connection.execute(
                """
                UPDATE repositories
                SET status = ?, updated_at = CURRENT_TIMESTAMP
                WHERE id = ?
                """,
                (status, repository_id),
            )
            if cursor.rowcount == 0:
                raise NotFoundError(f"repository {repository_id} was not found")

    def list_repositories(self) -> list[Repository]:
        """Return all repositories, newest first (ties broken by highest id)."""
        with self._session() as connection:
            rows = connection.execute(
                """
                SELECT id, name, url, description, branch, status
                FROM repositories
                ORDER BY created_at DESC, id DESC
                """
            ).fetchall()
        return [self._repository_from_row(row) for row in rows]

    def get_repository(self, repository_id: int) -> Repository:
        """Return one repository by id.

        Raises:
            NotFoundError: if no repository has ``repository_id``.
        """
        with self._session() as connection:
            row = connection.execute(
                """
                SELECT id, name, url, description, branch, status
                FROM repositories
                WHERE id = ?
                """,
                (repository_id,),
            ).fetchone()
        if row is None:
            raise NotFoundError(f"repository {repository_id} was not found")
        return self._repository_from_row(row)

    def create_analysis_run(self, repository_id: int) -> AnalysisRun:
        """Start a new analysis run in the ``running`` state.

        Raises:
            NotFoundError: if the repository does not exist.
        """
        self.get_repository(repository_id)
        with self._session() as connection:
            cursor = connection.execute(
                """
                INSERT INTO analysis_runs (repository_id, status)
                VALUES (?, 'running')
                """,
                (repository_id,),
            )
            analysis_run_id = int(cursor.lastrowid)
        return self.get_analysis_run(repository_id, analysis_run_id)

    def complete_analysis_run(
        self,
        repository_id: int,
        analysis_run_id: int,
        scan_result: ScanResult,
    ) -> AnalysisRun:
        """Persist a scan result and mark the run and repository as done.

        In one transaction this stores a snapshot row, marks the run
        ``completed`` (clearing any error message), stores the scanned facts
        and sets the repository status to ``analyzed``.

        Raises:
            NotFoundError: if the run does not exist for this repository. The
                transaction is rolled back, so no snapshot, facts or status
                change are left behind.
        """
        with self._session() as connection:
            snapshot_cursor = connection.execute(
                """
                INSERT INTO repository_snapshots
                    (repository_id, commit_hash, branch, source_path, file_count)
                VALUES (?, ?, ?, ?, ?)
                """,
                (
                    repository_id,
                    scan_result.commit_hash,
                    scan_result.branch,
                    scan_result.source_path,
                    scan_result.file_count,
                ),
            )
            snapshot_id = int(snapshot_cursor.lastrowid)
            run_cursor = connection.execute(
                """
                UPDATE analysis_runs
                SET status = 'completed',
                    snapshot_id = ?,
                    completed_at = CURRENT_TIMESTAMP,
                    error_message = NULL
                WHERE id = ? AND repository_id = ?
                """,
                (snapshot_id, analysis_run_id, repository_id),
            )
            # Verify the run before writing anything that hangs off it;
            # raising here rolls back the snapshot inserted above.
            if run_cursor.rowcount == 0:
                raise NotFoundError(
                    f"analysis run {analysis_run_id} was not found for repository {repository_id}"
                )
            self._insert_facts(
                connection,
                repository_id=repository_id,
                analysis_run_id=analysis_run_id,
                snapshot_id=snapshot_id,
                facts=scan_result.facts,
            )
            connection.execute(
                """
                UPDATE repositories
                SET status = 'analyzed', updated_at = CURRENT_TIMESTAMP
                WHERE id = ?
                """,
                (repository_id,),
            )
        return self.get_analysis_run(repository_id, analysis_run_id)

    def fail_analysis_run(
        self,
        repository_id: int,
        analysis_run_id: int,
        error_message: str,
    ) -> AnalysisRun:
        """Mark a run as ``failed`` and the repository as ``analysis_failed``.

        Raises:
            NotFoundError: if the run does not exist for this repository; the
                repository status is left untouched in that case.
        """
        with self._session() as connection:
            cursor = connection.execute(
                """
                UPDATE analysis_runs
                SET status = 'failed',
                    completed_at = CURRENT_TIMESTAMP,
                    error_message = ?
                WHERE id = ? AND repository_id = ?
                """,
                (error_message, analysis_run_id, repository_id),
            )
            # Check before touching the repository so an unknown run can
            # never flip the repository status.
            if cursor.rowcount == 0:
                raise NotFoundError(
                    f"analysis run {analysis_run_id} was not found for repository {repository_id}"
                )
            connection.execute(
                """
                UPDATE repositories
                SET status = 'analysis_failed', updated_at = CURRENT_TIMESTAMP
                WHERE id = ?
                """,
                (repository_id,),
            )
        return self.get_analysis_run(repository_id, analysis_run_id)

    def get_analysis_run(self, repository_id: int, analysis_run_id: int) -> AnalysisRun:
        """Return one analysis run, scoped to its repository.

        Raises:
            NotFoundError: if no such run exists for ``repository_id``.
        """
        with self._session() as connection:
            row = connection.execute(
                """
                SELECT id, repository_id, snapshot_id, status, started_at,
                       completed_at, error_message, scanner_version
                FROM analysis_runs
                WHERE id = ? AND repository_id = ?
                """,
                (analysis_run_id, repository_id),
            ).fetchone()
        if row is None:
            raise NotFoundError(
                f"analysis run {analysis_run_id} was not found for repository {repository_id}"
            )
        return self._analysis_run_from_row(row)

    def list_analysis_runs(self, repository_id: int) -> list[AnalysisRun]:
        """Return a repository's runs, most recently started first.

        Raises:
            NotFoundError: if the repository does not exist.
        """
        self.get_repository(repository_id)
        with self._session() as connection:
            rows = connection.execute(
                """
                SELECT id, repository_id, snapshot_id, status, started_at,
                       completed_at, error_message, scanner_version
                FROM analysis_runs
                WHERE repository_id = ?
                ORDER BY started_at DESC, id DESC
                """,
                (repository_id,),
            ).fetchall()
        return [self._analysis_run_from_row(row) for row in rows]

    def get_snapshot(self, snapshot_id: int) -> RepositorySnapshot:
        """Return one repository snapshot by id.

        Raises:
            NotFoundError: if no snapshot has ``snapshot_id``.
        """
        with self._session() as connection:
            row = connection.execute(
                """
                SELECT id, repository_id, commit_hash, branch, source_path, file_count
                FROM repository_snapshots
                WHERE id = ?
                """,
                (snapshot_id,),
            ).fetchone()
        if row is None:
            raise NotFoundError(f"snapshot {snapshot_id} was not found")
        return self._snapshot_from_row(row)

    def list_observed_facts(
        self,
        repository_id: int,
        analysis_run_id: int | None = None,
    ) -> list[ObservedFact]:
        """Return a repository's observed facts, optionally for a single run.

        Results are ordered by kind, path, name and id.

        Raises:
            NotFoundError: if the repository does not exist.
        """
        self.get_repository(repository_id)
        # Only fixed SQL fragments are interpolated; values stay parameters.
        conditions = ["repository_id = ?"]
        params: list[int] = [repository_id]
        if analysis_run_id is not None:
            conditions.append("analysis_run_id = ?")
            params.append(analysis_run_id)
        where = "WHERE " + " AND ".join(conditions)

        with self._session() as connection:
            rows = connection.execute(
                f"""
                SELECT id, repository_id, analysis_run_id, snapshot_id, kind,
                       path, name, value, metadata
                FROM observed_facts
                {where}
                ORDER BY kind ASC, path ASC, name ASC, id ASC
                """,
                tuple(params),
            ).fetchall()
        return [self._observed_fact_from_row(row) for row in rows]

    def create_ability(
        self,
        repository_id: int,
        *,
        name: str,
        description: str,
        confidence: float,
    ) -> int:
        """Insert an approved ability and return its new row id."""
        with self._session() as connection:
            cursor = connection.execute(
                """
                INSERT INTO approved_abilities
                    (repository_id, name, description, confidence)
                VALUES (?, ?, ?, ?)
                """,
                (repository_id, name, description, confidence),
            )
            return int(cursor.lastrowid)

    def ensure_ability(self, repository_id: int, ability_id: int) -> None:
        """Check that an ability exists for the given repository.

        Raises:
            NotFoundError: if the ability is missing or belongs elsewhere.
        """
        with self._session() as connection:
            row = connection.execute(
                """
                SELECT id FROM approved_abilities
                WHERE id = ? AND repository_id = ?
                """,
                (ability_id, repository_id),
            ).fetchone()
        if row is None:
            raise NotFoundError(
                f"ability {ability_id} was not found for repository {repository_id}"
            )

    def create_capability(
        self,
        repository_id: int,
        ability_id: int,
        *,
        name: str,
        description: str,
        inputs: list[str],
        outputs: list[str],
        confidence: float,
    ) -> int:
        """Insert an approved capability and return its new row id.

        ``inputs`` and ``outputs`` are stored as JSON text.
        """
        with self._session() as connection:
            cursor = connection.execute(
                """
                INSERT INTO approved_capabilities
                    (repository_id, ability_id, name, description, inputs, outputs, confidence)
                VALUES (?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    repository_id,
                    ability_id,
                    name,
                    description,
                    json.dumps(inputs),
                    json.dumps(outputs),
                    confidence,
                ),
            )
            return int(cursor.lastrowid)

    def ensure_capability(self, repository_id: int, capability_id: int) -> None:
        """Check that a capability exists for the given repository.

        Raises:
            NotFoundError: if the capability is missing or belongs elsewhere.
        """
        with self._session() as connection:
            row = connection.execute(
                """
                SELECT id FROM approved_capabilities
                WHERE id = ? AND repository_id = ?
                """,
                (capability_id, repository_id),
            ).fetchone()
        if row is None:
            raise NotFoundError(
                f"capability {capability_id} was not found for repository {repository_id}"
            )

    def create_feature(
        self,
        repository_id: int,
        capability_id: int,
        *,
        name: str,
        type: str,
        location: str,
        confidence: float,
    ) -> int:
        """Insert an approved feature and return its new row id."""
        # ``type`` shadows the builtin but is part of the keyword interface.
        with self._session() as connection:
            cursor = connection.execute(
                """
                INSERT INTO approved_features
                    (repository_id, capability_id, name, type, location, confidence)
                VALUES (?, ?, ?, ?, ?, ?)
                """,
                (repository_id, capability_id, name, type, location, confidence),
            )
            return int(cursor.lastrowid)

    def create_evidence(
        self,
        repository_id: int,
        capability_id: int,
        *,
        type: str,
        reference: str,
        strength: str,
    ) -> int:
        """Insert an approved evidence record and return its new row id."""
        # ``type`` shadows the builtin but is part of the keyword interface.
        with self._session() as connection:
            cursor = connection.execute(
                """
                INSERT INTO approved_evidence
                    (repository_id, capability_id, type, reference, strength)
                VALUES (?, ?, ?, ?, ?)
                """,
                (repository_id, capability_id, type, reference, strength),
            )
            return int(cursor.lastrowid)

    def get_ability_map(self, repository_id: int) -> RepositoryAbilityMap:
        """Assemble the repository's abilities with nested capabilities.

        Each capability carries its features and evidence; every level is
        ordered by row id.

        Raises:
            NotFoundError: if the repository does not exist.
        """
        repository = self.get_repository(repository_id)
        with self._session() as connection:
            ability_rows = connection.execute(
                """
                SELECT id, name, description, confidence
                FROM approved_abilities
                WHERE repository_id = ?
                ORDER BY id
                """,
                (repository_id,),
            ).fetchall()
            capability_rows = connection.execute(
                """
                SELECT id, ability_id, name, description, inputs, outputs, confidence
                FROM approved_capabilities
                WHERE repository_id = ?
                ORDER BY id
                """,
                (repository_id,),
            ).fetchall()
            feature_rows = connection.execute(
                """
                SELECT id, capability_id, name, type, location, confidence
                FROM approved_features
                WHERE repository_id = ?
                ORDER BY id
                """,
                (repository_id,),
            ).fetchall()
            evidence_rows = connection.execute(
                """
                SELECT id, capability_id, type, reference, strength
                FROM approved_evidence
                WHERE repository_id = ?
                ORDER BY id
                """,
                (repository_id,),
            ).fetchall()

        # Group children by parent id, leaves first, so each parent can be
        # built with its children already attached.
        features_by_capability: dict[int, list[Feature]] = {}
        for row in feature_rows:
            features_by_capability.setdefault(row["capability_id"], []).append(
                Feature(
                    id=row["id"],
                    name=row["name"],
                    type=row["type"],
                    location=row["location"],
                    confidence=row["confidence"],
                )
            )

        evidence_by_capability: dict[int, list[Evidence]] = {}
        for row in evidence_rows:
            evidence_by_capability.setdefault(row["capability_id"], []).append(
                Evidence(
                    id=row["id"],
                    type=row["type"],
                    reference=row["reference"],
                    strength=row["strength"],
                )
            )

        capabilities_by_ability: dict[int, list[Capability]] = {}
        for row in capability_rows:
            capabilities_by_ability.setdefault(row["ability_id"], []).append(
                Capability(
                    id=row["id"],
                    name=row["name"],
                    description=row["description"],
                    inputs=json.loads(row["inputs"]),
                    outputs=json.loads(row["outputs"]),
                    confidence=row["confidence"],
                    features=features_by_capability.get(row["id"], []),
                    evidence=evidence_by_capability.get(row["id"], []),
                )
            )

        abilities = [
            Ability(
                id=row["id"],
                name=row["name"],
                description=row["description"],
                confidence=row["confidence"],
                capabilities=capabilities_by_ability.get(row["id"], []),
            )
            for row in ability_rows
        ]
        return RepositoryAbilityMap(repository=repository, abilities=abilities)

    def search(self, query: str) -> list[SearchResult]:
        """Find repositories, abilities and capabilities containing ``query``.

        The stripped query is matched as a literal substring of names and
        descriptions; ``%`` and ``_`` in the query are not wildcards. A blank
        query returns no results. Results are ordered by confidence
        (repository matches count as 1.0), then repository name, then match
        name.
        """
        term = query.strip()
        if not term:
            return []
        needle = self._like_pattern(term)

        with self._session() as connection:
            rows = connection.execute(
                """
                SELECT r.id AS repository_id, r.name AS repository_name,
                       'repository' AS match_type, r.name AS match_name,
                       1.0 AS confidence
                FROM repositories r
                WHERE r.name LIKE ? ESCAPE '\\'
                   OR COALESCE(r.description, '') LIKE ? ESCAPE '\\'
                UNION ALL
                SELECT r.id, r.name, 'ability', a.name, a.confidence
                FROM approved_abilities a
                JOIN repositories r ON r.id = a.repository_id
                WHERE a.name LIKE ? ESCAPE '\\'
                   OR a.description LIKE ? ESCAPE '\\'
                UNION ALL
                SELECT r.id, r.name, 'capability', c.name, c.confidence
                FROM approved_capabilities c
                JOIN repositories r ON r.id = c.repository_id
                WHERE c.name LIKE ? ESCAPE '\\'
                   OR c.description LIKE ? ESCAPE '\\'
                ORDER BY confidence DESC, repository_name ASC, match_name ASC
                """,
                (needle,) * 6,
            ).fetchall()

        return [
            SearchResult(
                repository_id=row["repository_id"],
                repository_name=row["repository_name"],
                match_type=row["match_type"],
                match_name=row["match_name"],
                confidence=row["confidence"],
            )
            for row in rows
        ]

    @staticmethod
    def _like_pattern(term: str) -> str:
        """Build a "contains ``term``" LIKE pattern for use with ``ESCAPE '\\'``."""
        # Escape the backslash first so the escapes added for % and _ are
        # not themselves doubled.
        escaped = term.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        return f"%{escaped}%"

    def _insert_facts(
        self,
        connection: sqlite3.Connection,
        *,
        repository_id: int,
        analysis_run_id: int,
        snapshot_id: int,
        facts: list[FactCandidate],
    ) -> None:
        """Bulk-insert scanned facts on the caller's connection.

        Runs inside the caller's transaction; each fact's metadata is stored
        as JSON text.
        """
        connection.executemany(
            """
            INSERT INTO observed_facts
                (repository_id, analysis_run_id, snapshot_id, kind, path, name, value, metadata)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (
                    repository_id,
                    analysis_run_id,
                    snapshot_id,
                    fact.kind,
                    fact.path,
                    fact.name,
                    fact.value,
                    json.dumps(fact.metadata),
                )
                for fact in facts
            ],
        )

    @staticmethod
    def _repository_from_row(row: sqlite3.Row) -> Repository:
        """Map a ``repositories`` row to a ``Repository``."""
        return Repository(
            id=row["id"],
            name=row["name"],
            url=row["url"],
            description=row["description"],
            branch=row["branch"],
            status=row["status"],
        )

    @staticmethod
    def _snapshot_from_row(row: sqlite3.Row) -> RepositorySnapshot:
        """Map a ``repository_snapshots`` row to a ``RepositorySnapshot``."""
        return RepositorySnapshot(
            id=row["id"],
            repository_id=row["repository_id"],
            commit_hash=row["commit_hash"],
            branch=row["branch"],
            source_path=row["source_path"],
            file_count=row["file_count"],
        )

    @staticmethod
    def _analysis_run_from_row(row: sqlite3.Row) -> AnalysisRun:
        """Map an ``analysis_runs`` row to an ``AnalysisRun``."""
        return AnalysisRun(
            id=row["id"],
            repository_id=row["repository_id"],
            snapshot_id=row["snapshot_id"],
            status=row["status"],
            started_at=row["started_at"],
            completed_at=row["completed_at"],
            error_message=row["error_message"],
            scanner_version=row["scanner_version"],
        )

    @staticmethod
    def _observed_fact_from_row(row: sqlite3.Row) -> ObservedFact:
        """Map an ``observed_facts`` row to an ``ObservedFact``, decoding its JSON metadata."""
        return ObservedFact(
            id=row["id"],
            repository_id=row["repository_id"],
            analysis_run_id=row["analysis_run_id"],
            snapshot_id=row["snapshot_id"],
            kind=row["kind"],
            path=row["path"],
            name=row["name"],
            value=row["value"],
            metadata=json.loads(row["metadata"]),
        )
|