first content-indexing slice

This commit is contained in:
2026-04-26 02:47:10 +02:00
parent 9cd700b215
commit 6416139176
12 changed files with 404 additions and 0 deletions

View File

@@ -15,6 +15,7 @@ from repo_registry.core.models import (
CandidateGraph,
Capability,
CapabilitySummary,
ContentChunk,
Evidence,
Feature,
ObservedFact,
@@ -25,6 +26,7 @@ from repo_registry.core.models import (
SearchResult,
SourceReference,
)
from repo_registry.content_indexing.extractor import ContentChunkCandidate
from repo_registry.candidate_graph.generator import CandidateAbilityDraft
from repo_registry.repo_scanning.scanner import FactCandidate, ScanResult
@@ -41,6 +43,7 @@ class RegistryStore:
migration_path = Path(__file__).parents[3] / "migrations" / "0001_initial.sql"
with self.connect() as connection:
connection.executescript(migration_path.read_text(encoding="utf-8"))
self._ensure_content_chunks_table(connection)
self._ensure_approved_source_ref_columns(connection)
def connect(self) -> sqlite3.Connection:
@@ -63,6 +66,30 @@ class RegistryStore:
f"ALTER TABLE {table} ADD COLUMN source_refs TEXT NOT NULL DEFAULT '[]'"
)
def _ensure_content_chunks_table(self, connection: sqlite3.Connection) -> None:
    """Create the ``content_chunks`` table and its two indexes if missing.

    Every statement carries ``IF NOT EXISTS``, so calling this on a database
    that already holds the table or the indexes is a no-op for those objects.

    Args:
        connection: Open SQLite connection the DDL statements are executed on.
    """
    # One row per chunk: owning repository / analysis run / optional snapshot,
    # the file path, a kind label, the line span and the chunk text.
    create_table_sql = """
CREATE TABLE IF NOT EXISTS content_chunks (
id INTEGER PRIMARY KEY AUTOINCREMENT,
repository_id INTEGER NOT NULL REFERENCES repositories(id) ON DELETE CASCADE,
analysis_run_id INTEGER NOT NULL REFERENCES analysis_runs(id) ON DELETE CASCADE,
snapshot_id INTEGER REFERENCES repository_snapshots(id) ON DELETE CASCADE,
path TEXT NOT NULL,
kind TEXT NOT NULL,
start_line INTEGER NOT NULL,
end_line INTEGER NOT NULL,
text TEXT NOT NULL,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
)
"""
    # Secondary indexes on repository_id and analysis_run_id.
    ddl_statements = (
        create_table_sql,
        "CREATE INDEX IF NOT EXISTS idx_content_chunks_repository ON content_chunks(repository_id)",
        "CREATE INDEX IF NOT EXISTS idx_content_chunks_run ON content_chunks(analysis_run_id)",
    )
    # The table must exist before its indexes, so the order above matters.
    for statement in ddl_statements:
        connection.execute(statement)
def create_repository(
self,
*,
@@ -1163,6 +1190,65 @@ class RegistryStore:
).fetchall()
return [self._observed_fact_from_row(row) for row in rows]
def replace_content_chunks(
    self,
    repository_id: int,
    analysis_run_id: int,
    snapshot_id: int | None,
    chunks: list[ContentChunkCandidate],
) -> None:
    """Swap the stored content chunks of one analysis run for ``chunks``.

    Rows already recorded for ``analysis_run_id`` are deleted first (the
    delete filters on the run id only), then one row is inserted per
    candidate. Both steps run inside the same ``with self.connect()`` block.

    Args:
        repository_id: Stored in the ``repository_id`` column of every new row.
        analysis_run_id: Run whose existing rows are removed; also stored on
            every new row.
        snapshot_id: Stored on every new row; ``None`` is written as NULL.
        chunks: Candidates supplying ``path``, ``kind``, ``start_line``,
            ``end_line`` and ``text`` for each inserted row.
    """
    delete_sql = "DELETE FROM content_chunks WHERE analysis_run_id = ?"
    insert_sql = """
INSERT INTO content_chunks
(repository_id, analysis_run_id, snapshot_id, path, kind,
start_line, end_line, text)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
"""
    # These three values are identical for every inserted row.
    owner_values = (repository_id, analysis_run_id, snapshot_id)

    def as_row(candidate):
        # Column order must match the INSERT column list above.
        return owner_values + (
            candidate.path,
            candidate.kind,
            candidate.start_line,
            candidate.end_line,
            candidate.text,
        )

    with self.connect() as db:
        db.execute(delete_sql, (analysis_run_id,))
        new_rows = list(map(as_row, chunks))
        db.executemany(insert_sql, new_rows)
def list_content_chunks(
    self,
    repository_id: int,
    analysis_run_id: int | None = None,
) -> list[ContentChunk]:
    """Return the stored content chunks of a repository, optionally for one run.

    Args:
        repository_id: Repository whose chunks are listed.
        analysis_run_id: When given, only chunks recorded for this analysis
            run are returned; ``None`` returns chunks from every run.

    Returns:
        The matching chunks ordered by path, then start line, then row id.
    """
    # The result is discarded. NOTE(review): presumably this call exists to
    # raise for an unknown repository -- confirm against get_repository.
    self.get_repository(repository_id)

    # Only fixed SQL fragments are interpolated; values stay parameterized.
    params: tuple[int, ...]
    if analysis_run_id is None:
        where = "WHERE repository_id = ?"
        params = (repository_id,)
    else:
        where = "WHERE repository_id = ? AND analysis_run_id = ?"
        params = (repository_id, analysis_run_id)

    query = f"""
SELECT id, repository_id, analysis_run_id, snapshot_id, path, kind,
start_line, end_line, text
FROM content_chunks
{where}
ORDER BY path ASC, start_line ASC, id ASC
"""
    with self.connect() as db:
        fetched = db.execute(query, params).fetchall()
    return list(map(self._content_chunk_from_row, fetched))
def create_ability(
self,
repository_id: int,
@@ -1986,3 +2072,17 @@ class RegistryStore:
value=row["value"],
metadata=json.loads(row["metadata"]),
)
@staticmethod
def _content_chunk_from_row(row: sqlite3.Row) -> ContentChunk:
    """Build a ``ContentChunk`` from one ``content_chunks`` result row.

    Args:
        row: Row that exposes the columns ``id``, ``repository_id``,
            ``analysis_run_id``, ``snapshot_id``, ``path``, ``kind``,
            ``start_line``, ``end_line`` and ``text`` by name.

    Returns:
        A ``ContentChunk`` whose fields carry the same-named column values.
    """
    # Each column maps one-to-one onto the keyword argument of the same name.
    column_names = (
        "id",
        "repository_id",
        "analysis_run_id",
        "snapshot_id",
        "path",
        "kind",
        "start_line",
        "end_line",
        "text",
    )
    field_values = {name: row[name] for name in column_names}
    return ContentChunk(**field_values)