first content-indexing slice

This commit is contained in:
2026-04-26 02:47:10 +02:00
parent 9cd700b215
commit 6416139176
12 changed files with 404 additions and 0 deletions

View File

@@ -43,6 +43,19 @@ CREATE TABLE IF NOT EXISTS observed_facts (
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Text chunks extracted from files referenced by observed facts, stored per
-- analysis run. start_line/end_line are the 1-based, inclusive line range of
-- the chunk within path; snapshot_id is nullable. Every foreign key is
-- declared ON DELETE CASCADE, so chunks go away with their repository,
-- analysis run, or snapshot.
CREATE TABLE IF NOT EXISTS content_chunks (
id INTEGER PRIMARY KEY AUTOINCREMENT,
repository_id INTEGER NOT NULL REFERENCES repositories(id) ON DELETE CASCADE,
analysis_run_id INTEGER NOT NULL REFERENCES analysis_runs(id) ON DELETE CASCADE,
snapshot_id INTEGER REFERENCES repository_snapshots(id) ON DELETE CASCADE,
path TEXT NOT NULL,
kind TEXT NOT NULL,
start_line INTEGER NOT NULL,
end_line INTEGER NOT NULL,
text TEXT NOT NULL,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS candidate_abilities (
id INTEGER PRIMARY KEY AUTOINCREMENT,
repository_id INTEGER NOT NULL REFERENCES repositories(id) ON DELETE CASCADE,
@@ -155,6 +168,8 @@ CREATE INDEX IF NOT EXISTS idx_snapshots_repository ON repository_snapshots(repo
-- Single-column lookup indexes on the repository_id / analysis_run_id
-- columns of each table, created only when missing.
CREATE INDEX IF NOT EXISTS idx_analysis_runs_repository ON analysis_runs(repository_id);
CREATE INDEX IF NOT EXISTS idx_observed_facts_repository ON observed_facts(repository_id);
CREATE INDEX IF NOT EXISTS idx_observed_facts_run ON observed_facts(analysis_run_id);
CREATE INDEX IF NOT EXISTS idx_content_chunks_repository ON content_chunks(repository_id);
CREATE INDEX IF NOT EXISTS idx_content_chunks_run ON content_chunks(analysis_run_id);
CREATE INDEX IF NOT EXISTS idx_candidate_abilities_repository ON candidate_abilities(repository_id);
CREATE INDEX IF NOT EXISTS idx_candidate_capabilities_repository ON candidate_capabilities(repository_id);
CREATE INDEX IF NOT EXISTS idx_candidate_features_repository ON candidate_features(repository_id);

View File

@@ -0,0 +1,3 @@
"""Public names of the content-indexing package."""
from repo_registry.content_indexing.extractor import (
    ContentChunkCandidate,
    ContentExtractor,
)

# Names re-exported from the extractor module.
__all__ = [
    "ContentChunkCandidate",
    "ContentExtractor",
]

View File

@@ -0,0 +1,109 @@
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from repo_registry.core.models import ObservedFact
# Observed-fact kinds whose files are chunked; facts of any other kind are
# skipped by ContentExtractor.extract.
INDEXED_FACT_KINDS = {"documentation", "example", "test", "manifest", "interface"}
# Number of lines per chunk when a whole file is split into consecutive chunks.
MAX_CHUNK_LINES = 40
# Files whose size exceeds this many bytes are not read and yield no chunks.
MAX_FILE_BYTES = 200_000
@dataclass(frozen=True)
class ContentChunkCandidate:
    """Immutable value object describing one extracted chunk of file text.

    Instances are produced by ``ContentExtractor`` and carry no database
    identity.
    """

    # File path relative to the extraction root, in POSIX form.
    path: str
    # Kind of the observed fact the chunk was extracted for.
    kind: str
    # First line of the chunk (1-based).
    start_line: int
    # Last line of the chunk (inclusive).
    end_line: int
    # Chunk text with surrounding whitespace stripped.
    text: str
class ContentExtractor:
    """Extract deterministic text chunks from source-linked observed facts.

    Facts whose kind is in ``INDEXED_FACT_KINDS`` and that carry a path are
    resolved against the extraction root. A fact with a usable ``line`` entry
    in its metadata yields a single window of lines around that line; every
    other fact yields the whole file split into ``MAX_CHUNK_LINES``-line
    chunks.
    """

    # Lines of context captured before and after a fact's anchor line.
    CONTEXT_LINES_BEFORE = 5
    CONTEXT_LINES_AFTER = 10

    def extract(
        self,
        source_path: str | Path,
        facts: list[ObservedFact],
    ) -> list[ContentChunkCandidate]:
        """Return the de-duplicated chunks extracted for ``facts``.

        Args:
            source_path: Directory that fact paths are relative to.
            facts: Observed facts to extract text for. Facts with an
                unindexed kind, an empty path, a path resolving outside
                ``source_path``, or a path that is not a regular file are
                skipped.

        Returns:
            Chunks sorted by ``(path, start_line, kind)``. Chunks sharing the
            same path, kind and line range are emitted once.
        """
        root = Path(source_path).expanduser().resolve()
        chunks: list[ContentChunkCandidate] = []
        seen: set[tuple[str, str, int, int]] = set()
        for fact in facts:
            if fact.kind not in INDEXED_FACT_KINDS or not fact.path:
                continue
            # Resolve before the containment check so "../" segments and
            # symlinks cannot lead outside the root.
            path = (root / fact.path).resolve()
            if not self._is_within(root, path) or not path.is_file():
                continue
            for chunk in self._chunks_for_fact(path, root, fact):
                key = (chunk.path, chunk.kind, chunk.start_line, chunk.end_line)
                if key in seen:
                    continue
                seen.add(key)
                chunks.append(chunk)
        return sorted(chunks, key=lambda chunk: (chunk.path, chunk.start_line, chunk.kind))

    def _chunks_for_fact(
        self,
        path: Path,
        root: Path,
        fact: ObservedFact,
    ) -> list[ContentChunkCandidate]:
        """Return the chunks of ``path`` relevant to ``fact``.

        Returns an empty list when the file is larger than ``MAX_FILE_BYTES``,
        cannot be read, or has no lines.
        """
        try:
            if path.stat().st_size > MAX_FILE_BYTES:
                return []
            # Undecodable bytes are dropped rather than failing the scan.
            lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
        except OSError:
            return []
        if not lines:
            return []
        window = self._window_for_line(fact.metadata.get("line"), len(lines))
        if window is not None:
            start_line, end_line = window
            return [self._chunk(path, root, fact.kind, lines, start_line, end_line)]
        # No usable line hint: index the whole file in fixed-size chunks.
        return [
            self._chunk(
                path,
                root,
                fact.kind,
                lines,
                start_index + 1,
                min(len(lines), start_index + MAX_CHUNK_LINES),
            )
            for start_index in range(0, len(lines), MAX_CHUNK_LINES)
        ]

    @classmethod
    def _window_for_line(cls, line: object, line_count: int) -> tuple[int, int] | None:
        """Return the 1-based inclusive line window around ``line``, or None.

        None means the hint is unusable: it is not an integer (booleans are
        rejected even though ``bool`` subclasses ``int``) or it does not point
        at a line inside ``1..line_count``. Rejecting such hints avoids
        windows whose start lies past their end or whose end is negative.
        """
        if isinstance(line, bool) or not isinstance(line, int):
            return None
        if not 1 <= line <= line_count:
            return None
        start_line = max(1, line - cls.CONTEXT_LINES_BEFORE)
        end_line = min(line_count, line + cls.CONTEXT_LINES_AFTER)
        return start_line, end_line

    def _chunk(
        self,
        path: Path,
        root: Path,
        kind: str,
        lines: list[str],
        start_line: int,
        end_line: int,
    ) -> ContentChunkCandidate:
        """Build the candidate for lines ``start_line..end_line`` (1-based, inclusive)."""
        return ContentChunkCandidate(
            path=path.relative_to(root).as_posix(),
            kind=kind,
            start_line=start_line,
            end_line=end_line,
            text="\n".join(lines[start_line - 1 : end_line]).strip(),
        )

    def _is_within(self, root: Path, path: Path) -> bool:
        """Return True when ``path`` is ``root`` itself or located beneath it."""
        try:
            path.relative_to(root)
        except ValueError:
            return False
        return True

View File

@@ -59,6 +59,19 @@ class ObservedFact:
metadata: dict[str, Any]
@dataclass(frozen=True)
class ContentChunk:
    """Immutable record of a content chunk together with its identifiers."""

    # Identifier of the chunk row itself.
    id: int
    # Identifiers of the repository and analysis run the chunk belongs to.
    repository_id: int
    analysis_run_id: int
    # Snapshot identifier; may be None.
    snapshot_id: int | None
    # Location of the chunk: file path, fact kind, and line range.
    path: str
    kind: str
    start_line: int
    end_line: int
    # The chunk's text.
    text: str
@dataclass(frozen=True)
class ScanSummary:
analysis_run: AnalysisRun

View File

@@ -7,6 +7,7 @@ from repo_registry.core.models import (
AnalysisRun,
CapabilitySummary,
CandidateGraph,
ContentChunk,
ObservedFact,
Repository,
RepositoryAbilityMap,
@@ -15,6 +16,7 @@ from repo_registry.core.models import (
SearchResult,
)
from repo_registry.candidate_graph.generator import CandidateGraphGenerator
from repo_registry.content_indexing.extractor import ContentExtractor
from repo_registry.repo_ingestion.git import GitIngestionService
from repo_registry.repo_ingestion.metadata import RepositoryMetadataExtractor
from repo_registry.repo_scanning.scanner import DeterministicScanner
@@ -34,6 +36,7 @@ class RegistryService:
self.ingestion = ingestion or GitIngestionService()
self.metadata_extractor = RepositoryMetadataExtractor()
self.candidate_generator = CandidateGraphGenerator()
self.content_extractor = ContentExtractor()
def register_repository(
self,
@@ -111,6 +114,13 @@ class RegistryService:
else None
)
facts = self.store.list_observed_facts(repository_id, completed_run.id)
chunks = self.content_extractor.extract(scan_result.source_path, facts)
self.store.replace_content_chunks(
repository_id,
completed_run.id,
completed_run.snapshot_id,
chunks,
)
candidates = self.candidate_generator.generate(repository, facts)
self.store.replace_candidate_graph(repository_id, completed_run.id, candidates)
return ScanSummary(
@@ -145,6 +155,13 @@ class RegistryService:
) -> list[ObservedFact]:
return self.store.list_observed_facts(repository_id, analysis_run_id)
def list_content_chunks(
    self,
    repository_id: int,
    analysis_run_id: int | None = None,
) -> list[ContentChunk]:
    """Return the stored content chunks of a repository.

    The call is forwarded unchanged to the store; passing ``analysis_run_id``
    narrows the result to that run.
    """
    stored_chunks = self.store.list_content_chunks(repository_id, analysis_run_id)
    return stored_chunks
def candidate_graph(self, repository_id: int, analysis_run_id: int) -> CandidateGraph:
return self.store.get_candidate_graph(repository_id, analysis_run_id)

View File

@@ -15,6 +15,7 @@ from repo_registry.core.models import (
CandidateGraph,
Capability,
CapabilitySummary,
ContentChunk,
Evidence,
Feature,
ObservedFact,
@@ -25,6 +26,7 @@ from repo_registry.core.models import (
SearchResult,
SourceReference,
)
from repo_registry.content_indexing.extractor import ContentChunkCandidate
from repo_registry.candidate_graph.generator import CandidateAbilityDraft
from repo_registry.repo_scanning.scanner import FactCandidate, ScanResult
@@ -41,6 +43,7 @@ class RegistryStore:
migration_path = Path(__file__).parents[3] / "migrations" / "0001_initial.sql"
with self.connect() as connection:
connection.executescript(migration_path.read_text(encoding="utf-8"))
self._ensure_content_chunks_table(connection)
self._ensure_approved_source_ref_columns(connection)
def connect(self) -> sqlite3.Connection:
@@ -63,6 +66,30 @@ class RegistryStore:
f"ALTER TABLE {table} ADD COLUMN source_refs TEXT NOT NULL DEFAULT '[]'"
)
def _ensure_content_chunks_table(self, connection: sqlite3.Connection) -> None:
connection.execute(
"""
CREATE TABLE IF NOT EXISTS content_chunks (
id INTEGER PRIMARY KEY AUTOINCREMENT,
repository_id INTEGER NOT NULL REFERENCES repositories(id) ON DELETE CASCADE,
analysis_run_id INTEGER NOT NULL REFERENCES analysis_runs(id) ON DELETE CASCADE,
snapshot_id INTEGER REFERENCES repository_snapshots(id) ON DELETE CASCADE,
path TEXT NOT NULL,
kind TEXT NOT NULL,
start_line INTEGER NOT NULL,
end_line INTEGER NOT NULL,
text TEXT NOT NULL,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
)
"""
)
connection.execute(
"CREATE INDEX IF NOT EXISTS idx_content_chunks_repository ON content_chunks(repository_id)"
)
connection.execute(
"CREATE INDEX IF NOT EXISTS idx_content_chunks_run ON content_chunks(analysis_run_id)"
)
def create_repository(
self,
*,
@@ -1163,6 +1190,65 @@ class RegistryStore:
).fetchall()
return [self._observed_fact_from_row(row) for row in rows]
def replace_content_chunks(
    self,
    repository_id: int,
    analysis_run_id: int,
    snapshot_id: int | None,
    chunks: list[ContentChunkCandidate],
) -> None:
    """Replace every stored chunk of an analysis run with ``chunks``.

    Existing rows for ``analysis_run_id`` are deleted first, then one row is
    inserted per candidate, all on the same connection.
    """
    delete_sql = "DELETE FROM content_chunks WHERE analysis_run_id = ?"
    insert_sql = """
INSERT INTO content_chunks
(repository_id, analysis_run_id, snapshot_id, path, kind,
start_line, end_line, text)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
"""
    with self.connect() as connection:
        connection.execute(delete_sql, (analysis_run_id,))
        # The three identifiers are shared by every inserted row.
        owner = (repository_id, analysis_run_id, snapshot_id)
        rows = []
        for candidate in chunks:
            rows.append(
                owner
                + (
                    candidate.path,
                    candidate.kind,
                    candidate.start_line,
                    candidate.end_line,
                    candidate.text,
                )
            )
        connection.executemany(insert_sql, rows)
def list_content_chunks(
    self,
    repository_id: int,
    analysis_run_id: int | None = None,
) -> list[ContentChunk]:
    """Return a repository's chunks ordered by path, start line, then id.

    ``get_repository`` is called first and its result discarded; presumably
    it raises for an unknown repository (confirm against its definition).
    When ``analysis_run_id`` is given, only that run's chunks are returned.
    """
    self.get_repository(repository_id)
    conditions = ["repository_id = ?"]
    bindings = [repository_id]
    if analysis_run_id is not None:
        conditions.append("analysis_run_id = ?")
        bindings.append(analysis_run_id)
    # Only fixed SQL fragments are interpolated; values travel as parameters.
    where = "WHERE " + " AND ".join(conditions)
    query = f"""
SELECT id, repository_id, analysis_run_id, snapshot_id, path, kind,
start_line, end_line, text
FROM content_chunks
{where}
ORDER BY path ASC, start_line ASC, id ASC
"""
    with self.connect() as connection:
        rows = connection.execute(query, tuple(bindings)).fetchall()
    return [self._content_chunk_from_row(row) for row in rows]
def create_ability(
self,
repository_id: int,
@@ -1986,3 +2072,17 @@ class RegistryStore:
value=row["value"],
metadata=json.loads(row["metadata"]),
)
@staticmethod
def _content_chunk_from_row(row: sqlite3.Row) -> ContentChunk:
    """Build a ``ContentChunk`` from a row that exposes its columns by name."""
    columns = (
        "id",
        "repository_id",
        "analysis_run_id",
        "snapshot_id",
        "path",
        "kind",
        "start_line",
        "end_line",
        "text",
    )
    # Column names match the ContentChunk field names one-to-one.
    return ContentChunk(**{column: row[column] for column in columns})

View File

@@ -464,6 +464,36 @@ def list_observed_facts(
raise HTTPException(status_code=404, detail=str(exc)) from exc
@app.get("/repos/{repository_id}/content-chunks")
def list_content_chunks(
    repository_id: int,
    analysis_run_id: int | None = None,
    service: RegistryService = Depends(get_service),
) -> list[dict[str, object]]:
    """List a repository's content chunks as plain dictionaries.

    ``analysis_run_id`` is an optional filter. A ``NotFoundError`` from the
    service is reported as HTTP 404.
    """
    try:
        chunks = service.list_content_chunks(repository_id, analysis_run_id)
    except NotFoundError as exc:
        raise HTTPException(status_code=404, detail=str(exc)) from exc
    return [asdict(chunk) for chunk in chunks]
@app.get("/repos/{repository_id}/analysis-runs/{analysis_run_id}/content-chunks")
def list_analysis_run_content_chunks(
    repository_id: int,
    analysis_run_id: int,
    service: RegistryService = Depends(get_service),
) -> list[dict[str, object]]:
    """List the content chunks of one analysis run as plain dictionaries.

    A ``NotFoundError`` from the service is reported as HTTP 404.
    """
    try:
        run_chunks = service.list_content_chunks(repository_id, analysis_run_id)
    except NotFoundError as exc:
        raise HTTPException(status_code=404, detail=str(exc)) from exc
    return [asdict(chunk) for chunk in run_chunks]
@app.get("/repos/{repository_id}/analysis-runs/{analysis_run_id}/candidate-graph")
def get_candidate_graph(
repository_id: int,

View File

@@ -630,6 +630,7 @@ def analysis_run_detail(
repository = service.get_repository(repository_id)
candidate_graph = service.candidate_graph(repository_id, analysis_run_id)
facts = service.list_observed_facts(repository_id, analysis_run_id)
chunks = service.list_content_chunks(repository_id, analysis_run_id)
decisions = service.list_review_decisions(repository_id, analysis_run_id)
fact_rows = "\n".join(
f"""
@@ -667,6 +668,10 @@ def analysis_run_detail(
{render_review_decisions(decisions)}
</section>
</div>
<section class="panel" style="margin-top:18px">
<h2>Content Chunks</h2>
{render_content_chunks(chunks)}
</section>
"""
return page(f"{repository.name} Run {analysis_run_id}", body)
@@ -1065,6 +1070,27 @@ def render_review_decisions(decisions: list) -> str:
"""
def render_content_chunks(chunks: list) -> str:
    """Render content chunks as an HTML table.

    Returns a muted placeholder paragraph when ``chunks`` is empty. Kind, path
    and text are HTML-escaped, and only the first 500 characters of each
    chunk's text are shown.
    """
    if chunks:
        rendered = []
        for chunk in chunks:
            source = f"{escape(chunk.path)}:{chunk.start_line}-{chunk.end_line}"
            preview = escape(chunk.text[:500])
            rendered.append(
                f"""
<tr>
<td><span class="pill">{escape(chunk.kind)}</span></td>
<td class="source">{source}</td>
<td><pre>{preview}</pre></td>
</tr>
"""
            )
        rows = "\n".join(rendered)
        return f"""
<table>
<thead><tr><th>Kind</th><th>Source</th><th>Text</th></tr></thead>
<tbody>{rows}</tbody>
</table>
"""
    return '<p class="muted">No content chunks extracted.</p>'
def render_candidate_ability_actions(
ability: dict,
repository_id: int,

View File

@@ -0,0 +1,67 @@
from repo_registry.content_indexing.extractor import ContentExtractor
from repo_registry.core.models import ObservedFact
def fact(id, kind, name, path="", line=None):
    """Build an ``ObservedFact`` fixture with fixed owner ids.

    ``line`` is recorded under ``metadata["line"]`` only when it is given.
    """
    return ObservedFact(
        id=id,
        repository_id=1,
        analysis_run_id=1,
        snapshot_id=1,
        kind=kind,
        path=path,
        name=name,
        value="",
        metadata={} if line is None else {"line": line},
    )
def test_content_extractor_chunks_docs_and_interface_line_ranges(tmp_path):
    """A 45-line README splits into two chunks; a line-anchored fact yields one window."""
    repo = tmp_path / "repo"
    repo.mkdir()
    readme_text = "\n".join(f"readme line {number}" for number in range(1, 46))
    app_text = "\n".join(f"line {number}" for number in range(1, 21))
    (repo / "README.md").write_text(readme_text, encoding="utf-8")
    (repo / "app.py").write_text(app_text, encoding="utf-8")
    facts = [
        fact(1, "documentation", "README", "README.md"),
        fact(2, "interface", "python route decorator", "app.py", line=10),
    ]

    chunks = ContentExtractor().extract(repo, facts)

    readme_ranges = [
        (chunk.start_line, chunk.end_line)
        for chunk in chunks
        if chunk.path == "README.md"
    ]
    assert readme_ranges == [(1, 40), (41, 45)]

    interface_chunks = [chunk for chunk in chunks if chunk.path == "app.py"]
    assert len(interface_chunks) == 1
    window = interface_chunks[0]
    assert window.start_line == 5
    assert window.end_line == 20
    assert "line 10" in window.text
def test_content_extractor_ignores_unindexed_and_missing_paths(tmp_path):
    """Facts with an unindexed kind or a nonexistent file produce no chunks."""
    repo = tmp_path / "repo"
    repo.mkdir()
    (repo / "README.md").write_text("# ok\n", encoding="utf-8")
    skipped_facts = [
        fact(1, "language", "Python"),
        fact(2, "documentation", "missing", "missing.md"),
    ]

    chunks = ContentExtractor().extract(repo, skipped_facts)

    assert chunks == []

View File

@@ -343,6 +343,11 @@ def test_analyze_repository_records_snapshot_and_observed_facts(tmp_path):
assert ("documentation", "README", "README.md") in fact_names
assert ("framework", "FastAPI", "requirements.txt") in fact_names
assert ("interface", "python route decorator", "app.py") in fact_names
chunks = service.list_content_chunks(repository.id, summary.analysis_run.id)
chunk_sources = {(chunk.kind, chunk.path) for chunk in chunks}
assert ("documentation", "README.md") in chunk_sources
assert ("manifest", "requirements.txt") in chunk_sources
assert ("interface", "app.py") in chunk_sources
candidate_graph = service.candidate_graph(repository.id, summary.analysis_run.id)
assert candidate_graph.repository.name == "Example"

View File

@@ -25,9 +25,16 @@ def test_initialize_is_idempotent_and_applies_expected_columns(tmp_path):
evidence_columns = {
row[1] for row in connection.execute("PRAGMA table_info(approved_evidence)")
}
tables = {
row[0]
for row in connection.execute(
"SELECT name FROM sqlite_master WHERE type = 'table'"
)
}
assert "source_refs" in feature_columns
assert "source_refs" in evidence_columns
assert "content_chunks" in tables
def test_delete_repository_cascades_registry_and_review_rows(tmp_path):
@@ -72,6 +79,7 @@ def test_delete_repository_cascades_registry_and_review_rows(tmp_path):
"approved_features",
"approved_evidence",
"analysis_runs",
"content_chunks",
"review_decisions",
):
count = connection.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]

View File

@@ -320,6 +320,15 @@ def test_api_analysis_run_loop(tmp_path):
assert ("documentation", "README", "README.md") in fact_names
assert ("framework", "React", "package.json") in fact_names
assert ("framework", "Vite", "package.json") in fact_names
chunks_response = client.get(
f"/repos/{repository_id}/analysis-runs/"
f"{run['analysis_run']['id']}/content-chunks"
)
assert chunks_response.status_code == 200
assert {
(chunk["kind"], chunk["path"]) for chunk in chunks_response.json()
} >= {("documentation", "README.md"), ("manifest", "package.json")}
finally:
app.dependency_overrides.clear()
@@ -392,6 +401,8 @@ def test_ui_register_analyze_and_approve_loop(tmp_path):
run_detail = client.get(run_path)
assert run_detail.status_code == 200
assert "Candidate Graph" in run_detail.text
assert "Content Chunks" in run_detail.text
assert "README.md:1-1" in run_detail.text
assert "ID " in run_detail.text
assert "No review decisions yet." in run_detail.text