diff --git a/migrations/0001_initial.sql b/migrations/0001_initial.sql
index a1553be..56df20e 100644
--- a/migrations/0001_initial.sql
+++ b/migrations/0001_initial.sql
@@ -43,6 +43,19 @@ CREATE TABLE IF NOT EXISTS observed_facts (
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
+CREATE TABLE IF NOT EXISTS content_chunks ( -- text excerpts of repository files, one row per chunk
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ repository_id INTEGER NOT NULL REFERENCES repositories(id) ON DELETE CASCADE,
+ analysis_run_id INTEGER NOT NULL REFERENCES analysis_runs(id) ON DELETE CASCADE,
+ snapshot_id INTEGER REFERENCES repository_snapshots(id) ON DELETE CASCADE, -- nullable
+ path TEXT NOT NULL, -- POSIX path relative to the scanned source root
+ kind TEXT NOT NULL, -- kind of the observed fact the chunk was extracted for
+ start_line INTEGER NOT NULL, -- first line of the excerpt, 1-based
+ end_line INTEGER NOT NULL, -- last line of the excerpt, inclusive
+ text TEXT NOT NULL, -- excerpt lines joined with newlines, outer whitespace stripped
+ created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+
CREATE TABLE IF NOT EXISTS candidate_abilities (
id INTEGER PRIMARY KEY AUTOINCREMENT,
repository_id INTEGER NOT NULL REFERENCES repositories(id) ON DELETE CASCADE,
@@ -155,6 +168,8 @@ CREATE INDEX IF NOT EXISTS idx_snapshots_repository ON repository_snapshots(repo
CREATE INDEX IF NOT EXISTS idx_analysis_runs_repository ON analysis_runs(repository_id);
CREATE INDEX IF NOT EXISTS idx_observed_facts_repository ON observed_facts(repository_id);
CREATE INDEX IF NOT EXISTS idx_observed_facts_run ON observed_facts(analysis_run_id);
+CREATE INDEX IF NOT EXISTS idx_content_chunks_repository ON content_chunks(repository_id);
+CREATE INDEX IF NOT EXISTS idx_content_chunks_run ON content_chunks(analysis_run_id);
CREATE INDEX IF NOT EXISTS idx_candidate_abilities_repository ON candidate_abilities(repository_id);
CREATE INDEX IF NOT EXISTS idx_candidate_capabilities_repository ON candidate_capabilities(repository_id);
CREATE INDEX IF NOT EXISTS idx_candidate_features_repository ON candidate_features(repository_id);
diff --git a/src/repo_registry/content_indexing/__init__.py b/src/repo_registry/content_indexing/__init__.py
new file mode 100644
index 0000000..2578378
--- /dev/null
+++ b/src/repo_registry/content_indexing/__init__.py
@@ -0,0 +1,3 @@
+from repo_registry.content_indexing.extractor import ContentChunkCandidate, ContentExtractor
+
+__all__ = ["ContentChunkCandidate", "ContentExtractor"]
diff --git a/src/repo_registry/content_indexing/extractor.py b/src/repo_registry/content_indexing/extractor.py
new file mode 100644
index 0000000..eec4fdf
--- /dev/null
+++ b/src/repo_registry/content_indexing/extractor.py
@@ -0,0 +1,109 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+from repo_registry.core.models import ObservedFact
+
+
+INDEXED_FACT_KINDS = {"documentation", "example", "test", "manifest", "interface"}  # other kinds are skipped
+MAX_CHUNK_LINES = 40  # lines per chunk when a whole file is split
+MAX_FILE_BYTES = 200_000  # files larger than this yield no chunks
+
+
+@dataclass(frozen=True)
+class ContentChunkCandidate:  # one excerpt produced by ContentExtractor
+ path: str  # POSIX path relative to the scanned source root
+ kind: str  # kind of the observed fact the excerpt was extracted for
+ start_line: int  # first line of the excerpt, 1-based
+ end_line: int  # last line of the excerpt, inclusive
+ text: str  # excerpt lines joined with newlines, outer whitespace stripped
+
+
+class ContentExtractor:
+    """Extract deterministic text chunks from source-linked observed facts."""
+
+    LINES_BEFORE_ANCHOR = 5  # context kept above a fact's anchor line
+    LINES_AFTER_ANCHOR = 10  # context kept below a fact's anchor line
+
+    def extract(
+        self,
+        source_path: str | Path,
+        facts: list[ObservedFact],
+    ) -> list[ContentChunkCandidate]:
+        """Return de-duplicated chunks for every indexable fact, sorted stably.
+
+        Facts are skipped when their kind is not indexed, they carry no path,
+        or the path leaves ``source_path`` or is not a regular file.
+        """
+        root = Path(source_path).expanduser().resolve()
+        unique: dict[tuple[str, str, int, int], ContentChunkCandidate] = {}
+        for fact in facts:
+            if fact.kind not in INDEXED_FACT_KINDS or not fact.path:
+                continue
+            path = (root / fact.path).resolve()
+            if not self._is_within(root, path) or not path.is_file():
+                continue
+            for chunk in self._chunks_for_fact(path, root, fact):
+                key = (chunk.path, chunk.kind, chunk.start_line, chunk.end_line)
+                unique.setdefault(key, chunk)  # the first occurrence wins
+        # end_line breaks ties so the order never depends on the order of facts.
+        return sorted(unique.values(), key=lambda c: (c.path, c.start_line, c.kind, c.end_line))
+
+    def _chunks_for_fact(
+        self,
+        path: Path,
+        root: Path,
+        fact: ObservedFact,
+    ) -> list[ContentChunkCandidate]:
+        """Return a window around the fact's ``line``, else the whole file in chunks.
+
+        Files that are unreadable, empty, or over ``MAX_FILE_BYTES`` yield ``[]``.
+        """
+        try:
+            if path.stat().st_size > MAX_FILE_BYTES:
+                return []
+            lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
+        except OSError:
+            return []
+        if not lines:
+            return []
+
+        total = len(lines)
+        line = fact.metadata.get("line")
+        # bool is an int subclass, but True/False are not line numbers.
+        if isinstance(line, int) and not isinstance(line, bool):
+            # Clamp stale anchors so the window is never inverted (start > end).
+            anchor = min(max(line, 1), total)
+            first = max(1, anchor - self.LINES_BEFORE_ANCHOR)
+            last = min(total, anchor + self.LINES_AFTER_ANCHOR)
+            return [self._chunk(path, root, fact.kind, lines, first, last)]
+
+        return [
+            self._chunk(
+                path, root, fact.kind, lines, start, min(total, start + MAX_CHUNK_LINES - 1)
+            )
+            for start in range(1, total + 1, MAX_CHUNK_LINES)
+        ]
+
+    def _chunk(
+        self,
+        path: Path,
+        root: Path,
+        kind: str,
+        lines: list[str],
+        start_line: int,
+        end_line: int,
+    ) -> ContentChunkCandidate:
+        """Build the candidate for the 1-based inclusive range ``start_line..end_line``."""
+        relative = path.relative_to(root).as_posix()
+        text = "\n".join(lines[start_line - 1 : end_line]).strip()
+        return ContentChunkCandidate(relative, kind, start_line, end_line, text)
+
+    def _is_within(self, root: Path, path: Path) -> bool:
+        """Report whether ``path`` is ``root`` itself or located beneath it."""
+        try:
+            path.relative_to(root)
+        except ValueError:
+            return False
+        return True
diff --git a/src/repo_registry/core/models.py b/src/repo_registry/core/models.py
index 99c6cd7..7143b5f 100644
--- a/src/repo_registry/core/models.py
+++ b/src/repo_registry/core/models.py
@@ -59,6 +59,19 @@ class ObservedFact:
metadata: dict[str, Any]
+@dataclass(frozen=True)
+class ContentChunk:  # one stored row of the content_chunks table
+ id: int
+ repository_id: int
+ analysis_run_id: int
+ snapshot_id: int | None  # the column is nullable
+ path: str
+ kind: str
+ start_line: int
+ end_line: int
+ text: str
+
+
@dataclass(frozen=True)
class ScanSummary:
analysis_run: AnalysisRun
diff --git a/src/repo_registry/core/service.py b/src/repo_registry/core/service.py
index 10ffd8e..79bfe0f 100644
--- a/src/repo_registry/core/service.py
+++ b/src/repo_registry/core/service.py
@@ -7,6 +7,7 @@ from repo_registry.core.models import (
AnalysisRun,
CapabilitySummary,
CandidateGraph,
+ ContentChunk,
ObservedFact,
Repository,
RepositoryAbilityMap,
@@ -15,6 +16,7 @@ from repo_registry.core.models import (
SearchResult,
)
from repo_registry.candidate_graph.generator import CandidateGraphGenerator
+from repo_registry.content_indexing.extractor import ContentExtractor
from repo_registry.repo_ingestion.git import GitIngestionService
from repo_registry.repo_ingestion.metadata import RepositoryMetadataExtractor
from repo_registry.repo_scanning.scanner import DeterministicScanner
@@ -34,6 +36,7 @@ class RegistryService:
self.ingestion = ingestion or GitIngestionService()
self.metadata_extractor = RepositoryMetadataExtractor()
self.candidate_generator = CandidateGraphGenerator()
+ self.content_extractor = ContentExtractor()
def register_repository(
self,
@@ -111,6 +114,13 @@ class RegistryService:
else None
)
facts = self.store.list_observed_facts(repository_id, completed_run.id)
+ chunks = self.content_extractor.extract(scan_result.source_path, facts)
+ self.store.replace_content_chunks(
+ repository_id,
+ completed_run.id,
+ completed_run.snapshot_id,
+ chunks,
+ )
candidates = self.candidate_generator.generate(repository, facts)
self.store.replace_candidate_graph(repository_id, completed_run.id, candidates)
return ScanSummary(
@@ -145,6 +155,13 @@ class RegistryService:
) -> list[ObservedFact]:
return self.store.list_observed_facts(repository_id, analysis_run_id)
+    def list_content_chunks(
+        self, repository_id: int, analysis_run_id: int | None = None
+    ) -> list[ContentChunk]:
+        """Return stored content chunks of a repository, optionally for one run."""
+        chunks = self.store.list_content_chunks(repository_id, analysis_run_id)
+        return chunks
+
def candidate_graph(self, repository_id: int, analysis_run_id: int) -> CandidateGraph:
return self.store.get_candidate_graph(repository_id, analysis_run_id)
diff --git a/src/repo_registry/storage/sqlite.py b/src/repo_registry/storage/sqlite.py
index cbfd694..dbe041a 100644
--- a/src/repo_registry/storage/sqlite.py
+++ b/src/repo_registry/storage/sqlite.py
@@ -15,6 +15,7 @@ from repo_registry.core.models import (
CandidateGraph,
Capability,
CapabilitySummary,
+ ContentChunk,
Evidence,
Feature,
ObservedFact,
@@ -25,6 +26,7 @@ from repo_registry.core.models import (
SearchResult,
SourceReference,
)
+from repo_registry.content_indexing.extractor import ContentChunkCandidate
from repo_registry.candidate_graph.generator import CandidateAbilityDraft
from repo_registry.repo_scanning.scanner import FactCandidate, ScanResult
@@ -41,6 +43,7 @@ class RegistryStore:
migration_path = Path(__file__).parents[3] / "migrations" / "0001_initial.sql"
with self.connect() as connection:
connection.executescript(migration_path.read_text(encoding="utf-8"))
+ self._ensure_content_chunks_table(connection)
self._ensure_approved_source_ref_columns(connection)
def connect(self) -> sqlite3.Connection:
@@ -63,6 +66,30 @@ class RegistryStore:
f"ALTER TABLE {table} ADD COLUMN source_refs TEXT NOT NULL DEFAULT '[]'"
)
+    def _ensure_content_chunks_table(self, connection: sqlite3.Connection) -> None:
+        """Create the content_chunks table and its two indexes if they are missing."""
+        ddl_statements = (
+            """
+            CREATE TABLE IF NOT EXISTS content_chunks (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                repository_id INTEGER NOT NULL REFERENCES repositories(id) ON DELETE CASCADE,
+                analysis_run_id INTEGER NOT NULL REFERENCES analysis_runs(id) ON DELETE CASCADE,
+                snapshot_id INTEGER REFERENCES repository_snapshots(id) ON DELETE CASCADE,
+                path TEXT NOT NULL,
+                kind TEXT NOT NULL,
+                start_line INTEGER NOT NULL,
+                end_line INTEGER NOT NULL,
+                text TEXT NOT NULL,
+                created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
+            )
+            """,
+            "CREATE INDEX IF NOT EXISTS idx_content_chunks_repository ON content_chunks(repository_id)",
+            "CREATE INDEX IF NOT EXISTS idx_content_chunks_run ON content_chunks(analysis_run_id)",
+        )
+        # Every statement uses IF NOT EXISTS, so re-running this is harmless.
+        for statement in ddl_statements:
+            connection.execute(statement)
+
def create_repository(
self,
*,
@@ -1163,6 +1190,65 @@ class RegistryStore:
).fetchall()
return [self._observed_fact_from_row(row) for row in rows]
+    def replace_content_chunks(
+        self,
+        repository_id: int,
+        analysis_run_id: int,
+        snapshot_id: int | None,
+        chunks: list[ContentChunkCandidate],
+    ) -> None:
+        """Delete the chunks stored for ``analysis_run_id`` and insert ``chunks``."""
+        insert_sql = """
+            INSERT INTO content_chunks
+                (repository_id, analysis_run_id, snapshot_id, path, kind,
+                 start_line, end_line, text)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+        """
+        with self.connect() as connection:
+            connection.execute(
+                "DELETE FROM content_chunks WHERE analysis_run_id = ?",
+                (analysis_run_id,),
+            )
+            rows = [
+                (
+                    repository_id,
+                    analysis_run_id,
+                    snapshot_id,
+                    item.path,
+                    item.kind,
+                    item.start_line,
+                    item.end_line,
+                    item.text,
+                )
+                for item in chunks
+            ]
+            connection.executemany(insert_sql, rows)
+
+    def list_content_chunks(
+        self,
+        repository_id: int,
+        analysis_run_id: int | None = None,
+    ) -> list[ContentChunk]:
+        """Return the repository's chunks ordered by path, start line and id."""
+        # Result unused; presumably raises for an unknown repository - TODO confirm.
+        self.get_repository(repository_id)
+        conditions = ["repository_id = ?"]
+        params: list[int] = [repository_id]
+        if analysis_run_id is not None:
+            conditions.append("analysis_run_id = ?")
+            params.append(analysis_run_id)
+        where = " AND ".join(conditions)
+        query = f"""
+            SELECT id, repository_id, analysis_run_id, snapshot_id, path, kind,
+                   start_line, end_line, text
+            FROM content_chunks
+            WHERE {where}
+            ORDER BY path ASC, start_line ASC, id ASC
+        """
+        with self.connect() as connection:
+            found = connection.execute(query, params).fetchall()
+        return [self._content_chunk_from_row(row) for row in found]
+
def create_ability(
self,
repository_id: int,
@@ -1986,3 +2072,17 @@ class RegistryStore:
value=row["value"],
metadata=json.loads(row["metadata"]),
)
+
+    @staticmethod
+    def _content_chunk_from_row(row: sqlite3.Row) -> ContentChunk:
+        """Build a ``ContentChunk`` from a content_chunks row.
+
+        Each field is read from the row column of the same name.
+        """
+        columns = (
+            "id", "repository_id", "analysis_run_id", "snapshot_id", "path",
+            "kind", "start_line", "end_line", "text",
+        )
+        # Column names double as the dataclass field names.
+        values = {column: row[column] for column in columns}
+        return ContentChunk(**values)
diff --git a/src/repo_registry/web_api/app.py b/src/repo_registry/web_api/app.py
index 84e1c49..95bd3f2 100644
--- a/src/repo_registry/web_api/app.py
+++ b/src/repo_registry/web_api/app.py
@@ -464,6 +464,36 @@ def list_observed_facts(
raise HTTPException(status_code=404, detail=str(exc)) from exc
+@app.get("/repos/{repository_id}/content-chunks")
+def list_content_chunks(
+    repository_id: int,
+    analysis_run_id: int | None = None,
+    service: RegistryService = Depends(get_service),
+) -> list[dict[str, object]]:
+    """Return the repository's content chunks as plain dicts."""
+    try:
+        chunks = service.list_content_chunks(repository_id, analysis_run_id)
+    except NotFoundError as exc:
+        # Surface the service's lookup failure as an HTTP 404.
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+    return [asdict(chunk) for chunk in chunks]
+
+
+@app.get("/repos/{repository_id}/analysis-runs/{analysis_run_id}/content-chunks")
+def list_analysis_run_content_chunks(
+    repository_id: int,
+    analysis_run_id: int,
+    service: RegistryService = Depends(get_service),
+) -> list[dict[str, object]]:
+    """Return the content chunks of one analysis run as plain dicts."""
+    try:
+        run_chunks = service.list_content_chunks(repository_id, analysis_run_id)
+    except NotFoundError as exc:
+        # Surface the service's lookup failure as an HTTP 404.
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+    return list(map(asdict, run_chunks))
+
+
@app.get("/repos/{repository_id}/analysis-runs/{analysis_run_id}/candidate-graph")
def get_candidate_graph(
repository_id: int,
diff --git a/src/repo_registry/web_ui/views.py b/src/repo_registry/web_ui/views.py
index 4d6dbaa..770a83c 100644
--- a/src/repo_registry/web_ui/views.py
+++ b/src/repo_registry/web_ui/views.py
@@ -630,6 +630,7 @@ def analysis_run_detail(
repository = service.get_repository(repository_id)
candidate_graph = service.candidate_graph(repository_id, analysis_run_id)
facts = service.list_observed_facts(repository_id, analysis_run_id)
+ chunks = service.list_content_chunks(repository_id, analysis_run_id)
decisions = service.list_review_decisions(repository_id, analysis_run_id)
fact_rows = "\n".join(
f"""
@@ -667,6 +668,10 @@ def analysis_run_detail(
{render_review_decisions(decisions)}
+ Content Chunks
+ {render_content_chunks(chunks)}
+
No content chunks extracted.
' + rows = "\n".join( + f""" +{escape(chunk.text[:500])}| Kind | Source | Text |
|---|