first content-indexing slice

2026-04-26 02:47:10 +02:00
parent 9cd700b215
commit 6416139176
12 changed files with 404 additions and 0 deletions
--- a/migrations/0001_initial.sql
+++ b/migrations/0001_initial.sql
@@ -43,6 +43,19 @@ CREATE TABLE IF NOT EXISTS observed_facts (
  created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
 );

+-- Text excerpts extracted from repository files during an analysis run.
+-- Every foreign key below is declared ON DELETE CASCADE.
+CREATE TABLE IF NOT EXISTS content_chunks (
+  id INTEGER PRIMARY KEY AUTOINCREMENT,
+  repository_id INTEGER NOT NULL REFERENCES repositories(id) ON DELETE CASCADE,
+  analysis_run_id INTEGER NOT NULL REFERENCES analysis_runs(id) ON DELETE CASCADE,
+  -- Nullable: a chunk may be stored without a snapshot reference.
+  snapshot_id INTEGER REFERENCES repository_snapshots(id) ON DELETE CASCADE,
+  path TEXT NOT NULL,
+  kind TEXT NOT NULL,
+  -- Line span of the excerpt within the file at `path`.
+  start_line INTEGER NOT NULL,
+  end_line INTEGER NOT NULL,
+  text TEXT NOT NULL,
+  created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+
 CREATE TABLE IF NOT EXISTS candidate_abilities (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  repository_id INTEGER NOT NULL REFERENCES repositories(id) ON DELETE CASCADE,
@@ -155,6 +168,8 @@ CREATE INDEX IF NOT EXISTS idx_snapshots_repository ON repository_snapshots(repo
 CREATE INDEX IF NOT EXISTS idx_analysis_runs_repository ON analysis_runs(repository_id);
 CREATE INDEX IF NOT EXISTS idx_observed_facts_repository ON observed_facts(repository_id);
 CREATE INDEX IF NOT EXISTS idx_observed_facts_run ON observed_facts(analysis_run_id);
+CREATE INDEX IF NOT EXISTS idx_content_chunks_repository ON content_chunks(repository_id);
+CREATE INDEX IF NOT EXISTS idx_content_chunks_run ON content_chunks(analysis_run_id);
 CREATE INDEX IF NOT EXISTS idx_candidate_abilities_repository ON candidate_abilities(repository_id);
 CREATE INDEX IF NOT EXISTS idx_candidate_capabilities_repository ON candidate_capabilities(repository_id);
 CREATE INDEX IF NOT EXISTS idx_candidate_features_repository ON candidate_features(repository_id);
--- a/src/repo_registry/content_indexing/__init__.py
+++ b/src/repo_registry/content_indexing/__init__.py
@@ -0,0 +1,3 @@
+from repo_registry.content_indexing.extractor import ContentChunkCandidate, ContentExtractor
+
+__all__ = ["ContentChunkCandidate", "ContentExtractor"]
--- a/src/repo_registry/content_indexing/extractor.py
+++ b/src/repo_registry/content_indexing/extractor.py
@@ -0,0 +1,109 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+from repo_registry.core.models import ObservedFact
+
+
+INDEXED_FACT_KINDS = {"documentation", "example", "test", "manifest", "interface"}
+MAX_CHUNK_LINES = 40
+MAX_FILE_BYTES = 200_000
+# Lines of context kept before/after a fact's anchor line.
+CONTEXT_LINES_BEFORE = 5
+CONTEXT_LINES_AFTER = 10
+
+
+@dataclass(frozen=True)
+class ContentChunkCandidate:
+    """A not-yet-persisted text excerpt taken from one file.
+
+    ``path`` is POSIX-style and relative to the extraction root;
+    ``start_line``/``end_line`` are 1-based and inclusive.
+    """
+
+    path: str
+    kind: str
+    start_line: int
+    end_line: int
+    text: str
+
+
+class ContentExtractor:
+    """Extract deterministic text chunks from source-linked observed facts."""
+
+    def extract(
+        self,
+        source_path: str | Path,
+        facts: list[ObservedFact],
+    ) -> list[ContentChunkCandidate]:
+        """Return de-duplicated chunks for every indexable fact.
+
+        A fact is indexed only when its kind is in ``INDEXED_FACT_KINDS``, it
+        has a non-empty path, and that path resolves to a regular file inside
+        ``source_path``. Chunks are unique per (path, kind, start, end) and
+        sorted by (path, start_line, kind).
+        """
+        root = Path(source_path).expanduser().resolve()
+        chunks: list[ContentChunkCandidate] = []
+        seen: set[tuple[str, str, int, int]] = set()
+        for fact in facts:
+            if fact.kind not in INDEXED_FACT_KINDS or not fact.path:
+                continue
+            # Resolve before the containment check so "../" segments and
+            # symlinks cannot point outside the root.
+            path = (root / fact.path).resolve()
+            if not self._is_within(root, path) or not path.is_file():
+                continue
+            for chunk in self._chunks_for_fact(path, root, fact):
+                key = (chunk.path, chunk.kind, chunk.start_line, chunk.end_line)
+                if key in seen:
+                    continue
+                seen.add(key)
+                chunks.append(chunk)
+        return sorted(chunks, key=lambda chunk: (chunk.path, chunk.start_line, chunk.kind))
+
+    def _chunks_for_fact(
+        self,
+        path: Path,
+        root: Path,
+        fact: ObservedFact,
+    ) -> list[ContentChunkCandidate]:
+        """Return the chunks one fact contributes for the file at ``path``.
+
+        Files larger than ``MAX_FILE_BYTES``, unreadable files, and files with
+        no lines yield an empty list. When ``fact.metadata["line"]`` is an
+        integer, a single window around that line is returned; otherwise the
+        whole file is split into consecutive ``MAX_CHUNK_LINES``-line chunks.
+        """
+        try:
+            if path.stat().st_size > MAX_FILE_BYTES:
+                return []
+            # Undecodable bytes are dropped rather than failing the whole file.
+            lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
+        except OSError:
+            return []
+        if not lines:
+            return []
+
+        line = fact.metadata.get("line")
+        # bool is a subclass of int; True/False are not line numbers.
+        if isinstance(line, int) and not isinstance(line, bool):
+            # Clamp the anchor into the file: a stale or out-of-range line
+            # number would otherwise give start_line > end_line, or a negative
+            # end_line that slices from the end of the file.
+            anchor = min(max(line, 1), len(lines))
+            start_line = max(1, anchor - CONTEXT_LINES_BEFORE)
+            end_line = min(len(lines), anchor + CONTEXT_LINES_AFTER)
+            return [
+                self._chunk(
+                    path,
+                    root,
+                    fact.kind,
+                    lines,
+                    start_line,
+                    end_line,
+                )
+            ]
+
+        chunks: list[ContentChunkCandidate] = []
+        for start_index in range(0, len(lines), MAX_CHUNK_LINES):
+            start_line = start_index + 1
+            end_line = min(len(lines), start_index + MAX_CHUNK_LINES)
+            chunks.append(
+                self._chunk(path, root, fact.kind, lines, start_line, end_line)
+            )
+        return chunks
+
+    def _chunk(
+        self,
+        path: Path,
+        root: Path,
+        kind: str,
+        lines: list[str],
+        start_line: int,
+        end_line: int,
+    ) -> ContentChunkCandidate:
+        """Build a candidate for the 1-based inclusive span of ``lines``."""
+        return ContentChunkCandidate(
+            path=path.relative_to(root).as_posix(),
+            kind=kind,
+            start_line=start_line,
+            end_line=end_line,
+            # Surrounding whitespace is trimmed from the joined excerpt.
+            text="\n".join(lines[start_line - 1 : end_line]).strip(),
+        )
+
+    def _is_within(self, root: Path, path: Path) -> bool:
+        """Return True when ``path`` equals ``root`` or lies beneath it."""
+        try:
+            path.relative_to(root)
+        except ValueError:
+            return False
+        return True
--- a/src/repo_registry/core/models.py
+++ b/src/repo_registry/core/models.py
@@ -59,6 +59,19 @@ class ObservedFact:
    metadata: dict[str, Any]


+@dataclass(frozen=True)
+class ContentChunk:
+    """A stored content chunk, mirroring the columns read from ``content_chunks``."""
+
+    id: int
+    repository_id: int
+    analysis_run_id: int
+    # None when the chunk was stored without a snapshot reference.
+    snapshot_id: int | None
+    path: str
+    kind: str
+    # Line span of ``text`` within the file at ``path``.
+    start_line: int
+    end_line: int
+    text: str
+
+
@dataclass(frozen=True)
 class ScanSummary:
    analysis_run: AnalysisRun
--- a/src/repo_registry/core/service.py
+++ b/src/repo_registry/core/service.py
@@ -7,6 +7,7 @@ from repo_registry.core.models import (
    AnalysisRun,
    CapabilitySummary,
    CandidateGraph,
+    ContentChunk,
    ObservedFact,
    Repository,
    RepositoryAbilityMap,
@@ -15,6 +16,7 @@ from repo_registry.core.models import (
    SearchResult,
 )
 from repo_registry.candidate_graph.generator import CandidateGraphGenerator
+from repo_registry.content_indexing.extractor import ContentExtractor
 from repo_registry.repo_ingestion.git import GitIngestionService
 from repo_registry.repo_ingestion.metadata import RepositoryMetadataExtractor
 from repo_registry.repo_scanning.scanner import DeterministicScanner
@@ -34,6 +36,7 @@ class RegistryService:
        self.ingestion = ingestion or GitIngestionService()
        self.metadata_extractor = RepositoryMetadataExtractor()
        self.candidate_generator = CandidateGraphGenerator()
+        self.content_extractor = ContentExtractor()

    def register_repository(
        self,
@@ -111,6 +114,13 @@ class RegistryService:
            else None
        )
        facts = self.store.list_observed_facts(repository_id, completed_run.id)
+        chunks = self.content_extractor.extract(scan_result.source_path, facts)
+        self.store.replace_content_chunks(
+            repository_id,
+            completed_run.id,
+            completed_run.snapshot_id,
+            chunks,
+        )
        candidates = self.candidate_generator.generate(repository, facts)
        self.store.replace_candidate_graph(repository_id, completed_run.id, candidates)
        return ScanSummary(
@@ -145,6 +155,13 @@ class RegistryService:
    ) -> list[ObservedFact]:
        return self.store.list_observed_facts(repository_id, analysis_run_id)

+    def list_content_chunks(
+        self,
+        repository_id: int,
+        analysis_run_id: int | None = None,
+    ) -> list[ContentChunk]:
+        """Return stored chunks for a repository, optionally limited to one run."""
+        store = self.store
+        return store.list_content_chunks(repository_id, analysis_run_id)
+
    def candidate_graph(self, repository_id: int, analysis_run_id: int) -> CandidateGraph:
        return self.store.get_candidate_graph(repository_id, analysis_run_id)

--- a/src/repo_registry/storage/sqlite.py
+++ b/src/repo_registry/storage/sqlite.py
@@ -15,6 +15,7 @@ from repo_registry.core.models import (
    CandidateGraph,
    Capability,
    CapabilitySummary,
+    ContentChunk,
    Evidence,
    Feature,
    ObservedFact,
@@ -25,6 +26,7 @@ from repo_registry.core.models import (
    SearchResult,
    SourceReference,
 )
+from repo_registry.content_indexing.extractor import ContentChunkCandidate
 from repo_registry.candidate_graph.generator import CandidateAbilityDraft
 from repo_registry.repo_scanning.scanner import FactCandidate, ScanResult

@@ -41,6 +43,7 @@ class RegistryStore:
        migration_path = Path(__file__).parents[3] / "migrations" / "0001_initial.sql"
        with self.connect() as connection:
            connection.executescript(migration_path.read_text(encoding="utf-8"))
+            self._ensure_content_chunks_table(connection)
            self._ensure_approved_source_ref_columns(connection)

    def connect(self) -> sqlite3.Connection:
@@ -63,6 +66,30 @@ class RegistryStore:
                    f"ALTER TABLE {table} ADD COLUMN source_refs TEXT NOT NULL DEFAULT '[]'"
                )

+    def _ensure_content_chunks_table(self, connection: sqlite3.Connection) -> None:
+        """Create the content_chunks table and its two indexes when absent."""
+        create_table = """
+            CREATE TABLE IF NOT EXISTS content_chunks (
+              id INTEGER PRIMARY KEY AUTOINCREMENT,
+              repository_id INTEGER NOT NULL REFERENCES repositories(id) ON DELETE CASCADE,
+              analysis_run_id INTEGER NOT NULL REFERENCES analysis_runs(id) ON DELETE CASCADE,
+              snapshot_id INTEGER REFERENCES repository_snapshots(id) ON DELETE CASCADE,
+              path TEXT NOT NULL,
+              kind TEXT NOT NULL,
+              start_line INTEGER NOT NULL,
+              end_line INTEGER NOT NULL,
+              text TEXT NOT NULL,
+              created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
+            )
+            """
+        create_indexes = (
+            "CREATE INDEX IF NOT EXISTS idx_content_chunks_repository ON content_chunks(repository_id)",
+            "CREATE INDEX IF NOT EXISTS idx_content_chunks_run ON content_chunks(analysis_run_id)",
+        )
+        # Table first, then indexes; each statement is a no-op if it already ran.
+        for statement in (create_table, *create_indexes):
+            connection.execute(statement)
+
    def create_repository(
        self,
        *,
@@ -1163,6 +1190,65 @@ class RegistryStore:
            ).fetchall()
        return [self._observed_fact_from_row(row) for row in rows]

+    def replace_content_chunks(
+        self,
+        repository_id: int,
+        analysis_run_id: int,
+        snapshot_id: int | None,
+        chunks: list[ContentChunkCandidate],
+    ) -> None:
+        """Swap the stored chunks of one analysis run for ``chunks``.
+
+        Existing rows for ``analysis_run_id`` are deleted and the new ones
+        inserted on the same connection.
+        """
+        owner = (repository_id, analysis_run_id, snapshot_id)
+        with self.connect() as connection:
+            connection.execute(
+                "DELETE FROM content_chunks WHERE analysis_run_id = ?",
+                (analysis_run_id,),
+            )
+            rows = [
+                (*owner, item.path, item.kind, item.start_line, item.end_line, item.text)
+                for item in chunks
+            ]
+            connection.executemany(
+                """
+                INSERT INTO content_chunks
+                  (repository_id, analysis_run_id, snapshot_id, path, kind,
+                   start_line, end_line, text)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                """,
+                rows,
+            )
+
+    def list_content_chunks(
+        self,
+        repository_id: int,
+        analysis_run_id: int | None = None,
+    ) -> list[ContentChunk]:
+        """Return a repository's chunks ordered by path, start line, then id.
+
+        ``get_repository`` is called first, so whatever it raises for the
+        given id propagates. When ``analysis_run_id`` is given, only that
+        run's chunks are returned.
+        """
+        self.get_repository(repository_id)
+        conditions = ["repository_id = ?"]
+        values = [repository_id]
+        if analysis_run_id is not None:
+            conditions.append("analysis_run_id = ?")
+            values.append(analysis_run_id)
+        # Only fixed column predicates are interpolated; values stay bound.
+        where = "WHERE " + " AND ".join(conditions)
+        params: tuple[int, ...] = tuple(values)
+        with self.connect() as connection:
+            cursor = connection.execute(
+                f"""
+                SELECT id, repository_id, analysis_run_id, snapshot_id, path, kind,
+                       start_line, end_line, text
+                FROM content_chunks
+                {where}
+                ORDER BY path ASC, start_line ASC, id ASC
+                """,
+                params,
+            )
+            rows = cursor.fetchall()
+        return list(map(self._content_chunk_from_row, rows))
+
    def create_ability(
        self,
        repository_id: int,
@@ -1986,3 +2072,17 @@ class RegistryStore:
            value=row["value"],
            metadata=json.loads(row["metadata"]),
        )
+
+    @staticmethod
+    def _content_chunk_from_row(row: sqlite3.Row) -> ContentChunk:
+        """Build a ContentChunk from a row that exposes the chunk columns by name."""
+        columns = (
+            "id",
+            "repository_id",
+            "analysis_run_id",
+            "snapshot_id",
+            "path",
+            "kind",
+            "start_line",
+            "end_line",
+            "text",
+        )
+        return ContentChunk(**{column: row[column] for column in columns})
--- a/src/repo_registry/web_api/app.py
+++ b/src/repo_registry/web_api/app.py
@@ -464,6 +464,36 @@ def list_observed_facts(
        raise HTTPException(status_code=404, detail=str(exc)) from exc


+@app.get("/repos/{repository_id}/content-chunks")
+def list_content_chunks(
+    repository_id: int,
+    analysis_run_id: int | None = None,
+    service: RegistryService = Depends(get_service),
+) -> list[dict[str, object]]:
+    # Serialize a repository's content chunks, optionally narrowed to one run
+    # through ``analysis_run_id``; a NotFoundError is reported as HTTP 404.
+    try:
+        chunks = service.list_content_chunks(repository_id, analysis_run_id)
+    except NotFoundError as exc:
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+    return [asdict(chunk) for chunk in chunks]
+
+
+@app.get("/repos/{repository_id}/analysis-runs/{analysis_run_id}/content-chunks")
+def list_analysis_run_content_chunks(
+    repository_id: int,
+    analysis_run_id: int,
+    service: RegistryService = Depends(get_service),
+) -> list[dict[str, object]]:
+    # Serialize the content chunks of one analysis run, addressed by path
+    # parameters; a NotFoundError is reported as HTTP 404.
+    try:
+        chunks = service.list_content_chunks(repository_id, analysis_run_id)
+    except NotFoundError as exc:
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+    return [asdict(chunk) for chunk in chunks]
+
+
@app.get("/repos/{repository_id}/analysis-runs/{analysis_run_id}/candidate-graph")
 def get_candidate_graph(
    repository_id: int,
--- a/src/repo_registry/web_ui/views.py
+++ b/src/repo_registry/web_ui/views.py
@@ -630,6 +630,7 @@ def analysis_run_detail(
    repository = service.get_repository(repository_id)
    candidate_graph = service.candidate_graph(repository_id, analysis_run_id)
    facts = service.list_observed_facts(repository_id, analysis_run_id)
+    chunks = service.list_content_chunks(repository_id, analysis_run_id)
    decisions = service.list_review_decisions(repository_id, analysis_run_id)
    fact_rows = "\n".join(
        f"""
@@ -667,6 +668,10 @@ def analysis_run_detail(
        {render_review_decisions(decisions)}
      </section>
    </div>
+    <section class="panel" style="margin-top:18px">
+      <h2>Content Chunks</h2>
+      {render_content_chunks(chunks)}
+    </section>
    """
    return page(f"{repository.name} Run {analysis_run_id}", body)

@@ -1065,6 +1070,27 @@ def render_review_decisions(decisions: list) -> str:
    """


+def render_content_chunks(chunks: list) -> str:
+    """Render chunks as an HTML table, or a placeholder paragraph when empty.
+
+    Kind, path and text are HTML-escaped; text is cut to its first 500
+    characters before escaping.
+    """
+    if not chunks:
+        return '<p class="muted">No content chunks extracted.</p>'
+    rendered = []
+    for item in chunks:
+        kind_html = escape(item.kind)
+        source_html = f"{escape(item.path)}:{item.start_line}-{item.end_line}"
+        preview_html = escape(item.text[:500])
+        rendered.append(
+            f"""
+        <tr>
+          <td><span class="pill">{kind_html}</span></td>
+          <td class="source">{source_html}</td>
+          <td><pre>{preview_html}</pre></td>
+        </tr>
+        """
+        )
+    rows = "\n".join(rendered)
+    return f"""
+    <table>
+      <thead><tr><th>Kind</th><th>Source</th><th>Text</th></tr></thead>
+      <tbody>{rows}</tbody>
+    </table>
+    """
+
+
 def render_candidate_ability_actions(
    ability: dict,
    repository_id: int,
--- a/tests/test_content_indexing.py
+++ b/tests/test_content_indexing.py
@@ -0,0 +1,67 @@
+from repo_registry.content_indexing.extractor import ContentExtractor
+from repo_registry.core.models import ObservedFact
+
+
+def fact(id, kind, name, path="", line=None):
+    """Build an ObservedFact fixture; ``line`` goes into metadata only when given."""
+    metadata = {} if line is None else {"line": line}
+    fields = dict(
+        id=id,
+        repository_id=1,
+        analysis_run_id=1,
+        snapshot_id=1,
+        kind=kind,
+        path=path,
+        name=name,
+        value="",
+        metadata=metadata,
+    )
+    return ObservedFact(**fields)
+
+
+def test_content_extractor_chunks_docs_and_interface_line_ranges(tmp_path):
+    """A 45-line doc splits at 40 lines; a line-anchored fact yields one window."""
+    repo = tmp_path / "repo"
+    repo.mkdir()
+    readme_lines = [f"readme line {number}" for number in range(1, 46)]
+    (repo / "README.md").write_text("\n".join(readme_lines), encoding="utf-8")
+    app_lines = [f"line {number}" for number in range(1, 21)]
+    (repo / "app.py").write_text("\n".join(app_lines), encoding="utf-8")
+    facts = [
+        fact(1, "documentation", "README", "README.md"),
+        fact(2, "interface", "python route decorator", "app.py", line=10),
+    ]
+
+    chunks = ContentExtractor().extract(repo, facts)
+
+    readme_spans = [
+        (chunk.start_line, chunk.end_line)
+        for chunk in chunks
+        if chunk.path == "README.md"
+    ]
+    interface_chunks = [chunk for chunk in chunks if chunk.path == "app.py"]
+    assert readme_spans == [
+        (1, 40),
+        (41, 45),
+    ]
+    assert len(interface_chunks) == 1
+    window = interface_chunks[0]
+    assert window.start_line == 5
+    assert window.end_line == 20
+    assert "line 10" in window.text
+
+
+def test_content_extractor_ignores_unindexed_and_missing_paths(tmp_path):
+    """Facts of an unindexed kind or pointing at a missing file yield no chunks."""
+    repo = tmp_path / "repo"
+    repo.mkdir()
+    (repo / "README.md").write_text("# ok\n", encoding="utf-8")
+    skipped_facts = [
+        fact(1, "language", "Python"),
+        fact(2, "documentation", "missing", "missing.md"),
+    ]
+
+    chunks = ContentExtractor().extract(repo, skipped_facts)
+
+    assert chunks == []
--- a/tests/test_registry_service.py
+++ b/tests/test_registry_service.py
@@ -343,6 +343,11 @@ def test_analyze_repository_records_snapshot_and_observed_facts(tmp_path):
    assert ("documentation", "README", "README.md") in fact_names
    assert ("framework", "FastAPI", "requirements.txt") in fact_names
    assert ("interface", "python route decorator", "app.py") in fact_names
+    chunks = service.list_content_chunks(repository.id, summary.analysis_run.id)
+    chunk_sources = {(chunk.kind, chunk.path) for chunk in chunks}
+    assert ("documentation", "README.md") in chunk_sources
+    assert ("manifest", "requirements.txt") in chunk_sources
+    assert ("interface", "app.py") in chunk_sources

    candidate_graph = service.candidate_graph(repository.id, summary.analysis_run.id)
    assert candidate_graph.repository.name == "Example"
--- a/tests/test_storage_migrations.py
+++ b/tests/test_storage_migrations.py
@@ -25,9 +25,16 @@ def test_initialize_is_idempotent_and_applies_expected_columns(tmp_path):
        evidence_columns = {
            row[1] for row in connection.execute("PRAGMA table_info(approved_evidence)")
        }
+        tables = {
+            row[0]
+            for row in connection.execute(
+                "SELECT name FROM sqlite_master WHERE type = 'table'"
+            )
+        }

    assert "source_refs" in feature_columns
    assert "source_refs" in evidence_columns
+    assert "content_chunks" in tables


 def test_delete_repository_cascades_registry_and_review_rows(tmp_path):
@@ -72,6 +79,7 @@ def test_delete_repository_cascades_registry_and_review_rows(tmp_path):
            "approved_features",
            "approved_evidence",
            "analysis_runs",
+            "content_chunks",
            "review_decisions",
        ):
            count = connection.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
--- a/tests/test_web_api.py
+++ b/tests/test_web_api.py
@@ -320,6 +320,15 @@ def test_api_analysis_run_loop(tmp_path):
        assert ("documentation", "README", "README.md") in fact_names
        assert ("framework", "React", "package.json") in fact_names
        assert ("framework", "Vite", "package.json") in fact_names
+
+        chunks_response = client.get(
+            f"/repos/{repository_id}/analysis-runs/"
+            f"{run['analysis_run']['id']}/content-chunks"
+        )
+        assert chunks_response.status_code == 200
+        assert {
+            (chunk["kind"], chunk["path"]) for chunk in chunks_response.json()
+        } >= {("documentation", "README.md"), ("manifest", "package.json")}
    finally:
        app.dependency_overrides.clear()

@@ -392,6 +401,8 @@ def test_ui_register_analyze_and_approve_loop(tmp_path):
        run_detail = client.get(run_path)
        assert run_detail.status_code == 200
        assert "Candidate Graph" in run_detail.text
+        assert "Content Chunks" in run_detail.text
+        assert "README.md:1-1" in run_detail.text
        assert "ID " in run_detail.text
        assert "No review decisions yet." in run_detail.text