diff --git a/migrations/0001_initial.sql b/migrations/0001_initial.sql index a1553be..56df20e 100644 --- a/migrations/0001_initial.sql +++ b/migrations/0001_initial.sql @@ -43,6 +43,19 @@ CREATE TABLE IF NOT EXISTS observed_facts ( created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP ); +CREATE TABLE IF NOT EXISTS content_chunks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + repository_id INTEGER NOT NULL REFERENCES repositories(id) ON DELETE CASCADE, + analysis_run_id INTEGER NOT NULL REFERENCES analysis_runs(id) ON DELETE CASCADE, + snapshot_id INTEGER REFERENCES repository_snapshots(id) ON DELETE CASCADE, + path TEXT NOT NULL, + kind TEXT NOT NULL, + start_line INTEGER NOT NULL, + end_line INTEGER NOT NULL, + text TEXT NOT NULL, + created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP +); + CREATE TABLE IF NOT EXISTS candidate_abilities ( id INTEGER PRIMARY KEY AUTOINCREMENT, repository_id INTEGER NOT NULL REFERENCES repositories(id) ON DELETE CASCADE, @@ -155,6 +168,8 @@ CREATE INDEX IF NOT EXISTS idx_snapshots_repository ON repository_snapshots(repo CREATE INDEX IF NOT EXISTS idx_analysis_runs_repository ON analysis_runs(repository_id); CREATE INDEX IF NOT EXISTS idx_observed_facts_repository ON observed_facts(repository_id); CREATE INDEX IF NOT EXISTS idx_observed_facts_run ON observed_facts(analysis_run_id); +CREATE INDEX IF NOT EXISTS idx_content_chunks_repository ON content_chunks(repository_id); +CREATE INDEX IF NOT EXISTS idx_content_chunks_run ON content_chunks(analysis_run_id); CREATE INDEX IF NOT EXISTS idx_candidate_abilities_repository ON candidate_abilities(repository_id); CREATE INDEX IF NOT EXISTS idx_candidate_capabilities_repository ON candidate_capabilities(repository_id); CREATE INDEX IF NOT EXISTS idx_candidate_features_repository ON candidate_features(repository_id); diff --git a/src/repo_registry/content_indexing/__init__.py b/src/repo_registry/content_indexing/__init__.py new file mode 100644 index 0000000..2578378 --- /dev/null +++ 
b/src/repo_registry/content_indexing/__init__.py @@ -0,0 +1,3 @@ +from repo_registry.content_indexing.extractor import ContentChunkCandidate, ContentExtractor + +__all__ = ["ContentChunkCandidate", "ContentExtractor"] diff --git a/src/repo_registry/content_indexing/extractor.py b/src/repo_registry/content_indexing/extractor.py new file mode 100644 index 0000000..eec4fdf --- /dev/null +++ b/src/repo_registry/content_indexing/extractor.py @@ -0,0 +1,109 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path + +from repo_registry.core.models import ObservedFact + + +INDEXED_FACT_KINDS = {"documentation", "example", "test", "manifest", "interface"} +MAX_CHUNK_LINES = 40 +MAX_FILE_BYTES = 200_000 + + +@dataclass(frozen=True) +class ContentChunkCandidate: + path: str + kind: str + start_line: int + end_line: int + text: str + + +class ContentExtractor: + """Extract deterministic text chunks from source-linked observed facts.""" + + def extract( + self, + source_path: str | Path, + facts: list[ObservedFact], + ) -> list[ContentChunkCandidate]: + root = Path(source_path).expanduser().resolve() + chunks: list[ContentChunkCandidate] = [] + seen: set[tuple[str, str, int, int]] = set() + for fact in facts: + if fact.kind not in INDEXED_FACT_KINDS or not fact.path: + continue + path = (root / fact.path).resolve() + if not self._is_within(root, path) or not path.is_file(): + continue + for chunk in self._chunks_for_fact(path, root, fact): + key = (chunk.path, chunk.kind, chunk.start_line, chunk.end_line) + if key in seen: + continue + seen.add(key) + chunks.append(chunk) + return sorted(chunks, key=lambda chunk: (chunk.path, chunk.start_line, chunk.kind)) + + def _chunks_for_fact( + self, + path: Path, + root: Path, + fact: ObservedFact, + ) -> list[ContentChunkCandidate]: + try: + if path.stat().st_size > MAX_FILE_BYTES: + return [] + lines = path.read_text(encoding="utf-8", errors="ignore").splitlines() + except OSError: + return 
[] + if not lines: + return [] + + line = fact.metadata.get("line") + if isinstance(line, int): + start_line = max(1, line - 5) + end_line = min(len(lines), line + 10) + return [ + self._chunk( + path, + root, + fact.kind, + lines, + start_line, + end_line, + ) + ] + + chunks: list[ContentChunkCandidate] = [] + for start_index in range(0, len(lines), MAX_CHUNK_LINES): + start_line = start_index + 1 + end_line = min(len(lines), start_index + MAX_CHUNK_LINES) + chunks.append( + self._chunk(path, root, fact.kind, lines, start_line, end_line) + ) + return chunks + + def _chunk( + self, + path: Path, + root: Path, + kind: str, + lines: list[str], + start_line: int, + end_line: int, + ) -> ContentChunkCandidate: + return ContentChunkCandidate( + path=path.relative_to(root).as_posix(), + kind=kind, + start_line=start_line, + end_line=end_line, + text="\n".join(lines[start_line - 1 : end_line]).strip(), + ) + + def _is_within(self, root: Path, path: Path) -> bool: + try: + path.relative_to(root) + except ValueError: + return False + return True diff --git a/src/repo_registry/core/models.py b/src/repo_registry/core/models.py index 99c6cd7..7143b5f 100644 --- a/src/repo_registry/core/models.py +++ b/src/repo_registry/core/models.py @@ -59,6 +59,19 @@ class ObservedFact: metadata: dict[str, Any] +@dataclass(frozen=True) +class ContentChunk: + id: int + repository_id: int + analysis_run_id: int + snapshot_id: int | None + path: str + kind: str + start_line: int + end_line: int + text: str + + @dataclass(frozen=True) class ScanSummary: analysis_run: AnalysisRun diff --git a/src/repo_registry/core/service.py b/src/repo_registry/core/service.py index 10ffd8e..79bfe0f 100644 --- a/src/repo_registry/core/service.py +++ b/src/repo_registry/core/service.py @@ -7,6 +7,7 @@ from repo_registry.core.models import ( AnalysisRun, CapabilitySummary, CandidateGraph, + ContentChunk, ObservedFact, Repository, RepositoryAbilityMap, @@ -15,6 +16,7 @@ from repo_registry.core.models import ( 
SearchResult, ) from repo_registry.candidate_graph.generator import CandidateGraphGenerator +from repo_registry.content_indexing.extractor import ContentExtractor from repo_registry.repo_ingestion.git import GitIngestionService from repo_registry.repo_ingestion.metadata import RepositoryMetadataExtractor from repo_registry.repo_scanning.scanner import DeterministicScanner @@ -34,6 +36,7 @@ class RegistryService: self.ingestion = ingestion or GitIngestionService() self.metadata_extractor = RepositoryMetadataExtractor() self.candidate_generator = CandidateGraphGenerator() + self.content_extractor = ContentExtractor() def register_repository( self, @@ -111,6 +114,13 @@ class RegistryService: else None ) facts = self.store.list_observed_facts(repository_id, completed_run.id) + chunks = self.content_extractor.extract(scan_result.source_path, facts) + self.store.replace_content_chunks( + repository_id, + completed_run.id, + completed_run.snapshot_id, + chunks, + ) candidates = self.candidate_generator.generate(repository, facts) self.store.replace_candidate_graph(repository_id, completed_run.id, candidates) return ScanSummary( @@ -145,6 +155,13 @@ class RegistryService: ) -> list[ObservedFact]: return self.store.list_observed_facts(repository_id, analysis_run_id) + def list_content_chunks( + self, + repository_id: int, + analysis_run_id: int | None = None, + ) -> list[ContentChunk]: + return self.store.list_content_chunks(repository_id, analysis_run_id) + def candidate_graph(self, repository_id: int, analysis_run_id: int) -> CandidateGraph: return self.store.get_candidate_graph(repository_id, analysis_run_id) diff --git a/src/repo_registry/storage/sqlite.py b/src/repo_registry/storage/sqlite.py index cbfd694..dbe041a 100644 --- a/src/repo_registry/storage/sqlite.py +++ b/src/repo_registry/storage/sqlite.py @@ -15,6 +15,7 @@ from repo_registry.core.models import ( CandidateGraph, Capability, CapabilitySummary, + ContentChunk, Evidence, Feature, ObservedFact, @@ -25,6 
+26,7 @@ from repo_registry.core.models import ( SearchResult, SourceReference, ) +from repo_registry.content_indexing.extractor import ContentChunkCandidate from repo_registry.candidate_graph.generator import CandidateAbilityDraft from repo_registry.repo_scanning.scanner import FactCandidate, ScanResult @@ -41,6 +43,7 @@ class RegistryStore: migration_path = Path(__file__).parents[3] / "migrations" / "0001_initial.sql" with self.connect() as connection: connection.executescript(migration_path.read_text(encoding="utf-8")) + self._ensure_content_chunks_table(connection) self._ensure_approved_source_ref_columns(connection) def connect(self) -> sqlite3.Connection: @@ -63,6 +66,30 @@ class RegistryStore: f"ALTER TABLE {table} ADD COLUMN source_refs TEXT NOT NULL DEFAULT '[]'" ) + def _ensure_content_chunks_table(self, connection: sqlite3.Connection) -> None: + connection.execute( + """ + CREATE TABLE IF NOT EXISTS content_chunks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + repository_id INTEGER NOT NULL REFERENCES repositories(id) ON DELETE CASCADE, + analysis_run_id INTEGER NOT NULL REFERENCES analysis_runs(id) ON DELETE CASCADE, + snapshot_id INTEGER REFERENCES repository_snapshots(id) ON DELETE CASCADE, + path TEXT NOT NULL, + kind TEXT NOT NULL, + start_line INTEGER NOT NULL, + end_line INTEGER NOT NULL, + text TEXT NOT NULL, + created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP + ) + """ + ) + connection.execute( + "CREATE INDEX IF NOT EXISTS idx_content_chunks_repository ON content_chunks(repository_id)" + ) + connection.execute( + "CREATE INDEX IF NOT EXISTS idx_content_chunks_run ON content_chunks(analysis_run_id)" + ) + def create_repository( self, *, @@ -1163,6 +1190,65 @@ class RegistryStore: ).fetchall() return [self._observed_fact_from_row(row) for row in rows] + def replace_content_chunks( + self, + repository_id: int, + analysis_run_id: int, + snapshot_id: int | None, + chunks: list[ContentChunkCandidate], + ) -> None: + with self.connect() as connection: 
+ connection.execute( + "DELETE FROM content_chunks WHERE analysis_run_id = ?", + (analysis_run_id,), + ) + connection.executemany( + """ + INSERT INTO content_chunks + (repository_id, analysis_run_id, snapshot_id, path, kind, + start_line, end_line, text) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, + [ + ( + repository_id, + analysis_run_id, + snapshot_id, + chunk.path, + chunk.kind, + chunk.start_line, + chunk.end_line, + chunk.text, + ) + for chunk in chunks + ], + ) + + def list_content_chunks( + self, + repository_id: int, + analysis_run_id: int | None = None, + ) -> list[ContentChunk]: + self.get_repository(repository_id) + params: tuple[int, ...] + where = "WHERE repository_id = ?" + params = (repository_id,) + if analysis_run_id is not None: + where += " AND analysis_run_id = ?" + params = (repository_id, analysis_run_id) + with self.connect() as connection: + rows = connection.execute( + f""" + SELECT id, repository_id, analysis_run_id, snapshot_id, path, kind, + start_line, end_line, text + FROM content_chunks + {where} + ORDER BY path ASC, start_line ASC, id ASC + """, + params, + ).fetchall() + return [self._content_chunk_from_row(row) for row in rows] + def create_ability( self, repository_id: int, @@ -1986,3 +2072,17 @@ class RegistryStore: value=row["value"], metadata=json.loads(row["metadata"]), ) + + @staticmethod + def _content_chunk_from_row(row: sqlite3.Row) -> ContentChunk: + return ContentChunk( + id=row["id"], + repository_id=row["repository_id"], + analysis_run_id=row["analysis_run_id"], + snapshot_id=row["snapshot_id"], + path=row["path"], + kind=row["kind"], + start_line=row["start_line"], + end_line=row["end_line"], + text=row["text"], + ) diff --git a/src/repo_registry/web_api/app.py b/src/repo_registry/web_api/app.py index 84e1c49..95bd3f2 100644 --- a/src/repo_registry/web_api/app.py +++ b/src/repo_registry/web_api/app.py @@ -464,6 +464,36 @@ def list_observed_facts( raise HTTPException(status_code=404, detail=str(exc)) from exc 
def _content_chunk_payload(
    service: RegistryService,
    repository_id: int,
    analysis_run_id: int | None,
) -> list[dict[str, object]]:
    """Fetch content chunks and convert each one to a plain dict.

    A ``NotFoundError`` raised by the service is translated into an HTTP 404
    whose detail is the error's message.
    """
    try:
        found = service.list_content_chunks(repository_id, analysis_run_id)
        return [asdict(chunk) for chunk in found]
    except NotFoundError as exc:
        raise HTTPException(status_code=404, detail=str(exc)) from exc


# Repository-wide listing; ``analysis_run_id`` is an optional query parameter.
# (Plain comments are used on the routes so the generated API description is
# left untouched.)
@app.get("/repos/{repository_id}/content-chunks")
def list_content_chunks(
    repository_id: int,
    analysis_run_id: int | None = None,
    service: RegistryService = Depends(get_service),
) -> list[dict[str, object]]:
    return _content_chunk_payload(service, repository_id, analysis_run_id)


# Same listing, with the analysis run taken from the URL path.
@app.get("/repos/{repository_id}/analysis-runs/{analysis_run_id}/content-chunks")
def list_analysis_run_content_chunks(
    repository_id: int,
    analysis_run_id: int,
    service: RegistryService = Depends(get_service),
) -> list[dict[str, object]]:
    return _content_chunk_payload(service, repository_id, analysis_run_id)
+

Content Chunks

+ {render_content_chunks(chunks)} +
""" return page(f"{repository.name} Run {analysis_run_id}", body) @@ -1065,6 +1070,27 @@ def render_review_decisions(decisions: list) -> str: """ +def render_content_chunks(chunks: list) -> str: + if not chunks: + return '

No content chunks extracted.

' + rows = "\n".join( + f""" + + {escape(chunk.kind)} + {escape(chunk.path)}:{chunk.start_line}-{chunk.end_line} +
{escape(chunk.text[:500])}
+ + """ + for chunk in chunks + ) + return f""" + + + {rows} +
KindSourceText
+ """ + + def render_candidate_ability_actions( ability: dict, repository_id: int, diff --git a/tests/test_content_indexing.py b/tests/test_content_indexing.py new file mode 100644 index 0000000..71727b0 --- /dev/null +++ b/tests/test_content_indexing.py @@ -0,0 +1,67 @@ +from repo_registry.content_indexing.extractor import ContentExtractor +from repo_registry.core.models import ObservedFact + + +def fact(id, kind, name, path="", line=None): + metadata = {} + if line is not None: + metadata["line"] = line + return ObservedFact( + id=id, + repository_id=1, + analysis_run_id=1, + snapshot_id=1, + kind=kind, + path=path, + name=name, + value="", + metadata=metadata, + ) + + +def test_content_extractor_chunks_docs_and_interface_line_ranges(tmp_path): + repo = tmp_path / "repo" + repo.mkdir() + (repo / "README.md").write_text( + "\n".join(f"readme line {number}" for number in range(1, 46)), + encoding="utf-8", + ) + (repo / "app.py").write_text( + "\n".join(f"line {number}" for number in range(1, 21)), + encoding="utf-8", + ) + + chunks = ContentExtractor().extract( + repo, + [ + fact(1, "documentation", "README", "README.md"), + fact(2, "interface", "python route decorator", "app.py", line=10), + ], + ) + + readme_chunks = [chunk for chunk in chunks if chunk.path == "README.md"] + interface_chunks = [chunk for chunk in chunks if chunk.path == "app.py"] + assert [(chunk.start_line, chunk.end_line) for chunk in readme_chunks] == [ + (1, 40), + (41, 45), + ] + assert len(interface_chunks) == 1 + assert interface_chunks[0].start_line == 5 + assert interface_chunks[0].end_line == 20 + assert "line 10" in interface_chunks[0].text + + +def test_content_extractor_ignores_unindexed_and_missing_paths(tmp_path): + repo = tmp_path / "repo" + repo.mkdir() + (repo / "README.md").write_text("# ok\n", encoding="utf-8") + + chunks = ContentExtractor().extract( + repo, + [ + fact(1, "language", "Python"), + fact(2, "documentation", "missing", "missing.md"), + ], + ) + + assert chunks == 
[] diff --git a/tests/test_registry_service.py b/tests/test_registry_service.py index 90963e9..f25f4a1 100644 --- a/tests/test_registry_service.py +++ b/tests/test_registry_service.py @@ -343,6 +343,11 @@ def test_analyze_repository_records_snapshot_and_observed_facts(tmp_path): assert ("documentation", "README", "README.md") in fact_names assert ("framework", "FastAPI", "requirements.txt") in fact_names assert ("interface", "python route decorator", "app.py") in fact_names + chunks = service.list_content_chunks(repository.id, summary.analysis_run.id) + chunk_sources = {(chunk.kind, chunk.path) for chunk in chunks} + assert ("documentation", "README.md") in chunk_sources + assert ("manifest", "requirements.txt") in chunk_sources + assert ("interface", "app.py") in chunk_sources candidate_graph = service.candidate_graph(repository.id, summary.analysis_run.id) assert candidate_graph.repository.name == "Example" diff --git a/tests/test_storage_migrations.py b/tests/test_storage_migrations.py index 31a6695..b800379 100644 --- a/tests/test_storage_migrations.py +++ b/tests/test_storage_migrations.py @@ -25,9 +25,16 @@ def test_initialize_is_idempotent_and_applies_expected_columns(tmp_path): evidence_columns = { row[1] for row in connection.execute("PRAGMA table_info(approved_evidence)") } + tables = { + row[0] + for row in connection.execute( + "SELECT name FROM sqlite_master WHERE type = 'table'" + ) + } assert "source_refs" in feature_columns assert "source_refs" in evidence_columns + assert "content_chunks" in tables def test_delete_repository_cascades_registry_and_review_rows(tmp_path): @@ -72,6 +79,7 @@ def test_delete_repository_cascades_registry_and_review_rows(tmp_path): "approved_features", "approved_evidence", "analysis_runs", + "content_chunks", "review_decisions", ): count = connection.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0] diff --git a/tests/test_web_api.py b/tests/test_web_api.py index 8a67465..a9c092b 100644 --- a/tests/test_web_api.py +++ 
b/tests/test_web_api.py @@ -320,6 +320,15 @@ def test_api_analysis_run_loop(tmp_path): assert ("documentation", "README", "README.md") in fact_names assert ("framework", "React", "package.json") in fact_names assert ("framework", "Vite", "package.json") in fact_names + + chunks_response = client.get( + f"/repos/{repository_id}/analysis-runs/" + f"{run['analysis_run']['id']}/content-chunks" + ) + assert chunks_response.status_code == 200 + assert { + (chunk["kind"], chunk["path"]) for chunk in chunks_response.json() + } >= {("documentation", "README.md"), ("manifest", "package.json")} finally: app.dependency_overrides.clear() @@ -392,6 +401,8 @@ def test_ui_register_analyze_and_approve_loop(tmp_path): run_detail = client.get(run_path) assert run_detail.status_code == 200 assert "Candidate Graph" in run_detail.text + assert "Content Chunks" in run_detail.text + assert "README.md:1-1" in run_detail.text assert "ID " in run_detail.text assert "No review decisions yet." in run_detail.text