"""Tests for ContentExtractor chunking of documentation and interface facts."""

from repo_registry.content_indexing.extractor import ContentExtractor
from repo_registry.core.models import ObservedFact


def fact(id, kind, name, path="", line=None):
    """Build an ObservedFact with fixed repository/run/snapshot ids for tests.

    Args:
        id: Identifier assigned to the fact.
        kind: Fact kind, e.g. "documentation", "interface" or "language".
        name: Human-readable fact name.
        path: Path the fact refers to; defaults to the empty string.
        line: Optional line number. When given it is stored under the
            "line" key of the fact's metadata; when None the metadata
            dict is left empty.

    Returns:
        An ObservedFact whose repository_id, analysis_run_id and snapshot_id
        are all 1 and whose value is the empty string.
    """
    metadata = {}
    if line is not None:
        metadata["line"] = line
    return ObservedFact(
        id=id,
        repository_id=1,
        analysis_run_id=1,
        snapshot_id=1,
        kind=kind,
        path=path,
        name=name,
        value="",
        metadata=metadata,
    )


def test_content_extractor_chunks_docs_and_interface_line_ranges(tmp_path):
    """Check the line ranges of chunks produced for a doc and an interface fact.

    A 45-line README referenced by a documentation fact is expected to yield
    two chunks covering lines 1-40 and 41-45. A 20-line source file referenced
    by an interface fact at line 10 is expected to yield exactly one chunk
    covering lines 5-20 whose text contains the referenced line.
    """
    repo = tmp_path / "repo"
    repo.mkdir()
    (repo / "README.md").write_text(
        "\n".join(f"readme line {number}" for number in range(1, 46)),
        encoding="utf-8",
    )
    (repo / "app.py").write_text(
        "\n".join(f"line {number}" for number in range(1, 21)),
        encoding="utf-8",
    )

    chunks = ContentExtractor().extract(
        repo,
        [
            fact(1, "documentation", "README", "README.md"),
            fact(2, "interface", "python route decorator", "app.py", line=10),
        ],
    )

    # Split the result by source file so each fact kind is checked on its own.
    readme_chunks = [chunk for chunk in chunks if chunk.path == "README.md"]
    interface_chunks = [chunk for chunk in chunks if chunk.path == "app.py"]

    assert [(chunk.start_line, chunk.end_line) for chunk in readme_chunks] == [
        (1, 40),
        (41, 45),
    ]
    assert len(interface_chunks) == 1
    assert interface_chunks[0].start_line == 5
    assert interface_chunks[0].end_line == 20
    assert "line 10" in interface_chunks[0].text


def test_content_extractor_ignores_unindexed_and_missing_paths(tmp_path):
    """Check that no chunks are produced for a path-less fact or a missing file.

    The repository contains a README.md, but neither fact points at it: the
    "language" fact carries no path and the documentation fact references
    "missing.md", which is never created.
    """
    repo = tmp_path / "repo"
    repo.mkdir()
    (repo / "README.md").write_text("# ok\n", encoding="utf-8")

    chunks = ContentExtractor().extract(
        repo,
        [
            fact(1, "language", "Python"),
            fact(2, "documentation", "missing", "missing.md"),
        ],
    )

    assert chunks == []