"""Tests for ``ContentExtractor`` chunking of files referenced by observed facts."""

from repo_registry.content_indexing.extractor import ContentExtractor
from repo_registry.core.models import ObservedFact


def fact(id, kind, name, path="", line=None, source_role=""):
    """Build an ``ObservedFact`` fixture for the extractor tests.

    ``repository_id``, ``analysis_run_id`` and ``snapshot_id`` are fixed to 1
    and ``value`` is always the empty string. ``metadata`` only receives a
    ``"line"`` entry when *line* is not ``None`` and a ``"source_role"`` entry
    when *source_role* is non-empty.
    """
    # Each optional entry is paired with the condition under which it is kept,
    # so absent values never show up as keys in the metadata dict.
    optional_entries = (
        ("line", line, line is not None),
        ("source_role", source_role, bool(source_role)),
    )
    metadata = {key: value for key, value, keep in optional_entries if keep}
    return ObservedFact(
        id=id,
        repository_id=1,
        analysis_run_id=1,
        snapshot_id=1,
        kind=kind,
        path=path,
        name=name,
        value="",
        metadata=metadata,
    )


def _make_repo(tmp_path, files):
    """Create ``tmp_path / "repo"`` and write *files* (relative path -> text) as UTF-8."""
    repo = tmp_path / "repo"
    repo.mkdir()
    for relative_path, text in files.items():
        (repo / relative_path).write_text(text, encoding="utf-8")
    return repo


def test_content_extractor_chunks_docs_and_interface_line_ranges(tmp_path):
    """A 45-line README yields two chunks; an interface fact yields one chunk around its line."""
    readme_text = "\n".join(f"readme line {number}" for number in range(1, 46))
    app_text = "\n".join(f"line {number}" for number in range(1, 21))
    repo = _make_repo(tmp_path, {"README.md": readme_text, "app.py": app_text})

    facts = [
        fact(1, "documentation", "README", "README.md"),
        fact(2, "interface", "python route decorator", "app.py", line=10),
    ]
    chunks = ContentExtractor().extract(repo, facts)

    readme_ranges = [
        (chunk.start_line, chunk.end_line)
        for chunk in chunks
        if chunk.path == "README.md"
    ]
    interface_chunks = [chunk for chunk in chunks if chunk.path == "app.py"]

    # Expected split of the 45 README lines.
    assert readme_ranges == [(1, 40), (41, 45)]

    # Expected single chunk for the fact at line 10 of the 20-line app.py.
    assert len(interface_chunks) == 1
    interface_chunk = interface_chunks[0]
    assert interface_chunk.start_line == 5
    assert interface_chunk.end_line == 20
    assert "line 10" in interface_chunk.text


def test_content_extractor_ignores_unindexed_and_missing_paths(tmp_path):
    """A fact with no path and a fact pointing at a missing file produce no chunks."""
    repo = _make_repo(tmp_path, {"README.md": "# ok\n"})

    facts = [
        fact(1, "language", "Python"),
        fact(2, "documentation", "missing", "missing.md"),
    ]

    assert ContentExtractor().extract(repo, facts) == []


def test_content_extractor_chunks_provider_related_config(tmp_path):
    """A ``credential_config`` fact on ``.env.example`` yields one chunk with the file text."""
    repo = _make_repo(tmp_path, {".env.example": "OPENROUTER_API_KEY=\n"})

    facts = [fact(1, "credential_config", "OpenRouter API key", ".env.example")]
    chunks = ContentExtractor().extract(repo, facts)

    assert len(chunks) == 1
    config_chunk = chunks[0]
    assert config_chunk.path == ".env.example"
    assert "OPENROUTER_API_KEY" in config_chunk.text


def test_content_extractor_preserves_intent_source_role_metadata(tmp_path):
    """The ``source_role`` metadata of an ``intent`` fact is carried onto its chunk."""
    repo = _make_repo(tmp_path, {"INTENT.md": "# INTENT\n\nProvide OIDC.\n"})

    facts = [fact(1, "intent", "INTENT", "INTENT.md", source_role="intent_summary")]
    chunks = ContentExtractor().extract(repo, facts)

    assert len(chunks) == 1
    intent_chunk = chunks[0]
    assert intent_chunk.kind == "intent"
    assert intent_chunk.metadata["source_role"] == "intent_summary"