diff --git a/src/repo_registry/candidate_graph/generator.py b/src/repo_registry/candidate_graph/generator.py index 496a07e..e9978f6 100644 --- a/src/repo_registry/candidate_graph/generator.py +++ b/src/repo_registry/candidate_graph/generator.py @@ -63,7 +63,8 @@ class CandidateGraphGenerator: return [] chunks = chunks or [] - docs = self._facts(facts, "documentation") + scope_docs = self._facts(facts, "scope") + docs = scope_docs + self._facts(facts, "documentation") tests = self._facts(facts, "test") examples = self._facts(facts, "example") interfaces = self._facts(facts, "interface") @@ -660,8 +661,8 @@ class CandidateGraphGenerator: return f"Support {self._humanize_identifier(repository.name)}" def _document_purpose_sentence(self, chunks: list[ContentChunk]) -> str: - for chunk in chunks: - if chunk.kind != "documentation": + for chunk in self._documentation_chunks(chunks): + if chunk.kind not in {"scope", "documentation"}: continue lines = [line.strip() for line in chunk.text.splitlines() if line.strip()] paragraph = next((line for line in lines if not line.startswith("#")), "") @@ -731,9 +732,7 @@ class CandidateGraphGenerator: ) def _document_summary(self, chunks: list[ContentChunk]) -> str: - for chunk in chunks: - if chunk.kind != "documentation": - continue + for chunk in self._documentation_chunks(chunks): lines = [line.strip() for line in chunk.text.splitlines() if line.strip()] if not lines: continue @@ -744,6 +743,12 @@ class CandidateGraphGenerator: return heading or paragraph return "" + def _documentation_chunks(self, chunks: list[ContentChunk]) -> list[ContentChunk]: + return sorted( + [chunk for chunk in chunks if chunk.kind in {"scope", "documentation"}], + key=lambda chunk: (0 if chunk.kind == "scope" else 1, chunk.path, chunk.start_line), + ) + def _interface_summary(self, chunks: list[ContentChunk]) -> str: for chunk in chunks: if chunk.kind != "interface": diff --git a/src/repo_registry/content_indexing/extractor.py b/src/repo_registry/content_indexing/extractor.py index 1349c6e..ed216ab 100644 --- a/src/repo_registry/content_indexing/extractor.py +++ b/src/repo_registry/content_indexing/extractor.py @@ -1,12 +1,13 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path from repo_registry.core.models import ObservedFact INDEXED_FACT_KINDS = { + "scope", "documentation", "example", "test", @@ -29,6 +30,7 @@ class ContentChunkCandidate: start_line: int end_line: int text: str + metadata: dict[str, object] = field(default_factory=dict) class ContentExtractor: @@ -80,6 +82,7 @@ class ContentExtractor: path, root, fact.kind, + fact.metadata, lines, start_line, end_line, @@ -91,7 +94,15 @@ class ContentExtractor: start_line = start_index + 1 end_line = min(len(lines), start_index + MAX_CHUNK_LINES) chunks.append( - self._chunk(path, root, fact.kind, lines, start_line, end_line) + self._chunk( + path, + root, + fact.kind, + fact.metadata, + lines, + start_line, + end_line, + ) ) return chunks @@ -100,6 +111,7 @@ class ContentExtractor: path: Path, root: Path, kind: str, + fact_metadata: dict[str, object], lines: list[str], start_line: int, end_line: int, @@ -110,6 +122,7 @@ class ContentExtractor: start_line=start_line, end_line=end_line, text="\n".join(lines[start_line - 1 : end_line]).strip(), + metadata={"source_role": fact_metadata.get("source_role", "")}, ) def _is_within(self, root: Path, path: Path) -> bool: diff --git a/src/repo_registry/core/models.py b/src/repo_registry/core/models.py index 78148b7..db5adf7 100644 --- a/src/repo_registry/core/models.py +++ b/src/repo_registry/core/models.py @@ -119,6 +119,7 @@ class ContentChunk: start_line: int end_line: int text: str + metadata: dict[str, Any] = field(default_factory=dict) @dataclass(frozen=True) diff --git a/src/repo_registry/repo_scanning/scanner.py b/src/repo_registry/repo_scanning/scanner.py index cb841ef..db77d85 100644 --- a/src/repo_registry/repo_scanning/scanner.py +++ b/src/repo_registry/repo_scanning/scanner.py @@ -1,6 +1,7 @@ from __future__ import annotations import subprocess +import re from dataclasses import dataclass, field from pathlib import Path from typing import Any @@ -86,6 +87,17 @@ LLM_CREDENTIAL_HINTS = { "GOOGLE_API_KEY": "Google API key", } +AGENT_GUIDANCE_FILES = { + "agents.md", + "claude.md", +} + +AGENT_GUIDANCE_DIRS = { + ".claude", + ".codex", + ".cursor", +} + @dataclass(frozen=True) class FactCandidate: @@ -153,7 +165,7 @@ class DeterministicScanner: kind="language", name=language, value=str(count), - metadata={"file_count": count}, + metadata={"file_count": count, "source_role": "implementation_source"}, ) for language, count in counts.items() ] @@ -166,14 +178,45 @@ class DeterministicScanner: relative = path.relative_to(root).as_posix() lower = relative.lower() name = path.name.lower() + source_role = self._source_role(relative) - if name.startswith("readme"): - facts.append(FactCandidate("documentation", "README", relative)) + if name == "scope.md": + facts.append( + FactCandidate( + "scope", + "SCOPE", + relative, + metadata={"source_role": "scope_summary"}, + ) + ) + elif name.startswith("readme"): + facts.append( + FactCandidate( + "documentation", + "README", + relative, + metadata={"source_role": "product_documentation"}, + ) + ) elif lower.startswith("docs/") or lower.startswith("doc/"): - facts.append(FactCandidate("documentation", path.name, relative)) + facts.append( + FactCandidate( + "documentation", + path.name, + relative, + metadata={"source_role": "product_documentation"}, + ) + ) if lower.startswith("examples/") or lower.startswith("example/"): - facts.append(FactCandidate("example", path.name, relative)) + facts.append( + FactCandidate( + "example", + path.name, + relative, + metadata={"source_role": "product_documentation"}, + ) + ) if ( lower.startswith("tests/") @@ -183,7 +226,14 @@ class DeterministicScanner: or name.endswith(".test.ts") or name.endswith(".spec.ts") ): - facts.append(FactCandidate("test", path.name, relative)) + facts.append( + FactCandidate( + "test", + path.name, + relative, + metadata={"source_role": "test_evidence"}, + ) + ) if name in MANIFEST_FRAMEWORK_HINTS or name in { "requirements.txt", @@ -193,10 +243,24 @@ class DeterministicScanner: "yarn.lock", "go.mod", }: - facts.append(FactCandidate("manifest", path.name, relative)) + facts.append( + FactCandidate( + "manifest", + path.name, + relative, + metadata={"source_role": "dependency_declaration"}, + ) + ) if lower.endswith((".yaml", ".yml", ".toml", ".ini", ".env.example")): - facts.append(FactCandidate("config", path.name, relative)) + facts.append( + FactCandidate( + "config", + path.name, + relative, + metadata={"source_role": source_role}, + ) + ) return facts @@ -223,7 +287,11 @@ class DeterministicScanner: kind="framework", name=framework, path=path.relative_to(root).as_posix(), - metadata={"source": "manifest_hint", "needle": needle}, + metadata={ + "source": "manifest_hint", + "needle": needle, + "source_role": "dependency_declaration", + }, ) ) return facts @@ -236,9 +304,23 @@ class DeterministicScanner: if path.suffix == ".py": facts.extend(self._python_interface_facts(path, relative)) if "cli" in lower or lower.endswith("/commands.py"): - facts.append(FactCandidate("interface", "possible CLI", relative)) + facts.append( + FactCandidate( + "interface", + "possible CLI", + relative, + metadata={"source_role": self._source_role(relative)}, + ) + ) if "routes" in lower or "api" in lower: - facts.append(FactCandidate("interface", "possible API surface", relative)) + facts.append( + FactCandidate( + "interface", + "possible API surface", + relative, + metadata={"source_role": self._source_role(relative)}, + ) + ) return facts def _llm_provider_facts(self, files: list[Path], root: Path) -> list[FactCandidate]: @@ -264,8 +346,11 @@ class DeterministicScanner: continue lower_text = text.lower() relative = path.relative_to(root).as_posix() + source_role = self._source_role(relative) + if source_role == "agent_guidance": + continue for needle, provider in LLM_PROVIDER_HINTS.items(): - if needle not in lower_text: + if not self._has_provider_signal(lower_text, needle): continue self._append_once( facts, @@ -275,7 +360,10 @@ class DeterministicScanner: name=provider, path=relative, value=needle, - metadata={"source": "provider_hint"}, + metadata={ + "source": "provider_hint", + "source_role": source_role, + }, ), ) for env_name, label in LLM_CREDENTIAL_HINTS.items(): @@ -289,11 +377,22 @@ class DeterministicScanner: name=label, path=relative, value=env_name, - metadata={"source": "environment_variable"}, + metadata={ + "source": "environment_variable", + "source_role": source_role, + }, ), ) - if any(term in lower_text for term in ("provider_registry", "providers =", "adapter")): - if any(needle in lower_text for needle in LLM_PROVIDER_HINTS): + registry_hint = ( + "provider_registry" in lower_text + or "providers =" in lower_text + or ("adapter" in lower_text and source_role == "implementation_source") + ) + if registry_hint: + if any( + self._has_provider_signal(lower_text, needle) + for needle in LLM_PROVIDER_HINTS + ): self._append_once( facts, seen, @@ -301,11 +400,15 @@ class DeterministicScanner: kind="provider_registry", name="LLM provider registry", path=relative, - metadata={"source": "provider_registry_hint"}, + metadata={ + "source": "provider_registry_hint", + "source_role": source_role, + }, ), ) if "fallback" in lower_text and any( - needle in lower_text for needle in LLM_PROVIDER_HINTS + self._has_provider_signal(lower_text, needle) + for needle in LLM_PROVIDER_HINTS ): self._append_once( facts, @@ -314,11 +417,47 @@ class DeterministicScanner: kind="fallback_policy", name="LLM provider fallback policy", path=relative, - metadata={"source": "fallback_hint"}, + metadata={ + "source": "fallback_hint", + "source_role": source_role, + }, ), ) return facts + def _source_role(self, relative_path: str) -> str: + lower = relative_path.lower() + parts = lower.split("/") + name = parts[-1] + if name == "scope.md": + return "scope_summary" + if name in AGENT_GUIDANCE_FILES or any(part in AGENT_GUIDANCE_DIRS for part in parts): + return "agent_guidance" + if lower.startswith((".github/workflows/", ".gitea/workflows/")): + return "ci_tooling" + if lower.startswith(("tests/", "test/")) or name.startswith("test_"): + return "test_evidence" + if name.startswith("readme") or lower.startswith(("docs/", "doc/", "wiki/")): + return "product_documentation" + if name in MANIFEST_FRAMEWORK_HINTS or name.endswith((".lock", ".mod")): + return "dependency_declaration" + if lower.endswith((".yaml", ".yml", ".toml", ".ini", ".env.example")): + return "configuration" + return "implementation_source" + + def _has_provider_signal(self, lower_text: str, needle: str) -> bool: + pattern = re.compile(rf"(?