first steps to better scanning of repos

2026-05-02 00:11:55 +02:00
parent 2c427d253c
commit 89c4081001
9 changed files with 270 additions and 35 deletions
--- a/src/repo_registry/candidate_graph/generator.py
+++ b/src/repo_registry/candidate_graph/generator.py
@@ -63,7 +63,8 @@ class CandidateGraphGenerator:
            return []
        chunks = chunks or []

-        docs = self._facts(facts, "documentation")
+        scope_docs = self._facts(facts, "scope")
+        docs = scope_docs + self._facts(facts, "documentation")
        tests = self._facts(facts, "test")
        examples = self._facts(facts, "example")
        interfaces = self._facts(facts, "interface")
@@ -660,8 +661,8 @@ class CandidateGraphGenerator:
        return f"Support {self._humanize_identifier(repository.name)}"

    def _document_purpose_sentence(self, chunks: list[ContentChunk]) -> str:
-        for chunk in chunks:
-            if chunk.kind != "documentation":
+        for chunk in self._documentation_chunks(chunks):
+            if chunk.kind not in {"scope", "documentation"}:
                continue
            lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
            paragraph = next((line for line in lines if not line.startswith("#")), "")
@@ -731,9 +732,7 @@ class CandidateGraphGenerator:
        )

    def _document_summary(self, chunks: list[ContentChunk]) -> str:
-        for chunk in chunks:
-            if chunk.kind != "documentation":
-                continue
+        for chunk in self._documentation_chunks(chunks):
            lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
            if not lines:
                continue
@@ -744,6 +743,12 @@ class CandidateGraphGenerator:
            return heading or paragraph
        return ""

+    def _documentation_chunks(self, chunks: list[ContentChunk]) -> list[ContentChunk]:
+        """Return the scope and documentation chunks in a stable priority order.
+
+        Chunks of any other kind are dropped.  The result is sorted so that
+        every "scope" chunk precedes every "documentation" chunk; within each
+        kind chunks are ordered by path and then by start line.
+        """
+        return sorted(
+            [chunk for chunk in chunks if chunk.kind in {"scope", "documentation"}],
+            key=lambda chunk: (0 if chunk.kind == "scope" else 1, chunk.path, chunk.start_line),
+        )
+
    def _interface_summary(self, chunks: list[ContentChunk]) -> str:
        for chunk in chunks:
            if chunk.kind != "interface":
--- a/src/repo_registry/content_indexing/extractor.py
+++ b/src/repo_registry/content_indexing/extractor.py
@@ -1,12 +1,13 @@
 from __future__ import annotations

-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path

 from repo_registry.core.models import ObservedFact


 INDEXED_FACT_KINDS = {
+    "scope",
    "documentation",
    "example",
    "test",
@@ -29,6 +30,7 @@ class ContentChunkCandidate:
    start_line: int
    end_line: int
    text: str
+    metadata: dict[str, object] = field(default_factory=dict)


 class ContentExtractor:
@@ -80,6 +82,7 @@ class ContentExtractor:
                    path,
                    root,
                    fact.kind,
+                    fact.metadata,
                    lines,
                    start_line,
                    end_line,
@@ -91,7 +94,15 @@ class ContentExtractor:
            start_line = start_index + 1
            end_line = min(len(lines), start_index + MAX_CHUNK_LINES)
            chunks.append(
-                self._chunk(path, root, fact.kind, lines, start_line, end_line)
+                self._chunk(
+                    path,
+                    root,
+                    fact.kind,
+                    fact.metadata,
+                    lines,
+                    start_line,
+                    end_line,
+                )
            )
        return chunks

@@ -100,6 +111,7 @@ class ContentExtractor:
        path: Path,
        root: Path,
        kind: str,
+        fact_metadata: dict[str, object],
        lines: list[str],
        start_line: int,
        end_line: int,
@@ -110,6 +122,7 @@ class ContentExtractor:
            start_line=start_line,
            end_line=end_line,
            text="\n".join(lines[start_line - 1 : end_line]).strip(),
+            metadata={"source_role": fact_metadata.get("source_role", "")},
        )

    def _is_within(self, root: Path, path: Path) -> bool:
--- a/src/repo_registry/core/models.py
+++ b/src/repo_registry/core/models.py
@@ -119,6 +119,7 @@ class ContentChunk:
    start_line: int
    end_line: int
    text: str
+    metadata: dict[str, Any] = field(default_factory=dict)


@dataclass(frozen=True)
--- a/src/repo_registry/repo_scanning/scanner.py
+++ b/src/repo_registry/repo_scanning/scanner.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import subprocess
+import re
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
@@ -86,6 +87,17 @@ LLM_CREDENTIAL_HINTS = {
    "GOOGLE_API_KEY": "Google API key",
 }

+# File names that _source_role labels "agent_guidance".  Entries must be
+# lower-case: they are compared against the lower-cased file name.
+AGENT_GUIDANCE_FILES = {
+    "agents.md",
+    "claude.md",
+}
+
+# Directory names; a path containing one of these as any segment is labelled
+# "agent_guidance".  Entries must be lower-case for the same reason as above.
+AGENT_GUIDANCE_DIRS = {
+    ".claude",
+    ".codex",
+    ".cursor",
+}
+

@dataclass(frozen=True)
 class FactCandidate:
@@ -153,7 +165,7 @@ class DeterministicScanner:
                kind="language",
                name=language,
                value=str(count),
-                metadata={"file_count": count},
+                metadata={"file_count": count, "source_role": "implementation_source"},
            )
            for language, count in counts.items()
        ]
@@ -166,14 +178,45 @@ class DeterministicScanner:
            relative = path.relative_to(root).as_posix()
            lower = relative.lower()
            name = path.name.lower()
+            source_role = self._source_role(relative)

-            if name.startswith("readme"):
-                facts.append(FactCandidate("documentation", "README", relative))
+            if name == "scope.md":
+                facts.append(
+                    FactCandidate(
+                        "scope",
+                        "SCOPE",
+                        relative,
+                        metadata={"source_role": "scope_summary"},
+                    )
+                )
+            elif name.startswith("readme"):
+                facts.append(
+                    FactCandidate(
+                        "documentation",
+                        "README",
+                        relative,
+                        metadata={"source_role": "product_documentation"},
+                    )
+                )
            elif lower.startswith("docs/") or lower.startswith("doc/"):
-                facts.append(FactCandidate("documentation", path.name, relative))
+                facts.append(
+                    FactCandidate(
+                        "documentation",
+                        path.name,
+                        relative,
+                        metadata={"source_role": "product_documentation"},
+                    )
+                )

            if lower.startswith("examples/") or lower.startswith("example/"):
-                facts.append(FactCandidate("example", path.name, relative))
+                facts.append(
+                    FactCandidate(
+                        "example",
+                        path.name,
+                        relative,
+                        metadata={"source_role": "product_documentation"},
+                    )
+                )

            if (
                lower.startswith("tests/")
@@ -183,7 +226,14 @@ class DeterministicScanner:
                or name.endswith(".test.ts")
                or name.endswith(".spec.ts")
            ):
-                facts.append(FactCandidate("test", path.name, relative))
+                facts.append(
+                    FactCandidate(
+                        "test",
+                        path.name,
+                        relative,
+                        metadata={"source_role": "test_evidence"},
+                    )
+                )

            if name in MANIFEST_FRAMEWORK_HINTS or name in {
                "requirements.txt",
@@ -193,10 +243,24 @@ class DeterministicScanner:
                "yarn.lock",
                "go.mod",
            }:
-                facts.append(FactCandidate("manifest", path.name, relative))
+                facts.append(
+                    FactCandidate(
+                        "manifest",
+                        path.name,
+                        relative,
+                        metadata={"source_role": "dependency_declaration"},
+                    )
+                )

            if lower.endswith((".yaml", ".yml", ".toml", ".ini", ".env.example")):
-                facts.append(FactCandidate("config", path.name, relative))
+                facts.append(
+                    FactCandidate(
+                        "config",
+                        path.name,
+                        relative,
+                        metadata={"source_role": source_role},
+                    )
+                )

        return facts

@@ -223,7 +287,11 @@ class DeterministicScanner:
                        kind="framework",
                        name=framework,
                        path=path.relative_to(root).as_posix(),
-                        metadata={"source": "manifest_hint", "needle": needle},
+                        metadata={
+                            "source": "manifest_hint",
+                            "needle": needle,
+                            "source_role": "dependency_declaration",
+                        },
                    )
                )
        return facts
@@ -236,9 +304,23 @@ class DeterministicScanner:
            if path.suffix == ".py":
                facts.extend(self._python_interface_facts(path, relative))
            if "cli" in lower or lower.endswith("/commands.py"):
-                facts.append(FactCandidate("interface", "possible CLI", relative))
+                facts.append(
+                    FactCandidate(
+                        "interface",
+                        "possible CLI",
+                        relative,
+                        metadata={"source_role": self._source_role(relative)},
+                    )
+                )
            if "routes" in lower or "api" in lower:
-                facts.append(FactCandidate("interface", "possible API surface", relative))
+                facts.append(
+                    FactCandidate(
+                        "interface",
+                        "possible API surface",
+                        relative,
+                        metadata={"source_role": self._source_role(relative)},
+                    )
+                )
        return facts

    def _llm_provider_facts(self, files: list[Path], root: Path) -> list[FactCandidate]:
@@ -264,8 +346,11 @@ class DeterministicScanner:
                continue
            lower_text = text.lower()
            relative = path.relative_to(root).as_posix()
+            source_role = self._source_role(relative)
+            if source_role == "agent_guidance":
+                continue
            for needle, provider in LLM_PROVIDER_HINTS.items():
-                if needle not in lower_text:
+                if not self._has_provider_signal(lower_text, needle):
                    continue
                self._append_once(
                    facts,
@@ -275,7 +360,10 @@ class DeterministicScanner:
                        name=provider,
                        path=relative,
                        value=needle,
-                        metadata={"source": "provider_hint"},
+                        metadata={
+                            "source": "provider_hint",
+                            "source_role": source_role,
+                        },
                    ),
                )
            for env_name, label in LLM_CREDENTIAL_HINTS.items():
@@ -289,11 +377,22 @@ class DeterministicScanner:
                        name=label,
                        path=relative,
                        value=env_name,
-                        metadata={"source": "environment_variable"},
+                        metadata={
+                            "source": "environment_variable",
+                            "source_role": source_role,
+                        },
                    ),
                )
-            if any(term in lower_text for term in ("provider_registry", "providers =", "adapter")):
-                if any(needle in lower_text for needle in LLM_PROVIDER_HINTS):
+            registry_hint = (
+                "provider_registry" in lower_text
+                or "providers =" in lower_text
+                or ("adapter" in lower_text and source_role == "implementation_source")
+            )
+            if registry_hint:
+                if any(
+                    self._has_provider_signal(lower_text, needle)
+                    for needle in LLM_PROVIDER_HINTS
+                ):
                    self._append_once(
                        facts,
                        seen,
@@ -301,11 +400,15 @@ class DeterministicScanner:
                            kind="provider_registry",
                            name="LLM provider registry",
                            path=relative,
-                            metadata={"source": "provider_registry_hint"},
+                            metadata={
+                                "source": "provider_registry_hint",
+                                "source_role": source_role,
+                            },
                        ),
                    )
            if "fallback" in lower_text and any(
-                needle in lower_text for needle in LLM_PROVIDER_HINTS
+                self._has_provider_signal(lower_text, needle)
+                for needle in LLM_PROVIDER_HINTS
            ):
                self._append_once(
                    facts,
@@ -314,11 +417,47 @@ class DeterministicScanner:
                        kind="fallback_policy",
                        name="LLM provider fallback policy",
                        path=relative,
-                        metadata={"source": "fallback_hint"},
+                        metadata={
+                            "source": "fallback_hint",
+                            "source_role": source_role,
+                        },
                    ),
                )
        return facts

+    def _source_role(self, relative_path: str) -> str:
+        """Classify a repository-relative path into a provenance role label.
+
+        The path is lower-cased and the rules below are evaluated in order, so
+        the first rule that matches decides the role.
+
+        Args:
+            relative_path: Path with "/" separators (callers pass the POSIX
+                form of the path relative to the repository root).
+
+        Returns:
+            One of "scope_summary", "agent_guidance", "ci_tooling",
+            "test_evidence", "product_documentation",
+            "dependency_declaration", "configuration", or
+            "implementation_source" when no rule matches.
+        """
+        lower = relative_path.lower()
+        parts = lower.split("/")
+        name = parts[-1]
+        if name == "scope.md":
+            return "scope_summary"
+        # Agent guidance wins over every later rule, including docs and config.
+        if name in AGENT_GUIDANCE_FILES or any(part in AGENT_GUIDANCE_DIRS for part in parts):
+            return "agent_guidance"
+        if lower.startswith((".github/workflows/", ".gitea/workflows/")):
+            return "ci_tooling"
+        # NOTE(review): the scanner's test-fact detection also accepts names
+        # ending in ".test.ts" / ".spec.ts", and its manifest list includes
+        # "requirements.txt"; neither is matched by the rules here (unless
+        # covered by MANIFEST_FRAMEWORK_HINTS) -- confirm this is intended.
+        if lower.startswith(("tests/", "test/")) or name.startswith("test_"):
+            return "test_evidence"
+        if name.startswith("readme") or lower.startswith(("docs/", "doc/", "wiki/")):
+            return "product_documentation"
+        if name in MANIFEST_FRAMEWORK_HINTS or name.endswith((".lock", ".mod")):
+            return "dependency_declaration"
+        if lower.endswith((".yaml", ".yml", ".toml", ".ini", ".env.example")):
+            return "configuration"
+        return "implementation_source"
+
+    def _has_provider_signal(self, lower_text: str, needle: str) -> bool:
+        """Return True when ``needle`` occurs in ``lower_text`` as a standalone token.
+
+        An occurrence only counts when it is not directly preceded or followed
+        by a lowercase letter, a digit, or a hyphen, so a needle embedded in a
+        longer identifier is ignored.  For the ``claude`` needle, occurrences
+        whose surrounding window (20 characters on either side) contains
+        ``claude.md``, ``claude code`` or ``claude.ai/code`` are skipped so
+        that references to agent tooling are not reported as provider usage.
+
+        Args:
+            lower_text: Text to search; callers pass it already lower-cased.
+            needle: Provider hint to look for; matched case-insensitively.
+
+        Returns:
+            True if at least one qualifying occurrence exists, otherwise False.
+        """
+        # Normalise once so the regex and the "claude" special case agree even
+        # when a hint is declared with mixed case.
+        needle_lower = needle.lower()
+        pattern = re.compile(rf"(?<![a-z0-9-]){re.escape(needle_lower)}(?![a-z0-9-])")
+        for match in pattern.finditer(lower_text):
+            context = lower_text[max(0, match.start() - 20) : match.end() + 20]
+            if needle_lower == "claude" and (
+                "claude.md" in context
+                or "claude code" in context
+                or "claude.ai/code" in context
+            ):
+                continue
+            return True
+        return False
+
    def _append_once(
        self,
        facts: list[FactCandidate],
@@ -347,7 +486,10 @@ class DeterministicScanner:
                        name="python route decorator",
                        path=relative,
                        value=stripped,
-                        metadata={"line": line_number},
+                        metadata={
+                            "line": line_number,
+                            "source_role": self._source_role(relative),
+                        },
                    )
                )
            elif stripped.startswith("@click.command") or stripped.startswith("@app.command"):
@@ -357,7 +499,10 @@ class DeterministicScanner:
                        name="python CLI command decorator",
                        path=relative,
                        value=stripped,
-                        metadata={"line": line_number},
+                        metadata={
+                            "line": line_number,
+                            "source_role": self._source_role(relative),
+                        },
                    )
                )
        return facts
--- a/src/repo_registry/storage/sqlite.py
+++ b/src/repo_registry/storage/sqlite.py
@@ -180,6 +180,14 @@ class RegistryStore:
            )
            """
        )
+        columns = {
+            row["name"]
+            for row in connection.execute("PRAGMA table_info(content_chunks)").fetchall()
+        }
+        if "metadata" not in columns:
+            connection.execute(
+                "ALTER TABLE content_chunks ADD COLUMN metadata TEXT NOT NULL DEFAULT '{}'"
+            )
        connection.execute(
            "CREATE INDEX IF NOT EXISTS idx_content_chunks_repository ON content_chunks(repository_id)"
        )
@@ -1675,8 +1683,8 @@ class RegistryStore:
                """
                INSERT INTO content_chunks
                  (repository_id, analysis_run_id, snapshot_id, path, kind,
-                   start_line, end_line, text)
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                   start_line, end_line, text, metadata)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                [
                    (
@@ -1688,6 +1696,7 @@ class RegistryStore:
                        chunk.start_line,
                        chunk.end_line,
                        chunk.text,
+                        json.dumps(chunk.metadata),
                    )
                    for chunk in chunks
                ],
@@ -1709,7 +1718,7 @@ class RegistryStore:
            rows = connection.execute(
                f"""
                SELECT id, repository_id, analysis_run_id, snapshot_id, path, kind,
-                       start_line, end_line, text
+                       start_line, end_line, text, metadata
                FROM content_chunks
                {where}
                ORDER BY path ASC, start_line ASC, id ASC
@@ -2842,6 +2851,7 @@ class RegistryStore:
            start_line=row["start_line"],
            end_line=row["end_line"],
            text=row["text"],
+            metadata=json.loads(row["metadata"]),
        )

    @staticmethod
--- a/src/repo_registry/web_api/schemas.py
+++ b/src/repo_registry/web_api/schemas.py
@@ -462,6 +462,7 @@ class ContentChunkResponse(BaseModel):
    start_line: int
    end_line: int
    text: str
+    metadata: dict[str, Any]


 class ScanSummaryResponse(BaseModel):
--- a/tests/test_content_indexing.py
+++ b/tests/test_content_indexing.py
@@ -2,10 +2,12 @@ from repo_registry.content_indexing.extractor import ContentExtractor
 from repo_registry.core.models import ObservedFact


-def fact(id, kind, name, path="", line=None):
+def fact(id, kind, name, path="", line=None, source_role=""):
    metadata = {}
    if line is not None:
        metadata["line"] = line
+    if source_role:
+        metadata["source_role"] = source_role
    return ObservedFact(
        id=id,
        repository_id=1,
@@ -82,3 +84,20 @@ def test_content_extractor_chunks_provider_related_config(tmp_path):
    assert len(chunks) == 1
    assert chunks[0].path == ".env.example"
    assert "OPENROUTER_API_KEY" in chunks[0].text
+
+
+def test_content_extractor_preserves_source_role_metadata(tmp_path):
+    """A fact's ``source_role`` metadata is carried onto the extracted chunk."""
+    repo = tmp_path / "repo"
+    repo.mkdir()
+    (repo / "SCOPE.md").write_text("# SCOPE\n\nProvides OIDC.\n", encoding="utf-8")
+
+    chunks = ContentExtractor().extract(
+        repo,
+        [
+            fact(1, "scope", "SCOPE", "SCOPE.md", source_role="scope_summary"),
+        ],
+    )
+
+    # The single scope fact yields one chunk that keeps both kind and role.
+    assert len(chunks) == 1
+    assert chunks[0].kind == "scope"
+    assert chunks[0].metadata["source_role"] == "scope_summary"
--- a/tests/test_repository_scanner.py
+++ b/tests/test_repository_scanner.py
@@ -42,6 +42,22 @@ def test_deterministic_scanner_extracts_structural_facts(tmp_path):
    assert languages == {"Python": 2}


+def test_scanner_records_scope_with_source_role(tmp_path):
+    """A top-level SCOPE.md produces a "scope" fact tagged as a scope summary."""
+    repo = tmp_path / "sample"
+    repo.mkdir()
+    (repo / "SCOPE.md").write_text(
+        "# SCOPE\n\n## One-liner\n\nProvides OIDC profile enforcement.\n",
+        encoding="utf-8",
+    )
+
+    result = DeterministicScanner().scan(repo)
+
+    # Raises StopIteration (failing the test) if no scope fact was recorded.
+    scope_fact = next(fact for fact in result.facts if fact.kind == "scope")
+    assert scope_fact.name == "SCOPE"
+    assert scope_fact.path == "SCOPE.md"
+    assert scope_fact.metadata["source_role"] == "scope_summary"
+
+
 def test_scanner_readme_only_fixture_records_docs_without_interfaces(tmp_path):
    repo = write_readme_only_repo(tmp_path)

@@ -116,3 +132,28 @@ def test_scanner_records_llm_provider_and_fallback_facts(tmp_path):
    assert ("credential_config", "Anthropic API key", ".env.example") in facts
    assert ("provider_registry", "LLM provider registry", "providers.py") in facts
    assert ("fallback_policy", "LLM provider fallback policy", "README.md") in facts
+
+
+def test_scanner_does_not_treat_agent_guidance_as_llm_provider(tmp_path):
+    """Mentions of CLAUDE.md / Claude Code must not yield LLM provider facts."""
+    repo = tmp_path / "key-cape-like"
+    repo.mkdir()
+    # The README mentions both "adapters" and `CLAUDE.md`, which together
+    # would look like a provider registry to a plain substring match.
+    (repo / "README.md").write_text(
+        "# KeyCape\n\n"
+        "Backend adapters live in src/internal/adapters.\n\n"
+        "See `CLAUDE.md` for agent session protocol.\n",
+        encoding="utf-8",
+    )
+    (repo / "CLAUDE.md").write_text(
+        "# CLAUDE.md\n\n"
+        "This file provides guidance to Claude Code when working in this repo.\n",
+        encoding="utf-8",
+    )
+    (repo / "src").mkdir()
+    (repo / "src" / "go.mod").write_text("module keycape\n", encoding="utf-8")
+
+    result = DeterministicScanner().scan(repo)
+
+    facts = {(fact.kind, fact.name, fact.path) for fact in result.facts}
+    assert ("llm_provider", "Claude", "CLAUDE.md") not in facts
+    assert ("llm_provider", "Claude", "README.md") not in facts
+    assert ("provider_registry", "LLM provider registry", "README.md") not in facts
--- a/workplans/RREG-WP-0009-provenance-aware-characteristic-rebuild.md
+++ b/workplans/RREG-WP-0009-provenance-aware-characteristic-rebuild.md
@@ -34,7 +34,7 @@ The target behavior is facts-first and provenance-aware:

 ```task
 id: RREG-WP-0009-T01
-status: todo
+status: done
 priority: high
 state_hub_task_id: "0c189443-5000-4025-a144-75e5bf1e3be5"
 ```
@@ -68,7 +68,7 @@ Acceptance criteria:

 ```task
 id: RREG-WP-0009-T02
-status: todo
+status: in_progress
 priority: high
 state_hub_task_id: "3ef728a0-832f-4441-9ece-16888ef68c47"
 ```