first steps to better scanning of repos

This commit is contained in:
2026-05-02 00:11:55 +02:00
parent 2c427d253c
commit 89c4081001
9 changed files with 270 additions and 35 deletions

View File

@@ -63,7 +63,8 @@ class CandidateGraphGenerator:
return []
chunks = chunks or []
docs = self._facts(facts, "documentation")
scope_docs = self._facts(facts, "scope")
docs = scope_docs + self._facts(facts, "documentation")
tests = self._facts(facts, "test")
examples = self._facts(facts, "example")
interfaces = self._facts(facts, "interface")
@@ -660,8 +661,8 @@ class CandidateGraphGenerator:
return f"Support {self._humanize_identifier(repository.name)}"
def _document_purpose_sentence(self, chunks: list[ContentChunk]) -> str:
for chunk in chunks:
if chunk.kind != "documentation":
for chunk in self._documentation_chunks(chunks):
if chunk.kind not in {"scope", "documentation"}:
continue
lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
paragraph = next((line for line in lines if not line.startswith("#")), "")
@@ -731,9 +732,7 @@ class CandidateGraphGenerator:
)
def _document_summary(self, chunks: list[ContentChunk]) -> str:
for chunk in chunks:
if chunk.kind != "documentation":
continue
for chunk in self._documentation_chunks(chunks):
lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
if not lines:
continue
@@ -744,6 +743,12 @@ class CandidateGraphGenerator:
return heading or paragraph
return ""
def _documentation_chunks(self, chunks: list[ContentChunk]) -> list[ContentChunk]:
return sorted(
[chunk for chunk in chunks if chunk.kind in {"scope", "documentation"}],
key=lambda chunk: (0 if chunk.kind == "scope" else 1, chunk.path, chunk.start_line),
)
def _interface_summary(self, chunks: list[ContentChunk]) -> str:
for chunk in chunks:
if chunk.kind != "interface":

View File

@@ -1,12 +1,13 @@
from __future__ import annotations
from dataclasses import dataclass
from dataclasses import dataclass, field
from pathlib import Path
from repo_registry.core.models import ObservedFact
INDEXED_FACT_KINDS = {
"scope",
"documentation",
"example",
"test",
@@ -29,6 +30,7 @@ class ContentChunkCandidate:
start_line: int
end_line: int
text: str
metadata: dict[str, object] = field(default_factory=dict)
class ContentExtractor:
@@ -80,6 +82,7 @@ class ContentExtractor:
path,
root,
fact.kind,
fact.metadata,
lines,
start_line,
end_line,
@@ -91,7 +94,15 @@ class ContentExtractor:
start_line = start_index + 1
end_line = min(len(lines), start_index + MAX_CHUNK_LINES)
chunks.append(
self._chunk(path, root, fact.kind, lines, start_line, end_line)
self._chunk(
path,
root,
fact.kind,
fact.metadata,
lines,
start_line,
end_line,
)
)
return chunks
@@ -100,6 +111,7 @@ class ContentExtractor:
path: Path,
root: Path,
kind: str,
fact_metadata: dict[str, object],
lines: list[str],
start_line: int,
end_line: int,
@@ -110,6 +122,7 @@ class ContentExtractor:
start_line=start_line,
end_line=end_line,
text="\n".join(lines[start_line - 1 : end_line]).strip(),
metadata={"source_role": fact_metadata.get("source_role", "")},
)
def _is_within(self, root: Path, path: Path) -> bool:

View File

@@ -119,6 +119,7 @@ class ContentChunk:
start_line: int
end_line: int
text: str
metadata: dict[str, Any] = field(default_factory=dict)
@dataclass(frozen=True)

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import subprocess
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
@@ -86,6 +87,17 @@ LLM_CREDENTIAL_HINTS = {
"GOOGLE_API_KEY": "Google API key",
}
# Lower-case file names that mark a file as guidance written for coding agents.
AGENT_GUIDANCE_FILES = {"agents.md", "claude.md"}
# Lower-case directory names; a path containing one of these as a component
# is classified as agent guidance.
AGENT_GUIDANCE_DIRS = {".claude", ".codex", ".cursor"}
@dataclass(frozen=True)
class FactCandidate:
@@ -153,7 +165,7 @@ class DeterministicScanner:
kind="language",
name=language,
value=str(count),
metadata={"file_count": count},
metadata={"file_count": count, "source_role": "implementation_source"},
)
for language, count in counts.items()
]
@@ -166,14 +178,45 @@ class DeterministicScanner:
relative = path.relative_to(root).as_posix()
lower = relative.lower()
name = path.name.lower()
source_role = self._source_role(relative)
if name.startswith("readme"):
facts.append(FactCandidate("documentation", "README", relative))
if name == "scope.md":
facts.append(
FactCandidate(
"scope",
"SCOPE",
relative,
metadata={"source_role": "scope_summary"},
)
)
elif name.startswith("readme"):
facts.append(
FactCandidate(
"documentation",
"README",
relative,
metadata={"source_role": "product_documentation"},
)
)
elif lower.startswith("docs/") or lower.startswith("doc/"):
facts.append(FactCandidate("documentation", path.name, relative))
facts.append(
FactCandidate(
"documentation",
path.name,
relative,
metadata={"source_role": "product_documentation"},
)
)
if lower.startswith("examples/") or lower.startswith("example/"):
facts.append(FactCandidate("example", path.name, relative))
facts.append(
FactCandidate(
"example",
path.name,
relative,
metadata={"source_role": "product_documentation"},
)
)
if (
lower.startswith("tests/")
@@ -183,7 +226,14 @@ class DeterministicScanner:
or name.endswith(".test.ts")
or name.endswith(".spec.ts")
):
facts.append(FactCandidate("test", path.name, relative))
facts.append(
FactCandidate(
"test",
path.name,
relative,
metadata={"source_role": "test_evidence"},
)
)
if name in MANIFEST_FRAMEWORK_HINTS or name in {
"requirements.txt",
@@ -193,10 +243,24 @@ class DeterministicScanner:
"yarn.lock",
"go.mod",
}:
facts.append(FactCandidate("manifest", path.name, relative))
facts.append(
FactCandidate(
"manifest",
path.name,
relative,
metadata={"source_role": "dependency_declaration"},
)
)
if lower.endswith((".yaml", ".yml", ".toml", ".ini", ".env.example")):
facts.append(FactCandidate("config", path.name, relative))
facts.append(
FactCandidate(
"config",
path.name,
relative,
metadata={"source_role": source_role},
)
)
return facts
@@ -223,7 +287,11 @@ class DeterministicScanner:
kind="framework",
name=framework,
path=path.relative_to(root).as_posix(),
metadata={"source": "manifest_hint", "needle": needle},
metadata={
"source": "manifest_hint",
"needle": needle,
"source_role": "dependency_declaration",
},
)
)
return facts
@@ -236,9 +304,23 @@ class DeterministicScanner:
if path.suffix == ".py":
facts.extend(self._python_interface_facts(path, relative))
if "cli" in lower or lower.endswith("/commands.py"):
facts.append(FactCandidate("interface", "possible CLI", relative))
facts.append(
FactCandidate(
"interface",
"possible CLI",
relative,
metadata={"source_role": self._source_role(relative)},
)
)
if "routes" in lower or "api" in lower:
facts.append(FactCandidate("interface", "possible API surface", relative))
facts.append(
FactCandidate(
"interface",
"possible API surface",
relative,
metadata={"source_role": self._source_role(relative)},
)
)
return facts
def _llm_provider_facts(self, files: list[Path], root: Path) -> list[FactCandidate]:
@@ -264,8 +346,11 @@ class DeterministicScanner:
continue
lower_text = text.lower()
relative = path.relative_to(root).as_posix()
source_role = self._source_role(relative)
if source_role == "agent_guidance":
continue
for needle, provider in LLM_PROVIDER_HINTS.items():
if needle not in lower_text:
if not self._has_provider_signal(lower_text, needle):
continue
self._append_once(
facts,
@@ -275,7 +360,10 @@ class DeterministicScanner:
name=provider,
path=relative,
value=needle,
metadata={"source": "provider_hint"},
metadata={
"source": "provider_hint",
"source_role": source_role,
},
),
)
for env_name, label in LLM_CREDENTIAL_HINTS.items():
@@ -289,11 +377,22 @@ class DeterministicScanner:
name=label,
path=relative,
value=env_name,
metadata={"source": "environment_variable"},
metadata={
"source": "environment_variable",
"source_role": source_role,
},
),
)
if any(term in lower_text for term in ("provider_registry", "providers =", "adapter")):
if any(needle in lower_text for needle in LLM_PROVIDER_HINTS):
registry_hint = (
"provider_registry" in lower_text
or "providers =" in lower_text
or ("adapter" in lower_text and source_role == "implementation_source")
)
if registry_hint:
if any(
self._has_provider_signal(lower_text, needle)
for needle in LLM_PROVIDER_HINTS
):
self._append_once(
facts,
seen,
@@ -301,11 +400,15 @@ class DeterministicScanner:
kind="provider_registry",
name="LLM provider registry",
path=relative,
metadata={"source": "provider_registry_hint"},
metadata={
"source": "provider_registry_hint",
"source_role": source_role,
},
),
)
if "fallback" in lower_text and any(
needle in lower_text for needle in LLM_PROVIDER_HINTS
self._has_provider_signal(lower_text, needle)
for needle in LLM_PROVIDER_HINTS
):
self._append_once(
facts,
@@ -314,11 +417,47 @@ class DeterministicScanner:
kind="fallback_policy",
name="LLM provider fallback policy",
path=relative,
metadata={"source": "fallback_hint"},
metadata={
"source": "fallback_hint",
"source_role": source_role,
},
),
)
return facts
def _source_role(self, relative_path: str) -> str:
lower = relative_path.lower()
parts = lower.split("/")
name = parts[-1]
if name == "scope.md":
return "scope_summary"
if name in AGENT_GUIDANCE_FILES or any(part in AGENT_GUIDANCE_DIRS for part in parts):
return "agent_guidance"
if lower.startswith((".github/workflows/", ".gitea/workflows/")):
return "ci_tooling"
if lower.startswith(("tests/", "test/")) or name.startswith("test_"):
return "test_evidence"
if name.startswith("readme") or lower.startswith(("docs/", "doc/", "wiki/")):
return "product_documentation"
if name in MANIFEST_FRAMEWORK_HINTS or name.endswith((".lock", ".mod")):
return "dependency_declaration"
if lower.endswith((".yaml", ".yml", ".toml", ".ini", ".env.example")):
return "configuration"
return "implementation_source"
def _has_provider_signal(self, lower_text: str, needle: str) -> bool:
pattern = re.compile(rf"(?<![a-z0-9-]){re.escape(needle.lower())}(?![a-z0-9-])")
for match in pattern.finditer(lower_text):
context = lower_text[max(0, match.start() - 20) : match.end() + 20]
if needle == "claude" and (
"claude.md" in context
or "claude code" in context
or "claude.ai/code" in context
):
continue
return True
return False
def _append_once(
self,
facts: list[FactCandidate],
@@ -347,7 +486,10 @@ class DeterministicScanner:
name="python route decorator",
path=relative,
value=stripped,
metadata={"line": line_number},
metadata={
"line": line_number,
"source_role": self._source_role(relative),
},
)
)
elif stripped.startswith("@click.command") or stripped.startswith("@app.command"):
@@ -357,7 +499,10 @@ class DeterministicScanner:
name="python CLI command decorator",
path=relative,
value=stripped,
metadata={"line": line_number},
metadata={
"line": line_number,
"source_role": self._source_role(relative),
},
)
)
return facts

View File

@@ -180,6 +180,14 @@ class RegistryStore:
)
"""
)
columns = {
row["name"]
for row in connection.execute("PRAGMA table_info(content_chunks)").fetchall()
}
if "metadata" not in columns:
connection.execute(
"ALTER TABLE content_chunks ADD COLUMN metadata TEXT NOT NULL DEFAULT '{}'"
)
connection.execute(
"CREATE INDEX IF NOT EXISTS idx_content_chunks_repository ON content_chunks(repository_id)"
)
@@ -1675,8 +1683,8 @@ class RegistryStore:
"""
INSERT INTO content_chunks
(repository_id, analysis_run_id, snapshot_id, path, kind,
start_line, end_line, text)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
start_line, end_line, text, metadata)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
[
(
@@ -1688,6 +1696,7 @@ class RegistryStore:
chunk.start_line,
chunk.end_line,
chunk.text,
json.dumps(chunk.metadata),
)
for chunk in chunks
],
@@ -1709,7 +1718,7 @@ class RegistryStore:
rows = connection.execute(
f"""
SELECT id, repository_id, analysis_run_id, snapshot_id, path, kind,
start_line, end_line, text
start_line, end_line, text, metadata
FROM content_chunks
{where}
ORDER BY path ASC, start_line ASC, id ASC
@@ -2842,6 +2851,7 @@ class RegistryStore:
start_line=row["start_line"],
end_line=row["end_line"],
text=row["text"],
metadata=json.loads(row["metadata"]),
)
@staticmethod

View File

@@ -462,6 +462,7 @@ class ContentChunkResponse(BaseModel):
start_line: int
end_line: int
text: str
metadata: dict[str, Any]
class ScanSummaryResponse(BaseModel):

View File

@@ -2,10 +2,12 @@ from repo_registry.content_indexing.extractor import ContentExtractor
from repo_registry.core.models import ObservedFact
def fact(id, kind, name, path="", line=None):
def fact(id, kind, name, path="", line=None, source_role=""):
metadata = {}
if line is not None:
metadata["line"] = line
if source_role:
metadata["source_role"] = source_role
return ObservedFact(
id=id,
repository_id=1,
@@ -82,3 +84,20 @@ def test_content_extractor_chunks_provider_related_config(tmp_path):
assert len(chunks) == 1
assert chunks[0].path == ".env.example"
assert "OPENROUTER_API_KEY" in chunks[0].text
def test_content_extractor_preserves_source_role_metadata(tmp_path):
    """A chunk extracted for a scope fact carries the fact's ``source_role``."""
    repo_dir = tmp_path / "repo"
    repo_dir.mkdir()
    scope_file = repo_dir / "SCOPE.md"
    scope_file.write_text("# SCOPE\n\nProvides OIDC.\n", encoding="utf-8")
    scope_fact = fact(1, "scope", "SCOPE", "SCOPE.md", source_role="scope_summary")

    extracted = ContentExtractor().extract(repo_dir, [scope_fact])

    assert len(extracted) == 1
    only_chunk = extracted[0]
    assert only_chunk.kind == "scope"
    assert only_chunk.metadata["source_role"] == "scope_summary"

View File

@@ -42,6 +42,22 @@ def test_deterministic_scanner_extracts_structural_facts(tmp_path):
assert languages == {"Python": 2}
def test_scanner_records_scope_with_source_role(tmp_path):
    """Scanning a repo containing SCOPE.md yields a scope fact tagged as a scope summary."""
    repo_dir = tmp_path / "sample"
    repo_dir.mkdir()
    scope_text = "# SCOPE\n\n## One-liner\n\nProvides OIDC profile enforcement.\n"
    (repo_dir / "SCOPE.md").write_text(scope_text, encoding="utf-8")

    scan_result = DeterministicScanner().scan(repo_dir)

    scope_fact = next(
        candidate for candidate in scan_result.facts if candidate.kind == "scope"
    )
    assert scope_fact.name == "SCOPE"
    assert scope_fact.path == "SCOPE.md"
    assert scope_fact.metadata["source_role"] == "scope_summary"
def test_scanner_readme_only_fixture_records_docs_without_interfaces(tmp_path):
repo = write_readme_only_repo(tmp_path)
@@ -116,3 +132,28 @@ def test_scanner_records_llm_provider_and_fallback_facts(tmp_path):
assert ("credential_config", "Anthropic API key", ".env.example") in facts
assert ("provider_registry", "LLM provider registry", "providers.py") in facts
assert ("fallback_policy", "LLM provider fallback policy", "README.md") in facts
def test_scanner_does_not_treat_agent_guidance_as_llm_provider(tmp_path):
    """Mentions of CLAUDE.md / Claude Code must not surface as LLM provider facts."""
    repo_dir = tmp_path / "key-cape-like"
    repo_dir.mkdir()

    readme_text = (
        "# KeyCape\n\n"
        "Backend adapters live in src/internal/adapters.\n\n"
        "See `CLAUDE.md` for agent session protocol.\n"
    )
    guidance_text = (
        "# CLAUDE.md\n\n"
        "This file provides guidance to Claude Code when working in this repo.\n"
    )
    (repo_dir / "README.md").write_text(readme_text, encoding="utf-8")
    (repo_dir / "CLAUDE.md").write_text(guidance_text, encoding="utf-8")

    source_dir = repo_dir / "src"
    source_dir.mkdir()
    (source_dir / "go.mod").write_text("module keycape\n", encoding="utf-8")

    scan_result = DeterministicScanner().scan(repo_dir)
    observed = {(item.kind, item.name, item.path) for item in scan_result.facts}

    # None of these false positives may be reported.
    unexpected = [
        ("llm_provider", "Claude", "CLAUDE.md"),
        ("llm_provider", "Claude", "README.md"),
        ("provider_registry", "LLM provider registry", "README.md"),
    ]
    for entry in unexpected:
        assert entry not in observed

View File

@@ -34,7 +34,7 @@ The target behavior is facts-first and provenance-aware:
```task
id: RREG-WP-0009-T01
status: todo
status: done
priority: high
state_hub_task_id: "0c189443-5000-4025-a144-75e5bf1e3be5"
```
@@ -68,7 +68,7 @@ Acceptance criteria:
```task
id: RREG-WP-0009-T02
status: todo
status: in_progress
priority: high
state_hub_task_id: "3ef728a0-832f-4441-9ece-16888ef68c47"
```