generated from coulomb/repo-seed
first steps to better scanning of repos
This commit is contained in:
@@ -63,7 +63,8 @@ class CandidateGraphGenerator:
|
||||
return []
|
||||
chunks = chunks or []
|
||||
|
||||
docs = self._facts(facts, "documentation")
|
||||
scope_docs = self._facts(facts, "scope")
|
||||
docs = scope_docs + self._facts(facts, "documentation")
|
||||
tests = self._facts(facts, "test")
|
||||
examples = self._facts(facts, "example")
|
||||
interfaces = self._facts(facts, "interface")
|
||||
@@ -660,8 +661,8 @@ class CandidateGraphGenerator:
|
||||
return f"Support {self._humanize_identifier(repository.name)}"
|
||||
|
||||
def _document_purpose_sentence(self, chunks: list[ContentChunk]) -> str:
|
||||
for chunk in chunks:
|
||||
if chunk.kind != "documentation":
|
||||
for chunk in self._documentation_chunks(chunks):
|
||||
if chunk.kind not in {"scope", "documentation"}:
|
||||
continue
|
||||
lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
|
||||
paragraph = next((line for line in lines if not line.startswith("#")), "")
|
||||
@@ -731,9 +732,7 @@ class CandidateGraphGenerator:
|
||||
)
|
||||
|
||||
def _document_summary(self, chunks: list[ContentChunk]) -> str:
|
||||
for chunk in chunks:
|
||||
if chunk.kind != "documentation":
|
||||
continue
|
||||
for chunk in self._documentation_chunks(chunks):
|
||||
lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
|
||||
if not lines:
|
||||
continue
|
||||
@@ -744,6 +743,12 @@ class CandidateGraphGenerator:
|
||||
return heading or paragraph
|
||||
return ""
|
||||
|
||||
def _documentation_chunks(self, chunks: list[ContentChunk]) -> list[ContentChunk]:
|
||||
return sorted(
|
||||
[chunk for chunk in chunks if chunk.kind in {"scope", "documentation"}],
|
||||
key=lambda chunk: (0 if chunk.kind == "scope" else 1, chunk.path, chunk.start_line),
|
||||
)
|
||||
|
||||
def _interface_summary(self, chunks: list[ContentChunk]) -> str:
|
||||
for chunk in chunks:
|
||||
if chunk.kind != "interface":
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
from repo_registry.core.models import ObservedFact
|
||||
|
||||
|
||||
INDEXED_FACT_KINDS = {
|
||||
"scope",
|
||||
"documentation",
|
||||
"example",
|
||||
"test",
|
||||
@@ -29,6 +30,7 @@ class ContentChunkCandidate:
|
||||
start_line: int
|
||||
end_line: int
|
||||
text: str
|
||||
metadata: dict[str, object] = field(default_factory=dict)
|
||||
|
||||
|
||||
class ContentExtractor:
|
||||
@@ -80,6 +82,7 @@ class ContentExtractor:
|
||||
path,
|
||||
root,
|
||||
fact.kind,
|
||||
fact.metadata,
|
||||
lines,
|
||||
start_line,
|
||||
end_line,
|
||||
@@ -91,7 +94,15 @@ class ContentExtractor:
|
||||
start_line = start_index + 1
|
||||
end_line = min(len(lines), start_index + MAX_CHUNK_LINES)
|
||||
chunks.append(
|
||||
self._chunk(path, root, fact.kind, lines, start_line, end_line)
|
||||
self._chunk(
|
||||
path,
|
||||
root,
|
||||
fact.kind,
|
||||
fact.metadata,
|
||||
lines,
|
||||
start_line,
|
||||
end_line,
|
||||
)
|
||||
)
|
||||
return chunks
|
||||
|
||||
@@ -100,6 +111,7 @@ class ContentExtractor:
|
||||
path: Path,
|
||||
root: Path,
|
||||
kind: str,
|
||||
fact_metadata: dict[str, object],
|
||||
lines: list[str],
|
||||
start_line: int,
|
||||
end_line: int,
|
||||
@@ -110,6 +122,7 @@ class ContentExtractor:
|
||||
start_line=start_line,
|
||||
end_line=end_line,
|
||||
text="\n".join(lines[start_line - 1 : end_line]).strip(),
|
||||
metadata={"source_role": fact_metadata.get("source_role", "")},
|
||||
)
|
||||
|
||||
def _is_within(self, root: Path, path: Path) -> bool:
|
||||
|
||||
@@ -119,6 +119,7 @@ class ContentChunk:
|
||||
start_line: int
|
||||
end_line: int
|
||||
text: str
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
@@ -86,6 +87,17 @@ LLM_CREDENTIAL_HINTS = {
|
||||
"GOOGLE_API_KEY": "Google API key",
|
||||
}
|
||||
|
||||
AGENT_GUIDANCE_FILES = {
|
||||
"agents.md",
|
||||
"claude.md",
|
||||
}
|
||||
|
||||
AGENT_GUIDANCE_DIRS = {
|
||||
".claude",
|
||||
".codex",
|
||||
".cursor",
|
||||
}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class FactCandidate:
|
||||
@@ -153,7 +165,7 @@ class DeterministicScanner:
|
||||
kind="language",
|
||||
name=language,
|
||||
value=str(count),
|
||||
metadata={"file_count": count},
|
||||
metadata={"file_count": count, "source_role": "implementation_source"},
|
||||
)
|
||||
for language, count in counts.items()
|
||||
]
|
||||
@@ -166,14 +178,45 @@ class DeterministicScanner:
|
||||
relative = path.relative_to(root).as_posix()
|
||||
lower = relative.lower()
|
||||
name = path.name.lower()
|
||||
source_role = self._source_role(relative)
|
||||
|
||||
if name.startswith("readme"):
|
||||
facts.append(FactCandidate("documentation", "README", relative))
|
||||
if name == "scope.md":
|
||||
facts.append(
|
||||
FactCandidate(
|
||||
"scope",
|
||||
"SCOPE",
|
||||
relative,
|
||||
metadata={"source_role": "scope_summary"},
|
||||
)
|
||||
)
|
||||
elif name.startswith("readme"):
|
||||
facts.append(
|
||||
FactCandidate(
|
||||
"documentation",
|
||||
"README",
|
||||
relative,
|
||||
metadata={"source_role": "product_documentation"},
|
||||
)
|
||||
)
|
||||
elif lower.startswith("docs/") or lower.startswith("doc/"):
|
||||
facts.append(FactCandidate("documentation", path.name, relative))
|
||||
facts.append(
|
||||
FactCandidate(
|
||||
"documentation",
|
||||
path.name,
|
||||
relative,
|
||||
metadata={"source_role": "product_documentation"},
|
||||
)
|
||||
)
|
||||
|
||||
if lower.startswith("examples/") or lower.startswith("example/"):
|
||||
facts.append(FactCandidate("example", path.name, relative))
|
||||
facts.append(
|
||||
FactCandidate(
|
||||
"example",
|
||||
path.name,
|
||||
relative,
|
||||
metadata={"source_role": "product_documentation"},
|
||||
)
|
||||
)
|
||||
|
||||
if (
|
||||
lower.startswith("tests/")
|
||||
@@ -183,7 +226,14 @@ class DeterministicScanner:
|
||||
or name.endswith(".test.ts")
|
||||
or name.endswith(".spec.ts")
|
||||
):
|
||||
facts.append(FactCandidate("test", path.name, relative))
|
||||
facts.append(
|
||||
FactCandidate(
|
||||
"test",
|
||||
path.name,
|
||||
relative,
|
||||
metadata={"source_role": "test_evidence"},
|
||||
)
|
||||
)
|
||||
|
||||
if name in MANIFEST_FRAMEWORK_HINTS or name in {
|
||||
"requirements.txt",
|
||||
@@ -193,10 +243,24 @@ class DeterministicScanner:
|
||||
"yarn.lock",
|
||||
"go.mod",
|
||||
}:
|
||||
facts.append(FactCandidate("manifest", path.name, relative))
|
||||
facts.append(
|
||||
FactCandidate(
|
||||
"manifest",
|
||||
path.name,
|
||||
relative,
|
||||
metadata={"source_role": "dependency_declaration"},
|
||||
)
|
||||
)
|
||||
|
||||
if lower.endswith((".yaml", ".yml", ".toml", ".ini", ".env.example")):
|
||||
facts.append(FactCandidate("config", path.name, relative))
|
||||
facts.append(
|
||||
FactCandidate(
|
||||
"config",
|
||||
path.name,
|
||||
relative,
|
||||
metadata={"source_role": source_role},
|
||||
)
|
||||
)
|
||||
|
||||
return facts
|
||||
|
||||
@@ -223,7 +287,11 @@ class DeterministicScanner:
|
||||
kind="framework",
|
||||
name=framework,
|
||||
path=path.relative_to(root).as_posix(),
|
||||
metadata={"source": "manifest_hint", "needle": needle},
|
||||
metadata={
|
||||
"source": "manifest_hint",
|
||||
"needle": needle,
|
||||
"source_role": "dependency_declaration",
|
||||
},
|
||||
)
|
||||
)
|
||||
return facts
|
||||
@@ -236,9 +304,23 @@ class DeterministicScanner:
|
||||
if path.suffix == ".py":
|
||||
facts.extend(self._python_interface_facts(path, relative))
|
||||
if "cli" in lower or lower.endswith("/commands.py"):
|
||||
facts.append(FactCandidate("interface", "possible CLI", relative))
|
||||
facts.append(
|
||||
FactCandidate(
|
||||
"interface",
|
||||
"possible CLI",
|
||||
relative,
|
||||
metadata={"source_role": self._source_role(relative)},
|
||||
)
|
||||
)
|
||||
if "routes" in lower or "api" in lower:
|
||||
facts.append(FactCandidate("interface", "possible API surface", relative))
|
||||
facts.append(
|
||||
FactCandidate(
|
||||
"interface",
|
||||
"possible API surface",
|
||||
relative,
|
||||
metadata={"source_role": self._source_role(relative)},
|
||||
)
|
||||
)
|
||||
return facts
|
||||
|
||||
def _llm_provider_facts(self, files: list[Path], root: Path) -> list[FactCandidate]:
|
||||
@@ -264,8 +346,11 @@ class DeterministicScanner:
|
||||
continue
|
||||
lower_text = text.lower()
|
||||
relative = path.relative_to(root).as_posix()
|
||||
source_role = self._source_role(relative)
|
||||
if source_role == "agent_guidance":
|
||||
continue
|
||||
for needle, provider in LLM_PROVIDER_HINTS.items():
|
||||
if needle not in lower_text:
|
||||
if not self._has_provider_signal(lower_text, needle):
|
||||
continue
|
||||
self._append_once(
|
||||
facts,
|
||||
@@ -275,7 +360,10 @@ class DeterministicScanner:
|
||||
name=provider,
|
||||
path=relative,
|
||||
value=needle,
|
||||
metadata={"source": "provider_hint"},
|
||||
metadata={
|
||||
"source": "provider_hint",
|
||||
"source_role": source_role,
|
||||
},
|
||||
),
|
||||
)
|
||||
for env_name, label in LLM_CREDENTIAL_HINTS.items():
|
||||
@@ -289,11 +377,22 @@ class DeterministicScanner:
|
||||
name=label,
|
||||
path=relative,
|
||||
value=env_name,
|
||||
metadata={"source": "environment_variable"},
|
||||
metadata={
|
||||
"source": "environment_variable",
|
||||
"source_role": source_role,
|
||||
},
|
||||
),
|
||||
)
|
||||
if any(term in lower_text for term in ("provider_registry", "providers =", "adapter")):
|
||||
if any(needle in lower_text for needle in LLM_PROVIDER_HINTS):
|
||||
registry_hint = (
|
||||
"provider_registry" in lower_text
|
||||
or "providers =" in lower_text
|
||||
or ("adapter" in lower_text and source_role == "implementation_source")
|
||||
)
|
||||
if registry_hint:
|
||||
if any(
|
||||
self._has_provider_signal(lower_text, needle)
|
||||
for needle in LLM_PROVIDER_HINTS
|
||||
):
|
||||
self._append_once(
|
||||
facts,
|
||||
seen,
|
||||
@@ -301,11 +400,15 @@ class DeterministicScanner:
|
||||
kind="provider_registry",
|
||||
name="LLM provider registry",
|
||||
path=relative,
|
||||
metadata={"source": "provider_registry_hint"},
|
||||
metadata={
|
||||
"source": "provider_registry_hint",
|
||||
"source_role": source_role,
|
||||
},
|
||||
),
|
||||
)
|
||||
if "fallback" in lower_text and any(
|
||||
needle in lower_text for needle in LLM_PROVIDER_HINTS
|
||||
self._has_provider_signal(lower_text, needle)
|
||||
for needle in LLM_PROVIDER_HINTS
|
||||
):
|
||||
self._append_once(
|
||||
facts,
|
||||
@@ -314,11 +417,47 @@ class DeterministicScanner:
|
||||
kind="fallback_policy",
|
||||
name="LLM provider fallback policy",
|
||||
path=relative,
|
||||
metadata={"source": "fallback_hint"},
|
||||
metadata={
|
||||
"source": "fallback_hint",
|
||||
"source_role": source_role,
|
||||
},
|
||||
),
|
||||
)
|
||||
return facts
|
||||
|
||||
def _source_role(self, relative_path: str) -> str:
|
||||
lower = relative_path.lower()
|
||||
parts = lower.split("/")
|
||||
name = parts[-1]
|
||||
if name == "scope.md":
|
||||
return "scope_summary"
|
||||
if name in AGENT_GUIDANCE_FILES or any(part in AGENT_GUIDANCE_DIRS for part in parts):
|
||||
return "agent_guidance"
|
||||
if lower.startswith((".github/workflows/", ".gitea/workflows/")):
|
||||
return "ci_tooling"
|
||||
if lower.startswith(("tests/", "test/")) or name.startswith("test_"):
|
||||
return "test_evidence"
|
||||
if name.startswith("readme") or lower.startswith(("docs/", "doc/", "wiki/")):
|
||||
return "product_documentation"
|
||||
if name in MANIFEST_FRAMEWORK_HINTS or name.endswith((".lock", ".mod")):
|
||||
return "dependency_declaration"
|
||||
if lower.endswith((".yaml", ".yml", ".toml", ".ini", ".env.example")):
|
||||
return "configuration"
|
||||
return "implementation_source"
|
||||
|
||||
def _has_provider_signal(self, lower_text: str, needle: str) -> bool:
|
||||
pattern = re.compile(rf"(?<![a-z0-9-]){re.escape(needle.lower())}(?![a-z0-9-])")
|
||||
for match in pattern.finditer(lower_text):
|
||||
context = lower_text[max(0, match.start() - 20) : match.end() + 20]
|
||||
if needle == "claude" and (
|
||||
"claude.md" in context
|
||||
or "claude code" in context
|
||||
or "claude.ai/code" in context
|
||||
):
|
||||
continue
|
||||
return True
|
||||
return False
|
||||
|
||||
def _append_once(
|
||||
self,
|
||||
facts: list[FactCandidate],
|
||||
@@ -347,7 +486,10 @@ class DeterministicScanner:
|
||||
name="python route decorator",
|
||||
path=relative,
|
||||
value=stripped,
|
||||
metadata={"line": line_number},
|
||||
metadata={
|
||||
"line": line_number,
|
||||
"source_role": self._source_role(relative),
|
||||
},
|
||||
)
|
||||
)
|
||||
elif stripped.startswith("@click.command") or stripped.startswith("@app.command"):
|
||||
@@ -357,7 +499,10 @@ class DeterministicScanner:
|
||||
name="python CLI command decorator",
|
||||
path=relative,
|
||||
value=stripped,
|
||||
metadata={"line": line_number},
|
||||
metadata={
|
||||
"line": line_number,
|
||||
"source_role": self._source_role(relative),
|
||||
},
|
||||
)
|
||||
)
|
||||
return facts
|
||||
|
||||
@@ -180,6 +180,14 @@ class RegistryStore:
|
||||
)
|
||||
"""
|
||||
)
|
||||
columns = {
|
||||
row["name"]
|
||||
for row in connection.execute("PRAGMA table_info(content_chunks)").fetchall()
|
||||
}
|
||||
if "metadata" not in columns:
|
||||
connection.execute(
|
||||
"ALTER TABLE content_chunks ADD COLUMN metadata TEXT NOT NULL DEFAULT '{}'"
|
||||
)
|
||||
connection.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_content_chunks_repository ON content_chunks(repository_id)"
|
||||
)
|
||||
@@ -1675,8 +1683,8 @@ class RegistryStore:
|
||||
"""
|
||||
INSERT INTO content_chunks
|
||||
(repository_id, analysis_run_id, snapshot_id, path, kind,
|
||||
start_line, end_line, text)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
||||
start_line, end_line, text, metadata)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""",
|
||||
[
|
||||
(
|
||||
@@ -1688,6 +1696,7 @@ class RegistryStore:
|
||||
chunk.start_line,
|
||||
chunk.end_line,
|
||||
chunk.text,
|
||||
json.dumps(chunk.metadata),
|
||||
)
|
||||
for chunk in chunks
|
||||
],
|
||||
@@ -1709,7 +1718,7 @@ class RegistryStore:
|
||||
rows = connection.execute(
|
||||
f"""
|
||||
SELECT id, repository_id, analysis_run_id, snapshot_id, path, kind,
|
||||
start_line, end_line, text
|
||||
start_line, end_line, text, metadata
|
||||
FROM content_chunks
|
||||
{where}
|
||||
ORDER BY path ASC, start_line ASC, id ASC
|
||||
@@ -2842,6 +2851,7 @@ class RegistryStore:
|
||||
start_line=row["start_line"],
|
||||
end_line=row["end_line"],
|
||||
text=row["text"],
|
||||
metadata=json.loads(row["metadata"]),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
|
||||
@@ -462,6 +462,7 @@ class ContentChunkResponse(BaseModel):
|
||||
start_line: int
|
||||
end_line: int
|
||||
text: str
|
||||
metadata: dict[str, Any]
|
||||
|
||||
|
||||
class ScanSummaryResponse(BaseModel):
|
||||
|
||||
@@ -2,10 +2,12 @@ from repo_registry.content_indexing.extractor import ContentExtractor
|
||||
from repo_registry.core.models import ObservedFact
|
||||
|
||||
|
||||
def fact(id, kind, name, path="", line=None):
|
||||
def fact(id, kind, name, path="", line=None, source_role=""):
|
||||
metadata = {}
|
||||
if line is not None:
|
||||
metadata["line"] = line
|
||||
if source_role:
|
||||
metadata["source_role"] = source_role
|
||||
return ObservedFact(
|
||||
id=id,
|
||||
repository_id=1,
|
||||
@@ -82,3 +84,20 @@ def test_content_extractor_chunks_provider_related_config(tmp_path):
|
||||
assert len(chunks) == 1
|
||||
assert chunks[0].path == ".env.example"
|
||||
assert "OPENROUTER_API_KEY" in chunks[0].text
|
||||
|
||||
|
||||
def test_content_extractor_preserves_source_role_metadata(tmp_path):
|
||||
repo = tmp_path / "repo"
|
||||
repo.mkdir()
|
||||
(repo / "SCOPE.md").write_text("# SCOPE\n\nProvides OIDC.\n", encoding="utf-8")
|
||||
|
||||
chunks = ContentExtractor().extract(
|
||||
repo,
|
||||
[
|
||||
fact(1, "scope", "SCOPE", "SCOPE.md", source_role="scope_summary"),
|
||||
],
|
||||
)
|
||||
|
||||
assert len(chunks) == 1
|
||||
assert chunks[0].kind == "scope"
|
||||
assert chunks[0].metadata["source_role"] == "scope_summary"
|
||||
|
||||
@@ -42,6 +42,22 @@ def test_deterministic_scanner_extracts_structural_facts(tmp_path):
|
||||
assert languages == {"Python": 2}
|
||||
|
||||
|
||||
def test_scanner_records_scope_with_source_role(tmp_path):
|
||||
repo = tmp_path / "sample"
|
||||
repo.mkdir()
|
||||
(repo / "SCOPE.md").write_text(
|
||||
"# SCOPE\n\n## One-liner\n\nProvides OIDC profile enforcement.\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
result = DeterministicScanner().scan(repo)
|
||||
|
||||
scope_fact = next(fact for fact in result.facts if fact.kind == "scope")
|
||||
assert scope_fact.name == "SCOPE"
|
||||
assert scope_fact.path == "SCOPE.md"
|
||||
assert scope_fact.metadata["source_role"] == "scope_summary"
|
||||
|
||||
|
||||
def test_scanner_readme_only_fixture_records_docs_without_interfaces(tmp_path):
|
||||
repo = write_readme_only_repo(tmp_path)
|
||||
|
||||
@@ -116,3 +132,28 @@ def test_scanner_records_llm_provider_and_fallback_facts(tmp_path):
|
||||
assert ("credential_config", "Anthropic API key", ".env.example") in facts
|
||||
assert ("provider_registry", "LLM provider registry", "providers.py") in facts
|
||||
assert ("fallback_policy", "LLM provider fallback policy", "README.md") in facts
|
||||
|
||||
|
||||
def test_scanner_does_not_treat_agent_guidance_as_llm_provider(tmp_path):
|
||||
repo = tmp_path / "key-cape-like"
|
||||
repo.mkdir()
|
||||
(repo / "README.md").write_text(
|
||||
"# KeyCape\n\n"
|
||||
"Backend adapters live in src/internal/adapters.\n\n"
|
||||
"See `CLAUDE.md` for agent session protocol.\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
(repo / "CLAUDE.md").write_text(
|
||||
"# CLAUDE.md\n\n"
|
||||
"This file provides guidance to Claude Code when working in this repo.\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
(repo / "src").mkdir()
|
||||
(repo / "src" / "go.mod").write_text("module keycape\n", encoding="utf-8")
|
||||
|
||||
result = DeterministicScanner().scan(repo)
|
||||
|
||||
facts = {(fact.kind, fact.name, fact.path) for fact in result.facts}
|
||||
assert ("llm_provider", "Claude", "CLAUDE.md") not in facts
|
||||
assert ("llm_provider", "Claude", "README.md") not in facts
|
||||
assert ("provider_registry", "LLM provider registry", "README.md") not in facts
|
||||
|
||||
@@ -34,7 +34,7 @@ The target behavior is facts-first and provenance-aware:
|
||||
|
||||
```task
|
||||
id: RREG-WP-0009-T01
|
||||
status: todo
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "0c189443-5000-4025-a144-75e5bf1e3be5"
|
||||
```
|
||||
@@ -68,7 +68,7 @@ Acceptance criteria:
|
||||
|
||||
```task
|
||||
id: RREG-WP-0009-T02
|
||||
status: todo
|
||||
status: in_progress
|
||||
priority: high
|
||||
state_hub_task_id: "3ef728a0-832f-4441-9ece-16888ef68c47"
|
||||
```
|
||||
|
||||
Reference in New Issue
Block a user