generated from coulomb/repo-seed
Candidate generation: use content chunks
This commit is contained in:
@@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from repo_registry.core.models import ObservedFact, Repository, SourceReference
|
||||
from repo_registry.core.models import ContentChunk, ObservedFact, Repository, SourceReference
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@@ -50,9 +50,11 @@ class CandidateGraphGenerator:
|
||||
self,
|
||||
repository: Repository,
|
||||
facts: list[ObservedFact],
|
||||
chunks: list[ContentChunk] | None = None,
|
||||
) -> list[CandidateAbilityDraft]:
|
||||
if not facts:
|
||||
return []
|
||||
chunks = chunks or []
|
||||
|
||||
docs = self._facts(facts, "documentation")
|
||||
tests = self._facts(facts, "test")
|
||||
@@ -65,11 +67,7 @@ class CandidateGraphGenerator:
|
||||
ability_sources = docs or manifests or languages
|
||||
ability = CandidateAbilityDraft(
|
||||
name=f"Review {repository.name} Repository Usefulness",
|
||||
description=(
|
||||
"Candidate usefulness summary generated from observed repository "
|
||||
"documentation, manifests, languages, and interfaces. This is a "
|
||||
"review seed, not an asserted domain ability."
|
||||
),
|
||||
description=self._ability_description(chunks),
|
||||
confidence=0.55 if docs else 0.35,
|
||||
source_refs=self._source_refs(ability_sources),
|
||||
capabilities=[],
|
||||
@@ -77,7 +75,9 @@ class CandidateGraphGenerator:
|
||||
|
||||
capabilities: list[CandidateCapabilityDraft] = []
|
||||
if interfaces:
|
||||
capabilities.append(self._interface_capability(interfaces, tests, examples, docs))
|
||||
capabilities.append(
|
||||
self._interface_capability(interfaces, tests, examples, docs, chunks)
|
||||
)
|
||||
if manifests or frameworks or languages:
|
||||
capabilities.append(
|
||||
CandidateCapabilityDraft(
|
||||
@@ -110,6 +110,7 @@ class CandidateGraphGenerator:
|
||||
tests: list[ObservedFact],
|
||||
examples: list[ObservedFact],
|
||||
docs: list[ObservedFact],
|
||||
chunks: list[ContentChunk],
|
||||
) -> CandidateCapabilityDraft:
|
||||
features = [
|
||||
CandidateFeatureDraft(
|
||||
@@ -123,10 +124,7 @@ class CandidateGraphGenerator:
|
||||
]
|
||||
return CandidateCapabilityDraft(
|
||||
name="Expose Repository Interface",
|
||||
description=(
|
||||
"Expose one or more likely user-facing API or CLI entry points. "
|
||||
"Review is required to name the concrete domain behavior."
|
||||
),
|
||||
description=self._interface_description(chunks),
|
||||
inputs=[],
|
||||
outputs=["callable interface"],
|
||||
confidence=0.65,
|
||||
@@ -179,6 +177,56 @@ class CandidateGraphGenerator:
|
||||
return "API"
|
||||
return "interface"
|
||||
|
||||
def _ability_description(self, chunks: list[ContentChunk]) -> str:
|
||||
doc_summary = self._document_summary(chunks)
|
||||
if doc_summary:
|
||||
return (
|
||||
"Candidate usefulness summary seeded from repository content: "
|
||||
f"{doc_summary} This is a review seed, not an asserted domain ability."
|
||||
)
|
||||
return (
|
||||
"Candidate usefulness summary generated from observed repository "
|
||||
"documentation, manifests, languages, and interfaces. This is a "
|
||||
"review seed, not an asserted domain ability."
|
||||
)
|
||||
|
||||
def _interface_description(self, chunks: list[ContentChunk]) -> str:
|
||||
interface_summary = self._interface_summary(chunks)
|
||||
if interface_summary:
|
||||
return (
|
||||
"Expose one or more likely user-facing API or CLI entry points. "
|
||||
f"Source context: {interface_summary} Review is required to name "
|
||||
"the concrete domain behavior."
|
||||
)
|
||||
return (
|
||||
"Expose one or more likely user-facing API or CLI entry points. "
|
||||
"Review is required to name the concrete domain behavior."
|
||||
)
|
||||
|
||||
def _document_summary(self, chunks: list[ContentChunk]) -> str:
|
||||
for chunk in chunks:
|
||||
if chunk.kind != "documentation":
|
||||
continue
|
||||
lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
|
||||
if not lines:
|
||||
continue
|
||||
heading = next((line.lstrip("#").strip() for line in lines if line.startswith("#")), "")
|
||||
paragraph = next((line for line in lines if not line.startswith("#")), "")
|
||||
if heading and paragraph:
|
||||
return f"{heading}. {paragraph}"
|
||||
return heading or paragraph
|
||||
return ""
|
||||
|
||||
def _interface_summary(self, chunks: list[ContentChunk]) -> str:
|
||||
for chunk in chunks:
|
||||
if chunk.kind != "interface":
|
||||
continue
|
||||
lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
|
||||
if not lines:
|
||||
continue
|
||||
return " ".join(lines[:3])
|
||||
return ""
|
||||
|
||||
def _facts(self, facts: list[ObservedFact], kind: str) -> list[ObservedFact]:
|
||||
return [fact for fact in facts if fact.kind == kind]
|
||||
|
||||
|
||||
@@ -121,7 +121,12 @@ class RegistryService:
|
||||
completed_run.snapshot_id,
|
||||
chunks,
|
||||
)
|
||||
candidates = self.candidate_generator.generate(repository, facts)
|
||||
stored_chunks = self.store.list_content_chunks(repository_id, completed_run.id)
|
||||
candidates = self.candidate_generator.generate(
|
||||
repository,
|
||||
facts,
|
||||
stored_chunks,
|
||||
)
|
||||
self.store.replace_candidate_graph(repository_id, completed_run.id, candidates)
|
||||
return ScanSummary(
|
||||
analysis_run=completed_run,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from repo_registry.candidate_graph.generator import CandidateGraphGenerator
|
||||
from repo_registry.core.models import ObservedFact, Repository
|
||||
from repo_registry.core.models import ContentChunk, ObservedFact, Repository
|
||||
|
||||
|
||||
def fact(id, kind, name, path="", value=""):
|
||||
@@ -16,6 +16,20 @@ def fact(id, kind, name, path="", value=""):
|
||||
)
|
||||
|
||||
|
||||
def chunk(id, kind, path, text, start_line=1, end_line=1):
    """Build a ``ContentChunk`` fixture for the generator tests.

    The repository, analysis-run and snapshot ids are all fixed to 1; every
    other field is taken from the arguments.
    """
    fields = {
        "id": id,
        "repository_id": 1,
        "analysis_run_id": 1,
        "snapshot_id": 1,
        "path": path,
        "kind": kind,
        "start_line": start_line,
        "end_line": end_line,
        "text": text,
    }
    return ContentChunk(**fields)
|
||||
|
||||
|
||||
def test_candidate_generator_builds_review_seed_from_observed_facts():
|
||||
repository = Repository(
|
||||
id=1,
|
||||
@@ -43,3 +57,40 @@ def test_candidate_generator_builds_review_seed_from_observed_facts():
|
||||
assert interface_capability.features[0].type == "API"
|
||||
assert interface_capability.features[0].location == "app.py"
|
||||
assert interface_capability.evidence[0].strength == "strong"
|
||||
|
||||
|
||||
def test_candidate_generator_enriches_descriptions_from_content_chunks():
    """Generated descriptions should quote documentation and interface chunk text."""
    readme_chunk = chunk(
        1,
        "documentation",
        "README.md",
        "# MailRouter\nRoutes incoming customer email to the right team.",
        end_line=2,
    )
    route_chunk = chunk(
        2,
        "interface",
        "app.py",
        '@app.post("/classify")\ndef classify_email():\n return {}',
        start_line=5,
        end_line=7,
    )
    observed = [
        fact(1, "documentation", "README", "README.md"),
        fact(2, "interface", "python route decorator", "app.py", '@app.post("/classify")'),
    ]
    mail_router = Repository(
        id=1,
        name="MailRouter",
        url="/tmp/mail-router",
        description=None,
        branch="main",
        status="analyzed",
    )

    abilities = CandidateGraphGenerator().generate(
        mail_router,
        observed,
        [readme_chunk, route_chunk],
    )

    # The ability description comes from the README chunk; the first
    # capability's description comes from the interface chunk.
    ability = abilities[0]
    assert "MailRouter. Routes incoming customer email" in ability.description
    assert '@app.post("/classify")' in ability.capabilities[0].description
|
||||
|
||||
@@ -352,6 +352,8 @@ def test_analyze_repository_records_snapshot_and_observed_facts(tmp_path):
|
||||
candidate_graph = service.candidate_graph(repository.id, summary.analysis_run.id)
|
||||
assert candidate_graph.repository.name == "Example"
|
||||
assert candidate_graph.abilities
|
||||
assert "Example" in candidate_graph.abilities[0].description
|
||||
assert "@app.get" in candidate_graph.abilities[0].capabilities[0].description
|
||||
capability_names = {
|
||||
capability.name
|
||||
for ability in candidate_graph.abilities
|
||||
|
||||
Reference in New Issue
Block a user