Candidate generation uses content chunks

This commit is contained in:
2026-04-26 02:49:58 +02:00
parent 6416139176
commit 07c837d6bb
4 changed files with 119 additions and 13 deletions

View File

@@ -2,7 +2,7 @@ from __future__ import annotations
from dataclasses import dataclass, field
from repo_registry.core.models import ObservedFact, Repository, SourceReference
from repo_registry.core.models import ContentChunk, ObservedFact, Repository, SourceReference
@dataclass(frozen=True)
@@ -50,9 +50,11 @@ class CandidateGraphGenerator:
self,
repository: Repository,
facts: list[ObservedFact],
chunks: list[ContentChunk] | None = None,
) -> list[CandidateAbilityDraft]:
if not facts:
return []
chunks = chunks or []
docs = self._facts(facts, "documentation")
tests = self._facts(facts, "test")
@@ -65,11 +67,7 @@ class CandidateGraphGenerator:
ability_sources = docs or manifests or languages
ability = CandidateAbilityDraft(
name=f"Review {repository.name} Repository Usefulness",
description=(
"Candidate usefulness summary generated from observed repository "
"documentation, manifests, languages, and interfaces. This is a "
"review seed, not an asserted domain ability."
),
description=self._ability_description(chunks),
confidence=0.55 if docs else 0.35,
source_refs=self._source_refs(ability_sources),
capabilities=[],
@@ -77,7 +75,9 @@ class CandidateGraphGenerator:
capabilities: list[CandidateCapabilityDraft] = []
if interfaces:
capabilities.append(self._interface_capability(interfaces, tests, examples, docs))
capabilities.append(
self._interface_capability(interfaces, tests, examples, docs, chunks)
)
if manifests or frameworks or languages:
capabilities.append(
CandidateCapabilityDraft(
@@ -110,6 +110,7 @@ class CandidateGraphGenerator:
tests: list[ObservedFact],
examples: list[ObservedFact],
docs: list[ObservedFact],
chunks: list[ContentChunk],
) -> CandidateCapabilityDraft:
features = [
CandidateFeatureDraft(
@@ -123,10 +124,7 @@ class CandidateGraphGenerator:
]
return CandidateCapabilityDraft(
name="Expose Repository Interface",
description=(
"Expose one or more likely user-facing API or CLI entry points. "
"Review is required to name the concrete domain behavior."
),
description=self._interface_description(chunks),
inputs=[],
outputs=["callable interface"],
confidence=0.65,
@@ -179,6 +177,56 @@ class CandidateGraphGenerator:
return "API"
return "interface"
def _ability_description(self, chunks: list[ContentChunk]) -> str:
doc_summary = self._document_summary(chunks)
if doc_summary:
return (
"Candidate usefulness summary seeded from repository content: "
f"{doc_summary} This is a review seed, not an asserted domain ability."
)
return (
"Candidate usefulness summary generated from observed repository "
"documentation, manifests, languages, and interfaces. This is a "
"review seed, not an asserted domain ability."
)
def _interface_description(self, chunks: list[ContentChunk]) -> str:
interface_summary = self._interface_summary(chunks)
if interface_summary:
return (
"Expose one or more likely user-facing API or CLI entry points. "
f"Source context: {interface_summary} Review is required to name "
"the concrete domain behavior."
)
return (
"Expose one or more likely user-facing API or CLI entry points. "
"Review is required to name the concrete domain behavior."
)
def _document_summary(self, chunks: list[ContentChunk]) -> str:
for chunk in chunks:
if chunk.kind != "documentation":
continue
lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
if not lines:
continue
heading = next((line.lstrip("#").strip() for line in lines if line.startswith("#")), "")
paragraph = next((line for line in lines if not line.startswith("#")), "")
if heading and paragraph:
return f"{heading}. {paragraph}"
return heading or paragraph
return ""
def _interface_summary(self, chunks: list[ContentChunk]) -> str:
for chunk in chunks:
if chunk.kind != "interface":
continue
lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
if not lines:
continue
return " ".join(lines[:3])
return ""
def _facts(self, facts: list[ObservedFact], kind: str) -> list[ObservedFact]:
return [fact for fact in facts if fact.kind == kind]

View File

@@ -121,7 +121,12 @@ class RegistryService:
completed_run.snapshot_id,
chunks,
)
candidates = self.candidate_generator.generate(repository, facts)
stored_chunks = self.store.list_content_chunks(repository_id, completed_run.id)
candidates = self.candidate_generator.generate(
repository,
facts,
stored_chunks,
)
self.store.replace_candidate_graph(repository_id, completed_run.id, candidates)
return ScanSummary(
analysis_run=completed_run,

View File

@@ -1,5 +1,5 @@
from repo_registry.candidate_graph.generator import CandidateGraphGenerator
from repo_registry.core.models import ObservedFact, Repository
from repo_registry.core.models import ContentChunk, ObservedFact, Repository
def fact(id, kind, name, path="", value=""):
@@ -16,6 +16,20 @@ def fact(id, kind, name, path="", value=""):
)
def chunk(id, kind, path, text, start_line=1, end_line=1):
    """Build a ContentChunk fixture for the generator tests.

    The repository, analysis-run and snapshot ids are all fixed to 1; the
    remaining fields come from the arguments.
    """
    fields = dict(
        id=id,
        repository_id=1,
        analysis_run_id=1,
        snapshot_id=1,
        path=path,
        kind=kind,
        start_line=start_line,
        end_line=end_line,
        text=text,
    )
    return ContentChunk(**fields)
def test_candidate_generator_builds_review_seed_from_observed_facts():
repository = Repository(
id=1,
@@ -43,3 +57,40 @@ def test_candidate_generator_builds_review_seed_from_observed_facts():
assert interface_capability.features[0].type == "API"
assert interface_capability.features[0].location == "app.py"
assert interface_capability.evidence[0].strength == "strong"
def test_candidate_generator_enriches_descriptions_from_content_chunks():
    """Content chunks should flow into the ability and interface descriptions."""
    repository = Repository(
        id=1,
        name="MailRouter",
        url="/tmp/mail-router",
        description=None,
        branch="main",
        status="analyzed",
    )
    readme_fact = fact(1, "documentation", "README", "README.md")
    route_fact = fact(
        2, "interface", "python route decorator", "app.py", '@app.post("/classify")'
    )
    readme_chunk = chunk(
        1,
        "documentation",
        "README.md",
        "# MailRouter\nRoutes incoming customer email to the right team.",
        end_line=2,
    )
    route_chunk = chunk(
        2,
        "interface",
        "app.py",
        '@app.post("/classify")\ndef classify_email():\n return {}',
        start_line=5,
        end_line=7,
    )

    graph = CandidateGraphGenerator().generate(
        repository,
        [readme_fact, route_fact],
        [readme_chunk, route_chunk],
    )

    # The ability description is seeded from the README heading and first paragraph.
    ability = graph[0]
    assert "MailRouter. Routes incoming customer email" in ability.description
    # The interface capability description quotes the interface chunk.
    assert '@app.post("/classify")' in ability.capabilities[0].description

View File

@@ -352,6 +352,8 @@ def test_analyze_repository_records_snapshot_and_observed_facts(tmp_path):
candidate_graph = service.candidate_graph(repository.id, summary.analysis_run.id)
assert candidate_graph.repository.name == "Example"
assert candidate_graph.abilities
assert "Example" in candidate_graph.abilities[0].description
assert "@app.get" in candidate_graph.abilities[0].capabilities[0].description
capability_names = {
capability.name
for ability in candidate_graph.abilities