generated from coulomb/repo-seed
Candidate generation uses content chunks
This commit is contained in:
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
from repo_registry.core.models import ObservedFact, Repository, SourceReference
|
from repo_registry.core.models import ContentChunk, ObservedFact, Repository, SourceReference
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
@@ -50,9 +50,11 @@ class CandidateGraphGenerator:
|
|||||||
self,
|
self,
|
||||||
repository: Repository,
|
repository: Repository,
|
||||||
facts: list[ObservedFact],
|
facts: list[ObservedFact],
|
||||||
|
chunks: list[ContentChunk] | None = None,
|
||||||
) -> list[CandidateAbilityDraft]:
|
) -> list[CandidateAbilityDraft]:
|
||||||
if not facts:
|
if not facts:
|
||||||
return []
|
return []
|
||||||
|
chunks = chunks or []
|
||||||
|
|
||||||
docs = self._facts(facts, "documentation")
|
docs = self._facts(facts, "documentation")
|
||||||
tests = self._facts(facts, "test")
|
tests = self._facts(facts, "test")
|
||||||
@@ -65,11 +67,7 @@ class CandidateGraphGenerator:
|
|||||||
ability_sources = docs or manifests or languages
|
ability_sources = docs or manifests or languages
|
||||||
ability = CandidateAbilityDraft(
|
ability = CandidateAbilityDraft(
|
||||||
name=f"Review {repository.name} Repository Usefulness",
|
name=f"Review {repository.name} Repository Usefulness",
|
||||||
description=(
|
description=self._ability_description(chunks),
|
||||||
"Candidate usefulness summary generated from observed repository "
|
|
||||||
"documentation, manifests, languages, and interfaces. This is a "
|
|
||||||
"review seed, not an asserted domain ability."
|
|
||||||
),
|
|
||||||
confidence=0.55 if docs else 0.35,
|
confidence=0.55 if docs else 0.35,
|
||||||
source_refs=self._source_refs(ability_sources),
|
source_refs=self._source_refs(ability_sources),
|
||||||
capabilities=[],
|
capabilities=[],
|
||||||
@@ -77,7 +75,9 @@ class CandidateGraphGenerator:
|
|||||||
|
|
||||||
capabilities: list[CandidateCapabilityDraft] = []
|
capabilities: list[CandidateCapabilityDraft] = []
|
||||||
if interfaces:
|
if interfaces:
|
||||||
capabilities.append(self._interface_capability(interfaces, tests, examples, docs))
|
capabilities.append(
|
||||||
|
self._interface_capability(interfaces, tests, examples, docs, chunks)
|
||||||
|
)
|
||||||
if manifests or frameworks or languages:
|
if manifests or frameworks or languages:
|
||||||
capabilities.append(
|
capabilities.append(
|
||||||
CandidateCapabilityDraft(
|
CandidateCapabilityDraft(
|
||||||
@@ -110,6 +110,7 @@ class CandidateGraphGenerator:
|
|||||||
tests: list[ObservedFact],
|
tests: list[ObservedFact],
|
||||||
examples: list[ObservedFact],
|
examples: list[ObservedFact],
|
||||||
docs: list[ObservedFact],
|
docs: list[ObservedFact],
|
||||||
|
chunks: list[ContentChunk],
|
||||||
) -> CandidateCapabilityDraft:
|
) -> CandidateCapabilityDraft:
|
||||||
features = [
|
features = [
|
||||||
CandidateFeatureDraft(
|
CandidateFeatureDraft(
|
||||||
@@ -123,10 +124,7 @@ class CandidateGraphGenerator:
|
|||||||
]
|
]
|
||||||
return CandidateCapabilityDraft(
|
return CandidateCapabilityDraft(
|
||||||
name="Expose Repository Interface",
|
name="Expose Repository Interface",
|
||||||
description=(
|
description=self._interface_description(chunks),
|
||||||
"Expose one or more likely user-facing API or CLI entry points. "
|
|
||||||
"Review is required to name the concrete domain behavior."
|
|
||||||
),
|
|
||||||
inputs=[],
|
inputs=[],
|
||||||
outputs=["callable interface"],
|
outputs=["callable interface"],
|
||||||
confidence=0.65,
|
confidence=0.65,
|
||||||
@@ -179,6 +177,56 @@ class CandidateGraphGenerator:
|
|||||||
return "API"
|
return "API"
|
||||||
return "interface"
|
return "interface"
|
||||||
|
|
||||||
|
def _ability_description(self, chunks: list[ContentChunk]) -> str:
|
||||||
|
doc_summary = self._document_summary(chunks)
|
||||||
|
if doc_summary:
|
||||||
|
return (
|
||||||
|
"Candidate usefulness summary seeded from repository content: "
|
||||||
|
f"{doc_summary} This is a review seed, not an asserted domain ability."
|
||||||
|
)
|
||||||
|
return (
|
||||||
|
"Candidate usefulness summary generated from observed repository "
|
||||||
|
"documentation, manifests, languages, and interfaces. This is a "
|
||||||
|
"review seed, not an asserted domain ability."
|
||||||
|
)
|
||||||
|
|
||||||
|
def _interface_description(self, chunks: list[ContentChunk]) -> str:
|
||||||
|
interface_summary = self._interface_summary(chunks)
|
||||||
|
if interface_summary:
|
||||||
|
return (
|
||||||
|
"Expose one or more likely user-facing API or CLI entry points. "
|
||||||
|
f"Source context: {interface_summary} Review is required to name "
|
||||||
|
"the concrete domain behavior."
|
||||||
|
)
|
||||||
|
return (
|
||||||
|
"Expose one or more likely user-facing API or CLI entry points. "
|
||||||
|
"Review is required to name the concrete domain behavior."
|
||||||
|
)
|
||||||
|
|
||||||
|
def _document_summary(self, chunks: list[ContentChunk]) -> str:
|
||||||
|
for chunk in chunks:
|
||||||
|
if chunk.kind != "documentation":
|
||||||
|
continue
|
||||||
|
lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
|
||||||
|
if not lines:
|
||||||
|
continue
|
||||||
|
heading = next((line.lstrip("#").strip() for line in lines if line.startswith("#")), "")
|
||||||
|
paragraph = next((line for line in lines if not line.startswith("#")), "")
|
||||||
|
if heading and paragraph:
|
||||||
|
return f"{heading}. {paragraph}"
|
||||||
|
return heading or paragraph
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def _interface_summary(self, chunks: list[ContentChunk]) -> str:
|
||||||
|
for chunk in chunks:
|
||||||
|
if chunk.kind != "interface":
|
||||||
|
continue
|
||||||
|
lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
|
||||||
|
if not lines:
|
||||||
|
continue
|
||||||
|
return " ".join(lines[:3])
|
||||||
|
return ""
|
||||||
|
|
||||||
def _facts(self, facts: list[ObservedFact], kind: str) -> list[ObservedFact]:
|
def _facts(self, facts: list[ObservedFact], kind: str) -> list[ObservedFact]:
|
||||||
return [fact for fact in facts if fact.kind == kind]
|
return [fact for fact in facts if fact.kind == kind]
|
||||||
|
|
||||||
|
|||||||
@@ -121,7 +121,12 @@ class RegistryService:
|
|||||||
completed_run.snapshot_id,
|
completed_run.snapshot_id,
|
||||||
chunks,
|
chunks,
|
||||||
)
|
)
|
||||||
candidates = self.candidate_generator.generate(repository, facts)
|
stored_chunks = self.store.list_content_chunks(repository_id, completed_run.id)
|
||||||
|
candidates = self.candidate_generator.generate(
|
||||||
|
repository,
|
||||||
|
facts,
|
||||||
|
stored_chunks,
|
||||||
|
)
|
||||||
self.store.replace_candidate_graph(repository_id, completed_run.id, candidates)
|
self.store.replace_candidate_graph(repository_id, completed_run.id, candidates)
|
||||||
return ScanSummary(
|
return ScanSummary(
|
||||||
analysis_run=completed_run,
|
analysis_run=completed_run,
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
from repo_registry.candidate_graph.generator import CandidateGraphGenerator
|
from repo_registry.candidate_graph.generator import CandidateGraphGenerator
|
||||||
from repo_registry.core.models import ObservedFact, Repository
|
from repo_registry.core.models import ContentChunk, ObservedFact, Repository
|
||||||
|
|
||||||
|
|
||||||
def fact(id, kind, name, path="", value=""):
|
def fact(id, kind, name, path="", value=""):
|
||||||
@@ -16,6 +16,20 @@ def fact(id, kind, name, path="", value=""):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def chunk(id, kind, path, text, start_line=1, end_line=1):
    """Build a ``ContentChunk`` fixture tied to repository, run and snapshot 1."""
    fixture_fields = dict(
        id=id,
        repository_id=1,
        analysis_run_id=1,
        snapshot_id=1,
        path=path,
        kind=kind,
        start_line=start_line,
        end_line=end_line,
        text=text,
    )
    return ContentChunk(**fixture_fields)
|
||||||
|
|
||||||
|
|
||||||
def test_candidate_generator_builds_review_seed_from_observed_facts():
|
def test_candidate_generator_builds_review_seed_from_observed_facts():
|
||||||
repository = Repository(
|
repository = Repository(
|
||||||
id=1,
|
id=1,
|
||||||
@@ -43,3 +57,40 @@ def test_candidate_generator_builds_review_seed_from_observed_facts():
|
|||||||
assert interface_capability.features[0].type == "API"
|
assert interface_capability.features[0].type == "API"
|
||||||
assert interface_capability.features[0].location == "app.py"
|
assert interface_capability.features[0].location == "app.py"
|
||||||
assert interface_capability.evidence[0].strength == "strong"
|
assert interface_capability.evidence[0].strength == "strong"
|
||||||
|
|
||||||
|
|
||||||
|
def test_candidate_generator_enriches_descriptions_from_content_chunks():
    """Descriptions should quote README and interface text taken from chunks."""
    repository = Repository(
        id=1,
        name="MailRouter",
        url="/tmp/mail-router",
        description=None,
        branch="main",
        status="analyzed",
    )
    readme_fact = fact(1, "documentation", "README", "README.md")
    route_fact = fact(2, "interface", "python route decorator", "app.py", '@app.post("/classify")')
    readme_chunk = chunk(
        1,
        "documentation",
        "README.md",
        "# MailRouter\nRoutes incoming customer email to the right team.",
        end_line=2,
    )
    route_chunk = chunk(
        2,
        "interface",
        "app.py",
        '@app.post("/classify")\ndef classify_email():\n return {}',
        start_line=5,
        end_line=7,
    )

    graph = CandidateGraphGenerator().generate(
        repository,
        [readme_fact, route_fact],
        [readme_chunk, route_chunk],
    )

    # The ability description comes from the README heading and first paragraph.
    ability = graph[0]
    assert "MailRouter. Routes incoming customer email" in ability.description
    # The first capability's description quotes the interface chunk's source lines.
    assert '@app.post("/classify")' in ability.capabilities[0].description
|
||||||
|
|||||||
@@ -352,6 +352,8 @@ def test_analyze_repository_records_snapshot_and_observed_facts(tmp_path):
|
|||||||
candidate_graph = service.candidate_graph(repository.id, summary.analysis_run.id)
|
candidate_graph = service.candidate_graph(repository.id, summary.analysis_run.id)
|
||||||
assert candidate_graph.repository.name == "Example"
|
assert candidate_graph.repository.name == "Example"
|
||||||
assert candidate_graph.abilities
|
assert candidate_graph.abilities
|
||||||
|
assert "Example" in candidate_graph.abilities[0].description
|
||||||
|
assert "@app.get" in candidate_graph.abilities[0].capabilities[0].description
|
||||||
capability_names = {
|
capability_names = {
|
||||||
capability.name
|
capability.name
|
||||||
for ability in candidate_graph.abilities
|
for ability in candidate_graph.abilities
|
||||||
|
|||||||
Reference in New Issue
Block a user