candidate generation use content chunks

This commit is contained in:
2026-04-26 02:49:58 +02:00
parent 6416139176
commit 07c837d6bb
4 changed files with 119 additions and 13 deletions

View File

@@ -2,7 +2,7 @@ from __future__ import annotations
from dataclasses import dataclass, field
from repo_registry.core.models import ObservedFact, Repository, SourceReference
from repo_registry.core.models import ContentChunk, ObservedFact, Repository, SourceReference
@dataclass(frozen=True)
@@ -50,9 +50,11 @@ class CandidateGraphGenerator:
self,
repository: Repository,
facts: list[ObservedFact],
chunks: list[ContentChunk] | None = None,
) -> list[CandidateAbilityDraft]:
if not facts:
return []
chunks = chunks or []
docs = self._facts(facts, "documentation")
tests = self._facts(facts, "test")
@@ -65,11 +67,7 @@ class CandidateGraphGenerator:
ability_sources = docs or manifests or languages
ability = CandidateAbilityDraft(
name=f"Review {repository.name} Repository Usefulness",
description=(
"Candidate usefulness summary generated from observed repository "
"documentation, manifests, languages, and interfaces. This is a "
"review seed, not an asserted domain ability."
),
description=self._ability_description(chunks),
confidence=0.55 if docs else 0.35,
source_refs=self._source_refs(ability_sources),
capabilities=[],
@@ -77,7 +75,9 @@ class CandidateGraphGenerator:
capabilities: list[CandidateCapabilityDraft] = []
if interfaces:
capabilities.append(self._interface_capability(interfaces, tests, examples, docs))
capabilities.append(
self._interface_capability(interfaces, tests, examples, docs, chunks)
)
if manifests or frameworks or languages:
capabilities.append(
CandidateCapabilityDraft(
@@ -110,6 +110,7 @@ class CandidateGraphGenerator:
tests: list[ObservedFact],
examples: list[ObservedFact],
docs: list[ObservedFact],
chunks: list[ContentChunk],
) -> CandidateCapabilityDraft:
features = [
CandidateFeatureDraft(
@@ -123,10 +124,7 @@ class CandidateGraphGenerator:
]
return CandidateCapabilityDraft(
name="Expose Repository Interface",
description=(
"Expose one or more likely user-facing API or CLI entry points. "
"Review is required to name the concrete domain behavior."
),
description=self._interface_description(chunks),
inputs=[],
outputs=["callable interface"],
confidence=0.65,
@@ -179,6 +177,56 @@ class CandidateGraphGenerator:
return "API"
return "interface"
def _ability_description(self, chunks: list[ContentChunk]) -> str:
doc_summary = self._document_summary(chunks)
if doc_summary:
return (
"Candidate usefulness summary seeded from repository content: "
f"{doc_summary} This is a review seed, not an asserted domain ability."
)
return (
"Candidate usefulness summary generated from observed repository "
"documentation, manifests, languages, and interfaces. This is a "
"review seed, not an asserted domain ability."
)
def _interface_description(self, chunks: list[ContentChunk]) -> str:
interface_summary = self._interface_summary(chunks)
if interface_summary:
return (
"Expose one or more likely user-facing API or CLI entry points. "
f"Source context: {interface_summary} Review is required to name "
"the concrete domain behavior."
)
return (
"Expose one or more likely user-facing API or CLI entry points. "
"Review is required to name the concrete domain behavior."
)
def _document_summary(self, chunks: list[ContentChunk]) -> str:
for chunk in chunks:
if chunk.kind != "documentation":
continue
lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
if not lines:
continue
heading = next((line.lstrip("#").strip() for line in lines if line.startswith("#")), "")
paragraph = next((line for line in lines if not line.startswith("#")), "")
if heading and paragraph:
return f"{heading}. {paragraph}"
return heading or paragraph
return ""
def _interface_summary(self, chunks: list[ContentChunk]) -> str:
for chunk in chunks:
if chunk.kind != "interface":
continue
lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
if not lines:
continue
return " ".join(lines[:3])
return ""
def _facts(self, facts: list[ObservedFact], kind: str) -> list[ObservedFact]:
return [fact for fact in facts if fact.kind == kind]

View File

@@ -121,7 +121,12 @@ class RegistryService:
completed_run.snapshot_id,
chunks,
)
candidates = self.candidate_generator.generate(repository, facts)
stored_chunks = self.store.list_content_chunks(repository_id, completed_run.id)
candidates = self.candidate_generator.generate(
repository,
facts,
stored_chunks,
)
self.store.replace_candidate_graph(repository_id, completed_run.id, candidates)
return ScanSummary(
analysis_run=completed_run,