Implement scope-derived candidate review infrastructure

2026-05-16 00:26:29 +02:00
parent f4d782c997
commit ba2228e889
14 changed files with 1740 additions and 39 deletions
--- a/src/repo_scoping/candidate_graph/generator.py
+++ b/src/repo_scoping/candidate_graph/generator.py
@@ -275,6 +275,8 @@ class CandidateGraphGenerator:
        manifests = self._facts(facts, "manifest")
        frameworks = self._facts(facts, "framework")
        languages = self._facts(facts, "language")
+        configs = self._facts(facts, "config")
+        scope_facts = self._facts(facts, "scope")
        llm_providers = self._facts(facts, "llm_provider")
        credential_configs = self._facts(facts, "credential_config")
        provider_registries = self._facts(facts, "provider_registry")
@@ -286,7 +288,7 @@ class CandidateGraphGenerator:
            chunks,
        )

-        ability_sources = docs or manifests or languages
+        ability_sources = docs or scope_facts or manifests or languages or configs
        ability = CandidateAbilityDraft(
            name=self._ability_name(repository, chunks),
            description=self._ability_description(chunks),
@@ -308,6 +310,15 @@ class CandidateGraphGenerator:
        capabilities.extend(
            self._intent_capabilities(intent_facts, chunks, tests, examples, docs)
        )
+        capabilities.extend(
+            self._scope_capabilities(
+                scope_facts,
+                chunks,
+                tests,
+                examples,
+                allow_summary_fallback=not intent_facts,
+            )
+        )
        capabilities.extend(
            self._repo_scoping_native_capabilities(
                repository,
@@ -347,6 +358,18 @@ class CandidateGraphGenerator:
            capabilities.append(
                self._interface_capability(interfaces, tests, examples, docs, chunks)
            )
+        if not capabilities:
+            capabilities.extend(
+                self._fact_derived_capabilities(
+                    configs=configs,
+                    manifests=manifests,
+                    frameworks=frameworks,
+                    languages=languages,
+                    docs=docs,
+                    tests=tests,
+                    chunks=chunks,
+                )
+            )

        return [
            CandidateAbilityDraft(
@@ -582,6 +605,257 @@ class CandidateGraphGenerator:
            words.pop()
        return self._title_from_words(words[:10])

+    def _scope_capabilities(
+        self,
+        scope_facts: list[ObservedFact],
+        chunks: list[ContentChunk],
+        tests: list[ObservedFact],
+        examples: list[ObservedFact],
+        *,
+        allow_summary_fallback: bool = True,
+    ) -> list[CandidateCapabilityDraft]:
+        """Draft reviewable capabilities from SCOPE.md content.
+
+        Every fenced ``capability`` block in the scope chunks that carries a
+        title becomes one capability draft; titles are deduplicated
+        case-insensitively, keeping the first occurrence. If no block yields
+        a draft and ``allow_summary_fallback`` is true, one weaker draft is
+        built from the scope summary name instead.
+
+        Returns an empty list when there are no scope chunks, or when neither
+        the blocks nor the fallback produce a name.
+        """
+
+        def is_scope_chunk(chunk: ContentChunk) -> bool:
+            return (
+                chunk.kind == "scope"
+                or chunk.metadata.get("source_role") == "derived_scope"
+                or chunk.path.lower().endswith("scope.md")
+            )
+
+        scope_chunks = [chunk for chunk in chunks if is_scope_chunk(chunk)]
+        if not scope_chunks:
+            return []
+
+        source_refs = self._source_refs(scope_facts)
+
+        def scope_evidence(strength: str) -> CandidateEvidenceDraft:
+            # Both the per-block and the fallback drafts cite SCOPE.md; only
+            # the evidence strength differs.
+            return CandidateEvidenceDraft(
+                type="scope-current-state",
+                reference="SCOPE.md",
+                strength=strength,
+                source_refs=source_refs,
+            )
+
+        drafts: list[CandidateCapabilityDraft] = []
+        seen_titles: set[str] = set()
+        for block in self._scope_capability_blocks(scope_chunks):
+            title = block.get("title", "").strip()
+            title_key = title.lower()
+            if not title or title_key in seen_titles:
+                continue
+            seen_titles.add(title_key)
+
+            capability_type = (
+                block.get("type", "scope-derived").strip() or "scope-derived"
+            )
+            description = block.get("description", "").strip()
+            keywords = self._scope_keywords(block.get("keywords", ""))
+            capability_attributes = self._unique(
+                [
+                    capability_type,
+                    *keywords,
+                    "scope-derived",
+                    "current-state",
+                    "review-required-scope",
+                ]
+            )
+            feature_attributes = self._unique(
+                [capability_type, "scope-defined", "review-required-scope"]
+            )
+            feature = CandidateFeatureDraft(
+                name=title,
+                type=capability_type,
+                location="SCOPE.md",
+                confidence=0.55,
+                source_refs=source_refs,
+                primary_class=capability_type,
+                attributes=feature_attributes,
+            )
+            # Base confidence plus small bonuses for each supporting signal.
+            confidence = self._confidence(
+                0.45,
+                [
+                    (0.10, bool(description)),
+                    (0.05, bool(keywords)),
+                    (0.05, bool(tests)),
+                    (0.05, bool(examples)),
+                ],
+            )
+            drafts.append(
+                CandidateCapabilityDraft(
+                    name=title,
+                    description=(
+                        "Reviewable current-state capability extracted from "
+                        f"SCOPE.md: {description or title}"
+                    ),
+                    inputs=[],
+                    outputs=[title],
+                    confidence=confidence,
+                    source_refs=source_refs,
+                    primary_class=capability_type,
+                    attributes=capability_attributes,
+                    features=[feature],
+                    evidence=[scope_evidence("medium")],
+                )
+            )
+
+        if drafts or not allow_summary_fallback:
+            return drafts
+
+        fallback_name = self._scope_summary_capability_name(scope_chunks)
+        if not fallback_name:
+            return []
+        summary_draft = CandidateCapabilityDraft(
+            name=fallback_name,
+            description=(
+                "Reviewable current-state capability inferred from SCOPE.md "
+                "summary text. A curator should split this into more precise "
+                "capabilities when reviewing."
+            ),
+            inputs=[],
+            outputs=[fallback_name],
+            confidence=0.45,
+            source_refs=source_refs,
+            primary_class="scope-derived",
+            attributes=[
+                "scope-derived",
+                "current-state",
+                "review-required-scope",
+            ],
+            evidence=[scope_evidence("weak")],
+        )
+        return [summary_draft]
+
+    def _scope_capability_blocks(
+        self,
+        chunks: list[ContentChunk],
+    ) -> list[dict[str, str]]:
+        """Parse fenced ``capability`` blocks out of the given chunks.
+
+        Chunks are read in ``(path, start_line)`` order and parser state is
+        carried from one chunk to the next. Inside a block, ``key: value``
+        lines start a new (lower-cased) field and any other non-empty line is
+        appended to the most recent field. Blocks with no fields are dropped,
+        as is a block that is never closed by a fence.
+        """
+        parsed: list[dict[str, str]] = []
+        # ``None`` means "outside a capability block".
+        active: dict[str, str] | None = None
+        last_key = ""
+        ordered = sorted(chunks, key=lambda item: (item.path, item.start_line))
+        for chunk in ordered:
+            for raw_line in chunk.text.splitlines():
+                text = raw_line.strip()
+                if text.startswith("```capability"):
+                    # An opening fence always starts a fresh block.
+                    active, last_key = {}, ""
+                elif active is None:
+                    continue
+                elif text.startswith("```"):
+                    if active:
+                        parsed.append(active)
+                    active, last_key = None, ""
+                else:
+                    name, colon, remainder = text.partition(":")
+                    if colon and re.match(r"^[A-Za-z_][A-Za-z0-9_-]*$", name):
+                        last_key = name.lower()
+                        active[last_key] = remainder.strip().strip('"')
+                    elif last_key and text:
+                        # Continuation line: fold into the previous field.
+                        active[last_key] = f"{active[last_key]} {text}".strip()
+        return parsed
+
+    def _scope_keywords(self, value: str) -> list[str]:
+        """Split a comma-separated keyword field into at most eight keywords.
+
+        One pair of surrounding square brackets is removed first; each item is
+        then trimmed of spaces, backticks and quotes, and empty items are
+        discarded.
+        """
+        body = value.strip()
+        if body[:1] == "[" and body[-1:] == "]":
+            body = body[1:-1]
+        trimmed = (piece.strip(" `\"'") for piece in body.split(","))
+        return [piece for piece in trimmed if piece][:8]
+
+    def _scope_summary_capability_name(self, chunks: list[ContentChunk]) -> str:
+        """Turn the scope one-liner into a capability name, or return ``""``.
+
+        The one-liner is passed through ``_imperative_purpose``; when the
+        chunks provide no one-liner the result is the empty string.
+        """
+        summary = self._scope_one_liner(chunks)
+        return self._imperative_purpose(summary) if summary else ""
+
+    def _fact_derived_capabilities(
+        self,
+        *,
+        configs: list[ObservedFact],
+        manifests: list[ObservedFact],
+        frameworks: list[ObservedFact],
+        languages: list[ObservedFact],
+        docs: list[ObservedFact],
+        tests: list[ObservedFact],
+        chunks: list[ContentChunk],
+    ) -> list[CandidateCapabilityDraft]:
+        """Build one fallback capability from config/manifest/framework/language facts.
+
+        Nothing is produced unless at least one config fact exists. Each
+        non-empty fact group contributes one feature, and all features are
+        attached to a single capability whose confidence grows with the
+        kinds of supporting facts present.
+        """
+        if not configs:
+            return []
+        capability_facts = configs + manifests + frameworks + languages
+        # NOTE(review): with the ``configs`` gate above, this guard and the
+        # ``features`` guard below cannot trigger; they only matter if that
+        # gate is ever relaxed.
+        if not capability_facts:
+            return []
+
+        fact_groups = (
+            ("Manage Repository Configuration", "configuration", configs),
+            ("Declare Runtime And Package Manifests", "manifest", manifests),
+            ("Use Detected Frameworks", "framework", frameworks),
+            ("Provide Implementation In Detected Languages", "implementation", languages),
+        )
+        features: list[CandidateFeatureDraft] = [
+            CandidateFeatureDraft(
+                name=label,
+                type=kind,
+                location=self._grouped_location(group),
+                confidence=0.45,
+                source_refs=self._source_refs(group),
+                primary_class=kind,
+                attributes=[kind, "fact-derived", "review-required"],
+            )
+            for label, kind, group in fact_groups
+            if group
+        ]
+        if not features:
+            return []
+
+        name = self._fact_derived_capability_name(chunks, features)
+        description = (
+            "Reviewable capability inferred from deterministic facts. "
+            "This fills the hierarchy when no stronger intent, scope "
+            "capability, or interface candidate exists."
+        )
+        confidence_signals = [
+            (0.10, bool(configs)),
+            (0.10, bool(manifests)),
+            (0.05, bool(frameworks)),
+            (0.05, bool(tests)),
+            (0.05, bool(docs)),
+        ]
+        capability = CandidateCapabilityDraft(
+            name=name,
+            description=description,
+            inputs=self._feature_inputs(features),
+            outputs=self._feature_outputs(features),
+            confidence=self._confidence(0.35, confidence_signals),
+            source_refs=self._source_refs(capability_facts),
+            primary_class="fact-derived",
+            attributes=["fact-derived", "review-required", "partial-hierarchy"],
+            features=features,
+            evidence=self._evidence(tests, [], docs),
+        )
+        return [capability]
+
+    def _fact_derived_capability_name(
+        self,
+        chunks: list[ContentChunk],
+        features: list[CandidateFeatureDraft],
+    ) -> str:
+        """Choose a name for the fact-derived fallback capability.
+
+        A name derived from the scope summary wins when available; otherwise
+        a fixed name is picked from the feature types present, preferring
+        configuration over manifest over the generic implementation name.
+        """
+        scope_name = self._scope_summary_capability_name(chunks)
+        if scope_name:
+            return scope_name
+        present_types = [feature.type for feature in features]
+        ranked_names = (
+            ("configuration", "Manage Repository Configuration"),
+            ("manifest", "Declare Repository Runtime"),
+        )
+        for feature_type, fixed_name in ranked_names:
+            if feature_type in present_types:
+                return fixed_name
+        return "Describe Repository Implementation"
+
    def _repo_scoping_native_capabilities(
        self,
        repository: Repository,
@@ -1219,40 +1493,110 @@ class CandidateGraphGenerator:
        ops_name = self._operations_ability_name(chunks)
        if ops_name:
            return ops_name
-        purpose_text = self._document_purpose_sentence(chunks) or repository.description
+        purpose_text = (
+            self._intent_purpose_sentence(chunks)
+            or self._scope_one_liner(chunks)
+            or self._documentation_purpose_sentence(chunks)
+            or repository.description
+        )
        if purpose_text:
            normalized = self._imperative_purpose(purpose_text)
            if normalized:
                return normalized
        return f"Support {self._humanize_identifier(repository.name)}"

-    def _document_purpose_sentence(self, chunks: list[ContentChunk]) -> str:
-        for chunk in self._purpose_chunks(chunks):
+    def _intent_purpose_sentence(self, chunks: list[ContentChunk]) -> str:
+        """Return the purpose sentence taken from intent chunks only.
+
+        A chunk counts as intent when its kind is ``intent``, its source role
+        is ``intent_summary``, or its path ends with ``intent.md``.
+        """
+
+        def is_intent(chunk: ContentChunk) -> bool:
+            return (
+                chunk.kind == "intent"
+                or chunk.metadata.get("source_role") == "intent_summary"
+                or chunk.path.lower().endswith("intent.md")
+            )
+
+        intent_chunks = [
+            chunk for chunk in self._purpose_chunks(chunks) if is_intent(chunk)
+        ]
+        return self._purpose_sentence_for_chunks(intent_chunks)
+
+    def _documentation_purpose_sentence(self, chunks: list[ContentChunk]) -> str:
+        """Return the purpose sentence taken from non-scope documentation chunks.
+
+        Only chunks of kind ``documentation`` are used, and those marked as
+        derived scope (by source role or a path ending in ``scope.md``) are
+        left out.
+        """
+
+        def is_excluded(chunk: ContentChunk) -> bool:
+            return (
+                chunk.kind != "documentation"
+                or chunk.metadata.get("source_role") == "derived_scope"
+                or chunk.path.lower().endswith("scope.md")
+            )
+
+        documentation_chunks = [
+            chunk for chunk in self._purpose_chunks(chunks) if not is_excluded(chunk)
+        ]
+        return self._purpose_sentence_for_chunks(documentation_chunks)
+
+    def _purpose_sentence_for_chunks(self, chunks: list[ContentChunk]) -> str:
+        """Return the first usable purpose line from ``chunks``, in order.
+
+        Only chunks of kind ``intent`` or ``documentation`` are inspected.
+        For each one, the first non-empty line that does not start with
+        ``#`` is the candidate; it is returned unless it is recognized as
+        template boilerplate, in which case the next chunk is tried. Later
+        lines of the same chunk are not considered. Returns ``""`` when no
+        chunk yields a candidate.
+        """
+        for chunk in chunks:
            if chunk.kind not in {"intent", "documentation"}:
                continue
            lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
            paragraph = next((line for line in lines if not line.startswith("#")), "")
-            if paragraph:
+            if paragraph and not self._is_template_boilerplate(paragraph):
                return paragraph
        return ""

+    def _scope_one_liner(self, chunks: list[ContentChunk]) -> str:
+        """Return the one-line summary found in scope chunks, or ``""``.
+
+        Scope chunks (kind ``scope``, source role ``derived_scope``, or a
+        path ending in ``scope.md``) are visited in ``(path, start_line)``
+        order. For each chunk:
+
+        1. The first content line under a ``## One-liner`` heading is used,
+           skipping blank lines, ``---`` rules and ``>`` quotes. The search
+           stops at the next heading, so an empty section never yields the
+           following heading as the summary.
+        2. Otherwise the first plain, non-boilerplate line before the first
+           ``## `` section is used.
+
+        The returned text has surrounding spaces and periods removed.
+        """
+        for chunk in sorted(chunks, key=lambda item: (item.path, item.start_line)):
+            if not (
+                chunk.kind == "scope"
+                or chunk.metadata.get("source_role") == "derived_scope"
+                or chunk.path.lower().endswith("scope.md")
+            ):
+                continue
+            lines = chunk.text.splitlines()
+            for index, raw_line in enumerate(lines):
+                if raw_line.strip().lower() != "## one-liner":
+                    continue
+                for following in lines[index + 1 :]:
+                    candidate = following.strip()
+                    if candidate.startswith("#"):
+                        # Next heading: the one-liner section is empty, so
+                        # stop here instead of returning the heading itself.
+                        break
+                    if not candidate or candidate.startswith(("---", ">")):
+                        continue
+                    return candidate.strip(" .")
+            # No usable one-liner section: fall back to the preamble that
+            # precedes the first second-level heading.
+            before_first_section: list[str] = []
+            for raw_line in lines:
+                candidate = raw_line.strip()
+                if candidate.startswith("## "):
+                    break
+                before_first_section.append(candidate)
+            for candidate in before_first_section:
+                if (
+                    candidate
+                    and not candidate.startswith(("#", ">", "---"))
+                    and not self._is_template_boilerplate(candidate)
+                ):
+                    return candidate.strip(" .")
+        return ""
+
+    def _is_template_boilerplate(self, text: str) -> bool:
+        """Report whether ``text`` contains a known template phrase.
+
+        Matching is case-insensitive and by substring.
+        """
+        haystack = text.lower()
+        template_phrases = (
+            "git repository template to bootstrap",
+            "this file helps you quickly understand",
+            "intentionally lightweight and may be incomplete",
+        )
+        return any(phrase in haystack for phrase in template_phrases)
+
    def _purpose_chunks(self, chunks: list[ContentChunk]) -> list[ContentChunk]:
        def priority(chunk: ContentChunk) -> tuple[int, str, int]:
            role = chunk.metadata.get("source_role")
            path = chunk.path.lower()
            if role == "intent_summary" or path.endswith("intent.md"):
                return (0, path, chunk.start_line)
-            if role == "product_documentation" or path.startswith("readme"):
-                return (1, path, chunk.start_line)
            if role == "derived_scope" or path.endswith("scope.md"):
-                return (3, path, chunk.start_line)
-            return (2, path, chunk.start_line)
+                return (1, path, chunk.start_line)
+            if role == "product_documentation" or path.startswith("readme"):
+                return (2, path, chunk.start_line)
+            return (3, path, chunk.start_line)

        return sorted(
            [
                chunk
                for chunk in chunks
-                if chunk.kind in {"intent", "documentation"}
+                if chunk.kind in {"intent", "documentation", "scope"}
                and chunk.metadata.get("source_role") != "agent_guidance"
            ],
            key=priority,
@@ -1284,9 +1628,11 @@ class CandidateGraphGenerator:
        if not words:
            return ""
        words[0] = self._imperative_verb(words[0])
-        return self._title_from_words(words[:8])
+        return self._title_from_words(words[:10])

    def _imperative_verb(self, word: str) -> str:
+        if word.isupper():
+            return word
        lower = word.lower().strip(",;:")
        irregular = {
            "does": "do",
@@ -1313,7 +1659,7 @@ class CandidateGraphGenerator:
            for word in words
        ]
        return " ".join(
-            word[:1].upper() + word[1:]
+            word if word.isupper() else word[:1].upper() + word[1:]
            for word in cleaned_words
            if word
        )
@@ -1341,17 +1687,37 @@ class CandidateGraphGenerator:
            lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
            if not lines:
                continue
+            if chunk.kind == "scope" or chunk.metadata.get("source_role") == "derived_scope":
+                one_liner = self._scope_one_liner([chunk])
+                if one_liner:
+                    return f"SCOPE. {one_liner}"
            heading = next((line.lstrip("#").strip() for line in lines if line.startswith("#")), "")
            paragraph = next((line for line in lines if not line.startswith("#")), "")
+            if self._is_template_boilerplate(paragraph):
+                paragraph = ""
            if heading and paragraph:
                return f"{heading}. {paragraph}"
            return heading or paragraph
        return ""

    def _documentation_chunks(self, chunks: list[ContentChunk]) -> list[ContentChunk]:
+        """Return intent, documentation and scope chunks in summary priority order.
+
+        Chunks whose source role is ``agent_guidance`` are excluded. The
+        result is ordered intent first, then scope, then everything else;
+        within a group chunks are ordered by lower-cased path and start line.
+        """
+        # Sort key: (group rank, lower-cased path, start line).
+        def priority(chunk: ContentChunk) -> tuple[int, str, int]:
+            role = chunk.metadata.get("source_role")
+            path = chunk.path.lower()
+            if chunk.kind == "intent" or role == "intent_summary" or path.endswith("intent.md"):
+                return (0, path, chunk.start_line)
+            if chunk.kind == "scope" or role == "derived_scope" or path.endswith("scope.md"):
+                return (1, path, chunk.start_line)
+            return (2, path, chunk.start_line)
+
        return sorted(
-            [chunk for chunk in chunks if chunk.kind in {"intent", "documentation"}],
-            key=lambda chunk: (0 if chunk.kind == "intent" else 1, chunk.path, chunk.start_line),
+            [
+                chunk
+                for chunk in chunks
+                if chunk.kind in {"intent", "documentation", "scope"}
+                and chunk.metadata.get("source_role") != "agent_guidance"
+            ],
+            key=priority,
        )

    def _interface_summary(self, chunks: list[ContentChunk]) -> str: