Fix rerun assessment and candidate extraction

2026-05-16 00:57:44 +02:00
parent bee770fad7
commit 3e906c1dd4
7 changed files with 227 additions and 12 deletions
--- a/src/repo_scoping/candidate_graph/generator.py
+++ b/src/repo_scoping/candidate_graph/generator.py
@@ -566,17 +566,41 @@ class CandidateGraphGenerator:
    def _intent_capability_items(self, chunks: list[ContentChunk]) -> list[str]:
        items: list[str] = []
        in_capability_section = False
+        capability_section_level = 0
        for chunk in sorted(chunks, key=lambda item: (item.path, item.start_line)):
            for raw_line in chunk.text.splitlines():
                line = raw_line.strip()
                if not line:
                    continue
                if line.startswith("#"):
-                    heading = line.lstrip("#").strip().lower()
-                    in_capability_section = (
+                    level = len(line) - len(line.lstrip("#"))
+                    heading_text = re.sub(r"\\([._-])", r"\1", line.lstrip("#").strip())
+                    heading = re.sub(
+                        r"^\d+(?:\.\d+)*\.?\s+",
+                        "",
+                        heading_text,
+                    ).lower()
+                    if in_capability_section and level > capability_section_level:
+                        item = re.sub(
+                            r"^\d+(?:\.\d+)*\.?\s+",
+                            "",
+                            heading_text,
+                        )
+                        if item and item.lower() not in {"capabilities", "intended capabilities"}:
+                            items.append(item)
+                        continue
+                    opens_capability_section = (
                        "capabilit" in heading
-                        or heading in {"primary utility", "core utility"}
+                        or heading
+                        in {
+                            "outcomes",
+                            "primary outcomes",
+                            "primary utility",
+                            "core utility",
+                        }
                    )
+                    in_capability_section = opens_capability_section
+                    capability_section_level = level if opens_capability_section else 0
                    continue
                if not in_capability_section:
                    continue
@@ -594,6 +618,16 @@ class CandidateGraphGenerator:
            return "Make Connectivity Observable Auditable And Controllable"
        if "cli tool" in lowered and "mcp" in lowered:
            return "Expose CLI And MCP Accessible Service"
+        capability_outcomes = {
+            "capability discovery": "Support Capability Discovery",
+            "capability modeling": "Model Capabilities",
+            "capability realisation": "Realize Capabilities",
+            "capability realization": "Realize Capabilities",
+            "capability validation": "Validate Capabilities",
+            "capability evolution": "Evolve Capabilities",
+        }
+        if lowered.strip(" .:-") in capability_outcomes:
+            return capability_outcomes[lowered.strip(" .:-")]
        candidate = re.split(r"\s+-\s+|\s*:\s*|[.!?]\s+", text.strip(), maxsplit=1)[0]
        candidate = candidate.strip(" .:-")
        if not candidate:
@@ -601,6 +635,12 @@ class CandidateGraphGenerator:
        words = candidate.split()
        if words:
            words[0] = self._imperative_verb(words[0])
+            if (
+                len(words) > 1
+                and words[0].lower() in {"analyze", "compare", "detect", "explore", "identify", "interpret"}
+                and words[1].lower().strip(",;:") == "of"
+            ):
+                words.pop(1)
        while words and words[-1].lower().strip(",;:") in {"a", "an", "the", "and", "or", "as", "both"}:
            words.pop()
        return self._title_from_words(words[:10])
@@ -1614,6 +1654,7 @@ class CandidateGraphGenerator:

    def _imperative_purpose(self, text: str) -> str:
        cleaned = re.sub(r"\s+", " ", text.strip())
+        cleaned = re.split("\\s+(?:-|\\u2013|\\u2014)\\s+", cleaned, maxsplit=1)[0]
        cleaned = re.split(r"[.!?]\s+", cleaned, maxsplit=1)[0]
        cleaned = re.sub(
            r"(?i)^this\s+repository\s+exists\s+to\s+provide\s+(?:an?\s+)?",
@@ -1635,8 +1676,14 @@ class CandidateGraphGenerator:
            return word
        lower = word.lower().strip(",;:")
        irregular = {
+            "analysis": "analyze",
+            "comparison": "compare",
+            "detection": "detect",
            "does": "do",
+            "exploration": "explore",
            "has": "have",
+            "identification": "identify",
+            "interpretation": "interpret",
            "is": "be",
        }
        if lower in irregular:
@@ -1655,7 +1702,7 @@ class CandidateGraphGenerator:

    def _title_from_words(self, words: list[str]) -> str:
        cleaned_words = [
-            re.sub(r"[^A-Za-z0-9_/{}-]", "", word)
+            re.sub(r"[^\w/{}-]", "", word, flags=re.UNICODE)
            for word in words
        ]
        return " ".join(
--- a/src/repo_scoping/cli.py
+++ b/src/repo_scoping/cli.py
@@ -334,7 +334,7 @@ def dataset_assessment(service: RegistryService) -> dict[str, object]:
    }
    for repository in service.list_repositories():
        runs = service.list_analysis_runs(repository.id)
-        latest_run = next((run for run in reversed(runs) if run.status == "completed"), None)
+        latest_run = next((run for run in runs if run.status == "completed"), None)
        facts = service.list_observed_facts(repository.id, latest_run.id) if latest_run else []
        chunks = service.list_content_chunks(repository.id, latest_run.id) if latest_run else []
        candidate_counts = {
--- a/src/repo_scoping/core/service.py
+++ b/src/repo_scoping/core/service.py
@@ -1581,6 +1581,7 @@ class RegistryService:
                    "kind": kind,
                    "layer": self._dependency_layer(kind),
                    "label": detail.get("label")
+                    or detail.get("name")
                    or self._dependency_node_label(repository_id, kind, key, item_id),
                    "reviewState": detail.get("reviewState", "accepted"),
                    "name": detail.get("name")
@@ -2724,7 +2725,7 @@ class RegistryService:
            for run in self.store.list_analysis_runs(repository_id)
            if run.status == "completed"
        ]
-        return completed[-1] if completed else None
+        return completed[0] if completed else None

    def _candidate_graph_or_none(
        self,