fix(lifecycle): _relative_to_root path doubling with relative workspace

fix(evaluation_io): tolerate code-fenced frontmatter and varied score shapes from small LLMs

Two bugs surfaced running the first live Lefevre chapter-I smoke against openai/gpt-4o-mini.

1. _relative_to_root doubled artifact paths when --workspace was a relative path (e.g. "."). The function received an already-CWD-relative path like infospaces/foo/artifacts/sources/x.md and re-prepended root, producing infospaces/foo/infospaces/foo/... stored in artifacts/index.yaml — which then failed file reads on the subsequent workflow stage.

   Fix: when raw is relative, try CWD-relative resolution first (matches root / sub call shapes); fall back to root-prefixing only when the CWD interpretation does not land under root (matches bare relative-subpath call shapes from rendered template outputs).

2. _read_frontmatter_markdown only accepted a literal ---/--- delimited block at the start of the file. gpt-4o-mini emitted four other shapes across the seven evaluation files this chapter produced:
   - ```yaml ... ``` fence (no --- delimiters)
   - ```markdown ... ``` outer fence wrapping --- frontmatter
   - scores as mapping ({groundedness: 4, ...}) instead of the canonical list of {name, value} dicts
   - scores as list of single-key dicts ([{groundedness: 4}, ...])

   Fix: _extract_frontmatter_block tolerates ```yaml fences and strips ```markdown outer fences; _normalise_scores rewrites mapping- and single-key-dict shapes into the canonical form so ScoreEntry.from_dict keeps working.

Both fixes are pure-Python; no API changes. 179 tests pass, 2 skipped.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-19 03:26:55 +02:00
parent 08ecefe309
commit 9404831069
2 changed files with 126 additions and 25 deletions
--- a/src/infospace_bench/evaluation_io.py
+++ b/src/infospace_bench/evaluation_io.py
@@ -136,21 +136,7 @@ def read_history(history_path: str | Path) -> list[EvaluationSnapshot]:

 def _read_frontmatter_markdown(path: Path) -> tuple[dict[str, Any], str]:
    text = path.read_text(encoding="utf-8")
-    if not text.startswith(f"{FRONTMATTER_MARKER}\n"):
-        raise InfospaceError(
-            "invalid_evaluation_file",
-            f"Missing YAML frontmatter in evaluation file: {path}",
-            {"path": str(path)},
-        )
-    end = text.find(f"\n{FRONTMATTER_MARKER}\n", len(FRONTMATTER_MARKER) + 1)
-    if end == -1:
-        raise InfospaceError(
-            "invalid_evaluation_file",
-            f"Unclosed YAML frontmatter in evaluation file: {path}",
-            {"path": str(path)},
-        )
-    raw = text[len(FRONTMATTER_MARKER) + 1 : end]
-    body = text[end + len(FRONTMATTER_MARKER) + 2 :]
+    raw, body = _extract_frontmatter_block(text, path)
    data = yaml.safe_load(raw)
    if not isinstance(data, dict):
        raise InfospaceError(
@@ -158,9 +144,105 @@ def _read_frontmatter_markdown(path: Path) -> tuple[dict[str, Any], str]:
            f"Expected mapping frontmatter in evaluation file: {path}",
            {"path": str(path)},
        )
+    _normalise_scores(data)
    return data, body


+def _normalise_scores(data: dict[str, Any]) -> None:
+    """Normalise score shapes emitted by various LLMs into the canonical
+    list-of-{name, value} form the rest of the pipeline expects.
+
+    Rewrites ``data["scores"]`` in place; three shapes are recognised:
+
+    - mapping form: ``scores: {groundedness: 5, lesson_clarity: 4}``
+    - list of single-key dicts: ``[{groundedness: 4}, {lesson_clarity: 3}]``
+    - list of canonical dicts (left as-is)
+    """
+    scores = data.get("scores")  # anything but a dict or list is left untouched
+    if isinstance(scores, dict):
+        data["scores"] = [
+            {"name": str(name), "value": _coerce_score(value)}
+            for name, value in scores.items()
+        ]
+    elif isinstance(scores, list):
+        normalised: list[dict[str, Any]] = []
+        for item in scores:
+            if not isinstance(item, dict):
+                continue  # non-dict entries are dropped from the result
+            if "name" in item and "value" in item:
+                normalised.append(item)  # already canonical; value is not coerced
+            elif len(item) == 1:
+                (name, value), = item.items()  # the sole key/value pair
+                normalised.append({"name": str(name), "value": _coerce_score(value)})
+            else:
+                normalised.append(item)  # unrecognised dict shape is passed through as-is
+        data["scores"] = normalised
+
+
+def _coerce_score(value: Any) -> float:  # best-effort float conversion of a raw score
+    try:
+        return float(value)
+    except (TypeError, ValueError):
+        return 0.0  # NOTE(review): unparseable scores silently become 0.0 — confirm intended
+
+
+def _extract_frontmatter_block(text: str, path: Path) -> tuple[str, str]:
+    """Pull a YAML frontmatter block out of an evaluation file.
+
+    Returns ``(raw_yaml, body)``. Tolerates several LLM-produced shapes and
+    raises ``InfospaceError("invalid_evaluation_file", ...)`` for anything else:
+    - the canonical ``---``-delimited block at the start of the file
+    - a ```yaml / ```yml code fence at the start of the file
+    - a ```markdown / ```md outer fence wrapping ``---`` frontmatter
+    """
+    stripped_text = text.lstrip("\n")  # only leading newlines are skipped
+    # Strip an outer ```markdown / ```md fence if present and recurse on its
+    # body so any ``---`` frontmatter inside still gets recognised.
+    for outer_marker in ("```markdown\n", "```md\n"):
+        if stripped_text.startswith(outer_marker):
+            inner_start = len(outer_marker)
+            closing_idx = stripped_text.rfind("```")  # last fence in the text
+            if closing_idx <= inner_start:
+                break  # no closing fence: fall through to the checks below
+            inner = stripped_text[inner_start:closing_idx].rstrip()
+            return _extract_frontmatter_block(inner, path)
+
+    if stripped_text.startswith(f"{FRONTMATTER_MARKER}\n"):
+        text = stripped_text
+        end = text.find(f"\n{FRONTMATTER_MARKER}\n", len(FRONTMATTER_MARKER) + 1)
+        if end == -1:
+            # Also accept a closing fence at EOF without a trailing newline.
+            if text.rstrip().endswith(FRONTMATTER_MARKER):  # NOTE(review): not checked to sit on its own line
+                end = text.rstrip().rfind(FRONTMATTER_MARKER) - 1  # index of the newline before the marker
+            else:
+                raise InfospaceError(
+                    "invalid_evaluation_file",
+                    f"Unclosed YAML frontmatter in evaluation file: {path}",
+                    {"path": str(path)},
+                )
+        raw = text[len(FRONTMATTER_MARKER) + 1 : end]  # between the opener line and the closer
+        body = text[end + len(FRONTMATTER_MARKER) + 2 :]  # skips newline + marker + newline
+        return raw, body
+    if stripped_text.startswith("```yaml") or stripped_text.startswith("```yml"):
+        fence_start = stripped_text.find("```")  # 0, given the startswith check above
+        content_start = stripped_text.find("\n", fence_start) + 1  # first char after the fence line
+        fence_end = stripped_text.find("\n```", content_start)  # first closing fence
+        if fence_end == -1:
+            raise InfospaceError(
+                "invalid_evaluation_file",
+                f"Unclosed YAML code fence in evaluation file: {path}",
+                {"path": str(path)},
+            )
+        raw = stripped_text[content_start:fence_end]
+        body = stripped_text[fence_end + len("\n```") :]
+        return raw, body.lstrip("\n")  # unlike the ``---`` branch, leading newlines are trimmed
+    raise InfospaceError(
+        "invalid_evaluation_file",
+        f"Missing YAML frontmatter in evaluation file: {path}",
+        {"path": str(path)},
+    )
+
+
 def _parse_rationales(body: str) -> dict[str, str]:
    rationales: dict[str, str] = {}
    current_name: str | None = None
--- a/src/infospace_bench/lifecycle.py
+++ b/src/infospace_bench/lifecycle.py
@@ -219,18 +219,37 @@ def _read_yaml(path: Path) -> dict[str, Any]:


 def _relative_to_root(root: Path, path: Path | str) -> str:
+    """Return ``path`` relative to ``root``, accepting either call shape.
+
+    Callers pass either a ``root / sub`` style path (absolute or relative
+    to the CWD) or a bare ``sub`` path meant to be read relative to ``root``.
+    With a relative ``root`` the old single-interpretation logic doubled the
+    path (e.g. ``infospaces/foo/infospaces/foo/...``) by re-prepending
+    ``root`` to a path already under ``root`` when resolved from CWD. Now a
+    relative path is tried against the CWD first, then against ``root``; the
+    first candidate under the resolved ``root`` wins. Raises ``InfospaceError``
+    (``artifact_path_escapes_infospace``) when no candidate lies under it.
+    """
     raw = Path(path)
-    target = raw if raw.is_absolute() else root / raw
     root_resolved = root.resolve()
-    target_resolved = target.resolve()
-    try:
-        return str(target_resolved.relative_to(root_resolved))
-    except ValueError as exc:
-        raise InfospaceError(
-            "artifact_path_escapes_infospace",
-            f"Artifact path escapes infospace root: {path}",
-            {"root": str(root), "path": str(path)},
-        ) from exc
+    if raw.is_absolute():
+        candidates = [raw.resolve()]  # absolute input has a single interpretation
+    else:
+        cwd_candidate = raw.resolve()  # resolved against the CWD
+        joined_candidate = (root / raw).resolve()  # same path read as a sub-path of root
+        candidates = [cwd_candidate]  # NOTE(review): wins for a bare sub-path too if CWD is inside root
+        if joined_candidate != cwd_candidate:
+            candidates.append(joined_candidate)
+    for candidate in candidates:  # first candidate under root wins
+        try:
+            return str(candidate.relative_to(root_resolved))
+        except ValueError:
+            continue  # not under root; try the next interpretation
+    raise InfospaceError(  # NOTE(review): the removed version chained ``from exc``; this one does not
+        "artifact_path_escapes_infospace",
+        f"Artifact path escapes infospace root: {path}",
+        {"root": str(root), "path": str(path)},
+    )


 def _write_yaml(path: Path, data: dict[str, Any]) -> None: