diff --git a/src/infospace_bench/evaluation_io.py b/src/infospace_bench/evaluation_io.py index 4cecf1d..26be30e 100644 --- a/src/infospace_bench/evaluation_io.py +++ b/src/infospace_bench/evaluation_io.py @@ -136,21 +136,7 @@ def read_history(history_path: str | Path) -> list[EvaluationSnapshot]: def _read_frontmatter_markdown(path: Path) -> tuple[dict[str, Any], str]: text = path.read_text(encoding="utf-8") - if not text.startswith(f"{FRONTMATTER_MARKER}\n"): - raise InfospaceError( - "invalid_evaluation_file", - f"Missing YAML frontmatter in evaluation file: {path}", - {"path": str(path)}, - ) - end = text.find(f"\n{FRONTMATTER_MARKER}\n", len(FRONTMATTER_MARKER) + 1) - if end == -1: - raise InfospaceError( - "invalid_evaluation_file", - f"Unclosed YAML frontmatter in evaluation file: {path}", - {"path": str(path)}, - ) - raw = text[len(FRONTMATTER_MARKER) + 1 : end] - body = text[end + len(FRONTMATTER_MARKER) + 2 :] + raw, body = _extract_frontmatter_block(text, path) data = yaml.safe_load(raw) if not isinstance(data, dict): raise InfospaceError( @@ -158,9 +144,105 @@ def _read_frontmatter_markdown(path: Path) -> tuple[dict[str, Any], str]: f"Expected mapping frontmatter in evaluation file: {path}", {"path": str(path)}, ) + _normalise_scores(data) return data, body +def _normalise_scores(data: dict[str, Any]) -> None: + """Normalise score shapes emitted by various LLMs into the canonical + list-of-{name, value} form the rest of the pipeline expects. + + Handles three variants beyond the canonical: + + - mapping form: ``scores: {groundedness: 5, lesson_clarity: 4}`` + - list of single-key dicts: ``[{groundedness: 4}, {lesson_clarity: 3}]`` + - list of canonical dicts (left as-is) + """ + scores = data.get("scores") + if isinstance(scores, dict): + data["scores"] = [ + {"name": str(name), "value": _coerce_score(value)} + for name, value in scores.items() + ] + elif isinstance(scores, list): + normalised: list[dict[str, Any]] = [] + for item in scores: + if not isinstance(item, dict): + continue + if "name" in item and "value" in item: + normalised.append(item) + elif len(item) == 1: + (name, value), = item.items() + normalised.append({"name": str(name), "value": _coerce_score(value)}) + else: + normalised.append(item) + data["scores"] = normalised + + +def _coerce_score(value: Any) -> float: + try: + return float(value) + except (TypeError, ValueError): + return 0.0 + + +def _extract_frontmatter_block(text: str, path: Path) -> tuple[str, str]: + """Pull a YAML frontmatter block out of an evaluation file. + + Tolerates several shapes commonly produced by LLMs: + + - the canonical ``---``-delimited block at the start of the file + - a ``` ```yaml ... ``` `` code fence at the start of the file + - a ``` ```markdown ... ``` `` outer fence wrapping ``---`` frontmatter + """ + stripped_text = text.lstrip("\n") + # Strip an outer ```markdown / ```md fence if present and recurse on its + # body so any ``---`` frontmatter inside still gets recognised. + for outer_marker in ("```markdown\n", "```md\n"): + if stripped_text.startswith(outer_marker): + inner_start = len(outer_marker) + closing_idx = stripped_text.rfind("```") + if closing_idx <= inner_start: + break + inner = stripped_text[inner_start:closing_idx].rstrip() + return _extract_frontmatter_block(inner, path) + + if stripped_text.startswith(f"{FRONTMATTER_MARKER}\n"): + text = stripped_text + end = text.find(f"\n{FRONTMATTER_MARKER}\n", len(FRONTMATTER_MARKER) + 1) + if end == -1: + # Also accept a closing fence at EOF without a trailing newline. + if text.rstrip().endswith(FRONTMATTER_MARKER): + end = text.rstrip().rfind(FRONTMATTER_MARKER) - 1 + else: + raise InfospaceError( + "invalid_evaluation_file", + f"Unclosed YAML frontmatter in evaluation file: {path}", + {"path": str(path)}, + ) + raw = text[len(FRONTMATTER_MARKER) + 1 : end] + body = text[end + len(FRONTMATTER_MARKER) + 2 :] + return raw, body + if stripped_text.startswith("```yaml") or stripped_text.startswith("```yml"): + fence_start = stripped_text.find("```") + content_start = stripped_text.find("\n", fence_start) + 1 + fence_end = stripped_text.find("\n```", content_start) + if fence_end == -1: + raise InfospaceError( + "invalid_evaluation_file", + f"Unclosed YAML code fence in evaluation file: {path}", + {"path": str(path)}, + ) + raw = stripped_text[content_start:fence_end] + body = stripped_text[fence_end + len("\n```") :] + return raw, body.lstrip("\n") + raise InfospaceError( + "invalid_evaluation_file", + f"Missing YAML frontmatter in evaluation file: {path}", + {"path": str(path)}, + ) + + def _parse_rationales(body: str) -> dict[str, str]: rationales: dict[str, str] = {} current_name: str | None = None diff --git a/src/infospace_bench/lifecycle.py b/src/infospace_bench/lifecycle.py index 6a40d93..88fc8da 100644 --- a/src/infospace_bench/lifecycle.py +++ b/src/infospace_bench/lifecycle.py @@ -219,18 +219,37 @@ def _read_yaml(path: Path) -> dict[str, Any]: def _relative_to_root(root: Path, path: Path | str) -> str: + """Return ``path`` relative to ``root``, accepting either call shape. + + Callers pass either a fully-resolved ``root / sub`` style path or a + bare ``sub`` path that should be interpreted relative to ``root``. + With a relative ``root`` the old single-interpretation logic produced + a doubled path (e.g. ``infospaces/foo/infospaces/foo/...``) because it + re-prepended ``root`` to a path that was already under ``root`` when + resolved from CWD. The fix tries the CWD interpretation first and only + falls back to root-prefixing when the CWD interpretation doesn't land + under ``root``. + """ raw = Path(path) - target = raw if raw.is_absolute() else root / raw root_resolved = root.resolve() - target_resolved = target.resolve() - try: - return str(target_resolved.relative_to(root_resolved)) - except ValueError as exc: - raise InfospaceError( - "artifact_path_escapes_infospace", - f"Artifact path escapes infospace root: {path}", - {"root": str(root), "path": str(path)}, - ) from exc + if raw.is_absolute(): + candidates = [raw.resolve()] + else: + cwd_candidate = raw.resolve() + joined_candidate = (root / raw).resolve() + candidates = [cwd_candidate] + if joined_candidate != cwd_candidate: + candidates.append(joined_candidate) + for candidate in candidates: + try: + return str(candidate.relative_to(root_resolved)) + except ValueError: + continue + raise InfospaceError( + "artifact_path_escapes_infospace", + f"Artifact path escapes infospace root: {path}", + {"root": str(root), "path": str(path)}, + ) def _write_yaml(path: Path, data: dict[str, Any]) -> None: