diff --git a/docs/ASSESSMENT-infra-friction.md b/docs/ASSESSMENT-infra-friction.md index 9a054b9..b2ba61c 100644 --- a/docs/ASSESSMENT-infra-friction.md +++ b/docs/ASSESSMENT-infra-friction.md @@ -120,11 +120,14 @@ Reading: in 3 sessions each — corroborates the plumbing-overhead story and the live MCP flakiness seen during this work (REST fallback used). -**Caveat — fingerprint noise:** the fail-hint heuristic also catches non-failures -(successful hub JSON responses, source lines containing `raise …Error`, linter -"N errors" summaries). The *top* fingerprints above are real; a future refinement -should tighten `_is_failed` (e.g. skip valid-JSON success payloads and code-read -snapshots) before trusting the long tail. +**Fingerprint noise — mostly handled.** `_is_failed` now excludes successful hub +JSON responses (top-level no-error payloads) and file-read snapshots (numbered +`cat -n` source lines), which cut distinct fingerprints **444 → 269 (~40 %)** +without touching the top entries. Residual low-value items remain in the long tail +(bare structural lines like `{`, linter "N errors" summaries); the *top* +fingerprints are real. Note several entries (`MCP error -32602`, +`update_task_status 'title'`) reflect the State Hub MCP instability hit live during +this work — genuine, if self-referential, friction. ## What this assessment still can't see diff --git a/session_memory/core/digest.py b/session_memory/core/digest.py index 67a6065..ef45cb2 100644 --- a/session_memory/core/digest.py +++ b/session_memory/core/digest.py @@ -12,6 +12,7 @@ belongs to the Detect phase (PRD §6.2). from __future__ import annotations import collections +import json import re from typing import Any @@ -22,6 +23,12 @@ _FAIL_HINTS = ("error", "failed", "exception", "traceback", "fatal", "non-zero") # Substrings suggesting a clean test pass. _PASS_HINTS = ("passed", "0 failed", "ok", "success") +# A line that is numbered source content from a Read result (`cat -n` style), +# e.g. "229\t raise InfospaceError(" — code text, never a runtime error. +_NUMBERED_LINE_RE = re.compile(r"^\s*\d+\t") +# Top-level keys that mark a JSON tool-result as an actual error (vs. success). +_JSON_ERROR_KEYS = ("error", "errors", "detail") + # Normalization patterns so the same error collapses to one fingerprint # regardless of paths / ids / counts (WP-0006 T01). _UUID_RE = re.compile(r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b", re.I) @@ -195,12 +202,48 @@ def _error_body(event: SessionEvent, blobs: dict) -> str: return event.summary or "" +def _looks_like_file_read(body: str) -> bool: + """True if the body is mostly numbered source lines (a Read result), not an error.""" + lines = [ln for ln in body.splitlines() if ln.strip()] + if not lines: + return False + numbered = sum(1 for ln in lines if _NUMBERED_LINE_RE.match(ln)) + return numbered >= max(3, len(lines) // 2) + + +def _json_verdict(body: str): + """Classify a JSON tool-result body: 'error', 'success', or None (not JSON). + + Hub MCP successes look like ``{"result": "..."}`` and mention 'error' deep + inside summaries but are not failures ('success'). A payload with a top-level + error key (``{"detail": ...}`` / ``{"error": ...}``) is 'error'. Non-JSON text + returns None so the plain fail-hint heuristic still applies. + """ + s = body.strip() + if not s or s[0] not in "{[": + return None + try: + obj = json.loads(s) + except (ValueError, TypeError): + return None + if isinstance(obj, dict) and any(k in obj for k in _JSON_ERROR_KEYS): + return "error" + return "success" + + def _is_failed(event: SessionEvent, blobs: dict) -> bool: if event.kind == "error": return True if event.kind == "tool_result": - body = _error_body(event, blobs).lower() - return bool(body) and any(h in body for h in _FAIL_HINTS) + body = _error_body(event, blobs) + if not body.strip(): + return False + if _looks_like_file_read(body): + return False + verdict = _json_verdict(body) + if verdict is not None: + return verdict == "error" + return any(h in body.lower() for h in _FAIL_HINTS) return False diff --git a/tests/test_digest_errors.py b/tests/test_digest_errors.py index 78e0440..e1effb7 100644 --- a/tests/test_digest_errors.py +++ b/tests/test_digest_errors.py @@ -59,6 +59,33 @@ def test_clean_tool_result_not_mined(): assert _error_snippets(events, blobs) == [] +def test_success_json_not_mined(): + # a hub MCP success payload mentioning 'error' deep inside is NOT a failure + blobs = {"b1": '{"result": "{\\"domain\\": \\"custodian\\", \\"note\\": \\"no errors\\"}"}'} + events = [_ev(0, "tool_result", tool="mcp__state-hub__get_domain_summary", payload_ref="b1")] + assert _error_snippets(events, blobs) == [] + + +def test_error_json_still_mined(): + blobs = {"b1": '{"detail": "Invalid request parameters"}'} + events = [_ev(0, "tool_result", tool="Bash", payload_ref="b1")] + snips = _error_snippets(events, blobs) + assert len(snips) == 1 + + +def test_plain_mcp_error_still_mined(): + blobs = {"b1": "MCP error -32602: Invalid request parameters"} + events = [_ev(0, "tool_result", tool="Bash", payload_ref="b1")] + assert len(_error_snippets(events, blobs)) == 1 + + +def test_file_read_snapshot_not_mined(): + # a Read result of source code containing 'raise ...Error' is not a runtime error + blobs = {"b1": "227\t def f():\n228\t x = 1\n229\t raise InfospaceError()\n"} + events = [_ev(0, "tool_result", tool="Read", payload_ref="b1")] + assert _error_snippets(events, blobs) == [] + + def test_build_digest_includes_error_snippets_and_v2(): s = Session(session_uid="claude:s", flavor="claude", native_session_id="s", repo="r") events = [_ev(0, "user_msg"), _ev(1, "error", payload_ref="b1"), _ev(2, "assistant_msg")]