session-memory: denoise error fingerprints (WP-0006 follow-up)

Tighten _is_failed: exclude successful hub JSON responses (payloads with no
top-level error key) and file-read snapshots (numbered `cat -n` source lines)
that were polluting error_snippets. A JSON verdict now classifies error vs.
success payloads directly. Cuts distinct fingerprints 444 -> 269 (~40%) over the
real corpus with the top errors unchanged. Assessment caveat updated. 5 new
tests; suite 102/102.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-07 13:39:08 +02:00
parent 7cce276d32
commit 1b6081cd88
3 changed files with 80 additions and 7 deletions

View File

@@ -12,6 +12,7 @@ belongs to the Detect phase (PRD §6.2).
from __future__ import annotations
import collections
import json
import re
from typing import Any
@@ -22,6 +23,12 @@ _FAIL_HINTS = ("error", "failed", "exception", "traceback", "fatal", "non-zero")
# Substrings suggesting a clean test pass.
_PASS_HINTS = ("passed", "0 failed", "ok", "success")
# A line that is numbered source content from a Read result (`cat -n` style),
# e.g. "229\t raise InfospaceError(" — code text, never a runtime error.
# The pattern accepts optional leading whitespace, then digits, then a tab.
_NUMBERED_LINE_RE = re.compile(r"^\s*\d+\t")
# Top-level keys that mark a JSON tool-result as an actual error (vs. success).
# Consulted by _json_verdict when the parsed payload is a JSON object.
_JSON_ERROR_KEYS = ("error", "errors", "detail")
# Normalization patterns so the same error collapses to one fingerprint
# regardless of paths / ids / counts (WP-0006 T01).
# 8-4-4-4-12 hex groups (UUID text form), case-insensitive, on word boundaries.
_UUID_RE = re.compile(r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b", re.I)
@@ -195,12 +202,48 @@ def _error_body(event: SessionEvent, blobs: dict) -> str:
return event.summary or ""
def _looks_like_file_read(body: str) -> bool:
    """Report whether *body* reads like a numbered file dump rather than an error.

    Blank (whitespace-only) lines are ignored. The body qualifies when the
    number of lines matching ``_NUMBERED_LINE_RE`` reaches the larger of 3 and
    half the non-blank line count (integer division).
    """
    total = 0
    hits = 0
    for line in body.splitlines():
        if not line.strip():
            continue  # blank lines count neither for nor against
        total += 1
        if _NUMBERED_LINE_RE.match(line):
            hits += 1
    if total == 0:
        return False
    # The floor of 3 keeps one or two stray numbered lines from qualifying.
    return hits >= max(3, total // 2)
def _json_verdict(body: str):
    """Classify a JSON tool-result body: 'error', 'success', or None (not JSON).

    Hub MCP successes look like ``{"result": "..."}`` and mention 'error' deep
    inside summaries but are not failures ('success'). A JSON object carrying a
    top-level error key with a non-empty value (``{"detail": ...}`` /
    ``{"error": ...}``) is 'error'. An error key whose value is null, false,
    zero or empty (``{"error": null}``, ``{"errors": []}``) does not signal a
    failure and is classified as 'success'. Any other valid JSON (including
    arrays) is 'success'.

    Returns:
        ``"error"``, ``"success"``, or ``None`` when *body* is not parseable
        JSON, so the caller can fall back to the plain fail-hint heuristic.
    """
    text = body.strip()
    # Cheap pre-filter: only objects and arrays are treated as JSON payloads.
    if not text or text[0] not in "{[":
        return None
    try:
        payload = json.loads(text)
    except (ValueError, TypeError, RecursionError):
        # RecursionError: pathologically nested input is treated as "not JSON"
        # instead of crashing the scan.
        return None
    if isinstance(payload, dict) and any(payload.get(key) for key in _JSON_ERROR_KEYS):
        return "error"
    return "success"
def _is_failed(event: SessionEvent, blobs: dict) -> bool:
    """Decide whether *event* represents a failure.

    Events of kind ``"error"`` always count as failed. Events of kind
    ``"tool_result"`` are judged by their body (from ``_error_body``):

    1. an empty or whitespace-only body is not a failure;
    2. a body that looks like a numbered file read is not a failure;
    3. a JSON body is judged by ``_json_verdict`` alone;
    4. anything else is a failure when it contains one of ``_FAIL_HINTS``
       (case-insensitive substring match).

    Every other event kind is not a failure.
    """
    if event.kind == "error":
        return True
    if event.kind != "tool_result":
        return False

    body = _error_body(event, blobs)
    if not body.strip():
        return False
    if _looks_like_file_read(body):
        return False

    verdict = _json_verdict(body)
    if verdict is not None:
        # Structured payloads are classified directly; a hint word buried in a
        # successful JSON response must not flag it as failed.
        return verdict == "error"

    # Lower-case once rather than once per hint.
    lowered = body.lower()
    return any(hint in lowered for hint in _FAIL_HINTS)