generated from coulomb/repo-seed
session-memory: denoise error fingerprints (WP-0006 follow-up)
Tighten _is_failed: exclude successful hub JSON responses (top-level no-error payloads) and file-read snapshots (numbered cat -n source lines) that were polluting error_snippets. JSON verdict classifies error vs success payloads directly. Cuts distinct fingerprints 444 -> 269 (~40%) over the real corpus with the top errors unchanged. Assessment caveat updated. 5 new tests; suite 102/102. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -120,11 +120,14 @@ Reading:
|
||||
in 3 sessions each — corroborates the plumbing-overhead story and the live MCP
|
||||
flakiness seen during this work (REST fallback used).
|
||||
|
||||
**Caveat — fingerprint noise:** the fail-hint heuristic also catches non-failures
|
||||
(successful hub JSON responses, source lines containing `raise …Error`, linter
|
||||
"N errors" summaries). The *top* fingerprints above are real; a future refinement
|
||||
should tighten `_is_failed` (e.g. skip valid-JSON success payloads and code-read
|
||||
snapshots) before trusting the long tail.
|
||||
**Fingerprint noise — mostly handled.** `_is_failed` now excludes successful hub
|
||||
JSON responses (top-level no-error payloads) and file-read snapshots (numbered
|
||||
`cat -n` source lines), which cut distinct fingerprints **444 → 269 (~40 %)**
|
||||
without touching the top entries. Residual low-value items remain in the long tail
|
||||
(bare structural lines like `{`, linter "N errors" summaries); the *top*
|
||||
fingerprints are real. Note several entries (`MCP error -32602`,
|
||||
`update_task_status 'title'`) reflect the State Hub MCP instability hit live during
|
||||
this work — genuine, if self-referential, friction.
|
||||
|
||||
## What this assessment still can't see
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ belongs to the Detect phase (PRD §6.2).
|
||||
from __future__ import annotations
|
||||
|
||||
import collections
|
||||
import json
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
@@ -22,6 +23,12 @@ _FAIL_HINTS = ("error", "failed", "exception", "traceback", "fatal", "non-zero")
|
||||
# Substrings suggesting a clean test pass.
|
||||
_PASS_HINTS = ("passed", "0 failed", "ok", "success")
|
||||
|
||||
# A line that is numbered source content from a Read result (`cat -n` style),
|
||||
# e.g. "229\t raise InfospaceError(" — code text, never a runtime error.
|
||||
_NUMBERED_LINE_RE = re.compile(r"^\s*\d+\t")
|
||||
# Top-level keys that mark a JSON tool-result as an actual error (vs. success).
|
||||
_JSON_ERROR_KEYS = ("error", "errors", "detail")
|
||||
|
||||
# Normalization patterns so the same error collapses to one fingerprint
|
||||
# regardless of paths / ids / counts (WP-0006 T01).
|
||||
_UUID_RE = re.compile(r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b", re.I)
|
||||
@@ -195,12 +202,48 @@ def _error_body(event: SessionEvent, blobs: dict) -> str:
|
||||
return event.summary or ""
|
||||
|
||||
|
||||
def _looks_like_file_read(body: str) -> bool:
|
||||
"""True if the body is mostly numbered source lines (a Read result), not an error."""
|
||||
lines = [ln for ln in body.splitlines() if ln.strip()]
|
||||
if not lines:
|
||||
return False
|
||||
numbered = sum(1 for ln in lines if _NUMBERED_LINE_RE.match(ln))
|
||||
return numbered >= max(3, len(lines) // 2)
|
||||
|
||||
|
||||
def _json_verdict(body: str):
|
||||
"""Classify a JSON tool-result body: 'error', 'success', or None (not JSON).
|
||||
|
||||
Hub MCP successes look like ``{"result": "..."}`` and mention 'error' deep
|
||||
inside summaries but are not failures ('success'). A payload with a top-level
|
||||
error key (``{"detail": ...}`` / ``{"error": ...}``) is 'error'. Non-JSON text
|
||||
returns None so the plain fail-hint heuristic still applies.
|
||||
"""
|
||||
s = body.strip()
|
||||
if not s or s[0] not in "{[":
|
||||
return None
|
||||
try:
|
||||
obj = json.loads(s)
|
||||
except (ValueError, TypeError):
|
||||
return None
|
||||
if isinstance(obj, dict) and any(k in obj for k in _JSON_ERROR_KEYS):
|
||||
return "error"
|
||||
return "success"
|
||||
|
||||
|
||||
def _is_failed(event: SessionEvent, blobs: dict) -> bool:
|
||||
if event.kind == "error":
|
||||
return True
|
||||
if event.kind == "tool_result":
|
||||
body = _error_body(event, blobs).lower()
|
||||
return bool(body) and any(h in body for h in _FAIL_HINTS)
|
||||
body = _error_body(event, blobs)
|
||||
if not body.strip():
|
||||
return False
|
||||
if _looks_like_file_read(body):
|
||||
return False
|
||||
verdict = _json_verdict(body)
|
||||
if verdict is not None:
|
||||
return verdict == "error"
|
||||
return any(h in body.lower() for h in _FAIL_HINTS)
|
||||
return False
|
||||
|
||||
|
||||
|
||||
@@ -59,6 +59,33 @@ def test_clean_tool_result_not_mined():
|
||||
assert _error_snippets(events, blobs) == []
|
||||
|
||||
|
||||
def test_success_json_not_mined():
|
||||
# a hub MCP success payload mentioning 'error' deep inside is NOT a failure
|
||||
blobs = {"b1": '{"result": "{\\"domain\\": \\"custodian\\", \\"note\\": \\"no errors\\"}"}'}
|
||||
events = [_ev(0, "tool_result", tool="mcp__state-hub__get_domain_summary", payload_ref="b1")]
|
||||
assert _error_snippets(events, blobs) == []
|
||||
|
||||
|
||||
def test_error_json_still_mined():
|
||||
blobs = {"b1": '{"detail": "Invalid request parameters"}'}
|
||||
events = [_ev(0, "tool_result", tool="Bash", payload_ref="b1")]
|
||||
snips = _error_snippets(events, blobs)
|
||||
assert len(snips) == 1
|
||||
|
||||
|
||||
def test_plain_mcp_error_still_mined():
|
||||
blobs = {"b1": "MCP error -32602: Invalid request parameters"}
|
||||
events = [_ev(0, "tool_result", tool="Bash", payload_ref="b1")]
|
||||
assert len(_error_snippets(events, blobs)) == 1
|
||||
|
||||
|
||||
def test_file_read_snapshot_not_mined():
|
||||
# a Read result of source code containing 'raise ...Error' is not a runtime error
|
||||
blobs = {"b1": "227\t def f():\n228\t x = 1\n229\t raise InfospaceError()\n"}
|
||||
events = [_ev(0, "tool_result", tool="Read", payload_ref="b1")]
|
||||
assert _error_snippets(events, blobs) == []
|
||||
|
||||
|
||||
def test_build_digest_includes_error_snippets_and_v2():
|
||||
s = Session(session_uid="claude:s", flavor="claude", native_session_id="s", repo="r")
|
||||
events = [_ev(0, "user_msg"), _ev(1, "error", payload_ref="b1"), _ev(2, "assistant_msg")]
|
||||
|
||||
Reference in New Issue
Block a user