generated from coulomb/repo-seed
session-memory: session-quality filter (WP-0005 T01)
detect/quality.py: is_real_coding_session drops health-checks / smoke-tests / interrupted / trivially-short sessions (event floor, repo present, substantive tool activity, non-trivial prompt). Wired into run_detect so signals only form over real sessions — fixes the abandoned false-positive. [detect.quality] knobs; existing detect/curate fixtures made realistic. 8 new tests; suite 80/80. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -31,6 +31,13 @@ enabled = true
|
|||||||
root = "~/.grok/sessions"
|
root = "~/.grok/sessions"
|
||||||
glob = "*/*/chat_history.jsonl"
|
glob = "*/*/chat_history.jsonl"
|
||||||
|
|
||||||
|
# Detect phase (AGENTIC-WP-0005): quality filter — drop non-coding/trivial sessions
|
||||||
|
# before signals form, so health-checks don't mint false-positive patterns.
|
||||||
|
[detect.quality]
|
||||||
|
min_events = 20 # below this many events, not a real coding session
|
||||||
|
min_substantive = 3 # require >= this many substantive (edit/read/shell) tool calls
|
||||||
|
min_prompt_len = 25 # first prompt shorter than this is treated as trivial
|
||||||
|
|
||||||
# Curate phase (AGENTIC-WP-0004): catalog location + promotion evidence bar.
|
# Curate phase (AGENTIC-WP-0004): catalog location + promotion evidence bar.
|
||||||
[curate]
|
[curate]
|
||||||
catalog_dir = "session_memory/catalog" # files-first Pattern Catalog (committed)
|
catalog_dir = "session_memory/catalog" # files-first Pattern Catalog (committed)
|
||||||
|
|||||||
@@ -16,13 +16,14 @@ import os
|
|||||||
from ..core.store import Store
|
from ..core.store import Store
|
||||||
from ..ingest import _expand, load_config
|
from ..ingest import _expand, load_config
|
||||||
from .cluster import cluster
|
from .cluster import cluster
|
||||||
|
from .quality import filter_real, quality_config
|
||||||
from .signals import extract_signals
|
from .signals import extract_signals
|
||||||
|
|
||||||
|
|
||||||
def run_detect(config: dict, *, min_frequency: int = 2) -> list[dict]:
|
def run_detect(config: dict, *, min_frequency: int = 2) -> list[dict]:
|
||||||
store_cfg = config.get("store", {})
|
store_cfg = config.get("store", {})
|
||||||
store = Store(_expand(store_cfg["db_path"]), _expand(store_cfg["blob_dir"]))
|
store = Store(_expand(store_cfg["db_path"]), _expand(store_cfg["blob_dir"]))
|
||||||
digests = store.list_digests()
|
digests = filter_real(store.list_digests(), quality_config(config))
|
||||||
signals = extract_signals(digests)
|
signals = extract_signals(digests)
|
||||||
patterns = [p.to_dict() for p in cluster(signals, min_frequency=min_frequency)]
|
patterns = [p.to_dict() for p in cluster(signals, min_frequency=min_frequency)]
|
||||||
store.save_patterns(patterns)
|
store.save_patterns(patterns)
|
||||||
@@ -56,7 +57,8 @@ def main(argv=None) -> int:
|
|||||||
|
|
||||||
config = load_config(args.config)
|
config = load_config(args.config)
|
||||||
store_cfg = config.get("store", {})
|
store_cfg = config.get("store", {})
|
||||||
n = len(Store(_expand(store_cfg["db_path"]), _expand(store_cfg["blob_dir"])).list_digests())
|
all_digests = Store(_expand(store_cfg["db_path"]), _expand(store_cfg["blob_dir"])).list_digests()
|
||||||
|
n = len(filter_real(all_digests, quality_config(config)))
|
||||||
patterns = run_detect(config, min_frequency=args.min_frequency)
|
patterns = run_detect(config, min_frequency=args.min_frequency)
|
||||||
|
|
||||||
if args.json:
|
if args.json:
|
||||||
|
|||||||
75
session_memory/detect/quality.py
Normal file
75
session_memory/detect/quality.py
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
"""Session-quality filter (T01).
|
||||||
|
|
||||||
|
The capture layer ingests *every* session it finds — including API health-checks,
|
||||||
|
smoke-tests, and interrupted runs (e.g. ``llm-connect`` firing "Say hello in one
|
||||||
|
word", or a transcript that is just ``[Request interrupted by user]``). These are
|
||||||
|
not real coding work, but the outcome heuristic labels the short ones ``abandoned``
|
||||||
|
and the clusterer then mints false-positive "problem" patterns from them.
|
||||||
|
|
||||||
|
:func:`is_real_coding_session` gates those out so Detect signals/clusters form only
|
||||||
|
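over genuine coding sessions. It is intentionally conservative about what it
|
||||||
|
admits: a session counts as real only if it passes every gate — a repo is present,
|
||||||
|
the event-count and substantive-tool-call floors are met, and the first prompt is
|
||||||
|
neither shorter than the minimum length nor prefixed by a known trivial marker.
|
||||||
|
Thresholds come from ``[detect.quality]`` in ``config.toml``.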
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
# Prompt prefixes/markers that indicate a non-coding or interrupted session.
|
||||||
|
_TRIVIAL_PROMPTS = (
|
||||||
|
"say hello", "hello", "[request interrupted", "return only this json",
|
||||||
|
"ping", "ok", "<system-reminder>",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Tool buckets that count as "substantive" coding activity.
|
||||||
|
_SUBSTANTIVE_TOOLS = (
|
||||||
|
"Edit", "Write", "Read", "Bash", "search_replace", "write", "read_file",
|
||||||
|
"run_terminal_command", "grep", "Grep", "glob", "Glob", "NotebookEdit",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class QualityConfig:
    """Thresholds used by the session-quality filter.

    The defaults declared here are the single source of truth:
    :func:`quality_config` falls back to them for any knob that is not set.
    """

    # Sessions with fewer events than this are not real coding sessions.
    min_events: int = 20
    # At least this many substantive tool calls are required.
    min_substantive: int = 3
    # A first prompt shorter than this many characters is treated as trivial.
    min_prompt_len: int = 25


def quality_config(config: Optional[dict] = None) -> QualityConfig:
    """Build a :class:`QualityConfig` from the ``[detect.quality]`` config table.

    Args:
        config: Parsed configuration mapping, or ``None``. A missing, empty or
            ``None`` ``detect`` / ``quality`` entry yields the defaults.

    Returns:
        A ``QualityConfig`` whose fields take the configured value when the
        knob is present and the dataclass default otherwise.
    """
    # ``or {}`` rather than a ``.get`` default so an explicit ``None`` section
    # does not raise AttributeError on the next lookup.
    detect = (config or {}).get("detect") or {}
    knobs = detect.get("quality") or {}

    # Read fallbacks from the dataclass so the defaults live in one place.
    defaults = QualityConfig()
    return QualityConfig(
        min_events=knobs.get("min_events", defaults.min_events),
        min_substantive=knobs.get("min_substantive", defaults.min_substantive),
        min_prompt_len=knobs.get("min_prompt_len", defaults.min_prompt_len),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _substantive_calls(digest: dict) -> int:
    """Total the ``tool_histogram`` counts of tools listed in ``_SUBSTANTIVE_TOOLS``.

    A missing or empty histogram yields 0.
    """
    histogram = digest.get("tool_histogram") or {}
    total = 0
    for tool, count in histogram.items():
        if tool in _SUBSTANTIVE_TOOLS:
            total += count
    return total
|
||||||
|
|
||||||
|
|
||||||
|
def is_real_coding_session(digest: dict, config: Optional[QualityConfig] = None) -> bool:
    """Return True when *digest* passes every session-quality gate.

    Gates, checked in order (the first failure returns False):

    1. ``repo`` is present and truthy;
    2. ``event_count`` is at least ``min_events``;
    3. the substantive tool-call total is at least ``min_substantive``;
    4. the stripped, lower-cased ``first_prompt`` has at least
       ``min_prompt_len`` characters;
    5. that prompt does not start with any ``_TRIVIAL_PROMPTS`` marker.

    Args:
        digest: Session digest mapping. A missing or ``None`` ``event_count``
            counts as 0 and a missing or ``None`` ``first_prompt`` as "".
        config: Thresholds to apply; ``QualityConfig()`` when omitted.
    """
    cfg = config or QualityConfig()

    if not digest.get("repo"):
        return False
    # ``or 0`` rather than a ``.get`` default: a key that is present but None
    # would otherwise raise TypeError on the comparison.
    if (digest.get("event_count") or 0) < cfg.min_events:
        return False
    if _substantive_calls(digest) < cfg.min_substantive:
        return False

    prompt = (digest.get("first_prompt") or "").strip().lower()
    if len(prompt) < cfg.min_prompt_len:
        return False

    # str.startswith accepts a tuple of prefixes; markers are lower-case to
    # match the lower-cased prompt.
    # NOTE(review): this is a plain prefix match, so short markers such as
    # "ok" or "ping" also match longer words ("okta ...", "pinging ...") —
    # confirm that is intended.
    return not prompt.startswith(_TRIVIAL_PROMPTS)
|
||||||
|
|
||||||
|
|
||||||
|
def filter_real(digests: list[dict], config: Optional[QualityConfig] = None) -> list[dict]:
    """Return, in input order, the digests accepted by ``is_real_coding_session``.

    ``config`` defaults to ``QualityConfig()`` and is resolved once for the
    whole batch.
    """
    thresholds = config or QualityConfig()
    kept = []
    for digest in digests:
        if is_real_coding_session(digest, thresholds):
            kept.append(digest)
    return kept
|
||||||
@@ -16,6 +16,9 @@ def _digest(uid, flavor, repo, **markers):
|
|||||||
"cost": {"input_tokens": 10, "output_tokens": 1},
|
"cost": {"input_tokens": 10, "output_tokens": 1},
|
||||||
"markers": {"errors": markers.get("errors", 0), "retries": markers.get("retries", 0),
|
"markers": {"errors": markers.get("errors", 0), "retries": markers.get("retries", 0),
|
||||||
"test_runs": 0, "edits": 0, "human_interventions": 0},
|
"test_runs": 0, "edits": 0, "human_interventions": 0},
|
||||||
|
# real coding session per the quality filter (WP-0005 T01)
|
||||||
|
"event_count": 40, "first_prompt": "Fix the failing build and retry the suite",
|
||||||
|
"tool_histogram": {"Bash": 20, "Edit": 12, "Read": 8},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -15,6 +15,9 @@ def _digest(uid, flavor, repo, **markers):
|
|||||||
"cost": {"input_tokens": 10, "output_tokens": 1},
|
"cost": {"input_tokens": 10, "output_tokens": 1},
|
||||||
"markers": {"errors": markers.get("errors", 0), "retries": markers.get("retries", 0),
|
"markers": {"errors": markers.get("errors", 0), "retries": markers.get("retries", 0),
|
||||||
"test_runs": 0, "edits": 0, "human_interventions": 0},
|
"test_runs": 0, "edits": 0, "human_interventions": 0},
|
||||||
|
# fields the quality filter (WP-0005 T01) checks — real coding session
|
||||||
|
"event_count": 40, "first_prompt": "Fix the failing build and retry the suite",
|
||||||
|
"tool_histogram": {"Bash": 20, "Edit": 12, "Read": 8},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
61
tests/test_detect_quality.py
Normal file
61
tests/test_detect_quality.py
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
"""Session-quality filter tests (T01)."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
from session_memory.detect.quality import ( # noqa: E402
|
||||||
|
QualityConfig,
|
||||||
|
filter_real,
|
||||||
|
is_real_coding_session,
|
||||||
|
quality_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _digest(repo="agentic-resources", events=60, prompt="Implement the curate entrypoint",
|
||||||
|
tools=None):
|
||||||
|
return {
|
||||||
|
"session_uid": "claude:x", "flavor": "claude", "repo": repo,
|
||||||
|
"event_count": events, "first_prompt": prompt,
|
||||||
|
"tool_histogram": tools if tools is not None else {"Bash": 20, "Edit": 15, "Read": 8},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_real_session_passes():
    """The default synthetic digest clears every gate and is kept."""
    real = _digest()
    assert is_real_coding_session(real) is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_healthcheck_prompt_dropped():
    """A "say hello" health-check is dropped, as captured and on its prompt alone."""
    prompt = "Say hello in one word."
    # Realistic health-check shape: a handful of events and no tool calls.
    assert is_real_coding_session(_digest(events=3, prompt=prompt, tools={})) is False
    # Default events/tools clear the other floors, so only the prompt gate
    # can reject this digest.
    assert is_real_coding_session(_digest(prompt=prompt)) is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_interrupted_dropped():
    """An interrupted transcript is dropped, as captured and on its prompt alone."""
    prompt = "[Request interrupted by user]"
    # Realistic interrupted shape: a single event and no tool calls.
    assert is_real_coding_session(_digest(events=1, prompt=prompt, tools={})) is False
    # The prompt is long enough to pass the length floor, so with default
    # events/tools only the trivial-prefix match can reject this digest.
    assert is_real_coding_session(_digest(prompt=prompt)) is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_too_short_dropped():
    """A digest below the default event floor is dropped."""
    few_events = _digest(events=5)
    assert is_real_coding_session(few_events) is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_repo_dropped():
    """A digest without a repo is dropped."""
    repoless = _digest(repo=None)
    assert is_real_coding_session(repoless) is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_substantive_tools_dropped():
    """Plenty of events made up only of plumbing calls is not real coding."""
    plumbing_only = {"mcp__state-hub__update_task_status": 40}
    assert is_real_coding_session(_digest(tools=plumbing_only)) is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_filter_real_keeps_only_real():
    """filter_real keeps exactly the real digest, not merely one digest."""
    real = _digest()
    trivial = _digest(events=3, prompt="hello", tools={})
    repoless = _digest(repo=None)
    # Compare contents rather than the count so the wrong survivor is caught.
    assert filter_real([real, trivial, repoless]) == [real]
|
||||||
|
|
||||||
|
|
||||||
|
def test_quality_config_from_toml():
    """Configured knobs override the defaults; unset knobs keep them."""
    cfg = quality_config({"detect": {"quality": {"min_events": 50}}})
    assert cfg.min_events == 50
    assert cfg.min_substantive == 3  # default preserved
    assert cfg.min_prompt_len == 25  # default preserved
    # With no config at all the loader must agree with the dataclass defaults.
    assert quality_config() == QualityConfig()
|
||||||
88
workplans/AGENTIC-WP-0005-detect-hardening.md
Normal file
88
workplans/AGENTIC-WP-0005-detect-hardening.md
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
---
|
||||||
|
id: AGENTIC-WP-0005
|
||||||
|
type: workplan
|
||||||
|
title: "Coding Session Memory — Detect Hardening (quality filter + infra signals)"
|
||||||
|
domain: helix_forge
|
||||||
|
repo: agentic-resources
|
||||||
|
status: ready
|
||||||
|
owner: codex
|
||||||
|
topic_slug: helix-forge
|
||||||
|
created: "2026-06-07"
|
||||||
|
updated: "2026-06-07"
|
||||||
|
state_hub_workstream_id: "d8b7b8d1-1d85-4d2a-8ccd-7b0366a9442d"
|
||||||
|
---
|
||||||
|
|
||||||
|
# Coding Session Memory — Detect Hardening
|
||||||
|
|
||||||
|
A focused hardening pass (call it Phase 1.5) so the Detect output is trustworthy
|
||||||
|
enough to drive an **infrastructure assessment**. Triggered by ad-hoc analysis of
|
||||||
|
the live store after Phase 2:
|
||||||
|
|
||||||
|
- Of **72 captured sessions, only 31 are real coding sessions**; the rest are
|
||||||
|
health-checks / smoke-tests / interrupted runs (mostly `llm-connect` *"Say hello
|
||||||
|
in one word"*). The `abandoned` outcome heuristic mislabels these, and Phase 2
|
||||||
|
cataloged a **false-positive** "cross-flavor abandoned" pattern as
|
||||||
|
`approved`/`distribution_ready`.
|
||||||
|
- All 31 real sessions read as `success`, so the current signal set
|
||||||
|
(outcome + markers + cost) surfaces almost no genuine friction.
|
||||||
|
- The already-captured `tool_histogram` tells the real story: **~17% of tool
|
||||||
|
activity in real sessions is State Hub MCP + task plumbing + `ToolSearch`
|
||||||
|
schema-loading**, concentrated to 40–70% in some sessions — but `signals.py`
|
||||||
|
never looks at it.
|
||||||
|
|
||||||
|
No new capture is needed — this is analysis the data already supports.
|
||||||
|
|
||||||
|
## Session-Quality Filter
|
||||||
|
|
||||||
|
```task
|
||||||
|
id: AGENTIC-WP-0005-T01
|
||||||
|
status: done
|
||||||
|
priority: high
|
||||||
|
state_hub_task_id: "9f8b4304-0a37-4f66-ad34-d93e12fba0d8"
|
||||||
|
```
|
||||||
|
|
||||||
|
Add `detect/quality.py` with `is_real_coding_session(digest)` that filters out
|
||||||
|
health-checks, smoke-tests, interrupted, and trivially-short sessions (event-count
|
||||||
|
floor, repo present, substantive edit/tool activity, not a single hello/interrupt
|
||||||
|
prompt). Wire it into the detect pipeline so signals/clusters only form over real
|
||||||
|
sessions — fixing the `abandoned` false-positive. Knobs under `[detect.quality]` in
|
||||||
|
`config.toml`. Unit-tested on synthetic trivial-vs-real digests.
|
||||||
|
|
||||||
|
## Infra-Overhead + Thrash Signals
|
||||||
|
|
||||||
|
```task
|
||||||
|
id: AGENTIC-WP-0005-T02
|
||||||
|
status: todo
|
||||||
|
priority: high
|
||||||
|
state_hub_task_id: "10d57b05-a731-4ece-bf45-f6a98ac77555"
|
||||||
|
```
|
||||||
|
|
||||||
|
Add `tool_histogram`-based extractors to `detect/signals.py`: a shared tool-bucket
|
||||||
|
helper (`shell` / `edit` / `read` / `statehub_mcp` / `task_mgmt` / `schema_load` /
|
||||||
|
`other`); `sig_infra_overhead` (PROBLEM when the statehub+task+schema share of tool
|
||||||
|
calls exceeds a threshold; magnitude = share; locus `infra_overhead`);
|
||||||
|
`sig_schema_thrash` (`ToolSearch` count over threshold; locus `schema_load`);
|
||||||
|
`sig_tool_thrash` (extreme single-tool repetition). Pure functions over digests;
|
||||||
|
thresholds configurable. Unit-tested.
|
||||||
|
|
||||||
|
## Re-run Live, Purge False Positives, Ranked Friction Report
|
||||||
|
|
||||||
|
```task
|
||||||
|
id: AGENTIC-WP-0005-T03
|
||||||
|
status: todo
|
||||||
|
priority: high
|
||||||
|
state_hub_task_id: "8b9d029a-60d0-4caf-af62-4fcc9c9a645c"
|
||||||
|
```
|
||||||
|
|
||||||
|
Re-run `ingest → detect` over the real local sessions with the filter + new
|
||||||
|
signals. Purge the false-positive catalog entries seeded in Phase 2 (the
|
||||||
|
health-check `abandoned` pattern) and re-curate so the catalog reflects real
|
||||||
|
friction. Produce a ranked **friction assessment** (`docs/ASSESSMENT-infra-friction.md`)
|
||||||
|
of the major infrastructure problems — quantified per repo/flavor, infra-overhead
|
||||||
|
share, schema-thrash — with recommendations (incl. the State Hub / MCP skill
|
||||||
|
hypothesis). After workplan file updates, notify the operator to run from
|
||||||
|
`~/state-hub`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make fix-consistency REPO=agentic-resources
|
||||||
|
```
|
||||||
Reference in New Issue
Block a user