diff --git a/session_memory/config.toml b/session_memory/config.toml index 5ea5256..d5464be 100644 --- a/session_memory/config.toml +++ b/session_memory/config.toml @@ -31,6 +31,13 @@ enabled = true root = "~/.grok/sessions" glob = "*/*/chat_history.jsonl" +# Detect phase (AGENTIC-WP-0005): quality filter — drop non-coding/trivial sessions +# before signals form, so health-checks don't mint false-positive patterns. +[detect.quality] +min_events = 20 # below this many events, not a real coding session +min_substantive = 3 # require >= this many substantive (edit/read/shell) tool calls +min_prompt_len = 25 # first prompt shorter than this is treated as trivial + # Curate phase (AGENTIC-WP-0004): catalog location + promotion evidence bar. [curate] catalog_dir = "session_memory/catalog" # files-first Pattern Catalog (committed) diff --git a/session_memory/detect/__main__.py b/session_memory/detect/__main__.py index 270a17c..90fa78d 100644 --- a/session_memory/detect/__main__.py +++ b/session_memory/detect/__main__.py @@ -16,13 +16,14 @@ import os from ..core.store import Store from ..ingest import _expand, load_config from .cluster import cluster +from .quality import filter_real, quality_config from .signals import extract_signals def run_detect(config: dict, *, min_frequency: int = 2) -> list[dict]: store_cfg = config.get("store", {}) store = Store(_expand(store_cfg["db_path"]), _expand(store_cfg["blob_dir"])) - digests = store.list_digests() + digests = filter_real(store.list_digests(), quality_config(config)) signals = extract_signals(digests) patterns = [p.to_dict() for p in cluster(signals, min_frequency=min_frequency)] store.save_patterns(patterns) @@ -56,7 +57,8 @@ def main(argv=None) -> int: config = load_config(args.config) store_cfg = config.get("store", {}) - n = len(Store(_expand(store_cfg["db_path"]), _expand(store_cfg["blob_dir"])).list_digests()) + all_digests = Store(_expand(store_cfg["db_path"]), _expand(store_cfg["blob_dir"])).list_digests() + n = 
"""Session-quality filter (T01).

The capture layer ingests *every* session it finds — including API health-checks,
smoke-tests, and interrupted runs (e.g. ``llm-connect`` firing "Say hello in one
word", or a transcript that is just ``[Request interrupted by user]``). These are
not real coding work, but the outcome heuristic labels the short ones ``abandoned``
and the clusterer then mints false-positive "problem" patterns from them.

:func:`is_real_coding_session` gates those out so Detect signals/clusters form only
over genuine coding sessions. It is intentionally conservative — a session counts
as real if it shows substantive activity, and is dropped only on clear trivial
markers. Thresholds come from ``[detect.quality]`` in ``config.toml``.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Optional

# Lower-cased prompt prefixes that indicate a non-coding or interrupted session.
# Never add "" here: every string starts with the empty string, so an empty
# marker would make the prefix test match (and drop) every session. Empty
# prompts are rejected explicitly in is_real_coding_session instead.
_TRIVIAL_PROMPTS = (
    "say hello", "hello", "[request interrupted", "return only this json",
    "ping", "ok",
)

# Tool names that count as "substantive" coding activity.
_SUBSTANTIVE_TOOLS = (
    "Edit", "Write", "Read", "Bash", "search_replace", "write", "read_file",
    "run_terminal_command", "grep", "Grep", "glob", "Glob", "NotebookEdit",
)


@dataclass
class QualityConfig:
    """Thresholds used by :func:`is_real_coding_session`."""

    min_events: int = 20        # below this, not a real coding session
    min_substantive: int = 3    # >= this many substantive tool calls required
    min_prompt_len: int = 25    # first prompt shorter than this is suspect


def quality_config(config: Optional[dict] = None) -> QualityConfig:
    """Build a :class:`QualityConfig` from a loaded config mapping.

    Reads the ``detect`` -> ``quality`` table of *config*. A missing (or empty)
    config, table, or key falls back to the :class:`QualityConfig` default, so
    the defaults live in exactly one place.
    """
    section = ((config or {}).get("detect") or {}).get("quality") or {}
    defaults = QualityConfig()
    return QualityConfig(
        min_events=section.get("min_events", defaults.min_events),
        min_substantive=section.get("min_substantive", defaults.min_substantive),
        min_prompt_len=section.get("min_prompt_len", defaults.min_prompt_len),
    )


def _substantive_calls(digest: dict) -> int:
    """Sum the ``tool_histogram`` counts whose tool name is substantive."""
    hist = digest.get("tool_histogram") or {}
    return sum(n for t, n in hist.items() if t in _SUBSTANTIVE_TOOLS)


def is_real_coding_session(digest: dict, config: Optional[QualityConfig] = None) -> bool:
    """Return True when *digest* looks like genuine coding work.

    A digest is rejected when any of these holds:

    * ``repo`` is missing or empty;
    * ``event_count`` is below ``min_events``;
    * fewer than ``min_substantive`` substantive tool calls were recorded;
    * ``first_prompt`` (stripped, lower-cased) is empty, shorter than
      ``min_prompt_len``, or starts with one of ``_TRIVIAL_PROMPTS``.

    *config* defaults to ``QualityConfig()``.
    """
    cfg = config or QualityConfig()

    if not digest.get("repo"):
        return False
    # `or 0` also covers an explicit None, which would raise on `<`.
    if (digest.get("event_count") or 0) < cfg.min_events:
        return False
    if _substantive_calls(digest) < cfg.min_substantive:
        return False

    prompt = (digest.get("first_prompt") or "").strip().lower()
    # Explicit emptiness check so an empty prompt is still dropped when
    # min_prompt_len is configured as 0.
    if not prompt or len(prompt) < cfg.min_prompt_len:
        return False
    # Skip empty markers defensively: "".startswith-style matches are always
    # true and would reject every session.
    if any(marker and prompt.startswith(marker) for marker in _TRIVIAL_PROMPTS):
        return False

    return True


def filter_real(digests: list[dict], config: Optional[QualityConfig] = None) -> list[dict]:
    """Return the digests that pass :func:`is_real_coding_session`, in order."""
    cfg = config or QualityConfig()
    return [d for d in digests if is_real_coding_session(d, cfg)]
"test_runs": 0, "edits": 0, "human_interventions": 0}, + # real coding session per the quality filter (WP-0005 T01) + "event_count": 40, "first_prompt": "Fix the failing build and retry the suite", + "tool_histogram": {"Bash": 20, "Edit": 12, "Read": 8}, } diff --git a/tests/test_detect_entrypoint.py b/tests/test_detect_entrypoint.py index 9cdc307..0323021 100644 --- a/tests/test_detect_entrypoint.py +++ b/tests/test_detect_entrypoint.py @@ -15,6 +15,9 @@ def _digest(uid, flavor, repo, **markers): "cost": {"input_tokens": 10, "output_tokens": 1}, "markers": {"errors": markers.get("errors", 0), "retries": markers.get("retries", 0), "test_runs": 0, "edits": 0, "human_interventions": 0}, + # fields the quality filter (WP-0005 T01) checks — real coding session + "event_count": 40, "first_prompt": "Fix the failing build and retry the suite", + "tool_histogram": {"Bash": 20, "Edit": 12, "Read": 8}, } diff --git a/tests/test_detect_quality.py b/tests/test_detect_quality.py new file mode 100644 index 0000000..a024cfd --- /dev/null +++ b/tests/test_detect_quality.py @@ -0,0 +1,61 @@ +"""Session-quality filter tests (T01).""" + +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from session_memory.detect.quality import ( # noqa: E402 + QualityConfig, + filter_real, + is_real_coding_session, + quality_config, +) + + +def _digest(repo="agentic-resources", events=60, prompt="Implement the curate entrypoint", + tools=None): + return { + "session_uid": "claude:x", "flavor": "claude", "repo": repo, + "event_count": events, "first_prompt": prompt, + "tool_histogram": tools if tools is not None else {"Bash": 20, "Edit": 15, "Read": 8}, + } + + +def test_real_session_passes(): + assert is_real_coding_session(_digest()) is True + + +def test_healthcheck_prompt_dropped(): + assert is_real_coding_session(_digest(events=3, prompt="Say hello in one word.", + tools={})) is False + + +def test_interrupted_dropped(): + assert 
is_real_coding_session(_digest(events=1, prompt="[Request interrupted by user]", + tools={})) is False + + +def test_too_short_dropped(): + assert is_real_coding_session(_digest(events=5)) is False + + +def test_no_repo_dropped(): + assert is_real_coding_session(_digest(repo=None)) is False + + +def test_no_substantive_tools_dropped(): + # plenty of events but only plumbing calls -> not real coding + assert is_real_coding_session( + _digest(tools={"mcp__state-hub__update_task_status": 40})) is False + + +def test_filter_real_keeps_only_real(): + digs = [_digest(), _digest(events=3, prompt="hello", tools={}), _digest(repo=None)] + assert len(filter_real(digs)) == 1 + + +def test_quality_config_from_toml(): + cfg = quality_config({"detect": {"quality": {"min_events": 50}}}) + assert cfg.min_events == 50 + assert cfg.min_substantive == 3 # default preserved diff --git a/workplans/AGENTIC-WP-0005-detect-hardening.md b/workplans/AGENTIC-WP-0005-detect-hardening.md new file mode 100644 index 0000000..e57f171 --- /dev/null +++ b/workplans/AGENTIC-WP-0005-detect-hardening.md @@ -0,0 +1,88 @@ +--- +id: AGENTIC-WP-0005 +type: workplan +title: "Coding Session Memory — Detect Hardening (quality filter + infra signals)" +domain: helix_forge +repo: agentic-resources +status: ready +owner: codex +topic_slug: helix-forge +created: "2026-06-07" +updated: "2026-06-07" +state_hub_workstream_id: "d8b7b8d1-1d85-4d2a-8ccd-7b0366a9442d" +--- + +# Coding Session Memory — Detect Hardening + +A focused hardening pass (call it Phase 1.5) so the Detect output is trustworthy +enough to drive an **infrastructure assessment**. Triggered by ad-hoc analysis of +the live store after Phase 2: + +- Of **72 captured sessions, only 31 are real coding sessions**; the rest are + health-checks / smoke-tests / interrupted runs (mostly `llm-connect` *"Say hello + in one word"*). 
The `abandoned` outcome heuristic mislabels these, and Phase 2 + cataloged a **false-positive** "cross-flavor abandoned" pattern as + `approved`/`distribution_ready`. +- All 31 real sessions read as `success`, so the current signal set + (outcome + markers + cost) surfaces almost no genuine friction. +- The already-captured `tool_histogram` tells the real story: **~17% of tool + activity in real sessions is State Hub MCP + task plumbing + `ToolSearch` + schema-loading**, concentrated to 40–70% in some sessions — but `signals.py` + never looks at it. + +No new capture is needed — this is analysis the data already supports. + +## Session-Quality Filter + +```task +id: AGENTIC-WP-0005-T01 +status: done +priority: high +state_hub_task_id: "9f8b4304-0a37-4f66-ad34-d93e12fba0d8" +``` + +Add `detect/quality.py` with `is_real_coding_session(digest)` that filters out +health-checks, smoke-tests, interrupted runs, and trivially-short sessions (event-count +floor, repo present, substantive edit/tool activity, not a single hello/interrupt +prompt). Wire it into the detect pipeline so signals/clusters only form over real +sessions — fixing the `abandoned` false-positive. Knobs under `[detect.quality]` in +`config.toml`. Unit-tested on synthetic trivial-vs-real digests. + +## Infra-Overhead + Thrash Signals + +```task +id: AGENTIC-WP-0005-T02 +status: todo +priority: high +state_hub_task_id: "10d57b05-a731-4ece-bf45-f6a98ac77555" +``` + +Add `tool_histogram`-based extractors to `detect/signals.py`: a shared tool-bucket +helper (`shell` / `edit` / `read` / `statehub_mcp` / `task_mgmt` / `schema_load` / +`other`); `sig_infra_overhead` (PROBLEM when the statehub+task+schema share of tool +calls exceeds a threshold; magnitude = share; locus `infra_overhead`); +`sig_schema_thrash` (`ToolSearch` count over threshold; locus `schema_load`); +`sig_tool_thrash` (extreme single-tool repetition). Pure functions over digests; +thresholds configurable. Unit-tested. 
+ +## Re-run Live, Purge False Positives, Ranked Friction Report + +```task +id: AGENTIC-WP-0005-T03 +status: todo +priority: high +state_hub_task_id: "8b9d029a-60d0-4caf-af62-4fcc9c9a645c" +``` + +Re-run `ingest → detect` over the real local sessions with the filter + new +signals. Purge the false-positive catalog entries seeded in Phase 2 (the +health-check `abandoned` pattern) and re-curate so the catalog reflects real +friction. Produce a ranked **friction assessment** (`docs/ASSESSMENT-infra-friction.md`) +of the major infrastructure problems — quantified per repo/flavor, infra-overhead +share, schema-thrash — with recommendations (incl. the State Hub / MCP skill +hypothesis). After workplan file updates, notify the operator to run from +`~/state-hub`: + +```bash +make fix-consistency REPO=agentic-resources +```