diff --git a/session_memory/config.toml b/session_memory/config.toml index 3a02bd5..c0f989d 100644 --- a/session_memory/config.toml +++ b/session_memory/config.toml @@ -31,6 +31,21 @@ enabled = true root = "~/.grok/sessions" glob = "*/*/chat_history.jsonl" +# Curate phase (AGENTIC-WP-0004): catalog location + promotion evidence bar. +[curate] +catalog_dir = "session_memory/catalog" # files-first Pattern Catalog (committed) +review_log = "session_memory/.store/reviews.jsonl" # remembered decisions (gitignored) + +# Evidence bar (OQ5): floors to promote at all, and stricter floors to be +# distribution-eligible (status=approved, distribution_ready=true). +[curate.gate] +min_frequency = 2 # >= this many supporting signals to promote +min_sessions = 2 # >= this many distinct sessions +min_cost_impact = 0.0 +dist_require_cross_flavor = false # require cross-flavor evidence to distribute +dist_min_frequency = 3 +dist_min_cost_impact = 0.0 + # cwd basename -> domain slug. Used to tag sessions with their Custodian domain. [repo_domain_map] agentic-resources = "helix_forge" diff --git a/session_memory/curate/gating.py b/session_memory/curate/gating.py new file mode 100644 index 0000000..d631dd3 --- /dev/null +++ b/session_memory/curate/gating.py @@ -0,0 +1,117 @@ +"""Promotion evidence-bar + bloat guard (design OQ5/OQ6; T04). + +Two gates protect the catalog: + +* **Evidence bar (OQ5)** — a candidate must clear configurable floors + (frequency, distinct supporting sessions) before it may be promoted at all. + A separate, stricter bar decides whether the promoted pattern is + *distribution-eligible* (``status="approved"``, ``distribution_ready=True``) + vs. merely ``provisional`` — the minimum trustworthy evidence before a pattern + is allowed near live agent environments. + +* **Bloat guard (OQ6)** — flags candidates that would add little: a duplicate of + an already-cataloged pattern, or a near-duplicate sharing the same + signal-type+locus. 
@dataclass
class GateConfig:
    """Thresholds for the promotion evidence bar (OQ5).

    The ``min_*`` floors decide whether a candidate may be promoted at all;
    the ``dist_*`` knobs are the extra bar a promoted pattern must clear to be
    distribution-eligible. Defaults match ``[curate.gate]`` in ``config.toml``.
    """

    # promotion floor (OQ5)
    min_frequency: int = 2
    min_sessions: int = 2
    min_cost_impact: float = 0.0
    # distribution-eligibility floor (stricter; OQ5)
    dist_require_cross_flavor: bool = False
    dist_min_frequency: int = 3
    dist_min_cost_impact: float = 0.0


def gate_config(config: Optional[dict] = None) -> GateConfig:
    """Build a :class:`GateConfig` from a parsed ``config.toml`` mapping.

    Only keys that name a ``GateConfig`` field are read from
    ``config["curate"]["gate"]``; every other field keeps its dataclass
    default, so the defaults are declared in exactly one place. A missing or
    malformed ``config`` / ``curate`` / ``gate`` level (anything that is not a
    dict) yields an all-defaults config instead of raising.
    """
    from dataclasses import fields  # local: keeps the module import block untouched

    curate = config.get("curate") if isinstance(config, dict) else None
    gate = curate.get("gate") if isinstance(curate, dict) else None
    if not isinstance(gate, dict):
        return GateConfig()
    overrides = {f.name: gate[f.name] for f in fields(GateConfig) if f.name in gate}
    return GateConfig(**overrides)


@dataclass
class GateResult:
    """Outcome of :func:`evaluate` for one candidate."""

    promotable: bool
    distribution_ready: bool
    status: str  # "approved" if distribution-ready else "provisional"
    # human-readable explanation for every floor the candidate failed
    reasons: list[str] = field(default_factory=list)


def _value_or(candidate: dict, key: str, default):
    """Return ``candidate[key]``, treating a missing key or ``None`` as ``default``."""
    value = candidate.get(key)
    return default if value is None else value


def _n_sessions(candidate: dict) -> int:
    """Count the *distinct* sessions listed under ``candidate["sessions"]``.

    A missing or ``None`` list counts as zero. Entries are de-duplicated so a
    session id repeated in the list cannot satisfy the distinct-session floor
    on its own; if the entries are unhashable the raw length is used instead.
    """
    sessions = candidate.get("sessions") or []
    try:
        return len(set(sessions))
    except TypeError:
        # unhashable session records cannot be de-duplicated here
        return len(sessions)


def evaluate(candidate: dict, config: Optional[GateConfig] = None) -> GateResult:
    """Decide whether a candidate may be promoted, and at what trust level.

    Args:
        candidate: detect-phase candidate dict. Reads ``frequency``,
            ``sessions``, ``cost_impact`` and ``cross_flavor``; a missing or
            ``None`` value is treated as ``0`` / empty / ``0.0`` / ``False``.
        config: thresholds to apply; ``GateConfig()`` defaults when omitted.

    Returns:
        A :class:`GateResult`. ``promotable`` reflects the promotion floors
        only. ``distribution_ready`` additionally requires every ``dist_*``
        check to pass. ``status`` is ``"approved"`` exactly when the candidate
        is distribution-ready, else ``"provisional"``. ``reasons`` lists the
        promotion failures first, then the distribution failures.
    """
    cfg = GateConfig() if config is None else config

    freq = _value_or(candidate, "frequency", 0)
    sessions = _n_sessions(candidate)
    impact = _value_or(candidate, "cost_impact", 0.0)

    # Promotion floor: any failure blocks promotion outright.
    blockers: list[str] = []
    if freq < cfg.min_frequency:
        blockers.append(f"frequency {freq} < min {cfg.min_frequency}")
    if sessions < cfg.min_sessions:
        blockers.append(f"sessions {sessions} < min {cfg.min_sessions}")
    if impact < cfg.min_cost_impact:
        blockers.append(f"cost_impact {impact} < min {cfg.min_cost_impact}")
    promotable = not blockers

    # Distribution floor: checked even for non-promotable candidates so the
    # reasons list always shows the full gap to distribution eligibility.
    dist_blockers: list[str] = []
    if cfg.dist_require_cross_flavor and not candidate.get("cross_flavor", False):
        dist_blockers.append("not cross-flavor (required for distribution)")
    if freq < cfg.dist_min_frequency:
        dist_blockers.append(
            f"frequency {freq} < distribution min {cfg.dist_min_frequency}")
    if impact < cfg.dist_min_cost_impact:
        dist_blockers.append(
            f"cost_impact {impact} < distribution min {cfg.dist_min_cost_impact}")
    distribution_ready = promotable and not dist_blockers

    return GateResult(
        promotable=promotable,
        distribution_ready=distribution_ready,
        status="approved" if distribution_ready else "provisional",
        reasons=blockers + dist_blockers,
    )


def _signal_and_locus(key: str) -> tuple[str, str]:
    """Split a ``polarity:signal_type:locus`` key; absent segments become ``""``."""
    parts = (key.split(":", 2) + ["", ""])[:3]
    return parts[1], parts[2]


def bloat_warnings(candidate: dict, existing: "list[SolutionPattern]") -> list[str]:
    """Flag low-value adds against what is already catalogued (OQ6).

    Emits one warning per existing pattern that is either an exact duplicate
    (same pattern id as the candidate's key would produce) or a near-duplicate
    (same signal-type and locus segments of the source key).

    Raises:
        KeyError: if ``candidate`` has no ``"key"`` entry.
    """
    warnings: list[str] = []
    cand_id = SolutionPattern.make_id(candidate["key"])
    sig_type, locus = _signal_and_locus(candidate["key"])
    # A key with neither segment has nothing meaningful to compare; without
    # this guard every other unstructured key would match on ("", "").
    comparable = bool(sig_type or locus)
    for p in existing:
        if p.id == cand_id:
            warnings.append(f"duplicate of catalogued pattern {p.id}")
            continue
        if comparable and _signal_and_locus(p.provenance.source_key) == (sig_type, locus):
            warnings.append(f"near-duplicate of {p.id} (same {sig_type}/{locus})")
    return warnings
return hashlib.sha1(json.dumps(payload, sort_keys=True).encode("utf-8")).hexdigest() -def candidate_to_pattern(candidate: dict) -> SolutionPattern: - """Build a (provisional) Solution Pattern from a detect candidate.""" +def candidate_to_pattern(candidate: dict, *, status: str = "provisional", + distribution_ready: bool = False) -> SolutionPattern: + """Build a Solution Pattern from a detect candidate. + + ``status``/``distribution_ready`` come from the evidence gate (T04); they + default to a provisional, non-distribution-ready pattern when ungated. + """ src = candidate["key"] flavors = list(candidate.get("flavors", [])) hints = {f: {"target": _DEFAULT_TARGET.get(f, ""), "note": "TODO: refine rendering"} @@ -62,7 +68,8 @@ def candidate_to_pattern(candidate: dict) -> SolutionPattern: scope=Scope(flavors=flavors, repos=list(candidate.get("repos", []))), provenance=Provenance(source_key=src, evidence=dict(candidate), promoted_at=_now()), rendering_hints=hints, - status="provisional", + status=status, + distribution_ready=distribution_ready, ) @@ -112,8 +119,14 @@ class ReviewResult: def review(candidates: list[dict], decide: Decider, catalog: Catalog, - log: ReviewLog) -> ReviewResult: - """Run each candidate through ``decide``; promote approvals into ``catalog``.""" + log: ReviewLog, gate: Optional[GateConfig] = None) -> ReviewResult: + """Run each candidate through ``decide``; promote approvals into ``catalog``. + + When a ``gate`` (T04 evidence bar) is supplied, the promoted pattern's + ``status``/``distribution_ready`` are set from the gate evaluation, so an + approved-but-thin candidate lands as ``provisional`` rather than + distribution-ready. 
+ """ result = ReviewResult() for cand in candidates: key = cand["key"] @@ -125,7 +138,11 @@ def review(candidates: list[dict], decide: Decider, catalog: Catalog, result.deferred.append(key) continue # not a final decision — leave for a later pass if action == APPROVE: - cat_action = catalog.upsert(candidate_to_pattern(cand)) + g = evaluate(cand, gate) if gate is not None else None + pattern = (candidate_to_pattern(cand, status=g.status, + distribution_ready=g.distribution_ready) + if g is not None else candidate_to_pattern(cand)) + cat_action = catalog.upsert(pattern) result.approved.append((key, cat_action)) elif action == REJECT: result.rejected.append(key) diff --git a/tests/test_curate_gating.py b/tests/test_curate_gating.py new file mode 100644 index 0000000..7807b0d --- /dev/null +++ b/tests/test_curate_gating.py @@ -0,0 +1,76 @@ +"""Evidence-bar + bloat-guard tests (T04).""" + +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from session_memory.curate.catalog import Catalog # noqa: E402 +from session_memory.curate.gating import ( # noqa: E402 + GateConfig, + bloat_warnings, + evaluate, + gate_config, +) +from session_memory.curate.review import candidate_to_pattern # noqa: E402 + + +def _candidate(key="success:clean_pass:outcome", freq=5, sessions=5, impact=10.0, + cross=True, flavors=("claude", "grok")): + return { + "key": key, + "frequency": freq, + "sessions": [f"s{i}" for i in range(sessions)], + "cost_impact": impact, + "cross_flavor": cross, + "flavors": list(flavors), + } + + +def test_clears_bar_and_distribution_ready(): + r = evaluate(_candidate(), GateConfig(dist_min_frequency=3)) + assert r.promotable and r.distribution_ready + assert r.status == "approved" + + +def test_thin_candidate_promotable_but_provisional(): + # meets promote floor (freq>=2) but below distribution floor (freq<3) + r = evaluate(_candidate(freq=2, sessions=2), GateConfig(dist_min_frequency=3)) + assert r.promotable 
+ assert not r.distribution_ready + assert r.status == "provisional" + + +def test_below_promote_floor_not_promotable(): + r = evaluate(_candidate(freq=1, sessions=1)) + assert not r.promotable + assert any("frequency" in reason for reason in r.reasons) + + +def test_cross_flavor_required_for_distribution(): + r = evaluate(_candidate(cross=False), GateConfig(dist_require_cross_flavor=True)) + assert r.promotable + assert not r.distribution_ready + assert any("cross-flavor" in reason for reason in r.reasons) + + +def test_gate_config_reads_toml_dict(): + cfg = gate_config({"curate": {"gate": {"min_frequency": 9, "dist_require_cross_flavor": True}}}) + assert cfg.min_frequency == 9 + assert cfg.dist_require_cross_flavor is True + # defaults preserved for unspecified keys + assert cfg.dist_min_frequency == 3 + + +def test_bloat_flags_duplicate_and_near_duplicate(tmp_path): + cat = Catalog(str(tmp_path)) + cat.upsert(candidate_to_pattern(_candidate(key="success:clean_pass:outcome"))) + existing = cat.list() + # exact same key -> duplicate + dup = bloat_warnings(_candidate(key="success:clean_pass:outcome"), existing) + assert any("duplicate" in w for w in dup) + # different polarity, same signal_type+locus -> near-duplicate + near = bloat_warnings(_candidate(key="problem:clean_pass:outcome"), existing) + assert any("near-duplicate" in w for w in near) + # unrelated -> no warnings + assert bloat_warnings(_candidate(key="problem:retry_storm:retries"), existing) == [] diff --git a/workplans/AGENTIC-WP-0004-session-memory-phase2.md b/workplans/AGENTIC-WP-0004-session-memory-phase2.md index fa66cde..3fccfc4 100644 --- a/workplans/AGENTIC-WP-0004-session-memory-phase2.md +++ b/workplans/AGENTIC-WP-0004-session-memory-phase2.md @@ -94,7 +94,7 @@ prior reject is remembered so it is not re-surfaced unless evidence changed. ```task id: AGENTIC-WP-0004-T04 -status: todo +status: done priority: medium state_hub_task_id: "d474425d-18af-48e4-8f5b-7716b2da0057" ```