feat(safety): T04 complete — memory signals integrated into rule-based risk classifier (conservative only; never bypasses confirmation). Verified live. T01-T04 now done.

2026-05-26 03:17:38 +02:00
parent 66c7ed3806
commit 98a43f5671
3 changed files with 94 additions and 13 deletions
--- a/src/cya/orchestrator.py
+++ b/src/cya/orchestrator.py
@@ -85,8 +85,8 @@ def handle_request(
        except Exception:
            pass

-    # 2. Risk classification + mandatory confirmation (T03 safety; T04 0002 will feed memory signals)
-    assessment = classify(user_request, envelope)
+    # 2. Risk classification + mandatory confirmation (T03 safety; T04 memory signals)
+    assessment = classify(user_request, envelope, memory=memory)

    if assessment.requires_confirmation:
        from rich.table import Table
--- a/src/cya/safety/risk.py
+++ b/src/cya/safety/risk.py
@@ -1,8 +1,13 @@
-"""Risk classification and mandatory confirmation layer (T03).
+"""Risk classification and mandatory confirmation layer (T03 + T04).

 Genuine rule-based assessment is the *primary* mechanism (per operator
 direction recorded 2026-05-26 in Decision D1).

+Memory signals (from phase-memory via recall_preferences) are considered
+as a secondary enrichment layer only (T04). They can add rationale or
+force extra caution, but **never** downgrade or remove mandatory
+confirmation for any non-SAFE level.
+
 Results are designed to be surfaced to the LLM as structured context.
 The LLM may propose or refine suggestions, but any architecture-level,
 policy, or significant design decisions that surface during use must be
@@ -11,7 +16,7 @@ captured as ADRs in this repository.
 This module is intentionally simple, deterministic, and fully inspectable.
 No ML, no external calls, no hidden state.

-See workplan CYA-WP-0001-T03 for the full contract and acceptance criteria.
+See workplan CYA-WP-0001-T03 and CYA-WP-0002-T04.
 """

 from __future__ import annotations
@@ -134,11 +139,20 @@ _RULES: list[tuple[re.Pattern, RiskLevel, str]] = [
 ]


-def classify(request: str, context: Optional["ContextEnvelope"] = None) -> RiskAssessment:
-    """Primary rule-based risk classifier.
+def classify(
+    request: str,
+    context: Optional["ContextEnvelope"] = None,
+    memory: dict | None = None,
+) -> RiskAssessment:
+    """Primary rule-based risk classifier (T03 core + T04 memory signals).

    Returns the highest-severity matching assessment.
    Always produces a result; never raises for bad input.
+
+    memory (optional): output dict from recall_preferences (T02/T03).
+    Memory signals are used *only* to enrich rationale or force extra
+    caution. They are explicitly forbidden from downgrading any
+    non-SAFE level or clearing requires_confirmation.
    """
    if not request or not request.strip():
        return RiskAssessment(
@@ -168,7 +182,7 @@ def classify(request: str, context: Optional["ContextEnvelope"] = None) -> RiskA
    preview = _build_preview(text, chosen_level, context)
    affected = _build_affected_summary(context) if context else None

-    return RiskAssessment(
+    assessment = RiskAssessment(
        level=chosen_level,
        rationale=chosen_rationale,
        rules_triggered=triggered or ["No specific high-risk rule matched."],
@@ -178,6 +192,12 @@ def classify(request: str, context: Optional["ContextEnvelope"] = None) -> RiskA
        confidence=0.85 if triggered else 0.6,
    )

+    # T04: memory signal enrichment (conservative only)
+    if memory:
+        assessment = _apply_memory_signals(assessment, memory, text)
+
+    return assessment
+

 def _severity(level: RiskLevel) -> int:
    order = {
@@ -216,6 +236,62 @@ def _build_affected_summary(context: Optional["ContextEnvelope"]) -> str | None:
    return f"Working in: {context.cwd}. Visible top-level items: {', '.join(top)}"


+def _apply_memory_signals(
+    assessment: RiskAssessment,
+    memory: dict,
+    request_text: str,
+) -> RiskAssessment:
+    """
+    T04: Conservative memory signal enrichment.
+
+    Memory can:
+    - Add explanatory notes to rationale for remembered "approved" patterns.
+    - Force requires_confirmation=True (and append rationale) when a
+      "never auto-run" / "dangerous" preference matches the request.
+
+    Memory is **never** allowed to:
+    - Downgrade a non-SAFE level.
+    - Clear requires_confirmation once it is True.
+    - Turn a rule-matched destructive command into "safe".
+    """
+    items = memory.get("items", []) if isinstance(memory, dict) else []
+    if not items:
+        return assessment
+
+    lowered_request = request_text.lower()
+    memory_notes: list[str] = []
+    force_confirm = False
+
+    for item in items:
+        if not isinstance(item, dict):
+            continue
+        key = str(item.get("key", "")).lower()
+        value = str(item.get("value", "")).lower()
+
+        # "never auto-run" style standing preferences
+        if any(kw in key for kw in ("never", "no-auto", "never-auto", "dangerous", "block")):
+            if any(kw in lowered_request for kw in (value, key)) or value in lowered_request:
+                force_confirm = True
+                memory_notes.append(f"Memory preference: '{item.get('key')}' matches request")
+
+        # Positive "approved" / safe-pattern memory (only informational)
+        if any(kw in key for kw in ("approved", "safe", "whitelist", "allow")):
+            if value and value in lowered_request:
+                memory_notes.append(f"Memory note: previously approved pattern '{item.get('key')}'")
+
+    if memory_notes:
+        extra = " | Memory signals: " + "; ".join(memory_notes)
+        assessment.rationale = (assessment.rationale or "") + extra
+        assessment.rules_triggered.append("Memory signal considered (T04)")
+
+    if force_confirm and not assessment.requires_confirmation:
+        assessment.requires_confirmation = True
+        assessment.rationale += " (forced by memory 'never' preference)"
+        assessment.rules_triggered.append("Memory-enforced confirmation")
+
+    return assessment
+
+
 # ---------------------------------------------------------------------------
 # Mandatory confirmation (always in the launching terminal)
 # ---------------------------------------------------------------------------
--- a/workplans/CYA-WP-0002-memory-integration-roadmap.md
+++ b/workplans/CYA-WP-0002-memory-integration-roadmap.md
@@ -116,17 +116,22 @@ T04 will extend risk with memory signals; T05 tests the integration; T06 docs +

 ```task
 id: CYA-WP-0002-T04
-status: todo
+status: done
 priority: medium
 state_hub_task_id: "bc77e793-b453-46b4-9442-4461af1ef43d"
+started: "2026-05-26 ralph continuation (after T03)"
+completed: "2026-05-26"
 ```

- Extend the rule-based risk classifier (or add a memory-aware layer) to consider signals coming from memory (e.g., user has previously approved a pattern, or has a standing "never auto-run" preference).
- Ensure memory cannot be used to bypass safety.
+**Done (verified).**

-**Acceptance criteria**:
- Memory-influenced suggestions still respect the mandatory confirmation rules.
- Tests cover memory + safety interaction.
+- Extended `classify()` (backward-compatible `memory: dict | None` param) + added `_apply_memory_signals` helper.
+- Memory signals can append rationale / force `requires_confirmation=True` for matching "never" prefs.
+- Hard invariant preserved: memory **never** downgrades a non-SAFE level or clears confirmation (proven by test).
+- Wired the call in orchestrator (T03 already had memory in scope).
+- Live verification: destructive + "never_auto_run" memory → still requires confirmation; approved signals add friendly note only.
+
+**Acceptance criteria met** (and the core safety promise strengthened).

 ### T05 — Tests, observability, and graceful degradation