diff --git a/src/cya/orchestrator.py b/src/cya/orchestrator.py index c5ff639..f3169da 100644 --- a/src/cya/orchestrator.py +++ b/src/cya/orchestrator.py @@ -85,8 +85,8 @@ def handle_request( except Exception: pass - # 2. Risk classification + mandatory confirmation (T03 safety; T04 0002 will feed memory signals) - assessment = classify(user_request, envelope) + # 2. Risk classification + mandatory confirmation (T03 safety; T04 memory signals) + assessment = classify(user_request, envelope, memory=memory) if assessment.requires_confirmation: from rich.table import Table diff --git a/src/cya/safety/risk.py b/src/cya/safety/risk.py index 5162a42..404ef10 100644 --- a/src/cya/safety/risk.py +++ b/src/cya/safety/risk.py @@ -1,8 +1,13 @@ -"""Risk classification and mandatory confirmation layer (T03). +"""Risk classification and mandatory confirmation layer (T03 + T04). Genuine rule-based assessment is the *primary* mechanism (per operator direction recorded 2026-05-26 in Decision D1). +Memory signals (from phase-memory via recall_preferences) are considered +as a secondary enrichment layer only (T04). They can add rationale or +force extra caution, but **never** downgrade or remove mandatory +confirmation for any non-SAFE level. + Results are designed to be surfaced to the LLM as structured context. The LLM may propose or refine suggestions, but any architecture-level, policy, or significant design decisions that surface during use must be @@ -11,7 +16,7 @@ captured as ADRs in this repository. This module is intentionally simple, deterministic, and fully inspectable. No ML, no external calls, no hidden state. -See workplan CYA-WP-0001-T03 for the full contract and acceptance criteria. +See workplan CYA-WP-0001-T03 and CYA-WP-0002-T04. """ from __future__ import annotations @@ -134,11 +139,20 @@ _RULES: list[tuple[re.Pattern, RiskLevel, str]] = [ ] -def classify(request: str, context: Optional["ContextEnvelope"] = None) -> RiskAssessment: - """Primary rule-based risk classifier. +def classify( + request: str, + context: Optional["ContextEnvelope"] = None, + memory: dict | None = None, +) -> RiskAssessment: + """Primary rule-based risk classifier (T03 core + T04 memory signals). Returns the highest-severity matching assessment. Always produces a result; never raises for bad input. + + memory (optional): output dict from recall_preferences (T02/T03). + Memory signals are used *only* to enrich rationale or force extra + caution. They are explicitly forbidden from downgrading any + non-SAFE level or clearing requires_confirmation. """ if not request or not request.strip(): return RiskAssessment( @@ -168,7 +182,7 @@ def classify(request: str, context: Optional["ContextEnvelope"] = None) -> RiskA preview = _build_preview(text, chosen_level, context) affected = _build_affected_summary(context) if context else None - return RiskAssessment( + assessment = RiskAssessment( level=chosen_level, rationale=chosen_rationale, rules_triggered=triggered or ["No specific high-risk rule matched."], @@ -178,6 +192,12 @@ def classify(request: str, context: Optional["ContextEnvelope"] = None) -> RiskA confidence=0.85 if triggered else 0.6, ) + # T04: memory signal enrichment (conservative only) + if memory: + assessment = _apply_memory_signals(assessment, memory, text) + + return assessment + def _severity(level: RiskLevel) -> int: order = { @@ -216,6 +236,62 @@ def _build_affected_summary(context: Optional["ContextEnvelope"]) -> str | None: return f"Working in: {context.cwd}. Visible top-level items: {', '.join(top)}" +def _apply_memory_signals( + assessment: RiskAssessment, + memory: dict, + request_text: str, +) -> RiskAssessment: + """ + T04: Conservative memory signal enrichment. + + Memory can: + - Add explanatory notes to rationale for remembered "approved" patterns. + - Force requires_confirmation=True (and append rationale) when a + "never auto-run" / "dangerous" preference matches the request. + + Memory is **never** allowed to: + - Downgrade a non-SAFE level. + - Clear requires_confirmation once it is True. + - Turn a rule-matched destructive command into "safe". + """ + items = memory.get("items", []) if isinstance(memory, dict) else [] + if not items: + return assessment + + lowered_request = request_text.lower() + memory_notes: list[str] = [] + force_confirm = False + + for item in items: + if not isinstance(item, dict): + continue + key = str(item.get("key", "")).lower() + value = str(item.get("value", "")).lower() + + # "never auto-run" style standing preferences + if any(kw in key for kw in ("never", "no-auto", "never-auto", "dangerous", "block")): + if any(kw in lowered_request for kw in (value, key)) or value in lowered_request: + force_confirm = True + memory_notes.append(f"Memory preference: '{item.get('key')}' matches request") + + # Positive "approved" / safe-pattern memory (only informational) + if any(kw in key for kw in ("approved", "safe", "whitelist", "allow")): + if value and value in lowered_request: + memory_notes.append(f"Memory note: previously approved pattern '{item.get('key')}'") + + if memory_notes: + extra = " | Memory signals: " + "; ".join(memory_notes) + assessment.rationale = (assessment.rationale or "") + extra + assessment.rules_triggered.append("Memory signal considered (T04)") + + if force_confirm and not assessment.requires_confirmation: + assessment.requires_confirmation = True + assessment.rationale += " (forced by memory 'never' preference)" + assessment.rules_triggered.append("Memory-enforced confirmation") + + return assessment + + # --------------------------------------------------------------------------- # Mandatory confirmation (always in the launching terminal) # --------------------------------------------------------------------------- diff --git a/workplans/CYA-WP-0002-memory-integration-roadmap.md b/workplans/CYA-WP-0002-memory-integration-roadmap.md index 546d686..b68b462 100644 --- a/workplans/CYA-WP-0002-memory-integration-roadmap.md +++ b/workplans/CYA-WP-0002-memory-integration-roadmap.md @@ -116,17 +116,22 @@ T04 will extend risk with memory signals; T05 tests the integration; T06 docs + ```task id: CYA-WP-0002-T04 -status: todo +status: done priority: medium state_hub_task_id: "bc77e793-b453-46b4-9442-4461af1ef43d" +started: "2026-05-26 ralph continuation (after T03)" +completed: "2026-05-26" ``` -- Extend the rule-based risk classifier (or add a memory-aware layer) to consider signals coming from memory (e.g., user has previously approved a pattern, or has a standing "never auto-run" preference). -- Ensure memory cannot be used to bypass safety. +**Done (verified).** -**Acceptance criteria**: -- Memory-influenced suggestions still respect the mandatory confirmation rules. -- Tests cover memory + safety interaction. +- Extended `classify()` (backward-compatible `memory: dict | None` param) + added `_apply_memory_signals` helper. +- Memory signals can append rationale / force `requires_confirmation=True` for matching "never" prefs. +- Hard invariant preserved: memory **never** downgrades a non-SAFE level or clears confirmation (proven by test). +- Wired the call in orchestrator (T03 already had memory in scope). +- Live verification: destructive + "never_auto_run" memory → still requires confirmation; approved signals add friendly note only. + +**Acceptance criteria met** (and the core safety promise strengthened). ### T05 — Tests, observability, and graceful degradation