"""Pattern clusterer + evidence (PRD §5, §6.2; T05/T06). Groups recurring :class:`Signal`s into candidate ``Pattern`` records. Clustering is deterministic and keyed on ``(polarity, signal-type, locus)`` — enough to surface "the same thing keeps happening" without embeddings (a later option). Each candidate carries evidence (FR-D3): supporting sessions, frequency, affected repos, affected **flavors**, and an estimated cost-impact score. Candidates whose evidence spans more than one flavor are flagged ``cross_flavor`` (FR-D4) — the highest-value reuse targets. """ from __future__ import annotations import collections from dataclasses import asdict, dataclass, field from typing import Any from .signals import PROBLEM, Signal @dataclass class Pattern: key: str # stable cluster key polarity: str # problem | success signal_type: str locus: str frequency: int # number of supporting signals sessions: list[str] = field(default_factory=list) repos: list[str] = field(default_factory=list) flavors: list[str] = field(default_factory=list) cross_flavor: bool = False cost_impact: float = 0.0 # frequency-weighted magnitude score: float = 0.0 # ranking score (impact x frequency) title: str = "" def to_dict(self) -> dict[str, Any]: return asdict(self) def _key(s: Signal) -> str: return f"{s.polarity}:{s.type}:{s.locus}" def _title(polarity: str, signal_type: str, n_flavors: int) -> str: scope = "cross-flavor " if n_flavors > 1 else "" verb = "problem" if polarity == PROBLEM else "success" return f"{scope}{verb}: {signal_type.replace('_', ' ')}" def cluster(signals: list[Signal], *, min_frequency: int = 2) -> list[Pattern]: """Group signals into candidate patterns; keep clusters >= min_frequency.""" groups: dict[str, list[Signal]] = collections.defaultdict(list) for s in signals: groups[_key(s)].append(s) patterns: list[Pattern] = [] for key, members in groups.items(): if len(members) < min_frequency: continue sessions = sorted({m.session_uid for m in members}) repos = sorted({m.repo for m in members if m.repo}) flavors = sorted({m.flavor for m in members}) cost_impact = sum(m.magnitude for m in members) first = members[0] p = Pattern( key=key, polarity=first.polarity, signal_type=first.type, locus=first.locus, frequency=len(members), sessions=sessions, repos=repos, flavors=flavors, cross_flavor=len(flavors) > 1, cost_impact=round(cost_impact, 3), title=_title(first.polarity, first.type, len(flavors)), ) # rank: impact x frequency, with a boost for cross-flavor reuse value p.score = round(p.cost_impact * p.frequency * (1.5 if p.cross_flavor else 1.0), 3) patterns.append(p) # cross-flavor first, then by score patterns.sort(key=lambda p: (not p.cross_flavor, -p.score)) return patterns