"""C1 — Redundancy detection.

Uses embedding similarity to find entity pairs with overlapping meanings
that may be candidates for merging. When no embeddings are supplied, a
word-overlap comparison of entity definitions is used instead.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from itertools import combinations
from typing import Dict, List, Optional

from markitect.infospace.models import EntityMeta
from markitect.llm.similarity import find_similar_pairs


@dataclass
class RedundancyReport:
    """Results from redundancy analysis."""

    # One dict per flagged pair, with the keys "entity_a", "entity_b",
    # "similarity" and "method" (as built by check_redundancy).
    similar_pairs: List[dict] = field(default_factory=list)
    # Fraction of the analysed entities that appear in at least one pair.
    redundancy_ratio: float = 0.0
    # Number of entities that were passed to the analysis.
    entity_count: int = 0

    def to_dict(self) -> dict:
        """Return the report as a plain dict tagged with concern ``"C1"``.

        The ratio is rounded to four decimal places; the pair list is
        passed through as-is (not copied).
        """
        return {
            "concern": "C1",
            "redundancy_ratio": round(self.redundancy_ratio, 4),
            "similar_pairs": self.similar_pairs,
            "entity_count": self.entity_count,
        }


def _embedding_pairs(
    embeddings: Dict[str, list[float]],
    threshold: float,
) -> List[dict]:
    """Build pair records from ``find_similar_pairs`` over *embeddings*."""
    return [
        {
            "entity_a": slug_a,
            "entity_b": slug_b,
            "similarity": round(sim, 4),
            "method": "embedding",
        }
        for slug_a, slug_b, sim in find_similar_pairs(
            embeddings, threshold=threshold
        )
    ]


def _word_overlap_pairs(
    entities: List[EntityMeta],
    threshold: float,
) -> List[dict]:
    """Build pair records from shared words in entity definitions."""
    # Lower-cased, whitespace-split word set per slug; a missing or empty
    # definition yields an empty set, which is never paired.
    slug_to_words = {
        e.slug: set(e.definition.lower().split()) if e.definition else set()
        for e in entities
    }

    pairs: List[dict] = []
    # Sorted slugs make the output order deterministic; combinations()
    # visits each unordered pair exactly once, in that sorted order.
    for a, b in combinations(sorted(slug_to_words), 2):
        words_a, words_b = slug_to_words[a], slug_to_words[b]
        if not words_a or not words_b:
            continue
        # Shared words relative to the smaller set, so a short definition
        # fully contained in a longer one scores 1.0.
        overlap = len(words_a & words_b) / min(len(words_a), len(words_b))
        if overlap >= threshold:
            pairs.append({
                "entity_a": a,
                "entity_b": b,
                "similarity": round(overlap, 4),
                "method": "word_overlap",
            })
    return pairs


def check_redundancy(
    entities: List[EntityMeta],
    embeddings: Optional[Dict[str, list[float]]] = None,
    threshold: float = 0.85,
) -> RedundancyReport:
    """Check for redundant entities using embedding similarity.

    Args:
        entities: Entity metadata list.
        embeddings: Pre-computed ``{slug: vector}`` mapping. If ``None``
            or empty, redundancy is checked structurally instead, by
            comparing the word sets of the entities' definitions.
        threshold: Similarity threshold for flagging pairs (inclusive
            for the word-overlap fallback).

    Returns:
        :class:`RedundancyReport` with similar pairs and ratio. With
        fewer than two entities the report has no pairs and a ratio of
        ``0.0``.
    """
    n = len(entities)
    if n < 2:
        return RedundancyReport(entity_count=n)

    if embeddings:
        pairs = _embedding_pairs(embeddings, threshold)
    else:
        pairs = _word_overlap_pairs(entities, threshold)

    # redundancy_ratio: fraction of entities involved in similar pairs.
    # Pairs found via embeddings may name slugs that are not in
    # ``entities``; only slugs of the analysed entities are counted so
    # the ratio cannot exceed 1.0.
    involved = set()
    for pair in pairs:
        involved.add(pair["entity_a"])
        involved.add(pair["entity_b"])
    involved &= {e.slug for e in entities}

    return RedundancyReport(
        similar_pairs=pairs,
        redundancy_ratio=len(involved) / n,
        entity_count=n,
    )