# markitect-main/markitect/infospace/checks/redundancy.py

"""
C1 — Redundancy detection.

Uses embedding similarity to find entity pairs with overlapping
meanings that may be candidates for merging.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Dict, List, Optional

from markitect.infospace.models import EntityMeta
from markitect.llm.similarity import find_similar_pairs


@dataclass
class RedundancyReport:
    """Outcome of the C1 redundancy check."""

    # One dict per flagged pair; empty list by default.
    similar_pairs: List[dict] = field(default_factory=list)
    # Redundancy ratio for the analysed entities; 0.0 by default.
    redundancy_ratio: float = 0.0
    # Number of entities that were analysed; 0 by default.
    entity_count: int = 0

    def to_dict(self) -> dict:
        """Return the report as a plain dict tagged with concern ``C1``.

        The ratio is rounded to four decimal places. The pair list is
        passed through as the same object (it is not copied).
        """
        rounded_ratio = round(self.redundancy_ratio, 4)
        payload: dict = {"concern": "C1"}
        payload["redundancy_ratio"] = rounded_ratio
        payload["similar_pairs"] = self.similar_pairs
        payload["entity_count"] = self.entity_count
        return payload


def check_redundancy(
    entities: List[EntityMeta],
    embeddings: Optional[Dict[str, list[float]]] = None,
    threshold: float = 0.85,
) -> RedundancyReport:
    """Check for redundant entities using embedding similarity.

    Args:
        entities: Entity metadata list.
        embeddings: Pre-computed ``{slug: vector}`` mapping.
            If ``None`` or empty, redundancy is checked structurally by
            comparing the word sets of the entities' definitions.
        threshold: Similarity threshold for flagging pairs. In the
            word-overlap fallback a pair is flagged when its overlap is
            greater than or equal to this value.

    Returns:
        :class:`RedundancyReport` with similar pairs and ratio. The ratio
        is the number of distinct slugs from ``entities`` that appear in
        at least one flagged pair, divided by ``len(entities)``. With
        fewer than two entities an empty report is returned.
    """
    n = len(entities)
    if n < 2:
        # A pair needs at least two entities; nothing to compare.
        return RedundancyReport(entity_count=n)

    if embeddings:
        # Embedding-based similarity
        pairs: list[dict] = [
            {
                "entity_a": slug_a,
                "entity_b": slug_b,
                "similarity": round(sim, 4),
                "method": "embedding",
            }
            for slug_a, slug_b, sim in find_similar_pairs(
                embeddings, threshold=threshold
            )
        ]
    else:
        # Fallback: structural overlap (shared definition words)
        pairs = _word_overlap_pairs(entities, threshold)

    # redundancy_ratio: fraction of entities involved in similar pairs.
    # Pairs from the embedding path are keyed by whatever slugs the
    # embeddings mapping holds; only slugs that belong to ``entities`` are
    # counted so the ratio cannot exceed 1.0.
    known_slugs = {e.slug for e in entities}
    involved = set()
    for p in pairs:
        involved.add(p["entity_a"])
        involved.add(p["entity_b"])
    ratio = len(involved & known_slugs) / n  # n >= 2 here

    return RedundancyReport(
        similar_pairs=pairs,
        redundancy_ratio=ratio,
        entity_count=n,
    )


def _word_overlap_pairs(entities: List[EntityMeta], threshold: float) -> list[dict]:
    """Return pairs whose definition word sets overlap by at least *threshold*.

    Overlap is ``|A & B| / min(|A|, |B|)`` over lower-cased,
    whitespace-split definition words. Entities with an empty or missing
    definition are never paired. Pairs are emitted in sorted slug order.
    """
    slug_to_words = {
        e.slug: set(e.definition.lower().split()) if e.definition else set()
        for e in entities
    }

    pairs: list[dict] = []
    slugs = sorted(slug_to_words)
    for i, a in enumerate(slugs):
        wa = slug_to_words[a]
        if not wa:
            continue
        for b in slugs[i + 1:]:
            wb = slug_to_words[b]
            if not wb:
                continue
            overlap = len(wa & wb) / min(len(wa), len(wb))
            if overlap >= threshold:
                pairs.append({
                    "entity_a": a,
                    "entity_b": b,
                    "similarity": round(overlap, 4),
                    "method": "word_overlap",
                })
    return pairs