Files
markitect-main/markitect/infospace/checks/redundancy.py
tegwick 11585e6968 feat(infospace): add collection-level quality checks C1–C5 (S2.4)
Five concern checks: Redundancy (embedding/word overlap), Coverage
(FCA gap analysis), Coherence (graph connectivity), Consistency
(cycle detection), Granularity (Shannon entropy). Orchestrator runs
all or selected checks, CLI `markitect infospace check` command added.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 01:54:22 +01:00

99 lines
3.0 KiB
Python

"""
C1 — Redundancy detection.
Uses embedding similarity (or, when no embeddings are supplied, word
overlap between entity definitions) to find entity pairs with
overlapping meanings that may be candidates for merging.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Dict, List, Optional
from markitect.infospace.models import EntityMeta
from markitect.llm.similarity import find_similar_pairs
@dataclass
class RedundancyReport:
    """Outcome of the C1 redundancy check over a collection of entities."""

    # One dict per flagged pair of entities.
    similar_pairs: List[dict] = field(default_factory=list)
    # Fraction of entities that take part in at least one flagged pair.
    redundancy_ratio: float = 0.0
    # Number of entities that were examined.
    entity_count: int = 0

    def to_dict(self) -> dict:
        """Return the report as a plain dict, with the ratio rounded to 4 places."""
        rounded_ratio = round(self.redundancy_ratio, 4)
        return dict(
            concern="C1",
            redundancy_ratio=rounded_ratio,
            similar_pairs=self.similar_pairs,
            entity_count=self.entity_count,
        )
def check_redundancy(
    entities: List[EntityMeta],
    embeddings: Optional[Dict[str, list[float]]] = None,
    threshold: float = 0.85,
) -> RedundancyReport:
    """Check for redundant entities.

    Two strategies are available:

    * **Embedding similarity** -- used when ``embeddings`` is non-empty.
      Pairs are obtained from ``find_similar_pairs(embeddings, threshold=...)``
      and reported with ``"method": "embedding"``.
    * **Definition word overlap** -- fallback when ``embeddings`` is ``None``
      or empty. Each definition is lower-cased and split on whitespace; two
      entities are compared with the overlap coefficient
      ``|A & B| / min(|A|, |B|)``. Entities without a definition are never
      paired. Pairs are reported with ``"method": "word_overlap"``.

    Args:
        entities: Entity metadata list. Each item must expose ``slug`` and
            ``definition``.
        embeddings: Pre-computed ``{slug: vector}`` mapping, or ``None``.
        threshold: Minimum similarity (inclusive in the word-overlap path)
            for a pair to be flagged.

    Returns:
        :class:`RedundancyReport` holding the flagged pairs, the number of
        entities, and ``redundancy_ratio`` -- the fraction of ``entities``
        that appear in at least one flagged pair. With fewer than two
        entities an empty report is returned.
    """
    n = len(entities)
    if n < 2:
        # A pair needs at least two entities.
        return RedundancyReport(entity_count=n)

    pairs: list[dict] = []
    if embeddings:
        # Embedding-based similarity; the helper is expected to yield
        # (slug_a, slug_b, similarity) triples.
        for slug_a, slug_b, sim in find_similar_pairs(embeddings, threshold=threshold):
            pairs.append({
                "entity_a": slug_a,
                "entity_b": slug_b,
                "similarity": round(sim, 4),
                "method": "embedding",
            })
    else:
        # Fallback: structural overlap (shared definition words).
        slug_to_words = {
            e.slug: set(e.definition.lower().split()) if e.definition else set()
            for e in entities
        }
        # Sorted slugs give a deterministic pair order and entity_a < entity_b.
        slugs = sorted(slug_to_words)
        for i, a in enumerate(slugs):
            wa = slug_to_words[a]
            if not wa:
                continue  # no definition words: cannot overlap with anything
            for b in slugs[i + 1:]:
                wb = slug_to_words[b]
                if not wb:
                    continue
                overlap = len(wa & wb) / min(len(wa), len(wb))
                if overlap >= threshold:
                    pairs.append({
                        "entity_a": a,
                        "entity_b": b,
                        "similarity": round(overlap, 4),
                        "method": "word_overlap",
                    })

    # redundancy_ratio: fraction of entities involved in similar pairs.
    involved = set()
    for p in pairs:
        involved.add(p["entity_a"])
        involved.add(p["entity_b"])
    # ``embeddings`` may carry slugs that are not part of ``entities``; count
    # only known entities so the ratio cannot exceed 1.0.
    involved &= {e.slug for e in entities}
    ratio = len(involved) / n  # n >= 2 here, so the division is safe

    return RedundancyReport(
        similar_pairs=pairs,
        redundancy_ratio=ratio,
        entity_count=n,
    )