Five concern checks: Redundancy (embedding/word overlap), Coverage (FCA gap analysis), Coherence (graph connectivity), Consistency (cycle detection), Granularity (Shannon entropy). Orchestrator runs all or selected checks, CLI `markitect infospace check` command added. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
99 lines
3.0 KiB
Python
99 lines
3.0 KiB
Python
"""
C1 — Redundancy detection.

Uses embedding similarity to find entity pairs with overlapping
meanings that may be candidates for merging. When no embeddings are
supplied, falls back to word overlap between entity definitions.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from typing import Dict, List, Optional
|
|
|
|
from markitect.infospace.models import EntityMeta
|
|
from markitect.llm.similarity import find_similar_pairs
|
|
|
|
|
|
@dataclass
class RedundancyReport:
    """Outcome of the C1 (redundancy) concern check.

    Instances are plain value holders; ``to_dict`` produces the
    serialisable form.
    """

    # One dict per flagged pair of entities.
    similar_pairs: List[dict] = field(default_factory=list)
    # Fraction of entities that appear in at least one flagged pair.
    redundancy_ratio: float = 0.0
    # Number of entities that were analysed.
    entity_count: int = 0

    def to_dict(self) -> dict:
        """Return the report as a dict tagged with concern id ``"C1"``.

        The ratio is rounded to four decimal places; the pair list is
        passed through as-is (not copied).
        """
        payload: dict = {"concern": "C1"}
        payload["redundancy_ratio"] = round(self.redundancy_ratio, 4)
        payload["similar_pairs"] = self.similar_pairs
        payload["entity_count"] = self.entity_count
        return payload
|
|
|
|
|
|
def _word_overlap_pairs(entities: List[EntityMeta], threshold: float) -> list[dict]:
    """Return pairs of entities whose definitions share enough words.

    Similarity is ``|A & B| / min(|A|, |B|)`` over the lower-cased,
    whitespace-split word sets of the two definitions.
    """
    # If several entities share a slug, the last one's definition wins.
    words_by_slug: Dict[str, set] = {
        e.slug: (set(e.definition.lower().split()) if e.definition else set())
        for e in entities
    }

    # Entities without any definition words can never match, so drop them
    # once here instead of re-checking inside the pair loop.
    slugs = sorted(slug for slug, words in words_by_slug.items() if words)

    pairs: list[dict] = []
    for i, a in enumerate(slugs):
        words_a = words_by_slug[a]
        for b in slugs[i + 1:]:
            words_b = words_by_slug[b]
            overlap = len(words_a & words_b) / min(len(words_a), len(words_b))
            if overlap >= threshold:
                pairs.append({
                    "entity_a": a,
                    "entity_b": b,
                    "similarity": round(overlap, 4),
                    "method": "word_overlap",
                })
    return pairs


def check_redundancy(
    entities: List[EntityMeta],
    embeddings: Optional[Dict[str, list[float]]] = None,
    threshold: float = 0.85,
) -> RedundancyReport:
    """Check for redundant entities (concern C1).

    Two detection modes exist:

    * Embedding mode, used when ``embeddings`` is a non-empty mapping.
      Pairs are taken from ``find_similar_pairs`` and tagged with
      ``"method": "embedding"``.
    * Word-overlap mode, used when ``embeddings`` is ``None`` or empty.
      Pairs are found by comparing the words of each entity's
      ``definition`` and tagged with ``"method": "word_overlap"``.

    Args:
        entities: Entity metadata list.
        embeddings: Pre-computed ``{slug: vector}`` mapping, or ``None``
            to use the word-overlap fallback.
        threshold: Minimum similarity (inclusive in word-overlap mode)
            for a pair to be flagged. The same value is used in both modes.

    Returns:
        :class:`RedundancyReport` with the flagged pairs, the fraction of
        ``entities`` that appear in at least one pair, and the entity count.
    """
    n = len(entities)
    if n < 2:
        # A pair needs two entities, so nothing can be redundant.
        return RedundancyReport(entity_count=n)

    if embeddings:
        pairs: list[dict] = [
            {
                "entity_a": slug_a,
                "entity_b": slug_b,
                "similarity": round(sim, 4),
                "method": "embedding",
            }
            for slug_a, slug_b, sim in find_similar_pairs(
                embeddings, threshold=threshold
            )
        ]
    else:
        pairs = _word_overlap_pairs(entities, threshold)

    # redundancy_ratio: fraction of entities involved in similar pairs.
    # Only slugs that belong to ``entities`` are counted: ``embeddings`` may
    # carry extra slugs, which would otherwise push the ratio above 1.0.
    known_slugs = {e.slug for e in entities}
    involved = {
        slug
        for pair in pairs
        for slug in (pair["entity_a"], pair["entity_b"])
        if slug in known_slugs
    }

    return RedundancyReport(
        similar_pairs=pairs,
        redundancy_ratio=len(involved) / n,  # n >= 2 here, no zero division
        entity_count=n,
    )
|