Files
markitect-main/markitect/infospace/checks/granularity.py
tegwick 11585e6968 feat(infospace): add collection-level quality checks C1–C5 (S2.4)
Five concern checks: Redundancy (embedding/word overlap), Coverage
(FCA gap analysis), Coherence (graph connectivity), Consistency
(cycle detection), Granularity (Shannon entropy). Orchestrator runs
all or selected checks, CLI `markitect infospace check` command added.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 01:54:22 +01:00

99 lines
2.6 KiB
Python

"""
C5 — Granularity balance.
Checks that entities are at a consistent level of abstraction,
measured by word count distribution and Shannon entropy of domain
assignments.
"""
from __future__ import annotations
import math
from dataclasses import dataclass, field
from typing import Dict, List
from markitect.infospace.models import EntityMeta
@dataclass
class GranularityReport:
    """Results from the C5 granularity analysis.

    Attributes:
        domain_entropy: Entropy of the domain distribution.
        word_count_stats: Named summary statistics of definition word
            counts (``check_granularity`` fills in ``mean``, ``min``,
            ``max`` and ``std``).
        domain_distribution: Number of entities per domain label.
        entity_count: Total number of entities analysed.
    """

    domain_entropy: float = 0.0
    word_count_stats: Dict[str, float] = field(default_factory=dict)
    domain_distribution: Dict[str, int] = field(default_factory=dict)
    entity_count: int = 0

    def to_dict(self) -> dict:
        """Return the report as a plain dictionary tagged with concern ``C5``.

        Entropy is rounded to 4 decimal places and each word-count statistic
        to 2. The returned mapping shares no mutable state with the report.
        """
        return {
            "concern": "C5",
            "domain_entropy": round(self.domain_entropy, 4),
            "word_count_stats": {
                name: round(value, 2)
                for name, value in self.word_count_stats.items()
            },
            # Copy so that callers editing the payload cannot alter the report.
            "domain_distribution": dict(self.domain_distribution),
            "entity_count": self.entity_count,
        }
def _shannon_entropy(counts: Dict[str, int]) -> float:
    """Return the Shannon entropy, in bits, of a frequency table.

    Args:
        counts: Mapping from category label to its number of occurrences.

    Returns:
        The base-2 entropy of the distribution, or ``0.0`` when the counts
        sum to zero. Categories with a non-positive count are skipped.
    """
    grand_total = sum(counts.values())
    if not grand_total:
        return 0.0

    bits = 0.0
    for occurrences in counts.values():
        if occurrences <= 0:
            # A category that never occurs contributes nothing.
            continue
        share = occurrences / grand_total
        bits = bits - share * math.log2(share)
    return bits
def check_granularity(entities: List[EntityMeta]) -> GranularityReport:
    """Check granularity balance across entities.

    Metrics:
        - Domain entropy: higher = more balanced distribution.
        - Word count statistics: mean, min, max and population standard
          deviation of ``definition_word_count``.

    Args:
        entities: Entity metadata list. Each entity's ``domain`` and
            ``definition_word_count`` attributes are read.

    Returns:
        :class:`GranularityReport` with balance metrics. An empty input
        yields a default (all-zero / empty) report.
    """
    n = len(entities)
    if n == 0:
        return GranularityReport()

    # Domain distribution; entities with a falsy domain share one bucket.
    domain_counts: Dict[str, int] = {}
    for entity in entities:
        label = entity.domain or "(unspecified)"
        domain_counts[label] = domain_counts.get(label, 0) + 1
    entropy = _shannon_entropy(domain_counts)

    # Word count statistics. The list is never empty here because the
    # n == 0 case returned above, so no placeholder value is needed.
    word_counts = [entity.definition_word_count for entity in entities]
    mean_wc = sum(word_counts) / n
    # Population variance (divide by N, not N - 1).
    variance = sum((wc - mean_wc) ** 2 for wc in word_counts) / n

    return GranularityReport(
        domain_entropy=entropy,
        word_count_stats={
            "mean": mean_wc,
            "min": float(min(word_counts)),
            "max": float(max(word_counts)),
            "std": math.sqrt(variance),
        },
        domain_distribution=domain_counts,
        entity_count=n,
    )