""" C5 — Granularity balance. Checks that entities are at a consistent level of abstraction, measured by word count distribution and Shannon entropy of domain assignments. """ from __future__ import annotations import math from dataclasses import dataclass, field from typing import Dict, List from markitect.infospace.models import EntityMeta @dataclass class GranularityReport: """Results from granularity analysis.""" domain_entropy: float = 0.0 word_count_stats: Dict[str, float] = field(default_factory=dict) domain_distribution: Dict[str, int] = field(default_factory=dict) entity_count: int = 0 def to_dict(self) -> dict: return { "concern": "C5", "domain_entropy": round(self.domain_entropy, 4), "word_count_stats": { k: round(v, 2) for k, v in self.word_count_stats.items() }, "domain_distribution": self.domain_distribution, "entity_count": self.entity_count, } def _shannon_entropy(counts: Dict[str, int]) -> float: """Compute Shannon entropy of a distribution.""" total = sum(counts.values()) if total == 0: return 0.0 entropy = 0.0 for count in counts.values(): if count > 0: p = count / total entropy -= p * math.log2(p) return entropy def check_granularity(entities: List[EntityMeta]) -> GranularityReport: """Check granularity balance across entities. Metrics: - Domain entropy: higher = more balanced distribution. - Word count statistics: mean, min, max, std dev. Args: entities: Entity metadata list. Returns: :class:`GranularityReport` with balance metrics. """ n = len(entities) if n == 0: return GranularityReport() # Domain distribution domain_counts: Dict[str, int] = {} for e in entities: d = e.domain or "(unspecified)" domain_counts[d] = domain_counts.get(d, 0) + 1 entropy = _shannon_entropy(domain_counts) # Word count statistics word_counts = [e.definition_word_count for e in entities] if not word_counts: word_counts = [0] mean_wc = sum(word_counts) / len(word_counts) min_wc = min(word_counts) max_wc = max(word_counts) variance = sum((wc - mean_wc) ** 2 for wc in word_counts) / len(word_counts) std_wc = math.sqrt(variance) return GranularityReport( domain_entropy=entropy, word_count_stats={ "mean": mean_wc, "min": float(min_wc), "max": float(max_wc), "std": std_wc, }, domain_distribution=domain_counts, entity_count=n, )