Five concern checks: Redundancy (embedding/word overlap), Coverage (FCA gap analysis), Coherence (graph connectivity), Consistency (cycle detection), Granularity (Shannon entropy). Orchestrator runs all or selected checks, CLI `markitect infospace check` command added. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
99 lines
2.6 KiB
Python
99 lines
2.6 KiB
Python
"""
|
|
C5 — Granularity balance.
|
|
|
|
Checks that entities are at a consistent level of abstraction,
|
|
measured by word count distribution and Shannon entropy of domain
|
|
assignments.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import math
|
|
from dataclasses import dataclass, field
|
|
from typing import Dict, List
|
|
|
|
from markitect.infospace.models import EntityMeta
|
|
|
|
|
|
@dataclass
class GranularityReport:
    """Container for the metrics produced by the C5 granularity check.

    Every field has a default, so ``GranularityReport()`` is a valid
    "nothing measured" report.
    """

    # Entropy value for the domain distribution; rounded to 4 places on export.
    domain_entropy: float = 0.0
    # Named word-count statistics; each value is rounded to 2 places on export.
    word_count_stats: Dict[str, float] = field(default_factory=dict)
    # Number of entities counted per domain label.
    domain_distribution: Dict[str, int] = field(default_factory=dict)
    # Total number of entities the report covers.
    entity_count: int = 0

    def to_dict(self) -> dict:
        """Return the report as a plain dict tagged with concern id ``"C5"``.

        Floats are rounded for presentation (entropy to 4 decimals, word
        count statistics to 2). ``domain_distribution`` is passed through
        as the same dict object, not a copy.
        """
        rounded_stats = {}
        for label, value in self.word_count_stats.items():
            rounded_stats[label] = round(value, 2)

        payload: dict = {"concern": "C5"}
        payload["domain_entropy"] = round(self.domain_entropy, 4)
        payload["word_count_stats"] = rounded_stats
        payload["domain_distribution"] = self.domain_distribution
        payload["entity_count"] = self.entity_count
        return payload
|
|
|
|
|
|
def _shannon_entropy(counts: Dict[str, int]) -> float:
|
|
"""Compute Shannon entropy of a distribution."""
|
|
total = sum(counts.values())
|
|
if total == 0:
|
|
return 0.0
|
|
entropy = 0.0
|
|
for count in counts.values():
|
|
if count > 0:
|
|
p = count / total
|
|
entropy -= p * math.log2(p)
|
|
return entropy
|
|
|
|
|
|
def check_granularity(entities: List[EntityMeta]) -> GranularityReport:
    """Check granularity balance across entities.

    Metrics:
      - Domain entropy: Shannon entropy of how entities are spread over
        their ``domain`` values; higher = more balanced distribution.
        Entities with a falsy ``domain`` are grouped under
        ``"(unspecified)"``.
      - Word count statistics: mean, min, max and population standard
        deviation of each entity's ``definition_word_count``.

    Args:
        entities: Entity metadata list.

    Returns:
        :class:`GranularityReport` with balance metrics. An empty
        ``entities`` list yields a default (all-zero / empty) report.
    """
    n = len(entities)
    if n == 0:
        return GranularityReport()

    # Domain distribution
    domain_counts: Dict[str, int] = {}
    for e in entities:
        d = e.domain or "(unspecified)"
        domain_counts[d] = domain_counts.get(d, 0) + 1

    entropy = _shannon_entropy(domain_counts)

    # Word count statistics. ``entities`` is known to be non-empty here, so
    # ``word_counts`` always has ``n`` items and needs no empty fallback.
    word_counts = [e.definition_word_count for e in entities]

    mean_wc = sum(word_counts) / n
    min_wc = min(word_counts)
    max_wc = max(word_counts)
    # Population variance: divide by n, not n - 1.
    variance = sum((wc - mean_wc) ** 2 for wc in word_counts) / n
    std_wc = math.sqrt(variance)

    return GranularityReport(
        domain_entropy=entropy,
        word_count_stats={
            "mean": mean_wc,
            "min": float(min_wc),
            "max": float(max_wc),
            "std": std_wc,
        },
        domain_distribution=domain_counts,
        entity_count=n,
    )
|