feat(infospace): add collection-level quality checks C1–C5 (S2.4)
Five concern checks: Redundancy (embedding/word overlap), Coverage (FCA gap analysis), Coherence (graph connectivity), Consistency (cycle detection), Granularity (Shannon entropy). The orchestrator runs all or selected checks, and a CLI `markitect infospace check` command is added. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
98
markitect/infospace/checks/granularity.py
Normal file
98
markitect/infospace/checks/granularity.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""
|
||||
C5 — Granularity balance.
|
||||
|
||||
Checks that entities are at a consistent level of abstraction,
|
||||
measured by word count distribution and Shannon entropy of domain
|
||||
assignments.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List
|
||||
|
||||
from markitect.infospace.models import EntityMeta
|
||||
|
||||
|
||||
@dataclass
class GranularityReport:
    """Container for the metrics produced by the C5 granularity check.

    Every field has a default, so a bare ``GranularityReport()`` is a
    valid "nothing measured" report.
    """

    # Entropy value computed over the domain distribution.
    domain_entropy: float = 0.0
    # Named word-count statistics (statistic name -> value).
    word_count_stats: Dict[str, float] = field(default_factory=dict)
    # Number of entities counted per domain label.
    domain_distribution: Dict[str, int] = field(default_factory=dict)
    # Total number of entities the report covers.
    entity_count: int = 0

    def to_dict(self) -> dict:
        """Return the report as a plain dict tagged with concern ``"C5"``.

        The entropy is rounded to 4 decimal places and each word-count
        statistic to 2. ``domain_distribution`` is passed through as the
        same dict object held by the report (it is not copied).
        """
        rounded_stats = {}
        for stat_name, stat_value in self.word_count_stats.items():
            rounded_stats[stat_name] = round(stat_value, 2)

        payload = {
            "concern": "C5",
            "domain_entropy": round(self.domain_entropy, 4),
            "word_count_stats": rounded_stats,
            "domain_distribution": self.domain_distribution,
            "entity_count": self.entity_count,
        }
        return payload
|
||||
|
||||
|
||||
def _shannon_entropy(counts: Dict[str, int]) -> float:
|
||||
"""Compute Shannon entropy of a distribution."""
|
||||
total = sum(counts.values())
|
||||
if total == 0:
|
||||
return 0.0
|
||||
entropy = 0.0
|
||||
for count in counts.values():
|
||||
if count > 0:
|
||||
p = count / total
|
||||
entropy -= p * math.log2(p)
|
||||
return entropy
|
||||
|
||||
|
||||
def check_granularity(entities: List[EntityMeta]) -> GranularityReport:
    """Check granularity balance across entities.

    Metrics:
    - Domain entropy: Shannon entropy (in bits) of the number of entities
      per domain; higher = more balanced distribution. The value is not
      normalized, so its upper bound is ``log2`` of the number of
      distinct domains.
    - Word count statistics: mean, min, max and population std dev of
      each entity's ``definition_word_count``.

    Args:
        entities: Entity metadata list.

    Returns:
        :class:`GranularityReport` with balance metrics. An empty
        ``entities`` list yields a default-constructed report.
    """
    n = len(entities)
    if n == 0:
        return GranularityReport()

    # Domain distribution; entities with a falsy domain share one bucket.
    domain_counts: Dict[str, int] = {}
    for e in entities:
        d = e.domain or "(unspecified)"
        domain_counts[d] = domain_counts.get(d, 0) + 1

    entropy = _shannon_entropy(domain_counts)

    # Word count statistics. The early return above guarantees at least
    # one entity, so ``word_counts`` is never empty and ``n`` is a safe
    # divisor.
    word_counts = [e.definition_word_count for e in entities]

    mean_wc = sum(word_counts) / n
    min_wc = min(word_counts)
    max_wc = max(word_counts)
    # Population variance (divide by n, not n - 1).
    variance = sum((wc - mean_wc) ** 2 for wc in word_counts) / n
    std_wc = math.sqrt(variance)

    return GranularityReport(
        domain_entropy=entropy,
        word_count_stats={
            "mean": mean_wc,
            "min": float(min_wc),
            "max": float(max_wc),
            "std": std_wc,
        },
        domain_distribution=domain_counts,
        entity_count=n,
    )
|
||||
Reference in New Issue
Block a user