""" C2 — Coverage completeness. **What this measures** Builds a binary *domain × chapter* cross-table: rows are economic domains found across all entities, columns are source chapters. A cell is marked populated when at least one entity has that (domain, chapter) combination. coverage_ratio = populated_cells / (n_domains × n_chapters) This is a measure of how *uniformly* economic domains are distributed across source chapters, not of how richly entities connect to each other (that is C3 — Structural Coherence) and not of VSM competency-question answerability (that requires supplying ``extra_attributes`` with VSM system mappings, which the pipeline does not currently do). **Interpreting the ratio alone is misleading.** A single ratio cannot distinguish two structurally different situations: - *Healthy topic separation* — domains are locally dense within their book/section, sparse elsewhere. The matrix has clean block structure; low cross-chapter density per domain is *expected*. - *Fragmented extraction* — domains appear sporadically in all chapters, never strongly anchored anywhere. The matrix is uniformly thin everywhere. Both can produce the same ratio. Use the *per-domain density distribution* (``domain_densities``, ``density_std``, ``cross_cutting_ratio``) to distinguish them: - High ``density_std`` + bimodal distribution → healthy topic separation. - Low ``density_std`` + uniform distribution → potential fragmentation. - ``cross_cutting_ratio`` measures what fraction of domains span more than half the chapters — these are the foundational cross-cutting concepts. **Threshold note** A 0.50 threshold is appropriate for a focused single-topic corpus. For a heterogeneous multi-book corpus (e.g. all five books of The Wealth of Nations), domains introduced in later books create empty cells for all earlier chapters, causing the ratio to fall below 0.50 even for structurally healthy corpora. Consider 0.30–0.40 for large, multi-topic corpora. """ from __future__ import annotations import math import statistics from dataclasses import dataclass, field from typing import Any, Dict, List, Optional from markitect.infospace.models import EntityMeta from markitect.analysis.fca import FormalContext, find_empty_cells, find_gap_concepts @dataclass class CoverageReport: """Results from coverage analysis. Attributes: coverage_ratio: Fraction of (domain, chapter) cells that are populated. See module docstring for interpretation notes. domain_densities: Per-domain fraction of chapters that contain at least one entity with that domain. Keys are domain names. density_std: Standard deviation of ``domain_densities`` values. High std suggests healthy topic separation; low std suggests uniform but thin coverage. cross_cutting_ratio: Fraction of domains that appear in more than 50 % of source chapters. These are the foundational concepts. empty_cells: List of ``{dimension_a, dimension_b}`` dicts for each unpopulated (domain, chapter) cell. gap_concepts: FCA gap concepts — attribute combinations present in the lattice but with no entity. domain_counts: Total entity count per domain. entity_count: Total number of entities analysed. """ coverage_ratio: float = 0.0 domain_densities: Dict[str, float] = field(default_factory=dict) density_std: float = 0.0 cross_cutting_ratio: float = 0.0 empty_cells: List[dict] = field(default_factory=list) gap_concepts: List[dict] = field(default_factory=list) domain_counts: Dict[str, int] = field(default_factory=dict) entity_count: int = 0 def to_dict(self) -> dict: return { "concern": "C2", "coverage_ratio": round(self.coverage_ratio, 4), "domain_densities": {k: round(v, 4) for k, v in self.domain_densities.items()}, "density_std": round(self.density_std, 4), "cross_cutting_ratio": round(self.cross_cutting_ratio, 4), "empty_cells": self.empty_cells, "gap_concepts_count": len(self.gap_concepts), "domain_counts": self.domain_counts, "entity_count": self.entity_count, } def _extract_attributes(entity: EntityMeta) -> set[str]: """Extract FCA attributes from an entity.""" attrs: set[str] = set() if entity.domain: attrs.add(f"domain:{entity.domain}") if entity.source_chapter: attrs.add(f"chapter:{entity.source_chapter}") return attrs def check_coverage( entities: List[EntityMeta], extra_attributes: Optional[Dict[str, set[str]]] = None, ) -> CoverageReport: """Check coverage completeness using FCA gap analysis. Args: entities: Entity metadata list. extra_attributes: Optional ``{slug: {attr, ...}}`` to merge with auto-extracted attributes (e.g. VSM mappings). Returns: :class:`CoverageReport` with gaps and coverage ratio. """ n = len(entities) if n == 0: return CoverageReport() # Build entity → attributes mapping entity_attrs: Dict[str, set[str]] = {} for e in entities: attrs = _extract_attributes(e) if extra_attributes and e.slug in extra_attributes: attrs.update(extra_attributes[e.slug]) entity_attrs[e.slug] = attrs # Domain counts domain_counts: Dict[str, int] = {} for e in entities: d = e.domain or "(unspecified)" domain_counts[d] = domain_counts.get(d, 0) + 1 # Build FCA context context = FormalContext.from_dict(entity_attrs) # Cross-tabulation: domain × chapter domains = sorted({a for attrs in entity_attrs.values() for a in attrs if a.startswith("domain:")}) chapters = sorted({a for attrs in entity_attrs.values() for a in attrs if a.startswith("chapter:")}) empty = [] if domains and chapters: raw_empty = find_empty_cells(context, domains, chapters) empty = [{"dimension_a": a, "dimension_b": b} for a, b in raw_empty] # FCA gap concepts gaps = find_gap_concepts(context) gap_dicts = [ {"intent": sorted(g.intent), "extent_size": g.extent_size} for g in gaps if g.intent_size <= 4 # Only report manageable gaps ] # Coverage ratio: populated cells / total possible cells total_cells = len(domains) * len(chapters) if domains and chapters else 1 populated = total_cells - len(empty) ratio = populated / total_cells if total_cells > 0 else 0.0 # Per-domain density: fraction of chapters that contain this domain n_chapters = len(chapters) domain_densities: Dict[str, float] = {} if n_chapters > 0: empty_pairs = {(e["dimension_a"], e["dimension_b"]) for e in empty} for d in domains: populated_for_domain = sum( 1 for c in chapters if (d, c) not in empty_pairs ) domain_densities[d.removeprefix("domain:")] = populated_for_domain / n_chapters density_values = list(domain_densities.values()) density_std = statistics.stdev(density_values) if len(density_values) >= 2 else 0.0 cross_cutting_ratio = ( sum(1 for v in density_values if v > 0.5) / len(density_values) if density_values else 0.0 ) return CoverageReport( coverage_ratio=ratio, domain_densities=domain_densities, density_std=round(density_std, 6), cross_cutting_ratio=round(cross_cutting_ratio, 4), empty_cells=empty, gap_concepts=gap_dicts, domain_counts=domain_counts, entity_count=n, )