markitect-main/markitect/infospace/checks/coverage.py

"""
C2 — Coverage completeness.

**What this measures**

Builds a binary *domain × chapter* cross-table: rows are economic domains
found across all entities, columns are source chapters.  A cell is marked
populated when at least one entity has that (domain, chapter) combination.

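    coverage_ratio = populated_cells / (n_domains × n_chapters)

If no entity carries a domain attribute, or none carries a chapter attribute,
the table has no cells and ``coverage_ratio`` is reported as 1.0.  An empty
entity list yields a default report whose ratio is 0.0.
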
This is a measure of how *uniformly* economic domains are distributed across
source chapters, not of how richly entities connect to each other (that is
C3 — Structural Coherence) and not of VSM competency-question answerability
(that requires supplying ``extra_attributes`` with VSM system mappings, which
the pipeline does not currently do).

**Interpreting the ratio alone is misleading.**  A single ratio cannot
distinguish two structurally different situations:

- *Healthy topic separation* — domains are locally dense within their
  book/section, sparse elsewhere.  The matrix has clean block structure;
  low cross-chapter density per domain is *expected*.
- *Fragmented extraction* — domains appear sporadically in all chapters,
  never strongly anchored anywhere.  The matrix is uniformly thin everywhere.

Both can produce the same ratio.  Use the *per-domain density distribution*
(``domain_densities``, ``density_std``, ``cross_cutting_ratio``) to
distinguish them:

- High ``density_std`` + bimodal distribution → healthy topic separation.
- Low ``density_std`` + uniform distribution → potential fragmentation.
- ``cross_cutting_ratio`` measures what fraction of domains span more than
  half the chapters — these are the foundational cross-cutting concepts.

**Threshold note**

A 0.50 threshold is appropriate for a focused single-topic corpus.  For a
heterogeneous multi-book corpus (e.g. all five books of The Wealth of
Nations), domains introduced in later books create empty cells for all
earlier chapters, causing the ratio to fall below 0.50 even for structurally
healthy corpora.  Consider 0.30–0.40 for large, multi-topic corpora.
"""

from __future__ import annotations

import math
import statistics
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

from markitect.infospace.models import EntityMeta
from markitect.analysis.fca import FormalContext, find_empty_cells, find_gap_concepts


@dataclass
class CoverageReport:
    """Outcome of the C2 coverage-completeness check.

    Attributes:
        coverage_ratio: Share of (domain, chapter) cells that hold at least
            one entity.  The module docstring explains how to read it.
        domain_densities: Maps each domain name to the fraction of chapters
            in which at least one entity of that domain occurs.
        density_std: Standard deviation over the ``domain_densities``
            values.  A high value points to healthy topic separation, a
            low value to uniform but thin coverage.
        cross_cutting_ratio: Share of domains present in more than 50 % of
            the source chapters — the foundational concepts.
        empty_cells: One ``{dimension_a, dimension_b}`` dict per
            (domain, chapter) cell that has no entity.
        gap_concepts: FCA gap concepts, i.e. attribute combinations that
            exist in the lattice without any entity.
        domain_counts: Number of entities per domain.
        entity_count: Number of entities that were analysed.
    """

    coverage_ratio: float = 0.0
    domain_densities: Dict[str, float] = field(default_factory=dict)
    density_std: float = 0.0
    cross_cutting_ratio: float = 0.0
    empty_cells: List[dict] = field(default_factory=list)
    gap_concepts: List[dict] = field(default_factory=list)
    domain_counts: Dict[str, int] = field(default_factory=dict)
    entity_count: int = 0

    def to_dict(self) -> dict:
        """Return a plain-dict summary of the report.

        Float metrics are rounded to four decimal places and the gap
        concepts are reduced to their count.  ``empty_cells`` and
        ``domain_counts`` are passed through as the same objects held by
        the report, not copies.
        """
        rounded_densities = {}
        for name, density in self.domain_densities.items():
            rounded_densities[name] = round(density, 4)

        # Keys are inserted in a fixed order so serialised output is stable.
        payload: dict = {"concern": "C2"}
        payload["coverage_ratio"] = round(self.coverage_ratio, 4)
        payload["domain_densities"] = rounded_densities
        payload["density_std"] = round(self.density_std, 4)
        payload["cross_cutting_ratio"] = round(self.cross_cutting_ratio, 4)
        payload["empty_cells"] = self.empty_cells
        payload["gap_concepts_count"] = len(self.gap_concepts)
        payload["domain_counts"] = self.domain_counts
        payload["entity_count"] = self.entity_count
        return payload


def _extract_attributes(entity: EntityMeta) -> set[str]:
    """Build the set of FCA attributes for a single entity.

    Emits ``domain:<value>`` when ``entity.domain`` is truthy and
    ``chapter:<value>`` when ``entity.source_chapter`` is truthy.  An entity
    with neither yields an empty set.
    """
    tagged_values = (
        ("domain", entity.domain),
        ("chapter", entity.source_chapter),
    )
    # Falsy values (None, empty string) contribute no attribute.
    return {f"{prefix}:{value}" for prefix, value in tagged_values if value}


def _coverage_ratio(n_domains: int, n_chapters: int, n_empty: int) -> float:
    """Return the populated fraction of an ``n_domains × n_chapters`` table."""
    total_cells = n_domains * n_chapters
    if total_cells == 0:
        # NOTE(review): with no domain or no chapter attributes there is no
        # table to measure.  The ratio is reported as 1.0 (vacuously
        # complete) to keep the long-standing behaviour; confirm that
        # callers comparing against a threshold really want this to pass.
        return 1.0
    return (total_cells - n_empty) / total_cells


def _domain_densities(
    domains: List[str],
    chapters: List[str],
    empty_cells: List[dict],
) -> Dict[str, float]:
    """Return, per domain, the fraction of chapters in which it is populated.

    Keys are the domain attributes with their ``domain:`` prefix stripped.
    An empty dict is returned when there are no chapters.
    """
    if not chapters:
        return {}
    unpopulated = {
        (cell["dimension_a"], cell["dimension_b"]) for cell in empty_cells
    }
    n_chapters = len(chapters)
    return {
        domain.removeprefix("domain:"): (
            sum(1 for chapter in chapters if (domain, chapter) not in unpopulated)
            / n_chapters
        )
        for domain in domains
    }


def check_coverage(
    entities: List[EntityMeta],
    extra_attributes: Optional[Dict[str, set[str]]] = None,
) -> CoverageReport:
    """Check coverage completeness using FCA gap analysis.

    Each entity contributes its auto-extracted ``domain:``/``chapter:``
    attributes (plus any ``extra_attributes`` for its slug) to a formal
    context.  From that context the function derives the domain × chapter
    empty cells, the gap concepts, the overall coverage ratio and the
    per-domain density statistics.

    Note that attributes are keyed by ``slug``: entities sharing a slug
    overwrite one another in the formal context (the last one wins), while
    each is still counted in ``domain_counts`` and ``entity_count``.

    Args:
        entities: Entity metadata list.
        extra_attributes: Optional ``{slug: {attr, ...}}`` to merge
            with auto-extracted attributes (e.g. VSM mappings).

    Returns:
        :class:`CoverageReport` with gaps and coverage ratio.  An empty
        ``entities`` list yields a default report (all metrics zero).  If
        no domain or no chapter attribute exists, ``coverage_ratio`` is 1.0
        and the density statistics are empty/zero.
    """
    n = len(entities)
    if n == 0:
        return CoverageReport()

    # One pass over the entities: slug → attribute set, and entity count per
    # domain (entities without a domain are grouped under "(unspecified)").
    entity_attrs: Dict[str, set[str]] = {}
    domain_counts: Dict[str, int] = {}
    for e in entities:
        attrs = _extract_attributes(e)
        if extra_attributes and e.slug in extra_attributes:
            attrs.update(extra_attributes[e.slug])
        entity_attrs[e.slug] = attrs

        label = e.domain or "(unspecified)"
        domain_counts[label] = domain_counts.get(label, 0) + 1

    # Build FCA context
    context = FormalContext.from_dict(entity_attrs)

    # Axes of the domain × chapter cross-table.  The union is computed once;
    # sorting keeps the report deterministic.
    all_attrs: set[str] = set().union(*entity_attrs.values())
    domains = sorted(a for a in all_attrs if a.startswith("domain:"))
    chapters = sorted(a for a in all_attrs if a.startswith("chapter:"))

    empty: List[dict] = []
    if domains and chapters:
        empty = [
            {"dimension_a": a, "dimension_b": b}
            for a, b in find_empty_cells(context, domains, chapters)
        ]

    # FCA gap concepts; only intents of at most four attributes are reported
    # so the list stays manageable.
    gap_dicts = [
        {"intent": sorted(g.intent), "extent_size": g.extent_size}
        for g in find_gap_concepts(context)
        if g.intent_size <= 4
    ]

    ratio = _coverage_ratio(len(domains), len(chapters), len(empty))
    domain_densities = _domain_densities(domains, chapters, empty)

    density_values = list(domain_densities.values())
    # statistics.stdev needs at least two data points.
    density_std = statistics.stdev(density_values) if len(density_values) >= 2 else 0.0
    cross_cutting_ratio = (
        sum(1 for v in density_values if v > 0.5) / len(density_values)
        if density_values else 0.0
    )

    return CoverageReport(
        coverage_ratio=ratio,
        domain_densities=domain_densities,
        density_std=round(density_std, 6),
        cross_cutting_ratio=round(cross_cutting_ratio, 4),
        empty_cells=empty,
        gap_concepts=gap_dicts,
        domain_counts=domain_counts,
        entity_count=n,
    )