Files
markitect-main/markitect/infospace/checks/coverage.py
tegwick dfe56a4f9b docs(metrics): clarify C2 coverage — domain×chapter matrix, not domain×VSM
- coverage.py: rewrite module docstring to explain what the metric actually
  computes (domain × chapter cross-tabulation, not VSM system coverage),
  what it does not capture (entity connectivity → C3), and when the
  threshold is appropriate
- CoverageReport: add domain_densities, density_std, cross_cutting_ratio
  for distribution-level insight beyond the aggregate ratio
- check_coverage: compute per-domain density and cross-cutting ratio
- METRICS-METHODOLOGY.md: correct C2 section to match implementation,
  document the distribution-based interpretation, add implementation status
  table distinguishing what is wired vs planned

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-20 00:08:46 +01:00

196 lines
7.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
C2 — Coverage completeness.
**What this measures**
Builds a binary *domain × chapter* cross-table: rows are economic domains
found across all entities, columns are source chapters. A cell is marked
populated when at least one entity has that (domain, chapter) combination.
coverage_ratio = populated_cells / (n_domains × n_chapters)
This is a measure of how *uniformly* economic domains are distributed across
source chapters, not of how richly entities connect to each other (that is
C3 — Structural Coherence) and not of VSM competency-question answerability
(that requires supplying ``extra_attributes`` with VSM system mappings, which
the pipeline does not currently do).
**Interpreting the ratio alone is misleading.** A single ratio cannot
distinguish two structurally different situations:
- *Healthy topic separation* — domains are locally dense within their
book/section, sparse elsewhere. The matrix has clean block structure;
low cross-chapter density per domain is *expected*.
- *Fragmented extraction* — domains appear sporadically in all chapters,
never strongly anchored anywhere. The matrix is uniformly thin everywhere.
Both can produce the same ratio. Use the *per-domain density distribution*
(``domain_densities``, ``density_std``, ``cross_cutting_ratio``) to
distinguish them:
- High ``density_std`` + bimodal distribution → healthy topic separation.
- Low ``density_std`` + uniform distribution → potential fragmentation.
- ``cross_cutting_ratio`` measures what fraction of domains span more than
half the chapters — these are the foundational cross-cutting concepts.
**Threshold note**
A 0.50 threshold is appropriate for a focused single-topic corpus. For a
heterogeneous multi-book corpus (e.g. all five books of The Wealth of
Nations), domains introduced in later books create empty cells for all
earlier chapters, causing the ratio to fall below 0.50 even for structurally
healthy corpora. Consider 0.30-0.40 for large, multi-topic corpora.
"""
from __future__ import annotations
import math
import statistics
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
from markitect.infospace.models import EntityMeta
from markitect.analysis.fca import FormalContext, find_empty_cells, find_gap_concepts
@dataclass
class CoverageReport:
    """Outcome of the C2 coverage analysis over a domain × chapter table.

    Attributes:
        coverage_ratio: Share of (domain, chapter) cells that are populated.
        domain_densities: Maps each domain name to the fraction of chapters
            in which that domain occurs at least once.
        density_std: Standard deviation of the ``domain_densities`` values.
            A high value points towards topic separation, a low value
            towards uniformly thin coverage.
        cross_cutting_ratio: Share of domains whose density exceeds 50 %.
        empty_cells: One ``{dimension_a, dimension_b}`` dict per
            unpopulated (domain, chapter) cell.
        gap_concepts: FCA gap concepts, i.e. attribute combinations that
            exist in the lattice without any entity.
        domain_counts: Number of entities per domain.
        entity_count: Number of entities that were analysed.
    """

    coverage_ratio: float = 0.0
    domain_densities: Dict[str, float] = field(default_factory=dict)
    density_std: float = 0.0
    cross_cutting_ratio: float = 0.0
    empty_cells: List[dict] = field(default_factory=list)
    gap_concepts: List[dict] = field(default_factory=list)
    domain_counts: Dict[str, int] = field(default_factory=dict)
    entity_count: int = 0

    def to_dict(self) -> dict:
        """Serialise the report to a plain dict tagged with concern ``C2``.

        Float metrics are rounded to four decimals, and ``gap_concepts`` is
        reduced to its length under the ``gap_concepts_count`` key.
        """
        rounded_densities = {}
        for domain, density in self.domain_densities.items():
            rounded_densities[domain] = round(density, 4)

        # Keys are inserted in the order consumers already see them.
        payload: dict = {"concern": "C2"}
        payload["coverage_ratio"] = round(self.coverage_ratio, 4)
        payload["domain_densities"] = rounded_densities
        payload["density_std"] = round(self.density_std, 4)
        payload["cross_cutting_ratio"] = round(self.cross_cutting_ratio, 4)
        payload["empty_cells"] = self.empty_cells
        payload["gap_concepts_count"] = len(self.gap_concepts)
        payload["domain_counts"] = self.domain_counts
        payload["entity_count"] = self.entity_count
        return payload
def _extract_attributes(entity: EntityMeta) -> set[str]:
    """Collect the FCA attribute labels carried by a single entity.

    A truthy ``domain`` contributes ``domain:<value>`` and a truthy
    ``source_chapter`` contributes ``chapter:<value>``; falsy fields
    contribute nothing. A new set is returned on every call.
    """
    domain_part = {f"domain:{entity.domain}"} if entity.domain else set()
    chapter_part = (
        {f"chapter:{entity.source_chapter}"} if entity.source_chapter else set()
    )
    return domain_part | chapter_part
def check_coverage(
    entities: List[EntityMeta],
    extra_attributes: Optional[Dict[str, set[str]]] = None,
) -> CoverageReport:
    """Check coverage completeness using FCA gap analysis.

    Each entity is mapped to a set of ``domain:<name>`` / ``chapter:<name>``
    attributes (plus any ``extra_attributes`` registered for its slug). The
    mapping is turned into a formal context, from which the empty cells of
    the domain × chapter cross-table and the FCA gap concepts are derived.

    Args:
        entities: Entity metadata list.
        extra_attributes: Optional ``{slug: {attr, ...}}`` to merge
            with auto-extracted attributes (e.g. VSM mappings).

    Returns:
        :class:`CoverageReport` with gaps and coverage ratio. An empty
        ``entities`` list yields a default report. When no ``domain:`` or
        no ``chapter:`` attribute is present the cross-table has no cells,
        so ``coverage_ratio`` is ``0.0`` and ``domain_densities`` is empty.
    """
    n = len(entities)
    if n == 0:
        return CoverageReport()

    # Build entity → attributes mapping and per-domain entity counts.
    entity_attrs: Dict[str, set[str]] = {}
    domain_counts: Dict[str, int] = {}
    for e in entities:
        attrs = _extract_attributes(e)
        if extra_attributes and e.slug in extra_attributes:
            attrs.update(extra_attributes[e.slug])
        entity_attrs[e.slug] = attrs
        d = e.domain or "(unspecified)"
        domain_counts[d] = domain_counts.get(d, 0) + 1

    # Build FCA context
    context = FormalContext.from_dict(entity_attrs)

    # Cross-tabulation axes: every distinct domain / chapter attribute.
    all_attrs: set[str] = set().union(*entity_attrs.values())
    domains = sorted(a for a in all_attrs if a.startswith("domain:"))
    chapters = sorted(a for a in all_attrs if a.startswith("chapter:"))
    n_chapters = len(chapters)
    total_cells = len(domains) * n_chapters

    # Empty cells are only defined when both axes are non-empty.
    empty_pairs: set = set()
    empty: List[dict] = []
    if total_cells:
        pairs = [(a, b) for a, b in find_empty_cells(context, domains, chapters)]
        empty = [{"dimension_a": a, "dimension_b": b} for a, b in pairs]
        empty_pairs = set(pairs)

    # FCA gap concepts
    gaps = find_gap_concepts(context)
    gap_dicts = [
        {"intent": sorted(g.intent), "extent_size": g.extent_size}
        for g in gaps
        if g.intent_size <= 4  # Only report manageable gaps
    ]

    # Coverage ratio: populated cells / total possible cells. A table
    # without cells is reported as 0.0 (matching the empty-input report)
    # rather than as fully covered.
    ratio = (total_cells - len(empty)) / total_cells if total_cells else 0.0

    # Per-domain density: fraction of chapters that contain this domain.
    domain_densities: Dict[str, float] = {}
    if total_cells:
        for d in domains:
            populated_for_domain = sum(
                1 for c in chapters if (d, c) not in empty_pairs
            )
            domain_densities[d.removeprefix("domain:")] = (
                populated_for_domain / n_chapters
            )

    density_values = list(domain_densities.values())
    # statistics.stdev needs at least two data points.
    density_std = (
        statistics.stdev(density_values) if len(density_values) >= 2 else 0.0
    )
    cross_cutting_ratio = (
        sum(1 for v in density_values if v > 0.5) / len(density_values)
        if density_values
        else 0.0
    )

    return CoverageReport(
        coverage_ratio=ratio,
        domain_densities=domain_densities,
        density_std=round(density_std, 6),
        cross_cutting_ratio=round(cross_cutting_ratio, 4),
        empty_cells=empty,
        gap_concepts=gap_dicts,
        domain_counts=domain_counts,
        entity_count=n,
    )