- coverage.py: rewrite module docstring to explain what the metric actually computes (domain × chapter cross-tabulation, not VSM system coverage), what it does not capture (entity connectivity → C3), and when the threshold is appropriate - CoverageReport: add domain_densities, density_std, cross_cutting_ratio for distribution-level insight beyond the aggregate ratio - check_coverage: compute per-domain density and cross-cutting ratio - METRICS-METHODOLOGY.md: correct C2 section to match implementation, document the distribution-based interpretation, add implementation status table distinguishing what is wired vs planned Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
196 lines
7.5 KiB
Python
196 lines
7.5 KiB
Python
"""
|
||
C2 — Coverage completeness.

**What this measures**

Builds a binary *domain × chapter* cross-table: rows are economic domains
found across all entities, columns are source chapters.  A cell is marked
populated when at least one entity has that (domain, chapter) combination.

    coverage_ratio = populated_cells / (n_domains × n_chapters)

This is a measure of how *uniformly* economic domains are distributed across
source chapters, not of how richly entities connect to each other (that is
C3 — Structural Coherence) and not of VSM competency-question answerability
(that requires supplying ``extra_attributes`` with VSM system mappings, which
the pipeline does not currently do).

**Interpreting the ratio alone is misleading.**  A single ratio cannot
distinguish two structurally different situations:

- *Healthy topic separation* — domains are locally dense within their
  book/section, sparse elsewhere.  The matrix has clean block structure;
  low cross-chapter density per domain is *expected*.
- *Fragmented extraction* — domains appear sporadically in all chapters,
  never strongly anchored anywhere.  The matrix is uniformly thin everywhere.

Both can produce the same ratio.  Use the *per-domain density distribution*
(``domain_densities``, ``density_std``, ``cross_cutting_ratio``) to
distinguish them:

- High ``density_std`` + bimodal distribution → healthy topic separation.
- Low ``density_std`` + uniform distribution → potential fragmentation.
- ``cross_cutting_ratio`` measures what fraction of domains span more than
  half the chapters — these are the foundational cross-cutting concepts.

**Threshold note**

A 0.50 threshold is appropriate for a focused single-topic corpus.  For a
heterogeneous multi-book corpus (e.g. all five books of The Wealth of
Nations), domains introduced in later books create empty cells for all
earlier chapters, causing the ratio to fall below 0.50 even for structurally
healthy corpora.  Consider 0.30–0.40 for large, multi-topic corpora.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import math
|
||
import statistics
|
||
from dataclasses import dataclass, field
|
||
from typing import Any, Dict, List, Optional
|
||
|
||
from markitect.infospace.models import EntityMeta
|
||
from markitect.analysis.fca import FormalContext, find_empty_cells, find_gap_concepts
|
||
|
||
|
||
@dataclass
class CoverageReport:
    """Results from coverage analysis.

    Attributes:
        coverage_ratio: Fraction of (domain, chapter) cells that are
            populated.
        domain_densities: Per-domain fraction of chapters that contain
            at least one entity with that domain.  Keys are domain names.
        density_std: Sample standard deviation of the ``domain_densities``
            values (``check_coverage`` computes it with
            ``statistics.stdev``).  High std suggests healthy topic
            separation; low std suggests uniform but thin coverage.
        cross_cutting_ratio: Fraction of domains that appear in more than
            50% of source chapters.  These are the foundational concepts.
        empty_cells: List of ``{dimension_a, dimension_b}`` dicts for each
            unpopulated (domain, chapter) cell.
        gap_concepts: FCA gap concepts — attribute combinations present in
            the lattice but with no entity.
        domain_counts: Total entity count per domain.
        entity_count: Total number of entities analysed.
    """

    coverage_ratio: float = 0.0
    domain_densities: Dict[str, float] = field(default_factory=dict)
    density_std: float = 0.0
    cross_cutting_ratio: float = 0.0
    empty_cells: List[dict] = field(default_factory=list)
    gap_concepts: List[dict] = field(default_factory=list)
    domain_counts: Dict[str, int] = field(default_factory=dict)
    entity_count: int = 0

    def to_dict(self) -> dict:
        """Return a serialisable snapshot of the report.

        Float metrics are rounded to four decimal places and the gap
        concepts are summarised by their count only.  Every container in
        the result is a fresh copy, so callers may sort, truncate or
        otherwise edit the returned structure without altering the report.
        """
        return {
            "concern": "C2",
            "coverage_ratio": round(self.coverage_ratio, 4),
            "domain_densities": {
                name: round(density, 4)
                for name, density in self.domain_densities.items()
            },
            "density_std": round(self.density_std, 4),
            "cross_cutting_ratio": round(self.cross_cutting_ratio, 4),
            # Copy rather than alias: the report must not change when the
            # serialised form is post-processed.
            "empty_cells": [dict(cell) for cell in self.empty_cells],
            "gap_concepts_count": len(self.gap_concepts),
            "domain_counts": dict(self.domain_counts),
            "entity_count": self.entity_count,
        }
|
||
|
||
|
||
def _extract_attributes(entity: EntityMeta) -> set[str]:
    """Build the FCA attribute set for a single entity.

    Produces ``"domain:<domain>"`` and ``"chapter:<source_chapter>"``
    strings, skipping whichever of the two fields is falsy on the entity.
    """
    tagged_fields = (
        ("domain", entity.domain),
        ("chapter", entity.source_chapter),
    )
    return {f"{prefix}:{value}" for prefix, value in tagged_fields if value}
|
||
|
||
|
||
def check_coverage(
    entities: List[EntityMeta],
    extra_attributes: Optional[Dict[str, set[str]]] = None,
) -> CoverageReport:
    """Check coverage completeness using FCA gap analysis.

    Builds a formal context from each entity's ``domain:``/``chapter:``
    attributes (plus any ``extra_attributes``), cross-tabulates domains
    against chapters, and derives the aggregate coverage ratio together
    with per-domain density statistics.

    Args:
        entities: Entity metadata list.
        extra_attributes: Optional ``{slug: {attr, ...}}`` to merge
            with auto-extracted attributes (e.g. VSM mappings).

    Returns:
        :class:`CoverageReport` with gaps and coverage ratio.  An empty
        ``entities`` list yields a default report.  When no domain or no
        chapter attribute is present the cross-table has no cells, and
        ``coverage_ratio`` is reported as 0.0 rather than a vacuous 1.0,
        matching the empty-input case.
    """
    n = len(entities)
    if n == 0:
        return CoverageReport()

    extra = extra_attributes or {}

    # One pass builds both the slug → attributes mapping and the per-domain
    # entity tally.
    # NOTE(review): entities sharing a slug overwrite each other's attribute
    # set here while still being counted individually — confirm slugs are
    # unique upstream.
    entity_attrs: Dict[str, set[str]] = {}
    domain_counts: Dict[str, int] = {}
    for entity in entities:
        attrs = _extract_attributes(entity)
        attrs.update(extra.get(entity.slug, ()))
        entity_attrs[entity.slug] = attrs

        domain_key = entity.domain or "(unspecified)"
        domain_counts[domain_key] = domain_counts.get(domain_key, 0) + 1

    # Build FCA context
    context = FormalContext.from_dict(entity_attrs)

    # Cross-tabulation axes: every distinct domain / chapter attribute seen.
    all_attrs: set[str] = set().union(*entity_attrs.values())
    domains = sorted(a for a in all_attrs if a.startswith("domain:"))
    chapters = sorted(a for a in all_attrs if a.startswith("chapter:"))

    empty: List[dict] = []
    if domains and chapters:
        # find_empty_cells is consumed as an iterable of (domain, chapter)
        # pairs; presumably these are the combinations no entity carries.
        raw_empty = find_empty_cells(context, domains, chapters)
        empty = [{"dimension_a": a, "dimension_b": b} for a, b in raw_empty]

    # FCA gap concepts; only intents of at most four attributes are
    # reported so the list stays manageable.
    gap_dicts = [
        {"intent": sorted(g.intent), "extent_size": g.extent_size}
        for g in find_gap_concepts(context)
        if g.intent_size <= 4
    ]

    # Coverage ratio: populated cells / total possible cells.  With no
    # domains or no chapters there are no cells at all, so report 0.0.
    total_cells = len(domains) * len(chapters)
    ratio = (total_cells - len(empty)) / total_cells if total_cells else 0.0

    # Per-domain density: fraction of chapters that contain this domain.
    n_chapters = len(chapters)
    domain_densities: Dict[str, float] = {}
    if n_chapters > 0:
        empty_pairs = {(cell["dimension_a"], cell["dimension_b"]) for cell in empty}
        for domain in domains:
            populated_for_domain = sum(
                1 for chapter in chapters if (domain, chapter) not in empty_pairs
            )
            domain_densities[domain.removeprefix("domain:")] = (
                populated_for_domain / n_chapters
            )

    density_values = list(domain_densities.values())
    # statistics.stdev needs at least two data points.
    density_std = statistics.stdev(density_values) if len(density_values) >= 2 else 0.0
    # A domain is cross-cutting when it appears in more than half the chapters.
    cross_cutting_ratio = (
        sum(1 for v in density_values if v > 0.5) / len(density_values)
        if density_values
        else 0.0
    )

    return CoverageReport(
        coverage_ratio=ratio,
        domain_densities=domain_densities,
        density_std=round(density_std, 6),
        cross_cutting_ratio=round(cross_cutting_ratio, 4),
        empty_cells=empty,
        gap_concepts=gap_dicts,
        domain_counts=domain_counts,
        entity_count=n,
    )
|