Files
markitect-main/markitect/infospace/checks/coverage.py
tegwick 11585e6968 feat(infospace): add collection-level quality checks C1–C5 (S2.4)
Five concern checks: Redundancy (embedding/word overlap), Coverage
(FCA gap analysis), Coherence (graph connectivity), Consistency
(cycle detection), Granularity (Shannon entropy). Orchestrator runs
all or selected checks, CLI `markitect infospace check` command added.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 01:54:22 +01:00

112 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
C2 — Coverage completeness.
Uses FCA and cross-tabulation to detect structural coverage gaps:
attribute combinations (domain × chapter, plus any caller-supplied
attributes such as VSM systems) with no entities.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
from markitect.infospace.models import EntityMeta
from markitect.analysis.fca import FormalContext, find_empty_cells, find_gap_concepts
@dataclass
class CoverageReport:
    """Container for the outcome of the C2 coverage check.

    All fields default to "nothing found" values, so a bare
    ``CoverageReport()`` represents an analysis over zero entities.
    """

    # Share of cross-tabulation cells that are populated (0.0 - 1.0).
    coverage_ratio: float = 0.0
    # One dict per cross-tabulation cell that has no entity.
    empty_cells: List[dict] = field(default_factory=list)
    # One dict per reported FCA gap concept.
    gap_concepts: List[dict] = field(default_factory=list)
    # Number of entities seen per domain label.
    domain_counts: Dict[str, int] = field(default_factory=dict)
    # Total number of entities that were analysed.
    entity_count: int = 0

    def to_dict(self) -> dict:
        """Return a plain-dict summary of the report.

        The ratio is rounded to four decimals, and gap concepts are
        reduced to their count rather than listed in full.
        """
        summary: dict = {"concern": "C2"}
        summary["coverage_ratio"] = round(self.coverage_ratio, 4)
        summary["empty_cells"] = self.empty_cells
        summary["gap_concepts_count"] = len(self.gap_concepts)
        summary["domain_counts"] = self.domain_counts
        summary["entity_count"] = self.entity_count
        return summary
def _extract_attributes(entity: EntityMeta) -> set[str]:
    """Derive the FCA attribute labels for a single entity.

    Produces a ``domain:<value>`` label from ``entity.domain`` and a
    ``chapter:<value>`` label from ``entity.source_chapter``. A field
    whose value is falsy (``None``, empty string, ...) yields no label.
    """
    labelled_fields = (
        ("domain", entity.domain),
        ("chapter", entity.source_chapter),
    )
    return {f"{prefix}:{value}" for prefix, value in labelled_fields if value}
def check_coverage(
    entities: List[EntityMeta],
    extra_attributes: Optional[Dict[str, set[str]]] = None,
) -> CoverageReport:
    """Run the C2 coverage-completeness check over a set of entities.

    Each entity is turned into a set of attribute labels, from which a
    formal context is built. Two kinds of gaps are then collected:
    empty cells of the ``domain:`` x ``chapter:`` cross-tabulation, and
    FCA gap concepts whose intent has at most four attributes.

    Args:
        entities: Entity metadata list.
        extra_attributes: Optional ``{slug: {attr, ...}}`` mapping whose
            attributes are merged into the auto-extracted ones for the
            entity with the matching slug (e.g. VSM mappings).

    Returns:
        :class:`CoverageReport` with the empty cells, gap concepts,
        per-domain entity counts and the coverage ratio. An empty
        ``entities`` list yields a default report.
    """
    entity_total = len(entities)
    if not entity_total:
        return CoverageReport()

    extras = extra_attributes or {}

    # Single pass: collect attribute sets per slug and tally domains.
    attrs_by_slug: Dict[str, set[str]] = {}
    per_domain: Dict[str, int] = {}
    for meta in entities:
        labels = _extract_attributes(meta)
        if meta.slug in extras:
            labels.update(extras[meta.slug])
        attrs_by_slug[meta.slug] = labels

        bucket = meta.domain or "(unspecified)"
        per_domain[bucket] = per_domain.get(bucket, 0) + 1

    context = FormalContext.from_dict(attrs_by_slug)

    # Axes of the cross-tabulation: every domain / chapter label in use.
    all_labels: set[str] = set()
    for labels in attrs_by_slug.values():
        all_labels |= set(labels)
    domains = sorted(lbl for lbl in all_labels if lbl.startswith("domain:"))
    chapters = sorted(lbl for lbl in all_labels if lbl.startswith("chapter:"))

    # Without both axes there is no grid; treat it as one populated cell
    # so the ratio below comes out as 1.0.
    empty_cells: List[dict] = []
    cell_total = 1
    if domains and chapters:
        empty_cells = [
            {"dimension_a": row, "dimension_b": col}
            for row, col in find_empty_cells(context, domains, chapters)
        ]
        cell_total = len(domains) * len(chapters)

    # Keep only gap concepts small enough to be actionable.
    gap_summaries: List[dict] = []
    for gap in find_gap_concepts(context):
        if gap.intent_size > 4:
            continue
        gap_summaries.append(
            {"intent": sorted(gap.intent), "extent_size": gap.extent_size}
        )

    # Populated cells / total cells; cell_total is always >= 1 here.
    ratio = (cell_total - len(empty_cells)) / cell_total

    return CoverageReport(
        coverage_ratio=ratio,
        empty_cells=empty_cells,
        gap_concepts=gap_summaries,
        domain_counts=per_domain,
        entity_count=entity_total,
    )