feat(infospace): add collection-level quality checks C1–C5 (S2.4)

Five concern checks: Redundancy (embedding/word overlap), Coverage
(FCA gap analysis), Coherence (graph connectivity), Consistency
(cycle detection), Granularity (Shannon entropy). Orchestrator runs
all or selected checks, CLI `markitect infospace check` command added.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-19 01:54:22 +01:00
parent 3461d2f354
commit 11585e6968
9 changed files with 1042 additions and 0 deletions

View File

@@ -0,0 +1,111 @@
"""
C2 — Coverage completeness.
Uses FCA and cross-tabulation to detect structural coverage gaps:
attribute combinations (domain × source chapter) with no entities.
Extra attributes (e.g. VSM mappings) are included in the FCA context only.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
from markitect.infospace.models import EntityMeta
from markitect.analysis.fca import FormalContext, find_empty_cells, find_gap_concepts
@dataclass
class CoverageReport:
    """Outcome of the C2 coverage-completeness check.

    Every field has a default, so ``CoverageReport()`` is a valid
    "nothing analysed" report.
    """

    # Populated cross-tab cells divided by all possible cells.
    coverage_ratio: float = 0.0
    # One dict per empty cross-tab cell ("dimension_a" / "dimension_b" keys).
    empty_cells: List[dict] = field(default_factory=list)
    # One dict per reported FCA gap concept ("intent" / "extent_size" keys).
    gap_concepts: List[dict] = field(default_factory=list)
    # Number of entities per domain label.
    domain_counts: Dict[str, int] = field(default_factory=dict)
    # Number of entities that were analysed.
    entity_count: int = 0

    def to_dict(self) -> dict:
        """Return a plain-dict summary of the report.

        Gap concepts are summarised by their count only; the ratio is
        rounded to four decimal places.
        """
        return dict(
            concern="C2",
            coverage_ratio=round(self.coverage_ratio, 4),
            empty_cells=self.empty_cells,
            gap_concepts_count=len(self.gap_concepts),
            domain_counts=self.domain_counts,
            entity_count=self.entity_count,
        )
def _extract_attributes(entity: EntityMeta) -> set[str]:
"""Extract FCA attributes from an entity."""
attrs: set[str] = set()
if entity.domain:
attrs.add(f"domain:{entity.domain}")
if entity.source_chapter:
attrs.add(f"chapter:{entity.source_chapter}")
return attrs
def check_coverage(
    entities: List[EntityMeta],
    extra_attributes: Optional[Dict[str, set[str]]] = None,
) -> CoverageReport:
    """Run the C2 coverage check over a collection of entities.

    Each entity is turned into a set of attributes, a formal context is
    built from them, and two kinds of gap are collected: empty cells of
    the ``domain:`` x ``chapter:`` cross-tabulation, and FCA gap concepts
    whose intent has at most four attributes.

    Args:
        entities: Entity metadata list.
        extra_attributes: Optional ``{slug: {attr, ...}}`` merged into the
            auto-extracted attributes of the entity with that slug
            (e.g. VSM mappings). These reach the formal context; only
            attributes prefixed ``domain:`` or ``chapter:`` take part in
            the cross-tabulation.

    Returns:
        :class:`CoverageReport` with gaps and coverage ratio. An empty
        ``entities`` list yields a default report. When no cross-tab can
        be formed (no domain or no chapter attribute present) the ratio
        is 1.0, because the grid is treated as a single populated cell.
    """
    entity_total = len(entities)
    if entity_total == 0:
        return CoverageReport()

    extras = extra_attributes or {}

    # Single pass: per-slug attribute sets and the per-domain tally.
    attrs_by_slug: Dict[str, set[str]] = {}
    per_domain: Dict[str, int] = {}
    for entity in entities:
        merged = _extract_attributes(entity)
        merged.update(extras.get(entity.slug, ()))
        attrs_by_slug[entity.slug] = merged

        label = entity.domain or "(unspecified)"
        per_domain[label] = per_domain.get(label, 0) + 1

    context = FormalContext.from_dict(attrs_by_slug)

    # The two cross-tab axes are every distinct domain:/chapter: attribute.
    seen_attrs = set().union(*attrs_by_slug.values())
    domain_axis = sorted(a for a in seen_attrs if a.startswith("domain:"))
    chapter_axis = sorted(a for a in seen_attrs if a.startswith("chapter:"))

    if domain_axis and chapter_axis:
        cell_total = len(domain_axis) * len(chapter_axis)
        unfilled = [
            {"dimension_a": row, "dimension_b": col}
            for row, col in find_empty_cells(context, domain_axis, chapter_axis)
        ]
    else:
        # No grid to inspect: count it as one cell with nothing missing.
        cell_total = 1
        unfilled = []

    # Only report manageable gaps (intent of at most four attributes).
    reported_gaps = []
    for concept in find_gap_concepts(context):
        if concept.intent_size <= 4:
            reported_gaps.append(
                {"intent": sorted(concept.intent), "extent_size": concept.extent_size}
            )

    # cell_total is always >= 1 here, so the division is safe.
    filled_share = (cell_total - len(unfilled)) / cell_total

    return CoverageReport(
        coverage_ratio=filled_share,
        empty_cells=unfilled,
        gap_concepts=reported_gaps,
        domain_counts=per_domain,
        entity_count=entity_total,
    )