feat(infospace): add collection-level quality checks C1–C5 (S2.4)
Five concern checks: Redundancy (embedding/word overlap), Coverage (FCA gap analysis), Coherence (graph connectivity), Consistency (cycle detection), Granularity (Shannon entropy). Orchestrator runs all or selected checks, CLI `markitect infospace check` command added. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
111
markitect/infospace/checks/coverage.py
Normal file
111
markitect/infospace/checks/coverage.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""
|
||||
C2 — Coverage completeness.
|
||||
|
||||
Uses FCA and cross-tabulation to detect structural coverage gaps:
|
||||
attribute combinations (domain × source chapter; extra attributes such as VSM mappings feed the FCA gap analysis) with no entities.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from markitect.infospace.models import EntityMeta
|
||||
from markitect.analysis.fca import FormalContext, find_empty_cells, find_gap_concepts
|
||||
|
||||
|
||||
@dataclass
class CoverageReport:
    """Outcome of the C2 coverage check for one entity collection."""

    # Fraction of cells counted as populated.
    coverage_ratio: float = 0.0
    # One dict per empty cell.
    empty_cells: List[dict] = field(default_factory=list)
    # One dict per reported gap concept; only their count is serialized.
    gap_concepts: List[dict] = field(default_factory=list)
    # Number of entities per domain label.
    domain_counts: Dict[str, int] = field(default_factory=dict)
    # Number of entities that were analysed.
    entity_count: int = 0

    def to_dict(self) -> dict:
        """Return a plain-dict summary tagged with concern id ``C2``.

        The ratio is rounded to four decimal places and the gap concepts
        are reduced to their count; the other fields are passed through.
        """
        summary: dict = {"concern": "C2"}
        summary["coverage_ratio"] = round(self.coverage_ratio, 4)
        summary["empty_cells"] = self.empty_cells
        summary["gap_concepts_count"] = len(self.gap_concepts)
        summary["domain_counts"] = self.domain_counts
        summary["entity_count"] = self.entity_count
        return summary
|
||||
|
||||
|
||||
def _extract_attributes(entity: EntityMeta) -> set[str]:
    """Build the ``prefix:value`` attribute set for a single entity.

    Yields ``domain:<domain>`` and ``chapter:<source_chapter>`` entries;
    a field whose value is falsy contributes nothing.
    """
    tagged_fields = (
        ("domain", entity.domain),
        ("chapter", entity.source_chapter),
    )
    return {f"{prefix}:{value}" for prefix, value in tagged_fields if value}
|
||||
|
||||
|
||||
def check_coverage(
    entities: List[EntityMeta],
    extra_attributes: Optional[Dict[str, set[str]]] = None,
) -> CoverageReport:
    """Measure how completely the entities populate the domain × chapter grid.

    Each entity contributes the attributes produced by
    ``_extract_attributes`` plus any caller-supplied extras registered
    under its slug. The slug → attributes mapping is turned into a formal
    context, which is then queried for empty domain/chapter cells and for
    gap concepts.

    Args:
        entities: Entity metadata list.
        extra_attributes: Optional ``{slug: {attr, ...}}`` merged into the
            auto-extracted attributes of the matching entity.

    Returns:
        A :class:`CoverageReport`. An empty ``entities`` list yields a
        default report. When no ``domain:`` or no ``chapter:`` attribute
        is present the grid is treated as one populated cell, so the
        ratio is 1.0.
    """
    entity_total = len(entities)
    if not entity_total:
        return CoverageReport()

    extras = extra_attributes if extra_attributes else {}

    # Single pass: collect attributes per slug and tally entities per domain.
    # A repeated slug keeps only the attributes of the last entity seen.
    attrs_by_slug: Dict[str, set[str]] = {}
    per_domain: Dict[str, int] = {}
    for entity in entities:
        merged = _extract_attributes(entity)
        if entity.slug in extras:
            merged.update(extras[entity.slug])
        attrs_by_slug[entity.slug] = merged

        label = entity.domain or "(unspecified)"
        per_domain[label] = per_domain.get(label, 0) + 1

    context = FormalContext.from_dict(attrs_by_slug)

    # Axes of the cross-tabulation, taken from every attribute in use.
    seen = set().union(*attrs_by_slug.values())
    domains = sorted(a for a in seen if a.startswith("domain:"))
    chapters = sorted(a for a in seen if a.startswith("chapter:"))

    if domains and chapters:
        empty_cells = [
            {"dimension_a": row, "dimension_b": col}
            for row, col in find_empty_cells(context, domains, chapters)
        ]
        grid_size = len(domains) * len(chapters)
    else:
        # Without both axes there is no grid; count one populated cell.
        empty_cells = []
        grid_size = 1

    # Only report manageable gaps (intent of at most four attributes).
    reported_gaps = []
    for concept in find_gap_concepts(context):
        if concept.intent_size <= 4:
            reported_gaps.append(
                {
                    "intent": sorted(concept.intent),
                    "extent_size": concept.extent_size,
                }
            )

    # Coverage ratio: populated cells / total possible cells.
    populated_cells = grid_size - len(empty_cells)

    return CoverageReport(
        coverage_ratio=populated_cells / grid_size,
        empty_cells=empty_cells,
        gap_concepts=reported_gaps,
        domain_counts=per_domain,
        entity_count=entity_total,
    )
|
||||
Reference in New Issue
Block a user