feat(infospace): add collection-level quality checks C1–C5 (S2.4)

Five concern checks: Redundancy (embedding/word overlap), Coverage
(FCA gap analysis), Coherence (graph connectivity), Consistency
(cycle detection), Granularity (Shannon entropy). The orchestrator runs
all or selected checks; a CLI `markitect infospace check` command is added.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-19 01:54:22 +01:00
parent 3461d2f354
commit 11585e6968
9 changed files with 1042 additions and 0 deletions

View File

@@ -0,0 +1,413 @@
"""
Tests for collection-level quality checks (S2.4).
Covers all five concerns: Redundancy (C1), Coverage (C2), Coherence (C3),
Consistency (C4), Granularity (C5), and the orchestrator.
"""
from __future__ import annotations
import math
import pytest
from markitect.infospace.models import EntityMeta
from markitect.prompts.dependencies.models import DependencyGraph
# ── helpers ──────────────────────────────────────────────────────────
def _entity(slug: str, domain: str = "", definition: str = "",
            source_chapter: str = "", word_count: int = 0) -> EntityMeta:
    """Build a minimal ``EntityMeta`` for use in tests.

    The title and raw H1 are both derived from *slug* (hyphens replaced by
    spaces, then title-cased).  When *word_count* is falsy, the count falls
    back to the number of whitespace-separated words in *definition*, or 0
    if the definition is empty.  The same count is stored as both
    ``definition_word_count`` and ``total_word_count``.
    """
    if word_count:
        words = word_count
    elif definition:
        words = len(definition.split())
    else:
        words = 0

    display_title = slug.replace("-", " ").title()
    return EntityMeta(
        slug=slug,
        title=display_title,
        h1_raw=display_title,
        definition=definition,
        domain=domain,
        source_chapter=source_chapter,
        definition_word_count=words,
        total_word_count=words,
    )
def _sample_entities() -> list[EntityMeta]:
    """Return five sample entities spread over three domains and chapters.

    Two entities are in ``economics``/``ch01``, two in ``sociology``/``ch02``
    and one in ``philosophy``/``ch03``.
    """
    # (slug, domain, definition, source_chapter)
    rows = [
        ("alpha", "economics", "the first concept in our model", "ch01"),
        ("beta", "economics", "the second concept about markets", "ch01"),
        ("gamma", "sociology", "a social structure framework", "ch02"),
        ("delta", "sociology", "a social dynamic pattern", "ch02"),
        ("epsilon", "philosophy", "an epistemic principle", "ch03"),
    ]
    return [
        _entity(slug, domain=domain, definition=definition, source_chapter=chapter)
        for slug, domain, definition, chapter in rows
    ]
def _linear_graph() -> DependencyGraph:
    """Build the chain A -> B -> C -> D (no cycles)."""
    chain = DependencyGraph()
    for source, target in [("A", "B"), ("B", "C"), ("C", "D")]:
        chain.add_edge(source, target)
    return chain
def _cyclic_graph() -> DependencyGraph:
    """Build the loop A -> B -> C -> A (exactly one cycle)."""
    loop = DependencyGraph()
    for source, target in [("A", "B"), ("B", "C"), ("C", "A")]:
        loop.add_edge(source, target)
    return loop
def _can_import_graph_analysis():
try:
from markitect.analysis.graph import connected_components # noqa: F401
return True
except ImportError:
return False
# ── C1: Redundancy ──────────────────────────────────────────────────
class TestRedundancy:
    """C1 — redundancy detection via word overlap or supplied embeddings."""

    def test_empty_entities(self):
        """An empty collection yields zero entities, ratio 0.0 and no pairs."""
        from markitect.infospace.checks.redundancy import check_redundancy

        result = check_redundancy([])

        assert result.entity_count == 0
        assert result.redundancy_ratio == 0.0
        assert result.similar_pairs == []

    def test_single_entity(self):
        """One entity alone cannot be redundant with anything."""
        from markitect.infospace.checks.redundancy import check_redundancy

        lone = _entity("a", definition="hello world")
        result = check_redundancy([lone])

        assert result.entity_count == 1
        assert result.redundancy_ratio == 0.0

    def test_no_overlap_word_fallback(self):
        """Definitions sharing no words produce no similar pairs."""
        from markitect.infospace.checks.redundancy import check_redundancy

        definitions = {"a": "apple banana cherry", "b": "delta epsilon zeta"}
        population = [_entity(slug, definition=text) for slug, text in definitions.items()]

        result = check_redundancy(population, threshold=0.5)

        assert result.similar_pairs == []
        assert result.redundancy_ratio == 0.0

    def test_high_overlap_word_fallback(self):
        """Definitions differing in one word are flagged as a word-overlap pair."""
        from markitect.infospace.checks.redundancy import check_redundancy

        definitions = {"a": "the quick brown fox", "b": "the quick brown dog"}
        population = [_entity(slug, definition=text) for slug, text in definitions.items()]

        result = check_redundancy(population, threshold=0.5)

        assert len(result.similar_pairs) == 1
        pair = result.similar_pairs[0]
        assert pair["method"] == "word_overlap"
        assert pair["entity_a"] == "a"
        assert pair["entity_b"] == "b"
        assert result.redundancy_ratio == 1.0  # both entities involved

    def test_embedding_based(self):
        """When embeddings are supplied, pairs are reported with method 'embedding'."""
        from markitect.infospace.checks.redundancy import check_redundancy

        population = [
            _entity("a", definition="x"),
            _entity("b", definition="y"),
            _entity("c", definition="z"),
        ]
        # Vectors for a and b point almost the same way; c is orthogonal to a.
        vectors = {
            "a": [1.0, 0.0, 0.0],
            "b": [0.99, 0.1, 0.0],
            "c": [0.0, 0.0, 1.0],
        }

        result = check_redundancy(population, embeddings=vectors, threshold=0.9)

        assert len(result.similar_pairs) >= 1
        first_pair = result.similar_pairs[0]
        assert first_pair["method"] == "embedding"
        assert result.redundancy_ratio > 0.0

    def test_to_dict(self):
        """The serialised report carries the C1 concern tag and its fields."""
        from markitect.infospace.checks.redundancy import RedundancyReport

        payload = RedundancyReport(
            similar_pairs=[], redundancy_ratio=0.25, entity_count=10
        ).to_dict()

        assert payload["concern"] == "C1"
        assert payload["redundancy_ratio"] == 0.25
        assert payload["entity_count"] == 10
# ── C2: Coverage ────────────────────────────────────────────────────
class TestCoverage:
    """C2 — coverage of the domain × chapter grid."""

    def test_empty_entities(self):
        """An empty collection yields zero entities and a coverage ratio of 0.0."""
        from markitect.infospace.checks.coverage import check_coverage

        result = check_coverage([])

        assert result.entity_count == 0
        assert result.coverage_ratio == 0.0

    def test_full_coverage(self):
        """All domain×chapter cells are populated."""
        from markitect.infospace.checks.coverage import check_coverage

        # (slug, domain, chapter) — every combination of d1/d2 × ch1/ch2.
        grid = [
            ("a", "d1", "ch1"),
            ("b", "d2", "ch1"),
            ("c", "d1", "ch2"),
            ("d", "d2", "ch2"),
        ]
        population = [
            _entity(slug, domain=domain, source_chapter=chapter)
            for slug, domain, chapter in grid
        ]

        result = check_coverage(population)

        assert result.coverage_ratio == 1.0
        assert result.empty_cells == []

    def test_partial_coverage(self):
        """One cell is missing → coverage < 1.0."""
        from markitect.infospace.checks.coverage import check_coverage

        # (slug, domain, chapter) — the d2×ch2 combination is left out.
        grid = [
            ("a", "d1", "ch1"),
            ("b", "d2", "ch1"),
            ("c", "d1", "ch2"),
        ]
        population = [
            _entity(slug, domain=domain, source_chapter=chapter)
            for slug, domain, chapter in grid
        ]

        result = check_coverage(population)

        assert result.coverage_ratio < 1.0
        assert len(result.empty_cells) == 1
        gap = result.empty_cells[0]
        assert gap["dimension_a"] == "domain:d2"
        assert gap["dimension_b"] == "chapter:ch2"

    def test_domain_counts(self):
        """Per-domain counts match the sample fixture (2 / 2 / 1)."""
        from markitect.infospace.checks.coverage import check_coverage

        result = check_coverage(_sample_entities())

        assert result.domain_counts["economics"] == 2
        assert result.domain_counts["sociology"] == 2
        assert result.domain_counts["philosophy"] == 1

    def test_to_dict(self):
        """The serialised report carries the C2 concern tag and the ratio."""
        from markitect.infospace.checks.coverage import CoverageReport

        payload = CoverageReport(coverage_ratio=0.75, entity_count=8).to_dict()

        assert payload["concern"] == "C2"
        assert payload["coverage_ratio"] == 0.75

    def test_extra_attributes(self):
        """Passing ``extra_attributes`` is accepted and the entity is counted."""
        from markitect.infospace.checks.coverage import check_coverage

        population = [_entity("a", domain="d1", source_chapter="ch1")]
        extras = {"a": {"vsm:production"}}

        result = check_coverage(population, extra_attributes=extras)

        assert result.entity_count == 1
# ── C3: Coherence ───────────────────────────────────────────────────
class TestCoherence:
    """C3 — coherence report derived from the dependency graph."""

    def test_no_graph(self):
        """Without a graph, zero components are reported and the count is echoed."""
        from markitect.infospace.checks.coherence import check_coherence

        result = check_coherence(graph=None, entity_count=5)

        assert result.connected_components == 0
        assert result.entity_count == 5

    def test_empty_graph(self):
        """A graph with no edges reports zero connected components."""
        from markitect.infospace.checks.coherence import check_coherence

        result = check_coherence(graph=DependencyGraph(), entity_count=0)

        assert result.connected_components == 0

    def test_to_dict(self):
        """The serialised report carries the C3 concern tag and its fields."""
        from markitect.infospace.checks.coherence import CoherenceReport

        payload = CoherenceReport(
            connected_components=2, modularity=0.3456, entity_count=10
        ).to_dict()

        assert payload["concern"] == "C3"
        assert payload["modularity"] == 0.3456
        assert payload["connected_components"] == 2

    # NOTE(review): the guard probes whether markitect.analysis.graph imports;
    # the reason text presumes a failure there means networkx is missing — confirm.
    @pytest.mark.skipif(
        not _can_import_graph_analysis(),
        reason="networkx not available",
    )
    def test_with_graph(self):
        """A linear four-node graph yields at least one connected component."""
        from markitect.infospace.checks.coherence import check_coherence

        result = check_coherence(graph=_linear_graph(), entity_count=4)

        assert result.connected_components >= 1
        assert result.entity_count == 4
# ── C4: Consistency ─────────────────────────────────────────────────
class TestConsistency:
    """C4 — consistency check based on cycles in the dependency graph."""

    def test_no_graph(self):
        """Without a graph no cycles are reported; all entities are counted."""
        from markitect.infospace.checks.consistency import check_consistency

        result = check_consistency(_sample_entities())

        assert result.cycle_count == 0
        assert result.entity_count == 5

    def test_acyclic_graph(self):
        """A linear chain contains no cycles."""
        from markitect.infospace.checks.consistency import check_consistency

        population = _sample_entities()
        chain = _linear_graph()

        result = check_consistency(population, graph=chain)

        assert result.cycle_count == 0

    def test_cyclic_graph(self):
        """A three-node loop is reported as at least one cycle."""
        from markitect.infospace.checks.consistency import check_consistency

        population = _sample_entities()
        loop = _cyclic_graph()

        result = check_consistency(population, graph=loop)

        assert result.cycle_count >= 1
        assert len(result.cycles) >= 1

    def test_to_dict(self):
        """The serialised report carries the C4 concern tag and the cycle count."""
        from markitect.infospace.checks.consistency import ConsistencyReport

        payload = ConsistencyReport(
            cycles=[["A", "B", "A"]], cycle_count=1, entity_count=5
        ).to_dict()

        assert payload["concern"] == "C4"
        assert payload["cycle_count"] == 1
# ── C5: Granularity ─────────────────────────────────────────────────
class TestGranularity:
    """C5 — granularity: domain entropy and word-count statistics.

    Computed floating-point statistics are compared with ``pytest.approx``
    so the tests do not depend on bit-exact arithmetic in the implementation.
    """

    def test_empty_entities(self):
        """An empty collection yields zero entities and zero entropy."""
        from markitect.infospace.checks.granularity import check_granularity

        report = check_granularity([])

        assert report.entity_count == 0
        assert report.domain_entropy == 0.0

    def test_single_domain(self):
        """All entities in one domain give zero entropy."""
        from markitect.infospace.checks.granularity import check_granularity

        entities = [
            _entity("a", domain="d1", word_count=10),
            _entity("b", domain="d1", word_count=20),
        ]

        report = check_granularity(entities)

        assert report.domain_entropy == 0.0  # single domain = zero entropy
        assert report.entity_count == 2
        assert report.word_count_stats["mean"] == pytest.approx(15.0)

    def test_balanced_domains(self):
        """Two equally populated domains give one bit of entropy."""
        from markitect.infospace.checks.granularity import check_granularity

        entities = [
            _entity("a", domain="d1", word_count=10),
            _entity("b", domain="d2", word_count=10),
        ]

        report = check_granularity(entities)

        assert report.domain_entropy == pytest.approx(1.0)  # log2(2) = 1.0
        assert report.domain_distribution == {"d1": 1, "d2": 1}

    def test_word_count_stats(self):
        """Mean, min, max and std are reported for the word counts 10 and 30."""
        from markitect.infospace.checks.granularity import check_granularity

        entities = [
            _entity("a", domain="d1", word_count=10),
            _entity("b", domain="d1", word_count=30),
        ]

        stats = check_granularity(entities).word_count_stats

        assert stats["mean"] == pytest.approx(20.0)
        assert stats["min"] == pytest.approx(10.0)
        assert stats["max"] == pytest.approx(30.0)
        # The expected 10.0 is the population standard deviation of [10, 30].
        assert stats["std"] == pytest.approx(10.0)

    def test_to_dict(self):
        """The serialised report carries the C5 concern tag and the entropy."""
        from markitect.infospace.checks.granularity import GranularityReport

        d = GranularityReport(domain_entropy=1.5, entity_count=4).to_dict()

        assert d["concern"] == "C5"
        assert d["domain_entropy"] == 1.5

    def test_unspecified_domain(self):
        """An empty domain string is bucketed under '(unspecified)'."""
        from markitect.infospace.checks.granularity import check_granularity

        entities = [_entity("a", domain="", word_count=10)]

        report = check_granularity(entities)

        assert "(unspecified)" in report.domain_distribution
# ── Orchestrator ────────────────────────────────────────────────────
class TestOrchestrator:
    """Orchestrator — running all or a selected subset of the C1–C5 checks."""

    def test_run_all_default(self):
        """With no ``checks`` argument every concern report is populated."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        report = run_all_checks(entities)

        assert report.redundancy is not None
        assert report.coverage is not None
        assert report.coherence is not None
        assert report.consistency is not None
        assert report.granularity is not None

    def test_run_selected_checks(self):
        """Only the requested checks are populated; the rest stay ``None``."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        report = run_all_checks(entities, checks=["redundancy", "granularity"])

        assert report.redundancy is not None
        assert report.granularity is not None
        assert report.coverage is None
        assert report.coherence is None
        assert report.consistency is None

    def test_to_dict(self):
        """The serialised report contains only the checks that were run."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        report = run_all_checks(entities, checks=["granularity"])
        d = report.to_dict()

        assert "granularity" in d
        assert "redundancy" not in d

    def test_metrics(self):
        """``metrics()`` exposes float values for the checks that were run."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        report = run_all_checks(entities, checks=["redundancy", "granularity"])
        m = report.metrics()

        assert "redundancy_ratio" in m
        assert "granularity_entropy" in m
        assert isinstance(m["redundancy_ratio"], float)
        assert isinstance(m["granularity_entropy"], float)

    def test_metrics_empty_report(self):
        """A report with no checks run has no metrics."""
        from markitect.infospace.checks.orchestrator import CheckReport

        report = CheckReport()

        assert report.metrics() == {}

    def test_run_all_with_graph(self):
        """A linear graph passed through the orchestrator reports no cycles."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        g = _linear_graph()
        report = run_all_checks(entities, graph=g, checks=["consistency"])

        assert report.consistency is not None
        assert report.consistency.cycle_count == 0

    def test_run_all_with_cyclic_graph(self):
        """A cyclic graph passed through the orchestrator reports a cycle."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        g = _cyclic_graph()
        report = run_all_checks(entities, graph=g, checks=["consistency"])

        # Guard first so a missing report fails as an assertion, not as an
        # AttributeError on None (same pattern as test_run_all_with_graph).
        assert report.consistency is not None
        assert report.consistency.cycle_count >= 1
# ── Shannon entropy helper ──────────────────────────────────────────
class TestShannonEntropy:
    """Unit tests for the ``_shannon_entropy`` helper."""

    def test_uniform_distribution(self):
        """Four equally likely categories give two bits of entropy."""
        from markitect.infospace.checks.granularity import _shannon_entropy

        uniform = dict.fromkeys("abcd", 1)

        assert _shannon_entropy(uniform) == pytest.approx(2.0)  # log2(4)

    def test_single_element(self):
        """A single category has zero entropy."""
        from markitect.infospace.checks.granularity import _shannon_entropy

        only_one = {"a": 10}

        assert _shannon_entropy(only_one) == 0.0

    def test_empty(self):
        """An empty distribution has zero entropy."""
        from markitect.infospace.checks.granularity import _shannon_entropy

        nothing = {}

        assert _shannon_entropy(nothing) == 0.0

    def test_skewed(self):
        """A heavily skewed two-way split lies strictly between 0 and 1 bit."""
        from markitect.infospace.checks.granularity import _shannon_entropy

        skewed = {"a": 99, "b": 1}
        bits = _shannon_entropy(skewed)

        assert 0.0 < bits < 1.0