Five concern checks: Redundancy (embedding/word overlap), Coverage (FCA gap analysis), Coherence (graph connectivity), Consistency (cycle detection), Granularity (Shannon entropy). Orchestrator runs all or selected checks, CLI `markitect infospace check` command added. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
414 lines
16 KiB
Python
414 lines
16 KiB
Python
"""
|
||
Tests for collection-level quality checks (S2.4).
|
||
|
||
Covers all five concerns: Redundancy (C1), Coverage (C2), Coherence (C3),
|
||
Consistency (C4), Granularity (C5), and the orchestrator.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import math
|
||
|
||
import pytest
|
||
|
||
from markitect.infospace.models import EntityMeta
|
||
from markitect.prompts.dependencies.models import DependencyGraph
|
||
|
||
|
||
# ── helpers ──────────────────────────────────────────────────────────
|
||
|
||
|
||
def _entity(slug: str, domain: str = "", definition: str = "",
            source_chapter: str = "", word_count: int = 0) -> EntityMeta:
    """Build a minimal ``EntityMeta`` fixture for the check tests.

    Args:
        slug: Entity identifier; the title and raw H1 are both derived from
            it by replacing hyphens with spaces and title-casing.
        domain: Domain label, passed through unchanged.
        definition: Definition text, passed through unchanged.
        source_chapter: Source chapter label, passed through unchanged.
        word_count: Explicit word count. When falsy (the default ``0``), the
            count falls back to the number of whitespace-separated words in
            ``definition`` (``0`` when the definition is empty).

    Returns:
        An ``EntityMeta`` whose ``definition_word_count`` and
        ``total_word_count`` are both set to the resolved word count.
    """
    if word_count:
        wc = word_count
    else:
        # Derive the count from the definition so callers only need to pass
        # one of the two.
        wc = len(definition.split()) if definition else 0

    # Title and raw H1 are the same derived string; compute it once.
    display_title = slug.replace("-", " ").title()
    return EntityMeta(
        slug=slug,
        title=display_title,
        h1_raw=display_title,
        definition=definition,
        domain=domain,
        source_chapter=source_chapter,
        definition_word_count=wc,
        total_word_count=wc,
    )
|
||
|
||
|
||
def _sample_entities() -> list[EntityMeta]:
    """Return five fixture entities spread over three domains and chapters.

    Two entities are in ``economics``/``ch01``, two in ``sociology``/``ch02``
    and one in ``philosophy``/``ch03``.
    """
    return [
        _entity("alpha", domain="economics", definition="the first concept in our model", source_chapter="ch01"),
        _entity("beta", domain="economics", definition="the second concept about markets", source_chapter="ch01"),
        _entity("gamma", domain="sociology", definition="a social structure framework", source_chapter="ch02"),
        _entity("delta", domain="sociology", definition="a social dynamic pattern", source_chapter="ch02"),
        _entity("epsilon", domain="philosophy", definition="an epistemic principle", source_chapter="ch03"),
    ]
|
||
|
||
|
||
def _linear_graph() -> DependencyGraph:
    """A -> B -> C -> D."""
    g = DependencyGraph()
    # Edges are added in chain order, one call per edge.
    for source, target in (("A", "B"), ("B", "C"), ("C", "D")):
        g.add_edge(source, target)
    return g
|
||
|
||
|
||
def _cyclic_graph() -> DependencyGraph:
    """A -> B -> C -> A (one cycle)."""
    g = DependencyGraph()
    # The final edge points back at the first node, closing the cycle.
    for source, target in (("A", "B"), ("B", "C"), ("C", "A")):
        g.add_edge(source, target)
    return g
|
||
|
||
|
||
def _can_import_graph_analysis():
|
||
try:
|
||
from markitect.analysis.graph import connected_components # noqa: F401
|
||
return True
|
||
except ImportError:
|
||
return False
|
||
|
||
|
||
# ── C1: Redundancy ──────────────────────────────────────────────────
|
||
|
||
|
||
class TestRedundancy:
    """Tests for the C1 redundancy check and its report."""

    def test_empty_entities(self):
        """An empty collection yields a zeroed report with no pairs."""
        from markitect.infospace.checks.redundancy import check_redundancy

        report = check_redundancy([])
        assert report.entity_count == 0
        assert report.redundancy_ratio == 0.0
        assert report.similar_pairs == []

    def test_single_entity(self):
        """A single entity is counted and has a redundancy ratio of zero."""
        from markitect.infospace.checks.redundancy import check_redundancy

        report = check_redundancy([_entity("a", definition="hello world")])
        assert report.entity_count == 1
        assert report.redundancy_ratio == 0.0

    def test_no_overlap_word_fallback(self):
        """Definitions sharing no words produce no similar pairs."""
        from markitect.infospace.checks.redundancy import check_redundancy

        entities = [
            _entity("a", definition="apple banana cherry"),
            _entity("b", definition="delta epsilon zeta"),
        ]
        report = check_redundancy(entities, threshold=0.5)
        assert report.similar_pairs == []
        assert report.redundancy_ratio == 0.0

    def test_high_overlap_word_fallback(self):
        """Definitions sharing most words are reported as one word-overlap pair."""
        from markitect.infospace.checks.redundancy import check_redundancy

        entities = [
            _entity("a", definition="the quick brown fox"),
            _entity("b", definition="the quick brown dog"),
        ]
        report = check_redundancy(entities, threshold=0.5)
        assert len(report.similar_pairs) == 1
        pair = report.similar_pairs[0]
        assert pair["method"] == "word_overlap"
        assert pair["entity_a"] == "a"
        assert pair["entity_b"] == "b"
        assert report.redundancy_ratio == 1.0  # both entities involved

    def test_embedding_based(self):
        """Supplied embeddings are used and near-parallel vectors are flagged."""
        from markitect.infospace.checks.redundancy import check_redundancy

        entities = [
            _entity("a", definition="x"),
            _entity("b", definition="y"),
            _entity("c", definition="z"),
        ]
        # a and b are very similar; c is different
        embeddings = {
            "a": [1.0, 0.0, 0.0],
            "b": [0.99, 0.1, 0.0],
            "c": [0.0, 0.0, 1.0],
        }
        report = check_redundancy(entities, embeddings=embeddings, threshold=0.9)
        assert len(report.similar_pairs) >= 1
        assert report.similar_pairs[0]["method"] == "embedding"
        assert report.redundancy_ratio > 0.0

    def test_to_dict(self):
        """``to_dict`` tags the report as C1 and carries its fields through."""
        from markitect.infospace.checks.redundancy import RedundancyReport

        r = RedundancyReport(similar_pairs=[], redundancy_ratio=0.25, entity_count=10)
        d = r.to_dict()
        assert d["concern"] == "C1"
        assert d["redundancy_ratio"] == 0.25
        assert d["entity_count"] == 10
|
||
|
||
|
||
# ── C2: Coverage ────────────────────────────────────────────────────
|
||
|
||
|
||
class TestCoverage:
    """Tests for the C2 coverage check and its report."""

    def test_empty_entities(self):
        """An empty collection yields zero entities and zero coverage."""
        from markitect.infospace.checks.coverage import check_coverage

        report = check_coverage([])
        assert report.entity_count == 0
        assert report.coverage_ratio == 0.0

    def test_full_coverage(self):
        """All domain×chapter cells are populated."""
        from markitect.infospace.checks.coverage import check_coverage

        entities = [
            _entity("a", domain="d1", source_chapter="ch1"),
            _entity("b", domain="d2", source_chapter="ch1"),
            _entity("c", domain="d1", source_chapter="ch2"),
            _entity("d", domain="d2", source_chapter="ch2"),
        ]
        report = check_coverage(entities)
        assert report.coverage_ratio == 1.0
        assert report.empty_cells == []

    def test_partial_coverage(self):
        """One cell is missing → coverage < 1.0."""
        from markitect.infospace.checks.coverage import check_coverage

        entities = [
            _entity("a", domain="d1", source_chapter="ch1"),
            _entity("b", domain="d2", source_chapter="ch1"),
            _entity("c", domain="d1", source_chapter="ch2"),
            # Missing: d2×ch2
        ]
        report = check_coverage(entities)
        assert report.coverage_ratio < 1.0
        assert len(report.empty_cells) == 1
        gap = report.empty_cells[0]
        assert gap["dimension_a"] == "domain:d2"
        assert gap["dimension_b"] == "chapter:ch2"

    def test_domain_counts(self):
        """Per-domain counts match the sample fixture (2 / 2 / 1)."""
        from markitect.infospace.checks.coverage import check_coverage

        entities = _sample_entities()
        report = check_coverage(entities)
        assert report.domain_counts["economics"] == 2
        assert report.domain_counts["sociology"] == 2
        assert report.domain_counts["philosophy"] == 1

    def test_to_dict(self):
        """``to_dict`` tags the report as C2 and carries the ratio through."""
        from markitect.infospace.checks.coverage import CoverageReport

        r = CoverageReport(coverage_ratio=0.75, entity_count=8)
        d = r.to_dict()
        assert d["concern"] == "C2"
        assert d["coverage_ratio"] == 0.75

    def test_extra_attributes(self):
        """Passing ``extra_attributes`` is accepted and the entity is counted."""
        from markitect.infospace.checks.coverage import check_coverage

        entities = [
            _entity("a", domain="d1", source_chapter="ch1"),
        ]
        extra = {"a": {"vsm:production"}}
        report = check_coverage(entities, extra_attributes=extra)
        assert report.entity_count == 1
|
||
|
||
|
||
# ── C3: Coherence ───────────────────────────────────────────────────
|
||
|
||
|
||
class TestCoherence:
    """Tests for the C3 coherence check and its report."""

    def test_no_graph(self):
        """Without a graph, zero components are reported and the count is kept."""
        from markitect.infospace.checks.coherence import check_coherence

        report = check_coherence(graph=None, entity_count=5)
        assert report.connected_components == 0
        assert report.entity_count == 5

    def test_empty_graph(self):
        """A graph with no edges reports zero connected components."""
        from markitect.infospace.checks.coherence import check_coherence

        g = DependencyGraph()
        report = check_coherence(graph=g, entity_count=0)
        assert report.connected_components == 0

    def test_to_dict(self):
        """``to_dict`` tags the report as C3 and carries its fields through."""
        from markitect.infospace.checks.coherence import CoherenceReport

        r = CoherenceReport(connected_components=2, modularity=0.3456, entity_count=10)
        d = r.to_dict()
        assert d["concern"] == "C3"
        assert d["modularity"] == 0.3456
        assert d["connected_components"] == 2

    # Skipped when the graph-analysis module cannot be imported; the guard is
    # evaluated once, at class-definition time.
    @pytest.mark.skipif(
        not _can_import_graph_analysis(),
        reason="networkx not available",
    )
    def test_with_graph(self):
        """A linear four-node graph reports at least one component."""
        from markitect.infospace.checks.coherence import check_coherence

        g = _linear_graph()
        report = check_coherence(graph=g, entity_count=4)
        assert report.connected_components >= 1
        assert report.entity_count == 4
|
||
|
||
|
||
# ── C4: Consistency ─────────────────────────────────────────────────
|
||
|
||
|
||
class TestConsistency:
    """Tests for the C4 consistency (cycle detection) check and its report."""

    def test_no_graph(self):
        """Without a graph, no cycles are reported and entities are counted."""
        from markitect.infospace.checks.consistency import check_consistency

        entities = _sample_entities()
        report = check_consistency(entities)
        assert report.cycle_count == 0
        assert report.entity_count == 5

    def test_acyclic_graph(self):
        """A linear graph contains no cycles."""
        from markitect.infospace.checks.consistency import check_consistency

        entities = _sample_entities()
        g = _linear_graph()
        report = check_consistency(entities, graph=g)
        assert report.cycle_count == 0

    def test_cyclic_graph(self):
        """A graph with an A -> B -> C -> A loop reports at least one cycle."""
        from markitect.infospace.checks.consistency import check_consistency

        entities = _sample_entities()
        g = _cyclic_graph()
        report = check_consistency(entities, graph=g)
        assert report.cycle_count >= 1
        assert len(report.cycles) >= 1

    def test_to_dict(self):
        """``to_dict`` tags the report as C4 and carries the cycle count."""
        from markitect.infospace.checks.consistency import ConsistencyReport

        r = ConsistencyReport(cycles=[["A", "B", "A"]], cycle_count=1, entity_count=5)
        d = r.to_dict()
        assert d["concern"] == "C4"
        assert d["cycle_count"] == 1
|
||
|
||
|
||
# ── C5: Granularity ─────────────────────────────────────────────────
|
||
|
||
|
||
class TestGranularity:
    """Tests for the C5 granularity check and its report."""

    def test_empty_entities(self):
        """An empty collection yields zero entities and zero entropy."""
        from markitect.infospace.checks.granularity import check_granularity

        report = check_granularity([])
        assert report.entity_count == 0
        assert report.domain_entropy == 0.0

    def test_single_domain(self):
        """All entities in one domain give zero entropy; mean is still computed."""
        from markitect.infospace.checks.granularity import check_granularity

        entities = [
            _entity("a", domain="d1", word_count=10),
            _entity("b", domain="d1", word_count=20),
        ]
        report = check_granularity(entities)
        assert report.domain_entropy == 0.0  # single domain = zero entropy
        assert report.entity_count == 2
        # Computed float: compare with a tolerance, as the entropy tests do.
        assert report.word_count_stats["mean"] == pytest.approx(15.0)

    def test_balanced_domains(self):
        """Two equally sized domains give one bit of entropy."""
        from markitect.infospace.checks.granularity import check_granularity

        entities = [
            _entity("a", domain="d1", word_count=10),
            _entity("b", domain="d2", word_count=10),
        ]
        report = check_granularity(entities)
        assert report.domain_entropy == pytest.approx(1.0)  # log2(2) = 1.0
        assert report.domain_distribution == {"d1": 1, "d2": 1}

    def test_word_count_stats(self):
        """Mean, min, max and std are reported for the word counts."""
        from markitect.infospace.checks.granularity import check_granularity

        entities = [
            _entity("a", domain="d1", word_count=10),
            _entity("b", domain="d1", word_count=30),
        ]
        report = check_granularity(entities)
        stats = report.word_count_stats
        # Mean and std are derived values, so they are compared with a
        # tolerance; min and max are taken directly from the inputs.
        assert stats["mean"] == pytest.approx(20.0)
        assert stats["min"] == 10.0
        assert stats["max"] == 30.0
        assert stats["std"] == pytest.approx(10.0)

    def test_to_dict(self):
        """``to_dict`` tags the report as C5 and carries the entropy through."""
        from markitect.infospace.checks.granularity import GranularityReport

        r = GranularityReport(domain_entropy=1.5, entity_count=4)
        d = r.to_dict()
        assert d["concern"] == "C5"
        assert d["domain_entropy"] == 1.5

    def test_unspecified_domain(self):
        """An empty domain is bucketed under the ``(unspecified)`` key."""
        from markitect.infospace.checks.granularity import check_granularity

        entities = [_entity("a", domain="", word_count=10)]
        report = check_granularity(entities)
        assert "(unspecified)" in report.domain_distribution
|
||
|
||
|
||
# ── Orchestrator ────────────────────────────────────────────────────
|
||
|
||
|
||
class TestOrchestrator:
    """Tests for ``run_all_checks`` and the combined ``CheckReport``."""

    def test_run_all_default(self):
        """With no selection, every one of the five checks produces a report."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        report = run_all_checks(entities)
        assert report.redundancy is not None
        assert report.coverage is not None
        assert report.coherence is not None
        assert report.consistency is not None
        assert report.granularity is not None

    def test_run_selected_checks(self):
        """Only the requested checks run; the others stay ``None``."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        report = run_all_checks(entities, checks=["redundancy", "granularity"])
        assert report.redundancy is not None
        assert report.granularity is not None
        assert report.coverage is None
        assert report.coherence is None
        assert report.consistency is None

    def test_to_dict(self):
        """``to_dict`` includes only the checks that were run."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        report = run_all_checks(entities, checks=["granularity"])
        d = report.to_dict()
        assert "granularity" in d
        assert "redundancy" not in d

    def test_metrics(self):
        """``metrics`` exposes float values for the checks that were run."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        report = run_all_checks(entities, checks=["redundancy", "granularity"])
        m = report.metrics()
        assert "redundancy_ratio" in m
        assert "granularity_entropy" in m
        assert isinstance(m["redundancy_ratio"], float)
        assert isinstance(m["granularity_entropy"], float)

    def test_metrics_empty_report(self):
        """A default-constructed report has no metrics."""
        from markitect.infospace.checks.orchestrator import CheckReport

        report = CheckReport()
        assert report.metrics() == {}

    def test_run_all_with_graph(self):
        """A linear graph passed through the orchestrator reports no cycles."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        g = _linear_graph()
        report = run_all_checks(entities, graph=g, checks=["consistency"])
        assert report.consistency is not None
        assert report.consistency.cycle_count == 0

    def test_run_all_with_cyclic_graph(self):
        """A cyclic graph passed through the orchestrator reports a cycle."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        g = _cyclic_graph()
        report = run_all_checks(entities, graph=g, checks=["consistency"])
        # Guard first so a missing report fails as an assertion rather than
        # as an AttributeError on ``None``.
        assert report.consistency is not None
        assert report.consistency.cycle_count >= 1
|
||
|
||
|
||
# ── Shannon entropy helper ──────────────────────────────────────────
|
||
|
||
|
||
class TestShannonEntropy:
    """Tests for the ``_shannon_entropy`` helper in the granularity check."""

    def test_uniform_distribution(self):
        """Four equally likely outcomes give two bits of entropy."""
        from markitect.infospace.checks.granularity import _shannon_entropy

        counts = {"a": 1, "b": 1, "c": 1, "d": 1}
        assert _shannon_entropy(counts) == pytest.approx(2.0)  # log2(4)

    def test_single_element(self):
        """A single outcome carries zero entropy."""
        from markitect.infospace.checks.granularity import _shannon_entropy

        assert _shannon_entropy({"a": 10}) == 0.0

    def test_empty(self):
        """An empty count mapping is treated as zero entropy."""
        from markitect.infospace.checks.granularity import _shannon_entropy

        assert _shannon_entropy({}) == 0.0

    def test_skewed(self):
        """A heavily skewed two-outcome distribution falls strictly between 0 and 1."""
        from markitect.infospace.checks.granularity import _shannon_entropy

        counts = {"a": 99, "b": 1}
        entropy = _shannon_entropy(counts)
        assert 0.0 < entropy < 1.0
|