""" Tests for collection-level quality checks (S2.4). Covers all five concerns: Redundancy (C1), Coverage (C2), Coherence (C3), Consistency (C4), Granularity (C5), and the orchestrator. """ from __future__ import annotations import math import pytest from markitect.infospace.models import EntityMeta from markitect.prompts.dependencies.models import DependencyGraph # ── helpers ────────────────────────────────────────────────────────── def _entity(slug: str, domain: str = "", definition: str = "", source_chapter: str = "", word_count: int = 0) -> EntityMeta: wc = word_count if word_count else (len(definition.split()) if definition else 0) return EntityMeta( slug=slug, title=slug.replace("-", " ").title(), h1_raw=slug.replace("-", " ").title(), definition=definition, domain=domain, source_chapter=source_chapter, definition_word_count=wc, total_word_count=wc, ) def _sample_entities() -> list[EntityMeta]: return [ _entity("alpha", domain="economics", definition="the first concept in our model", source_chapter="ch01"), _entity("beta", domain="economics", definition="the second concept about markets", source_chapter="ch01"), _entity("gamma", domain="sociology", definition="a social structure framework", source_chapter="ch02"), _entity("delta", domain="sociology", definition="a social dynamic pattern", source_chapter="ch02"), _entity("epsilon", domain="philosophy", definition="an epistemic principle", source_chapter="ch03"), ] def _linear_graph() -> DependencyGraph: """A -> B -> C -> D.""" g = DependencyGraph() g.add_edge("A", "B") g.add_edge("B", "C") g.add_edge("C", "D") return g def _cyclic_graph() -> DependencyGraph: """A -> B -> C -> A (one cycle).""" g = DependencyGraph() g.add_edge("A", "B") g.add_edge("B", "C") g.add_edge("C", "A") return g def _can_import_graph_analysis(): try: from markitect.analysis.graph import connected_components # noqa: F401 return True except ImportError: return False # ── C1: Redundancy ────────────────────────────────────────────────── 
class TestRedundancy:
    """C1: tests for ``check_redundancy`` and ``RedundancyReport``."""

    def test_empty_entities(self):
        """An empty collection yields zero count, zero ratio and no pairs."""
        from markitect.infospace.checks.redundancy import check_redundancy

        report = check_redundancy([])
        assert report.entity_count == 0
        assert report.redundancy_ratio == 0.0
        assert report.similar_pairs == []

    def test_single_entity(self):
        """A single entity cannot be redundant with anything."""
        from markitect.infospace.checks.redundancy import check_redundancy

        report = check_redundancy([_entity("a", definition="hello world")])
        assert report.entity_count == 1
        assert report.redundancy_ratio == 0.0

    def test_no_overlap_word_fallback(self):
        """Without embeddings, definitions sharing no words give no pairs."""
        from markitect.infospace.checks.redundancy import check_redundancy

        entities = [
            _entity("a", definition="apple banana cherry"),
            _entity("b", definition="delta epsilon zeta"),
        ]
        report = check_redundancy(entities, threshold=0.5)
        assert report.similar_pairs == []
        assert report.redundancy_ratio == 0.0

    def test_high_overlap_word_fallback(self):
        """Without embeddings, heavily overlapping definitions are paired."""
        from markitect.infospace.checks.redundancy import check_redundancy

        entities = [
            _entity("a", definition="the quick brown fox"),
            _entity("b", definition="the quick brown dog"),
        ]
        report = check_redundancy(entities, threshold=0.5)
        assert len(report.similar_pairs) == 1
        assert report.similar_pairs[0]["method"] == "word_overlap"
        assert report.similar_pairs[0]["entity_a"] == "a"
        assert report.similar_pairs[0]["entity_b"] == "b"
        assert report.redundancy_ratio == 1.0  # both entities involved

    def test_embedding_based(self):
        """Supplied embeddings drive the comparison and tag pairs accordingly."""
        from markitect.infospace.checks.redundancy import check_redundancy

        entities = [
            _entity("a", definition="x"),
            _entity("b", definition="y"),
            _entity("c", definition="z"),
        ]
        # a and b are very similar; c is different
        embeddings = {
            "a": [1.0, 0.0, 0.0],
            "b": [0.99, 0.1, 0.0],
            "c": [0.0, 0.0, 1.0],
        }
        report = check_redundancy(entities, embeddings=embeddings, threshold=0.9)
        assert len(report.similar_pairs) >= 1
        assert report.similar_pairs[0]["method"] == "embedding"
        assert report.redundancy_ratio > 0.0

    def test_to_dict(self):
        """``to_dict`` carries the concern tag, ratio and entity count."""
        from markitect.infospace.checks.redundancy import RedundancyReport

        r = RedundancyReport(similar_pairs=[], redundancy_ratio=0.25, entity_count=10)
        d = r.to_dict()
        assert d["concern"] == "C1"
        assert d["redundancy_ratio"] == 0.25
        assert d["entity_count"] == 10


# ── C2: Coverage ────────────────────────────────────────────────────


class TestCoverage:
    """C2: tests for ``check_coverage`` and ``CoverageReport``."""

    def test_empty_entities(self):
        """An empty collection yields zero count and zero coverage."""
        from markitect.infospace.checks.coverage import check_coverage

        report = check_coverage([])
        assert report.entity_count == 0
        assert report.coverage_ratio == 0.0

    def test_full_coverage(self):
        """All domain×chapter cells are populated."""
        from markitect.infospace.checks.coverage import check_coverage

        entities = [
            _entity("a", domain="d1", source_chapter="ch1"),
            _entity("b", domain="d2", source_chapter="ch1"),
            _entity("c", domain="d1", source_chapter="ch2"),
            _entity("d", domain="d2", source_chapter="ch2"),
        ]
        report = check_coverage(entities)
        assert report.coverage_ratio == 1.0
        assert report.empty_cells == []

    def test_partial_coverage(self):
        """One cell is missing → coverage < 1.0."""
        from markitect.infospace.checks.coverage import check_coverage

        entities = [
            _entity("a", domain="d1", source_chapter="ch1"),
            _entity("b", domain="d2", source_chapter="ch1"),
            _entity("c", domain="d1", source_chapter="ch2"),
            # Missing: d2×ch2
        ]
        report = check_coverage(entities)
        assert report.coverage_ratio < 1.0
        assert len(report.empty_cells) == 1
        assert report.empty_cells[0]["dimension_a"] == "domain:d2"
        assert report.empty_cells[0]["dimension_b"] == "chapter:ch2"

    def test_domain_counts(self):
        """Per-domain counts match the sample fixture (2 / 2 / 1)."""
        from markitect.infospace.checks.coverage import check_coverage

        entities = _sample_entities()
        report = check_coverage(entities)
        assert report.domain_counts["economics"] == 2
        assert report.domain_counts["sociology"] == 2
        assert report.domain_counts["philosophy"] == 1

    def test_to_dict(self):
        """``to_dict`` carries the concern tag and coverage ratio."""
        from markitect.infospace.checks.coverage import CoverageReport

        r = CoverageReport(coverage_ratio=0.75, entity_count=8)
        d = r.to_dict()
        assert d["concern"] == "C2"
        assert d["coverage_ratio"] == 0.75

    def test_extra_attributes(self):
        """``extra_attributes`` is accepted; only the entity count is checked."""
        from markitect.infospace.checks.coverage import check_coverage

        entities = [
            _entity("a", domain="d1", source_chapter="ch1"),
        ]
        extra = {"a": {"vsm:production"}}
        report = check_coverage(entities, extra_attributes=extra)
        assert report.entity_count == 1


# ── C3: Coherence ───────────────────────────────────────────────────


class TestCoherence:
    """C3: tests for ``check_coherence`` and ``CoherenceReport``."""

    def test_no_graph(self):
        """With no graph, zero components are reported and the count is kept."""
        from markitect.infospace.checks.coherence import check_coherence

        report = check_coherence(graph=None, entity_count=5)
        assert report.connected_components == 0
        assert report.entity_count == 5

    def test_empty_graph(self):
        """An edgeless graph reports zero connected components."""
        from markitect.infospace.checks.coherence import check_coherence

        g = DependencyGraph()
        report = check_coherence(graph=g, entity_count=0)
        assert report.connected_components == 0

    def test_to_dict(self):
        """``to_dict`` carries the concern tag, modularity and component count."""
        from markitect.infospace.checks.coherence import CoherenceReport

        r = CoherenceReport(connected_components=2, modularity=0.3456, entity_count=10)
        d = r.to_dict()
        assert d["concern"] == "C3"
        assert d["modularity"] == 0.3456
        assert d["connected_components"] == 2

    @pytest.mark.skipif(
        not _can_import_graph_analysis(),
        reason="networkx not available",
    )
    def test_with_graph(self):
        """A linear graph yields at least one component (needs graph analysis)."""
        from markitect.infospace.checks.coherence import check_coherence

        g = _linear_graph()
        report = check_coherence(graph=g, entity_count=4)
        assert report.connected_components >= 1
        assert report.entity_count == 4


# ── C4: Consistency ─────────────────────────────────────────────────


class TestConsistency:
    """C4: tests for ``check_consistency`` and ``ConsistencyReport``."""

    def test_no_graph(self):
        """Without a graph no cycles are reported; the entity count is kept."""
        from markitect.infospace.checks.consistency import check_consistency

        entities = _sample_entities()
        report = check_consistency(entities)
        assert report.cycle_count == 0
        assert report.entity_count == 5

    def test_acyclic_graph(self):
        """A linear graph contains no cycles."""
        from markitect.infospace.checks.consistency import check_consistency

        entities = _sample_entities()
        g = _linear_graph()
        report = check_consistency(entities, graph=g)
        assert report.cycle_count == 0

    def test_cyclic_graph(self):
        """A graph with a cycle reports at least one cycle."""
        from markitect.infospace.checks.consistency import check_consistency

        entities = _sample_entities()
        g = _cyclic_graph()
        report = check_consistency(entities, graph=g)
        assert report.cycle_count >= 1
        assert len(report.cycles) >= 1

    def test_to_dict(self):
        """``to_dict`` carries the concern tag and cycle count."""
        from markitect.infospace.checks.consistency import ConsistencyReport

        r = ConsistencyReport(cycles=[["A", "B", "A"]], cycle_count=1, entity_count=5)
        d = r.to_dict()
        assert d["concern"] == "C4"
        assert d["cycle_count"] == 1


# ── C5: Granularity ─────────────────────────────────────────────────


class TestGranularity:
    """C5: tests for ``check_granularity`` and ``GranularityReport``."""

    def test_empty_entities(self):
        """An empty collection yields zero count and zero entropy."""
        from markitect.infospace.checks.granularity import check_granularity

        report = check_granularity([])
        assert report.entity_count == 0
        assert report.domain_entropy == 0.0

    def test_single_domain(self):
        """One domain gives zero entropy; the mean word count is reported."""
        from markitect.infospace.checks.granularity import check_granularity

        entities = [
            _entity("a", domain="d1", word_count=10),
            _entity("b", domain="d1", word_count=20),
        ]
        report = check_granularity(entities)
        assert report.domain_entropy == 0.0  # single domain = zero entropy
        assert report.entity_count == 2
        # Computed float: compare approximately, like the entropy assertions.
        assert report.word_count_stats["mean"] == pytest.approx(15.0)

    def test_balanced_domains(self):
        """Two equally sized domains give one bit of entropy."""
        from markitect.infospace.checks.granularity import check_granularity

        entities = [
            _entity("a", domain="d1", word_count=10),
            _entity("b", domain="d2", word_count=10),
        ]
        report = check_granularity(entities)
        assert report.domain_entropy == pytest.approx(1.0)  # log2(2) = 1.0
        assert report.domain_distribution == {"d1": 1, "d2": 1}

    def test_word_count_stats(self):
        """Mean, min, max and std are reported for word counts 10 and 30."""
        from markitect.infospace.checks.granularity import check_granularity

        entities = [
            _entity("a", domain="d1", word_count=10),
            _entity("b", domain="d1", word_count=30),
        ]
        report = check_granularity(entities)
        # Mean and std are derived via division / square root, so compare
        # approximately; min and max are picked values and stay exact.
        assert report.word_count_stats["mean"] == pytest.approx(20.0)
        assert report.word_count_stats["min"] == 10.0
        assert report.word_count_stats["max"] == 30.0
        assert report.word_count_stats["std"] == pytest.approx(10.0)

    def test_to_dict(self):
        """``to_dict`` carries the concern tag and domain entropy."""
        from markitect.infospace.checks.granularity import GranularityReport

        r = GranularityReport(domain_entropy=1.5, entity_count=4)
        d = r.to_dict()
        assert d["concern"] == "C5"
        assert d["domain_entropy"] == 1.5

    def test_unspecified_domain(self):
        """An empty domain is bucketed under ``"(unspecified)"``."""
        from markitect.infospace.checks.granularity import check_granularity

        entities = [_entity("a", domain="", word_count=10)]
        report = check_granularity(entities)
        assert "(unspecified)" in report.domain_distribution


# ── Orchestrator ────────────────────────────────────────────────────


class TestOrchestrator:
    """Tests for ``run_all_checks`` and ``CheckReport``."""

    def test_run_all_default(self):
        """With no ``checks`` argument every concern report is populated."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        report = run_all_checks(entities)
        assert report.redundancy is not None
        assert report.coverage is not None
        assert report.coherence is not None
        assert report.consistency is not None
        assert report.granularity is not None

    def test_run_selected_checks(self):
        """Only the requested checks are populated; the rest stay ``None``."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        report = run_all_checks(entities, checks=["redundancy", "granularity"])
        assert report.redundancy is not None
        assert report.granularity is not None
        assert report.coverage is None
        assert report.coherence is None
        assert report.consistency is None

    def test_to_dict(self):
        """``to_dict`` includes only the checks that were run."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        report = run_all_checks(entities, checks=["granularity"])
        d = report.to_dict()
        assert "granularity" in d
        assert "redundancy" not in d

    def test_metrics(self):
        """``metrics`` exposes float values for the checks that were run."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        report = run_all_checks(entities, checks=["redundancy", "granularity"])
        m = report.metrics()
        assert "redundancy_ratio" in m
        assert "granularity_entropy" in m
        assert isinstance(m["redundancy_ratio"], float)
        assert isinstance(m["granularity_entropy"], float)

    def test_metrics_empty_report(self):
        """A default-constructed report has no metrics."""
        from markitect.infospace.checks.orchestrator import CheckReport

        report = CheckReport()
        assert report.metrics() == {}

    def test_run_all_with_graph(self):
        """An acyclic graph passed through the orchestrator yields no cycles."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        g = _linear_graph()
        report = run_all_checks(entities, graph=g, checks=["consistency"])
        assert report.consistency is not None
        assert report.consistency.cycle_count == 0

    def test_run_all_with_cyclic_graph(self):
        """A cyclic graph passed through the orchestrator yields a cycle."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        g = _cyclic_graph()
        report = run_all_checks(entities, graph=g, checks=["consistency"])
        # Guard first so a missing report fails as an assertion, not as an
        # AttributeError on None (mirrors test_run_all_with_graph).
        assert report.consistency is not None
        assert report.consistency.cycle_count >= 1


# ── Shannon entropy helper ──────────────────────────────────────────


class TestShannonEntropy:
    """Tests for the private ``_shannon_entropy`` helper."""

    def test_uniform_distribution(self):
        """Four equally likely categories give two bits."""
        from markitect.infospace.checks.granularity import _shannon_entropy

        counts = {"a": 1, "b": 1, "c": 1, "d": 1}
        assert _shannon_entropy(counts) == pytest.approx(2.0)  # log2(4)

    def test_single_element(self):
        """A single category has zero entropy."""
        from markitect.infospace.checks.granularity import _shannon_entropy

        assert _shannon_entropy({"a": 10}) == 0.0

    def test_empty(self):
        """An empty distribution has zero entropy."""
        from markitect.infospace.checks.granularity import _shannon_entropy

        assert _shannon_entropy({}) == 0.0

    def test_skewed(self):
        """A 99:1 split has entropy strictly between 0 and 1 bit."""
        from markitect.infospace.checks.granularity import _shannon_entropy

        counts = {"a": 99, "b": 1}
        entropy = _shannon_entropy(counts)
        assert 0.0 < entropy < 1.0