Files
markitect-main/tests/unit/infospace/test_checks.py
tegwick 11585e6968 feat(infospace): add collection-level quality checks C1–C5 (S2.4)
Five concern checks: Redundancy (embedding/word overlap), Coverage
(FCA gap analysis), Coherence (graph connectivity), Consistency
(cycle detection), Granularity (Shannon entropy). Orchestrator runs
all or selected checks, CLI `markitect infospace check` command added.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 01:54:22 +01:00

414 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Tests for collection-level quality checks (S2.4).
Covers all five concerns: Redundancy (C1), Coverage (C2), Coherence (C3),
Consistency (C4), Granularity (C5), and the orchestrator.
"""
from __future__ import annotations
import math
import pytest
from markitect.infospace.models import EntityMeta
from markitect.prompts.dependencies.models import DependencyGraph
# ── helpers ──────────────────────────────────────────────────────────
def _entity(
    slug: str,
    domain: str = "",
    definition: str = "",
    source_chapter: str = "",
    word_count: int = 0,
) -> EntityMeta:
    """Build a minimal ``EntityMeta`` fixture for the check tests.

    The title and raw H1 are both derived from *slug*: hyphens become spaces
    and the result is title-cased. When *word_count* is falsy the count falls
    back to the number of whitespace-separated tokens in *definition* (0 when
    the definition is empty). The same count fills both the definition and
    the total word-count fields.
    """
    display_name = slug.replace("-", " ").title()

    if word_count:
        words = word_count
    elif definition:
        words = len(definition.split())
    else:
        words = 0

    return EntityMeta(
        slug=slug,
        title=display_name,
        h1_raw=display_name,
        definition=definition,
        domain=domain,
        source_chapter=source_chapter,
        definition_word_count=words,
        total_word_count=words,
    )
def _sample_entities() -> list[EntityMeta]:
    """Return five fixture entities spanning three domains and three chapters.

    Two ``economics`` entities (ch01), two ``sociology`` entities (ch02) and
    one ``philosophy`` entity (ch03), in that order.
    """
    # (slug, domain, definition, source_chapter)
    rows = (
        ("alpha", "economics", "the first concept in our model", "ch01"),
        ("beta", "economics", "the second concept about markets", "ch01"),
        ("gamma", "sociology", "a social structure framework", "ch02"),
        ("delta", "sociology", "a social dynamic pattern", "ch02"),
        ("epsilon", "philosophy", "an epistemic principle", "ch03"),
    )
    return [
        _entity(slug, domain=domain, definition=text, source_chapter=chapter)
        for slug, domain, text, chapter in rows
    ]
def _linear_graph() -> DependencyGraph:
    """Build the chain A -> B -> C -> D via three ``add_edge`` calls."""
    chain = DependencyGraph()
    for source, target in (("A", "B"), ("B", "C"), ("C", "D")):
        chain.add_edge(source, target)
    return chain
def _cyclic_graph() -> DependencyGraph:
    """Build the loop A -> B -> C -> A (one cycle) via three ``add_edge`` calls."""
    loop = DependencyGraph()
    for source, target in (("A", "B"), ("B", "C"), ("C", "A")):
        loop.add_edge(source, target)
    return loop
def _can_import_graph_analysis():
try:
from markitect.analysis.graph import connected_components # noqa: F401
return True
except ImportError:
return False
# ── C1: Redundancy ──────────────────────────────────────────────────
class TestRedundancy:
    """C1 redundancy check: word-overlap fallback and embedding similarity."""

    def test_empty_entities(self):
        """An empty collection yields a zeroed report with no pairs."""
        from markitect.infospace.checks.redundancy import check_redundancy

        result = check_redundancy([])

        assert result.entity_count == 0
        assert result.redundancy_ratio == 0.0
        assert result.similar_pairs == []

    def test_single_entity(self):
        """A single entity is counted and has a zero redundancy ratio."""
        from markitect.infospace.checks.redundancy import check_redundancy

        only = _entity("a", definition="hello world")
        result = check_redundancy([only])

        assert result.entity_count == 1
        assert result.redundancy_ratio == 0.0

    def test_no_overlap_word_fallback(self):
        """Definitions with no shared words produce no similar pairs."""
        from markitect.infospace.checks.redundancy import check_redundancy

        first = _entity("a", definition="apple banana cherry")
        second = _entity("b", definition="delta epsilon zeta")
        result = check_redundancy([first, second], threshold=0.5)

        assert result.similar_pairs == []
        assert result.redundancy_ratio == 0.0

    def test_high_overlap_word_fallback(self):
        """Definitions sharing three of four words are reported as one pair."""
        from markitect.infospace.checks.redundancy import check_redundancy

        first = _entity("a", definition="the quick brown fox")
        second = _entity("b", definition="the quick brown dog")
        result = check_redundancy([first, second], threshold=0.5)

        assert len(result.similar_pairs) == 1
        pair = result.similar_pairs[0]
        assert pair["method"] == "word_overlap"
        assert pair["entity_a"] == "a"
        assert pair["entity_b"] == "b"
        assert result.redundancy_ratio == 1.0  # both entities involved

    def test_embedding_based(self):
        """When embeddings are supplied, pairs are tagged with the embedding method."""
        from markitect.infospace.checks.redundancy import check_redundancy

        entities = [
            _entity(slug, definition=text)
            for slug, text in (("a", "x"), ("b", "y"), ("c", "z"))
        ]
        # "a" and "b" point in almost the same direction; "c" is orthogonal.
        vectors = {
            "a": [1.0, 0.0, 0.0],
            "b": [0.99, 0.1, 0.0],
            "c": [0.0, 0.0, 1.0],
        }
        result = check_redundancy(entities, embeddings=vectors, threshold=0.9)

        assert len(result.similar_pairs) >= 1
        assert result.similar_pairs[0]["method"] == "embedding"
        assert result.redundancy_ratio > 0.0

    def test_to_dict(self):
        """``to_dict`` exposes the concern id, ratio and entity count."""
        from markitect.infospace.checks.redundancy import RedundancyReport

        payload = RedundancyReport(
            similar_pairs=[],
            redundancy_ratio=0.25,
            entity_count=10,
        ).to_dict()

        assert payload["concern"] == "C1"
        assert payload["redundancy_ratio"] == 0.25
        assert payload["entity_count"] == 10
# ── C2: Coverage ────────────────────────────────────────────────────
class TestCoverage:
    """C2 coverage check: population of the domain x chapter grid."""

    def test_empty_entities(self):
        """An empty collection yields zero entities and zero coverage."""
        from markitect.infospace.checks.coverage import check_coverage

        result = check_coverage([])

        assert result.entity_count == 0
        assert result.coverage_ratio == 0.0

    def test_full_coverage(self):
        """All domain×chapter cells are populated."""
        from markitect.infospace.checks.coverage import check_coverage

        grid = (
            ("a", "d1", "ch1"),
            ("b", "d2", "ch1"),
            ("c", "d1", "ch2"),
            ("d", "d2", "ch2"),
        )
        entities = [
            _entity(slug, domain=domain, source_chapter=chapter)
            for slug, domain, chapter in grid
        ]
        result = check_coverage(entities)

        assert result.coverage_ratio == 1.0
        assert result.empty_cells == []

    def test_partial_coverage(self):
        """One cell is missing → coverage < 1.0."""
        from markitect.infospace.checks.coverage import check_coverage

        # The d2×ch2 cell is deliberately left out.
        grid = (
            ("a", "d1", "ch1"),
            ("b", "d2", "ch1"),
            ("c", "d1", "ch2"),
        )
        entities = [
            _entity(slug, domain=domain, source_chapter=chapter)
            for slug, domain, chapter in grid
        ]
        result = check_coverage(entities)

        assert result.coverage_ratio < 1.0
        assert len(result.empty_cells) == 1
        gap = result.empty_cells[0]
        assert gap["dimension_a"] == "domain:d2"
        assert gap["dimension_b"] == "chapter:ch2"

    def test_domain_counts(self):
        """Per-domain counts match the sample fixture (2 / 2 / 1)."""
        from markitect.infospace.checks.coverage import check_coverage

        result = check_coverage(_sample_entities())

        assert result.domain_counts["economics"] == 2
        assert result.domain_counts["sociology"] == 2
        assert result.domain_counts["philosophy"] == 1

    def test_to_dict(self):
        """``to_dict`` exposes the concern id and the coverage ratio."""
        from markitect.infospace.checks.coverage import CoverageReport

        payload = CoverageReport(coverage_ratio=0.75, entity_count=8).to_dict()

        assert payload["concern"] == "C2"
        assert payload["coverage_ratio"] == 0.75

    def test_extra_attributes(self):
        """Passing ``extra_attributes`` is accepted and the entity is still counted."""
        from markitect.infospace.checks.coverage import check_coverage

        entities = [_entity("a", domain="d1", source_chapter="ch1")]
        extras = {"a": {"vsm:production"}}
        result = check_coverage(entities, extra_attributes=extras)

        assert result.entity_count == 1
# ── C3: Coherence ───────────────────────────────────────────────────
class TestCoherence:
    """C3 coherence check: connected components of the dependency graph."""

    def test_no_graph(self):
        """Without a graph, zero components are reported and the count is passed through."""
        from markitect.infospace.checks.coherence import check_coherence

        result = check_coherence(graph=None, entity_count=5)

        assert result.connected_components == 0
        assert result.entity_count == 5

    def test_empty_graph(self):
        """A graph with no edges added reports zero connected components."""
        from markitect.infospace.checks.coherence import check_coherence

        empty = DependencyGraph()
        result = check_coherence(graph=empty, entity_count=0)

        assert result.connected_components == 0

    def test_to_dict(self):
        """``to_dict`` exposes the concern id, modularity and component count."""
        from markitect.infospace.checks.coherence import CoherenceReport

        payload = CoherenceReport(
            connected_components=2,
            modularity=0.3456,
            entity_count=10,
        ).to_dict()

        assert payload["concern"] == "C3"
        assert payload["modularity"] == 0.3456
        assert payload["connected_components"] == 2

    @pytest.mark.skipif(
        not _can_import_graph_analysis(),
        reason="networkx not available",
    )
    def test_with_graph(self):
        """A linear chain yields at least one component (skipped if graph analysis is unavailable)."""
        from markitect.infospace.checks.coherence import check_coherence

        chain = _linear_graph()
        result = check_coherence(graph=chain, entity_count=4)

        assert result.connected_components >= 1
        assert result.entity_count == 4
# ── C4: Consistency ─────────────────────────────────────────────────
class TestConsistency:
    """C4 consistency check: cycle detection in the dependency graph."""

    def test_no_graph(self):
        """Without a graph no cycles are reported and all entities are counted."""
        from markitect.infospace.checks.consistency import check_consistency

        result = check_consistency(_sample_entities())

        assert result.cycle_count == 0
        assert result.entity_count == 5

    def test_acyclic_graph(self):
        """A linear chain contains no cycles."""
        from markitect.infospace.checks.consistency import check_consistency

        sample = _sample_entities()
        chain = _linear_graph()
        result = check_consistency(sample, graph=chain)

        assert result.cycle_count == 0

    def test_cyclic_graph(self):
        """The A -> B -> C -> A loop is reported as at least one cycle."""
        from markitect.infospace.checks.consistency import check_consistency

        sample = _sample_entities()
        loop = _cyclic_graph()
        result = check_consistency(sample, graph=loop)

        assert result.cycle_count >= 1
        assert len(result.cycles) >= 1

    def test_to_dict(self):
        """``to_dict`` exposes the concern id and the cycle count."""
        from markitect.infospace.checks.consistency import ConsistencyReport

        payload = ConsistencyReport(
            cycles=[["A", "B", "A"]],
            cycle_count=1,
            entity_count=5,
        ).to_dict()

        assert payload["concern"] == "C4"
        assert payload["cycle_count"] == 1
# ── C5: Granularity ─────────────────────────────────────────────────
class TestGranularity:
    """C5 granularity check: domain entropy and word-count statistics.

    Values that the check computes arithmetically (entropy, mean, standard
    deviation) are compared with ``pytest.approx`` so that the tests do not
    depend on the exact floating-point evaluation order of the implementation.
    """

    def test_empty_entities(self):
        """An empty collection yields zero entities and zero entropy."""
        from markitect.infospace.checks.granularity import check_granularity

        report = check_granularity([])

        assert report.entity_count == 0
        assert report.domain_entropy == 0.0

    def test_single_domain(self):
        """All entities in one domain: zero entropy, mean of the two counts."""
        from markitect.infospace.checks.granularity import check_granularity

        entities = [
            _entity("a", domain="d1", word_count=10),
            _entity("b", domain="d1", word_count=20),
        ]
        report = check_granularity(entities)

        # single domain = zero entropy
        assert report.domain_entropy == pytest.approx(0.0)
        assert report.entity_count == 2
        assert report.word_count_stats["mean"] == pytest.approx(15.0)

    def test_balanced_domains(self):
        """Two equally sized domains give one bit of entropy."""
        from markitect.infospace.checks.granularity import check_granularity

        entities = [
            _entity("a", domain="d1", word_count=10),
            _entity("b", domain="d2", word_count=10),
        ]
        report = check_granularity(entities)

        assert report.domain_entropy == pytest.approx(1.0)  # log2(2) = 1.0
        assert report.domain_distribution == {"d1": 1, "d2": 1}

    def test_word_count_stats(self):
        """Mean, min, max and std are reported for word counts 10 and 30."""
        from markitect.infospace.checks.granularity import check_granularity

        entities = [
            _entity("a", domain="d1", word_count=10),
            _entity("b", domain="d1", word_count=30),
        ]
        stats = check_granularity(entities).word_count_stats

        assert stats["mean"] == pytest.approx(20.0)
        # min/max are expected to be the input counts themselves.
        assert stats["min"] == 10.0
        assert stats["max"] == 30.0
        # The expected 10.0 is the population std of [10, 30].
        assert stats["std"] == pytest.approx(10.0)

    def test_to_dict(self):
        """``to_dict`` exposes the concern id and the domain entropy."""
        from markitect.infospace.checks.granularity import GranularityReport

        r = GranularityReport(domain_entropy=1.5, entity_count=4)
        d = r.to_dict()

        assert d["concern"] == "C5"
        assert d["domain_entropy"] == 1.5

    def test_unspecified_domain(self):
        """An empty domain string is bucketed under ``(unspecified)``."""
        from markitect.infospace.checks.granularity import check_granularity

        entities = [_entity("a", domain="", word_count=10)]
        report = check_granularity(entities)

        assert "(unspecified)" in report.domain_distribution
# ── Orchestrator ────────────────────────────────────────────────────
class TestOrchestrator:
    """Orchestrator: running all or a selected subset of the C1–C5 checks."""

    def test_run_all_default(self):
        """With no selection, every concern report is populated."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        report = run_all_checks(entities)

        assert report.redundancy is not None
        assert report.coverage is not None
        assert report.coherence is not None
        assert report.consistency is not None
        assert report.granularity is not None

    def test_run_selected_checks(self):
        """Only the requested checks run; the others stay ``None``."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        report = run_all_checks(entities, checks=["redundancy", "granularity"])

        assert report.redundancy is not None
        assert report.granularity is not None
        assert report.coverage is None
        assert report.coherence is None
        assert report.consistency is None

    def test_to_dict(self):
        """``to_dict`` includes only the checks that were run."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        report = run_all_checks(entities, checks=["granularity"])
        d = report.to_dict()

        assert "granularity" in d
        assert "redundancy" not in d

    def test_metrics(self):
        """``metrics`` exposes float-valued entries for the checks that ran."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        report = run_all_checks(entities, checks=["redundancy", "granularity"])
        m = report.metrics()

        assert "redundancy_ratio" in m
        assert "granularity_entropy" in m
        assert isinstance(m["redundancy_ratio"], float)
        assert isinstance(m["granularity_entropy"], float)

    def test_metrics_empty_report(self):
        """A default-constructed report has no metrics."""
        from markitect.infospace.checks.orchestrator import CheckReport

        report = CheckReport()

        assert report.metrics() == {}

    def test_run_all_with_graph(self):
        """A linear graph passed through the orchestrator yields no cycles."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        g = _linear_graph()
        report = run_all_checks(entities, graph=g, checks=["consistency"])

        assert report.consistency is not None
        assert report.consistency.cycle_count == 0

    def test_run_all_with_cyclic_graph(self):
        """A cyclic graph passed through the orchestrator yields at least one cycle."""
        from markitect.infospace.checks.orchestrator import run_all_checks

        entities = _sample_entities()
        g = _cyclic_graph()
        report = run_all_checks(entities, graph=g, checks=["consistency"])

        # Guard first so a missing report fails as an assertion rather than
        # an AttributeError on ``None``.
        assert report.consistency is not None
        assert report.consistency.cycle_count >= 1
# ── Shannon entropy helper ──────────────────────────────────────────
class TestShannonEntropy:
    """Direct tests of the ``_shannon_entropy`` helper (base-2 entropy of a count map)."""

    def test_uniform_distribution(self):
        """Four equally likely outcomes carry log2(4) bits."""
        from markitect.infospace.checks.granularity import _shannon_entropy

        uniform = dict.fromkeys(("a", "b", "c", "d"), 1)

        assert _shannon_entropy(uniform) == pytest.approx(math.log2(4))

    def test_single_element(self):
        """A single outcome has zero entropy."""
        from markitect.infospace.checks.granularity import _shannon_entropy

        result = _shannon_entropy({"a": 10})

        assert result == 0.0

    def test_empty(self):
        """An empty count map has zero entropy."""
        from markitect.infospace.checks.granularity import _shannon_entropy

        result = _shannon_entropy({})

        assert result == 0.0

    def test_skewed(self):
        """A 99:1 split lies strictly between zero and one bit."""
        from markitect.infospace.checks.granularity import _shannon_entropy

        skewed = {"a": 99, "b": 1}
        bits = _shannon_entropy(skewed)

        assert 0.0 < bits < 1.0