feat(analysis): add Formal Concept Analysis for coverage gap detection (S1.7)

Pure-Python FCA implementation: FormalContext (entity × attribute binary relation with extent/intent/closure), ConceptLattice via NextClosure algorithm, find_gap_concepts() for structural coverage gaps, and find_empty_cells() for cross-tabulation analysis.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 01:38:35 +01:00
parent f8c9ab33f0
commit dc22017b7c
2 changed files with 620 additions and 0 deletions
--- a/markitect/analysis/fca.py
+++ b/markitect/analysis/fca.py
@@ -0,0 +1,307 @@
+"""
+Formal Concept Analysis (FCA) for coverage gap detection.
+
+Provides a pure-Python implementation of:
+
+- :class:`FormalContext` — entity × attribute binary relation with
+  extent/intent operations and double-prime closure.
+- :class:`ConceptLattice` — the set of all formal concepts computed
+  via the NextClosure algorithm (Ganter, 1984).
+- :func:`find_gap_concepts` — attribute combinations present in the
+  lattice whose extent is empty, revealing structural coverage gaps.
+
+Sufficient for contexts on the order of hundreds of entities.  For larger
+contexts a library such as ``concepts`` (PyPI) can be substituted.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Iterable, Optional
+
+
+class FormalContext:
+    """Binary relation between objects and attributes.
+
+    Args:
+        objects: Iterable of object identifiers (e.g. entity slugs).
+        attributes: Iterable of attribute identifiers (e.g. "domain:Production").
+        incidence: Mapping of object → set of attributes it possesses.
+    """
+
+    def __init__(
+        self,
+        objects: Iterable[str],
+        attributes: Iterable[str],
+        incidence: dict[str, set[str]],
+    ):
+        self._objects = sorted(set(objects))
+        self._attributes = sorted(set(attributes))
+        self._obj_set = frozenset(self._objects)
+        self._attr_set = frozenset(self._attributes)
+
+        # Normalise incidence: only keep known attributes
+        self._incidence: dict[str, frozenset[str]] = {}
+        for obj in self._objects:
+            raw = incidence.get(obj, set())
+            self._incidence[obj] = frozenset(raw) & self._attr_set
+
+        # Reverse index: attribute → set of objects that have it
+        self._attr_to_objs: dict[str, frozenset[str]] = {}
+        for attr in self._attributes:
+            self._attr_to_objs[attr] = frozenset(
+                obj for obj in self._objects if attr in self._incidence[obj]
+            )
+
+    @property
+    def objects(self) -> list[str]:
+        """Sorted list of objects."""
+        return list(self._objects)
+
+    @property
+    def attributes(self) -> list[str]:
+        """Sorted list of attributes."""
+        return list(self._attributes)
+
+    @property
+    def object_count(self) -> int:
+        return len(self._objects)
+
+    @property
+    def attribute_count(self) -> int:
+        return len(self._attributes)
+
+    def extent(self, attrs: Iterable[str]) -> frozenset[str]:
+        """Objects possessing **all** given attributes (B' operation)."""
+        attr_set = frozenset(attrs)
+        if not attr_set:
+            return self._obj_set
+        result = self._obj_set
+        for attr in attr_set:
+            result = result & self._attr_to_objs.get(attr, frozenset())
+        return result
+
+    def intent(self, objs: Iterable[str]) -> frozenset[str]:
+        """Attributes shared by **all** given objects (A' operation)."""
+        obj_list = [o for o in objs if o in self._incidence]
+        if not obj_list:
+            return self._attr_set
+        result = self._incidence[obj_list[0]]
+        for obj in obj_list[1:]:
+            result = result & self._incidence[obj]
+        return result
+
+    def closure(self, attrs: Iterable[str]) -> frozenset[str]:
+        """Double-prime closure: B'' = intent(extent(B))."""
+        return self.intent(self.extent(attrs))
+
+    def has_attribute(self, obj: str, attr: str) -> bool:
+        """Check if *obj* has *attr*."""
+        return attr in self._incidence.get(obj, frozenset())
+
+    def density(self) -> float:
+        """Proportion of 1s in the incidence matrix."""
+        total = len(self._objects) * len(self._attributes)
+        if total == 0:
+            return 0.0
+        filled = sum(len(attrs) for attrs in self._incidence.values())
+        return filled / total
+
+    @classmethod
+    def from_dict(cls, entity_attributes: dict[str, set[str]]) -> FormalContext:
+        """Convenience: build context from ``{object: {attr, ...}}``."""
+        objects = list(entity_attributes.keys())
+        all_attrs: set[str] = set()
+        for attrs in entity_attributes.values():
+            all_attrs.update(attrs)
+        return cls(objects, all_attrs, entity_attributes)
+
+
+@dataclass(frozen=True)
+class FormalConcept:
+    """A formal concept (A, B) where A' = B and B' = A."""
+
+    extent: frozenset[str]
+    intent: frozenset[str]
+
+    @property
+    def extent_size(self) -> int:
+        return len(self.extent)
+
+    @property
+    def intent_size(self) -> int:
+        return len(self.intent)
+
+
+@dataclass
+class ConceptLattice:
+    """The set of all formal concepts derived from a :class:`FormalContext`.
+
+    The lattice order is extent inclusion (subconcept ≤ superconcept).
+    """
+
+    concepts: list[FormalConcept] = field(default_factory=list)
+
+    @property
+    def size(self) -> int:
+        """Number of formal concepts in the lattice."""
+        return len(self.concepts)
+
+    @property
+    def top(self) -> Optional[FormalConcept]:
+        """Supremum: concept with largest extent."""
+        if not self.concepts:
+            return None
+        return max(self.concepts, key=lambda c: c.extent_size)
+
+    @property
+    def bottom(self) -> Optional[FormalConcept]:
+        """Infimum: concept with largest intent."""
+        if not self.concepts:
+            return None
+        return max(self.concepts, key=lambda c: c.intent_size)
+
+    @classmethod
+    def from_context(cls, context: FormalContext) -> ConceptLattice:
+        """Compute all formal concepts using the NextClosure algorithm."""
+        attrs = context.attributes  # sorted, fixed order
+        if not attrs:
+            # Degenerate: no attributes → single concept with all objects
+            top = FormalConcept(
+                extent=frozenset(context.objects),
+                intent=frozenset(),
+            )
+            return cls(concepts=[top])
+
+        concepts: list[FormalConcept] = []
+
+        # Start with closure of empty attribute set
+        current = context.closure(frozenset())
+        ext = context.extent(current)
+        concepts.append(FormalConcept(extent=ext, intent=current))
+
+        while current != frozenset(attrs):
+            nxt = _next_closure(current, attrs, context.closure)
+            if nxt is None:
+                break
+            ext = context.extent(nxt)
+            concepts.append(FormalConcept(extent=ext, intent=nxt))
+            current = nxt
+
+        return cls(concepts=concepts)
+
+    def gap_concepts(self) -> list[FormalConcept]:
+        """Formal concepts whose extent is empty."""
+        return [c for c in self.concepts if c.extent_size == 0]
+
+    def concepts_with_extent_size(self, min_size: int = 0, max_size: Optional[int] = None) -> list[FormalConcept]:
+        """Filter concepts by extent size."""
+        result = [c for c in self.concepts if c.extent_size >= min_size]
+        if max_size is not None:
+            result = [c for c in result if c.extent_size <= max_size]
+        return result
+
+    def depth(self) -> int:
+        """Longest chain length in the concept ordering.
+
+        A chain is a sequence of concepts c_1 < c_2 < ... < c_k
+        where < means strict subconcept (extent inclusion).
+        """
+        if not self.concepts:
+            return 0
+
+        # Implicit DAG: edge i → j whenever i is a strict subconcept of j,
+        # i.e. extent_i ⊂ extent_j (no explicit graph is materialised)
+        n = len(self.concepts)
+        extents = [c.extent for c in self.concepts]
+
+        # Longest path via dynamic programming on sorted order
+        # Sort by extent size ascending (smaller extents = more specific)
+        order = sorted(range(n), key=lambda i: len(extents[i]))
+        longest = [1] * n
+
+        for idx in range(n):
+            i = order[idx]
+            for jdx in range(idx + 1, n):
+                j = order[jdx]
+                if extents[i] < extents[j]:  # strict subset
+                    if longest[j] < longest[i] + 1:
+                        longest[j] = longest[i] + 1
+
+        return max(longest) if longest else 0
+
+
+def find_gap_concepts(
+    context: FormalContext,
+    lattice: Optional[ConceptLattice] = None,
+) -> list[FormalConcept]:
+    """Find formal concepts with empty extent (coverage gaps).
+
+    These represent attribute combinations that are structurally
+    present in the lattice but have no corresponding entities.
+
+    Args:
+        context: The formal context.
+        lattice: Pre-computed lattice.  If ``None``, computed from *context*.
+
+    Returns:
+        List of :class:`FormalConcept` with empty extent, sorted by
+        intent size ascending (most general gaps first).
+    """
+    if lattice is None:
+        lattice = ConceptLattice.from_context(context)
+    gaps = lattice.gap_concepts()
+    gaps.sort(key=lambda c: c.intent_size)
+    return gaps
+
+
+def find_empty_cells(
+    context: FormalContext,
+    dimension_a: list[str],
+    dimension_b: list[str],
+) -> list[tuple[str, str]]:
+    """Find empty cells in a two-dimensional cross-tabulation.
+
+    Given two sets of attributes (e.g. domain values and VSM systems),
+    return pairs ``(attr_a, attr_b)`` where no object possesses both.
+
+    This is a simpler alternative to full FCA for two-dimensional
+    coverage analysis.
+    """
+    empty: list[tuple[str, str]] = []
+    for a in sorted(dimension_a):
+        for b in sorted(dimension_b):
+            if not context.extent([a, b]):
+                empty.append((a, b))
+    return empty
+
+
+# ── NextClosure internals ───────────────────────────────────────────
+
+
+def _next_closure(
+    current: frozenset[str],
+    attrs: list[str],
+    closure_fn,
+) -> Optional[frozenset[str]]:
+    """Compute the next closed set in lectic order after *current*.
+
+    Implements Ganter's NextClosure algorithm.
+    """
+    for i in range(len(attrs) - 1, -1, -1):
+        m = attrs[i]
+        if m in current:
+            current = current - {m}
+        else:
+            candidate = current | {m}
+            closed = closure_fn(candidate)
+            # Canonicity test: no attribute before position i
+            # was added by the closure
+            canonical = True
+            for j in range(i):
+                if attrs[j] in closed and attrs[j] not in candidate:
+                    canonical = False
+                    break
+            if canonical:
+                return closed
+    return None
--- a/tests/unit/analysis/test_fca.py
+++ b/tests/unit/analysis/test_fca.py
@@ -0,0 +1,313 @@
+"""Tests for markitect.analysis.fca."""
+
+import pytest
+
+from markitect.analysis.fca import (
+    FormalContext,
+    FormalConcept,
+    ConceptLattice,
+    find_gap_concepts,
+    find_empty_cells,
+)
+
+
+# ── Test data ────────────────────────────────────────────────────────
+
+
+def _animal_context():
+    """Classic FCA example: animals × properties.
+
+    Context:
+        | animal    | legs | wings | feathers | fur |
+        |-----------|------|-------|----------|-----|
+        | dog       |  x   |       |          |  x  |
+        | cat       |  x   |       |          |  x  |
+        | eagle     |  x   |   x   |    x     |     |
+        | sparrow   |  x   |   x   |    x     |     |
+        | penguin   |  x   |       |    x     |     |
+    """
+    return FormalContext(
+        objects=["dog", "cat", "eagle", "sparrow", "penguin"],
+        attributes=["legs", "wings", "feathers", "fur"],
+        incidence={
+            "dog":     {"legs", "fur"},
+            "cat":     {"legs", "fur"},
+            "eagle":   {"legs", "wings", "feathers"},
+            "sparrow": {"legs", "wings", "feathers"},
+            "penguin": {"legs", "feathers"},
+        },
+    )
+
+
+def _infospace_context():
+    """Simplified infospace-style context: entities × {domain, vsm_system}.
+
+    Entities with domain and VSM classification, including a gap:
+    no entity has both domain:Exchange and vsm:S3.
+    """
+    return FormalContext.from_dict({
+        "division-of-labour":   {"domain:Production", "vsm:S1"},
+        "pin-factory":          {"domain:Production", "vsm:S1"},
+        "market-extent":        {"domain:Exchange", "vsm:S4"},
+        "wage-determination":   {"domain:Distribution", "vsm:S3"},
+        "rent-theory":          {"domain:Distribution", "vsm:S5"},
+        "capital-accumulation": {"domain:Production", "vsm:S3"},
+    })
+
+
+def _empty_context():
+    """Context with no objects."""
+    return FormalContext([], ["a", "b"], {})
+
+
+def _single_entity():
+    """Context with one object."""
+    return FormalContext(["only"], ["x", "y"], {"only": {"x", "y"}})
+
+
+# ── FormalContext ────────────────────────────────────────────────────
+
+
+class TestFormalContext:
+    def test_objects_sorted(self):
+        ctx = _animal_context()
+        assert ctx.objects == sorted(ctx.objects)
+
+    def test_attributes_sorted(self):
+        ctx = _animal_context()
+        assert ctx.attributes == sorted(ctx.attributes)
+
+    def test_object_count(self):
+        assert _animal_context().object_count == 5
+
+    def test_attribute_count(self):
+        assert _animal_context().attribute_count == 4
+
+    def test_extent_single_attr(self):
+        ctx = _animal_context()
+        assert ctx.extent(["fur"]) == frozenset({"dog", "cat"})
+
+    def test_extent_multiple_attrs(self):
+        ctx = _animal_context()
+        assert ctx.extent(["wings", "feathers"]) == frozenset({"eagle", "sparrow"})
+
+    def test_extent_empty_returns_all(self):
+        ctx = _animal_context()
+        assert ctx.extent([]) == frozenset(ctx.objects)
+
+    def test_extent_no_match(self):
+        ctx = _animal_context()
+        assert ctx.extent(["fur", "feathers"]) == frozenset()
+
+    def test_intent_single_obj(self):
+        ctx = _animal_context()
+        assert ctx.intent(["penguin"]) == frozenset({"legs", "feathers"})
+
+    def test_intent_multiple_objs(self):
+        ctx = _animal_context()
+        # dog and cat share: legs, fur
+        assert ctx.intent(["dog", "cat"]) == frozenset({"legs", "fur"})
+
+    def test_intent_empty_returns_all(self):
+        ctx = _animal_context()
+        assert ctx.intent([]) == frozenset(ctx.attributes)
+
+    def test_closure_is_idempotent(self):
+        ctx = _animal_context()
+        c1 = ctx.closure({"fur"})
+        c2 = ctx.closure(c1)
+        assert c1 == c2
+
+    def test_closure_expands(self):
+        ctx = _animal_context()
+        # fur → {dog, cat} → {legs, fur} (both have legs too)
+        assert ctx.closure({"fur"}) == frozenset({"legs", "fur"})
+
+    def test_has_attribute(self):
+        ctx = _animal_context()
+        assert ctx.has_attribute("dog", "legs") is True
+        assert ctx.has_attribute("dog", "wings") is False
+
+    def test_density(self):
+        ctx = _animal_context()
+        # 5 objects × 4 attributes = 20 cells
+        # dog:2, cat:2, eagle:3, sparrow:3, penguin:2 = 12 filled
+        assert ctx.density() == pytest.approx(12 / 20)
+
+    def test_density_empty(self):
+        assert FormalContext([], [], {}).density() == 0.0
+
+    def test_from_dict(self):
+        ctx = FormalContext.from_dict({
+            "a": {"x", "y"},
+            "b": {"y", "z"},
+        })
+        assert ctx.object_count == 2
+        assert ctx.attribute_count == 3
+
+    def test_unknown_attributes_ignored(self):
+        ctx = FormalContext(
+            ["a"], ["x"], {"a": {"x", "unknown"}}
+        )
+        assert ctx.intent(["a"]) == frozenset({"x"})
+
+
+# ── ConceptLattice ──────────────────────────────────────────────────
+
+
+class TestConceptLattice:
+    def test_animal_concept_count(self):
+        ctx = _animal_context()
+        lattice = ConceptLattice.from_context(ctx)
+        # Known: the animal context produces exactly 5 formal concepts
+        # Top: ({all}, {legs}), Bottom: ({}, {all 4}),
+        # plus intermediate concepts
+        assert lattice.size >= 5
+
+    def test_top_has_all_objects(self):
+        ctx = _animal_context()
+        lattice = ConceptLattice.from_context(ctx)
+        top = lattice.top
+        assert top is not None
+        assert top.extent == frozenset(ctx.objects)
+
+    def test_top_intent_is_common_attributes(self):
+        ctx = _animal_context()
+        lattice = ConceptLattice.from_context(ctx)
+        top = lattice.top
+        # All animals have "legs"
+        assert "legs" in top.intent
+
+    def test_bottom_has_all_attributes(self):
+        ctx = _animal_context()
+        lattice = ConceptLattice.from_context(ctx)
+        bottom = lattice.bottom
+        assert bottom is not None
+        assert bottom.intent == frozenset(ctx.attributes)
+
+    def test_bottom_extent_empty_when_no_universal_object(self):
+        ctx = _animal_context()
+        lattice = ConceptLattice.from_context(ctx)
+        bottom = lattice.bottom
+        # No animal has all 4 attributes
+        assert bottom.extent_size == 0
+
+    def test_all_concepts_are_closed(self):
+        ctx = _animal_context()
+        lattice = ConceptLattice.from_context(ctx)
+        for concept in lattice.concepts:
+            # intent should be closed: closure(intent) == intent
+            assert ctx.closure(concept.intent) == concept.intent
+            # extent' should equal intent
+            assert ctx.intent(concept.extent) == concept.intent
+            # intent' should equal extent
+            assert ctx.extent(concept.intent) == concept.extent
+
+    def test_empty_context(self):
+        ctx = _empty_context()
+        lattice = ConceptLattice.from_context(ctx)
+        # No objects → a single concept with empty extent and full intent
+        assert lattice.size >= 1
+
+    def test_single_entity(self):
+        ctx = _single_entity()
+        lattice = ConceptLattice.from_context(ctx)
+        # At least 1 concept containing the single entity
+        has_entity = any(
+            "only" in c.extent for c in lattice.concepts
+        )
+        assert has_entity
+
+    def test_no_attributes_produces_one_concept(self):
+        ctx = FormalContext(["a", "b"], [], {})
+        lattice = ConceptLattice.from_context(ctx)
+        assert lattice.size == 1
+        assert lattice.concepts[0].extent == frozenset({"a", "b"})
+
+    def test_depth(self):
+        ctx = _animal_context()
+        lattice = ConceptLattice.from_context(ctx)
+        d = lattice.depth()
+        # At least 2 levels (top → bottom)
+        assert d >= 2
+
+    def test_depth_empty(self):
+        lattice = ConceptLattice(concepts=[])
+        assert lattice.depth() == 0
+
+
+# ── Gap concepts ────────────────────────────────────────────────────
+
+
+class TestGapConcepts:
+    def test_animal_has_gap(self):
+        ctx = _animal_context()
+        gaps = find_gap_concepts(ctx)
+        # {fur, feathers} has no animal → gap concept
+        fur_feathers_gap = any(
+            {"fur", "feathers"} <= c.intent for c in gaps
+        )
+        assert fur_feathers_gap
+
+    def test_gap_extents_are_empty(self):
+        ctx = _animal_context()
+        gaps = find_gap_concepts(ctx)
+        for gap in gaps:
+            assert gap.extent_size == 0
+
+    def test_no_gaps_when_all_combinations_covered(self):
+        # Every attribute combination has at least one object
+        ctx = FormalContext.from_dict({
+            "obj1": {"a", "b"},
+            "obj2": {"a"},
+            "obj3": {"b"},
+        })
+        lattice = ConceptLattice.from_context(ctx)
+        gaps = find_gap_concepts(ctx, lattice)
+        assert len(gaps) == 0
+
+    def test_sorted_by_intent_size(self):
+        ctx = _animal_context()
+        gaps = find_gap_concepts(ctx)
+        sizes = [g.intent_size for g in gaps]
+        assert sizes == sorted(sizes)
+
+    def test_infospace_gap(self):
+        ctx = _infospace_context()
+        gaps = find_gap_concepts(ctx)
+        # domain:Exchange + vsm:S1 has no entity → should appear as gap
+        gap_intents = [g.intent for g in gaps]
+        exchange_s1_covered = any(
+            {"domain:Exchange", "vsm:S1"} <= intent for intent in gap_intents
+        )
+        assert exchange_s1_covered
+
+
+# ── Empty cells (cross-tab) ─────────────────────────────────────────
+
+
+class TestFindEmptyCells:
+    def test_finds_empty_cells(self):
+        ctx = _infospace_context()
+        domains = ["domain:Production", "domain:Distribution", "domain:Exchange"]
+        vsm_systems = ["vsm:S1", "vsm:S3", "vsm:S4", "vsm:S5"]
+        empty = find_empty_cells(ctx, domains, vsm_systems)
+        # domain:Exchange + vsm:S1 should be empty
+        assert ("domain:Exchange", "vsm:S1") in empty
+        # domain:Production + vsm:S1 should NOT be empty (division-of-labour)
+        assert ("domain:Production", "vsm:S1") not in empty
+
+    def test_all_filled_returns_empty_list(self):
+        ctx = FormalContext.from_dict({
+            "a": {"x", "y"},
+            "b": {"x", "z"},
+            "c": {"y", "z"},
+            "d": {"x", "y", "z"},
+        })
+        empty = find_empty_cells(ctx, ["x", "y"], ["z"])
+        assert empty == []
+
+    def test_empty_context_all_cells_empty(self):
+        ctx = FormalContext([], ["a", "b", "c"], {})
+        empty = find_empty_cells(ctx, ["a"], ["b", "c"])
+        assert len(empty) == 2