markitect-main/markitect/analysis/fca.py

"""
Formal Concept Analysis (FCA) for coverage gap detection.

Provides a pure-Python implementation of:

- :class:`FormalContext` — entity × attribute binary relation with
  extent/intent operations and double-prime closure.
- :class:`ConceptLattice` — the set of all formal concepts computed
  via the NextClosure algorithm (Ganter, 1984).
- :func:`find_gap_concepts` — attribute combinations present in the
  lattice whose extent is empty, revealing structural coverage gaps.

Sufficient for entity scales of ~100s.  For larger contexts a library
such as ``concepts`` (PyPI) can be substituted.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Iterable, Optional


class FormalContext:
    """Object × attribute incidence table with the FCA derivation operators.

    Construction normalises the input: duplicate objects and attributes
    are dropped and both collections are stored in sorted order.  For every
    known object the incidence is restricted to the known attributes;
    objects missing from *incidence* get an empty attribute set, and keys
    of *incidence* that are not listed in *objects* are ignored.

    Args:
        objects: Iterable of object identifiers (e.g. entity slugs).
        attributes: Iterable of attribute identifiers (e.g. "domain:Production").
        incidence: Mapping of object → set of attributes it possesses.
    """

    def __init__(
        self,
        objects: Iterable[str],
        attributes: Iterable[str],
        incidence: dict[str, set[str]],
    ):
        self._objects = sorted(set(objects))
        self._attributes = sorted(set(attributes))
        self._obj_set = frozenset(self._objects)
        self._attr_set = frozenset(self._attributes)

        # Forward index: object → its attributes, limited to known attributes.
        self._incidence: dict[str, frozenset[str]] = {
            obj: frozenset(incidence.get(obj, set())) & self._attr_set
            for obj in self._objects
        }

        # Reverse index: attribute → objects carrying it.  Every known
        # attribute gets an entry, even when no object has it.
        holders: dict[str, list[str]] = {attr: [] for attr in self._attributes}
        for obj, owned in self._incidence.items():
            for attr in owned:
                holders[attr].append(obj)
        self._attr_to_objs: dict[str, frozenset[str]] = {
            attr: frozenset(objs) for attr, objs in holders.items()
        }

    @property
    def objects(self) -> list[str]:
        """Sorted objects, returned as a fresh list on each access."""
        return self._objects[:]

    @property
    def attributes(self) -> list[str]:
        """Sorted attributes, returned as a fresh list on each access."""
        return self._attributes[:]

    @property
    def object_count(self) -> int:
        """Number of distinct objects."""
        return len(self._objects)

    @property
    def attribute_count(self) -> int:
        """Number of distinct attributes."""
        return len(self._attributes)

    def extent(self, attrs: Iterable[str]) -> frozenset[str]:
        """Return the objects possessing **all** of *attrs* (B' operation).

        An empty *attrs* yields every object; an attribute unknown to the
        context yields the empty set.
        """
        per_attribute = (
            self._attr_to_objs.get(attr, frozenset())
            for attr in frozenset(attrs)
        )
        return self._obj_set.intersection(*per_attribute)

    def intent(self, objs: Iterable[str]) -> frozenset[str]:
        """Return the attributes shared by **all** of *objs* (A' operation).

        Objects unknown to the context are skipped.  When nothing remains
        (including the case of an empty *objs*) every attribute is returned.
        """
        rows = [self._incidence[obj] for obj in objs if obj in self._incidence]
        if not rows:
            return self._attr_set
        return frozenset.intersection(*rows)

    def closure(self, attrs: Iterable[str]) -> frozenset[str]:
        """Return the double-prime closure B'' = intent(extent(B))."""
        matching_objects = self.extent(attrs)
        return self.intent(matching_objects)

    def has_attribute(self, obj: str, attr: str) -> bool:
        """Return ``True`` when *obj* is known and possesses *attr*."""
        owned = self._incidence.get(obj)
        return owned is not None and attr in owned

    def density(self) -> float:
        """Return the proportion of filled cells in the incidence matrix.

        Returns ``0.0`` when the context has no objects or no attributes.
        """
        cells = self.object_count * self.attribute_count
        if not cells:
            return 0.0
        return sum(map(len, self._incidence.values())) / cells

    @classmethod
    def from_dict(cls, entity_attributes: dict[str, set[str]]) -> FormalContext:
        """Build a context from ``{object: {attr, ...}}``.

        The objects are the mapping's keys and the attributes are the
        union of all its values.
        """
        all_attrs: set[str] = set().union(*entity_attributes.values())
        return cls(list(entity_attributes), all_attrs, entity_attributes)


@dataclass(frozen=True)
class FormalConcept:
    """Immutable ``(extent, intent)`` pair representing a formal concept.

    In a formal concept (A, B) the two halves determine each other:
    A' = B and B' = A.  This class only stores the pair; it does not
    check that relationship itself.
    """

    # A: the objects belonging to the concept.
    extent: frozenset[str]
    # B: the attributes belonging to the concept.
    intent: frozenset[str]

    @property
    def intent_size(self) -> int:
        """Number of attributes in the intent."""
        return len(self.intent)

    @property
    def extent_size(self) -> int:
        """Number of objects in the extent."""
        return len(self.extent)


@dataclass
class ConceptLattice:
    """Container for the formal concepts derived from a :class:`FormalContext`.

    The concepts are held in a plain list.  When built through
    :meth:`from_context` they appear in the order in which the
    NextClosure enumeration produces their intents.  Ordering between
    concepts (subconcept ≤ superconcept) is by extent inclusion.
    """

    # All concepts of the lattice; a fresh empty list by default.
    concepts: list[FormalConcept] = field(default_factory=list)

    @property
    def size(self) -> int:
        """Number of formal concepts in the lattice."""
        return len(self.concepts)

    @property
    def top(self) -> Optional[FormalConcept]:
        """Supremum: the concept with the largest extent, or ``None`` if empty.

        On ties the earliest concept in the list wins.
        """
        if self.concepts:
            return max(self.concepts, key=lambda concept: concept.extent_size)
        return None

    @property
    def bottom(self) -> Optional[FormalConcept]:
        """Infimum: the concept with the largest intent, or ``None`` if empty.

        On ties the earliest concept in the list wins.
        """
        if self.concepts:
            return max(self.concepts, key=lambda concept: concept.intent_size)
        return None

    @classmethod
    def from_context(cls, context: FormalContext) -> ConceptLattice:
        """Enumerate every formal concept of *context* with NextClosure.

        Starting from the closure of the empty attribute set, each
        following intent is obtained from ``_next_closure`` until the full
        attribute set has been reached (or no successor exists).
        """
        ordered_attrs = context.attributes  # sorted, fixed order
        if not ordered_attrs:
            # Degenerate: no attributes → single concept with all objects
            only_concept = FormalConcept(
                extent=frozenset(context.objects),
                intent=frozenset(),
            )
            return cls(concepts=[only_concept])

        # The enumeration stops once the intent equals the full attribute set.
        full_intent = frozenset(ordered_attrs)

        intent = context.closure(frozenset())
        found: list[FormalConcept] = [
            FormalConcept(extent=context.extent(intent), intent=intent)
        ]

        while intent != full_intent:
            intent = _next_closure(intent, ordered_attrs, context.closure)
            if intent is None:
                break
            found.append(
                FormalConcept(extent=context.extent(intent), intent=intent)
            )

        return cls(concepts=found)

    def gap_concepts(self) -> list[FormalConcept]:
        """Return the concepts whose extent is empty."""
        return self.concepts_with_extent_size(min_size=0, max_size=0)

    def concepts_with_extent_size(self, min_size: int = 0, max_size: Optional[int] = None) -> list[FormalConcept]:
        """Return the concepts whose extent size lies in ``[min_size, max_size]``.

        Args:
            min_size: Inclusive lower bound on the extent size.
            max_size: Inclusive upper bound; ``None`` means no upper bound.
        """
        return [
            concept
            for concept in self.concepts
            if concept.extent_size >= min_size
            and (max_size is None or concept.extent_size <= max_size)
        ]

    def depth(self) -> int:
        """Return the number of concepts in the longest chain.

        A chain is a sequence of concepts c_1 < c_2 < ... < c_k
        where < means strict subconcept (strict extent inclusion).
        An empty lattice has depth 0.
        """
        if not self.concepts:
            return 0

        # A strict subset is always strictly smaller, so after sorting by
        # size every possible predecessor of an extent sits to its left.
        by_size = sorted((concept.extent for concept in self.concepts), key=len)

        # chain[k]: length of the longest chain that ends at by_size[k].
        chain = [1] * len(by_size)
        for upper, larger in enumerate(by_size):
            for lower in range(upper):
                if by_size[lower] < larger and chain[lower] + 1 > chain[upper]:
                    chain[upper] = chain[lower] + 1

        return max(chain)


def find_gap_concepts(
    context: FormalContext,
    lattice: Optional[ConceptLattice] = None,
) -> list[FormalConcept]:
    """Return the empty-extent concepts (coverage gaps) of a lattice.

    These are the concepts reported by ``lattice.gap_concepts()``:
    attribute combinations present in the lattice that no object
    possesses.

    Args:
        context: The formal context.  Only consulted when *lattice* is
            ``None``.
        lattice: Pre-computed lattice.  If ``None``, it is computed from
            *context*.

    Returns:
        A list of :class:`FormalConcept` with empty extent, ordered by
        ascending intent size (gaps with the fewest attributes first).
        Concepts with equal intent size keep their relative order.
    """
    source = ConceptLattice.from_context(context) if lattice is None else lattice
    return sorted(source.gap_concepts(), key=lambda gap: gap.intent_size)


def find_empty_cells(
    context: FormalContext,
    dimension_a: Iterable[str],
    dimension_b: Iterable[str],
) -> list[tuple[str, str]]:
    """Find empty cells in a two-dimensional cross-tabulation.

    Given two collections of attributes (e.g. domain values and VSM
    systems), return the pairs ``(attr_a, attr_b)`` for which
    ``context.extent([attr_a, attr_b])`` is empty, i.e. no object
    possesses both attributes.

    This is a simpler alternative to full FCA for two-dimensional
    coverage analysis.

    Args:
        context: The formal context to query.
        dimension_a: Attributes forming the first axis.
        dimension_b: Attributes forming the second axis.

    Returns:
        The empty pairs, ordered by sorted *dimension_a* and, within the
        same first attribute, by sorted *dimension_b*.
    """
    # Sort each axis exactly once.  Besides avoiding a re-sort of
    # dimension_b for every element of dimension_a, this keeps the result
    # complete when a one-shot iterable is passed in.
    axis_a = sorted(dimension_a)
    axis_b = sorted(dimension_b)

    empty: list[tuple[str, str]] = []
    for a in axis_a:
        for b in axis_b:
            if not context.extent([a, b]):
                empty.append((a, b))
    return empty


# ── NextClosure internals ───────────────────────────────────────────


def _next_closure(
    current: frozenset[str],
    attrs: list[str],
    closure_fn,
) -> Optional[frozenset[str]]:
    """Return the closed set following *current* in lectic order.

    Implements one step of Ganter's NextClosure algorithm.  The
    attributes are visited from the last position of *attrs* to the
    first.  Attributes already in the working set are removed; for any
    other attribute the working set plus that attribute is closed with
    *closure_fn*, and the closure is accepted when it introduces no
    attribute located earlier in *attrs*.

    Args:
        current: The closed attribute set to advance from.
        attrs: All attributes in the fixed order defining the lectic order.
        closure_fn: Callable mapping an attribute set to its closure.

    Returns:
        The next closed set, or ``None`` when no position yields one.
    """
    working = current
    for position in reversed(range(len(attrs))):
        attr = attrs[position]
        if attr in working:
            # Drop it and keep moving towards the front of the ordering.
            working = working - {attr}
            continue

        seed = working | {attr}
        closed = closure_fn(seed)
        # Canonicity test: reject the closure if it pulled in any
        # attribute that sits before `position` and was not in the seed.
        pulled_in_earlier = any(
            earlier in closed and earlier not in seed
            for earlier in attrs[:position]
        )
        if not pulled_in_earlier:
            return closed
    return None