# markitect-main/markitect/spaces/transclusion/reference_graph.py

"""
Reference graph for transclusion dependency tracking.

This module provides a graph-based system for tracking which documents
reference which other documents, enabling efficient cache invalidation.
"""

from collections import defaultdict, deque
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Set

from ..models import TransclusionReference
from ..repositories.interfaces import IReferenceRepository


@dataclass
class DependencyNode:
    """
    One document's entry in the dependency graph.

    Edges are stored in both directions so they can be walked either way:
    ``references`` holds the IDs of the documents this one includes, and
    ``dependents`` holds the IDs of the documents that include this one.
    """

    document_id: str
    space_id: str
    # Outgoing edges: IDs of the documents this document includes
    references: Set[str] = field(default_factory=set)
    # Incoming edges: IDs of the documents that include this document
    dependents: Set[str] = field(default_factory=set)

class ReferenceGraph:
    """
    In-memory graph of document dependencies.

    Used for efficient cache invalidation by tracking which documents
    depend on which other documents. Every edge is recorded on both of its
    endpoints, so "what does X include" and "what includes X" are both
    direct lookups.

    Nodes are keyed by document ID alone, so a document stays associated
    with the space it was first registered under.
    """

    def __init__(self) -> None:
        """Initialize an empty reference graph."""
        # Map of document_id -> DependencyNode
        self._nodes: Dict[str, DependencyNode] = {}
        # Map of space_id -> set of document_ids in that space
        self._spaces: Dict[str, Set[str]] = defaultdict(set)

    def _get_or_create_node(self, document_id: str, space_id: str) -> DependencyNode:
        """
        Get the node for a document, creating and registering it if needed.

        Args:
            document_id: The document ID
            space_id: The space to register a newly created node under;
                ignored when the document already has a node

        Returns:
            The existing or newly created node
        """
        node = self._nodes.get(document_id)
        if node is None:
            node = DependencyNode(document_id=document_id, space_id=space_id)
            self._nodes[document_id] = node
            self._spaces[space_id].add(document_id)
        return node

    def add_reference(
        self,
        source_doc_id: str,
        target_doc_id: str,
        space_id: str,
    ) -> None:
        """
        Add a reference from source to target document.

        Nodes are created on demand for either endpoint. Adding the same
        reference twice has no further effect because edges are stored in sets.

        Args:
            source_doc_id: The document doing the including
            target_doc_id: The document being included
            space_id: The space ID used for any node that has to be created
        """
        source_node = self._get_or_create_node(source_doc_id, space_id)
        target_node = self._get_or_create_node(target_doc_id, space_id)

        # Record the edge on both endpoints so it can be walked either way
        source_node.references.add(target_doc_id)
        target_node.dependents.add(source_doc_id)

    def remove_reference(
        self,
        source_doc_id: str,
        target_doc_id: str,
    ) -> None:
        """
        Remove a reference from source to target.

        Unknown documents and missing edges are ignored. Both nodes stay in
        the graph even if they end up with no edges.

        Args:
            source_doc_id: The source document
            target_doc_id: The target document
        """
        source_node = self._nodes.get(source_doc_id)
        if source_node is not None:
            source_node.references.discard(target_doc_id)

        target_node = self._nodes.get(target_doc_id)
        if target_node is not None:
            target_node.dependents.discard(source_doc_id)

    def clear_references_from(self, source_doc_id: str) -> List[str]:
        """
        Clear all references from a source document.

        The source node itself, and anything that depends on it, is kept.

        Args:
            source_doc_id: The source document

        Returns:
            List of target document IDs that were referenced (empty if the
            document is not tracked)
        """
        node = self._nodes.get(source_doc_id)
        if node is None:
            return []

        # Snapshot first: the set is emptied below
        targets = list(node.references)

        # Remove the reverse edge from every target
        for target_id in targets:
            target_node = self._nodes.get(target_id)
            if target_node is not None:
                target_node.dependents.discard(source_doc_id)

        node.references.clear()
        return targets

    def get_references(self, document_id: str) -> Set[str]:
        """
        Get all documents referenced by a document.

        Args:
            document_id: The document ID

        Returns:
            Set of referenced document IDs; a copy, so callers may mutate it
        """
        node = self._nodes.get(document_id)
        if node is None:
            return set()
        return node.references.copy()

    def get_dependents(self, document_id: str) -> Set[str]:
        """
        Get all documents that depend on (reference) a document.

        Args:
            document_id: The document ID

        Returns:
            Set of dependent document IDs; a copy, so callers may mutate it
        """
        node = self._nodes.get(document_id)
        if node is None:
            return set()
        return node.dependents.copy()

    def get_transitive_dependents(self, document_id: str) -> Set[str]:
        """
        Get all documents that directly or indirectly depend on a document.

        Performs a breadth-first traversal of the dependency graph. Cycles
        are handled: each document is expanded at most once, and the starting
        document is never part of the result.

        Args:
            document_id: The document ID

        Returns:
            Set of all transitive dependent document IDs
        """
        result: Set[str] = set()
        # Seeding ``visited`` with the start document keeps it out of the
        # result even when it sits on a reference cycle
        visited = {document_id}
        # deque gives O(1) pops from the front; list.pop(0) shifts every
        # remaining element on each pop
        to_visit = deque(self.get_dependents(document_id))

        while to_visit:
            current = to_visit.popleft()
            if current in visited:
                # Queued more than once before it was first expanded
                continue
            visited.add(current)
            result.add(current)

            # Add this document's dependents to the visit queue
            to_visit.extend(
                dependent
                for dependent in self.get_dependents(current)
                if dependent not in visited
            )

        return result

    def get_documents_in_space(self, space_id: str) -> Set[str]:
        """
        Get all document IDs tracked in a space.

        Args:
            space_id: The space ID

        Returns:
            Set of document IDs; a copy, empty for an unknown space
        """
        # .get() rather than indexing, so a lookup never creates an entry
        # in the defaultdict
        return self._spaces.get(space_id, set()).copy()

    def remove_document(self, document_id: str) -> None:
        """
        Remove a document and all its references from the graph.

        Edges are detached in both directions. Untracked documents are
        ignored.

        Args:
            document_id: The document ID to remove
        """
        node = self._nodes.get(document_id)
        if node is None:
            return

        # Remove this document from all its targets' dependent sets
        for target_id in node.references:
            target_node = self._nodes.get(target_id)
            if target_node is not None:
                target_node.dependents.discard(document_id)

        # Remove this document from all its dependents' reference sets
        for dependent_id in node.dependents:
            dependent_node = self._nodes.get(dependent_id)
            if dependent_node is not None:
                dependent_node.references.discard(document_id)

        # Remove from space tracking, dropping the space entry once it is
        # empty so emptied spaces do not accumulate in a long-lived graph
        space_docs = self._spaces.get(node.space_id)
        if space_docs is not None:
            space_docs.discard(document_id)
            if not space_docs:
                del self._spaces[node.space_id]

        # Delete the node
        del self._nodes[document_id]

    def clear_space(self, space_id: str) -> None:
        """
        Remove every document registered under a space from the graph.

        Args:
            space_id: The space ID
        """
        # Iterate over a snapshot: remove_document mutates the tracked set
        for doc_id in list(self._spaces.get(space_id, ())):
            self.remove_document(doc_id)


class PersistentReferenceGraph(ReferenceGraph):
    """
    Reference graph that mirrors its writes into a reference repository.

    Adding references and clearing a source's references update the
    in-memory graph first and then the repository, so the recorded
    dependencies can outlive the process. The other mutating methods are
    inherited unchanged and only affect the in-memory graph.
    """

    def __init__(
        self,
        space_id: str,
        reference_repo: IReferenceRepository,
        load_on_init: bool = True,
    ):
        """
        Create a persistent reference graph bound to one space.

        Args:
            space_id: Default space ID used for repository calls
            reference_repo: Repository the references are persisted to
            load_on_init: Whether to call the repository load hook right away
        """
        super().__init__()
        self._reference_repo = reference_repo
        self.space_id = space_id

        if load_on_init:
            self._load_from_repository()

    def _load_from_repository(self) -> None:
        """
        Load all references from the repository.

        Currently does nothing: the repository has no method for listing
        every reference in a space, so the graph starts empty and is built
        up incrementally through add_reference.
        """
        # Placeholder until the repository can list references; loading
        # lazily or through a more efficient query are options for later
        return None

    def add_reference(
        self,
        source_doc_id: str,
        target_doc_id: str,
        space_id: Optional[str] = None,
    ) -> None:
        """
        Record a reference in memory and write it to the repository.

        Args:
            source_doc_id: Source document ID
            target_doc_id: Target document ID
            space_id: Optional space ID override; any falsy value (None or
                an empty string) falls back to this graph's own space
        """
        effective_space = space_id if space_id else self.space_id

        # In-memory graph first, then the persisted copy
        super().add_reference(source_doc_id, target_doc_id, effective_space)
        self._reference_repo.add_reference(
            TransclusionReference(
                source_doc_id=source_doc_id,
                target_doc_id=target_doc_id,
                space_id=effective_space,
            )
        )

    def clear_references_from(self, source_doc_id: str) -> List[str]:
        """
        Drop every reference made by a source document, in memory and in
        the repository.

        The repository is always asked to clear under this graph's own
        space ID, and is asked even when the document is not tracked in
        memory.

        Args:
            source_doc_id: Source document ID

        Returns:
            List of target document IDs cleared from the in-memory graph
        """
        cleared_targets = super().clear_references_from(source_doc_id)
        # NOTE(review): references added with a space_id override were
        # persisted under that override; confirm this call reaches them
        self._reference_repo.clear_references_from(source_doc_id, self.space_id)
        return cleared_targets

    def get_dependents_from_repo(self, document_id: str) -> List[str]:
        """
        Ask the repository, rather than the in-memory graph, for dependents.

        Useful when the in-memory graph may not be fully loaded.

        Args:
            document_id: The document ID

        Returns:
            Whatever the repository returns for this document in this
            graph's space (annotated as a list of dependent document IDs)
        """
        repo = self._reference_repo
        return repo.get_dependents(document_id, self.space_id)

    def sync_with_repository(self) -> None:
        """
        Reset the in-memory graph for this space.

        Intended to re-sync with the repository, but the reload half is not
        implemented because it needs a way to list all references in a
        space. After this call the in-memory graph tracks no documents for
        this space until references are added again during document
        processing. The repository itself is not modified.
        """
        # Drop everything tracked in memory for this space; nothing is
        # reloaded afterwards
        self.clear_space(self.space_id)