"""
Reference graph for transclusion dependency tracking.

This module provides a graph-based system for tracking which documents
reference which other documents, enabling efficient cache invalidation.
"""

from collections import defaultdict, deque
from typing import Dict, List, Set, Optional
from dataclasses import dataclass, field

from ..models import TransclusionReference
from ..repositories.interfaces import IReferenceRepository


@dataclass
class DependencyNode:
    """
    A node in the dependency graph representing a document.

    Tracks both outgoing references (what this document includes)
    and incoming references (what includes this document).
    """

    document_id: str
    space_id: str
    # Documents this document references (includes)
    references: Set[str] = field(default_factory=set)
    # Documents that reference (include) this document
    dependents: Set[str] = field(default_factory=set)


class ReferenceGraph:
    """
    In-memory graph of document dependencies.

    Used for efficient cache invalidation by tracking which documents
    depend on which other documents. Every edge is stored twice: in the
    source node's ``references`` and in the target node's ``dependents``,
    so lookups in either direction are a single dictionary access.
    """

    def __init__(self) -> None:
        """Initialize an empty reference graph."""
        # Map of document_id -> DependencyNode
        self._nodes: Dict[str, DependencyNode] = {}
        # Map of space_id -> set of document_ids in that space
        self._spaces: Dict[str, Set[str]] = defaultdict(set)

    def _get_or_create_node(self, document_id: str, space_id: str) -> DependencyNode:
        """
        Get or create a node for a document.

        Args:
            document_id: The document ID
            space_id: The space to register the document under. Only used
                when the node is created; an existing node keeps the space
                it was first registered with.

        Returns:
            The existing or newly created node
        """
        if document_id not in self._nodes:
            self._nodes[document_id] = DependencyNode(
                document_id=document_id,
                space_id=space_id,
            )
            self._spaces[space_id].add(document_id)
        return self._nodes[document_id]

    def add_reference(
        self,
        source_doc_id: str,
        target_doc_id: str,
        space_id: str,
    ) -> None:
        """
        Add a reference from source to target document.

        Nodes are created on demand for both documents. Adding the same
        reference twice is a no-op because edges are stored in sets.

        Args:
            source_doc_id: The document doing the including
            target_doc_id: The document being included
            space_id: The space ID
        """
        source_node = self._get_or_create_node(source_doc_id, space_id)
        target_node = self._get_or_create_node(target_doc_id, space_id)

        source_node.references.add(target_doc_id)
        target_node.dependents.add(source_doc_id)

    def remove_reference(
        self,
        source_doc_id: str,
        target_doc_id: str,
    ) -> None:
        """
        Remove a reference from source to target.

        Unknown documents and missing edges are ignored. The nodes
        themselves stay in the graph.

        Args:
            source_doc_id: The source document
            target_doc_id: The target document
        """
        if source_doc_id in self._nodes:
            self._nodes[source_doc_id].references.discard(target_doc_id)

        if target_doc_id in self._nodes:
            self._nodes[target_doc_id].dependents.discard(source_doc_id)

    def clear_references_from(self, source_doc_id: str) -> List[str]:
        """
        Clear all references from a source document.

        Args:
            source_doc_id: The source document

        Returns:
            List of target document IDs that were referenced
            (empty if the document is not in the graph)
        """
        if source_doc_id not in self._nodes:
            return []

        node = self._nodes[source_doc_id]
        # Snapshot before clearing so the caller learns what was removed
        targets = list(node.references)

        # Remove from all targets' dependent lists
        for target_id in targets:
            if target_id in self._nodes:
                self._nodes[target_id].dependents.discard(source_doc_id)

        node.references.clear()
        return targets

    def get_references(self, document_id: str) -> Set[str]:
        """
        Get all documents referenced by a document.

        Args:
            document_id: The document ID

        Returns:
            Set of referenced document IDs. This is a copy; mutating it
            does not affect the graph.
        """
        if document_id not in self._nodes:
            return set()
        return self._nodes[document_id].references.copy()

    def get_dependents(self, document_id: str) -> Set[str]:
        """
        Get all documents that depend on (reference) a document.

        Args:
            document_id: The document ID

        Returns:
            Set of dependent document IDs. This is a copy; mutating it
            does not affect the graph.
        """
        if document_id not in self._nodes:
            return set()
        return self._nodes[document_id].dependents.copy()

    def get_transitive_dependents(self, document_id: str) -> Set[str]:
        """
        Get all documents that directly or indirectly depend on a document.

        Performs a breadth-first traversal of the dependency graph.
        Cycles are handled: each document is visited at most once and the
        starting document is never part of the result.

        Args:
            document_id: The document ID

        Returns:
            Set of all transitive dependent document IDs
        """
        result: Set[str] = set()
        # Seed with the start document so a cycle back to it is not reported
        visited = {document_id}
        # deque gives O(1) pops from the left; list.pop(0) is O(n)
        to_visit = deque(self.get_dependents(document_id))

        while to_visit:
            current = to_visit.popleft()
            if current in visited:
                continue

            visited.add(current)
            result.add(current)

            # Add this document's dependents to visit list
            to_visit.extend(
                dependent
                for dependent in self.get_dependents(current)
                if dependent not in visited
            )

        return result

    def get_documents_in_space(self, space_id: str) -> Set[str]:
        """
        Get all document IDs tracked in a space.

        Args:
            space_id: The space ID

        Returns:
            Set of document IDs (a copy; empty for an unknown space)
        """
        # .get() avoids creating an empty bucket in the defaultdict
        return self._spaces.get(space_id, set()).copy()

    def remove_document(self, document_id: str) -> None:
        """
        Remove a document and all its references from the graph.

        Both outgoing and incoming edges are detached from the
        neighbouring nodes. Unknown documents are ignored.

        Args:
            document_id: The document ID to remove
        """
        if document_id not in self._nodes:
            return

        node = self._nodes[document_id]

        # Remove this document from all its targets' dependent lists
        for target_id in node.references:
            if target_id in self._nodes:
                self._nodes[target_id].dependents.discard(document_id)

        # Remove this document from all its dependents' reference lists
        for dependent_id in node.dependents:
            if dependent_id in self._nodes:
                self._nodes[dependent_id].references.discard(document_id)

        # Remove from space tracking, dropping the bucket once it is empty
        # so long-lived graphs do not accumulate empty sets
        space_docs = self._spaces.get(node.space_id)
        if space_docs is not None:
            space_docs.discard(document_id)
            if not space_docs:
                del self._spaces[node.space_id]

        # Delete the node
        del self._nodes[document_id]

    def clear_space(self, space_id: str) -> None:
        """
        Clear all references for documents in a space.

        Every document registered under the space is removed from the
        graph via ``remove_document``.

        Args:
            space_id: The space ID
        """
        # Copy first: remove_document mutates the underlying set
        doc_ids = list(self._spaces.get(space_id, set()))
        for doc_id in doc_ids:
            self.remove_document(doc_id)


class PersistentReferenceGraph(ReferenceGraph):
    """
    Reference graph backed by persistent storage.

    Extends ReferenceGraph to persist references to a repository,
    enabling cache invalidation across restarts.

    NOTE: only ``add_reference`` and ``clear_references_from`` write
    through to the repository here. ``remove_reference``,
    ``remove_document`` and ``clear_space`` are inherited unchanged and
    therefore only touch the in-memory graph.
    """

    def __init__(
        self,
        space_id: str,
        reference_repo: IReferenceRepository,
        load_on_init: bool = True,
    ) -> None:
        """
        Initialize a persistent reference graph.

        Args:
            space_id: The space ID
            reference_repo: Repository for persisting references
            load_on_init: Whether to load existing references on init
        """
        super().__init__()
        self.space_id = space_id
        self._reference_repo = reference_repo

        if load_on_init:
            self._load_from_repository()

    def _load_from_repository(self) -> None:
        """
        Load all references from the repository.

        Currently a no-op: the graph is built incrementally during
        document processing instead.
        """
        # Get all documents in space and their references
        # This is a simplified approach - in production you might want
        # to load lazily or use a more efficient query
        pass  # Repository doesn't have a list_all method, would need to enhance

    def add_reference(
        self,
        source_doc_id: str,
        target_doc_id: str,
        space_id: Optional[str] = None,
    ) -> None:
        """
        Add a reference and persist it.

        Args:
            source_doc_id: Source document ID
            target_doc_id: Target document ID
            space_id: Optional space ID override. ``None`` (or an empty
                string) falls back to the graph's own space ID.
        """
        space = space_id or self.space_id

        # Update in-memory graph
        super().add_reference(source_doc_id, target_doc_id, space)

        # Persist to repository
        ref = TransclusionReference(
            source_doc_id=source_doc_id,
            target_doc_id=target_doc_id,
            space_id=space,
        )
        self._reference_repo.add_reference(ref)

    def clear_references_from(self, source_doc_id: str) -> List[str]:
        """
        Clear references from source and persist.

        The repository is always asked to clear, even when the document
        is not present in the in-memory graph.

        Args:
            source_doc_id: Source document ID

        Returns:
            List of cleared target document IDs (as known to the
            in-memory graph)
        """
        # Clear from in-memory graph
        targets = super().clear_references_from(source_doc_id)

        # Clear from repository
        self._reference_repo.clear_references_from(source_doc_id, self.space_id)

        return targets

    def get_dependents_from_repo(self, document_id: str) -> List[str]:
        """
        Get dependents directly from repository.

        Useful when graph may not be fully loaded.

        Args:
            document_id: The document ID

        Returns:
            List of dependent document IDs
        """
        return self._reference_repo.get_dependents(document_id, self.space_id)

    def sync_with_repository(self) -> None:
        """
        Sync in-memory graph with repository.

        Useful after batch operations or to ensure consistency.

        The in-memory state for this graph's space is discarded and then
        reloaded through ``_load_from_repository``. The repository itself
        is not modified. While ``_load_from_repository`` remains
        unimplemented, this leaves the in-memory graph empty for the
        space until references are added again.
        """
        # Clear in-memory graph for this space
        self.clear_space(self.space_id)

        # Reload from repository
        # Note: This would need a method to list all references in space
        # For now, the graph is built incrementally during document processing
        self._load_from_repository()