"""
Extractor registry — register and look up extractors by file extension.
"""

from __future__ import annotations

import logging
from typing import Dict, List, TYPE_CHECKING
from pathlib import Path

from markitect.proxy.exceptions import ExtractorNotFoundError

if TYPE_CHECKING:
    from markitect.proxy.extractors.base import BaseExtractor

logger = logging.getLogger("markitect.proxy.registry")


class ExtractorRegistry:
    """Lookup table from lower-cased file extension to extractor instance."""

    def __init__(self):
        # Keys are the lower-cased extension strings exactly as declared by
        # the extractor; values are the extractor objects themselves.
        self._extractors: Dict[str, BaseExtractor] = {}

    def register(self, extractor: BaseExtractor) -> None:
        """Store *extractor* under every extension it declares.

        Each entry of ``extractor.extensions`` is lower-cased before being
        used as a key. Registering a second extractor for an extension that
        is already present replaces the earlier entry.
        """
        for key in (declared.lower() for declared in extractor.extensions):
            self._extractors[key] = extractor
            logger.debug("Registered %s extractor for %s", extractor.name, key)

    def get_extractor(self, extension: str) -> BaseExtractor:
        """Return the extractor registered for *extension* (e.g. ``'.pdf'``).

        The lookup is case-insensitive: *extension* is lower-cased first.

        Raises:
            ExtractorNotFoundError: If nothing is registered for the
                extension. The message lists the registered extensions in
                sorted order.
        """
        ext_lower = extension.lower()
        if ext_lower in self._extractors:
            return self._extractors[ext_lower]

        known = ", ".join(sorted(self._extractors))
        raise ExtractorNotFoundError(
            f"No extractor registered for {ext_lower!r}. "
            f"Supported extensions: {known}",
            context={"extension": ext_lower},
        )

    def get_extractor_for_file(self, path: Path) -> BaseExtractor:
        """Return the extractor matching the suffix of *path*.

        Delegates to :meth:`get_extractor` with ``path.suffix``, so the same
        ``ExtractorNotFoundError`` is raised for unknown suffixes.
        """
        suffix = path.suffix
        return self.get_extractor(suffix)

    def list_extractors(self) -> List[BaseExtractor]:
        """Return every registered extractor exactly once.

        An extractor registered under several extensions appears a single
        time. Uniqueness is decided by object identity, and the result keeps
        the order in which extractors are first met in the mapping.
        """
        # Dicts keep insertion order and ``setdefault`` never overwrites, so
        # the first occurrence of each identity wins.
        unique: Dict[int, BaseExtractor] = {}
        for candidate in self._extractors.values():
            unique.setdefault(id(candidate), candidate)
        return list(unique.values())


# Module-level singleton
registry = ExtractorRegistry()