Files
markitect-main/markitect/proxy/registry.py
tegwick ac334c679d feat(proxy): add proxy file system for non-markdown source conversion
Introduces a new `markitect/proxy/` module with pluggable extractors that
convert non-markdown sources (PDF, HTML) into tracked markdown proxy files.
Proxy files preserve origin metadata (path, checksum, timestamp) so they
can be kept in sync when the original changes.

CLI commands: `proxy create`, `proxy update`, `proxy status`, `proxy extractors`.
Built-in extractors: PDF (pymupdf4llm), HTML (markdownify), Markdown (built-in).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 19:06:09 +01:00

66 lines
2.2 KiB
Python

"""
Extractor registry — register and look up extractors by file extension.
"""
from __future__ import annotations
import logging
from typing import Dict, List, TYPE_CHECKING
from pathlib import Path
from markitect.proxy.exceptions import ExtractorNotFoundError
if TYPE_CHECKING:
from markitect.proxy.extractors.base import BaseExtractor
# Module-level logger; ExtractorRegistry.register emits a debug record through
# it for every extension it maps to an extractor.
logger = logging.getLogger("markitect.proxy.registry")
class ExtractorRegistry:
    """Maps file extensions to their corresponding extractors.

    Extensions are stored and looked up in one canonical form: lower-cased
    and with a leading dot. ``"PDF"``, ``"pdf"`` and ``".pdf"`` therefore all
    refer to the ``".pdf"`` entry. The leading dot matches the form of
    ``Path.suffix``, which is what :meth:`get_extractor_for_file` passes in.
    """

    def __init__(self):
        # Canonical extension -> extractor. A single extractor instance may be
        # stored under several extensions.
        self._extractors: Dict[str, BaseExtractor] = {}

    @staticmethod
    def _normalize_extension(extension: str) -> str:
        """Return *extension* lower-cased and prefixed with a dot.

        The empty string (e.g. the suffix of a path without an extension) is
        returned unchanged so it never turns into a bare ``"."``.
        """
        normalized = extension.lower()
        if normalized and not normalized.startswith("."):
            normalized = f".{normalized}"
        return normalized

    def register(self, extractor: BaseExtractor) -> None:
        """Register an extractor for all of its declared extensions.

        Each entry of ``extractor.extensions`` is normalized before being
        stored. Registering an extension that is already mapped replaces the
        previous extractor for that extension.

        Args:
            extractor: Object exposing ``extensions`` (iterable of strings)
                and ``name`` (used only for the debug log record).
        """
        for declared in extractor.extensions:
            key = self._normalize_extension(declared)
            self._extractors[key] = extractor
            logger.debug("Registered %s extractor for %s", extractor.name, key)

    def get_extractor(self, extension: str) -> BaseExtractor:
        """Look up an extractor by file extension (e.g. ``'.pdf'``).

        The lookup is case-insensitive and the leading dot is optional.

        Args:
            extension: File extension such as ``'.pdf'``, ``'.PDF'`` or
                ``'pdf'``.

        Returns:
            The extractor registered for the normalized extension.

        Raises:
            ExtractorNotFoundError: If no extractor handles the extension.
        """
        key = self._normalize_extension(extension)
        if key not in self._extractors:
            # List what *is* supported so the caller can correct the input.
            known = ", ".join(sorted(self._extractors))
            raise ExtractorNotFoundError(
                f"No extractor registered for {key!r}. "
                f"Supported extensions: {known}",
                context={"extension": key},
            )
        return self._extractors[key]

    def get_extractor_for_file(self, path: Path) -> BaseExtractor:
        """Look up an extractor for a file based on its suffix.

        Only the final suffix (``path.suffix``) is considered.

        Raises:
            ExtractorNotFoundError: If no extractor handles the suffix
                (including a path that has no suffix at all).
        """
        return self.get_extractor(path.suffix)

    def list_extractors(self) -> List[BaseExtractor]:
        """Return a de-duplicated list of registered extractors.

        An extractor registered under several extensions appears once.
        De-duplication is by object identity, not equality, and the result
        follows the order in which extractors first appear in the registry.
        """
        # Keyed by id() so extractors need not be hashable or comparable;
        # setdefault keeps the first occurrence and dict preserves that order.
        unique: Dict[int, BaseExtractor] = {}
        for extractor in self._extractors.values():
            unique.setdefault(id(extractor), extractor)
        return list(unique.values())
# Module-level singleton: a single ExtractorRegistry instance created when this
# module is imported.
# NOTE(review): presumably extractors are registered on this shared instance
# elsewhere in the package — confirm against callers.
registry = ExtractorRegistry()