Introduces a new `markitect/proxy/` module with pluggable extractors that convert non-markdown sources (PDF, HTML) into tracked markdown proxy files. Proxy files preserve origin metadata (path, checksum, timestamp) so they can be kept in sync when the original changes. CLI commands: `proxy create`, `proxy update`, `proxy status`, `proxy extractors`. Built-in extractors: PDF (pymupdf4llm), HTML (markdownify), Markdown (built-in). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
66 lines
2.2 KiB
Python
66 lines
2.2 KiB
Python
"""
|
|
Extractor registry — register and look up extractors by file extension.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from typing import Dict, List, TYPE_CHECKING
|
|
|
|
from pathlib import Path
|
|
|
|
from markitect.proxy.exceptions import ExtractorNotFoundError
|
|
|
|
if TYPE_CHECKING:
|
|
from markitect.proxy.extractors.base import BaseExtractor
|
|
|
|
# Logger for registry events. The name is hard-coded as a dotted string rather
# than derived from ``__name__``.
logger = logging.getLogger("markitect.proxy.registry")
|
|
|
|
|
|
class ExtractorRegistry:
    """Maps file extensions to their corresponding extractors.

    Extensions are stored in a canonical form: lower-cased and with a leading
    dot (``'.pdf'``), which is the form ``Path.suffix`` produces. Both
    registration and lookup go through the same normalisation, so
    ``'pdf'``, ``'.pdf'`` and ``'.PDF'`` all address the same entry.
    """

    def __init__(self) -> None:
        # Canonical extension -> extractor. One extractor may appear under
        # several keys when it declares more than one extension.
        self._extractors: Dict[str, BaseExtractor] = {}

    @staticmethod
    def _normalize_extension(extension: str) -> str:
        """Return *extension* lower-cased and prefixed with a dot.

        An empty string (a path with no suffix) is returned unchanged so it
        never collides with a real extension.
        """
        ext = extension.lower()
        if ext and not ext.startswith("."):
            ext = "." + ext
        return ext

    def register(self, extractor: BaseExtractor) -> None:
        """Register an extractor for all of its declared extensions.

        Each entry of ``extractor.extensions`` is normalised before being
        stored. Registering a second extractor for an extension that is
        already present replaces the earlier one.

        Args:
            extractor: Object exposing ``extensions`` (iterable of str) and
                ``name`` (used only for the debug log line).
        """
        for declared in extractor.extensions:
            key = self._normalize_extension(declared)
            self._extractors[key] = extractor
            logger.debug("Registered %s extractor for %s", extractor.name, key)

    def get_extractor(self, extension: str) -> BaseExtractor:
        """Look up an extractor by file extension (e.g. ``'.pdf'``).

        The lookup is case-insensitive and the leading dot is optional.

        Args:
            extension: File extension to resolve.

        Returns:
            The extractor registered for the extension.

        Raises:
            ExtractorNotFoundError: If no extractor handles the extension.
        """
        key = self._normalize_extension(extension)
        extractor = self._extractors.get(key)
        if extractor is None:
            known = ", ".join(sorted(self._extractors))
            raise ExtractorNotFoundError(
                f"No extractor registered for {key!r}. "
                f"Supported extensions: {known}",
                context={"extension": key},
            )
        return extractor

    def get_extractor_for_file(self, path: Path) -> BaseExtractor:
        """Look up an extractor for a file based on its suffix.

        Only the final suffix is considered (``Path.suffix``), so
        ``archive.tar.gz`` resolves via ``'.gz'``.

        Args:
            path: File location; a ``Path`` or anything ``Path()`` accepts,
                such as a plain string.

        Raises:
            ExtractorNotFoundError: If no extractor handles the suffix.
        """
        # Coerce so that callers passing a plain string do not hit an
        # AttributeError on ``.suffix``.
        return self.get_extractor(Path(path).suffix)

    def list_extractors(self) -> List[BaseExtractor]:
        """Return a de-duplicated list of registered extractors.

        An extractor registered under several extensions appears once, at the
        position of its first extension in registration order. De-duplication
        is by object identity, not equality.
        """
        unique: Dict[int, BaseExtractor] = {}
        for extractor in self._extractors.values():
            unique.setdefault(id(extractor), extractor)
        return list(unique.values())
|
|
|
|
|
|
# Module-level singleton
|
|
# Created empty at import time: no extractor is registered in this module, so
# lookups on this instance raise ExtractorNotFoundError until register() has
# been called on it.
registry = ExtractorRegistry()
|