feat(proxy): add proxy file system for non-markdown source conversion
Introduces a new `markitect/proxy/` module with pluggable extractors that convert non-markdown sources (PDF, HTML) into tracked markdown proxy files. Proxy files preserve origin metadata (path, checksum, timestamp) so they can be kept in sync when the original changes. CLI commands: `proxy create`, `proxy update`, `proxy status`, `proxy extractors`. Built-in extractors: PDF (pymupdf4llm), HTML (markdownify), Markdown (built-in). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
65
markitect/proxy/registry.py
Normal file
65
markitect/proxy/registry.py
Normal file
@@ -0,0 +1,65 @@
|
||||
"""
|
||||
Extractor registry — register and look up extractors by file extension.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Dict, List, TYPE_CHECKING
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from markitect.proxy.exceptions import ExtractorNotFoundError
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from markitect.proxy.extractors.base import BaseExtractor
|
||||
|
||||
# Module logger; ExtractorRegistry.register() emits one debug record per extension.
logger = logging.getLogger("markitect.proxy.registry")
|
||||
|
||||
|
||||
class ExtractorRegistry:
    """Maps file extensions to their corresponding extractors.

    Extensions are stored lower-cased, so registration and lookup are
    case-insensitive. Registering an extension a second time replaces the
    extractor previously stored for it.
    """

    def __init__(self):
        # Lower-cased extension (as declared by the extractor) -> extractor.
        self._extractors: Dict[str, BaseExtractor] = {}

    def register(self, extractor: BaseExtractor) -> None:
        """Register an extractor for all of its declared extensions.

        Args:
            extractor: Object exposing ``extensions`` (an iterable of
                extension strings) and ``name`` (used in the debug log).
        """
        for ext in extractor.extensions:
            ext_lower = ext.lower()
            self._extractors[ext_lower] = extractor
            logger.debug("Registered %s extractor for %s", extractor.name, ext_lower)

    def get_extractor(self, extension: str) -> BaseExtractor:
        """Look up an extractor by file extension (e.g. ``'.pdf'``).

        Args:
            extension: Extension to look up; matched case-insensitively.

        Returns:
            The extractor registered for the extension.

        Raises:
            ExtractorNotFoundError: If no extractor handles the extension.
        """
        ext_lower = extension.lower()
        if ext_lower not in self._extractors:
            # An empty registry would otherwise leave the message dangling
            # after "Supported extensions: ".
            known = ", ".join(sorted(self._extractors)) or "(none)"
            raise ExtractorNotFoundError(
                f"No extractor registered for {ext_lower!r}. "
                f"Supported extensions: {known}",
                context={"extension": ext_lower},
            )
        return self._extractors[ext_lower]

    def get_extractor_for_file(self, path: Path | str) -> BaseExtractor:
        """Look up an extractor for a file based on its suffix.

        Args:
            path: File location. Objects that already expose ``.suffix``
                (such as ``Path``) are used as-is; anything else, e.g. a
                plain string, is converted with ``Path(...)`` first.

        Returns:
            The extractor registered for the file's suffix.

        Raises:
            ExtractorNotFoundError: If no extractor handles the suffix
                (including files that have no suffix at all).
        """
        if not hasattr(path, "suffix"):
            # Plain strings / os.PathLike values have no ``.suffix``.
            path = Path(path)
        return self.get_extractor(path.suffix)

    def list_extractors(self) -> List[BaseExtractor]:
        """Return a de-duplicated list of registered extractors.

        An extractor registered under several extensions appears once, in
        the order it was first stored. De-duplication is by object identity,
        so extractors do not need to be hashable.
        """
        unique: Dict[int, BaseExtractor] = {}
        for extractor in self._extractors.values():
            unique.setdefault(id(extractor), extractor)
        return list(unique.values())
|
||||
|
||||
|
||||
# Module-level singleton
|
||||
# Created empty at import time; extractors are added via registry.register().
registry = ExtractorRegistry()
|
||||
Reference in New Issue
Block a user