feat(proxy): add proxy file system for non-markdown source conversion

Introduces a new `markitect/proxy/` module with pluggable extractors that convert non-markdown sources (PDF, HTML) into tracked markdown proxy files. Proxy files preserve origin metadata (path, checksum, timestamp) so they can be kept in sync when the original changes.

CLI commands: `proxy create`, `proxy update`, `proxy status`, `proxy extractors`.
Built-in extractors: PDF (pymupdf4llm), HTML (markdownify), Markdown (built-in).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 19:06:09 +01:00
parent 69aea1ada7
commit ac334c679d
13 changed files with 781 additions and 0 deletions
--- a/markitect/cli.py
+++ b/markitect/cli.py
@@ -7147,6 +7147,13 @@ try:
 except ImportError:
    pass  # Helper module not available

+# Attach the `proxy` command group to the root CLI when the package imports.
+try:
+    from markitect.proxy.cli import proxy_group
+except ImportError:
+    pass  # Proxy module not available
+else:
+    cli.add_command(proxy_group)
+
 # Make cli function available as main entry point
 main = cli

--- a/markitect/proxy/__init__.py
+++ b/markitect/proxy/__init__.py
@@ -0,0 +1,39 @@
+"""
+markitect.proxy — Proxy file system for wrapping non-markdown sources.
+
+Creates markdown proxy files that track their origin (source path,
+checksum, timestamp) so they can be kept up-to-date when the original
+changes.
+
+Quick start::
+
+    from pathlib import Path
+
+    from markitect.proxy import ProxyGenerator, registry
+
+    # Ensure built-in extractors are registered
+    import markitect.proxy.extractors  # noqa: F401
+
+    gen = ProxyGenerator(registry)
+    gen.create(Path("report.pdf"), Path("./output/"))
+"""
+
+from markitect.proxy.models import ProxyMetadata, ExtractionResult
+from markitect.proxy.exceptions import (
+    ProxyError,
+    ExtractorNotFoundError,
+    DependencyMissingError,
+)
+from markitect.proxy.registry import ExtractorRegistry, registry
+from markitect.proxy.generator import ProxyGenerator
+from markitect.proxy.extractors.base import BaseExtractor
+
+# Public API of the package, grouped by the submodule each name comes from.
+__all__ = [
+    # markitect.proxy.models
+    "ProxyMetadata",
+    "ExtractionResult",
+    # markitect.proxy.exceptions
+    "ProxyError",
+    "ExtractorNotFoundError",
+    "DependencyMissingError",
+    # markitect.proxy.registry (class and module-level singleton)
+    "ExtractorRegistry",
+    "registry",
+    # markitect.proxy.generator
+    "ProxyGenerator",
+    # markitect.proxy.extractors.base
+    "BaseExtractor",
+]
--- a/markitect/proxy/cli.py
+++ b/markitect/proxy/cli.py
@@ -0,0 +1,185 @@
+"""
+Click CLI commands for the proxy file system.
+"""
+
+import json
+from pathlib import Path
+
+import click
+
+from markitect.proxy.exceptions import ProxyError
+
+
+@click.group("proxy")
+def proxy_group():
+    """Proxy file operations — create, update, and manage markdown proxies."""
+
+
+@proxy_group.command("create")
+# SOURCE must be an existing *file*: a directory is rejected by click with a
+# precise message instead of the generator's "Source file does not exist".
+@click.argument("source", type=click.Path(exists=True, dir_okay=False))
+@click.option(
+    "--output-dir", "-o",
+    # The directory may not exist yet (the generator creates it), but it must
+    # not be an existing regular file, which would make mkdir() fail.
+    type=click.Path(file_okay=False),
+    default=".",
+    help="Directory to write the proxy file (default: current dir).",
+)
+@click.option("--force", "-f", is_flag=True, help="Overwrite existing proxy file.")
+def proxy_create(source, output_dir, force):
+    """Create a markdown proxy for SOURCE."""
+    # Lazy imports so the CLI group registers even if deps are absent
+    from markitect.proxy.registry import registry
+    import markitect.proxy.extractors  # noqa: F401  — registers built-ins
+    from markitect.proxy.generator import ProxyGenerator
+
+    gen = ProxyGenerator(registry)
+    try:
+        proxy_path = gen.create(Path(source), Path(output_dir), force=force)
+    except ProxyError as exc:
+        # Report proxy failures on stderr and exit with status 1.
+        click.echo(f"Error: {exc}", err=True)
+        raise SystemExit(1) from exc
+    # Only reached on success; kept outside the try so the handler above
+    # covers nothing but the generator call.
+    click.echo(f"Created proxy: {proxy_path}")
+
+
+@proxy_group.command("update")
+@click.argument("target", type=click.Path(exists=True))
+@click.option("--dry-run", is_flag=True, help="Show what would change without writing.")
+def proxy_update(target, dry_run):
+    """Re-extract a single proxy file or all proxies in a directory."""
+    # Imported lazily, like the other commands, so the group loads cheaply.
+    from markitect.proxy.registry import registry
+    import markitect.proxy.extractors  # noqa: F401
+    from markitect.proxy.generator import ProxyGenerator
+
+    gen = ProxyGenerator(registry)
+    resolved = Path(target).resolve()
+
+    try:
+        # Work out which proxy files to process, then update them in one loop.
+        if resolved.is_file():
+            candidates = [resolved]
+        elif resolved.is_dir():
+            candidates = _find_proxy_files(gen, resolved)
+            if not candidates:
+                click.echo("No proxy files found.")
+                return
+        else:
+            click.echo(f"Error: {target} is not a file or directory.", err=True)
+            raise SystemExit(1)
+
+        for proxy_file in candidates:
+            _update_one(gen, proxy_file, dry_run)
+    except ProxyError as exc:
+        click.echo(f"Error: {exc}", err=True)
+        raise SystemExit(1)
+
+
+@proxy_group.command("status")
+@click.argument("directory", type=click.Path(exists=True), default=".")
+@click.option(
+    "--format", "output_format",
+    type=click.Choice(["table", "json"]),
+    default="table",
+    help="Output format.",
+)
+def proxy_status(directory, output_format):
+    """Show freshness of all proxy files in DIRECTORY (default: current dir)."""
+    from markitect.proxy.registry import registry
+    import markitect.proxy.extractors  # noqa: F401
+    from markitect.proxy.generator import ProxyGenerator
+
+    generator = ProxyGenerator(registry)
+
+    try:
+        statuses = generator.bulk_status(Path(directory))
+    except ProxyError as exc:
+        click.echo(f"Error: {exc}", err=True)
+        raise SystemExit(1)
+
+    # Nothing to report: say so regardless of the requested format.
+    if not statuses:
+        click.echo("No proxy files found.")
+        return
+
+    if output_format != "json":
+        _print_status_table(statuses)
+        return
+
+    click.echo(json.dumps(statuses, indent=2))
+
+
+@proxy_group.command("extractors")
+@click.option(
+    "--format", "output_format",
+    type=click.Choice(["table", "json"]),
+    default="table",
+    help="Output format.",
+)
+def proxy_extractors(output_format):
+    """List registered extractors and their dependency status."""
+    from markitect.proxy.registry import registry
+    import markitect.proxy.extractors  # noqa: F401
+
+    available = registry.list_extractors()
+
+    if output_format != "json":
+        _print_extractor_table(available)
+        return
+
+    # One JSON object per extractor, in registry order.
+    payload = []
+    for extractor in available:
+        payload.append(
+            {
+                "name": extractor.name,
+                "version": extractor.version,
+                "extensions": list(extractor.extensions),
+                "installed": extractor.check_dependencies(),
+                "hint": extractor.dependency_hint(),
+            }
+        )
+    click.echo(json.dumps(payload, indent=2))
+
+
+# ------------------------------------------------------------------
+# helpers
+# ------------------------------------------------------------------
+
+def _update_one(gen, proxy_path, dry_run):
+    """Report on, and unless *dry_run* is set re-extract, one proxy file.
+
+    Prints exactly one indented ``<file name>: <outcome>`` line. Proxies that
+    are already current or whose source is missing are never re-extracted.
+    """
+    name = proxy_path.name
+    state = gen.status(proxy_path)["status"]
+
+    # States that need no work, mapped to the label printed for them.
+    untouched = {"current": "current", "missing-source": "source missing"}
+    if state in untouched:
+        click.echo(f"  {name}: {untouched[state]}")
+        return
+
+    if dry_run:
+        click.echo(f"  {name}: stale (would update)")
+        return
+
+    outcome = "updated" if gen.update(proxy_path) else "current"
+    click.echo(f"  {name}: {outcome}")
+
+
+def _find_proxy_files(gen, directory):
+    """Collect, in sorted order, the ``*.md`` files under *directory* that are proxies.
+
+    The search is recursive. A file counts as a proxy only when its
+    frontmatter could be read and its ``proxy`` key is exactly ``True``.
+    """
+
+    def _is_proxy(candidate):
+        frontmatter, _ = gen._try_read_proxy(candidate)
+        return frontmatter is not None and frontmatter.get("proxy") is True
+
+    return [path for path in sorted(directory.rglob("*.md")) if _is_proxy(path)]
+
+
+def _print_status_table(results):
+    """Print *results* (status dicts) as a fixed-width, left-aligned table."""
+    header = f"{'Proxy':<40} {'Source':<30} {'Status':<15} {'Extractor':<10}"
+    click.echo(header)
+    click.echo("-" * 95)
+    for row in results:
+        # Only the file name of the proxy is shown, not its full path.
+        cells = (
+            f"{Path(row['proxy']).name:<40}",
+            f"{row['source']:<30}",
+            f"{row['status']:<15}",
+            f"{row['extractor']:<10}",
+        )
+        click.echo(" ".join(cells))
+
+
+def _print_extractor_table(extractors):
+    """Print one fixed-width row per extractor: name, version, extensions, status."""
+    header = f"{'Name':<15} {'Version':<10} {'Extensions':<25} {'Status':<10}"
+    click.echo(header)
+    click.echo("-" * 60)
+    for extractor in extractors:
+        joined_exts = ", ".join(extractor.extensions)
+        # "installed" means the extractor's optional dependency is importable.
+        if extractor.check_dependencies():
+            availability = "installed"
+        else:
+            availability = "missing"
+        click.echo(
+            f"{extractor.name:<15} {extractor.version:<10} "
+            f"{joined_exts:<25} {availability:<10}"
+        )
--- a/markitect/proxy/exceptions.py
+++ b/markitect/proxy/exceptions.py
@@ -0,0 +1,40 @@
+"""
+Proxy-specific exceptions.
+
+Extends the MarkitectError hierarchy for proxy file operations.
+"""
+
+from typing import Optional, Dict, Any
+
+from markitect.exceptions import MarkitectError
+
+
+class ProxyError(MarkitectError):
+    """Root of the proxy exception hierarchy; catch this to handle any proxy failure."""
+
+
+class ExtractorNotFoundError(ProxyError):
+    """Raised when a file extension has no registered extractor."""
+
+
+class DependencyMissingError(ProxyError):
+    """An extractor's optional dependency is not installed.
+
+    Attributes:
+        package: The missing Python package name.
+        install_hint: Suggested pip install command.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        package: str = "",
+        install_hint: str = "",
+        cause: Optional[Exception] = None,
+        context: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        """Build the error.
+
+        Args:
+            message: Human-readable description, forwarded to the base class.
+            package: Name of the missing package (empty string if unknown).
+            install_hint: Install instructions to show the user.
+            cause: Optional underlying exception, forwarded to the base class.
+            context: Optional extra details, forwarded to the base class.
+        """
+        # message/cause/context are handled by the base exception; only the
+        # dependency-specific fields are stored on this subclass.
+        super().__init__(message, cause=cause, context=context)
+        self.package = package
+        self.install_hint = install_hint
--- a/markitect/proxy/extractors/__init__.py
+++ b/markitect/proxy/extractors/__init__.py
@@ -0,0 +1,14 @@
+"""
+Built-in extractor registration.
+
+Importing this module registers all built-in extractors with the global registry.
+"""
+
+from markitect.proxy.registry import registry
+from markitect.proxy.extractors.pdf import PdfExtractor
+from markitect.proxy.extractors.html import HtmlExtractor
+from markitect.proxy.extractors.markdown import MarkdownNormalizer
+
+# Instantiate and register each built-in extractor, in this order.
+for _extractor_cls in (PdfExtractor, HtmlExtractor, MarkdownNormalizer):
+    registry.register(_extractor_cls())
+del _extractor_cls  # keep the loop variable out of the module namespace
--- a/markitect/proxy/extractors/base.py
+++ b/markitect/proxy/extractors/base.py
@@ -0,0 +1,43 @@
+"""
+Abstract base class for proxy file extractors.
+"""
+
+from abc import ABC, abstractmethod
+from pathlib import Path
+
+from markitect.proxy.models import ExtractionResult
+
+
+class BaseExtractor(ABC):
+    """Base class that all proxy extractors must implement.
+
+    Subclasses set the class attributes below and implement ``extract`` and
+    ``check_dependencies``; ``dependency_hint`` is optional.
+    """
+
+    # Short identifier of the extractor (the built-ins use e.g. "pdf", "html").
+    name: str = ""
+    # Extractor version string; the built-ins copy it into ExtractionResult.
+    version: str = "1.0"
+    # File suffixes this extractor handles; the built-ins include the leading
+    # dot, e.g. (".html", ".htm").
+    extensions: tuple = ()
+
+    @abstractmethod
+    def extract(self, source_path: Path) -> ExtractionResult:
+        """Extract markdown content from a source file.
+
+        Args:
+            source_path: Path to the source file.
+
+        Returns:
+            ExtractionResult with the extracted markdown content.
+        """
+
+    @abstractmethod
+    def check_dependencies(self) -> bool:
+        """Check whether all required dependencies are available.
+
+        Returns:
+            True if all dependencies are installed, False otherwise.
+        """
+
+    def dependency_hint(self) -> str:
+        """Human-readable install instructions for missing dependencies.
+
+        The default implementation returns an empty string, which suits
+        extractors that have no optional dependencies.
+
+        Returns:
+            A string like ``pip install markitect[proxy-pdf]``.
+        """
+        return ""
--- a/markitect/proxy/extractors/html.py
+++ b/markitect/proxy/extractors/html.py
@@ -0,0 +1,45 @@
+"""
+HTML extractor using markdownify.
+"""
+
+from pathlib import Path
+
+from markitect.proxy.extractors.base import BaseExtractor
+from markitect.proxy.models import ExtractionResult
+from markitect.proxy.exceptions import DependencyMissingError
+
+
+class HtmlExtractor(BaseExtractor):
+    """Converts HTML files to Markdown via markdownify.
+
+    ``markdownify`` is an optional dependency and is imported lazily, so the
+    extractor can be registered and listed even when it is not installed.
+    """
+
+    name = "html"
+    version = "1.0"
+    extensions = (".html", ".htm")
+
+    def check_dependencies(self) -> bool:
+        """Return True when ``markdownify`` can be imported, False otherwise."""
+        try:
+            import markdownify  # noqa: F401
+            return True
+        except ImportError:
+            return False
+
+    def dependency_hint(self) -> str:
+        """Return install instructions for the HTML dependency."""
+        return 'pip install "markitect[proxy-html]"  (or: pip install markdownify)'
+
+    def extract(self, source_path: Path) -> ExtractionResult:
+        """Convert the HTML file at *source_path* to Markdown.
+
+        Args:
+            source_path: Path to the HTML file.
+
+        Returns:
+            ExtractionResult holding the Markdown text (ATX-style headings)
+            plus this extractor's name and version.
+
+        Raises:
+            DependencyMissingError: If ``markdownify`` is not installed.
+        """
+        if not self.check_dependencies():
+            raise DependencyMissingError(
+                "markdownify is required to extract HTML files.",
+                package="markdownify",
+                install_hint=self.dependency_hint(),
+            )
+
+        import markdownify
+
+        # HTML files are often not valid UTF-8 (legacy single-byte encodings
+        # are common). Undecodable bytes become U+FFFD rather than raising
+        # UnicodeDecodeError, which callers that only handle ProxyError
+        # would not catch.
+        html_content = source_path.read_text(encoding="utf-8", errors="replace")
+        md_text = markdownify.markdownify(html_content, heading_style="ATX")
+        return ExtractionResult(
+            content=md_text,
+            extractor=self.name,
+            extractor_version=self.version,
+        )
--- a/markitect/proxy/extractors/markdown.py
+++ b/markitect/proxy/extractors/markdown.py
@@ -0,0 +1,29 @@
+"""
+Markdown normalizer — passes through Markdown with minimal transformation.
+
+No external dependencies required.
+"""
+
+from pathlib import Path
+
+from markitect.proxy.extractors.base import BaseExtractor
+from markitect.proxy.models import ExtractionResult
+
+
+class MarkdownNormalizer(BaseExtractor):
+    """Built-in extractor for Markdown sources; needs no optional packages.
+
+    The file's text is returned unchanged as the extraction result.
+    """
+
+    name = "markdown"
+    version = "1.0"
+    extensions = (".md", ".markdown", ".mdown")
+
+    def check_dependencies(self) -> bool:
+        """Always True: this extractor has no external dependencies."""
+        return True
+
+    def extract(self, source_path: Path) -> ExtractionResult:
+        """Read *source_path* as UTF-8 and wrap its text in an ExtractionResult."""
+        return ExtractionResult(
+            content=source_path.read_text(encoding="utf-8"),
+            extractor=self.name,
+            extractor_version=self.version,
+        )
--- a/markitect/proxy/extractors/pdf.py
+++ b/markitect/proxy/extractors/pdf.py
@@ -0,0 +1,44 @@
+"""
+PDF extractor using pymupdf4llm.
+"""
+
+from pathlib import Path
+
+from markitect.proxy.extractors.base import BaseExtractor
+from markitect.proxy.models import ExtractionResult
+from markitect.proxy.exceptions import DependencyMissingError
+
+
+class PdfExtractor(BaseExtractor):
+    """PDF-to-Markdown extractor backed by the optional ``pymupdf4llm`` package."""
+
+    name = "pdf"
+    version = "1.0"
+    extensions = (".pdf",)
+
+    def check_dependencies(self) -> bool:
+        """Report whether ``pymupdf4llm`` is importable."""
+        try:
+            import pymupdf4llm  # noqa: F401
+        except ImportError:
+            return False
+        return True
+
+    def dependency_hint(self) -> str:
+        """Return install instructions for the PDF dependency."""
+        return 'pip install "markitect[proxy-pdf]"  (or: pip install pymupdf4llm)'
+
+    def extract(self, source_path: Path) -> ExtractionResult:
+        """Convert the PDF at *source_path* to Markdown.
+
+        Raises:
+            DependencyMissingError: If ``pymupdf4llm`` is not installed.
+        """
+        if self.check_dependencies():
+            import pymupdf4llm
+
+            # The library is handed the path as a plain string.
+            markdown = pymupdf4llm.to_markdown(str(source_path))
+            return ExtractionResult(
+                content=markdown,
+                extractor=self.name,
+                extractor_version=self.version,
+            )
+
+        raise DependencyMissingError(
+            "pymupdf4llm is required to extract PDF files.",
+            package="pymupdf4llm",
+            install_hint=self.dependency_hint(),
+        )
--- a/markitect/proxy/generator.py
+++ b/markitect/proxy/generator.py
@@ -0,0 +1,241 @@
+"""
+ProxyGenerator — create, update, and check status of proxy files.
+"""
+
+import logging
+import os
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Dict, List
+
+import yaml
+
+from markitect.assets.utils import ContentHasher
+from markitect.frontmatter import FrontMatterParser
+from markitect.proxy.exceptions import ProxyError
+from markitect.proxy.models import ProxyMetadata
+from markitect.proxy.registry import ExtractorRegistry
+
+logger = logging.getLogger("markitect.proxy.generator")
+
+_frontmatter_parser = FrontMatterParser()
+
+
+class ProxyGenerator:
+    """Creates and manages markdown proxy files."""
+
+    def __init__(self, registry: ExtractorRegistry) -> None:
+        """Keep a reference to *registry*, used to look up extractors for sources."""
+        self.registry = registry
+
+    # ------------------------------------------------------------------
+    # create
+    # ------------------------------------------------------------------
+
+    def create(self, source: Path, output_dir: Path, force: bool = False) -> Path:
+        """Create a proxy markdown file for *source*.
+
+        The proxy is written to ``<output_dir>/<source file name>.md``; the
+        output directory is created if it does not exist.
+
+        Args:
+            source: Path to the original file (e.g. ``report.pdf``).
+            output_dir: Directory where the proxy file will be written.
+            force: If True, overwrite an existing proxy file.
+
+        Returns:
+            Path to the created proxy file.
+
+        Raises:
+            ProxyError: If the source is not an existing file, the proxy
+                already exists and *force* is False, no extractor is
+                registered for the source's suffix, the extractor's
+                dependency is missing, or the extractor itself fails.
+        """
+        source = source.resolve()
+        if not source.is_file():
+            raise ProxyError(
+                f"Source file does not exist: {source}",
+                context={"source": str(source)},
+            )
+
+        output_dir = output_dir.resolve()
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        proxy_path = output_dir / f"{source.name}.md"
+        if proxy_path.exists() and not force:
+            raise ProxyError(
+                f"Proxy file already exists: {proxy_path}  (use --force to overwrite)",
+                context={"proxy": str(proxy_path)},
+            )
+
+        extractor = self.registry.get_extractor_for_file(source)
+
+        if not extractor.check_dependencies():
+            raise ProxyError(
+                f"Missing dependency for {extractor.name} extractor. "
+                f"{extractor.dependency_hint()}",
+            )
+
+        # Extractors wrap third-party libraries and file decoding, which can
+        # raise arbitrary exceptions. Convert those to ProxyError so callers
+        # (such as the CLI) that handle only ProxyError get a clean failure.
+        try:
+            result = extractor.extract(source)
+        except ProxyError:
+            raise
+        except Exception as exc:
+            raise ProxyError(
+                f"Extraction failed for {source}: {exc}",
+                cause=exc,
+                context={"source": str(source), "extractor": extractor.name},
+            ) from exc
+
+        checksum = ContentHasher.hash_file(source)
+        source_size = source.stat().st_size
+
+        # Relative path from proxy location to source. os.path.relpath raises
+        # ValueError on Windows when the two paths are on different drives;
+        # fall back to the absolute path, which _resolve_source handles
+        # because joining onto an absolute path yields that absolute path.
+        try:
+            rel_source = os.path.relpath(source, output_dir)
+        except ValueError:
+            rel_source = str(source)
+        # Store forward slashes so the frontmatter is the same on every OS.
+        rel_source = Path(rel_source).as_posix()
+
+        meta = ProxyMetadata(
+            source_path=rel_source,
+            source_checksum=f"sha256:{checksum}",
+            source_size=source_size,
+            generated_at=datetime.now(timezone.utc).isoformat(),
+            extractor=result.extractor,
+            extractor_version=result.extractor_version,
+        )
+
+        self._write_proxy(proxy_path, meta, source.name, result.content)
+        logger.info("Created proxy %s -> %s", proxy_path.name, rel_source)
+        return proxy_path
+
+    # ------------------------------------------------------------------
+    # update
+    # ------------------------------------------------------------------
+
+    def update(self, proxy_path: Path) -> bool:
+        """Re-extract the proxy file if its source has changed.
+
+        "Changed" means the source file's current checksum differs from the
+        ``source_checksum`` recorded in the proxy's frontmatter.
+
+        Args:
+            proxy_path: Path to an existing proxy file.
+
+        Returns:
+            True if the proxy was updated, False if already current.
+
+        Raises:
+            ProxyError: If *proxy_path* is not a proxy file, its source is
+                missing, no extractor handles the source, the extractor's
+                dependency is missing, or the extractor itself fails.
+        """
+        proxy_path = proxy_path.resolve()
+        meta, _ = self._read_proxy(proxy_path)
+
+        # Use .get(): frontmatter without "source_path" must surface as a
+        # ProxyError below, not as a KeyError.
+        rel_source = meta.get("source_path", "")
+
+        source = self._resolve_source(proxy_path, meta)
+        if not source.is_file():
+            raise ProxyError(
+                f"Source file missing: {source}",
+                context={"source_path": rel_source},
+            )
+
+        current_checksum = f"sha256:{ContentHasher.hash_file(source)}"
+        if current_checksum == meta.get("source_checksum"):
+            return False
+
+        extractor = self.registry.get_extractor_for_file(source)
+        if not extractor.check_dependencies():
+            raise ProxyError(
+                f"Missing dependency for {extractor.name} extractor. "
+                f"{extractor.dependency_hint()}",
+            )
+
+        # Convert unexpected extractor failures to ProxyError so callers that
+        # handle only ProxyError see a clean error.
+        try:
+            result = extractor.extract(source)
+        except ProxyError:
+            raise
+        except Exception as exc:
+            raise ProxyError(
+                f"Extraction failed for {source}: {exc}",
+                cause=exc,
+                context={"source": str(source), "extractor": extractor.name},
+            ) from exc
+
+        # The recorded relative path is kept as-is; only the checksum, size,
+        # timestamp and extractor details are refreshed.
+        new_meta = ProxyMetadata(
+            source_path=rel_source,
+            source_checksum=current_checksum,
+            source_size=source.stat().st_size,
+            generated_at=datetime.now(timezone.utc).isoformat(),
+            extractor=result.extractor,
+            extractor_version=result.extractor_version,
+        )
+
+        self._write_proxy(proxy_path, new_meta, source.name, result.content)
+        logger.info("Updated proxy %s", proxy_path.name)
+        return True
+
+    # ------------------------------------------------------------------
+    # status
+    # ------------------------------------------------------------------
+
+    def status(self, proxy_path: Path) -> Dict:
+        """Report whether one proxy file is up to date with its source.
+
+        The result has the keys ``proxy`` (resolved proxy path as a string),
+        ``source`` (recorded source path), ``status`` and ``extractor``.
+        ``status`` is ``missing-source`` when the source file is absent,
+        ``current`` when its checksum matches the recorded one, and
+        ``stale`` otherwise.
+        """
+        proxy_path = proxy_path.resolve()
+        meta, _ = self._read_proxy(proxy_path)
+        source = self._resolve_source(proxy_path, meta)
+
+        # The source is hashed only if it exists.
+        if not source.is_file():
+            state = "missing-source"
+        elif f"sha256:{ContentHasher.hash_file(source)}" == meta.get("source_checksum"):
+            state = "current"
+        else:
+            state = "stale"
+
+        return {
+            "proxy": str(proxy_path),
+            "source": meta.get("source_path", ""),
+            "status": state,
+            "extractor": meta.get("extractor", ""),
+        }
+
+    def bulk_status(self, directory: Path) -> List[Dict]:
+        """Return the status of every proxy file found under *directory*.
+
+        All ``*.md`` files are visited recursively in sorted order; files
+        whose frontmatter cannot be read, or whose ``proxy`` key is not
+        exactly ``True``, are skipped.
+        """
+        root = directory.resolve()
+        statuses = []
+        for candidate in sorted(root.rglob("*.md")):
+            frontmatter = self._try_read_proxy(candidate)[0]
+            if frontmatter is None or frontmatter.get("proxy") is not True:
+                continue
+            statuses.append(self.status(candidate))
+        return statuses
+
+    # ------------------------------------------------------------------
+    # helpers
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _write_proxy(
+        proxy_path: Path,
+        meta: ProxyMetadata,
+        source_name: str,
+        content: str,
+    ) -> None:
+        """Write the proxy file: YAML frontmatter, a heading, a note, then *content*.
+
+        The frontmatter starts with ``proxy: true`` followed by the fields of
+        *meta* in declaration order.
+        """
+        frontmatter = {"proxy": True}
+        for key in (
+            "source_path",
+            "source_checksum",
+            "source_size",
+            "generated_at",
+            "extractor",
+            "extractor_version",
+        ):
+            frontmatter[key] = getattr(meta, key)
+
+        # sort_keys=False keeps the insertion order built above.
+        header = yaml.dump(frontmatter, default_flow_style=False, sort_keys=False)
+        sections = [
+            f"---\n{header}---\n\n",
+            f"# {source_name}\n\n",
+            f"*Proxy generated from `{meta.source_path}`*\n\n",
+            f"{content}",
+        ]
+        proxy_path.write_text("".join(sections), encoding="utf-8")
+
+    @staticmethod
+    def _read_proxy(proxy_path: Path):
+        """Load *proxy_path* and split it into frontmatter and body.
+
+        Returns:
+            Tuple of (frontmatter dict, body str).
+
+        Raises:
+            ProxyError: If the frontmatter has no truthy ``proxy`` entry.
+        """
+        text = proxy_path.read_text(encoding="utf-8")
+        frontmatter, remainder = _frontmatter_parser.parse(text)
+        if frontmatter.get("proxy"):
+            return frontmatter, remainder
+        raise ProxyError(
+            f"Not a proxy file (missing 'proxy: true' in frontmatter): {proxy_path}",
+            context={"path": str(proxy_path)},
+        )
+
+    @staticmethod
+    def _try_read_proxy(path: Path):
+        """Attempt to read a proxy file, returning (None, None) on failure.
+
+        This is the best-effort reader used when scanning directories, where
+        unreadable or non-conforming markdown files are expected and must not
+        abort the scan. Unlike ``_read_proxy`` it does not check the
+        ``proxy`` flag; callers do that themselves.
+
+        Returns:
+            Tuple of (frontmatter, body) as produced by the frontmatter
+            parser, or ``(None, None)`` if reading or parsing failed.
+        """
+        try:
+            raw = path.read_text(encoding="utf-8")
+            meta, body = _frontmatter_parser.parse(raw)
+            return meta, body
+        except Exception as exc:
+            # Deliberately broad: any I/O, decoding or parsing problem means
+            # "not a usable proxy". Log it so skipped files can be diagnosed.
+            logger.debug("Skipping %s: could not read frontmatter (%s)", path, exc)
+            return None, None
+
+    @staticmethod
+    def _resolve_source(proxy_path: Path, meta: Dict) -> Path:
+        """Turn the recorded ``source_path`` into a resolved path.
+
+        The recorded path is interpreted relative to the directory containing
+        the proxy file; a missing entry is treated as an empty string.
+        """
+        base_dir = proxy_path.parent
+        return base_dir.joinpath(meta.get("source_path", "")).resolve()
--- a/markitect/proxy/models.py
+++ b/markitect/proxy/models.py
@@ -0,0 +1,26 @@
+"""
+Data models for the proxy file system.
+"""
+
+from dataclasses import dataclass
+
+
+@dataclass
+class ProxyMetadata:
+    """Metadata stored in a proxy file's YAML frontmatter.
+
+    The generator writes these fields, in this order, after a leading
+    ``proxy: true`` entry.
+    """
+
+    # Path to the source file, as recorded by the generator (relative to the
+    # proxy file's directory).
+    source_path: str
+    source_checksum: str       # "sha256:<hex>"
+    # Size of the source file in bytes (from stat().st_size).
+    source_size: int
+    generated_at: str          # ISO 8601
+    # Name and version of the extractor that produced the proxy content.
+    extractor: str
+    extractor_version: str
+
+
+@dataclass
+class ExtractionResult:
+    """Result returned by an extractor after processing a source file."""
+
+    # Extracted Markdown text; written into the proxy below the frontmatter.
+    content: str
+    # Name and version of the extractor that produced `content`.
+    extractor: str
+    extractor_version: str
+    extractor_version: str
--- a/markitect/proxy/registry.py
+++ b/markitect/proxy/registry.py
@@ -0,0 +1,65 @@
+"""
+Extractor registry — register and look up extractors by file extension.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Dict, List, TYPE_CHECKING
+
+from pathlib import Path
+
+from markitect.proxy.exceptions import ExtractorNotFoundError
+
+if TYPE_CHECKING:
+    from markitect.proxy.extractors.base import BaseExtractor
+
+logger = logging.getLogger("markitect.proxy.registry")
+
+
+class ExtractorRegistry:
+    """Maps file extensions to their corresponding extractors.
+
+    Extensions are matched case-insensitively and are stored with a leading
+    dot, so ``"PDF"``, ``"pdf"`` and ``".pdf"`` all refer to the same entry.
+    """
+
+    def __init__(self):
+        # Normalised extension (e.g. ".pdf") -> extractor instance.
+        self._extractors: Dict[str, BaseExtractor] = {}
+
+    @staticmethod
+    def _normalize(extension: str) -> str:
+        """Lower-case *extension* and make sure a non-empty one starts with a dot."""
+        ext = extension.lower()
+        # Path.suffix always includes the dot, so an extension declared or
+        # looked up as "pdf" would otherwise never match. The empty string
+        # (file without a suffix) is left alone.
+        if ext and not ext.startswith("."):
+            ext = f".{ext}"
+        return ext
+
+    def register(self, extractor: BaseExtractor) -> None:
+        """Register an extractor for all of its declared extensions.
+
+        A later registration for the same extension replaces the earlier one.
+        """
+        for ext in extractor.extensions:
+            key = self._normalize(ext)
+            self._extractors[key] = extractor
+            logger.debug("Registered %s extractor for %s", extractor.name, key)
+
+    def get_extractor(self, extension: str) -> BaseExtractor:
+        """Look up an extractor by file extension (e.g. ``'.pdf'``).
+
+        Raises:
+            ExtractorNotFoundError: If no extractor handles the extension.
+        """
+        key = self._normalize(extension)
+        try:
+            return self._extractors[key]
+        except KeyError:
+            known = ", ".join(sorted(self._extractors))
+            # "from None": the internal KeyError adds nothing for the caller.
+            raise ExtractorNotFoundError(
+                f"No extractor registered for {key!r}. "
+                f"Supported extensions: {known}",
+                context={"extension": key},
+            ) from None
+
+    def get_extractor_for_file(self, path: Path) -> BaseExtractor:
+        """Look up an extractor for a file based on its suffix.
+
+        Raises:
+            ExtractorNotFoundError: If no extractor handles the suffix.
+        """
+        return self.get_extractor(path.suffix)
+
+    def list_extractors(self) -> List[BaseExtractor]:
+        """Return a de-duplicated list of registered extractors.
+
+        An extractor registered for several extensions appears once, at the
+        position of its first extension.
+        """
+        # Keyed by identity: extractors are not required to be hashable or to
+        # define equality, and dicts preserve first-insertion order.
+        unique = {id(extractor): extractor for extractor in self._extractors.values()}
+        return list(unique.values())
+
+
+# Module-level singleton: the shared registry that importing
+# `markitect.proxy.extractors` populates with the built-in extractors.
+registry = ExtractorRegistry()
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,6 +32,9 @@ capabilities = [
 development = [
    "kaizen-agentic @ file:./capabilities/kaizen-agentic"
 ]
+proxy-pdf = ["pymupdf4llm>=0.0.10"]
+proxy-html = ["markdownify>=0.13.1"]
+proxy = ["pymupdf4llm>=0.0.10", "markdownify>=0.13.1"]

 [project.scripts]
 markitect = "markitect.cli:main"