feat(proxy): add proxy file system for non-markdown source conversion

Introduces a new `markitect/proxy/` module with pluggable extractors that convert non-markdown sources (PDF, HTML) into tracked markdown proxy files. Proxy files preserve origin metadata (path, checksum, timestamp) so they can be kept in sync when the original changes. CLI commands: `proxy create`, `proxy update`, `proxy status`, `proxy extractors`. Built-in extractors: PDF (pymupdf4llm), HTML (markdownify), Markdown (built-in). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 19:06:09 +01:00
parent 69aea1ada7
commit ac334c679d
13 changed files with 781 additions and 0 deletions
--- a/markitect/cli.py
+++ b/markitect/cli.py
@@ -7147,6 +7147,13 @@ try:
 except ImportError:
    pass  # Helper module not available
 # Register proxy file system commands.
 # Best-effort: if importing markitect.proxy.cli raises ImportError the
 # `proxy` group is simply not added and the rest of the CLI keeps working.
 try:
    from markitect.proxy.cli import proxy_group
    cli.add_command(proxy_group)
 except ImportError:
    pass  # Proxy module not available
 # Make cli function available as main entry point
 main = cli
--- a/markitect/proxy/__init__.py
+++ b/markitect/proxy/__init__.py
@@ -0,0 +1,39 @@
 """
 markitect.proxy — Proxy file system for wrapping non-markdown sources.

 Creates markdown proxy files that track their origin (source path,
 checksum, timestamp) so they can be kept up-to-date when the original
 changes.

 Quick start::

    from pathlib import Path

    from markitect.proxy import ProxyGenerator, registry
    # Ensure built-in extractors are registered
    import markitect.proxy.extractors  # noqa: F401

    gen = ProxyGenerator(registry)
    gen.create(Path("report.pdf"), Path("./output/"))
 """
 from markitect.proxy.models import ProxyMetadata, ExtractionResult
 from markitect.proxy.exceptions import (
    ProxyError,
    ExtractorNotFoundError,
    DependencyMissingError,
 )
 from markitect.proxy.registry import ExtractorRegistry, registry
 from markitect.proxy.generator import ProxyGenerator
 from markitect.proxy.extractors.base import BaseExtractor
 __all__ = [
    "ProxyMetadata",
    "ExtractionResult",
    "ProxyError",
    "ExtractorNotFoundError",
    "DependencyMissingError",
    "ExtractorRegistry",
    "registry",
    "ProxyGenerator",
    "BaseExtractor",
 ]
--- a/markitect/proxy/cli.py
+++ b/markitect/proxy/cli.py
@@ -0,0 +1,185 @@
 """
 Click CLI commands for the proxy file system.
 """
 import json
 from pathlib import Path
 import click
 from markitect.proxy.exceptions import ProxyError
@click.group("proxy")
def proxy_group():
    """Proxy file operations — create, update, and manage markdown proxies."""
    # The docstring above doubles as the group's --help text; the group
    # itself does no work, its subcommands are attached below.
@proxy_group.command("create")
@click.argument("source", type=click.Path(exists=True))
@click.option(
    "--output-dir",
    "-o",
    type=click.Path(),
    default=".",
    help="Directory to write the proxy file (default: current dir).",
)
@click.option("--force", "-f", is_flag=True, help="Overwrite existing proxy file.")
def proxy_create(source, output_dir, force):
    """Create a markdown proxy for SOURCE."""
    # Imports are deferred to call time so that defining this command does
    # not itself pull in the generator and the extractor modules.
    from markitect.proxy.registry import registry
    import markitect.proxy.extractors  # noqa: F401  — registers built-ins
    from markitect.proxy.generator import ProxyGenerator

    generator = ProxyGenerator(registry)
    try:
        created = generator.create(Path(source), Path(output_dir), force=force)
    except ProxyError as exc:
        # Proxy failures are reported on stderr and mapped to exit status 1.
        click.echo(f"Error: {exc}", err=True)
        raise SystemExit(1)
    click.echo(f"Created proxy: {created}")
@proxy_group.command("update")
@click.argument("target", type=click.Path(exists=True))
@click.option("--dry-run", is_flag=True, help="Show what would change without writing.")
def proxy_update(target, dry_run):
    """Re-extract a single proxy file or all proxies in a directory."""
    from markitect.proxy.registry import registry
    import markitect.proxy.extractors  # noqa: F401
    from markitect.proxy.generator import ProxyGenerator

    generator = ProxyGenerator(registry)
    resolved = Path(target).resolve()
    try:
        # Single proxy file: update it and finish.
        if resolved.is_file():
            _update_one(generator, resolved, dry_run)
            return
        # Neither a file nor a directory: nothing sensible to do.
        if not resolved.is_dir():
            click.echo(f"Error: {target} is not a file or directory.", err=True)
            raise SystemExit(1)
        # Directory: update every proxy found beneath it.
        found = _find_proxy_files(generator, resolved)
        if not found:
            click.echo("No proxy files found.")
            return
        for proxy_file in found:
            _update_one(generator, proxy_file, dry_run)
    except ProxyError as exc:
        # The first proxy failure aborts the run with exit status 1.
        click.echo(f"Error: {exc}", err=True)
        raise SystemExit(1)
@proxy_group.command("status")
@click.argument("directory", type=click.Path(exists=True), default=".")
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["table", "json"]),
    default="table",
    help="Output format.",
)
def proxy_status(directory, output_format):
    """Show freshness of all proxy files in DIRECTORY (default: current dir)."""
    from markitect.proxy.registry import registry
    import markitect.proxy.extractors  # noqa: F401
    from markitect.proxy.generator import ProxyGenerator

    scanner = ProxyGenerator(registry)
    try:
        statuses = scanner.bulk_status(Path(directory))
    except ProxyError as exc:
        click.echo(f"Error: {exc}", err=True)
        raise SystemExit(1)

    # An empty scan gets the same plain message regardless of --format.
    if not statuses:
        click.echo("No proxy files found.")
    elif output_format == "json":
        click.echo(json.dumps(statuses, indent=2))
    else:
        _print_status_table(statuses)
@proxy_group.command("extractors")
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["table", "json"]),
    default="table",
    help="Output format.",
)
def proxy_extractors(output_format):
    """List registered extractors and their dependency status."""
    from markitect.proxy.registry import registry
    import markitect.proxy.extractors  # noqa: F401

    available = registry.list_extractors()
    if output_format != "json":
        _print_extractor_table(available)
        return

    # JSON mode: one object per extractor, including the install hint.
    rows = []
    for extractor in available:
        rows.append(
            {
                "name": extractor.name,
                "version": extractor.version,
                "extensions": list(extractor.extensions),
                "installed": extractor.check_dependencies(),
                "hint": extractor.dependency_hint(),
            }
        )
    click.echo(json.dumps(rows, indent=2))
 # ------------------------------------------------------------------
 # helpers
 # ------------------------------------------------------------------
 def _update_one(gen, proxy_path, dry_run):
    """Report on one proxy file and re-extract it when it is stale.

    Prints a single indented ``<file name>: <outcome>`` line. Proxies that
    are current or whose source is missing are only reported; a stale proxy
    is re-extracted via ``gen.update`` unless *dry_run* is set.
    """
    state = gen.status(proxy_path)["status"]
    if state == "current":
        outcome = "current"
    elif state == "missing-source":
        outcome = "source missing"
    elif dry_run:
        outcome = "stale (would update)"
    else:
        # update() returns False when it finds nothing to rewrite after all.
        outcome = "updated" if gen.update(proxy_path) else "current"
    click.echo(f"  {proxy_path.name}: {outcome}")
 def _find_proxy_files(gen, directory):
    """Return proxy file paths within *directory*."""
    results = []
    for path in sorted(directory.rglob("*.md")):
        meta, _ = gen._try_read_proxy(path)
        if meta is not None and meta.get("proxy") is True:
            results.append(path)
    return results
 def _print_status_table(results):
    """Write the status rows as a fixed-width text table.

    Each entry of *results* is a dict with ``proxy``, ``source``, ``status``
    and ``extractor`` keys; only the file name of ``proxy`` is shown.
    """
    header = f"{'Proxy':<40} {'Source':<30} {'Status':<15} {'Extractor':<10}"
    click.echo(header)
    click.echo("-" * 95)
    for row in results:
        file_name = Path(row["proxy"]).name
        line = f"{file_name:<40} {row['source']:<30} {row['status']:<15} {row['extractor']:<10}"
        click.echo(line)
 def _print_extractor_table(extractors):
    """Write one fixed-width row per extractor: name, version, extensions, status.

    The status column reads ``installed`` when the extractor's
    ``check_dependencies()`` is truthy and ``missing`` otherwise.
    """
    header = f"{'Name':<15} {'Version':<10} {'Extensions':<25} {'Status':<10}"
    click.echo(header)
    click.echo("-" * 60)
    for extractor in extractors:
        joined_exts = ", ".join(extractor.extensions)
        if extractor.check_dependencies():
            availability = "installed"
        else:
            availability = "missing"
        click.echo(
            f"{extractor.name:<15} {extractor.version:<10} {joined_exts:<25} {availability:<10}"
        )
--- a/markitect/proxy/exceptions.py
+++ b/markitect/proxy/exceptions.py
@@ -0,0 +1,40 @@
 """
 Proxy-specific exceptions.
 Extends the MarkitectError hierarchy for proxy file operations.
 """
 from typing import Optional, Dict, Any
 from markitect.exceptions import MarkitectError
 class ProxyError(MarkitectError):
    """Root of the proxy exception hierarchy; every proxy failure derives from it."""
 class ExtractorNotFoundError(ProxyError):
    """Raised when a file extension has no registered extractor."""
 class DependencyMissingError(ProxyError):
    """An extractor's optional dependency is not installed.

    Attributes:
        package: The missing Python package name (empty string if not given).
        install_hint: Suggested install command (empty string if not given).
    """
    def __init__(
        self,
        message: str,
        package: str = "",
        install_hint: str = "",
        cause: Optional[Exception] = None,
        context: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Build the error.

        Args:
            message: Human-readable description of the failure.
            package: Name of the package that could not be imported.
            install_hint: Install instructions to show to the user.
            cause: Optional underlying exception, passed to the parent class.
            context: Optional extra details, passed to the parent class.
        """
        # message/cause/context are handled by the parent initializer; the
        # dependency-specific fields are stored on this instance.
        super().__init__(message, cause=cause, context=context)
        self.package = package
        self.install_hint = install_hint
--- a/markitect/proxy/extractors/__init__.py
+++ b/markitect/proxy/extractors/__init__.py
@@ -0,0 +1,14 @@
 """
 Built-in extractor registration.
 Importing this module registers all built-in extractors with the global registry.
 """
 from markitect.proxy.registry import registry
 from markitect.proxy.extractors.pdf import PdfExtractor
 from markitect.proxy.extractors.html import HtmlExtractor
 from markitect.proxy.extractors.markdown import MarkdownNormalizer
 # Import-time side effect: create one instance of each built-in extractor
 # and add it to the shared registry under all of its declared extensions.
 registry.register(PdfExtractor())
 registry.register(HtmlExtractor())
 registry.register(MarkdownNormalizer())
--- a/markitect/proxy/extractors/base.py
+++ b/markitect/proxy/extractors/base.py
@@ -0,0 +1,43 @@
 """
 Abstract base class for proxy file extractors.
 """
 from abc import ABC, abstractmethod
 from pathlib import Path
 from markitect.proxy.models import ExtractionResult
 class BaseExtractor(ABC):
    """Interface every proxy extractor implements.

    Subclasses set ``name``, ``version`` and ``extensions`` and provide
    ``extract`` and ``check_dependencies``; ``dependency_hint`` is optional.
    """

    # Identifier of the extractor.
    name: str = ""
    # Version string of the extractor implementation.
    version: str = "1.0"
    # File suffixes (with leading dot) this extractor handles.
    extensions: tuple = ()

    @abstractmethod
    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the file at *source_path* to markdown.

        Args:
            source_path: Location of the file to convert.

        Returns:
            An ExtractionResult carrying the markdown text.
        """

    @abstractmethod
    def check_dependencies(self) -> bool:
        """Tell whether everything this extractor needs is installed.

        Returns:
            True when all dependencies are available, otherwise False.
        """

    def dependency_hint(self) -> str:
        """Give install instructions for this extractor's dependencies.

        Returns:
            Text such as ``pip install markitect[proxy-pdf]``; the base
            implementation returns an empty string.
        """
        return ""
--- a/markitect/proxy/extractors/html.py
+++ b/markitect/proxy/extractors/html.py
@@ -0,0 +1,45 @@
 """
 HTML extractor using markdownify.
 """
 from pathlib import Path
 from markitect.proxy.extractors.base import BaseExtractor
 from markitect.proxy.models import ExtractionResult
 from markitect.proxy.exceptions import DependencyMissingError
 class HtmlExtractor(BaseExtractor):
    """Extractor that turns ``.html`` / ``.htm`` files into Markdown with markdownify."""

    name = "html"
    version = "1.0"
    extensions = (".html", ".htm")

    def check_dependencies(self) -> bool:
        """Return True when ``markdownify`` can be imported."""
        try:
            import markdownify  # noqa: F401
        except ImportError:
            return False
        return True

    def dependency_hint(self) -> str:
        """Return the install command for the HTML extra."""
        return 'pip install "markitect[proxy-html]"  (or: pip install markdownify)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Read *source_path* as UTF-8 HTML and convert it to Markdown.

        Raises:
            DependencyMissingError: If ``markdownify`` is not installed.
        """
        if not self.check_dependencies():
            raise DependencyMissingError(
                "markdownify is required to extract HTML files.",
                package="markdownify",
                install_hint=self.dependency_hint(),
            )

        # Imported here so the module loads without the optional package.
        import markdownify

        raw_html = source_path.read_text(encoding="utf-8")
        # ATX style renders headings as "# Title" rather than underlined.
        converted = markdownify.markdownify(raw_html, heading_style="ATX")
        return ExtractionResult(
            content=converted,
            extractor=self.name,
            extractor_version=self.version,
        )
--- a/markitect/proxy/extractors/markdown.py
+++ b/markitect/proxy/extractors/markdown.py
@@ -0,0 +1,29 @@
 """
 Markdown normalizer — passes through Markdown with minimal transformation.
 No external dependencies required.
 """
 from pathlib import Path
 from markitect.proxy.extractors.base import BaseExtractor
 from markitect.proxy.models import ExtractionResult
 class MarkdownNormalizer(BaseExtractor):
    """Pass-through extractor for Markdown sources; needs no optional packages."""

    name = "markdown"
    version = "1.0"
    extensions = (".md", ".markdown", ".mdown")

    def check_dependencies(self) -> bool:
        """Always True: nothing beyond the standard library is used."""
        return True

    def extract(self, source_path: Path) -> ExtractionResult:
        """Return the UTF-8 text of *source_path* unchanged as the extraction result."""
        text = source_path.read_text(encoding="utf-8")
        return ExtractionResult(
            content=text,
            extractor=self.name,
            extractor_version=self.version,
        )
--- a/markitect/proxy/extractors/pdf.py
+++ b/markitect/proxy/extractors/pdf.py
@@ -0,0 +1,44 @@
 """
 PDF extractor using pymupdf4llm.
 """
 from pathlib import Path
 from markitect.proxy.extractors.base import BaseExtractor
 from markitect.proxy.models import ExtractionResult
 from markitect.proxy.exceptions import DependencyMissingError
 class PdfExtractor(BaseExtractor):
    """Extractor that turns ``.pdf`` files into Markdown with pymupdf4llm."""

    name = "pdf"
    version = "1.0"
    extensions = (".pdf",)

    def check_dependencies(self) -> bool:
        """Return True when ``pymupdf4llm`` can be imported."""
        try:
            import pymupdf4llm  # noqa: F401
        except ImportError:
            return False
        return True

    def dependency_hint(self) -> str:
        """Return the install command for the PDF extra."""
        return 'pip install "markitect[proxy-pdf]"  (or: pip install pymupdf4llm)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the PDF at *source_path* to Markdown.

        Raises:
            DependencyMissingError: If ``pymupdf4llm`` is not installed.
        """
        if not self.check_dependencies():
            raise DependencyMissingError(
                "pymupdf4llm is required to extract PDF files.",
                package="pymupdf4llm",
                install_hint=self.dependency_hint(),
            )

        # Imported here so the module loads without the optional package.
        import pymupdf4llm

        # The library is handed the path as a plain string.
        converted = pymupdf4llm.to_markdown(str(source_path))
        return ExtractionResult(
            content=converted,
            extractor=self.name,
            extractor_version=self.version,
        )
--- a/markitect/proxy/generator.py
+++ b/markitect/proxy/generator.py
@@ -0,0 +1,241 @@
 """
 ProxyGenerator — create, update, and check status of proxy files.
 """
 import logging
 import os
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Dict, List
 import yaml
 from markitect.assets.utils import ContentHasher
 from markitect.frontmatter import FrontMatterParser
 from markitect.proxy.exceptions import ProxyError
 from markitect.proxy.models import ProxyMetadata
 from markitect.proxy.registry import ExtractorRegistry
 logger = logging.getLogger("markitect.proxy.generator")
 _frontmatter_parser = FrontMatterParser()
 class ProxyGenerator:
    """Create, refresh, and inspect markdown proxy files.

    A proxy file is markdown whose YAML frontmatter (``proxy: true`` plus the
    fields of ``ProxyMetadata``) records which source it was extracted from,
    followed by a generated heading and the extracted content.
    """

    def __init__(self, registry: ExtractorRegistry):
        """Remember the registry used to look up an extractor per source file."""
        self.registry = registry

    # ------------------------------------------------------------------
    # create
    # ------------------------------------------------------------------
    def create(self, source: Path, output_dir: Path, force: bool = False) -> Path:
        """Create a proxy markdown file for *source*.

        Args:
            source: Path to the original file (e.g. ``report.pdf``).
            output_dir: Directory where the proxy file will be written; it is
                created (including parents) when missing.
            force: If True, overwrite an existing proxy file.

        Returns:
            Path to the created proxy file, ``<output_dir>/<source name>.md``.

        Raises:
            ProxyError: If the source is not a file, the proxy already exists
                and *force* is false, or the extractor reports a missing
                dependency. Errors from the registry lookup and from the
                extractor itself propagate unchanged.
        """
        source = source.resolve()
        if not source.is_file():
            raise ProxyError(
                f"Source file does not exist: {source}",
                context={"source": str(source)},
            )
        output_dir = output_dir.resolve()
        output_dir.mkdir(parents=True, exist_ok=True)
        proxy_path = output_dir / f"{source.name}.md"
        if proxy_path.exists() and not force:
            raise ProxyError(
                f"Proxy file already exists: {proxy_path}  (use --force to overwrite)",
                context={"proxy": str(proxy_path)},
            )
        extractor = self.registry.get_extractor_for_file(source)
        self._ensure_dependencies(extractor)
        # Fingerprint the source BEFORE extracting (as update() does): if the
        # file changes while extraction runs, the stored checksum then belongs
        # to the older bytes and the proxy shows up as stale, instead of being
        # reported current with outdated content.
        checksum = ContentHasher.hash_file(source)
        source_size = source.stat().st_size
        result = extractor.extract(source)
        # Relative path from proxy location to source
        rel_source = os.path.relpath(source, output_dir)
        meta = ProxyMetadata(
            source_path=rel_source,
            source_checksum=f"sha256:{checksum}",
            source_size=source_size,
            generated_at=datetime.now(timezone.utc).isoformat(),
            extractor=result.extractor,
            extractor_version=result.extractor_version,
        )
        self._write_proxy(proxy_path, meta, source.name, result.content)
        logger.info("Created proxy %s -> %s", proxy_path.name, rel_source)
        return proxy_path

    # ------------------------------------------------------------------
    # update
    # ------------------------------------------------------------------
    def update(self, proxy_path: Path) -> bool:
        """Re-extract the proxy file if its source has changed.

        Args:
            proxy_path: Path to an existing proxy file.

        Returns:
            True if the proxy was updated, False if already current.

        Raises:
            ProxyError: If *proxy_path* is not a proxy file, its source is
                missing (including frontmatter without ``source_path``), or
                the extractor reports a missing dependency.
        """
        proxy_path = proxy_path.resolve()
        meta, _ = self._read_proxy(proxy_path)
        # .get() rather than indexing: frontmatter lacking source_path must
        # end in the ProxyError below, not in a KeyError.
        rel_source = meta.get("source_path", "")
        source = self._resolve_source(proxy_path, meta)
        if not source.is_file():
            raise ProxyError(
                f"Source file missing: {source}",
                context={"source_path": rel_source},
            )
        current_checksum = f"sha256:{ContentHasher.hash_file(source)}"
        if current_checksum == meta.get("source_checksum"):
            return False
        source_size = source.stat().st_size
        extractor = self.registry.get_extractor_for_file(source)
        self._ensure_dependencies(extractor)
        result = extractor.extract(source)
        new_meta = ProxyMetadata(
            source_path=rel_source,
            source_checksum=current_checksum,
            source_size=source_size,
            generated_at=datetime.now(timezone.utc).isoformat(),
            extractor=result.extractor,
            extractor_version=result.extractor_version,
        )
        self._write_proxy(proxy_path, new_meta, source.name, result.content)
        logger.info("Updated proxy %s", proxy_path.name)
        return True

    # ------------------------------------------------------------------
    # status
    # ------------------------------------------------------------------
    def status(self, proxy_path: Path) -> Dict:
        """Check a single proxy file's freshness.

        Returns a dict with keys: proxy, source, status, extractor.
        Status is one of: ``current``, ``stale``, ``missing-source``.

        Raises:
            ProxyError: If *proxy_path* is not a proxy file.
        """
        proxy_path = proxy_path.resolve()
        meta, _ = self._read_proxy(proxy_path)
        source = self._resolve_source(proxy_path, meta)
        if not source.is_file():
            state = "missing-source"
        elif f"sha256:{ContentHasher.hash_file(source)}" == meta.get("source_checksum"):
            state = "current"
        else:
            state = "stale"
        return {
            "proxy": str(proxy_path),
            "source": meta.get("source_path", ""),
            "status": state,
            "extractor": meta.get("extractor", ""),
        }

    def bulk_status(self, directory: Path) -> List[Dict]:
        """Scan *directory* recursively for proxy files and return their statuses.

        Only ``*.md`` files whose frontmatter has ``proxy`` set to exactly
        ``True`` are included; results follow sorted path order.
        """
        directory = directory.resolve()
        results = []
        for path in sorted(directory.rglob("*.md")):
            meta, _ = self._try_read_proxy(path)
            if meta is not None and meta.get("proxy") is True:
                results.append(self.status(path))
        return results

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _ensure_dependencies(extractor) -> None:
        """Raise ProxyError when *extractor* reports a missing dependency."""
        if not extractor.check_dependencies():
            raise ProxyError(
                f"Missing dependency for {extractor.name} extractor. "
                f"{extractor.dependency_hint()}",
            )

    @staticmethod
    def _write_proxy(
        proxy_path: Path,
        meta: ProxyMetadata,
        source_name: str,
        content: str,
    ) -> None:
        """Write frontmatter, a generated heading and *content* to *proxy_path*.

        The file is written as UTF-8 and replaces any existing file.
        """
        fm = {
            "proxy": True,
            "source_path": meta.source_path,
            "source_checksum": meta.source_checksum,
            "source_size": meta.source_size,
            "generated_at": meta.generated_at,
            "extractor": meta.extractor,
            "extractor_version": meta.extractor_version,
        }
        # sort_keys=False keeps the frontmatter fields in the order above.
        fm_text = yaml.dump(fm, default_flow_style=False, sort_keys=False)
        body = (
            f"---\n{fm_text}---\n\n"
            f"# {source_name}\n\n"
            f"*Proxy generated from `{meta.source_path}`*\n\n"
            f"{content}"
        )
        proxy_path.write_text(body, encoding="utf-8")

    @staticmethod
    def _read_proxy(proxy_path: Path):
        """Read and parse an existing proxy file.

        Returns:
            Tuple of (frontmatter dict, body str).

        Raises:
            ProxyError: If the frontmatter has no truthy ``proxy`` entry.
        """
        raw = proxy_path.read_text(encoding="utf-8")
        meta, body = _frontmatter_parser.parse(raw)
        if not meta.get("proxy"):
            raise ProxyError(
                f"Not a proxy file (missing 'proxy: true' in frontmatter): {proxy_path}",
                context={"path": str(proxy_path)},
            )
        return meta, body

    @staticmethod
    def _try_read_proxy(path: Path):
        """Attempt to read a proxy file, returning (None, None) on failure."""
        try:
            raw = path.read_text(encoding="utf-8")
            meta, body = _frontmatter_parser.parse(raw)
            return meta, body
        except Exception:
            # Deliberately broad: directory scans must skip unreadable or
            # unparsable markdown files rather than abort.
            return None, None

    @staticmethod
    def _resolve_source(proxy_path: Path, meta: Dict) -> Path:
        """Resolve the source path relative to the proxy file's directory.

        A missing ``source_path`` resolves to the proxy's own directory, which
        callers then reject because it is not a file.
        """
        source_rel = meta.get("source_path", "")
        return (proxy_path.parent / source_rel).resolve()
--- a/markitect/proxy/models.py
+++ b/markitect/proxy/models.py
@@ -0,0 +1,26 @@
 """
 Data models for the proxy file system.
 """
 from dataclasses import dataclass
@dataclass
 class ProxyMetadata:
    """Metadata stored in a proxy file's YAML frontmatter.

    Describes the source a proxy was generated from and which extractor
    produced its content.
    """
    source_path: str           # path to the source file, as written to the frontmatter
    source_checksum: str       # "sha256:<hex>"
    source_size: int           # size of the source file
    generated_at: str          # ISO 8601
    extractor: str             # name of the extractor that produced the content
    extractor_version: str     # version string reported by that extractor
@dataclass
 class ExtractionResult:
    """Result returned by an extractor after processing a source file."""
    content: str               # extracted markdown text
    extractor: str             # name of the extractor that produced it
    extractor_version: str     # version string of that extractor
--- a/markitect/proxy/registry.py
+++ b/markitect/proxy/registry.py
@@ -0,0 +1,65 @@
 """
 Extractor registry — register and look up extractors by file extension.
 """
 from __future__ import annotations
 import logging
 from typing import Dict, List, TYPE_CHECKING
 from pathlib import Path
 from markitect.proxy.exceptions import ExtractorNotFoundError
 if TYPE_CHECKING:
    from markitect.proxy.extractors.base import BaseExtractor
 logger = logging.getLogger("markitect.proxy.registry")
 class ExtractorRegistry:
    """Lookup table from lower-cased file extension to extractor instance."""

    def __init__(self):
        """Start with no extractors registered."""
        self._extractors: Dict[str, BaseExtractor] = {}

    def register(self, extractor: BaseExtractor) -> None:
        """Add *extractor* under each extension it declares.

        Extensions are stored lower-cased; a later registration for the same
        extension replaces the earlier one.
        """
        for declared in extractor.extensions:
            key = declared.lower()
            self._extractors[key] = extractor
            logger.debug("Registered %s extractor for %s", extractor.name, key)

    def get_extractor(self, extension: str) -> BaseExtractor:
        """Return the extractor for *extension* (e.g. ``'.pdf'``), ignoring case.

        Raises:
            ExtractorNotFoundError: If nothing is registered for the extension.
        """
        key = extension.lower()
        if key in self._extractors:
            return self._extractors[key]
        # List what IS supported so the error is actionable.
        supported = ", ".join(sorted(self._extractors))
        raise ExtractorNotFoundError(
            f"No extractor registered for {key!r}. "
            f"Supported extensions: {supported}",
            context={"extension": key},
        )

    def get_extractor_for_file(self, path: Path) -> BaseExtractor:
        """Return the extractor matching the suffix of *path*."""
        suffix = path.suffix
        return self.get_extractor(suffix)

    def list_extractors(self) -> List[BaseExtractor]:
        """Return each registered extractor once, in first-registration order."""
        # An extractor is stored once per extension; keying by identity
        # collapses those repeats while keeping insertion order.
        unique = {}
        for candidate in self._extractors.values():
            unique.setdefault(id(candidate), candidate)
        return list(unique.values())
 # Module-level singleton
 registry = ExtractorRegistry()
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,6 +32,9 @@ capabilities = [
 development = [
    "kaizen-agentic @ file:./capabilities/kaizen-agentic"
 ]
 proxy-pdf = ["pymupdf4llm>=0.0.10"]
 proxy-html = ["markdownify>=0.13.1"]
 proxy = ["pymupdf4llm>=0.0.10", "markdownify>=0.13.1"]
 [project.scripts]
 markitect = "markitect.cli:main"