From ac334c679db0e39093a96f8f130a03040915c58b Mon Sep 17 00:00:00 2001 From: tegwick Date: Fri, 13 Feb 2026 19:06:09 +0100 Subject: [PATCH] feat(proxy): add proxy file system for non-markdown source conversion Introduces a new `markitect/proxy/` module with pluggable extractors that convert non-markdown sources (PDF, HTML) into tracked markdown proxy files. Proxy files preserve origin metadata (path, checksum, timestamp) so they can be kept in sync when the original changes. CLI commands: `proxy create`, `proxy update`, `proxy status`, `proxy extractors`. Built-in extractors: PDF (pymupdf4llm), HTML (markdownify), Markdown (built-in). Co-Authored-By: Claude Opus 4.6 --- markitect/cli.py | 7 + markitect/proxy/__init__.py | 39 ++++ markitect/proxy/cli.py | 185 +++++++++++++++++++ markitect/proxy/exceptions.py | 40 ++++ markitect/proxy/extractors/__init__.py | 14 ++ markitect/proxy/extractors/base.py | 43 +++++ markitect/proxy/extractors/html.py | 45 +++++ markitect/proxy/extractors/markdown.py | 29 +++ markitect/proxy/extractors/pdf.py | 44 +++++ markitect/proxy/generator.py | 241 +++++++++++++++++++++++++ markitect/proxy/models.py | 26 +++ markitect/proxy/registry.py | 65 +++++++ pyproject.toml | 3 + 13 files changed, 781 insertions(+) create mode 100644 markitect/proxy/__init__.py create mode 100644 markitect/proxy/cli.py create mode 100644 markitect/proxy/exceptions.py create mode 100644 markitect/proxy/extractors/__init__.py create mode 100644 markitect/proxy/extractors/base.py create mode 100644 markitect/proxy/extractors/html.py create mode 100644 markitect/proxy/extractors/markdown.py create mode 100644 markitect/proxy/extractors/pdf.py create mode 100644 markitect/proxy/generator.py create mode 100644 markitect/proxy/models.py create mode 100644 markitect/proxy/registry.py diff --git a/markitect/cli.py b/markitect/cli.py index ffd7fa9e..c26c07e7 100644 --- a/markitect/cli.py +++ b/markitect/cli.py @@ -7147,6 +7147,13 @@ try: except ImportError: pass # Helper 
module not available +# Register proxy file system commands +try: + from markitect.proxy.cli import proxy_group + cli.add_command(proxy_group) +except ImportError: + pass # Proxy module not available + # Make cli function available as main entry point main = cli diff --git a/markitect/proxy/__init__.py b/markitect/proxy/__init__.py new file mode 100644 index 00000000..a0543fc1 --- /dev/null +++ b/markitect/proxy/__init__.py @@ -0,0 +1,39 @@ +""" +markitect.proxy — Proxy file system for wrapping non-markdown sources. + +Creates markdown proxy files that track their origin (source path, +checksum, timestamp) so they can be kept up-to-date when the original +changes. + +Quick start:: + + from markitect.proxy import ProxyGenerator, registry + + # Ensure built-in extractors are registered + import markitect.proxy.extractors # noqa: F401 + + gen = ProxyGenerator(registry) + gen.create(Path("report.pdf"), Path("./output/")) +""" + +from markitect.proxy.models import ProxyMetadata, ExtractionResult +from markitect.proxy.exceptions import ( + ProxyError, + ExtractorNotFoundError, + DependencyMissingError, +) +from markitect.proxy.registry import ExtractorRegistry, registry +from markitect.proxy.generator import ProxyGenerator +from markitect.proxy.extractors.base import BaseExtractor + +__all__ = [ + "ProxyMetadata", + "ExtractionResult", + "ProxyError", + "ExtractorNotFoundError", + "DependencyMissingError", + "ExtractorRegistry", + "registry", + "ProxyGenerator", + "BaseExtractor", +] diff --git a/markitect/proxy/cli.py b/markitect/proxy/cli.py new file mode 100644 index 00000000..0d28b909 --- /dev/null +++ b/markitect/proxy/cli.py @@ -0,0 +1,185 @@ +""" +Click CLI commands for the proxy file system. 
@click.group("proxy")
def proxy_group():
    """Proxy file operations — create, update, and manage markdown proxies."""
    # NOTE: click renders the docstring above as the group's --help text, so it
    # is user-facing output. The group itself does no work.


@proxy_group.command("create")
@click.argument("source", type=click.Path(exists=True))
@click.option(
    "--output-dir", "-o",
    type=click.Path(),
    default=".",
    help="Directory to write the proxy file (default: current dir).",
)
@click.option("--force", "-f", is_flag=True, help="Overwrite existing proxy file.")
def proxy_create(source, output_dir, force):
    """Create a markdown proxy for SOURCE."""
    # Delegates to ProxyGenerator.create; any ProxyError is reported on stderr
    # and turned into exit status 1.
    gen = _make_generator()
    try:
        proxy_path = gen.create(Path(source), Path(output_dir), force=force)
    except ProxyError as exc:
        _abort(exc)
    else:
        click.echo(f"Created proxy: {proxy_path}")


@proxy_group.command("update")
@click.argument("target", type=click.Path(exists=True))
@click.option("--dry-run", is_flag=True, help="Show what would change without writing.")
def proxy_update(target, dry_run):
    """Re-extract a single proxy file or all proxies in a directory."""
    gen = _make_generator()
    target_path = Path(target).resolve()

    # Single file: a failure is fatal for the command.
    if target_path.is_file():
        try:
            _update_one(gen, target_path, dry_run)
        except ProxyError as exc:
            _abort(exc)
        return

    if not target_path.is_dir():
        _abort(f"{target} is not a file or directory.")

    proxies = _find_proxy_files(gen, target_path)
    if not proxies:
        click.echo("No proxy files found.")
        return

    # Directory: one broken proxy (e.g. no extractor for its source) must not
    # prevent the remaining proxies from being refreshed. Report each failure,
    # keep going, and signal the problem through the exit status at the end.
    failures = 0
    for proxy in proxies:
        try:
            _update_one(gen, proxy, dry_run)
        except ProxyError as exc:
            failures += 1
            click.echo(f"  {proxy.name}: error: {exc}", err=True)
    if failures:
        raise SystemExit(1)


@proxy_group.command("status")
@click.argument("directory", type=click.Path(exists=True), default=".")
@click.option(
    "--format", "output_format",
    type=click.Choice(["table", "json"]),
    default="table",
    help="Output format.",
)
def proxy_status(directory, output_format):
    """Show freshness of all proxy files in DIRECTORY (default: current dir)."""
    gen = _make_generator()

    try:
        results = gen.bulk_status(Path(directory))
    except ProxyError as exc:
        _abort(exc)

    if not results:
        click.echo("No proxy files found.")
        return

    if output_format == "json":
        click.echo(json.dumps(results, indent=2))
    else:
        _print_status_table(results)


@proxy_group.command("extractors")
@click.option(
    "--format", "output_format",
    type=click.Choice(["table", "json"]),
    default="table",
    help="Output format.",
)
def proxy_extractors(output_format):
    """List registered extractors and their dependency status."""
    # Only the registry is needed here; importing the extractors package is
    # what registers the built-in extractors.
    from markitect.proxy.registry import registry
    import markitect.proxy.extractors  # noqa: F401

    extractors = registry.list_extractors()

    if output_format == "json":
        rows = [
            {
                "name": e.name,
                "version": e.version,
                "extensions": list(e.extensions),
                "installed": e.check_dependencies(),
                "hint": e.dependency_hint(),
            }
            for e in extractors
        ]
        click.echo(json.dumps(rows, indent=2))
    else:
        _print_extractor_table(extractors)


# ------------------------------------------------------------------
# helpers
# ------------------------------------------------------------------

def _make_generator():
    """Return a ProxyGenerator bound to the global registry.

    The imports are deliberately local: the CLI group must still register
    when the proxy dependencies are absent, so nothing heavy is imported at
    module import time. Importing ``markitect.proxy.extractors`` registers
    the built-in extractors as a side effect.
    """
    from markitect.proxy.registry import registry
    import markitect.proxy.extractors  # noqa: F401 — registers built-ins
    from markitect.proxy.generator import ProxyGenerator

    return ProxyGenerator(registry)


def _abort(message):
    """Print ``Error: <message>`` to stderr and exit with status 1."""
    click.echo(f"Error: {message}", err=True)
    raise SystemExit(1)


def _update_one(gen, proxy_path, dry_run):
    """Update a single proxy file, respecting --dry-run.

    Prints one indented status line for the proxy. Proxies that are already
    current or whose source is missing are reported and left untouched.

    Args:
        gen: ProxyGenerator used for the status check and the update.
        proxy_path: Path of the proxy file.
        dry_run: When true, report a stale proxy without rewriting it.

    Raises:
        ProxyError: Propagated from ``gen.status`` / ``gen.update``.
    """
    info = gen.status(proxy_path)
    if info["status"] == "current":
        click.echo(f"  {proxy_path.name}: current")
        return
    if info["status"] == "missing-source":
        click.echo(f"  {proxy_path.name}: source missing")
        return
    if dry_run:
        click.echo(f"  {proxy_path.name}: stale (would update)")
        return
    # update() re-checks the checksum itself, so it can still report "current".
    updated = gen.update(proxy_path)
    label = "updated" if updated else "current"
    click.echo(f"  {proxy_path.name}: {label}")


def _find_proxy_files(gen, directory):
    """Return the sorted proxy file paths found recursively in *directory*.

    A file counts as a proxy when its frontmatter parses and contains
    ``proxy: true``; unreadable or unparsable files are skipped.
    """
    # NOTE(review): relies on the generator's private _try_read_proxy helper;
    # consider exposing a public "is this a proxy?" check on ProxyGenerator.
    results = []
    for path in sorted(directory.rglob("*.md")):
        meta, _ = gen._try_read_proxy(path)
        if meta is not None and meta.get("proxy") is True:
            results.append(path)
    return results


def _print_status_table(results):
    """Print one fixed-width row per status dict (proxy, source, status, extractor)."""
    click.echo(f"{'Proxy':<40} {'Source':<30} {'Status':<15} {'Extractor':<10}")
    click.echo("-" * 95)
    for r in results:
        # Only the file name is shown; the "proxy" value is a full path.
        proxy_name = Path(r["proxy"]).name
        click.echo(
            f"{proxy_name:<40} {r['source']:<30} {r['status']:<15} {r['extractor']:<10}"
        )


def _print_extractor_table(extractors):
    """Print one fixed-width row per extractor, with its dependency status."""
    click.echo(f"{'Name':<15} {'Version':<10} {'Extensions':<25} {'Status':<10}")
    click.echo("-" * 60)
    for e in extractors:
        exts = ", ".join(e.extensions)
        status = "installed" if e.check_dependencies() else "missing"
        click.echo(f"{e.name:<15} {e.version:<10} {exts:<25} {status:<10}")
class ProxyError(MarkitectError):
    """Root of the exception hierarchy for proxy file operations."""


class ExtractorNotFoundError(ProxyError):
    """Raised when no extractor is registered for a file extension."""


class DependencyMissingError(ProxyError):
    """Raised when an extractor's optional dependency is not installed.

    Args:
        message: Human-readable description of the problem.
        package: Name of the missing Python package.
        install_hint: Suggested command for installing the package.
        cause: Optional underlying exception, forwarded to the base class.
        context: Optional structured details, forwarded to the base class.

    Attributes:
        package: The missing Python package name.
        install_hint: Suggested pip install command.
    """

    def __init__(
        self,
        message: str,
        package: str = "",
        install_hint: str = "",
        cause: Optional[Exception] = None,
        context: Optional[Dict[str, Any]] = None,
    ):
        # The base class handles message/cause/context; this subclass only
        # adds the two dependency-specific attributes.
        super().__init__(message, cause=cause, context=context)
        self.package, self.install_hint = package, install_hint
class BaseExtractor(ABC):
    """Interface every proxy extractor implements.

    Subclasses set the three class attributes below and provide
    ``extract`` and ``check_dependencies``; ``dependency_hint`` is optional.
    """

    # Short identifier of the extractor.
    name: str = ""
    # Version string of the extractor implementation.
    version: str = "1.0"
    # File extensions this extractor declares support for, e.g. (".pdf",).
    extensions: tuple = ()

    @abstractmethod
    def check_dependencies(self) -> bool:
        """Report whether everything the extractor needs is importable.

        Returns:
            True if all dependencies are installed, False otherwise.
        """

    @abstractmethod
    def extract(self, source_path: Path) -> "ExtractionResult":
        """Convert the file at *source_path* to markdown.

        Args:
            source_path: Path to the source file.

        Returns:
            ExtractionResult with the extracted markdown content.
        """

    def dependency_hint(self) -> str:
        """Return install instructions for missing dependencies.

        The default is an empty string; extractors with optional
        dependencies override it with something like
        ``pip install markitect[proxy-pdf]``.
        """
        return ""
class HtmlExtractor(BaseExtractor):
    """Converts HTML files to Markdown via markdownify."""

    name = "html"
    version = "1.0"
    extensions = (".html", ".htm")

    def check_dependencies(self) -> bool:
        """Return True when the optional ``markdownify`` package imports."""
        try:
            import markdownify  # noqa: F401
        except ImportError:
            return False
        return True

    def dependency_hint(self) -> str:
        """Return the pip command that installs the HTML extractor's dependency."""
        return 'pip install "markitect[proxy-html]" (or: pip install markdownify)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Read *source_path* as UTF-8 HTML and convert it to Markdown.

        Args:
            source_path: Path to the HTML file.

        Returns:
            ExtractionResult holding the Markdown text (ATX-style headings)
            plus this extractor's name and version.

        Raises:
            DependencyMissingError: If ``markdownify`` is not installed.
        """
        if not self.check_dependencies():
            raise DependencyMissingError(
                "markdownify is required to extract HTML files.",
                package="markdownify",
                install_hint=self.dependency_hint(),
            )

        # Imported late so the module loads without the optional dependency.
        import markdownify

        # "utf-8-sig" decodes plain UTF-8 identically but drops a leading
        # byte-order mark, which would otherwise be carried into the output
        # as a stray U+FEFF character.
        html_content = source_path.read_text(encoding="utf-8-sig")
        md_text = markdownify.markdownify(html_content, heading_style="ATX")
        return ExtractionResult(
            content=md_text,
            extractor=self.name,
            extractor_version=self.version,
        )
class MarkdownNormalizer(BaseExtractor):
    """Pass-through extractor for Markdown sources (built-in, no optional deps).

    The file's text is returned as-is apart from removal of a leading
    byte-order mark; no other normalization is performed.
    """

    name = "markdown"
    version = "1.0"
    extensions = (".md", ".markdown", ".mdown")

    def check_dependencies(self) -> bool:
        """Always True: this extractor needs nothing beyond the standard library."""
        return True

    def extract(self, source_path: Path) -> ExtractionResult:
        """Return the Markdown text of *source_path* wrapped in an ExtractionResult."""
        # "utf-8-sig" reads plain UTF-8 unchanged but strips a leading BOM so
        # it does not end up as a stray U+FEFF inside the generated proxy.
        content = source_path.read_text(encoding="utf-8-sig")
        return ExtractionResult(
            content=content,
            extractor=self.name,
            extractor_version=self.version,
        )


class PdfExtractor(BaseExtractor):
    """Extracts markdown from PDF files via pymupdf4llm."""

    name = "pdf"
    version = "1.0"
    extensions = (".pdf",)

    def check_dependencies(self) -> bool:
        """Return True when the optional ``pymupdf4llm`` package imports."""
        try:
            import pymupdf4llm  # noqa: F401
        except ImportError:
            return False
        return True

    def dependency_hint(self) -> str:
        """Return the pip command that installs the PDF extractor's dependency."""
        return 'pip install "markitect[proxy-pdf]" (or: pip install pymupdf4llm)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the PDF at *source_path* to Markdown.

        Args:
            source_path: Path to the PDF file.

        Returns:
            ExtractionResult holding the Markdown produced by
            ``pymupdf4llm.to_markdown`` plus this extractor's name and version.

        Raises:
            DependencyMissingError: If ``pymupdf4llm`` is not installed.
        """
        if not self.check_dependencies():
            raise DependencyMissingError(
                "pymupdf4llm is required to extract PDF files.",
                package="pymupdf4llm",
                install_hint=self.dependency_hint(),
            )

        # Imported late so the module loads without the optional dependency.
        import pymupdf4llm

        md_text = pymupdf4llm.to_markdown(str(source_path))
        return ExtractionResult(
            content=md_text,
            extractor=self.name,
            extractor_version=self.version,
        )
class ProxyGenerator:
    """Creates and manages markdown proxy files.

    A proxy is a markdown file whose YAML frontmatter carries ``proxy: true``
    plus origin metadata (source path relative to the proxy, checksum, size,
    generation timestamp, extractor name and version), followed by a heading,
    a provenance line and the extracted content.

    Args:
        registry: Registry used to look up the extractor for a source file.
    """

    def __init__(self, registry: "ExtractorRegistry"):
        self.registry = registry

    # ------------------------------------------------------------------
    # create
    # ------------------------------------------------------------------

    def create(self, source: Path, output_dir: Path, force: bool = False) -> Path:
        """Create a proxy markdown file for *source*.

        The proxy is written to ``<output_dir>/<source file name>.md``;
        *output_dir* is created if necessary.

        Args:
            source: Path to the original file (e.g. ``report.pdf``).
            output_dir: Directory where the proxy file will be written.
            force: If True, overwrite an existing proxy file.

        Returns:
            Path to the created proxy file.

        Raises:
            ProxyError: If the source doesn't exist, the proxy already exists
                and *force* is false, or the extractor's dependency is missing
                (raised as the ``DependencyMissingError`` subclass).
        """
        source = source.resolve()
        if not source.is_file():
            raise ProxyError(
                f"Source file does not exist: {source}",
                context={"source": str(source)},
            )

        output_dir = output_dir.resolve()
        output_dir.mkdir(parents=True, exist_ok=True)

        proxy_path = output_dir / f"{source.name}.md"
        if proxy_path.exists() and not force:
            raise ProxyError(
                f"Proxy file already exists: {proxy_path} (use --force to overwrite)",
                context={"proxy": str(proxy_path)},
            )

        extractor = self.registry.get_extractor_for_file(source)
        self._require_dependencies(extractor)
        result = extractor.extract(source)

        checksum = f"sha256:{ContentHasher.hash_file(source)}"
        rel_source = self._relative_source(source, output_dir)
        meta = self._build_metadata(source, rel_source, checksum, result)

        self._write_proxy(proxy_path, meta, source.name, result.content)
        logger.info("Created proxy %s -> %s", proxy_path.name, rel_source)
        return proxy_path

    # ------------------------------------------------------------------
    # update
    # ------------------------------------------------------------------

    def update(self, proxy_path: Path) -> bool:
        """Re-extract the proxy file if its source has changed.

        The source is considered changed when its current checksum differs
        from the ``source_checksum`` recorded in the proxy's frontmatter.

        Args:
            proxy_path: Path to an existing proxy file.

        Returns:
            True if the proxy was updated, False if already current.

        Raises:
            ProxyError: If *proxy_path* is not a proxy file, its source is
                missing, or the extractor's dependency is missing.
        """
        proxy_path = proxy_path.resolve()
        meta, _ = self._read_proxy(proxy_path)

        # .get() rather than indexing: a proxy whose frontmatter lacks
        # source_path must yield the ProxyError below, not a bare KeyError.
        rel_source = meta.get("source_path", "")
        source = self._resolve_source(proxy_path, meta)
        if not source.is_file():
            raise ProxyError(
                f"Source file missing: {source}",
                context={"source_path": rel_source},
            )

        current_checksum = f"sha256:{ContentHasher.hash_file(source)}"
        if current_checksum == meta.get("source_checksum"):
            return False

        # Same lookup path as create(), so both operations pick the extractor
        # identically.
        extractor = self.registry.get_extractor_for_file(source)
        self._require_dependencies(extractor)
        result = extractor.extract(source)

        new_meta = self._build_metadata(source, rel_source, current_checksum, result)
        self._write_proxy(proxy_path, new_meta, source.name, result.content)
        logger.info("Updated proxy %s", proxy_path.name)
        return True

    # ------------------------------------------------------------------
    # status
    # ------------------------------------------------------------------

    def status(self, proxy_path: Path) -> Dict:
        """Check a single proxy file's freshness.

        Returns a dict with keys: proxy, source, status, extractor.
        Status is one of: ``current``, ``stale``, ``missing-source``.

        Raises:
            ProxyError: If *proxy_path* is not a proxy file.
        """
        proxy_path = proxy_path.resolve()
        meta, _ = self._read_proxy(proxy_path)

        source = self._resolve_source(proxy_path, meta)
        if not source.is_file():
            state = "missing-source"
        elif f"sha256:{ContentHasher.hash_file(source)}" == meta.get("source_checksum"):
            state = "current"
        else:
            state = "stale"

        return {
            "proxy": str(proxy_path),
            "source": meta.get("source_path", ""),
            "status": state,
            "extractor": meta.get("extractor", ""),
        }

    def bulk_status(self, directory: Path) -> List[Dict]:
        """Scan *directory* recursively for proxy files and return their statuses.

        Only ``*.md`` files whose frontmatter contains ``proxy: true`` are
        reported; files that cannot be read or parsed are skipped. Results
        follow the sorted order of the file paths.
        """
        directory = directory.resolve()
        results = []
        for path in sorted(directory.rglob("*.md")):
            meta, _ = self._try_read_proxy(path)
            if meta is not None and meta.get("proxy") is True:
                results.append(self.status(path))
        return results

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _require_dependencies(extractor) -> None:
        """Raise if *extractor* reports that its dependencies are missing.

        Raises:
            DependencyMissingError: A ``ProxyError`` subclass, so existing
                ``except ProxyError`` handlers keep working while callers
                that care can read ``install_hint``.
        """
        if extractor.check_dependencies():
            return
        # Local import: the module's top-level imports only bring in ProxyError.
        from markitect.proxy.exceptions import DependencyMissingError

        hint = extractor.dependency_hint()
        raise DependencyMissingError(
            f"Missing dependency for {extractor.name} extractor. {hint}",
            install_hint=hint,
        )

    @staticmethod
    def _relative_source(source: Path, output_dir: Path) -> str:
        """Return *source* relative to *output_dir*, or absolute if impossible.

        ``os.path.relpath`` raises ValueError on Windows when the two paths
        are on different drives; the absolute path is stored in that case,
        which ``_resolve_source`` handles because joining onto an absolute
        path yields that absolute path.
        """
        try:
            return os.path.relpath(source, output_dir)
        except ValueError:
            return str(source)

    @staticmethod
    def _build_metadata(
        source: Path,
        rel_source: str,
        checksum: str,
        result,
    ) -> "ProxyMetadata":
        """Assemble the frontmatter metadata for a freshly extracted proxy.

        Args:
            source: Resolved path of the source file (used for its size).
            rel_source: Source path to record in the proxy.
            checksum: Checksum string to record, already ``sha256:``-prefixed.
            result: Extraction result supplying extractor name and version.
        """
        return ProxyMetadata(
            source_path=rel_source,
            source_checksum=checksum,
            source_size=source.stat().st_size,
            generated_at=datetime.now(timezone.utc).isoformat(),
            extractor=result.extractor,
            extractor_version=result.extractor_version,
        )

    @staticmethod
    def _write_proxy(
        proxy_path: Path,
        meta: "ProxyMetadata",
        source_name: str,
        content: str,
    ) -> None:
        """Write the proxy file: frontmatter, heading, provenance line, content."""
        fm = {
            "proxy": True,
            "source_path": meta.source_path,
            "source_checksum": meta.source_checksum,
            "source_size": meta.source_size,
            "generated_at": meta.generated_at,
            "extractor": meta.extractor,
            "extractor_version": meta.extractor_version,
        }
        # sort_keys=False keeps the key order above in the written file.
        fm_text = yaml.dump(fm, default_flow_style=False, sort_keys=False)
        body = (
            f"---\n{fm_text}---\n\n"
            f"# {source_name}\n\n"
            f"*Proxy generated from `{meta.source_path}`*\n\n"
            f"{content}"
        )
        proxy_path.write_text(body, encoding="utf-8")

    @staticmethod
    def _read_proxy(proxy_path: Path):
        """Read and parse an existing proxy file.

        Returns:
            Tuple of (frontmatter dict, body str).

        Raises:
            ProxyError: If the frontmatter is absent/empty or lacks a truthy
                ``proxy`` entry.
        """
        raw = proxy_path.read_text(encoding="utf-8")
        meta, body = _frontmatter_parser.parse(raw)
        # `not meta` also covers a parser that reports "no frontmatter" as
        # None or {} — TODO confirm which one FrontMatterParser returns.
        if not meta or not meta.get("proxy"):
            raise ProxyError(
                f"Not a proxy file (missing 'proxy: true' in frontmatter): {proxy_path}",
                context={"path": str(proxy_path)},
            )
        return meta, body

    @staticmethod
    def _try_read_proxy(path: Path):
        """Attempt to read a proxy file, returning (None, None) on failure."""
        try:
            raw = path.read_text(encoding="utf-8")
            meta, body = _frontmatter_parser.parse(raw)
            return meta, body
        except Exception:
            # Deliberately best-effort: directory scans must survive unreadable
            # or malformed markdown files. Leave a trace for debugging.
            logger.debug("Skipping unreadable markdown file %s", path, exc_info=True)
            return None, None

    @staticmethod
    def _resolve_source(proxy_path: Path, meta: Dict) -> Path:
        """Resolve the source path relative to the proxy file's directory.

        A missing ``source_path`` resolves to the proxy's own directory, which
        callers then treat as a missing source because it is not a file.
        """
        source_rel = meta.get("source_path", "")
        return (proxy_path.parent / source_rel).resolve()
logger = logging.getLogger("markitect.proxy.registry")


class ExtractorRegistry:
    """Maps file extensions to their corresponding extractors.

    Extensions are stored lower-cased with a leading dot, so ``"PDF"``,
    ``"pdf"`` and ``".pdf"`` all refer to the same entry. Registering an
    extension that is already present replaces the earlier extractor.
    """

    def __init__(self):
        self._extractors: Dict[str, "BaseExtractor"] = {}

    @staticmethod
    def _normalize(extension: str) -> str:
        """Return *extension* lower-cased and with a leading dot.

        An empty string (a path without a suffix) is returned unchanged.
        """
        ext = extension.lower()
        if ext and not ext.startswith("."):
            ext = f".{ext}"
        return ext

    def register(self, extractor: "BaseExtractor") -> None:
        """Register an extractor for all of its declared extensions."""
        for ext in extractor.extensions:
            # Normalizing here means an extractor that declares "pdf" is still
            # found by the ".pdf" suffix lookups done for real files.
            key = self._normalize(ext)
            self._extractors[key] = extractor
            logger.debug("Registered %s extractor for %s", extractor.name, key)

    def get_extractor(self, extension: str) -> "BaseExtractor":
        """Look up an extractor by file extension (e.g. ``'.pdf'``).

        The extension is matched case-insensitively and the leading dot is
        optional.

        Raises:
            ExtractorNotFoundError: If no extractor handles the extension.
        """
        key = self._normalize(extension)
        try:
            return self._extractors[key]
        except KeyError:
            known = ", ".join(sorted(self._extractors))
            # `from None`: the internal KeyError adds nothing to the report.
            raise ExtractorNotFoundError(
                f"No extractor registered for {key!r}. "
                f"Supported extensions: {known}",
                context={"extension": key},
            ) from None

    def get_extractor_for_file(self, path: Path) -> "BaseExtractor":
        """Look up an extractor for a file based on its suffix."""
        return self.get_extractor(path.suffix)

    def list_extractors(self) -> "List[BaseExtractor]":
        """Return a de-duplicated list of registered extractors.

        An extractor registered for several extensions appears once, at the
        position of its first registered extension.
        """
        # Keyed by identity so extractors need not be hashable or comparable;
        # dicts preserve insertion order, which keeps first-seen ordering.
        unique = {}
        for extractor in self._extractors.values():
            unique.setdefault(id(extractor), extractor)
        return list(unique.values())


# Module-level singleton
registry = ExtractorRegistry()