feat(proxy): add proxy file system for non-markdown source conversion

Introduces a new `markitect/proxy/` module with pluggable extractors that
convert non-markdown sources (PDF, HTML) into tracked markdown proxy files.
Proxy files preserve origin metadata (path, checksum, timestamp) so they
can be kept in sync when the original changes.

CLI commands: `proxy create`, `proxy update`, `proxy status`, `proxy extractors`.
Built-in extractors: PDF (pymupdf4llm), HTML (markdownify), Markdown (built-in).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-13 19:06:09 +01:00
parent 69aea1ada7
commit ac334c679d
13 changed files with 781 additions and 0 deletions

View File

@@ -7147,6 +7147,13 @@ try:
except ImportError:
pass # Helper module not available
# Register proxy file system commands.
# The import is attempted here so that a failing import (ImportError) only
# leaves the `proxy` command group unregistered instead of breaking the CLI.
try:
    from markitect.proxy.cli import proxy_group
    cli.add_command(proxy_group)
except ImportError:
    pass  # Proxy module not available
# Make cli function available as main entry point
main = cli

View File

@@ -0,0 +1,39 @@
"""
markitect.proxy — Proxy file system for wrapping non-markdown sources.
Creates markdown proxy files that track their origin (source path,
checksum, timestamp) so they can be kept up-to-date when the original
changes.
Quick start::

    from pathlib import Path

    from markitect.proxy import ProxyGenerator, registry
    # Ensure built-in extractors are registered
    import markitect.proxy.extractors  # noqa: F401

    gen = ProxyGenerator(registry)
    gen.create(Path("report.pdf"), Path("./output/"))
"""
from markitect.proxy.models import ProxyMetadata, ExtractionResult
from markitect.proxy.exceptions import (
ProxyError,
ExtractorNotFoundError,
DependencyMissingError,
)
from markitect.proxy.registry import ExtractorRegistry, registry
from markitect.proxy.generator import ProxyGenerator
from markitect.proxy.extractors.base import BaseExtractor
__all__ = [
"ProxyMetadata",
"ExtractionResult",
"ProxyError",
"ExtractorNotFoundError",
"DependencyMissingError",
"ExtractorRegistry",
"registry",
"ProxyGenerator",
"BaseExtractor",
]

185
markitect/proxy/cli.py Normal file
View File

@@ -0,0 +1,185 @@
"""
Click CLI commands for the proxy file system.
"""
import json
from pathlib import Path
import click
from markitect.proxy.exceptions import ProxyError
# Root of the `proxy` sub-commands; the docstring below is what click shows
# as the group's help text, so it is user-facing.
@click.group("proxy")
def proxy_group():
    """Proxy file operations — create, update, and manage markdown proxies."""
@proxy_group.command("create")
@click.argument("source", type=click.Path(exists=True))
@click.option(
    "--output-dir", "-o",
    type=click.Path(),
    default=".",
    help="Directory to write the proxy file (default: current dir).",
)
@click.option("--force", "-f", is_flag=True, help="Overwrite existing proxy file.")
def proxy_create(source, output_dir, force):
    """Create a markdown proxy for SOURCE."""
    # Imported inside the command so that registering the CLI group does not
    # require the generator's dependencies to be importable.
    import markitect.proxy.extractors  # noqa: F401 — registers built-ins
    from markitect.proxy.generator import ProxyGenerator
    from markitect.proxy.registry import registry

    generator = ProxyGenerator(registry)
    try:
        created = generator.create(Path(source), Path(output_dir), force=force)
    except ProxyError as exc:
        # Proxy failures are reported on stderr and mapped to exit status 1.
        click.echo(f"Error: {exc}", err=True)
        raise SystemExit(1)
    click.echo(f"Created proxy: {created}")
@proxy_group.command("update")
@click.argument("target", type=click.Path(exists=True))
@click.option("--dry-run", is_flag=True, help="Show what would change without writing.")
def proxy_update(target, dry_run):
    """Re-extract a single proxy file or all proxies in a directory."""
    import markitect.proxy.extractors  # noqa: F401
    from markitect.proxy.generator import ProxyGenerator
    from markitect.proxy.registry import registry

    generator = ProxyGenerator(registry)
    resolved = Path(target).resolve()

    # Work out which proxy files to process before touching any of them.
    if resolved.is_file():
        pending = [resolved]
    elif resolved.is_dir():
        pending = _find_proxy_files(generator, resolved)
        if not pending:
            click.echo("No proxy files found.")
            return
    else:
        click.echo(f"Error: {target} is not a file or directory.", err=True)
        raise SystemExit(1)

    try:
        for proxy_file in pending:
            _update_one(generator, proxy_file, dry_run)
    except ProxyError as exc:
        # The first proxy failure stops the run and is reported on stderr.
        click.echo(f"Error: {exc}", err=True)
        raise SystemExit(1)
@proxy_group.command("status")
@click.argument("directory", type=click.Path(exists=True), default=".")
@click.option(
    "--format", "output_format",
    type=click.Choice(["table", "json"]),
    default="table",
    help="Output format.",
)
def proxy_status(directory, output_format):
    """Show freshness of all proxy files in DIRECTORY (default: current dir)."""
    import markitect.proxy.extractors  # noqa: F401
    from markitect.proxy.generator import ProxyGenerator
    from markitect.proxy.registry import registry

    try:
        statuses = ProxyGenerator(registry).bulk_status(Path(directory))
    except ProxyError as exc:
        click.echo(f"Error: {exc}", err=True)
        raise SystemExit(1)

    if not statuses:
        click.echo("No proxy files found.")
        return

    # JSON output is the raw list of status dicts; anything else is the table.
    if output_format == "json":
        click.echo(json.dumps(statuses, indent=2))
        return
    _print_status_table(statuses)
@proxy_group.command("extractors")
@click.option(
    "--format", "output_format",
    type=click.Choice(["table", "json"]),
    default="table",
    help="Output format.",
)
def proxy_extractors(output_format):
    """List registered extractors and their dependency status."""
    import markitect.proxy.extractors  # noqa: F401
    from markitect.proxy.registry import registry

    available = registry.list_extractors()
    if output_format != "json":
        _print_extractor_table(available)
        return

    # One JSON object per extractor, keys in a fixed order.
    payload = []
    for extractor in available:
        payload.append(
            {
                "name": extractor.name,
                "version": extractor.version,
                "extensions": list(extractor.extensions),
                "installed": extractor.check_dependencies(),
                "hint": extractor.dependency_hint(),
            }
        )
    click.echo(json.dumps(payload, indent=2))
# ------------------------------------------------------------------
# helpers
# ------------------------------------------------------------------
def _update_one(gen, proxy_path, dry_run):
    """Report on, and unless *dry_run* is set refresh, one proxy file.

    Proxies whose status is ``current`` or ``missing-source`` are only
    reported. Any other status is treated as stale: with *dry_run* the
    pending update is announced, otherwise ``gen.update`` is called and its
    boolean result decides the printed label.
    """
    state = gen.status(proxy_path)["status"]
    name = proxy_path.name

    # Statuses that never lead to a rewrite, mapped to their printed label.
    skip_labels = {"current": "current", "missing-source": "source missing"}
    if state in skip_labels:
        click.echo(f" {name}: {skip_labels[state]}")
        return

    if dry_run:
        click.echo(f" {name}: stale (would update)")
        return

    outcome = "updated" if gen.update(proxy_path) else "current"
    click.echo(f" {name}: {outcome}")
def _find_proxy_files(gen, directory):
    """Return the proxy files found under *directory*, sorted by path.

    Every ``*.md`` file below *directory* (recursively) is handed to
    ``gen._try_read_proxy``; a file counts as a proxy only when its front
    matter has ``proxy`` set to exactly ``True``.

    Args:
        gen: Object providing ``_try_read_proxy(path) -> (meta, body)``.
        directory: ``pathlib.Path`` of the directory to scan.

    Returns:
        List of paths of the proxy files, in sorted order.
    """
    found = []
    for candidate in sorted(directory.rglob("*.md")):
        meta, _ = gen._try_read_proxy(candidate)
        # Front matter may be missing (None) or may not be a mapping at all
        # (for example a YAML list); such files are simply not proxies and
        # must not abort the scan of the remaining files.
        lookup = getattr(meta, "get", None)
        if callable(lookup) and lookup("proxy") is True:
            found.append(candidate)
    return found
def _print_status_table(results):
    """Print one fixed-width row per status dict in *results*.

    Each entry is expected to carry the keys ``proxy``, ``source``,
    ``status`` and ``extractor``; only the file name of ``proxy`` is shown.
    """
    # Column widths: proxy 40, source 30, status 15, extractor 10.
    row = "{:<40} {:<30} {:<15} {:<10}".format

    click.echo(row("Proxy", "Source", "Status", "Extractor"))
    click.echo("-" * 95)
    for entry in results:
        click.echo(
            row(
                Path(entry["proxy"]).name,
                entry["source"],
                entry["status"],
                entry["extractor"],
            )
        )
def _print_extractor_table(extractors):
    """Print one fixed-width row per extractor with its dependency status.

    The status column reads ``installed`` when the extractor's
    ``check_dependencies()`` returns a true value and ``missing`` otherwise.
    """
    # Column widths: name 15, version 10, extensions 25, status 10.
    row = "{:<15} {:<10} {:<25} {:<10}".format

    click.echo(row("Name", "Version", "Extensions", "Status"))
    click.echo("-" * 60)
    for extractor in extractors:
        joined_exts = ", ".join(extractor.extensions)
        availability = "installed" if extractor.check_dependencies() else "missing"
        click.echo(row(extractor.name, extractor.version, joined_exts, availability))

View File

@@ -0,0 +1,40 @@
"""
Proxy-specific exceptions.
Extends the MarkitectError hierarchy for proxy file operations.
"""
from typing import Optional, Dict, Any
from markitect.exceptions import MarkitectError
class ProxyError(MarkitectError):
    """Root of the proxy exception hierarchy.

    Catching this type covers every error raised deliberately by the proxy
    package, including its more specific subclasses.
    """
class ExtractorNotFoundError(ProxyError):
    """Raised when no extractor is registered for a file extension."""
class DependencyMissingError(ProxyError):
    """Raised when an extractor's optional dependency is not installed.

    Attributes:
        package: Name of the Python package that is missing.
        install_hint: Suggested command for installing it.
    """

    def __init__(
        self,
        message: str,
        package: str = "",
        install_hint: str = "",
        cause: Optional[Exception] = None,
        context: Optional[Dict[str, Any]] = None,
    ):
        """Store the dependency details on top of the base error fields.

        Args:
            message: Human-readable description of the failure.
            package: Name of the missing package (empty if unknown).
            install_hint: Install instructions to show the user.
            cause: Optional underlying exception, forwarded to the base class.
            context: Optional extra details, forwarded to the base class.
        """
        # message/cause/context are handled by the base class; only the
        # dependency-specific fields are kept here.
        super().__init__(message, cause=cause, context=context)
        self.package, self.install_hint = package, install_hint

View File

@@ -0,0 +1,14 @@
"""
Built-in extractor registration.
Importing this module registers all built-in extractors with the global registry.
"""
from markitect.proxy.registry import registry
from markitect.proxy.extractors.pdf import PdfExtractor
from markitect.proxy.extractors.html import HtmlExtractor
from markitect.proxy.extractors.markdown import MarkdownNormalizer
# Import-time side effect: one instance of each built-in extractor is added
# to the shared module-level `registry`. Constructing the extractors here
# does not import their optional third-party packages; those are imported
# inside the extractors' own methods.
registry.register(PdfExtractor())
registry.register(HtmlExtractor())
registry.register(MarkdownNormalizer())

View File

@@ -0,0 +1,43 @@
"""
Abstract base class for proxy file extractors.
"""
from abc import ABC, abstractmethod
from pathlib import Path
from markitect.proxy.models import ExtractionResult
class BaseExtractor(ABC):
    """Interface every proxy extractor has to implement.

    Subclasses set the three class attributes below and provide
    :meth:`extract` and :meth:`check_dependencies`; :meth:`dependency_hint`
    is optional.
    """

    # Short identifier of the extractor.
    name: str = ""
    # Version string of the extractor implementation.
    version: str = "1.0"
    # File extensions handled by the extractor, e.g. (".pdf",).
    extensions: tuple = ()

    @abstractmethod
    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the file at *source_path* into markdown.

        Args:
            source_path: Location of the file to convert.

        Returns:
            An ExtractionResult holding the extracted markdown content.
        """

    @abstractmethod
    def check_dependencies(self) -> bool:
        """Tell whether everything this extractor needs is installed.

        Returns:
            True when all dependencies are available, False otherwise.
        """

    def dependency_hint(self) -> str:
        """Describe how to install missing dependencies.

        Returns:
            Install instructions such as ``pip install markitect[proxy-pdf]``;
            the base implementation returns an empty string.
        """
        return ""

View File

@@ -0,0 +1,45 @@
"""
HTML extractor using markdownify.
"""
from pathlib import Path
from markitect.proxy.extractors.base import BaseExtractor
from markitect.proxy.models import ExtractionResult
from markitect.proxy.exceptions import DependencyMissingError
class HtmlExtractor(BaseExtractor):
    """Extractor that turns ``.html`` / ``.htm`` files into Markdown.

    The conversion itself is delegated to the optional ``markdownify``
    package, which is imported only when it is actually needed.
    """

    name = "html"
    version = "1.0"
    extensions = (".html", ".htm")

    def check_dependencies(self) -> bool:
        """Return True when ``markdownify`` can be imported, else False."""
        try:
            import markdownify  # noqa: F401
        except ImportError:
            return False
        return True

    def dependency_hint(self) -> str:
        """Return the install instructions for the HTML extractor."""
        return 'pip install "markitect[proxy-html]" (or: pip install markdownify)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Read *source_path* as UTF-8 HTML and convert it to Markdown.

        Headings are emitted in ATX style (``heading_style="ATX"``).

        Raises:
            DependencyMissingError: If ``markdownify`` is not installed.
        """
        if not self.check_dependencies():
            raise DependencyMissingError(
                "markdownify is required to extract HTML files.",
                package="markdownify",
                install_hint=self.dependency_hint(),
            )

        from markdownify import markdownify as to_markdown

        raw_html = source_path.read_text(encoding="utf-8")
        return ExtractionResult(
            content=to_markdown(raw_html, heading_style="ATX"),
            extractor=self.name,
            extractor_version=self.version,
        )

View File

@@ -0,0 +1,29 @@
"""
Markdown normalizer — passes through Markdown with minimal transformation.
No external dependencies required.
"""
from pathlib import Path
from markitect.proxy.extractors.base import BaseExtractor
from markitect.proxy.models import ExtractionResult
class MarkdownNormalizer(BaseExtractor):
    """Built-in extractor for Markdown sources; needs no optional packages.

    The source text is currently passed through unchanged.
    """

    name = "markdown"
    version = "1.0"
    extensions = (".md", ".markdown", ".mdown")

    def check_dependencies(self) -> bool:
        """Always True: only the standard library is used."""
        return True

    def extract(self, source_path: Path) -> ExtractionResult:
        """Return the UTF-8 text of *source_path* as the extraction result."""
        markdown_text = source_path.read_text(encoding="utf-8")
        return ExtractionResult(
            content=markdown_text,
            extractor=self.name,
            extractor_version=self.version,
        )

View File

@@ -0,0 +1,44 @@
"""
PDF extractor using pymupdf4llm.
"""
from pathlib import Path
from markitect.proxy.extractors.base import BaseExtractor
from markitect.proxy.models import ExtractionResult
from markitect.proxy.exceptions import DependencyMissingError
class PdfExtractor(BaseExtractor):
    """Extractor that turns ``.pdf`` files into Markdown.

    The conversion is delegated to the optional ``pymupdf4llm`` package,
    which is imported only when it is actually needed.
    """

    name = "pdf"
    version = "1.0"
    extensions = (".pdf",)

    def check_dependencies(self) -> bool:
        """Return True when ``pymupdf4llm`` can be imported, else False."""
        try:
            import pymupdf4llm  # noqa: F401
        except ImportError:
            return False
        return True

    def dependency_hint(self) -> str:
        """Return the install instructions for the PDF extractor."""
        return 'pip install "markitect[proxy-pdf]" (or: pip install pymupdf4llm)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the PDF at *source_path* to Markdown.

        Raises:
            DependencyMissingError: If ``pymupdf4llm`` is not installed.
        """
        if not self.check_dependencies():
            raise DependencyMissingError(
                "pymupdf4llm is required to extract PDF files.",
                package="pymupdf4llm",
                install_hint=self.dependency_hint(),
            )

        from pymupdf4llm import to_markdown

        # The path is handed over as a plain string.
        markdown_text = to_markdown(str(source_path))
        return ExtractionResult(
            content=markdown_text,
            extractor=self.name,
            extractor_version=self.version,
        )

View File

@@ -0,0 +1,241 @@
"""
ProxyGenerator — create, update, and check status of proxy files.
"""
import logging
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List
import yaml
from markitect.assets.utils import ContentHasher
from markitect.frontmatter import FrontMatterParser
from markitect.proxy.exceptions import ProxyError
from markitect.proxy.models import ProxyMetadata
from markitect.proxy.registry import ExtractorRegistry
# Module logger; ProxyGenerator logs proxy creation and updates through it.
logger = logging.getLogger("markitect.proxy.generator")
# Single front-matter parser instance reused by every proxy read in this module.
_frontmatter_parser = FrontMatterParser()
class ProxyGenerator:
    """Creates and manages markdown proxy files.

    A proxy file is a markdown document whose YAML front matter contains
    ``proxy: true`` together with the origin metadata written by
    :meth:`_write_proxy` (source path relative to the proxy, source
    checksum and size, generation timestamp, extractor name and version).
    The checksum is what :meth:`status` and :meth:`update` compare against
    the current source file to decide whether a proxy is stale.
    """

    def __init__(self, registry: ExtractorRegistry):
        """Remember the registry used to look up extractors by file suffix."""
        self.registry = registry

    # ------------------------------------------------------------------
    # create
    # ------------------------------------------------------------------

    def create(self, source: Path, output_dir: Path, force: bool = False) -> Path:
        """Create a proxy markdown file for *source*.

        The proxy is written to ``<output_dir>/<source file name>.md``.

        Args:
            source: Path to the original file (e.g. ``report.pdf``).
            output_dir: Directory where the proxy file will be written; it is
                created (with parents) right before the proxy is written.
            force: If True, overwrite an existing proxy file.

        Returns:
            Path to the created proxy file.

        Raises:
            ProxyError: If the source is missing or not a regular file, the
                proxy already exists and *force* is false, no extractor is
                registered for the suffix, or the extractor's dependency is
                missing.
        """
        source = source.resolve()
        if not source.exists():
            raise ProxyError(
                f"Source file does not exist: {source}",
                context={"source": str(source)},
            )
        if not source.is_file():
            # e.g. a directory: it exists, so "does not exist" would mislead.
            raise ProxyError(
                f"Source is not a regular file: {source}",
                context={"source": str(source)},
            )

        output_dir = output_dir.resolve()
        proxy_path = output_dir / f"{source.name}.md"
        if proxy_path.exists() and not force:
            raise ProxyError(
                f"Proxy file already exists: {proxy_path} (use --force to overwrite)",
                context={"proxy": str(proxy_path)},
            )

        # Fail on a missing extractor/dependency before doing any real work.
        extractor = self._require_extractor(source)
        checksum = f"sha256:{ContentHasher.hash_file(source)}"
        rel_source = self._relative_source(source, output_dir)

        self._regenerate(proxy_path, source, rel_source, checksum, extractor)
        logger.info("Created proxy %s -> %s", proxy_path.name, rel_source)
        return proxy_path

    # ------------------------------------------------------------------
    # update
    # ------------------------------------------------------------------

    def update(self, proxy_path: Path) -> bool:
        """Re-extract the proxy file if its source has changed.

        Args:
            proxy_path: Path to an existing proxy file.

        Returns:
            True if the proxy was updated, False if already current.

        Raises:
            ProxyError: If *proxy_path* is not a proxy file, its source file
                is missing, or the extractor cannot be used.
        """
        proxy_path = proxy_path.resolve()
        meta, _ = self._read_proxy(proxy_path)
        source = self._resolve_source(proxy_path, meta)
        if not source.is_file():
            # .get(): the key may be absent, which is one way to end up here.
            raise ProxyError(
                f"Source file missing: {source}",
                context={"source_path": meta.get("source_path", "")},
            )

        current_checksum = f"sha256:{ContentHasher.hash_file(source)}"
        if current_checksum == meta.get("source_checksum"):
            return False

        extractor = self._require_extractor(source)
        # The recorded relative path is kept as-is; it resolved to a file above.
        rel_source = str(meta["source_path"])
        self._regenerate(proxy_path, source, rel_source, current_checksum, extractor)
        logger.info("Updated proxy %s", proxy_path.name)
        return True

    # ------------------------------------------------------------------
    # status
    # ------------------------------------------------------------------

    def status(self, proxy_path: Path) -> Dict:
        """Check a single proxy file's freshness.

        Returns a dict with keys: proxy, source, status, extractor.
        Status is one of: ``current``, ``stale``, ``missing-source``.

        Raises:
            ProxyError: If *proxy_path* is not a proxy file.
        """
        proxy_path = proxy_path.resolve()
        meta, _ = self._read_proxy(proxy_path)
        source = self._resolve_source(proxy_path, meta)

        if not source.is_file():
            state = "missing-source"
        else:
            current_checksum = f"sha256:{ContentHasher.hash_file(source)}"
            matches = current_checksum == meta.get("source_checksum")
            state = "current" if matches else "stale"

        return {
            "proxy": str(proxy_path),
            "source": meta.get("source_path", ""),
            "status": state,
            "extractor": meta.get("extractor", ""),
        }

    def bulk_status(self, directory: Path) -> List[Dict]:
        """Scan *directory* recursively for proxy files and return their statuses.

        Only ``*.md`` files whose front matter has ``proxy`` set to exactly
        ``True`` are reported; results follow sorted path order.
        """
        directory = directory.resolve()
        results = []
        for path in sorted(directory.rglob("*.md")):
            meta, _ = self._try_read_proxy(path)
            if meta is not None and meta.get("proxy") is True:
                results.append(self.status(path))
        return results

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------

    def _require_extractor(self, source: Path):
        """Return the extractor for *source*, ensuring its dependencies exist.

        Raises:
            ProxyError: If the extractor's ``check_dependencies()`` is false.
                Lookup failures are raised by the registry itself.
        """
        extractor = self.registry.get_extractor_for_file(source)
        if not extractor.check_dependencies():
            raise ProxyError(
                f"Missing dependency for {extractor.name} extractor. "
                f"{extractor.dependency_hint()}",
            )
        return extractor

    def _regenerate(
        self,
        proxy_path: Path,
        source: Path,
        rel_source: str,
        checksum: str,
        extractor,
    ) -> None:
        """Extract *source* and (re)write *proxy_path* with fresh metadata.

        *checksum* must have been computed before this call: if the source is
        modified while it is being extracted, the stored checksum then no
        longer matches and the proxy is reported stale, rather than looking
        current while holding outdated content.
        """
        source_size = source.stat().st_size
        result = extractor.extract(source)
        meta = ProxyMetadata(
            source_path=rel_source,
            source_checksum=checksum,
            source_size=source_size,
            generated_at=datetime.now(timezone.utc).isoformat(),
            extractor=result.extractor,
            extractor_version=result.extractor_version,
        )
        # Created only now so a failed extraction leaves no empty directory.
        proxy_path.parent.mkdir(parents=True, exist_ok=True)
        self._write_proxy(proxy_path, meta, source.name, result.content)

    @staticmethod
    def _relative_source(source: Path, base_dir: Path) -> str:
        """Return *source* relative to *base_dir*, or absolute if impossible.

        ``os.path.relpath`` raises ValueError on Windows when the two paths
        are on different drives; the absolute path is stored in that case.
        """
        try:
            return os.path.relpath(source, base_dir)
        except ValueError:
            return str(source)

    @staticmethod
    def _write_proxy(
        proxy_path: Path,
        meta: ProxyMetadata,
        source_name: str,
        content: str,
    ) -> None:
        """Write the proxy file: YAML front matter, a heading, then *content*."""
        fm = {
            "proxy": True,
            "source_path": meta.source_path,
            "source_checksum": meta.source_checksum,
            "source_size": meta.source_size,
            "generated_at": meta.generated_at,
            "extractor": meta.extractor,
            "extractor_version": meta.extractor_version,
        }
        # sort_keys=False keeps the keys in the order listed above.
        fm_text = yaml.dump(fm, default_flow_style=False, sort_keys=False)
        body = (
            f"---\n{fm_text}---\n\n"
            f"# {source_name}\n\n"
            f"*Proxy generated from `{meta.source_path}`*\n\n"
            f"{content}"
        )
        proxy_path.write_text(body, encoding="utf-8")

    @staticmethod
    def _read_proxy(proxy_path: Path):
        """Read and parse an existing proxy file.

        Returns:
            Tuple of (frontmatter dict, body str).

        Raises:
            ProxyError: If the front matter is absent, is not a mapping, or
                has no truthy ``proxy`` entry.
        """
        raw = proxy_path.read_text(encoding="utf-8")
        meta, body = _frontmatter_parser.parse(raw)
        # NOTE(review): what the parser returns for files without front
        # matter is not visible here, so non-mapping results are rejected
        # explicitly instead of failing on ``.get``.
        if not hasattr(meta, "get") or not meta.get("proxy"):
            raise ProxyError(
                f"Not a proxy file (missing 'proxy: true' in frontmatter): {proxy_path}",
                context={"path": str(proxy_path)},
            )
        return meta, body

    @staticmethod
    def _try_read_proxy(path: Path):
        """Attempt to read a proxy file, returning (None, None) on failure.

        Failure covers unreadable files, parse errors, and front matter that
        is not a mapping. Unlike :meth:`_read_proxy`, the ``proxy`` flag is
        not checked here; callers inspect the returned metadata themselves.
        """
        try:
            raw = path.read_text(encoding="utf-8")
            meta, body = _frontmatter_parser.parse(raw)
        except Exception:
            # Deliberately broad: this is a best-effort probe used while
            # scanning directories that may contain arbitrary markdown.
            return None, None
        if not hasattr(meta, "get"):
            return None, None
        return meta, body

    @staticmethod
    def _resolve_source(proxy_path: Path, meta: Dict) -> Path:
        """Resolve the source path relative to the proxy file's directory.

        A missing, empty or null ``source_path`` resolves to the proxy's own
        directory, which callers treat as "source is not a file".
        """
        source_rel = meta.get("source_path") or ""
        return (proxy_path.parent / str(source_rel)).resolve()

26
markitect/proxy/models.py Normal file
View File

@@ -0,0 +1,26 @@
"""
Data models for the proxy file system.
"""
from dataclasses import dataclass
@dataclass
class ProxyMetadata:
    """Origin information recorded in the YAML front matter of a proxy file."""

    # Path of the source file as recorded in the proxy.
    source_path: str
    # Checksum of the source, formatted as "sha256:<hex>".
    source_checksum: str
    # Size of the source file.
    source_size: int
    # Generation timestamp, ISO 8601.
    generated_at: str
    # Name of the extractor that produced the content.
    extractor: str
    # Version of that extractor.
    extractor_version: str
@dataclass
class ExtractionResult:
    """What an extractor hands back after processing a source file."""

    # Extracted markdown text.
    content: str
    # Name of the extractor that produced it.
    extractor: str
    # Version of that extractor.
    extractor_version: str

View File

@@ -0,0 +1,65 @@
"""
Extractor registry — register and look up extractors by file extension.
"""
from __future__ import annotations
import logging
from typing import Dict, List, TYPE_CHECKING
from pathlib import Path
from markitect.proxy.exceptions import ExtractorNotFoundError
if TYPE_CHECKING:
from markitect.proxy.extractors.base import BaseExtractor
logger = logging.getLogger("markitect.proxy.registry")


class ExtractorRegistry:
    """Maps file extensions to their corresponding extractors.

    Extensions are stored in a normalized form (lower case, with a leading
    dot), so ``"PDF"``, ``"pdf"`` and ``".pdf"`` all refer to the same entry
    both when registering and when looking up.
    """

    def __init__(self):
        # Normalized extension (e.g. ".pdf") -> extractor instance.
        self._extractors: Dict[str, BaseExtractor] = {}

    @staticmethod
    def _normalize(extension: str) -> str:
        """Lower-case *extension* and make sure it starts with a dot.

        The empty string (a file without suffix) is returned unchanged so it
        never matches a registered extension.
        """
        ext = extension.lower()
        if ext and not ext.startswith("."):
            ext = f".{ext}"
        return ext

    def register(self, extractor: BaseExtractor) -> None:
        """Register an extractor for all of its declared extensions.

        A later registration for the same extension replaces the earlier one.
        """
        for declared in extractor.extensions:
            ext = self._normalize(declared)
            self._extractors[ext] = extractor
            logger.debug("Registered %s extractor for %s", extractor.name, ext)

    def get_extractor(self, extension: str) -> BaseExtractor:
        """Look up an extractor by file extension (e.g. ``'.pdf'``).

        Args:
            extension: Extension with or without the leading dot, any case.

        Raises:
            ExtractorNotFoundError: If no extractor handles the extension.
        """
        ext = self._normalize(extension)
        try:
            return self._extractors[ext]
        except KeyError:
            known = ", ".join(sorted(self._extractors))
            # The KeyError adds nothing for the caller, hence ``from None``.
            raise ExtractorNotFoundError(
                f"No extractor registered for {ext!r}. "
                f"Supported extensions: {known}",
                context={"extension": ext},
            ) from None

    def get_extractor_for_file(self, path: Path) -> BaseExtractor:
        """Look up an extractor for a file based on its suffix.

        Raises:
            ExtractorNotFoundError: If the suffix (possibly empty) is unknown.
        """
        return self.get_extractor(path.suffix)

    def list_extractors(self) -> List[BaseExtractor]:
        """Return a de-duplicated list of registered extractors.

        An extractor registered for several extensions appears once, at the
        position of its first extension in registration order.
        """
        seen_ids = set()
        unique = []
        for extractor in self._extractors.values():
            # Identity, not equality: extractors need not be hashable/comparable.
            if id(extractor) not in seen_ids:
                seen_ids.add(id(extractor))
                unique.append(extractor)
        return unique


# Module-level singleton
registry = ExtractorRegistry()

View File

@@ -32,6 +32,9 @@ capabilities = [
development = [
"kaizen-agentic @ file:./capabilities/kaizen-agentic"
]
proxy-pdf = ["pymupdf4llm>=0.0.10"]
proxy-html = ["markdownify>=0.13.1"]
proxy = ["pymupdf4llm>=0.0.10", "markdownify>=0.13.1"]
[project.scripts]
markitect = "markitect.cli:main"