feat(proxy): add proxy file system for non-markdown source conversion

Introduces a new `markitect/proxy/` module with pluggable extractors that
convert non-markdown sources (PDF, HTML) into tracked markdown proxy files.
Proxy files preserve origin metadata (path, checksum, timestamp) so they can
be kept in sync when the original changes.

CLI commands: `proxy create`, `proxy update`, `proxy status`, `proxy extractors`.

Built-in extractors: PDF (pymupdf4llm), HTML (markdownify), Markdown (built-in).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 19:06:09 +01:00
parent 69aea1ada7
commit ac334c679d
13 changed files with 781 additions and 0 deletions
--- a/markitect/proxy/__init__.py
+++ b/markitect/proxy/__init__.py
@@ -0,0 +1,39 @@
+"""
+markitect.proxy — Proxy file system for wrapping non-markdown sources.
+
+Creates markdown proxy files that track their origin (source path,
+checksum, timestamp) so they can be kept up-to-date when the original
+changes.
+
+Quick start::
+
+    from pathlib import Path
+    from markitect.proxy import ProxyGenerator, registry
+    # Ensure built-in extractors are registered
+    import markitect.proxy.extractors  # noqa: F401
+
+    gen = ProxyGenerator(registry)
+    gen.create(Path("report.pdf"), Path("./output/"))
+"""
+
+from markitect.proxy.models import ProxyMetadata, ExtractionResult
+from markitect.proxy.exceptions import (
+    ProxyError,
+    ExtractorNotFoundError,
+    DependencyMissingError,
+)
+from markitect.proxy.registry import ExtractorRegistry, registry
+from markitect.proxy.generator import ProxyGenerator
+from markitect.proxy.extractors.base import BaseExtractor
+
+__all__ = [
+    "ProxyMetadata",
+    "ExtractionResult",
+    "ProxyError",
+    "ExtractorNotFoundError",
+    "DependencyMissingError",
+    "ExtractorRegistry",
+    "registry",
+    "ProxyGenerator",
+    "BaseExtractor",
+]