Introduces a new `markitect/proxy/` module with pluggable extractors that convert non-markdown sources (PDF, HTML) into tracked markdown proxy files. Proxy files preserve origin metadata (path, checksum, timestamp) so they can be kept in sync when the original changes. CLI commands: `proxy create`, `proxy update`, `proxy status`, `proxy extractors`. Built-in extractors: PDF (pymupdf4llm), HTML (markdownify), Markdown (built-in). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
40 lines
1.1 KiB
Python
40 lines
1.1 KiB
Python
"""
|
|
markitect.proxy — Proxy file system for wrapping non-markdown sources.
|
|
|
|
Creates markdown proxy files that track their origin (source path,
|
|
checksum, timestamp) so they can be kept up-to-date when the original
|
|
changes.
|
|
|
|
Quick start::
|
|
|
|
from markitect.proxy import ProxyGenerator, registry
|
|
|
|
# Ensure built-in extractors are registered
|
|
import markitect.proxy.extractors # noqa: F401
|
|
|
|
gen = ProxyGenerator(registry)
|
|
gen.create(Path("report.pdf"), Path("./output/"))
|
|
"""
|
|
|
|
# Re-export the package's public names from their submodules so callers can
# write ``from markitect.proxy import ...``. The statement order is kept
# exactly as it was, in case the submodules depend on import order.
from markitect.proxy.models import ProxyMetadata, ExtractionResult
from markitect.proxy.exceptions import (
    ProxyError,
    ExtractorNotFoundError,
    DependencyMissingError,
)
from markitect.proxy.registry import ExtractorRegistry, registry
from markitect.proxy.generator import ProxyGenerator
from markitect.proxy.extractors.base import BaseExtractor

# Names exported by ``from markitect.proxy import *``. Entries are grouped by
# the submodule each name is imported from; the order is unchanged.
__all__ = [
    # markitect.proxy.models
    "ProxyMetadata",
    "ExtractionResult",
    # markitect.proxy.exceptions
    "ProxyError",
    "ExtractorNotFoundError",
    "DependencyMissingError",
    # markitect.proxy.registry
    "ExtractorRegistry",
    "registry",
    # markitect.proxy.generator
    "ProxyGenerator",
    # markitect.proxy.extractors.base
    "BaseExtractor",
]