feat(proxy): add proxy file system for non-markdown source conversion
Introduces a new `markitect/proxy/` module with pluggable extractors that convert non-markdown sources (PDF, HTML) into tracked markdown proxy files. Proxy files preserve origin metadata (path, checksum, timestamp) so they can be kept in sync when the original changes. CLI commands: `proxy create`, `proxy update`, `proxy status`, `proxy extractors`. Built-in extractors: PDF (pymupdf4llm), HTML (markdownify), Markdown (built-in). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7147,6 +7147,13 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
pass # Helper module not available
|
pass # Helper module not available
|
||||||
|
|
||||||
|
# Register proxy file system commands.  When the proxy module cannot be
# imported the ``proxy`` command group is simply left unregistered.
try:
    from markitect.proxy.cli import proxy_group
except ImportError:
    pass  # Proxy module not available
else:
    cli.add_command(proxy_group)
|
||||||
|
|
||||||
# Make cli function available as main entry point
|
# Make cli function available as main entry point
|
||||||
main = cli
|
main = cli
|
||||||
|
|
||||||
|
|||||||
39
markitect/proxy/__init__.py
Normal file
39
markitect/proxy/__init__.py
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
"""
|
||||||
|
markitect.proxy — Proxy file system for wrapping non-markdown sources.
|
||||||
|
|
||||||
|
Creates markdown proxy files that track their origin (source path,
|
||||||
|
checksum, timestamp) so they can be kept up-to-date when the original
|
||||||
|
changes.
|
||||||
|
|
||||||
|
Quick start::
|
||||||
|
|
||||||
|
from pathlib import Path

from markitect.proxy import ProxyGenerator, registry
|
||||||
|
|
||||||
|
# Ensure built-in extractors are registered
|
||||||
|
import markitect.proxy.extractors # noqa: F401
|
||||||
|
|
||||||
|
gen = ProxyGenerator(registry)
|
||||||
|
gen.create(Path("report.pdf"), Path("./output/"))
|
||||||
|
"""
|
||||||
|
|
||||||
|
from markitect.proxy.models import ProxyMetadata, ExtractionResult
|
||||||
|
from markitect.proxy.exceptions import (
|
||||||
|
ProxyError,
|
||||||
|
ExtractorNotFoundError,
|
||||||
|
DependencyMissingError,
|
||||||
|
)
|
||||||
|
from markitect.proxy.registry import ExtractorRegistry, registry
|
||||||
|
from markitect.proxy.generator import ProxyGenerator
|
||||||
|
from markitect.proxy.extractors.base import BaseExtractor
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"ProxyMetadata",
|
||||||
|
"ExtractionResult",
|
||||||
|
"ProxyError",
|
||||||
|
"ExtractorNotFoundError",
|
||||||
|
"DependencyMissingError",
|
||||||
|
"ExtractorRegistry",
|
||||||
|
"registry",
|
||||||
|
"ProxyGenerator",
|
||||||
|
"BaseExtractor",
|
||||||
|
]
|
||||||
185
markitect/proxy/cli.py
Normal file
185
markitect/proxy/cli.py
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
"""
|
||||||
|
Click CLI commands for the proxy file system.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import click
|
||||||
|
|
||||||
|
from markitect.proxy.exceptions import ProxyError
|
||||||
|
|
||||||
|
|
||||||
|
# Root of the ``proxy`` sub-commands; the sub-commands below attach themselves
# via ``@proxy_group.command``.  The docstring is the help text click shows.
@click.group("proxy")
def proxy_group():
    """Proxy file operations — create, update, and manage markdown proxies."""
|
||||||
|
|
||||||
|
|
||||||
|
@proxy_group.command("create")
@click.argument("source", type=click.Path(exists=True))
@click.option(
    "--output-dir",
    "-o",
    type=click.Path(),
    default=".",
    help="Directory to write the proxy file (default: current dir).",
)
@click.option("--force", "-f", is_flag=True, help="Overwrite existing proxy file.")
def proxy_create(source, output_dir, force):
    """Create a markdown proxy for SOURCE."""
    # Lazy imports so the CLI group registers even if deps are absent
    from markitect.proxy.registry import registry
    import markitect.proxy.extractors  # noqa: F401 — registers built-ins
    from markitect.proxy.generator import ProxyGenerator

    generator = ProxyGenerator(registry)
    try:
        created = generator.create(Path(source), Path(output_dir), force=force)
    except ProxyError as exc:
        # Report proxy failures on stderr and exit with status 1.
        click.echo(f"Error: {exc}", err=True)
        raise SystemExit(1)

    click.echo(f"Created proxy: {created}")
|
||||||
|
|
||||||
|
|
||||||
|
@proxy_group.command("update")
@click.argument("target", type=click.Path(exists=True))
@click.option("--dry-run", is_flag=True, help="Show what would change without writing.")
def proxy_update(target, dry_run):
    """Re-extract a single proxy file or all proxies in a directory."""
    from markitect.proxy.registry import registry
    import markitect.proxy.extractors  # noqa: F401
    from markitect.proxy.generator import ProxyGenerator

    generator = ProxyGenerator(registry)
    resolved = Path(target).resolve()

    try:
        # Work out which proxy files to process, then handle them in one loop.
        if resolved.is_file():
            pending = [resolved]
        elif resolved.is_dir():
            pending = _find_proxy_files(generator, resolved)
            if not pending:
                click.echo("No proxy files found.")
                return
        else:
            click.echo(f"Error: {target} is not a file or directory.", err=True)
            raise SystemExit(1)

        for proxy_file in pending:
            _update_one(generator, proxy_file, dry_run)
    except ProxyError as exc:
        click.echo(f"Error: {exc}", err=True)
        raise SystemExit(1)
|
||||||
|
|
||||||
|
|
||||||
|
@proxy_group.command("status")
@click.argument("directory", type=click.Path(exists=True), default=".")
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["table", "json"]),
    default="table",
    help="Output format.",
)
def proxy_status(directory, output_format):
    """Show freshness of all proxy files in DIRECTORY (default: current dir)."""
    from markitect.proxy.registry import registry
    import markitect.proxy.extractors  # noqa: F401
    from markitect.proxy.generator import ProxyGenerator

    generator = ProxyGenerator(registry)
    try:
        statuses = generator.bulk_status(Path(directory))
    except ProxyError as exc:
        click.echo(f"Error: {exc}", err=True)
        raise SystemExit(1)

    # Three mutually exclusive outcomes: nothing found, JSON dump, or table.
    if not statuses:
        click.echo("No proxy files found.")
    elif output_format == "json":
        click.echo(json.dumps(statuses, indent=2))
    else:
        _print_status_table(statuses)
|
||||||
|
|
||||||
|
|
||||||
|
@proxy_group.command("extractors")
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["table", "json"]),
    default="table",
    help="Output format.",
)
def proxy_extractors(output_format):
    """List registered extractors and their dependency status."""
    from markitect.proxy.registry import registry
    import markitect.proxy.extractors  # noqa: F401

    available = registry.list_extractors()

    if output_format != "json":
        _print_extractor_table(available)
        return

    # One JSON object per extractor, keys in a fixed order.
    payload = []
    for extractor in available:
        payload.append(
            {
                "name": extractor.name,
                "version": extractor.version,
                "extensions": list(extractor.extensions),
                "installed": extractor.check_dependencies(),
                "hint": extractor.dependency_hint(),
            }
        )
    click.echo(json.dumps(payload, indent=2))
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# helpers
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _update_one(gen, proxy_path, dry_run):
    """Report on, and unless *dry_run* is set refresh, one proxy file.

    Prints a single line for the proxy: ``current``, ``source missing``,
    ``stale (would update)`` for a dry run, or the outcome of ``gen.update``.
    """
    name = proxy_path.name
    state = gen.status(proxy_path)["status"]

    # Nothing to do when the proxy is fresh or its source has disappeared.
    if state == "current":
        click.echo(f" {name}: current")
        return
    if state == "missing-source":
        click.echo(f" {name}: source missing")
        return

    if dry_run:
        click.echo(f" {name}: stale (would update)")
        return

    outcome = "updated" if gen.update(proxy_path) else "current"
    click.echo(f" {name}: {outcome}")
|
||||||
|
|
||||||
|
|
||||||
|
def _find_proxy_files(gen, directory):
|
||||||
|
"""Return proxy file paths within *directory*."""
|
||||||
|
results = []
|
||||||
|
for path in sorted(directory.rglob("*.md")):
|
||||||
|
meta, _ = gen._try_read_proxy(path)
|
||||||
|
if meta is not None and meta.get("proxy") is True:
|
||||||
|
results.append(path)
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def _print_status_table(results):
    """Print *results* (status dicts) as a fixed-width text table.

    Each row shows the proxy file name (not its full path), the recorded
    source path, the status and the extractor name.
    """
    header = f"{'Proxy':<40} {'Source':<30} {'Status':<15} {'Extractor':<10}"
    click.echo(header)
    click.echo("-" * 95)

    for row in results:
        cells = (
            f"{Path(row['proxy']).name:<40}",
            f"{row['source']:<30}",
            f"{row['status']:<15}",
            f"{row['extractor']:<10}",
        )
        click.echo(" ".join(cells))
|
||||||
|
|
||||||
|
|
||||||
|
def _print_extractor_table(extractors):
    """Print name, version, extensions and install state of *extractors*.

    The last column reads ``installed`` when the extractor's
    ``check_dependencies()`` returns true and ``missing`` otherwise.
    """
    click.echo(f"{'Name':<15} {'Version':<10} {'Extensions':<25} {'Status':<10}")
    click.echo("-" * 60)

    for extractor in extractors:
        suffixes = ", ".join(extractor.extensions)
        if extractor.check_dependencies():
            state = "installed"
        else:
            state = "missing"
        click.echo(
            f"{extractor.name:<15} {extractor.version:<10} {suffixes:<25} {state:<10}"
        )
|
||||||
40
markitect/proxy/exceptions.py
Normal file
40
markitect/proxy/exceptions.py
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
"""
|
||||||
|
Proxy-specific exceptions.
|
||||||
|
|
||||||
|
Extends the MarkitectError hierarchy for proxy file operations.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Optional, Dict, Any
|
||||||
|
|
||||||
|
from markitect.exceptions import MarkitectError
|
||||||
|
|
||||||
|
|
||||||
|
class ProxyError(MarkitectError):
    """Root of the proxy exception hierarchy.

    Catching this type covers every error defined in this module.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class ExtractorNotFoundError(ProxyError):
    """Raised when no extractor is registered for a file extension."""
|
||||||
|
|
||||||
|
|
||||||
|
class DependencyMissingError(ProxyError):
    """Raised when an extractor's optional dependency is not installed.

    Attributes:
        package: Name of the Python package that is missing.
        install_hint: Suggested command for installing it.
    """

    def __init__(
        self,
        message: str,
        package: str = "",
        install_hint: str = "",
        cause: Optional[Exception] = None,
        context: Optional[Dict[str, Any]] = None,
    ):
        # ``cause`` and ``context`` are handed to the base class unchanged;
        # only the dependency-specific details are stored on this instance.
        super().__init__(message, cause=cause, context=context)
        self.install_hint = install_hint
        self.package = package
|
||||||
14
markitect/proxy/extractors/__init__.py
Normal file
14
markitect/proxy/extractors/__init__.py
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
"""
|
||||||
|
Built-in extractor registration.
|
||||||
|
|
||||||
|
Importing this module registers all built-in extractors with the global registry.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from markitect.proxy.registry import registry
|
||||||
|
from markitect.proxy.extractors.pdf import PdfExtractor
|
||||||
|
from markitect.proxy.extractors.html import HtmlExtractor
|
||||||
|
from markitect.proxy.extractors.markdown import MarkdownNormalizer
|
||||||
|
|
||||||
|
# Instantiate and register each built-in extractor, in this order.
for _extractor_cls in (PdfExtractor, HtmlExtractor, MarkdownNormalizer):
    registry.register(_extractor_cls())
del _extractor_cls  # keep the loop variable out of the module namespace
|
||||||
43
markitect/proxy/extractors/base.py
Normal file
43
markitect/proxy/extractors/base.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
"""
|
||||||
|
Abstract base class for proxy file extractors.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from markitect.proxy.models import ExtractionResult
|
||||||
|
|
||||||
|
|
||||||
|
class BaseExtractor(ABC):
    """Interface every proxy extractor implements.

    Subclasses set the three class attributes and provide ``extract`` and
    ``check_dependencies``; ``dependency_hint`` is optional.
    """

    # Short identifier of the extractor (e.g. "pdf").
    name: str = ""
    # Version string of the extractor implementation.
    version: str = "1.0"
    # File suffixes handled by this extractor, e.g. (".html", ".htm").
    extensions: tuple = ()

    @abstractmethod
    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the file at *source_path* to markdown.

        Args:
            source_path: Location of the file to convert.

        Returns:
            An ExtractionResult carrying the markdown text.
        """

    @abstractmethod
    def check_dependencies(self) -> bool:
        """Report whether everything this extractor needs is available.

        Returns:
            True when all dependencies are installed, otherwise False.
        """

    def dependency_hint(self) -> str:
        """Return install instructions for missing dependencies.

        The base implementation returns an empty string; subclasses with
        optional dependencies override it with something like
        ``pip install markitect[proxy-pdf]``.
        """
        return ""
|
||||||
45
markitect/proxy/extractors/html.py
Normal file
45
markitect/proxy/extractors/html.py
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
"""
|
||||||
|
HTML extractor using markdownify.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from markitect.proxy.extractors.base import BaseExtractor
|
||||||
|
from markitect.proxy.models import ExtractionResult
|
||||||
|
from markitect.proxy.exceptions import DependencyMissingError
|
||||||
|
|
||||||
|
|
||||||
|
class HtmlExtractor(BaseExtractor):
    """Extractor that turns ``.html`` / ``.htm`` files into Markdown.

    The conversion is delegated to the optional ``markdownify`` package.
    """

    name = "html"
    version = "1.0"
    extensions = (".html", ".htm")

    def check_dependencies(self) -> bool:
        """Return True when ``markdownify`` can be imported."""
        try:
            import markdownify  # noqa: F401
        except ImportError:
            return False
        return True

    def dependency_hint(self) -> str:
        """Return the install command for the HTML extractor's dependency."""
        return 'pip install "markitect[proxy-html]" (or: pip install markdownify)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Read *source_path* as UTF-8 HTML and convert it to Markdown.

        Raises:
            DependencyMissingError: If ``markdownify`` is not installed.
        """
        if not self.check_dependencies():
            raise DependencyMissingError(
                "markdownify is required to extract HTML files.",
                package="markdownify",
                install_hint=self.dependency_hint(),
            )

        # Imported only after the availability check above has passed.
        import markdownify

        markup = source_path.read_text(encoding="utf-8")
        converted = markdownify.markdownify(markup, heading_style="ATX")
        return ExtractionResult(
            content=converted,
            extractor=self.name,
            extractor_version=self.version,
        )
|
||||||
29
markitect/proxy/extractors/markdown.py
Normal file
29
markitect/proxy/extractors/markdown.py
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
"""
|
||||||
|
Markdown normalizer — passes through Markdown with minimal transformation.
|
||||||
|
|
||||||
|
No external dependencies required.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from markitect.proxy.extractors.base import BaseExtractor
|
||||||
|
from markitect.proxy.models import ExtractionResult
|
||||||
|
|
||||||
|
|
||||||
|
class MarkdownNormalizer(BaseExtractor):
    """Pass-through extractor for files that are already Markdown.

    Needs no optional packages; the file text is returned unchanged.
    """

    name = "markdown"
    version = "1.0"
    extensions = (".md", ".markdown", ".mdown")

    def check_dependencies(self) -> bool:
        """Always True: this extractor has no external dependencies."""
        return True

    def extract(self, source_path: Path) -> ExtractionResult:
        """Return the UTF-8 text of *source_path* as the extraction result."""
        return ExtractionResult(
            content=source_path.read_text(encoding="utf-8"),
            extractor=self.name,
            extractor_version=self.version,
        )
|
||||||
44
markitect/proxy/extractors/pdf.py
Normal file
44
markitect/proxy/extractors/pdf.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
"""
|
||||||
|
PDF extractor using pymupdf4llm.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from markitect.proxy.extractors.base import BaseExtractor
|
||||||
|
from markitect.proxy.models import ExtractionResult
|
||||||
|
from markitect.proxy.exceptions import DependencyMissingError
|
||||||
|
|
||||||
|
|
||||||
|
class PdfExtractor(BaseExtractor):
    """Extractor that turns ``.pdf`` files into Markdown.

    The conversion is delegated to the optional ``pymupdf4llm`` package.
    """

    name = "pdf"
    version = "1.0"
    extensions = (".pdf",)

    def check_dependencies(self) -> bool:
        """Return True when ``pymupdf4llm`` can be imported."""
        try:
            import pymupdf4llm  # noqa: F401
        except ImportError:
            return False
        return True

    def dependency_hint(self) -> str:
        """Return the install command for the PDF extractor's dependency."""
        return 'pip install "markitect[proxy-pdf]" (or: pip install pymupdf4llm)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the PDF at *source_path* to Markdown.

        Raises:
            DependencyMissingError: If ``pymupdf4llm`` is not installed.
        """
        if not self.check_dependencies():
            raise DependencyMissingError(
                "pymupdf4llm is required to extract PDF files.",
                package="pymupdf4llm",
                install_hint=self.dependency_hint(),
            )

        # Imported only after the availability check above has passed.
        import pymupdf4llm

        # The library is given the path as a plain string.
        converted = pymupdf4llm.to_markdown(str(source_path))
        return ExtractionResult(
            content=converted,
            extractor=self.name,
            extractor_version=self.version,
        )
|
||||||
241
markitect/proxy/generator.py
Normal file
241
markitect/proxy/generator.py
Normal file
@@ -0,0 +1,241 @@
|
|||||||
|
"""
|
||||||
|
ProxyGenerator — create, update, and check status of proxy files.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from markitect.assets.utils import ContentHasher
|
||||||
|
from markitect.frontmatter import FrontMatterParser
|
||||||
|
from markitect.proxy.exceptions import ProxyError
|
||||||
|
from markitect.proxy.models import ProxyMetadata
|
||||||
|
from markitect.proxy.registry import ExtractorRegistry
|
||||||
|
|
||||||
|
logger = logging.getLogger("markitect.proxy.generator")
|
||||||
|
|
||||||
|
_frontmatter_parser = FrontMatterParser()
|
||||||
|
|
||||||
|
|
||||||
|
class ProxyGenerator:
    """Creates and manages markdown proxy files.

    A proxy file is a markdown document whose YAML frontmatter records where
    its body came from: the source path, a ``sha256:``-prefixed checksum, the
    source size, the generation time and the extractor name/version.  The
    checksum is what ``update`` and ``status`` compare against to decide
    whether a proxy is still current.
    """

    def __init__(self, registry: ExtractorRegistry):
        """Store the *registry* used to look up extractors by file suffix."""
        self.registry = registry

    # ------------------------------------------------------------------
    # create
    # ------------------------------------------------------------------

    def create(self, source: Path, output_dir: Path, force: bool = False) -> Path:
        """Create a proxy markdown file for *source*.

        The proxy is written to ``<output_dir>/<source file name>.md``;
        *output_dir* is created if it does not exist yet.

        Args:
            source: Path to the original file (e.g. ``report.pdf``).
            output_dir: Directory where the proxy file will be written.
            force: If True, overwrite an existing proxy file.

        Returns:
            Path to the created proxy file.

        Raises:
            ProxyError: If the source is not a file, the proxy already exists
                and *force* is false, no extractor is registered for the
                source, or the extractor's dependency is missing.
        """
        source = source.resolve()
        if not source.is_file():
            raise ProxyError(
                f"Source file does not exist: {source}",
                context={"source": str(source)},
            )

        output_dir = output_dir.resolve()
        output_dir.mkdir(parents=True, exist_ok=True)

        proxy_path = output_dir / f"{source.name}.md"
        if proxy_path.exists() and not force:
            raise ProxyError(
                f"Proxy file already exists: {proxy_path} (use --force to overwrite)",
                context={"proxy": str(proxy_path)},
            )

        result = self._extract(source)
        checksum = ContentHasher.hash_file(source)

        # Relative path from proxy location to source
        try:
            rel_source = os.path.relpath(source, output_dir)
        except ValueError:
            # On Windows no relative path exists between different drives.
            # Store the absolute path instead; _resolve_source handles it
            # because joining onto an absolute path yields that path.
            rel_source = str(source)

        meta = ProxyMetadata(
            source_path=rel_source,
            source_checksum=f"sha256:{checksum}",
            source_size=source.stat().st_size,
            generated_at=datetime.now(timezone.utc).isoformat(),
            extractor=result.extractor,
            extractor_version=result.extractor_version,
        )

        self._write_proxy(proxy_path, meta, source.name, result.content)
        logger.info("Created proxy %s -> %s", proxy_path.name, rel_source)
        return proxy_path

    # ------------------------------------------------------------------
    # update
    # ------------------------------------------------------------------

    def update(self, proxy_path: Path) -> bool:
        """Re-extract the proxy file if its source has changed.

        Args:
            proxy_path: Path to an existing proxy file.

        Returns:
            True if the proxy was updated, False if already current.

        Raises:
            ProxyError: If *proxy_path* is not a proxy file, its source is
                missing (including a proxy with no ``source_path`` entry),
                or extraction cannot be performed.
        """
        proxy_path = proxy_path.resolve()
        meta, _ = self._read_proxy(proxy_path)

        source = self._resolve_source(proxy_path, meta)
        if not source.is_file():
            # ``.get`` rather than indexing: a proxy without ``source_path``
            # must surface as ProxyError, not KeyError.
            raise ProxyError(
                f"Source file missing: {source}",
                context={"source_path": meta.get("source_path", "")},
            )

        current_checksum = f"sha256:{ContentHasher.hash_file(source)}"
        if current_checksum == meta.get("source_checksum"):
            return False

        result = self._extract(source)

        # The recorded source path is carried over unchanged.
        new_meta = ProxyMetadata(
            source_path=meta["source_path"],
            source_checksum=current_checksum,
            source_size=source.stat().st_size,
            generated_at=datetime.now(timezone.utc).isoformat(),
            extractor=result.extractor,
            extractor_version=result.extractor_version,
        )

        self._write_proxy(proxy_path, new_meta, source.name, result.content)
        logger.info("Updated proxy %s", proxy_path.name)
        return True

    # ------------------------------------------------------------------
    # status
    # ------------------------------------------------------------------

    def status(self, proxy_path: Path) -> Dict:
        """Check a single proxy file's freshness.

        Returns a dict with keys: proxy, source, status, extractor.
        Status is one of: ``current``, ``stale``, ``missing-source``.

        Raises:
            ProxyError: If *proxy_path* is not a proxy file.
        """
        proxy_path = proxy_path.resolve()
        meta, _ = self._read_proxy(proxy_path)

        source = self._resolve_source(proxy_path, meta)
        if not source.is_file():
            state = "missing-source"
        else:
            current_checksum = f"sha256:{ContentHasher.hash_file(source)}"
            is_current = current_checksum == meta.get("source_checksum")
            state = "current" if is_current else "stale"

        return {
            "proxy": str(proxy_path),
            "source": meta.get("source_path", ""),
            "status": state,
            "extractor": meta.get("extractor", ""),
        }

    def bulk_status(self, directory: Path) -> List[Dict]:
        """Scan *directory* for proxy files and return their statuses.

        Every ``*.md`` file below *directory* is inspected (recursively, in
        sorted order); files whose frontmatter has ``proxy: true`` are
        reported via ``status``.  Unreadable files are skipped.
        """
        directory = directory.resolve()
        results = []
        for path in sorted(directory.rglob("*.md")):
            meta, _ = self._try_read_proxy(path)
            if meta is not None and meta.get("proxy") is True:
                results.append(self.status(path))
        return results

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------

    def _extract(self, source: Path):
        """Look up the extractor for *source*, check its deps, and run it."""
        extractor = self.registry.get_extractor_for_file(source)
        if not extractor.check_dependencies():
            raise ProxyError(
                f"Missing dependency for {extractor.name} extractor. "
                f"{extractor.dependency_hint()}",
            )
        return extractor.extract(source)

    @staticmethod
    def _write_proxy(
        proxy_path: Path,
        meta: ProxyMetadata,
        source_name: str,
        content: str,
    ) -> None:
        """Write the proxy file: YAML frontmatter, a heading, then *content*."""
        # Key order is kept (sort_keys=False) so "proxy: true" comes first.
        fm = {
            "proxy": True,
            "source_path": meta.source_path,
            "source_checksum": meta.source_checksum,
            "source_size": meta.source_size,
            "generated_at": meta.generated_at,
            "extractor": meta.extractor,
            "extractor_version": meta.extractor_version,
        }
        fm_text = yaml.dump(fm, default_flow_style=False, sort_keys=False)
        body = (
            f"---\n{fm_text}---\n\n"
            f"# {source_name}\n\n"
            f"*Proxy generated from `{meta.source_path}`*\n\n"
            f"{content}"
        )
        proxy_path.write_text(body, encoding="utf-8")

    @staticmethod
    def _read_proxy(proxy_path: Path):
        """Read and parse an existing proxy file.

        Returns:
            Tuple of (frontmatter dict, body str).

        Raises:
            ProxyError: If the file is not UTF-8 text or its frontmatter does
                not mark it as a proxy.
        """
        try:
            raw = proxy_path.read_text(encoding="utf-8")
        except UnicodeDecodeError as exc:
            # Typically a binary file (e.g. the source itself) passed where a
            # proxy was expected; report it the way callers already handle.
            raise ProxyError(
                f"Not a proxy file (not valid UTF-8 text): {proxy_path}",
                cause=exc,
                context={"path": str(proxy_path)},
            ) from exc

        meta, body = _frontmatter_parser.parse(raw)
        # ``not meta`` also covers a parser result of None / empty mapping.
        if not meta or not meta.get("proxy"):
            raise ProxyError(
                f"Not a proxy file (missing 'proxy: true' in frontmatter): {proxy_path}",
                context={"path": str(proxy_path)},
            )
        return meta, body

    @staticmethod
    def _try_read_proxy(path: Path):
        """Attempt to read a proxy file, returning (None, None) on failure."""
        try:
            raw = path.read_text(encoding="utf-8")
            meta, body = _frontmatter_parser.parse(raw)
            return meta, body
        except Exception:
            # Deliberately best-effort: directory scans must not abort on a
            # single unreadable or malformed markdown file.
            return None, None

    @staticmethod
    def _resolve_source(proxy_path: Path, meta: Dict) -> Path:
        """Resolve the source path relative to the proxy file's directory.

        A missing or null ``source_path`` resolves to the proxy's directory,
        which callers then report as a missing source.
        """
        source_rel = meta.get("source_path") or ""
        return (proxy_path.parent / source_rel).resolve()
|
||||||
26
markitect/proxy/models.py
Normal file
26
markitect/proxy/models.py
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
"""
|
||||||
|
Data models for the proxy file system.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ProxyMetadata:
    """Origin record kept in the YAML frontmatter of a proxy file.

    Attributes:
        source_path: Path of the original file as recorded in the proxy.
        source_checksum: Checksum of the source, formatted ``sha256:<hex>``.
        source_size: Size of the source file.
        generated_at: Generation timestamp in ISO 8601 form.
        extractor: Name of the extractor that produced the content.
        extractor_version: Version of that extractor.
    """

    source_path: str
    source_checksum: str
    source_size: int
    generated_at: str
    extractor: str
    extractor_version: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ExtractionResult:
    """What an extractor hands back after processing a source file.

    Attributes:
        content: The extracted markdown text.
        extractor: Name of the extractor that produced it.
        extractor_version: Version of that extractor.
    """

    content: str
    extractor: str
    extractor_version: str
|
||||||
65
markitect/proxy/registry.py
Normal file
65
markitect/proxy/registry.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
"""
|
||||||
|
Extractor registry — register and look up extractors by file extension.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Dict, List, TYPE_CHECKING
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from markitect.proxy.exceptions import ExtractorNotFoundError
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from markitect.proxy.extractors.base import BaseExtractor
|
||||||
|
|
||||||
|
logger = logging.getLogger("markitect.proxy.registry")
|
||||||
|
|
||||||
|
|
||||||
|
class ExtractorRegistry:
    """Maps file extensions to their corresponding extractors.

    Extensions are stored lower-cased and with a leading dot, so ``"PDF"``,
    ``"pdf"`` and ``".pdf"`` all refer to the same entry.
    """

    def __init__(self):
        # Normalised extension (e.g. ".pdf") -> extractor instance.
        self._extractors: Dict[str, BaseExtractor] = {}

    @staticmethod
    def _normalize(extension: str) -> str:
        """Return *extension* lower-cased and with a leading dot.

        The empty string (a file without a suffix) is returned unchanged.
        """
        ext = extension.lower()
        if ext and not ext.startswith("."):
            # Without this, an entry such as "pdf" could never match
            # ``Path.suffix``, which always includes the dot.
            ext = f".{ext}"
        return ext

    def register(self, extractor: BaseExtractor) -> None:
        """Register an extractor for all of its declared extensions.

        A later registration for the same extension replaces the earlier one.
        """
        for ext in extractor.extensions:
            key = self._normalize(ext)
            self._extractors[key] = extractor
            logger.debug("Registered %s extractor for %s", extractor.name, key)

    def get_extractor(self, extension: str) -> BaseExtractor:
        """Look up an extractor by file extension (e.g. ``'.pdf'``).

        The lookup is case-insensitive and the leading dot is optional.

        Raises:
            ExtractorNotFoundError: If no extractor handles the extension.
        """
        ext_lower = self._normalize(extension)
        try:
            return self._extractors[ext_lower]
        except KeyError:
            known = ", ".join(sorted(self._extractors))
            # ``from None``: the KeyError is an implementation detail.
            raise ExtractorNotFoundError(
                f"No extractor registered for {ext_lower!r}. "
                f"Supported extensions: {known}",
                context={"extension": ext_lower},
            ) from None

    def get_extractor_for_file(self, path: Path) -> BaseExtractor:
        """Look up an extractor for a file based on its suffix."""
        return self.get_extractor(path.suffix)

    def list_extractors(self) -> List[BaseExtractor]:
        """Return a de-duplicated list of registered extractors.

        An extractor registered for several extensions appears once, in the
        position of its first registered extension.
        """
        # Keyed by identity so one instance is listed once; dicts keep the
        # position of the first insertion.
        unique = {id(extractor): extractor for extractor in self._extractors.values()}
        return list(unique.values())


# Module-level singleton
registry = ExtractorRegistry()
|
||||||
@@ -32,6 +32,9 @@ capabilities = [
|
|||||||
development = [
|
development = [
|
||||||
"kaizen-agentic @ file:./capabilities/kaizen-agentic"
|
"kaizen-agentic @ file:./capabilities/kaizen-agentic"
|
||||||
]
|
]
|
||||||
|
proxy-pdf = ["pymupdf4llm>=0.0.10"]
|
||||||
|
proxy-html = ["markdownify>=0.13.1"]
|
||||||
|
proxy = ["pymupdf4llm>=0.0.10", "markdownify>=0.13.1"]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
markitect = "markitect.cli:main"
|
markitect = "markitect.cli:main"
|
||||||
|
|||||||
Reference in New Issue
Block a user