feat(proxy): add proxy file system for non-markdown source conversion
Introduces a new `markitect/proxy/` module with pluggable extractors that convert non-markdown sources (PDF, HTML) into tracked markdown proxy files. Proxy files preserve origin metadata (path, checksum, timestamp) so they can be kept in sync when the original changes. CLI commands: `proxy create`, `proxy update`, `proxy status`, `proxy extractors`. Built-in extractors: PDF (pymupdf4llm), HTML (markdownify), Markdown (built-in). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7147,6 +7147,13 @@ try:
|
||||
except ImportError:
|
||||
pass # Helper module not available
|
||||
|
||||
# Register proxy file system commands.
# The import is guarded so that the main CLI keeps loading when the
# `markitect.proxy` package cannot be imported; in that case the `proxy`
# command group is simply not attached.
try:
    from markitect.proxy.cli import proxy_group
    cli.add_command(proxy_group)
except ImportError:
    pass  # Proxy module not available

# Make cli function available as main entry point
# (the `markitect` console script points at `markitect.cli:main`).
main = cli
|
||||
|
||||
|
||||
39
markitect/proxy/__init__.py
Normal file
39
markitect/proxy/__init__.py
Normal file
@@ -0,0 +1,39 @@
|
||||
"""
|
||||
markitect.proxy — Proxy file system for wrapping non-markdown sources.
|
||||
|
||||
Creates markdown proxy files that track their origin (source path,
|
||||
checksum, timestamp) so they can be kept up-to-date when the original
|
||||
changes.
|
||||
|
||||
Quick start::
|
||||
|
||||
from pathlib import Path
from markitect.proxy import ProxyGenerator, registry
|
||||
|
||||
# Ensure built-in extractors are registered
|
||||
import markitect.proxy.extractors # noqa: F401
|
||||
|
||||
gen = ProxyGenerator(registry)
|
||||
gen.create(Path("report.pdf"), Path("./output/"))
|
||||
"""
|
||||
|
||||
from markitect.proxy.models import ProxyMetadata, ExtractionResult
|
||||
from markitect.proxy.exceptions import (
|
||||
ProxyError,
|
||||
ExtractorNotFoundError,
|
||||
DependencyMissingError,
|
||||
)
|
||||
from markitect.proxy.registry import ExtractorRegistry, registry
|
||||
from markitect.proxy.generator import ProxyGenerator
|
||||
from markitect.proxy.extractors.base import BaseExtractor
|
||||
|
||||
__all__ = [
|
||||
"ProxyMetadata",
|
||||
"ExtractionResult",
|
||||
"ProxyError",
|
||||
"ExtractorNotFoundError",
|
||||
"DependencyMissingError",
|
||||
"ExtractorRegistry",
|
||||
"registry",
|
||||
"ProxyGenerator",
|
||||
"BaseExtractor",
|
||||
]
|
||||
185
markitect/proxy/cli.py
Normal file
185
markitect/proxy/cli.py
Normal file
@@ -0,0 +1,185 @@
|
||||
"""
|
||||
Click CLI commands for the proxy file system.
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
|
||||
from markitect.proxy.exceptions import ProxyError
|
||||
|
||||
|
||||
# Root of the `proxy` sub-command tree; the individual commands below attach
# themselves through `@proxy_group.command(...)`. The docstring doubles as the
# help text shown by click, so it is user-facing.
@click.group("proxy")
def proxy_group():
    """Proxy file operations — create, update, and manage markdown proxies."""
|
||||
|
||||
|
||||
# SOURCE must be an existing regular file: rejecting directories here gives a
# precise usage error instead of the generator's "does not exist" message.
# The output directory may not exist yet (the generator creates it), but if
# the path exists it must not be a file.
@proxy_group.command("create")
@click.argument("source", type=click.Path(exists=True, dir_okay=False))
@click.option(
    "--output-dir", "-o",
    type=click.Path(file_okay=False),
    default=".",
    help="Directory to write the proxy file (default: current dir).",
)
@click.option("--force", "-f", is_flag=True, help="Overwrite existing proxy file.")
def proxy_create(source, output_dir, force):
    """Create a markdown proxy for SOURCE."""
    # Lazy imports so the CLI group registers even if deps are absent
    from markitect.proxy.registry import registry
    import markitect.proxy.extractors  # noqa: F401 — registers built-ins
    from markitect.proxy.generator import ProxyGenerator

    gen = ProxyGenerator(registry)
    try:
        proxy_path = gen.create(Path(source), Path(output_dir), force=force)
    except ProxyError as exc:
        # Expected, user-facing failures: print the message and exit 1
        # without a traceback, keeping the cause attached for debuggers.
        click.echo(f"Error: {exc}", err=True)
        raise SystemExit(1) from exc

    click.echo(f"Created proxy: {proxy_path}")
|
||||
|
||||
|
||||
# TARGET is either one proxy file or a directory that is searched recursively
# for proxy files.
@proxy_group.command("update")
@click.argument("target", type=click.Path(exists=True))
@click.option("--dry-run", is_flag=True, help="Show what would change without writing.")
def proxy_update(target, dry_run):
    """Re-extract a single proxy file or all proxies in a directory."""
    # Lazy imports so the CLI group registers even if deps are absent
    from markitect.proxy.registry import registry
    import markitect.proxy.extractors  # noqa: F401
    from markitect.proxy.generator import ProxyGenerator

    gen = ProxyGenerator(registry)
    target_path = Path(target).resolve()

    if target_path.is_file():
        try:
            _update_one(gen, target_path, dry_run)
        except ProxyError as exc:
            click.echo(f"Error: {exc}", err=True)
            raise SystemExit(1) from exc
        return

    if not target_path.is_dir():
        # Exists but is neither a regular file nor a directory.
        click.echo(f"Error: {target} is not a file or directory.", err=True)
        raise SystemExit(1)

    proxies = _find_proxy_files(gen, target_path)
    if not proxies:
        click.echo("No proxy files found.")
        return

    # Process every proxy even when some fail, so that one broken file does
    # not prevent the rest of the directory from being updated; the exit
    # status still reports that something went wrong.
    failures = 0
    for proxy in proxies:
        try:
            _update_one(gen, proxy, dry_run)
        except ProxyError as exc:
            failures += 1
            click.echo(f"Error: {proxy.name}: {exc}", err=True)

    if failures:
        raise SystemExit(1)
|
||||
|
||||
|
||||
# DIRECTORY is scanned recursively, so a plain file is rejected up front.
@proxy_group.command("status")
@click.argument(
    "directory",
    type=click.Path(exists=True, file_okay=False),
    default=".",
)
@click.option(
    "--format", "output_format",
    type=click.Choice(["table", "json"]),
    default="table",
    help="Output format.",
)
def proxy_status(directory, output_format):
    """Show freshness of all proxy files in DIRECTORY (default: current dir)."""
    # Lazy imports so the CLI group registers even if deps are absent
    from markitect.proxy.registry import registry
    import markitect.proxy.extractors  # noqa: F401
    from markitect.proxy.generator import ProxyGenerator

    gen = ProxyGenerator(registry)

    try:
        results = gen.bulk_status(Path(directory))
    except ProxyError as exc:
        click.echo(f"Error: {exc}", err=True)
        raise SystemExit(1) from exc

    if output_format == "json":
        # Machine-readable mode must always produce valid JSON, so an empty
        # result is printed as "[]" rather than a human-readable sentence.
        click.echo(json.dumps(results, indent=2))
        return

    if not results:
        click.echo("No proxy files found.")
        return

    _print_status_table(results)
|
||||
|
||||
|
||||
@proxy_group.command("extractors")
@click.option(
    "--format", "output_format",
    type=click.Choice(["table", "json"]),
    default="table",
    help="Output format.",
)
def proxy_extractors(output_format):
    """List registered extractors and their dependency status."""
    # Imported here (not at module level) so the command group can be
    # registered without loading the extractor modules first.
    from markitect.proxy.registry import registry
    import markitect.proxy.extractors  # noqa: F401

    extractors = registry.list_extractors()

    if output_format != "json":
        _print_extractor_table(extractors)
        return

    # JSON mode: one object per extractor, in registry order.
    rows = []
    for extractor in extractors:
        rows.append(
            {
                "name": extractor.name,
                "version": extractor.version,
                "extensions": list(extractor.extensions),
                "installed": extractor.check_dependencies(),
                "hint": extractor.dependency_hint(),
            }
        )
    click.echo(json.dumps(rows, indent=2))
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _update_one(gen, proxy_path, dry_run):
    """Report on one proxy file and refresh it unless *dry_run* is set.

    The proxy's status is queried first; files that are already current or
    whose source is missing are only reported. A stale proxy is re-extracted
    through ``gen.update`` (or merely announced when *dry_run* is true).
    """
    state = gen.status(proxy_path)["status"]

    if state == "current":
        outcome = "current"
    elif state == "missing-source":
        outcome = "source missing"
    elif dry_run:
        outcome = "stale (would update)"
    else:
        # update() returns False when the checksum turned out to match.
        outcome = "updated" if gen.update(proxy_path) else "current"

    click.echo(f" {proxy_path.name}: {outcome}")
|
||||
|
||||
|
||||
def _find_proxy_files(gen, directory):
    """Collect the proxy files below *directory*, sorted by path.

    Every ``*.md`` file found recursively is parsed with the generator's
    best-effort reader; a file counts as a proxy only when its front matter
    has ``proxy`` set to exactly ``True``.
    """

    def _is_proxy(candidate):
        front, _body = gen._try_read_proxy(candidate)
        return front is not None and front.get("proxy") is True

    return [md for md in sorted(directory.rglob("*.md")) if _is_proxy(md)]
|
||||
|
||||
|
||||
def _print_status_table(results):
    """Print one fixed-width row per status dict, preceded by a header.

    Each entry of *results* is expected to carry the keys ``proxy``,
    ``source``, ``status`` and ``extractor``; only the file name of the
    proxy path is shown.
    """
    header = f"{'Proxy':<40} {'Source':<30} {'Status':<15} {'Extractor':<10}"
    click.echo(header)
    click.echo("-" * 95)

    for row in results:
        cells = (
            f"{Path(row['proxy']).name:<40}",
            f"{row['source']:<30}",
            f"{row['status']:<15}",
            f"{row['extractor']:<10}",
        )
        click.echo(" ".join(cells))
|
||||
|
||||
|
||||
def _print_extractor_table(extractors):
    """Print a fixed-width table of extractors and whether their deps import.

    The status column shows ``installed`` when the extractor's
    ``check_dependencies()`` returns a truthy value and ``missing`` otherwise.
    """
    header = f"{'Name':<15} {'Version':<10} {'Extensions':<25} {'Status':<10}"
    click.echo(header)
    click.echo("-" * 60)

    for extractor in extractors:
        joined_exts = ", ".join(extractor.extensions)
        if extractor.check_dependencies():
            availability = "installed"
        else:
            availability = "missing"
        click.echo(
            f"{extractor.name:<15} {extractor.version:<10} "
            f"{joined_exts:<25} {availability:<10}"
        )
|
||||
40
markitect/proxy/exceptions.py
Normal file
40
markitect/proxy/exceptions.py
Normal file
@@ -0,0 +1,40 @@
|
||||
"""
|
||||
Proxy-specific exceptions.
|
||||
|
||||
Extends the MarkitectError hierarchy for proxy file operations.
|
||||
"""
|
||||
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
from markitect.exceptions import MarkitectError
|
||||
|
||||
|
||||
class ProxyError(MarkitectError):
    """Root of the proxy exception hierarchy.

    Catching this type covers every error raised deliberately by the proxy
    package, including the more specific subclasses defined in this module.
    """
|
||||
|
||||
|
||||
class ExtractorNotFoundError(ProxyError):
    """Raised when a file extension has no extractor registered for it."""
|
||||
|
||||
|
||||
class DependencyMissingError(ProxyError):
    """Raised when an extractor needs an optional package that is absent.

    Attributes:
        package: Name of the Python package that could not be imported.
        install_hint: Human-readable suggestion for installing it
            (typically a ``pip install ...`` command).
    """

    def __init__(
        self,
        message: str,
        package: str = "",
        install_hint: str = "",
        cause: Optional[Exception] = None,
        context: Optional[Dict[str, Any]] = None,
    ):
        # The base class handles message, cause and context; this subclass
        # only adds the two dependency-specific attributes.
        super().__init__(message, cause=cause, context=context)
        self.package, self.install_hint = package, install_hint
|
||||
14
markitect/proxy/extractors/__init__.py
Normal file
14
markitect/proxy/extractors/__init__.py
Normal file
@@ -0,0 +1,14 @@
|
||||
"""
|
||||
Built-in extractor registration.
|
||||
|
||||
Importing this module registers all built-in extractors with the global registry.
|
||||
"""
|
||||
|
||||
from markitect.proxy.registry import registry
|
||||
from markitect.proxy.extractors.pdf import PdfExtractor
|
||||
from markitect.proxy.extractors.html import HtmlExtractor
|
||||
from markitect.proxy.extractors.markdown import MarkdownNormalizer
|
||||
|
||||
# Instantiate and register every built-in extractor with the global registry.
# Order matters only if two extractors claim the same extension (the later
# registration wins), so the original PDF, HTML, Markdown order is kept.
for _extractor_cls in (PdfExtractor, HtmlExtractor, MarkdownNormalizer):
    registry.register(_extractor_cls())
del _extractor_cls  # keep the loop variable out of the module namespace
|
||||
43
markitect/proxy/extractors/base.py
Normal file
43
markitect/proxy/extractors/base.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""
|
||||
Abstract base class for proxy file extractors.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
|
||||
from markitect.proxy.models import ExtractionResult
|
||||
|
||||
|
||||
class BaseExtractor(ABC):
    """Contract every proxy extractor has to fulfil.

    Subclasses set the three class attributes below and implement
    :meth:`extract` and :meth:`check_dependencies`; overriding
    :meth:`dependency_hint` is optional.
    """

    # Short identifier of the extractor (e.g. "pdf").
    name: str = ""
    # Version string of the extractor implementation.
    version: str = "1.0"
    # File extensions handled by this extractor, e.g. (".html", ".htm").
    extensions: tuple = ()

    @abstractmethod
    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the file at *source_path* to Markdown.

        Args:
            source_path: Location of the file to convert.

        Returns:
            An ExtractionResult holding the Markdown text.
        """

    @abstractmethod
    def check_dependencies(self) -> bool:
        """Tell whether everything this extractor needs is importable.

        Returns:
            True when all required dependencies are installed, else False.
        """

    def dependency_hint(self) -> str:
        """Describe how to install whatever :meth:`check_dependencies` needs.

        Returns:
            An install instruction such as ``pip install markitect[proxy-pdf]``;
            the default implementation returns an empty string.
        """
        return ""
|
||||
45
markitect/proxy/extractors/html.py
Normal file
45
markitect/proxy/extractors/html.py
Normal file
@@ -0,0 +1,45 @@
|
||||
"""
|
||||
HTML extractor using markdownify.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from markitect.proxy.extractors.base import BaseExtractor
|
||||
from markitect.proxy.models import ExtractionResult
|
||||
from markitect.proxy.exceptions import DependencyMissingError
|
||||
|
||||
|
||||
class HtmlExtractor(BaseExtractor):
    """Converts HTML files to Markdown via the optional ``markdownify`` package."""

    name = "html"
    version = "1.0"
    extensions = (".html", ".htm")

    def check_dependencies(self) -> bool:
        """Return True when ``markdownify`` can be imported."""
        try:
            import markdownify  # noqa: F401
        except ImportError:
            return False
        return True

    def dependency_hint(self) -> str:
        """Return the install instruction for the ``markdownify`` dependency."""
        return 'pip install "markitect[proxy-html]" (or: pip install markdownify)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Read *source_path* as UTF-8 HTML and convert it to Markdown.

        Args:
            source_path: Path to the HTML file.

        Returns:
            ExtractionResult with ATX-style (``#``) headings.

        Raises:
            DependencyMissingError: If ``markdownify`` is not installed.
            ProxyError: If the file is not valid UTF-8.
        """
        # Import directly instead of probing first: one import attempt, and
        # the original ImportError stays attached as the cause.
        try:
            import markdownify
        except ImportError as exc:
            raise DependencyMissingError(
                "markdownify is required to extract HTML files.",
                package="markdownify",
                install_hint=self.dependency_hint(),
                cause=exc,
            ) from exc

        try:
            # "utf-8-sig" decodes plain UTF-8 and additionally drops a leading
            # byte-order mark, which would otherwise end up in the Markdown.
            html_content = source_path.read_text(encoding="utf-8-sig")
        except UnicodeDecodeError as exc:
            # Surface undecodable input as a ProxyError so callers that handle
            # proxy failures (e.g. the CLI) report it cleanly.
            from markitect.proxy.exceptions import ProxyError

            raise ProxyError(
                f"Cannot decode {source_path} as UTF-8: {exc}",
                cause=exc,
                context={"source": str(source_path)},
            ) from exc

        md_text = markdownify.markdownify(html_content, heading_style="ATX")
        return ExtractionResult(
            content=md_text,
            extractor=self.name,
            extractor_version=self.version,
        )
|
||||
29
markitect/proxy/extractors/markdown.py
Normal file
29
markitect/proxy/extractors/markdown.py
Normal file
@@ -0,0 +1,29 @@
|
||||
"""
|
||||
Markdown normalizer — passes through Markdown with minimal transformation.
|
||||
|
||||
No external dependencies required.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from markitect.proxy.extractors.base import BaseExtractor
|
||||
from markitect.proxy.models import ExtractionResult
|
||||
|
||||
|
||||
class MarkdownNormalizer(BaseExtractor):
    """Pass-through extractor for files that are already Markdown.

    Uses only the standard library, so it is always available.
    """

    name = "markdown"
    version = "1.0"
    extensions = (".md", ".markdown", ".mdown")

    def check_dependencies(self) -> bool:
        """Always True: there is nothing optional to install."""
        return True

    def extract(self, source_path: Path) -> ExtractionResult:
        """Return the UTF-8 text of *source_path* unchanged as the result."""
        return ExtractionResult(
            content=source_path.read_text(encoding="utf-8"),
            extractor=self.name,
            extractor_version=self.version,
        )
|
||||
44
markitect/proxy/extractors/pdf.py
Normal file
44
markitect/proxy/extractors/pdf.py
Normal file
@@ -0,0 +1,44 @@
|
||||
"""
|
||||
PDF extractor using pymupdf4llm.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from markitect.proxy.extractors.base import BaseExtractor
|
||||
from markitect.proxy.models import ExtractionResult
|
||||
from markitect.proxy.exceptions import DependencyMissingError
|
||||
|
||||
|
||||
class PdfExtractor(BaseExtractor):
    """PDF-to-Markdown extractor backed by the optional ``pymupdf4llm`` package."""

    name = "pdf"
    version = "1.0"
    extensions = (".pdf",)

    def check_dependencies(self) -> bool:
        """Return True when ``pymupdf4llm`` can be imported."""
        try:
            import pymupdf4llm  # noqa: F401
        except ImportError:
            return False
        return True

    def dependency_hint(self) -> str:
        """Return the install instruction for the ``pymupdf4llm`` dependency."""
        return 'pip install "markitect[proxy-pdf]" (or: pip install pymupdf4llm)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the PDF at *source_path* to Markdown.

        Raises:
            DependencyMissingError: If ``pymupdf4llm`` is not installed.
        """
        if self.check_dependencies():
            import pymupdf4llm

            # The library is handed the path as a plain string.
            markdown = pymupdf4llm.to_markdown(str(source_path))
            return ExtractionResult(
                content=markdown,
                extractor=self.name,
                extractor_version=self.version,
            )

        raise DependencyMissingError(
            "pymupdf4llm is required to extract PDF files.",
            package="pymupdf4llm",
            install_hint=self.dependency_hint(),
        )
|
||||
241
markitect/proxy/generator.py
Normal file
241
markitect/proxy/generator.py
Normal file
@@ -0,0 +1,241 @@
|
||||
"""
|
||||
ProxyGenerator — create, update, and check status of proxy files.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
import yaml
|
||||
|
||||
from markitect.assets.utils import ContentHasher
|
||||
from markitect.frontmatter import FrontMatterParser
|
||||
from markitect.proxy.exceptions import ProxyError
|
||||
from markitect.proxy.models import ProxyMetadata
|
||||
from markitect.proxy.registry import ExtractorRegistry
|
||||
|
||||
logger = logging.getLogger("markitect.proxy.generator")
|
||||
|
||||
_frontmatter_parser = FrontMatterParser()
|
||||
|
||||
|
||||
class ProxyGenerator:
    """Creates and manages markdown proxy files.

    A proxy file is a Markdown document whose YAML front matter records where
    it came from (``source_path``, relative to the proxy's directory), a
    SHA-256 checksum and size of the source, the generation time and the
    extractor used. The checksum is what :meth:`status` and :meth:`update`
    compare against to decide whether the proxy is stale.
    """

    def __init__(self, registry: ExtractorRegistry):
        """Store the extractor *registry* used to look up extractors."""
        self.registry = registry

    # ------------------------------------------------------------------
    # create
    # ------------------------------------------------------------------

    def create(self, source: Path, output_dir: Path, force: bool = False) -> Path:
        """Create a proxy markdown file for *source*.

        Args:
            source: Path to the original file (e.g. ``report.pdf``).
            output_dir: Directory where the proxy file will be written;
                created (with parents) if it does not exist.
            force: If True, overwrite an existing proxy file.

        Returns:
            Path to the created proxy file (``<output_dir>/<source name>.md``).

        Raises:
            ProxyError: If the source is not a file, the proxy already exists
                and *force* is false, or the extractor's dependency is
                missing. Errors raised by the registry lookup or by the
                extractor itself propagate unchanged.
        """
        source = source.resolve()
        if not source.is_file():
            raise ProxyError(
                f"Source file does not exist: {source}",
                context={"source": str(source)},
            )

        output_dir = output_dir.resolve()
        output_dir.mkdir(parents=True, exist_ok=True)

        proxy_path = output_dir / f"{source.name}.md"
        if proxy_path.exists() and not force:
            raise ProxyError(
                f"Proxy file already exists: {proxy_path} (use --force to overwrite)",
                context={"proxy": str(proxy_path)},
            )

        extractor = self.registry.get_extractor_for_file(source)
        self._require_dependencies(extractor)

        # Hash BEFORE extracting (same order as update()). If the source is
        # modified in between, the stored checksum is the older one, so the
        # proxy is later reported stale and re-extracted. Hashing afterwards
        # could mark outdated content as current.
        checksum = ContentHasher.hash_file(source)
        source_size = source.stat().st_size

        result = extractor.extract(source)

        # Relative path from proxy location to source. os.path.relpath raises
        # ValueError on Windows when both are on different drives; fall back
        # to the absolute path, which _resolve_source handles as well.
        try:
            rel_source = os.path.relpath(source, output_dir)
        except ValueError:
            rel_source = str(source)

        meta = ProxyMetadata(
            source_path=rel_source,
            source_checksum=f"sha256:{checksum}",
            source_size=source_size,
            generated_at=datetime.now(timezone.utc).isoformat(),
            extractor=result.extractor,
            extractor_version=result.extractor_version,
        )

        self._write_proxy(proxy_path, meta, source.name, result.content)
        logger.info("Created proxy %s -> %s", proxy_path.name, rel_source)
        return proxy_path

    # ------------------------------------------------------------------
    # update
    # ------------------------------------------------------------------

    def update(self, proxy_path: Path) -> bool:
        """Re-extract the proxy file if its source has changed.

        Args:
            proxy_path: Path to an existing proxy file.

        Returns:
            True if the proxy was updated, False if already current.

        Raises:
            ProxyError: If *proxy_path* is not a proxy file, its source is
                missing, or the extractor's dependency is missing.
        """
        proxy_path = proxy_path.resolve()
        meta, _ = self._read_proxy(proxy_path)

        source = self._resolve_source(proxy_path, meta)
        if not source.is_file():
            # .get(): the key may be absent in a hand-edited proxy, and a
            # KeyError here would mask the real problem.
            raise ProxyError(
                f"Source file missing: {source}",
                context={"source_path": meta.get("source_path", "")},
            )

        current_checksum = f"sha256:{ContentHasher.hash_file(source)}"
        if current_checksum == meta.get("source_checksum"):
            return False

        extractor = self.registry.get_extractor(source.suffix)
        self._require_dependencies(extractor)

        result = extractor.extract(source)

        # The recorded relative path is kept as-is; only the checksum, size,
        # timestamp and extractor information are refreshed.
        new_meta = ProxyMetadata(
            source_path=meta["source_path"],
            source_checksum=current_checksum,
            source_size=source.stat().st_size,
            generated_at=datetime.now(timezone.utc).isoformat(),
            extractor=result.extractor,
            extractor_version=result.extractor_version,
        )

        self._write_proxy(proxy_path, new_meta, source.name, result.content)
        logger.info("Updated proxy %s", proxy_path.name)
        return True

    # ------------------------------------------------------------------
    # status
    # ------------------------------------------------------------------

    def status(self, proxy_path: Path) -> Dict:
        """Check a single proxy file's freshness.

        Returns a dict with keys: proxy, source, status, extractor.
        Status is one of: ``current``, ``stale``, ``missing-source``.

        Raises:
            ProxyError: If *proxy_path* is not a proxy file.
        """
        proxy_path = proxy_path.resolve()
        meta, _ = self._read_proxy(proxy_path)

        # "or ''" keeps both fields strings even when the front matter holds
        # an explicit null, so table/JSON output never has to special-case it.
        report = {
            "proxy": str(proxy_path),
            "source": meta.get("source_path") or "",
            "status": "missing-source",
            "extractor": meta.get("extractor") or "",
        }

        source = self._resolve_source(proxy_path, meta)
        if not source.is_file():
            return report

        current_checksum = f"sha256:{ContentHasher.hash_file(source)}"
        is_current = current_checksum == meta.get("source_checksum")
        report["status"] = "current" if is_current else "stale"
        return report

    def bulk_status(self, directory: Path) -> List[Dict]:
        """Scan *directory* recursively for proxy files and return their statuses.

        Only ``*.md`` files whose front matter has ``proxy`` set to exactly
        ``True`` are considered; unreadable files are skipped silently.
        Results are ordered by path.
        """
        directory = directory.resolve()
        results = []
        for path in sorted(directory.rglob("*.md")):
            meta, _ = self._try_read_proxy(path)
            if meta is not None and meta.get("proxy") is True:
                results.append(self.status(path))
        return results

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _require_dependencies(extractor) -> None:
        """Raise ProxyError (with the install hint) if *extractor* cannot run."""
        if not extractor.check_dependencies():
            raise ProxyError(
                f"Missing dependency for {extractor.name} extractor. "
                f"{extractor.dependency_hint()}",
            )

    @staticmethod
    def _write_proxy(
        proxy_path: Path,
        meta: ProxyMetadata,
        source_name: str,
        content: str,
    ) -> None:
        """Write the proxy file: YAML front matter, a heading, a note, content.

        Args:
            proxy_path: Destination file (overwritten if present).
            meta: Metadata serialised into the front matter.
            source_name: File name of the source, used as the top heading.
            content: Extracted Markdown appended after the header.
        """
        fm = {
            "proxy": True,
            "source_path": meta.source_path,
            "source_checksum": meta.source_checksum,
            "source_size": meta.source_size,
            "generated_at": meta.generated_at,
            "extractor": meta.extractor,
            "extractor_version": meta.extractor_version,
        }
        # sort_keys=False keeps the key order above in the written file.
        fm_text = yaml.dump(fm, default_flow_style=False, sort_keys=False)
        body = (
            f"---\n{fm_text}---\n\n"
            f"# {source_name}\n\n"
            f"*Proxy generated from `{meta.source_path}`*\n\n"
            f"{content}"
        )
        proxy_path.write_text(body, encoding="utf-8")

    @staticmethod
    def _read_proxy(proxy_path: Path):
        """Read and parse an existing proxy file.

        Returns:
            Tuple of (frontmatter dict, body str).

        Raises:
            ProxyError: If the front matter does not mark the file as a proxy.
        """
        raw = proxy_path.read_text(encoding="utf-8")
        meta, body = _frontmatter_parser.parse(raw)
        # "not meta" also covers a parser result without any front matter.
        if not meta or not meta.get("proxy"):
            raise ProxyError(
                f"Not a proxy file (missing 'proxy: true' in frontmatter): {proxy_path}",
                context={"path": str(proxy_path)},
            )
        return meta, body

    @staticmethod
    def _try_read_proxy(path: Path):
        """Attempt to read a proxy file, returning (None, None) on failure."""
        try:
            raw = path.read_text(encoding="utf-8")
            meta, body = _frontmatter_parser.parse(raw)
            return meta, body
        except Exception:
            # Deliberately broad: this is used while scanning arbitrary
            # Markdown files, where unreadable or unparsable files are
            # expected and must simply be skipped.
            return None, None

    @staticmethod
    def _resolve_source(proxy_path: Path, meta: Dict) -> Path:
        """Resolve the source path relative to the proxy file's directory.

        A missing or null ``source_path`` resolves to the proxy's directory
        itself, which is never a regular file, so callers treat it as a
        missing source instead of crashing. An absolute ``source_path`` is
        used as-is.
        """
        source_rel = meta.get("source_path") or ""
        return (proxy_path.parent / str(source_rel)).resolve()
|
||||
26
markitect/proxy/models.py
Normal file
26
markitect/proxy/models.py
Normal file
@@ -0,0 +1,26 @@
|
||||
"""
|
||||
Data models for the proxy file system.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class ProxyMetadata:
    """Origin information written to a proxy file's YAML front matter.

    Attributes:
        source_path: Path of the source file as recorded in the proxy.
        source_checksum: Checksum of the source, formatted ``sha256:<hex>``.
        source_size: Size of the source file.
        generated_at: Generation timestamp in ISO 8601 format.
        extractor: Name of the extractor that produced the content.
        extractor_version: Version of that extractor.
    """

    source_path: str
    source_checksum: str
    source_size: int
    generated_at: str
    extractor: str
    extractor_version: str
|
||||
|
||||
|
||||
@dataclass
class ExtractionResult:
    """What an extractor hands back after converting a source file.

    Attributes:
        content: The extracted Markdown text.
        extractor: Name of the extractor that produced it.
        extractor_version: Version of that extractor.
    """

    content: str
    extractor: str
    extractor_version: str
|
||||
65
markitect/proxy/registry.py
Normal file
65
markitect/proxy/registry.py
Normal file
@@ -0,0 +1,65 @@
|
||||
"""
|
||||
Extractor registry — register and look up extractors by file extension.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Dict, List, TYPE_CHECKING
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from markitect.proxy.exceptions import ExtractorNotFoundError
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from markitect.proxy.extractors.base import BaseExtractor
|
||||
|
||||
logger = logging.getLogger("markitect.proxy.registry")
|
||||
|
||||
|
||||
class ExtractorRegistry:
    """Maps file extensions to their corresponding extractors.

    Extensions are matched case-insensitively and with or without the leading
    dot, so ``".PDF"``, ``".pdf"`` and ``"pdf"`` all refer to the same entry.
    """

    def __init__(self):
        # Normalised extension (lower-case, leading dot) -> extractor.
        self._extractors: Dict[str, BaseExtractor] = {}

    @staticmethod
    def _normalize(extension: str) -> str:
        """Return *extension* lower-cased and with a leading dot.

        The empty string (a file without suffix) is returned unchanged so it
        never collides with a real extension.
        """
        ext = extension.lower()
        if ext and not ext.startswith("."):
            ext = f".{ext}"
        return ext

    def register(self, extractor: BaseExtractor) -> None:
        """Register an extractor for all of its declared extensions.

        A later registration for the same extension replaces the earlier one.
        """
        for ext in extractor.extensions:
            key = self._normalize(ext)
            self._extractors[key] = extractor
            logger.debug("Registered %s extractor for %s", extractor.name, key)

    def get_extractor(self, extension: str) -> BaseExtractor:
        """Look up an extractor by file extension (e.g. ``'.pdf'`` or ``'pdf'``).

        Raises:
            ExtractorNotFoundError: If no extractor handles the extension.
        """
        key = self._normalize(extension)
        try:
            return self._extractors[key]
        except KeyError:
            known = ", ".join(sorted(self._extractors))
            # "from None": the KeyError is an implementation detail and adds
            # nothing to the message shown to the user.
            raise ExtractorNotFoundError(
                f"No extractor registered for {key!r}. "
                f"Supported extensions: {known}",
                context={"extension": key},
            ) from None

    def get_extractor_for_file(self, path: Path) -> BaseExtractor:
        """Look up an extractor for a file based on its suffix."""
        return self.get_extractor(path.suffix)

    def list_extractors(self) -> List[BaseExtractor]:
        """Return a de-duplicated list of registered extractors.

        An extractor registered for several extensions appears once, at the
        position of its first extension in registration order.
        """
        # Keyed by identity because one instance is stored under each of its
        # extensions; dicts preserve first-insertion order.
        unique = {id(extractor): extractor for extractor in self._extractors.values()}
        return list(unique.values())
|
||||
|
||||
|
||||
# Module-level singleton
|
||||
registry = ExtractorRegistry()
|
||||
@@ -32,6 +32,9 @@ capabilities = [
|
||||
development = [
|
||||
"kaizen-agentic @ file:./capabilities/kaizen-agentic"
|
||||
]
|
||||
proxy-pdf = ["pymupdf4llm>=0.0.10"]
|
||||
proxy-html = ["markdownify>=0.13.1"]
|
||||
proxy = ["pymupdf4llm>=0.0.10", "markdownify>=0.13.1"]
|
||||
|
||||
[project.scripts]
|
||||
markitect = "markitect.cli:main"
|
||||
|
||||
Reference in New Issue
Block a user