Files
markitect-main/markitect/proxy/cli.py
tegwick ac334c679d feat(proxy): add proxy file system for non-markdown source conversion
Introduces a new `markitect/proxy/` module with pluggable extractors that
convert non-markdown sources (PDF, HTML) into tracked markdown proxy files.
Proxy files preserve origin metadata (path, checksum, timestamp) so they
can be kept in sync when the original changes.

CLI commands: `proxy create`, `proxy update`, `proxy status`, `proxy extractors`.
Built-in extractors: PDF (pymupdf4llm), HTML (markdownify), Markdown (built-in).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 19:06:09 +01:00

186 lines
6.0 KiB
Python

"""
Click CLI commands for the proxy file system.
"""
import json
from pathlib import Path
import click
from markitect.proxy.exceptions import ProxyError
@click.group("proxy")
def proxy_group():
    """Proxy file operations — create, update, and manage markdown proxies."""
    # Intentionally empty: this function only serves as the Click group that
    # the `create`, `update`, `status` and `extractors` subcommands attach to.
@proxy_group.command("create")
@click.argument("source", type=click.Path(exists=True))
@click.option(
    "--output-dir",
    "-o",
    type=click.Path(),
    default=".",
    help="Directory to write the proxy file (default: current dir).",
)
@click.option("--force", "-f", is_flag=True, help="Overwrite existing proxy file.")
def proxy_create(source, output_dir, force):
    """Create a markdown proxy for SOURCE."""
    # Imported inside the command rather than at module top so that the CLI
    # group can still be registered when optional dependencies are absent.
    from markitect.proxy.registry import registry
    import markitect.proxy.extractors  # noqa: F401 — registers built-ins
    from markitect.proxy.generator import ProxyGenerator

    generator = ProxyGenerator(registry)
    try:
        created = generator.create(Path(source), Path(output_dir), force=force)
    except ProxyError as exc:
        # Report the failure on stderr and exit with status 1.
        click.echo(f"Error: {exc}", err=True)
        raise SystemExit(1)
    else:
        click.echo(f"Created proxy: {created}")
@proxy_group.command("update")
@click.argument("target", type=click.Path(exists=True))
@click.option("--dry-run", is_flag=True, help="Show what would change without writing.")
def proxy_update(target, dry_run):
    """Re-extract a single proxy file or all proxies in a directory."""
    from markitect.proxy.registry import registry
    import markitect.proxy.extractors  # noqa: F401
    from markitect.proxy.generator import ProxyGenerator

    generator = ProxyGenerator(registry)
    resolved = Path(target).resolve()
    try:
        # Work out which proxy files to process, then update them in order.
        if resolved.is_file():
            pending = [resolved]
        elif resolved.is_dir():
            pending = _find_proxy_files(generator, resolved)
            if not pending:
                click.echo("No proxy files found.")
                return
        else:
            # The path exists but is neither a regular file nor a directory.
            click.echo(f"Error: {target} is not a file or directory.", err=True)
            raise SystemExit(1)

        for proxy_file in pending:
            _update_one(generator, proxy_file, dry_run)
    except ProxyError as exc:
        # A ProxyError stops processing; remaining files are not attempted.
        click.echo(f"Error: {exc}", err=True)
        raise SystemExit(1)
@proxy_group.command("status")
@click.argument("directory", type=click.Path(exists=True), default=".")
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["table", "json"]),
    default="table",
    help="Output format.",
)
def proxy_status(directory, output_format):
    """Show freshness of all proxy files in DIRECTORY (default: current dir)."""
    from markitect.proxy.registry import registry
    import markitect.proxy.extractors  # noqa: F401
    from markitect.proxy.generator import ProxyGenerator

    generator = ProxyGenerator(registry)
    try:
        rows = generator.bulk_status(Path(directory))
    except ProxyError as exc:
        click.echo(f"Error: {exc}", err=True)
        raise SystemExit(1)

    if not rows:
        click.echo("No proxy files found.")
        return

    # Anything other than "json" falls back to the human-readable table.
    if output_format != "json":
        _print_status_table(rows)
        return

    click.echo(json.dumps(rows, indent=2))
@proxy_group.command("extractors")
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["table", "json"]),
    default="table",
    help="Output format.",
)
def proxy_extractors(output_format):
    """List registered extractors and their dependency status."""
    from markitect.proxy.registry import registry
    import markitect.proxy.extractors  # noqa: F401

    available = registry.list_extractors()

    # Anything other than "json" is rendered as the plain-text table.
    if output_format != "json":
        _print_extractor_table(available)
        return

    payload = []
    for extractor in available:
        payload.append(
            {
                "name": extractor.name,
                "version": extractor.version,
                # Copied into a list before being handed to json.dumps.
                "extensions": list(extractor.extensions),
                "installed": extractor.check_dependencies(),
                "hint": extractor.dependency_hint(),
            }
        )
    click.echo(json.dumps(payload, indent=2))
# ------------------------------------------------------------------
# helpers
# ------------------------------------------------------------------
def _update_one(gen, proxy_path, dry_run):
    """Report on, and unless *dry_run* is set refresh, one proxy file.

    The status reported by ``gen.status`` decides the outcome:

    * ``"current"`` -- nothing to do.
    * ``"missing-source"`` -- reported and skipped.
    * anything else -- treated as stale: announced under *dry_run*,
      otherwise ``gen.update`` is called and its truthiness picks the label.

    Exactly one line is echoed per call.
    """
    state = gen.status(proxy_path)["status"]

    if state == "current":
        outcome = "current"
    elif state == "missing-source":
        outcome = "source missing"
    elif dry_run:
        outcome = "stale (would update)"
    else:
        # gen.update returns a truthy value when it rewrote the proxy.
        outcome = "updated" if gen.update(proxy_path) else "current"

    click.echo(f" {proxy_path.name}: {outcome}")
def _find_proxy_files(gen, directory):
"""Return proxy file paths within *directory*."""
results = []
for path in sorted(directory.rglob("*.md")):
meta, _ = gen._try_read_proxy(path)
if meta is not None and meta.get("proxy") is True:
results.append(path)
return results
def _print_status_table(results):
    """Echo *results* as a fixed-width, four-column text table.

    Each entry is read through its ``"proxy"``, ``"source"``, ``"status"``
    and ``"extractor"`` keys; only the final path component of the proxy is
    shown.
    """
    # One template keeps the header and the data rows aligned.
    row_template = "{:<40} {:<30} {:<15} {:<10}"

    click.echo(row_template.format("Proxy", "Source", "Status", "Extractor"))
    click.echo("-" * 95)
    for entry in results:
        click.echo(
            row_template.format(
                Path(entry["proxy"]).name,
                entry["source"],
                entry["status"],
                entry["extractor"],
            )
        )
def _print_extractor_table(extractors):
    """Echo *extractors* as a fixed-width, four-column text table.

    For each extractor the table shows its name, version, comma-separated
    extensions, and ``installed``/``missing`` according to the result of
    ``check_dependencies()``.
    """
    # One template keeps the header and the data rows aligned.
    row_template = "{:<15} {:<10} {:<25} {:<10}"

    click.echo(row_template.format("Name", "Version", "Extensions", "Status"))
    click.echo("-" * 60)
    for extractor in extractors:
        joined_exts = ", ".join(extractor.extensions)
        if extractor.check_dependencies():
            availability = "installed"
        else:
            availability = "missing"
        click.echo(
            row_template.format(
                extractor.name, extractor.version, joined_exts, availability
            )
        )