# markitect-main/markitect/proxy/generator.py

"""
ProxyGenerator — create, update, and check status of proxy files.
"""

import logging
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List

import yaml

from markitect.assets.utils import ContentHasher
from markitect.frontmatter import FrontMatterParser
from markitect.proxy.exceptions import ProxyError
from markitect.proxy.models import ProxyMetadata
from markitect.proxy.registry import ExtractorRegistry

logger = logging.getLogger("markitect.proxy.generator")

_frontmatter_parser = FrontMatterParser()


class ProxyGenerator:
    """Creates and manages markdown proxy files."""

    def __init__(self, registry: ExtractorRegistry):
        """Keep a reference to *registry*, which is used to look up extractors."""
        self.registry = registry

    # ------------------------------------------------------------------
    # create
    # ------------------------------------------------------------------

    def create(self, source: Path, output_dir: Path, force: bool = False) -> Path:
        """Create a proxy markdown file for *source*.

        The proxy is written to ``<output_dir>/<source file name>.md`` and
        records the source location, checksum, size, generation time and the
        extractor that produced the content.

        Args:
            source: Path to the original file (e.g. ``report.pdf``).
            output_dir: Directory where the proxy file will be written.
                It is created (including parents) if it does not exist.
            force: If True, overwrite an existing proxy file.

        Returns:
            Path to the created proxy file.

        Raises:
            ProxyError: If *source* is not an existing file, if the proxy
                already exists and *force* is false, or if the extractor's
                dependency check fails. Errors raised by the registry lookup
                or by the extractor itself are not caught here and propagate
                unchanged.
        """
        source = source.resolve()
        if not source.is_file():
            raise ProxyError(
                f"Source file does not exist: {source}",
                context={"source": str(source)},
            )

        output_dir = output_dir.resolve()
        output_dir.mkdir(parents=True, exist_ok=True)

        proxy_path = output_dir / f"{source.name}.md"
        if proxy_path.exists() and not force:
            raise ProxyError(
                f"Proxy file already exists: {proxy_path}  (use --force to overwrite)",
                context={"proxy": str(proxy_path)},
            )

        extractor = self.registry.get_extractor_for_file(source)

        if not extractor.check_dependencies():
            raise ProxyError(
                f"Missing dependency for {extractor.name} extractor. "
                f"{extractor.dependency_hint()}",
            )

        # Hash *before* extracting (same order as update()): if the source is
        # modified while extraction runs, the recorded checksum describes the
        # older content and the proxy is later reported as stale instead of
        # being wrongly considered current.
        checksum = ContentHasher.hash_file(source)
        result = extractor.extract(source)
        source_size = source.stat().st_size

        # Relative path from proxy location to source.
        try:
            rel_source = os.path.relpath(source, output_dir)
        except ValueError:
            # On Windows no relative path exists between different drives.
            # Store the absolute path instead; joining an absolute path onto
            # the proxy directory (as _resolve_source does) yields that
            # absolute path, so the proxy still resolves correctly.
            rel_source = str(source)

        meta = ProxyMetadata(
            source_path=rel_source,
            source_checksum=f"sha256:{checksum}",
            source_size=source_size,
            generated_at=datetime.now(timezone.utc).isoformat(),
            extractor=result.extractor,
            extractor_version=result.extractor_version,
        )

        self._write_proxy(proxy_path, meta, source.name, result.content)
        logger.info("Created proxy %s -> %s", proxy_path.name, rel_source)
        return proxy_path

    # ------------------------------------------------------------------
    # update
    # ------------------------------------------------------------------

    def update(self, proxy_path: Path) -> bool:
        """Re-extract the proxy file if its source has changed.

        The source's current checksum is compared with the ``source_checksum``
        stored in the proxy's frontmatter; the proxy is rewritten only when
        they differ.

        Args:
            proxy_path: Path to an existing proxy markdown file.

        Returns:
            True if the proxy was updated, False if already current.

        Raises:
            ProxyError: If *proxy_path* is not a proxy file, if the source it
                points to is missing, or if the extractor's dependency check
                fails.
        """
        proxy_path = proxy_path.resolve()
        meta, _ = self._read_proxy(proxy_path)

        # Use .get() so a proxy without a ``source_path`` entry ends up in the
        # ProxyError below rather than escaping as a bare KeyError.
        rel_source = meta.get("source_path", "")

        source = self._resolve_source(proxy_path, meta)
        if not source.is_file():
            raise ProxyError(
                f"Source file missing: {source}",
                context={"source_path": rel_source},
            )

        current_checksum = f"sha256:{ContentHasher.hash_file(source)}"
        if current_checksum == meta.get("source_checksum"):
            return False

        # Same lookup as create(), so a proxy is refreshed by the extractor
        # selection logic that produced it in the first place.
        extractor = self.registry.get_extractor_for_file(source)
        if not extractor.check_dependencies():
            raise ProxyError(
                f"Missing dependency for {extractor.name} extractor. "
                f"{extractor.dependency_hint()}",
            )

        result = extractor.extract(source)

        # The stored source path is carried over unchanged; only the
        # checksum, size, timestamp and extractor details are refreshed.
        new_meta = ProxyMetadata(
            source_path=rel_source,
            source_checksum=current_checksum,
            source_size=source.stat().st_size,
            generated_at=datetime.now(timezone.utc).isoformat(),
            extractor=result.extractor,
            extractor_version=result.extractor_version,
        )

        self._write_proxy(proxy_path, new_meta, source.name, result.content)
        logger.info("Updated proxy %s", proxy_path.name)
        return True

    # ------------------------------------------------------------------
    # status
    # ------------------------------------------------------------------

    def status(self, proxy_path: Path) -> Dict:
        """Report whether a single proxy file is up to date with its source.

        Returns:
            A dict with the keys ``proxy`` (resolved proxy path as a string),
            ``source`` (the stored ``source_path``, or ``""``), ``status`` and
            ``extractor`` (the stored extractor name, or ``""``).
            ``status`` is ``missing-source`` when the source file cannot be
            found, ``current`` when its checksum matches the stored one, and
            ``stale`` otherwise.
        """
        resolved_proxy = proxy_path.resolve()
        frontmatter, _ = self._read_proxy(resolved_proxy)
        source_file = self._resolve_source(resolved_proxy, frontmatter)

        # The source is only hashed when it actually exists.
        if not source_file.is_file():
            state = "missing-source"
        elif f"sha256:{ContentHasher.hash_file(source_file)}" == frontmatter.get(
            "source_checksum"
        ):
            state = "current"
        else:
            state = "stale"

        return {
            "proxy": str(resolved_proxy),
            "source": frontmatter.get("source_path", ""),
            "status": state,
            "extractor": frontmatter.get("extractor", ""),
        }

    def bulk_status(self, directory: Path) -> List[Dict]:
        """Scan *directory* for proxy files and return their statuses."""
        directory = directory.resolve()
        results = []
        for path in sorted(directory.rglob("*.md")):
            meta, _ = self._try_read_proxy(path)
            if meta is not None and meta.get("proxy") is True:
                results.append(self.status(path))
        return results

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _write_proxy(
        proxy_path: Path,
        meta: ProxyMetadata,
        source_name: str,
        content: str,
    ) -> None:
        """Write a proxy file made of YAML frontmatter, a heading and *content*.

        The frontmatter starts with ``proxy: true`` followed by the metadata
        fields in a fixed order. The body is a level-one heading with
        *source_name*, an italic provenance line naming the source path, and
        then *content*. The file is written as UTF-8.
        """
        field_names = (
            "source_path",
            "source_checksum",
            "source_size",
            "generated_at",
            "extractor",
            "extractor_version",
        )
        front = {"proxy": True}
        front.update((name, getattr(meta, name)) for name in field_names)

        # sort_keys=False keeps the insertion order established above.
        yaml_text = yaml.dump(front, default_flow_style=False, sort_keys=False)

        pieces = (
            f"---\n{yaml_text}---\n\n",
            f"# {source_name}\n\n",
            f"*Proxy generated from `{meta.source_path}`*\n\n",
            f"{content}",
        )
        proxy_path.write_text("".join(pieces), encoding="utf-8")

    @staticmethod
    def _read_proxy(proxy_path: Path):
        """Load *proxy_path* and split it into frontmatter and body.

        Returns:
            Tuple of (frontmatter dict, body str).

        Raises:
            ProxyError: If the frontmatter has no truthy ``proxy`` entry.
        """
        meta, body = _frontmatter_parser.parse(proxy_path.read_text(encoding="utf-8"))
        if meta.get("proxy"):
            return meta, body
        raise ProxyError(
            f"Not a proxy file (missing 'proxy: true' in frontmatter): {proxy_path}",
            context={"path": str(proxy_path)},
        )

    @staticmethod
    def _try_read_proxy(path: Path):
        """Attempt to read a proxy file, returning (None, None) on failure.

        This is deliberately best-effort: any exception raised while reading
        or parsing *path* is swallowed so that one bad file cannot abort a
        caller that scans many files. The failure is logged at debug level
        (with traceback) so it can still be diagnosed.

        Returns:
            Tuple of (frontmatter, body) as produced by the frontmatter
            parser, or ``(None, None)`` if the file could not be read or
            parsed.
        """
        try:
            raw = path.read_text(encoding="utf-8")
            meta, body = _frontmatter_parser.parse(raw)
        except Exception:
            # Broad on purpose (see docstring), but no longer silent.
            logger.debug(
                "Skipping %s: could not read or parse frontmatter",
                path,
                exc_info=True,
            )
            return None, None
        return meta, body

    @staticmethod
    def _resolve_source(proxy_path: Path, meta: Dict) -> Path:
        """Resolve the source path relative to the proxy file's directory."""
        source_rel = meta.get("source_path", "")
        return (proxy_path.parent / source_rel).resolve()