markitect-main/markitect/infospace/pipeline.py

"""
Source file processing pipeline for infospace tooling.

Processes source files through the pipeline stages defined in
``infospace.yaml`` using simple ``@{macro}`` template substitution —
no SQLite or space-resolver dependencies.

Example usage from CLI::

    markitect infospace process "book-1-chapter-0[1-3].md" \\
        --provider openrouter --check-after-each

Or programmatically::

    from markitect.infospace.pipeline import SourcePipeline
    from markitect.llm import create_adapter

    adapter = create_adapter("openrouter")
    pipeline = SourcePipeline(config, root, adapter=adapter)
    pipeline.process_sources(source_files)
"""

from __future__ import annotations

import re
import subprocess
import time
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from markitect.infospace.config import InfospaceConfig, PipelineStage


class SourcePipeline:
    """Processes source files through infospace pipeline stages.

    Each stage in :attr:`config.pipeline.stages` is run in order for
    every source file.  Stage outputs are passed as macros to subsequent
    stages via the ``output_macro`` field.

    Template substitution uses ``@{macro_name}`` tokens.  Static macros
    are loaded from files listed under ``stage.macros``; dynamic macros
    include ``chapter_text`` (source file content), ``existing_entities``
    (for split-entities stages), and outputs from prior stages.
    """

    _PROVIDER_DEFAULTS: Dict[str, str] = {
        "openrouter": "arcee-ai/trinity-large-preview:free",
    }

    def __init__(
        self,
        config: InfospaceConfig,
        root: Path,
        adapter=None,
        no_commit: bool = False,
    ) -> None:
        self.config = config
        self.root = root
        self.adapter = adapter
        self.no_commit = no_commit

    # ── Public API ────────────────────────────────────────────────────

    def process_sources(
        self,
        source_files: List[Path],
        check_after_each: bool = False,
    ) -> int:
        """Process a list of source files through all pipeline stages.

        Args:
            source_files: Ordered list of source files to process.
            check_after_each: If True, run collection checks after each
                source and record a metrics snapshot.

        Returns:
            Number of source files fully processed (all stages complete).
        """
        completed = 0
        for source_file in source_files:
            success = self.process_source(source_file)
            if success:
                completed += 1
                if check_after_each:
                    self.run_collection_check()
        return completed

    def process_source(self, source_file: Path) -> bool:
        """Run the full pipeline for a single source file.

        Args:
            source_file: Path to the source markdown file.

        Returns:
            True if all stages completed successfully.
        """
        source_id = source_file.stem
        source_content = source_file.read_text(encoding="utf-8")
        stage_outputs: Dict[str, str] = {}

        print(f"\nProcessing: {source_id}")
        print("=" * 60)

        if not self.config.pipeline or not self.config.pipeline.stages:
            print("  No pipeline stages configured.")
            return False

        for stage in self.config.pipeline.stages:
            content = self._run_stage(stage, source_id, source_content, stage_outputs)
            if content is None:
                stage_label = stage.name or stage.template
                print(f"\n  Pipeline paused at stage '{stage_label}'.")
                return False
            if stage.output_macro:
                stage_outputs[stage.output_macro] = content

        print(f"\n  {source_id}: all stages complete.")
        if not self.no_commit:
            self._git_commit(source_id)

        return True

    def run_collection_check(self) -> None:
        """Run collection-level quality checks (C1–C5) and print metrics."""
        from markitect.infospace.entity_parser import parse_entity_directory
        from markitect.infospace.checks import run_all_checks
        from markitect.infospace.history import record_check_results

        entities_dir = self.root / self.config.entities_dir
        if not entities_dir.is_dir():
            print("\n  Collection check: no entities directory.")
            return

        entities = parse_entity_directory(entities_dir)
        if not entities:
            print("\n  Collection check: no entities found.")
            return

        print(f"\n  Collection check ({len(entities)} entities):")
        report = run_all_checks(entities=entities)
        m = report.metrics()
        for k, v in sorted(m.items()):
            print(f"    {k}: {v:.4f}")

        snap = record_check_results(
            report, self.config, self.root, entity_count=len(entities)
        )
        print(f"    Snapshot recorded: {snap.snapshot_id}")

    # ── Stage Execution ───────────────────────────────────────────────

    def _run_stage(
        self,
        stage: PipelineStage,
        source_id: str,
        source_content: str,
        stage_outputs: Dict[str, str],
    ) -> Optional[str]:
        """Run a single pipeline stage.

        Returns the stage's output content, or None on failure.
        """
        stage_label = stage.name or stage.template
        print(f"\n  [{stage_label}]")

        output_file = self._get_output_file(stage, source_id)

        # Skip-if-exists — load from disk
        if output_file and output_file.exists():
            print(f"    Found existing output: {output_file.name}")
            if stage.split_entities:
                return self._load_from_view(output_file)
            return output_file.read_text(encoding="utf-8")

        # Build macro substitution dict
        macros = self._build_macros(stage, source_content, stage_outputs)

        # Load and resolve template
        template_path = self.root / stage.template
        if not template_path.exists():
            print(f"    ERROR: Template not found: {stage.template}")
            return None

        template_content = template_path.read_text(encoding="utf-8")
        prompt = self._resolve_macros(template_content, macros)

        # Write prompt for inspection
        if output_file:
            prompt_file = output_file.parent / f"{source_id}-prompt.md"
            prompt_file.parent.mkdir(parents=True, exist_ok=True)
            prompt_file.write_text(prompt, encoding="utf-8")
            print(f"    Prompt written to {prompt_file.name}")

        # Without an adapter, we cannot generate output
        if self.adapter is None:
            print("    No LLM adapter — skipping generation (manual mode).")
            return None

        # Call LLM
        content = self._call_llm(prompt, stage_label)
        if content is None:
            return None

        # Persist output
        if stage.split_entities:
            entity_files = self._split_and_write_entities(stage, content)
            self._write_entity_view(source_id, entity_files, output_file)
            return content
        else:
            if output_file:
                output_file.parent.mkdir(parents=True, exist_ok=True)
                output_file.write_text(content, encoding="utf-8")
                print(f"    Output written to {output_file.name}")
            return content

    # ── Output File Resolution ────────────────────────────────────────

    def _get_output_file(
        self, stage: PipelineStage, source_id: str
    ) -> Optional[Path]:
        """Return the expected output file path for a stage, or None."""
        if not stage.output_dir or not stage.output_macro:
            return None
        return self.root / stage.output_dir / f"{source_id}-{stage.output_macro}.md"

    # ── Macro Resolution ──────────────────────────────────────────────

    def _build_macros(
        self,
        stage: PipelineStage,
        source_content: str,
        stage_outputs: Dict[str, str],
    ) -> Dict[str, str]:
        """Assemble the macro dict for template substitution."""
        macros: Dict[str, str] = {"chapter_text": source_content}

        # Static macros loaded from files
        for macro_name, rel_path in stage.macros.items():
            file_path = self.root / rel_path
            if file_path.exists():
                macros[macro_name] = file_path.read_text(encoding="utf-8")
            else:
                print(f"    WARNING: macro file not found: {rel_path}")
                macros[macro_name] = f"(File not found: {rel_path})"

        # Existing entities list (for split_entities stages)
        if stage.split_entities:
            entities_dir = self._get_entities_dir(stage)
            existing = self._list_existing_entities(entities_dir)
            if existing:
                macros["existing_entities"] = "\n".join(
                    f"- {s}" for s in existing
                )
            else:
                macros["existing_entities"] = (
                    "(none — this is the first source file)"
                )

        # Outputs from prior stages (override static macros if same name)
        macros.update(stage_outputs)

        return macros

    @staticmethod
    def _resolve_macros(template: str, macros: Dict[str, str]) -> str:
        """Substitute ``@{macro_name}`` tokens with values from *macros*."""

        def replacer(match: re.Match) -> str:
            key = match.group(1)
            return macros.get(key, match.group(0))

        return re.sub(r"@\{([^}]+)\}", replacer, template)

    # ── Entity Splitting ──────────────────────────────────────────────

    def _get_entities_dir(self, stage: Optional[PipelineStage] = None) -> Path:
        """Return the entities output directory."""
        if stage and stage.output_dir:
            return self.root / stage.output_dir
        # Fall back to the first split_entities stage's output_dir
        if self.config.pipeline:
            for s in self.config.pipeline.stages:
                if s.split_entities and s.output_dir:
                    return self.root / s.output_dir
        return self.root / "output" / "entities"

    def _list_existing_entities(self, entities_dir: Path) -> List[str]:
        """Return sorted slugs of all canonical entity files on disk."""
        if not entities_dir.is_dir():
            return []
        return sorted(
            f.stem
            for f in entities_dir.glob("*.md")
            if not f.name.endswith("-entities.md")
            and not f.name.endswith("-prompt.md")
        )

    @staticmethod
    def _normalize_entity_name(name: str) -> str:
        """Normalise entity name to a kebab-case filename stem."""
        slug = name.lower().strip()
        slug = slug.replace("_", "-").replace(" ", "-")
        slug = re.sub(r"[^a-z0-9-]", "", slug)
        slug = re.sub(r"-{2,}", "-", slug)
        return slug.strip("-")

    def _split_and_write_entities(
        self,
        stage: PipelineStage,
        combined_content: str,
    ) -> List[Tuple[str, Path]]:
        """Split ``--- ENTITY: <name> ---`` delimited output into files.

        Writes each entity to ``<output_dir>/<slug>.md``.  Existing files
        are skipped (first-occurrence wins).

        Returns:
            List of ``(entity_name, file_path)`` for all entities in
            *combined_content* (new and pre-existing alike).
        """
        entities_dir = self._get_entities_dir(stage)
        entities_dir.mkdir(parents=True, exist_ok=True)

        parts = re.split(
            r"^---\s*ENTITY:\s*(.+?)\s*---\s*$",
            combined_content,
            flags=re.MULTILINE,
        )

        entity_files: List[Tuple[str, Path]] = []
        new_count = 0
        skipped_count = 0

        for i in range(1, len(parts), 2):
            entity_name = parts[i]
            entity_content = parts[i + 1].strip() if i + 1 < len(parts) else ""

            slug = self._normalize_entity_name(entity_name)
            if not slug:
                continue

            file_path = entities_dir / f"{slug}.md"
            if file_path.exists():
                skipped_count += 1
            else:
                file_path.write_text(entity_content + "\n", encoding="utf-8")
                new_count += 1

            entity_files.append((entity_name, file_path))

        msg = f"    {new_count} new entities written"
        if skipped_count:
            msg += f", {skipped_count} pre-existing (skipped)"
        print(msg)
        return entity_files

    def _write_entity_view(
        self,
        source_id: str,
        entity_files: List[Tuple[str, Path]],
        view_file: Optional[Path],
    ) -> None:
        """Write a per-source view file with ``{{ include }}`` directives."""
        if view_file is None:
            return

        lines = [f"# Entities: {source_id}\n"]
        for _name, file_path in entity_files:
            lines.append(f'{{{{ include "{file_path.name}" }}}}')
            lines.append("")
            lines.append("---")
            lines.append("")

        # Remove trailing separator after last entity
        if lines and lines[-1] == "" and len(lines) >= 3 and lines[-2] == "---":
            lines = lines[:-2]

        view_file.parent.mkdir(parents=True, exist_ok=True)
        view_file.write_text("\n".join(lines) + "\n", encoding="utf-8")
        print(f"    View written to {view_file.name}")

    def _load_from_view(self, view_file: Path) -> str:
        """Reconstruct combined entity content from a view file.

        Parses ``{{ include "filename.md" }}`` directives and assembles
        the ``--- ENTITY: <slug> ---`` delimited content for downstream
        stages.
        """
        view_content = view_file.read_text(encoding="utf-8")
        includes = re.findall(
            r'\{\{\s*include\s+"([^"]+)"\s*\}\}', view_content
        )

        entities_dir = view_file.parent
        parts: List[str] = []
        for rel_path in includes:
            file_path = entities_dir / rel_path
            if file_path.exists():
                slug = file_path.stem
                body = file_path.read_text(encoding="utf-8").strip()
                parts.append(f"--- ENTITY: {slug} ---\n\n{body}")

        return "\n\n".join(parts) + "\n" if parts else ""

    # ── LLM Execution ─────────────────────────────────────────────────

    def _call_llm(
        self, prompt: str, stage_name: str, max_tokens: int = 8192
    ) -> Optional[str]:
        """Call the LLM adapter with exponential back-off on rate limits.

        Returns the response content string, or None on failure.
        """
        from markitect.prompts.execution.models import RunConfig
        from markitect.llm.exceptions import LLMRateLimitError

        print(f"    Calling LLM ({stage_name})...")
        t0 = time.time()
        max_retries = 3
        response = None

        for attempt in range(max_retries + 1):
            try:
                response = self.adapter.execute_prompt(
                    prompt, RunConfig(max_tokens=max_tokens)
                )
                break
            except LLMRateLimitError as exc:
                if attempt < max_retries:
                    wait = 15 * (attempt + 1)
                    print(
                        f"    Rate limited — retrying in {wait}s "
                        f"(attempt {attempt + 1}/{max_retries})..."
                    )
                    time.sleep(wait)
                else:
                    print(f"    Rate limit exceeded after {max_retries} retries: {exc}")
                    return None
            except Exception as exc:
                print(f"    LLM error: {exc}")
                return None

        if response is None:
            return None

        elapsed = time.time() - t0
        usage = response.usage
        print(
            f"    Done in {elapsed:.1f}s — "
            f"prompt {usage.get('prompt_tokens', '?')} tok, "
            f"completion {usage.get('completion_tokens', '?')} tok"
        )

        content = response.content
        if not content or not content.strip():
            print("    LLM returned empty content.")
            return None

        return content

    # ── Git Integration ───────────────────────────────────────────────

    def _git_commit(self, source_id: str) -> None:
        """Stage all output changes and commit them for *source_id*."""
        output_dir = self.root / "output"
        try:
            subprocess.run(
                ["git", "add", str(output_dir)],
                cwd=str(self.root),
                check=True,
                capture_output=True,
            )
            result = subprocess.run(
                [
                    "git", "commit", "-m",
                    f"infospace: process {source_id}\n\n"
                    f"Extract entities, map to VSM, and synthesize analysis.",
                ],
                cwd=str(self.root),
                capture_output=True,
                text=True,
            )
            if result.returncode == 0:
                print(f"  Git commit: process {source_id}")
            else:
                output = result.stdout + result.stderr
                if "nothing to commit" in output or "nothing added" in output:
                    print(f"  Git: nothing to commit for {source_id}")
                else:
                    print(f"  Warning: Git commit failed: {output.strip()}")
        except subprocess.CalledProcessError as e:
            stderr = e.stderr.decode() if isinstance(e.stderr, bytes) else (e.stderr or "")
            print(f"  Warning: Git error: {stderr.strip()}")