markitect-main/examples/infospace-with-history/process_chapters.py

#!/usr/bin/env python3
"""
Infospace with History — Chapter Processing Pipeline

Processes chapters from Adam Smith's "The Wealth of Nations" through a
three-stage analysis pipeline, mapping economic content to Stafford Beer's
Viable System Model.

Pipeline per chapter:
  1. extract-entities  — Extract economic entities from chapter text
  2. map-to-vsm       — Map entities to VSM concepts
  3. synthesize-analysis — Produce chapter-level VSM analysis

After all chapters:
  4. assess-metrics    — Evaluate completeness and consistency

Usage:
  # Process a single chapter
  python process_chapters.py --chapter book-1-chapter-01

  # Process all chapters in Book I
  python process_chapters.py --book 1

  # Process all chapters
  python process_chapters.py --all

  # Assess metrics only (after chapters have been processed)
  python process_chapters.py --metrics

  # List available chapters
  python process_chapters.py --list
"""

import argparse
import re
import subprocess
import sys
from pathlib import Path
from typing import Optional

# Put the project root first on sys.path so the ``markitect`` imports below
# are looked up there before anywhere else. The root is taken to be three
# directory levels above this script (presumably where the ``markitect``
# package lives — confirm against the repository layout).
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from markitect.prompts.models import Artifact, ArtifactType
from markitect.prompts.repositories.sqlite import SQLiteArtifactRepository
from markitect.prompts.dependencies.repository import SQLiteDependencyRepository
from markitect.prompts.services.artifact_service import ArtifactService
from markitect.prompts.templates.models import PromptTemplate
from markitect.prompts.templates.analyzer import TemplateAnalyzer
from markitect.prompts.resolver.resolver import PromptResolver
from markitect.prompts.resolver.compiler import ContextCompiler
from markitect.prompts.resolver.strategy import ResolutionConfig, MultiSpaceResolutionStrategy
from markitect.prompts.execution.manifest import RunManifest
from markitect.prompts.dependencies.graph import GraphBuilder
from markitect.prompts.traceability.service import TraceabilityService
from markitect.prompts.queries.operations import PromptQueryService


class ChapterProcessor:
    """Processes Wealth of Nations chapters through the VSM analysis pipeline."""

    def __init__(
        self,
        example_dir: Path,
        db_path: Optional[str] = None,
        llm_adapter=None,
    ):
        """Wire up repositories, services and the information-space table.

        Args:
            example_dir: Root directory of the example; templates, artifacts
                and outputs are all resolved relative to it.
            db_path: Location of the SQLite database.  Any falsy value
                selects ``infospace.db`` inside *example_dir*.
            llm_adapter: Object whose ``execute_prompt`` method is invoked by
                ``_call_llm``; may be ``None``.
        """
        self.example_dir = example_dir
        if db_path:
            self.db_path = db_path
        else:
            self.db_path = str(example_dir / "infospace.db")
        self.llm_adapter = llm_adapter

        # Persistence layer: both repositories are opened on the same
        # database path, and the services are layered on top of them.
        database = self.db_path
        self.artifact_repo = SQLiteArtifactRepository(database)
        self.dep_repo = SQLiteDependencyRepository(database)
        self.artifact_service = ArtifactService(self.artifact_repo)
        self.graph_builder = GraphBuilder(self.dep_repo)
        self.trace_service = TraceabilityService(
            self.artifact_repo,
            self.dep_repo,
            db_path=database,
        )
        self.query_service = PromptQueryService(
            self.artifact_repo,
            self.dep_repo,
            db_path=database,
        )

        # Helpers used by resolve_and_compile().
        self.analyzer = TemplateAnalyzer()
        self.compiler = ContextCompiler()

        # Short key -> information-space id used throughout the pipeline.
        self.spaces = dict(
            [
                ("templates", "infospace-templates"),
                ("sources", "infospace-sources"),
                ("guidelines", "infospace-guidelines"),
                ("vsm-reference", "infospace-vsm-reference"),
                ("entities", "infospace-entities"),
                ("mappings", "infospace-mappings"),
                ("analyses", "infospace-analyses"),
                ("metrics", "infospace-metrics"),
            ]
        )

    # ── Artifact Management ──────────────────────────────────────────

    def load_or_create_artifact(
        self,
        space: str,
        filepath: Path,
        artifact_type: ArtifactType,
        name: Optional[str] = None,
    ) -> Artifact:
        """Load artifact from file, create in repo if needed."""
        if name is None:
            name = filepath.stem

        content = filepath.read_text()

        existing = self.artifact_repo.get_by_name(space, name)
        if existing:
            return existing

        artifact = Artifact.create(
            space_id=space, name=name, content=content, artifact_type=artifact_type
        )
        artifact = self.artifact_repo.create(artifact)
        print(f"  + {name} ({artifact.content_digest[:8]})")
        return artifact

    def store_output_artifact(
        self, space: str, name: str, content: str, artifact_type: ArtifactType
    ) -> Artifact:
        """Persist generated content under *name*, replacing any prior version.

        An existing artifact with the same name in *space* is deleted first;
        a fresh artifact is then created and the repository's result returned.
        """
        previous = self.artifact_repo.get_by_name(space, name)
        if previous:
            # Replace rather than update: drop the old record first.
            self.artifact_repo.delete(previous.id)

        replacement = Artifact.create(
            space_id=space,
            name=name,
            content=content,
            artifact_type=artifact_type,
        )
        return self.artifact_repo.create(replacement)

    def bind_macro_artifact(self, space: str, macro_name: str, content: str) -> Artifact:
        """Bind *content* to *macro_name* in *space* for template resolution.

        Any artifact already stored under that name is replaced.  The new
        artifact is always of type ``ArtifactType.CONTENT``.
        """
        # Same delete-then-create sequence as store_output_artifact(), with
        # the artifact type fixed to CONTENT.
        return self.store_output_artifact(
            space, macro_name, content, ArtifactType.CONTENT
        )

    # ── Setup ────────────────────────────────────────────────────────

    def setup(self):
        """Load all static artifacts (templates, VSM reference, guidelines).

        Files are visited in sorted order so that repeated runs behave the
        same regardless of filesystem enumeration order.  This matters for
        the VSM reference directory: every file there is registered under
        the single name ``vsm_framework``, and ``load_or_create_artifact``
        keeps whichever artifact is stored first.
        """
        print("Loading artifacts...")

        # Templates: one artifact per file, named after the file stem.
        for tmpl_file in sorted((self.example_dir / "templates").glob("*.md")):
            self.load_or_create_artifact(
                self.spaces["templates"], tmpl_file, ArtifactType.TEMPLATE
            )

        # VSM reference: all files share one artifact name, so only one of
        # them can ever be stored.  Make that collision visible.
        ref_files = sorted(
            (self.example_dir / "artifacts" / "vsm-reference").glob("*.md")
        )
        if len(ref_files) > 1:
            print(
                f"  WARNING: {len(ref_files)} VSM reference files map to the "
                f"single artifact name 'vsm_framework'; only one can be stored"
            )
        for ref_file in ref_files:
            self.load_or_create_artifact(
                self.spaces["vsm-reference"], ref_file, ArtifactType.CONTENT,
                name="vsm_framework",
            )

        # Guidelines: known files get the macro names listed here, any other
        # file falls back to its stem.
        guideline_name_map = {
            "extraction-rules.md": "extraction_rules",
            "mapping-rules.md": "mapping_rules",
        }
        guidelines_dir = self.example_dir / "artifacts" / "guidelines"
        for guide_file in sorted(guidelines_dir.glob("*.md")):
            name = guideline_name_map.get(guide_file.name, guide_file.stem)
            self.load_or_create_artifact(
                self.spaces["guidelines"], guide_file, ArtifactType.CONTENT, name=name
            )

        print("  Done.\n")

    # ── Template Resolution ──────────────────────────────────────────

    def resolve_and_compile(
        self, template_name: str, extra_spaces: list[str]
    ) -> Optional[str]:
        """Resolve macros and compile a template into a final prompt string.

        Uses TemplateAnalyzer to parse @{target} macros from the template,
        the resolver to look up artifact content, and ContextCompiler to
        assemble the final prompt.
        """
        template_artifact = self.artifact_repo.get_by_name(
            self.spaces["templates"], template_name
        )
        if not template_artifact:
            print(f"  ERROR: Template '{template_name}' not found")
            return None

        template = PromptTemplate.from_artifact(template_artifact)
        template_content = template_artifact.content

        # Analyze template to extract @{target} macros
        self.analyzer.analyze(template, template_content)

        config = ResolutionConfig(
            space_id=self.spaces["templates"],
            included_spaces=[self.spaces[s] for s in extra_spaces],
        )

        strategy = MultiSpaceResolutionStrategy()
        resolver = PromptResolver(self.artifact_service, strategy)
        result = resolver.resolve_template(template, config)

        if not result.success:
            print(f"  ERROR: Resolution failed: {result.context.errors}")
            return None

        # Compile template with resolved content
        compiled = self.compiler.compile(template, template_content, result)
        return compiled.content

    # ── LLM Execution Helpers ─────────────────────────────────────────

    def _call_llm(self, prompt: str, stage_label: str, max_tokens: int = 8192) -> Optional[str]:
        """Call the LLM and return the content string, or ``None`` on failure.

        Retries up to 3 times on rate-limit (429) errors, waiting 15 s, 30 s
        and 45 s between attempts (linear backoff).  Any other exception
        aborts immediately.  Does **not** write any files — callers decide
        where to persist.

        Args:
            prompt: Compiled prompt text handed to the adapter.
            stage_label: Short label used only in progress messages.
            max_tokens: Forwarded to ``RunConfig`` for the call.

        Returns:
            The response content, or ``None`` when the call raised, the rate
            limit persisted through every retry, or the content was blank.
        """
        import time as _time
        from markitect.prompts.execution.models import RunConfig
        from markitect.llm.exceptions import LLMRateLimitError

        print(f"        Calling LLM ({stage_label})...")
        # Monotonic clock: elapsed times stay correct if the wall clock jumps.
        t0 = _time.monotonic()
        max_retries = 3
        for attempt in range(max_retries + 1):
            try:
                response = self.llm_adapter.execute_prompt(prompt, RunConfig(max_tokens=max_tokens))
                break  # success
            except LLMRateLimitError as exc:
                if attempt < max_retries:
                    wait = 15 * (attempt + 1)  # 15, 30, 45 seconds
                    print(f"        Rate limited, retrying in {wait}s (attempt {attempt + 1}/{max_retries})...")
                    _time.sleep(wait)
                else:
                    print(f"        LLM rate limit after {max_retries} retries ({_time.monotonic() - t0:.1f}s): {exc}")
                    return None
            except Exception as exc:
                # Best-effort boundary: report and let the caller carry on.
                print(f"        LLM error ({_time.monotonic() - t0:.1f}s): {exc}")
                return None

        elapsed = _time.monotonic() - t0
        # Token usage is only reported, never required: tolerate a response
        # whose usage is None so a successful call is not lost to logging.
        usage = response.usage or {}
        print(
            f"        LLM done in {elapsed:.1f}s — "
            f"prompt {usage.get('prompt_tokens', '?')} tok, "
            f"completion {usage.get('completion_tokens', '?')} tok, "
            f"total {usage.get('total_tokens', '?')} tok"
        )

        content = response.content
        if not content or not content.strip():
            print("        LLM returned empty content")
            return None

        return content

    def _execute_llm(self, prompt: str, output_file: Path, stage_label: str, max_tokens: int = 8192) -> Optional[str]:
        """Call the LLM, write the result to *output_file*, and return it."""
        content = self._call_llm(prompt, stage_label, max_tokens=max_tokens)
        if content:
            output_file.parent.mkdir(parents=True, exist_ok=True)
            output_file.write_text(content)
            print(f"        LLM output written to {output_file.name}")
        return content

    # ── Entity Management (flat canonical set) ─────────────────────

    @staticmethod
    def _normalize_entity_name(name: str) -> str:
        """Normalize an entity name to a kebab-case filename stem."""
        slug = name.lower().strip()
        slug = slug.replace("_", "-").replace(" ", "-")
        slug = re.sub(r"[^a-z0-9-]", "", slug)
        slug = re.sub(r"-{2,}", "-", slug)
        return slug.strip("-")

    def _entities_dir(self) -> Path:
        return self.example_dir / "output" / "entities"

    def _archive_dir(self) -> Path:
        return self._entities_dir() / "archive"

    def _list_existing_entity_names(self) -> list[str]:
        """Return sorted slugs of all canonical entity files already on disk."""
        return sorted(
            f.stem
            for f in self._entities_dir().glob("*.md")
            if not f.name.endswith("-entities.md")
            and not f.name.endswith("-prompt.md")
        )

    def archive_entity(self, slug: str, reason: str) -> None:
        """Move a canonical entity to the archive with a documented reason.

        The entity file is prepended with an archive header explaining why
        it was retired, then moved to ``output/entities/archive/<slug>.md``.
        Chapter views that reference this entity are **not** updated
        automatically — review and update them manually.
        """
        src = self._entities_dir() / f"{slug}.md"
        if not src.exists():
            print(f"  Entity not found: {slug}")
            return

        archive = self._archive_dir()
        archive.mkdir(parents=True, exist_ok=True)
        dest = archive / f"{slug}.md"

        from datetime import date
        header = (
            f"<!-- ARCHIVED {date.today().isoformat()}\n"
            f"     Reason: {reason}\n"
            f"-->\n\n"
        )
        content = src.read_text()
        dest.write_text(header + content)
        src.unlink()

        # Report which chapter views still reference this entity
        refs = []
        for view in self._entities_dir().glob("*-entities.md"):
            if f'include "{slug}.md"' in view.read_text():
                refs.append(view.name)

        print(f"  Archived: {slug}.md -> archive/{slug}.md")
        print(f"  Reason: {reason}")
        if refs:
            print(f"  Referenced by: {', '.join(refs)} (update these views)")
        print(f"  Canonical set: {len(self._list_existing_entity_names())} entities")

    def _split_entities(
        self, combined_content: str
    ) -> list[tuple[str, Path]]:
        """Split combined LLM output into the flat canonical entity directory.

        Writes each entity to ``output/entities/<slug>.md``.  If a file
        with that slug already exists it is **skipped** (first-occurrence
        wins), but the entity is still included in the returned list so
        the chapter view can reference it.

        Returns list of (entity_name, file_path) for every entity in
        *combined_content* (new and pre-existing alike).
        """
        entities_dir = self._entities_dir()
        entities_dir.mkdir(parents=True, exist_ok=True)

        parts = re.split(
            r"^---\s*ENTITY:\s*(.+?)\s*---\s*$",
            combined_content,
            flags=re.MULTILINE,
        )

        entity_files: list[tuple[str, Path]] = []
        new_count = 0
        skipped_count = 0

        for i in range(1, len(parts), 2):
            entity_name = parts[i]
            entity_content = parts[i + 1].strip() if i + 1 < len(parts) else ""

            slug = self._normalize_entity_name(entity_name)
            if not slug:
                continue

            file_path = entities_dir / f"{slug}.md"
            if file_path.exists():
                skipped_count += 1
            else:
                file_path.write_text(entity_content + "\n")
                new_count += 1

            entity_files.append((entity_name, file_path))

        msg = f"        {new_count} new entities written"
        if skipped_count:
            msg += f", {skipped_count} pre-existing (skipped)"
        print(msg)
        return entity_files

    def _write_chapter_entity_view(
        self, chapter_id: str, entity_files: list[tuple[str, Path]]
    ) -> Path:
        """Write a per-chapter view file that transcludes individual entities."""
        parts = chapter_id.split("-")
        book_num = int(parts[1]) if len(parts) >= 2 else 1
        ch_num = int(parts[3]) if len(parts) >= 4 else 0
        roman = {1: "I", 2: "II", 3: "III", 4: "IV", 5: "V"}.get(book_num, str(book_num))
        title = f"# Economic Entities — Book {roman}, Chapter {ch_num}\n"

        lines = [title]
        for _name, file_path in entity_files:
            lines.append(f'{{{{ include "{file_path.name}" }}}}')
            lines.append("")
            lines.append("---")
            lines.append("")

        # Remove trailing separator after last entity
        if lines and lines[-1] == "" and len(lines) >= 3 and lines[-2] == "---":
            lines = lines[:-2]

        view_path = self._entities_dir() / f"{chapter_id}-entities.md"
        view_path.write_text("\n".join(lines) + "\n")
        print(f"        Chapter view written to {view_path.name}")
        return view_path

    def _read_entities_from_view(
        self, chapter_id: str
    ) -> tuple[str, list[tuple[str, Path]]]:
        """Rebuild a chapter's combined entity text from its view file.

        The view's file includes are obtained from
        ``DirectiveParser.extract_file_includes``; each included file that
        exists in the entities directory is read and emitted as a
        ``--- ENTITY: <slug> ---`` section.  Includes whose file is missing
        are silently left out.

        Returns:
            ``(combined, entity_files)`` where *combined* is the delimited
            text (empty string when nothing was found) and *entity_files*
            lists ``(slug, path)`` for each entity that was read.
        """
        from markitect.packaging.transclusion.directives import DirectiveParser

        entities_dir = self._entities_dir()
        view_text = (entities_dir / f"{chapter_id}-entities.md").read_text()

        entity_files: list[tuple[str, Path]] = []
        sections: list[str] = []
        for rel_path in DirectiveParser.extract_file_includes(view_text):
            entity_path = entities_dir / rel_path
            if entity_path.exists():
                slug = entity_path.stem
                body = entity_path.read_text().strip()
                sections.append(f"--- ENTITY: {slug} ---\n\n{body}")
                entity_files.append((slug, entity_path))

        if not sections:
            return "", entity_files
        return "\n\n".join(sections) + "\n", entity_files

    # ── Pipeline Stages ──────────────────────────────────────────────

    def stage_extract_entities(self, chapter_id: str, chapter_content: str) -> Optional[str]:
        """Stage 1: Extract economic entities from a chapter.

        Canonical entity files live in a **flat** directory
        (``output/entities/<slug>.md``).  Duplicates across chapters are
        skipped — first occurrence wins.  The per-chapter view file
        (``<chapter_id>-entities.md``) is a **secondary** transclusion view
        that ``{{ include }}``s each entity relevant to the chapter.

        After binding the macros and compiling the prompt, the entity content
        is taken from the first of these sources that applies:

        1. an existing chapter view that contains ``{{ include`` directives;
        2. a per-chapter subdirectory ``output/entities/<chapter_id>/``,
           whose files are moved into the flat directory;
        3. a legacy combined file containing ``--- ENTITY:`` delimiters,
           which is split into individual files;
        4. a fresh LLM call, when an adapter is configured.

        Args:
            chapter_id: Chapter identifier used to name the view and prompt
                files.
            chapter_content: Chapter text bound to the ``chapter_text`` macro.

        Returns:
            The combined entity content (reconstructed from the view in cases
            1 and 2, the raw file or LLM text in cases 3 and 4), or ``None``
            when the prompt could not be compiled or no source produced
            content.
        """
        print(f"  [1/3] Extracting entities...")

        # Bind the chapter content to the macro name
        self.bind_macro_artifact(self.spaces["sources"], "chapter_text", chapter_content)

        # Bind existing entity list so the LLM knows what already exists
        existing = self._list_existing_entity_names()
        if existing:
            entity_list = "\n".join(f"- {name}" for name in existing)
        else:
            entity_list = "(none — this is the first chapter)"
        self.bind_macro_artifact(
            self.spaces["entities"], "existing_entities", entity_list
        )

        # The prompt is compiled on every run, even when output already
        # exists on disk; a compile failure aborts the stage.
        prompt = self.resolve_and_compile(
            "extract-entities",
            ["sources", "guidelines", "vsm-reference", "entities"],
        )
        if not prompt:
            return None

        view_file = self._entities_dir() / f"{chapter_id}-entities.md"

        # Write compiled prompt only when no output exists yet (avoids dirty
        # working tree on DB-only rebuilds — Task 5 fix)
        prompt_file = self._entities_dir() / f"{chapter_id}-prompt.md"
        if not (view_file.exists() and "{{ include" in view_file.read_text()):
            prompt_file.parent.mkdir(parents=True, exist_ok=True)
            prompt_file.write_text(prompt)
            print(f"        Prompt written to {prompt_file.relative_to(self.example_dir)}")

        # ── PRIMARY: chapter view with transclusion already on disk ──
        # Content is rebuilt from the canonical entity files and stored as
        # the chapter's generated artifact; no LLM call is made.
        if view_file.exists() and "{{ include" in view_file.read_text():
            content, entity_files = self._read_entities_from_view(chapter_id)
            self.store_output_artifact(
                self.spaces["entities"],
                f"{chapter_id}-entities",
                content,
                ArtifactType.GENERATED,
            )
            print(f"        Found chapter view referencing {len(entity_files)} entities")
            return content

        # ── MIGRATION: per-chapter subdirectory (previous format) ──
        subdir = self._entities_dir() / chapter_id
        if subdir.is_dir() and list(subdir.glob("*.md")):
            print(f"        Migrating per-chapter subdir: {chapter_id}/")
            entity_files: list[tuple[str, Path]] = []
            entities_dir = self._entities_dir()
            for src in sorted(subdir.glob("*.md")):
                dest = entities_dir / src.name
                # First occurrence wins: a file already present in the flat
                # directory is kept and the subdir copy is left where it is.
                if not dest.exists():
                    src.rename(dest)
                entity_files.append((src.stem, dest))
            # Clean up empty subdir
            if not list(subdir.glob("*")):
                subdir.rmdir()
            self._write_chapter_entity_view(chapter_id, entity_files)
            # Re-read through the freshly written view so the stored content
            # has the same shape as in the PRIMARY branch.
            content = self._read_entities_from_view(chapter_id)[0]
            self.store_output_artifact(
                self.spaces["entities"],
                f"{chapter_id}-entities",
                content,
                ArtifactType.GENERATED,
            )
            return content

        # ── MIGRATION: legacy combined file (pre-split format) ──
        if view_file.exists():
            raw = view_file.read_text()
            if "--- ENTITY:" in raw:
                print(f"        Migrating legacy combined file: {view_file.name}")
                # Split first: the view written next has the same path as the
                # legacy file and replaces it.
                entity_files = self._split_entities(raw)
                self._write_chapter_entity_view(chapter_id, entity_files)
                self.store_output_artifact(
                    self.spaces["entities"],
                    f"{chapter_id}-entities",
                    raw,
                    ArtifactType.GENERATED,
                )
                return raw

        # ── GENERATE: call LLM, persist individual files first ──
        if self.llm_adapter and prompt:
            combined = self._call_llm(prompt, "entities")
            if combined:
                entity_files = self._split_entities(combined)
                self._write_chapter_entity_view(chapter_id, entity_files)
                self.store_output_artifact(
                    self.spaces["entities"],
                    f"{chapter_id}-entities",
                    combined,
                    ArtifactType.GENERATED,
                )
                return combined

        # Nothing on disk and no usable LLM result: the prompt file written
        # above is all this run produced.
        print(f"        Awaiting entity files in: output/entities/")
        return None

    def stage_map_to_vsm(self, chapter_id: str, entities_content: str) -> Optional[str]:
        """Stage 2: Map extracted entities to VSM concepts."""
        print(f"  [2/3] Mapping to VSM...")

        self.bind_macro_artifact(self.spaces["entities"], "entities", entities_content)

        prompt = self.resolve_and_compile(
            "map-to-vsm", ["entities", "vsm-reference", "guidelines"]
        )
        if not prompt:
            return None

        output_file = self.example_dir / "output" / "mappings" / f"{chapter_id}-mappings.md"
        # Write compiled prompt only when output does not yet exist (Task 5 fix)
        if not output_file.exists():
            prompt_file = self.example_dir / "output" / "mappings" / f"{chapter_id}-prompt.md"
            prompt_file.parent.mkdir(parents=True, exist_ok=True)
            prompt_file.write_text(prompt)
            print(f"        Prompt written to {prompt_file.relative_to(self.example_dir)}")

        if output_file.exists():
            content = output_file.read_text()
            self.store_output_artifact(
                self.spaces["mappings"],
                f"{chapter_id}-mappings",
                content,
                ArtifactType.GENERATED,
            )
            print(f"        Found existing output: {output_file.name}")
            return content

        if self.llm_adapter and prompt:
            content = self._execute_llm(prompt, output_file, "mappings")
            if content:
                self.store_output_artifact(
                    self.spaces["mappings"],
                    f"{chapter_id}-mappings",
                    content,
                    ArtifactType.GENERATED,
                )
                return content

        print(f"        Awaiting output at: {output_file.relative_to(self.example_dir)}")
        return None

    def stage_synthesize_analysis(
        self, chapter_id: str, chapter_content: str, entities_content: str, mappings_content: str
    ) -> Optional[str]:
        """Stage 3: Synthesize chapter-level VSM analysis."""
        print(f"  [3/3] Synthesizing analysis...")

        self.bind_macro_artifact(self.spaces["sources"], "chapter_text", chapter_content)
        self.bind_macro_artifact(self.spaces["entities"], "entities", entities_content)
        self.bind_macro_artifact(self.spaces["mappings"], "mappings", mappings_content)

        prompt = self.resolve_and_compile(
            "synthesize-analysis",
            ["sources", "entities", "mappings", "vsm-reference"],
        )
        if not prompt:
            return None

        output_file = self.example_dir / "output" / "analyses" / f"{chapter_id}-analysis.md"
        # Write compiled prompt only when output does not yet exist (Task 5 fix)
        if not output_file.exists():
            prompt_file = self.example_dir / "output" / "analyses" / f"{chapter_id}-prompt.md"
            prompt_file.parent.mkdir(parents=True, exist_ok=True)
            prompt_file.write_text(prompt)
            print(f"        Prompt written to {prompt_file.relative_to(self.example_dir)}")

        if output_file.exists():
            content = output_file.read_text()
            self.store_output_artifact(
                self.spaces["analyses"],
                f"{chapter_id}-analysis",
                content,
                ArtifactType.GENERATED,
            )
            print(f"        Found existing output: {output_file.name}")
            return content

        if self.llm_adapter and prompt:
            content = self._execute_llm(prompt, output_file, "analysis")
            if content:
                self.store_output_artifact(
                    self.spaces["analyses"],
                    f"{chapter_id}-analysis",
                    content,
                    ArtifactType.GENERATED,
                )
                return content

        print(f"        Awaiting output at: {output_file.relative_to(self.example_dir)}")
        return None

    # ── Metrics ──────────────────────────────────────────────────────

    def assess_metrics(self) -> Optional[str]:
        """Run the assess-metrics template across all completed analyses."""
        print("Assessing metrics...")

        analyses_dir = self.example_dir / "output" / "analyses"
        analysis_files = sorted(analyses_dir.glob("*-analysis.md"))

        if not analysis_files:
            print("  No completed analyses found. Process chapters first.")
            return None

        # Concatenate all analyses
        all_analyses = []
        for f in analysis_files:
            all_analyses.append(f"<!-- Source: {f.name} -->\n{f.read_text()}")
        combined = "\n\n---\n\n".join(all_analyses)

        self.bind_macro_artifact(self.spaces["analyses"], "all_analyses", combined)

        prompt = self.resolve_and_compile(
            "assess-metrics", ["analyses", "vsm-reference"]
        )
        if not prompt:
            return None

        output_file = self.example_dir / "output" / "metrics" / "metrics-report.md"
        # Write compiled prompt only when output does not yet exist (Task 5 fix)
        if not output_file.exists():
            prompt_file = self.example_dir / "output" / "metrics" / "metrics-prompt.md"
            prompt_file.parent.mkdir(parents=True, exist_ok=True)
            prompt_file.write_text(prompt)
            print(f"  Prompt written to {prompt_file.relative_to(self.example_dir)}")

        if output_file.exists():
            content = output_file.read_text()
            self.store_output_artifact(
                self.spaces["metrics"],
                "metrics-report",
                content,
                ArtifactType.GENERATED,
            )
            print(f"  Found existing output: {output_file.name}")
            return content

        if self.llm_adapter and prompt:
            content = self._execute_llm(prompt, output_file, "metrics")
            if content:
                self.store_output_artifact(
                    self.spaces["metrics"],
                    "metrics-report",
                    content,
                    ArtifactType.GENERATED,
                )
                return content

        print(f"  Awaiting output at: {output_file.relative_to(self.example_dir)}")
        return None

    # ── Entity Evaluation (Task 9) ────────────────────────────────────

    def _extract_quality_rubric(self) -> str:
        """Extract the Quality Metrics section from the entity schema file."""
        schema_file = self.example_dir / "schemas" / "economic-entity-schema-v1.0.md"
        text = schema_file.read_text()
        # Find the ## Quality Metrics section up to the next ## section
        import re as _re
        m = _re.search(
            r"^## Quality Metrics\n(.*?)^## ",
            text,
            flags=_re.MULTILINE | _re.DOTALL,
        )
        if m:
            return ("## Quality Metrics\n" + m.group(1)).strip()
        return text  # fallback: whole schema

    def _extract_source_chapter_from_entity(self, entity_text: str) -> str:
        """Extract the Source Chapter field from an entity markdown file."""
        import re as _re
        m = _re.search(
            r"^## Source Chapter\s*\n+(.+?)(?:\n\n|\n##|\Z)",
            entity_text,
            flags=_re.MULTILINE | _re.DOTALL,
        )
        if m:
            return m.group(1).strip()
        return "Unknown chapter"

    def evaluate_entities(self, chapter_id: Optional[str] = None) -> None:
        """Evaluate canonical entities using the evaluate-entity template.

        If *chapter_id* is given, evaluates only entities introduced by that
        chapter (determined from the chapter view file). Otherwise evaluates
        all canonical entities.

        Outputs are written to ``output/evaluations/<slug>-eval.md``.
        Existing evaluation files are skipped (idempotent).
        """
        evaluations_dir = self.example_dir / "output" / "evaluations"
        evaluations_dir.mkdir(parents=True, exist_ok=True)

        # Determine which entity files to evaluate
        if chapter_id:
            view_file = self._entities_dir() / f"{chapter_id}-entities.md"
            if not view_file.exists():
                print(f"  No chapter view found for {chapter_id}")
                return
            _, entity_files = self._read_entities_from_view(chapter_id)
            if not entity_files:
                print(f"  No entities found for chapter {chapter_id}")
                return
            print(f"Evaluating {len(entity_files)} entities from {chapter_id}...")
        else:
            slugs = self._list_existing_entity_names()
            entity_files = [(s, self._entities_dir() / f"{s}.md") for s in slugs]
            print(f"Evaluating {len(entity_files)} canonical entities...")

        if not entity_files:
            print("  No entities to evaluate.")
            return

        # Shared context loaded once
        quality_rubric = self._extract_quality_rubric()
        self.bind_macro_artifact(self.spaces["guidelines"], "quality_rubric", quality_rubric)

        done = 0
        skipped = 0
        failed = 0

        for slug, entity_path in entity_files:
            output_file = evaluations_dir / f"{slug}-eval.md"
            if output_file.exists():
                skipped += 1
                continue

            if not entity_path.exists():
                print(f"  MISSING: {entity_path.name}")
                failed += 1
                continue

            entity_text = entity_path.read_text()
            source_chapter = self._extract_source_chapter_from_entity(entity_text)

            # Bind per-entity macros
            self.bind_macro_artifact(self.spaces["entities"], "entity_content", entity_text)
            self.bind_macro_artifact(self.spaces["sources"], "source_chapter", source_chapter)

            prompt = self.resolve_and_compile(
                "evaluate-entity",
                ["entities", "sources", "vsm-reference", "guidelines"],
            )
            if not prompt:
                print(f"  FAILED to compile prompt for {slug}")
                failed += 1
                continue

            # Write prompt only when output does not yet exist (Task 5 fix)
            prompt_file = evaluations_dir / f"{slug}-eval-prompt.md"
            if not output_file.exists():
                prompt_file.write_text(prompt)

            if not self.llm_adapter:
                print(f"  {slug}: prompt written, awaiting manual evaluation")
                done += 1
                continue

            print(f"  Evaluating: {slug}...")
            content = self._execute_llm(prompt, output_file, f"eval:{slug}", max_tokens=1024)
            if content:
                done += 1
            else:
                failed += 1

        total = done + skipped + failed
        print(f"\nEvaluation complete: {done} done, {skipped} skipped (existing), {failed} failed — {total} total")

    # ── Chapter Processing ───────────────────────────────────────────

    def process_chapter(self, chapter_id: str, auto_commit: bool = True):
        """Drive one chapter through the three analysis stages.

        The chapter text is read from ``artifacts/sources/<chapter_id>.md``
        below the example directory and passed to
        ``load_or_create_artifact`` for the ``sources`` space. The stages
        then run in order: extract entities, map to VSM, synthesize
        analysis. A stage that returns ``None`` pauses the pipeline: a
        notice is printed and the later stages are not run.

        Once all three stages have produced a result, dependency edges are
        recorded and, when ``auto_commit`` is true, the outputs are
        committed to git.

        Args:
            chapter_id: Stem of the chapter's source file.
            auto_commit: Whether to commit outputs after a complete run.
        """
        chapter_path = self.example_dir / "artifacts" / "sources" / f"{chapter_id}.md"
        if not chapter_path.exists():
            print(f"ERROR: Source file not found: {chapter_path}")
            return

        print(f"Processing: {chapter_id}")
        print("=" * 60)

        chapter_text = chapter_path.read_text()

        # Make the raw chapter available as an artifact in the sources space.
        self.load_or_create_artifact(
            self.spaces["sources"], chapter_path, ArtifactType.CONTENT
        )

        # Stage 1: entity extraction
        extracted = self.stage_extract_entities(chapter_id, chapter_text)
        if extracted is None:
            print("\n  Pipeline paused. Generate entities output and re-run.")
            return

        # Stage 2: VSM mapping
        vsm_mappings = self.stage_map_to_vsm(chapter_id, extracted)
        if vsm_mappings is None:
            print("\n  Pipeline paused. Generate mappings output and re-run.")
            return

        # Stage 3: chapter-level synthesis
        synthesis = self.stage_synthesize_analysis(
            chapter_id, chapter_text, extracted, vsm_mappings
        )
        if synthesis is None:
            print("\n  Pipeline paused. Generate analysis output and re-run.")
            return

        print(f"\n  Chapter {chapter_id} fully processed.")

        # Bookkeeping happens only after every stage has produced output.
        self._record_chapter_dependencies(chapter_id)
        if auto_commit:
            self._git_commit_chapter(chapter_id)

    def _record_chapter_dependencies(self, chapter_id: str):
        """Persist the dependency edges that link a chapter to its outputs.

        A run manifest with id ``run-<chapter_id>`` is created. If the
        chapter's source artifact is found, a "requires" edge is added from
        it to the run; each entities / mappings / analysis artifact that is
        found gets a "generates" edge from the run. Any error raised while
        persisting is reported as a warning rather than propagated.
        """
        run_id = f"run-{chapter_id}"
        manifest = RunManifest.create(
            run_id=run_id,
            template_id="extract-entities",
            template_name="extract-entities",
            template_digest="",
        )

        # Source → Run
        source_artifact = self.artifact_repo.get_by_name(
            self.spaces["sources"], chapter_id
        )
        if source_artifact:
            manifest.add_dependency_edge(source_artifact.id, run_id, "requires")

        # Run → Outputs; each pair is (space key, artifact-name suffix).
        output_kinds = (
            ("entities", "entities"),
            ("mappings", "mappings"),
            ("analyses", "analysis"),
        )
        for space_key, name_suffix in output_kinds:
            target_space = self.spaces[space_key]
            produced = self.artifact_repo.get_by_name(
                target_space, f"{chapter_id}-{name_suffix}"
            )
            if produced:
                manifest.add_dependency_edge(run_id, produced.id, "generates")

        try:
            persisted = self.graph_builder.persist_edges(manifest)
            print(f"  Recorded {len(persisted)} dependency edges.")
        except Exception as e:
            print(f"  Warning: Could not record dependencies: {e}")

    def _git_commit_chapter(self, chapter_id: str):
        """Commit chapter outputs to git."""
        output_dir = self.example_dir / "output"
        try:
            subprocess.run(
                ["git", "add", str(output_dir)],
                cwd=str(self.example_dir),
                check=True,
                capture_output=True,
            )
            subprocess.run(
                ["git", "commit", "-m", f"infospace: process {chapter_id}\n\n"
                 f"Extract entities, map to VSM, and synthesize analysis\n"
                 f"for {chapter_id}."],
                cwd=str(project_root),
                check=True,
                capture_output=True,
            )
            print(f"  Git commit: infospace: process {chapter_id}")
        except subprocess.CalledProcessError as e:
            print(f"  Warning: Git commit skipped ({e})")

    # ── Listing ──────────────────────────────────────────────────────

    def list_chapters(self):
        """Print a status table covering every source chapter.

        One row is printed per ``*.md`` file in ``artifacts/sources``. The
        entities column shows the number of file includes found in the
        chapter's entity view; the mappings and analysis columns show
        whether the corresponding output file exists. Totals for the
        entity names reported by ``_list_existing_entity_names`` and for
        ``*.md`` files in the archive directory follow the table when they
        are non-zero.
        """
        base = self.example_dir
        chapter_ids = sorted(
            path.stem for path in (base / "artifacts" / "sources").glob("*.md")
        )

        print(f"Available chapters ({len(chapter_ids)}):\n")
        print(f"  {'Chapter':<30} {'Entities':<12} {'Mappings':<12} {'Analysis':<12}")
        print("  " + " ".join("-" * width for width in (30, 12, 12, 12)))

        for chapter in chapter_ids:
            include_count = 0
            view_path = self._entities_dir() / f"{chapter}-entities.md"
            if view_path.exists():
                view_text = view_path.read_text()
                if "{{ include" in view_text:
                    # Imported lazily: only needed once a view with includes exists.
                    from markitect.packaging.transclusion.directives import DirectiveParser
                    include_count = len(DirectiveParser.extract_file_includes(view_text))

            mappings_path = base / "output" / "mappings" / f"{chapter}-mappings.md"
            analysis_path = base / "output" / "analyses" / f"{chapter}-analysis.md"

            if include_count:
                entities = f"done ({include_count})"
            else:
                entities = "-"
            mappings = "done" if mappings_path.exists() else "-"
            analysis = "done" if analysis_path.exists() else "-"
            print(f"  {chapter:<30} {entities:<12} {mappings:<12} {analysis:<12}")

        canonical_total = len(self._list_existing_entity_names())
        if canonical_total:
            print(f"\n  Canonical entity set: {canonical_total} unique entities")

        archive_dir = self._archive_dir()
        if archive_dir.exists():
            archived_total = sum(1 for _ in archive_dir.glob("*.md"))
            if archived_total:
                print(f"  Archived entities: {archived_total}")

    # ── Statistics ───────────────────────────────────────────────────

    def show_stats(self):
        """Print summary figures for the dependency graph.

        The figures come from ``self.query_service.get_dependency_stats()``.
        If fetching or printing them raises, a "(No data yet: ...)" line is
        printed instead of propagating the error; lines already printed
        before the failure remain in the output.
        """
        print("\nDependency Statistics:")
        # (label shown to the user, key looked up in the stats mapping)
        rows = (
            ("Nodes", "total_nodes"),
            ("Edges", "total_edges"),
            ("Root artifacts", "root_count"),
            ("Leaf artifacts", "leaf_count"),
            ("Has cycles", "has_cycles"),
        )
        try:
            stats = self.query_service.get_dependency_stats()
            for label, key in rows:
                print(f"  {label}: {stats[key]}")
        except Exception as e:
            print(f"  (No data yet: {e})")


# ── Infospace tooling integration ─────────────────────────────────


def _load_infospace(example_dir: Path):
    """Read ``infospace.yaml`` and parse the entities it points to.

    Prints an error and exits with status 1 when ``infospace.yaml`` is not
    a file in ``example_dir``. The entity list is empty when the configured
    entities directory does not exist.

    Returns:
        A ``(config, config_path, entities)`` tuple.
    """
    from markitect.infospace.config import load_infospace_config
    from markitect.infospace.entity_parser import parse_entity_directory

    config_path = example_dir / "infospace.yaml"
    if not config_path.is_file():
        print("Error: No infospace.yaml found. Create one first.")
        sys.exit(1)

    config = load_infospace_config(config_path)

    entities_dir = example_dir / config.entities_dir
    if entities_dir.is_dir():
        entities = parse_entity_directory(entities_dir)
    else:
        entities = []

    return config, config_path, entities


def _run_infospace_status(example_dir: Path):
    """Print an overview of the infospace and chapter-processing progress.

    The overview lists the topic name, domain, entity count and, when
    present, the domains and discipline names. Progress is reported as the
    number of ``*-analysis.md`` files in ``output/analyses`` over the
    number of ``*.md`` files in ``artifacts/sources``.
    """
    from markitect.infospace.state import build_state

    config, _config_path, entities = _load_infospace(example_dir)
    state = build_state(config, entities=entities)

    print(f"Infospace: {state.topic_name}")
    print(f"Domain:    {config.topic.domain}")
    print(f"Entities:  {state.entity_count}")
    if state.domains:
        print(f"Domains:   {', '.join(state.domains)}")
    if config.disciplines:
        discipline_names = ", ".join(d.name for d in config.disciplines)
        print(f"Disciplines: {discipline_names}")

    # Show processing progress
    source_glob = (example_dir / "artifacts" / "sources").glob("*.md")
    total_chapters = sum(1 for _ in source_glob)
    analysis_glob = (example_dir / "output" / "analyses").glob("*-analysis.md")
    processed = sum(1 for _ in analysis_glob)
    print(f"Chapters:  {processed}/{total_chapters} processed")


def _run_infospace_check(example_dir: Path):
    """Run the collection checks, print the report, and record a snapshot.

    Returns early with a message when there are no entities. Otherwise
    every concern in the report is printed with its fields. When the
    report yields metrics, they are printed in sorted key order and a
    snapshot is recorded through ``record_check_results``; without
    metrics no snapshot is recorded.
    """
    from markitect.infospace.checks import run_all_checks
    from markitect.infospace.history import record_check_results

    config, _config_path, entities = _load_infospace(example_dir)

    if not entities:
        print("No entities to check.")
        return

    print(f"Running collection checks on {len(entities)} entities...\n")
    report = run_all_checks(entities=entities)

    for concern_name, concern_data in report.to_dict().items():
        # Use the report's own label when it provides one.
        label = concern_data.get("concern", concern_name.upper())
        print(f"  {label} — {concern_name}")
        for field, value in concern_data.items():
            if field != "concern":
                print(f"    {field}: {value}")
        print()

    metrics = report.metrics()
    if not metrics:
        return

    print("Metrics summary:")
    for metric_name in sorted(metrics):
        print(f"  {metric_name}: {metrics[metric_name]:.4f}")
    snapshot = record_check_results(
        report, config, example_dir, entity_count=len(entities)
    )
    print(f"\nRecorded snapshot {snapshot.snapshot_id}")


def _run_infospace_viability(example_dir: Path):
    """Print the viability dashboard for the infospace.

    Without configured thresholds only a notice is printed. Without
    recorded metrics the configured thresholds are listed together with a
    hint to run ``--infospace-check``. Otherwise each viability result is
    printed as a table row (metric, value, bounds, PASS/FAIL), followed by
    an overall YES/NO verdict with the pass count.
    """
    from markitect.infospace.history import read_metrics_file
    from markitect.infospace.state import build_state

    def _format_bounds(threshold) -> str:
        # Render whichever of min/max is set, e.g. "min=0.5, max=0.9".
        parts = []
        if threshold.min is not None:
            parts.append(f"min={threshold.min}")
        if threshold.max is not None:
            parts.append(f"max={threshold.max}")
        return ", ".join(parts)

    config, _config_path, entities = _load_infospace(example_dir)

    if not config.viability:
        print("No viability thresholds configured.")
        return

    metrics = read_metrics_file(example_dir / config.metrics_dir / "metrics.yaml")
    if not metrics:
        print("No metrics available. Run --infospace-check first.")
        print("\nConfigured thresholds:")
        for name, threshold in config.viability.items():
            print(f"  {name}: {_format_bounds(threshold)}")
        return

    state = build_state(config, entities=entities, metrics=metrics)

    print(f"{'Metric':<30} {'Value':>8} {'Threshold':>15} {'Status':>8}")
    print("-" * 63)
    for result in state.viability_results:
        bounds_text = _format_bounds(result.threshold)
        status_str = "PASS" if result.passed else "FAIL"
        print(f"{result.metric:<30} {result.value:>8.4f} {bounds_text:>15} {status_str:>8}")

    print()
    verdict = "YES" if state.is_viable else "NO"
    print(
        f"Viable: {verdict} "
        f"({state.viability_pass_count}/{state.viability_total_count} thresholds met)"
    )


def main():
    """Command-line entry point for the chapter processing pipeline.

    Parses the arguments, optionally creates an LLM adapter (when
    ``--provider`` is given), sets up a ``ChapterProcessor`` for the
    directory containing this script, and dispatches to the selected
    command. Exactly one command option is required.

    Dependency statistics are printed after the archive, list, stats,
    metrics and chapter-processing commands. The infospace and evaluate
    commands, and a ``--book`` request that matches no chapters, return
    without printing them.

    Exits through ``parser.error`` (status 2) when ``--archive-entity`` is
    given without ``--reason``.
    """
    parser = argparse.ArgumentParser(
        description="Process Wealth of Nations chapters through VSM analysis pipeline"
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--chapter", type=str, help="Process a single chapter (e.g., book-1-chapter-01)")
    group.add_argument("--book", type=int, help="Process all chapters in a book (1-5)")
    group.add_argument("--all", action="store_true", help="Process all chapters")
    group.add_argument("--metrics", action="store_true", help="Assess metrics only")
    group.add_argument("--list", action="store_true", help="List available chapters")
    group.add_argument("--stats", action="store_true", help="Show dependency statistics")
    group.add_argument("--archive-entity", type=str, metavar="SLUG",
                       help="Archive an entity (move to archive/ with reason)")
    group.add_argument("--infospace-status", action="store_true",
                       help="Show infospace status via infospace tooling")
    group.add_argument("--infospace-check", action="store_true",
                       help="Run collection-level quality checks (C1-C5)")
    group.add_argument("--infospace-viability", action="store_true",
                       help="Show viability dashboard")
    group.add_argument("--evaluate", action="store_true",
                       help="Evaluate entity quality using the evaluate-entity template")

    parser.add_argument("--reason", type=str, default=None,
                        help="Reason for archiving (used with --archive-entity)")
    parser.add_argument("--eval-chapter", type=str, default=None, metavar="CHAPTER_ID",
                        help="Limit --evaluate to entities from a specific chapter")
    parser.add_argument("--no-commit", action="store_true", help="Skip git commits")
    parser.add_argument(
        "--provider",
        type=str,
        choices=["openrouter", "claude-code", "gemini", "openai"],
        default=None,
        help="LLM provider for auto-generating outputs (omit for manual mode)",
    )
    parser.add_argument("--model", type=str, default=None, help="Model name to pass to the LLM provider")

    args = parser.parse_args()

    # Reject an incomplete archive request immediately, before creating the
    # LLM adapter or setting up the processor.
    if args.archive_entity and not args.reason:
        parser.error("--archive-entity requires --reason")

    # Build optional LLM adapter
    _PROVIDER_DEFAULTS = {
        "openrouter": "arcee-ai/trinity-large-preview:free",
    }
    llm_adapter = None
    if args.provider:
        from markitect.llm import create_adapter
        model = args.model or _PROVIDER_DEFAULTS.get(args.provider)
        llm_adapter = create_adapter(args.provider, model=model)
        print(f"LLM: {args.provider} ({model or 'default'})")

    example_dir = Path(__file__).parent
    processor = ChapterProcessor(example_dir, llm_adapter=llm_adapter)
    processor.setup()

    auto_commit = not args.no_commit
    sources_dir = example_dir / "artifacts" / "sources"

    if args.archive_entity:
        processor.archive_entity(args.archive_entity, args.reason)
    elif args.list:
        processor.list_chapters()
    elif args.stats:
        processor.show_stats()
    elif args.metrics:
        processor.assess_metrics()
    elif args.chapter:
        processor.process_chapter(args.chapter, auto_commit=auto_commit)
    elif args.book is not None:
        # Compare against None: "--book 0" is falsy but still a request
        # that deserves the "No chapters found" message below.
        chapters = sorted(
            f.stem for f in sources_dir.glob(f"book-{args.book}-chapter-*.md")
        )
        if not chapters:
            print(f"No chapters found for Book {args.book}")
            return
        print(f"Processing {len(chapters)} chapters from Book {args.book}\n")
        for ch in chapters:
            processor.process_chapter(ch, auto_commit=auto_commit)
            print()
    elif args.all:
        chapters = sorted(f.stem for f in sources_dir.glob("*.md"))
        print(f"Processing all {len(chapters)} chapters\n")
        for ch in chapters:
            processor.process_chapter(ch, auto_commit=auto_commit)
            print()
    elif args.infospace_status:
        _run_infospace_status(example_dir)
        return
    elif args.infospace_check:
        _run_infospace_check(example_dir)
        return
    elif args.infospace_viability:
        _run_infospace_viability(example_dir)
        return
    elif args.evaluate:
        processor.evaluate_entities(chapter_id=args.eval_chapter)
        return

    processor.show_stats()


# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()