# markitect-main/examples/infospace-with-history/process_chapters.py

#!/usr/bin/env python3
"""
Infospace with History — Chapter Processing Pipeline

Processes chapters from Adam Smith's "The Wealth of Nations" through a
three-stage analysis pipeline, mapping economic content to Stafford Beer's
Viable System Model.

Pipeline per chapter:
  1. extract-entities  — Extract economic entities from chapter text
  2. map-to-vsm       — Map entities to VSM concepts
  3. synthesize-analysis — Produce chapter-level VSM analysis

After all chapters:
  4. assess-metrics    — Evaluate completeness and consistency

Usage:
  # Process a single chapter
  python process_chapters.py --chapter book-1-chapter-01

  # Process all chapters in Book I
  python process_chapters.py --book 1

  # Process all chapters
  python process_chapters.py --all

  # Assess metrics only (after chapters have been processed)
  python process_chapters.py --metrics

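  # List available chapters
  python process_chapters.py --list

  # Show dependency graph statistics
  python process_chapters.py --stats

  # Auto-generate outputs with an LLM provider and skip git commits
  python process_chapters.py --chapter book-1-chapter-01 --provider openrouter --no-commit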
"""

import argparse
import subprocess
import sys
from pathlib import Path
from typing import Optional

# Put the directory three levels above this file at the front of sys.path so
# the `markitect` imports below resolve when the script is run in place.
# (Presumably that directory is the repository root — confirm against layout.)
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from markitect.prompts.models import Artifact, ArtifactType
from markitect.prompts.repositories.sqlite import SQLiteArtifactRepository
from markitect.prompts.dependencies.repository import SQLiteDependencyRepository
from markitect.prompts.services.artifact_service import ArtifactService
from markitect.prompts.templates.models import PromptTemplate, ContentMacro, MacroKind
from markitect.prompts.resolver.resolver import PromptResolver
from markitect.prompts.resolver.compiler import ContextCompiler
from markitect.prompts.resolver.strategy import ResolutionConfig, MultiSpaceResolutionStrategy
from markitect.prompts.execution.manifest import RunManifest
from markitect.prompts.dependencies.graph import GraphBuilder
from markitect.prompts.traceability.service import TraceabilityService
from markitect.prompts.queries.operations import PromptQueryService


class ChapterProcessor:
    """Processes Wealth of Nations chapters through the VSM analysis pipeline."""

    def __init__(
        self,
        example_dir: Path,
        db_path: Optional[str] = None,
        llm_adapter=None,
    ):
        """Wire up the repositories and services that back the pipeline.

        Args:
            example_dir: Directory holding the example's ``templates``,
                ``artifacts`` and ``output`` folders.
            db_path: Path of the SQLite database. When omitted (or empty),
                ``infospace.db`` inside *example_dir* is used.
            llm_adapter: Optional adapter used to auto-generate stage outputs;
                stored as given.
        """
        self.example_dir = example_dir
        self.llm_adapter = llm_adapter
        self.db_path = db_path if db_path else str(example_dir / "infospace.db")

        # Both repositories are opened on the same database path.
        artifacts = SQLiteArtifactRepository(self.db_path)
        dependencies = SQLiteDependencyRepository(self.db_path)
        self.artifact_repo = artifacts
        self.dep_repo = dependencies

        # Services layered on top of the two repositories.
        self.artifact_service = ArtifactService(artifacts)
        self.graph_builder = GraphBuilder(dependencies)
        self.trace_service = TraceabilityService(
            artifacts, dependencies, db_path=self.db_path
        )
        self.query_service = PromptQueryService(
            artifacts, dependencies, db_path=self.db_path
        )

        # Short key -> information-space id; every id is "infospace-<key>".
        space_keys = (
            "templates",
            "sources",
            "guidelines",
            "vsm-reference",
            "entities",
            "mappings",
            "analyses",
            "metrics",
        )
        self.spaces = {key: f"infospace-{key}" for key in space_keys}

        # Artifact id -> text content, kept in memory by this processor.
        self.artifact_content: dict[str, str] = {}

    # ── Artifact Management ──────────────────────────────────────────

    def load_or_create_artifact(
        self,
        space: str,
        filepath: Path,
        artifact_type: ArtifactType,
        name: Optional[str] = None,
    ) -> tuple[Artifact, str]:
        """Read *filepath* and make sure a matching artifact exists in *space*.

        The file's text is always cached under the artifact's id. A new
        artifact is created (and a short line printed) only when the
        repository has no artifact with that name in *space*; otherwise the
        existing artifact is returned as-is.

        Args:
            space: Id of the information space to look in / create in.
            filepath: File whose text becomes the artifact content.
            artifact_type: Type assigned when a new artifact is created.
            name: Artifact name; defaults to the file's stem.

        Returns:
            ``(artifact, content)`` where *content* is the text just read.
        """
        artifact_name = filepath.stem if name is None else name
        text = filepath.read_text()

        artifact = self.artifact_repo.get_by_name(space, artifact_name)
        is_new = not artifact
        if is_new:
            artifact = self.artifact_repo.create(
                Artifact.create(
                    space_id=space,
                    name=artifact_name,
                    content=text,
                    artifact_type=artifact_type,
                )
            )

        self.artifact_content[artifact.id] = text
        if is_new:
            print(f"  + {artifact_name} ({artifact.content_digest[:8]})")
        return artifact, text

    def store_output_artifact(
        self, space: str, name: str, content: str, artifact_type: ArtifactType
    ) -> Artifact:
        """Store a generated output artifact, replacing any same-named one.

        An existing artifact called *name* in *space* is deleted from the
        repository and its cached content is dropped before the new artifact
        is created.

        Args:
            space: Id of the information space to store into.
            name: Artifact name within *space*.
            content: Text content of the artifact.
            artifact_type: Type assigned to the new artifact.

        Returns:
            The artifact as returned by the repository's ``create``.
        """
        existing = self.artifact_repo.get_by_name(space, name)
        if existing:
            self.artifact_repo.delete(existing.id)
            # The deleted artifact can no longer be resolved, so keeping its
            # text would only grow the cache with every re-run.
            self.artifact_content.pop(existing.id, None)

        artifact = Artifact.create(
            space_id=space, name=name, content=content, artifact_type=artifact_type
        )
        artifact = self.artifact_repo.create(artifact)
        self.artifact_content[artifact.id] = content
        return artifact

    def bind_macro_artifact(self, space: str, macro_name: str, content: str) -> Artifact:
        """Bind *content* to *macro_name* in *space* for template resolution.

        Any artifact already bound to *macro_name* in *space* is deleted from
        the repository and evicted from the content cache, then a fresh
        ``ArtifactType.CONTENT`` artifact is created with the new content.

        Args:
            space: Id of the information space holding the binding.
            macro_name: Name the macro target resolves to.
            content: Text to substitute for the macro.

        Returns:
            The artifact as returned by the repository's ``create``.
        """
        existing = self.artifact_repo.get_by_name(space, macro_name)
        if existing:
            self.artifact_repo.delete(existing.id)
            # Macros such as the chapter text are re-bound for every chapter;
            # without eviction each previous text would stay cached forever.
            self.artifact_content.pop(existing.id, None)

        artifact = Artifact.create(
            space_id=space,
            name=macro_name,
            content=content,
            artifact_type=ArtifactType.CONTENT,
        )
        artifact = self.artifact_repo.create(artifact)
        self.artifact_content[artifact.id] = content
        return artifact

    # ── Setup ────────────────────────────────────────────────────────

    def setup(self):
        """Load all static artifacts (templates, guidelines, VSM reference).

        Files are visited in sorted order so that repeated runs register and
        cache artifacts deterministically, matching how the rest of this file
        iterates glob results.
        """
        print("Loading artifacts...")

        # Templates: artifact name is the file stem.
        for tmpl_file in sorted((self.example_dir / "templates").glob("*.md")):
            self.load_or_create_artifact(
                self.spaces["templates"], tmpl_file, ArtifactType.TEMPLATE
            )

        # VSM reference: every file is registered under the single name
        # "vsm_framework". With more than one file, the first (in sorted
        # order) creates the artifact and the last one's text ends up cached.
        reference_dir = self.example_dir / "artifacts" / "vsm-reference"
        for ref_file in sorted(reference_dir.glob("*.md")):
            self.load_or_create_artifact(
                self.spaces["vsm-reference"], ref_file, ArtifactType.CONTENT,
                name="vsm_framework",
            )

        # Guidelines: known files map to their macro names; any other file
        # falls back to its stem.
        guideline_name_map = {
            "extraction-rules.md": "extraction_rules",
            "mapping-rules.md": "mapping_rules",
        }
        guidelines_dir = self.example_dir / "artifacts" / "guidelines"
        for guide_file in sorted(guidelines_dir.glob("*.md")):
            name = guideline_name_map.get(guide_file.name, guide_file.stem)
            self.load_or_create_artifact(
                self.spaces["guidelines"], guide_file, ArtifactType.CONTENT, name=name
            )

        print("  Done.\n")

    # ── Helpers ───────────────────────────────────────────────────────

    @staticmethod
    def _macro(target: str, kind: MacroKind = MacroKind.REQUIRED) -> ContentMacro:
        """Build a ContentMacro whose raw text is the ``@{target}`` placeholder.

        Args:
            target: Name the macro refers to.
            kind: Macro kind; defaults to ``MacroKind.REQUIRED``.
        """
        placeholder = "@{{{}}}".format(target)
        return ContentMacro(kind=kind, target=target, raw_text=placeholder)

    # ── Template Resolution ──────────────────────────────────────────

    def resolve_and_compile(
        self, template_name: str, macros: list[ContentMacro], extra_spaces: list[str]
    ) -> Optional[str]:
        """Resolve macros and compile a template into a final prompt string.

        The resolver is used to validate that every macro can be resolved;
        the actual text substitution is then done here from the local content
        cache (per the original note, the artifact repository does not hand
        back content).

        Substitution is a single pass over the template text, so placeholder
        syntax that happens to occur inside substituted content (for example
        ``@{entities}`` appearing in a chapter or in LLM output) is left
        verbatim instead of being expanded a second time.

        Args:
            template_name: Name of the template artifact in the templates
                space; also the stem of the fallback file under ``templates``.
            macros: Macros to attach to the template before resolution.
            extra_spaces: Keys of ``self.spaces`` to include during resolution.

        Returns:
            The compiled prompt, or ``None`` when the template is missing or
            resolution fails (an error line is printed in both cases).
        """
        import re  # local import: only the substitution pass below needs it

        template_artifact = self.artifact_repo.get_by_name(
            self.spaces["templates"], template_name
        )
        if not template_artifact:
            print(f"  ERROR: Template '{template_name}' not found")
            return None

        template = PromptTemplate.from_artifact(template_artifact)
        template.macros = macros
        template.analyzed = True

        config = ResolutionConfig(
            space_id=self.spaces["templates"],
            included_spaces=[self.spaces[s] for s in extra_spaces],
        )

        strategy = MultiSpaceResolutionStrategy()
        resolver = PromptResolver(self.artifact_service, strategy)
        result = resolver.resolve_template(template, config)

        if not result.success:
            print(f"  ERROR: Resolution failed: {result.context.errors}")
            return None

        # Prefer the cached template text; fall back to the file on disk.
        template_content = self.artifact_content.get(template_artifact.id)
        if not template_content:
            template_content = (
                self.example_dir / "templates" / f"{template_name}.md"
            ).read_text()

        # Placeholder -> replacement text. The first resolution of a given
        # target wins; content missing from the cache substitutes as "".
        replacements: dict[str, str] = {}
        for resolved in result.context.resolved_macros:
            if resolved.resolved and resolved.artifact:
                placeholder = f"@{{{resolved.macro.target}}}"
                if placeholder not in replacements:
                    replacements[placeholder] = self.artifact_content.get(
                        resolved.artifact.id, ""
                    )

        if not replacements:
            return template_content

        # Longest placeholder first so no alternative can shadow a longer one.
        pattern = re.compile(
            "|".join(
                re.escape(p) for p in sorted(replacements, key=len, reverse=True)
            )
        )
        # A function replacement inserts the text literally (no backslash or
        # group-reference processing) and never re-scans inserted content.
        return pattern.sub(lambda m: replacements[m.group(0)], template_content)

    # ── LLM Execution Helper ────────────────────────────────────────

    def _execute_llm(self, prompt: str, output_file: Path, stage_label: str) -> Optional[str]:
        """Execute *prompt* via the configured LLM adapter and write the result.

        Args:
            prompt: Fully compiled prompt text.
            output_file: Destination for the generated content; parent
                directories are created as needed.
            stage_label: Short label used only in progress messages.

        Returns:
            The generated content, or ``None`` when the adapter raises or
            returns empty/whitespace-only content (nothing is written then).
        """
        import time as _time
        from markitect.prompts.execution.models import RunConfig

        print(f"        Calling LLM ({stage_label})...")
        # Monotonic clock: elapsed time is unaffected by wall-clock changes.
        t0 = _time.perf_counter()
        try:
            response = self.llm_adapter.execute_prompt(prompt, RunConfig())
        except Exception as exc:
            # Best-effort boundary: any adapter failure pauses the stage
            # instead of aborting the whole run.
            print(f"        LLM error ({_time.perf_counter() - t0:.1f}s): {exc}")
            return None

        elapsed = _time.perf_counter() - t0
        # Token accounting is informational only; a response with no usage
        # data must not discard an otherwise successful completion.
        usage = getattr(response, "usage", None) or {}
        print(
            f"        LLM done in {elapsed:.1f}s — "
            f"prompt {usage.get('prompt_tokens', '?')} tok, "
            f"completion {usage.get('completion_tokens', '?')} tok, "
            f"total {usage.get('total_tokens', '?')} tok"
        )

        content = response.content
        if not content or not content.strip():
            print("        LLM returned empty content")
            return None

        output_file.parent.mkdir(parents=True, exist_ok=True)
        output_file.write_text(content)
        print(f"        LLM output written to {output_file.name}")
        return content

    # ── Pipeline Stages ──────────────────────────────────────────────

    def stage_extract_entities(self, chapter_id: str, chapter_content: str) -> Optional[str]:
        """Stage 1: Extract economic entities from a chapter.

        Compiles the ``extract-entities`` prompt and writes it to
        ``output/entities/<chapter_id>-prompt.md``. The stage output is then
        taken from ``<chapter_id>-entities.md`` if that file already exists,
        otherwise generated through the LLM adapter when one is configured.

        Args:
            chapter_id: Chapter identifier used in file and artifact names.
            chapter_content: Full chapter text bound to ``chapter_text``.

        Returns:
            The entities text, or ``None`` when the prompt could not be
            compiled or no output is available yet.
        """
        print("  [1/3] Extracting entities...")

        # Bind the chapter content to the macro name
        self.bind_macro_artifact(self.spaces["sources"], "chapter_text", chapter_content)

        macros = [
            self._macro("chapter_text"),
            self._macro("extraction_rules"),
            self._macro("vsm_framework"),
        ]

        prompt = self.resolve_and_compile(
            "extract-entities", macros, ["sources", "guidelines", "vsm-reference"]
        )
        if not prompt:
            return None

        # The output folder may not exist on a fresh checkout; create it
        # before writing the prompt so the first run does not fail.
        stage_dir = self.example_dir / "output" / "entities"
        stage_dir.mkdir(parents=True, exist_ok=True)

        # Write compiled prompt for inspection / LLM execution
        prompt_file = stage_dir / f"{chapter_id}-prompt.md"
        prompt_file.write_text(prompt)
        print(f"        Prompt written to {prompt_file.relative_to(self.example_dir)}")

        artifact_name = f"{chapter_id}-entities"
        output_file = stage_dir / f"{artifact_name}.md"

        # Check for existing output (manual or LLM-generated)
        if output_file.exists():
            content = output_file.read_text()
            self.store_output_artifact(
                self.spaces["entities"], artifact_name, content, ArtifactType.GENERATED
            )
            print(f"        Found existing output: {output_file.name}")
            return content

        # Auto-generate via LLM if adapter is available
        if self.llm_adapter:
            content = self._execute_llm(prompt, output_file, "entities")
            if content:
                self.store_output_artifact(
                    self.spaces["entities"], artifact_name, content, ArtifactType.GENERATED
                )
                return content

        print(f"        Awaiting output at: {output_file.relative_to(self.example_dir)}")
        return None

    def stage_map_to_vsm(self, chapter_id: str, entities_content: str) -> Optional[str]:
        """Stage 2: Map extracted entities to VSM concepts.

        Compiles the ``map-to-vsm`` prompt and writes it to
        ``output/mappings/<chapter_id>-prompt.md``. The stage output is then
        taken from ``<chapter_id>-mappings.md`` if that file already exists,
        otherwise generated through the LLM adapter when one is configured.

        Args:
            chapter_id: Chapter identifier used in file and artifact names.
            entities_content: Stage 1 output bound to the ``entities`` macro.

        Returns:
            The mappings text, or ``None`` when the prompt could not be
            compiled or no output is available yet.
        """
        print("  [2/3] Mapping to VSM...")

        self.bind_macro_artifact(self.spaces["entities"], "entities", entities_content)

        macros = [
            self._macro("entities"),
            self._macro("vsm_framework"),
            self._macro("mapping_rules"),
        ]

        prompt = self.resolve_and_compile(
            "map-to-vsm", macros, ["entities", "vsm-reference", "guidelines"]
        )
        if not prompt:
            return None

        # The output folder may not exist on a fresh checkout; create it
        # before writing the prompt so the first run does not fail.
        stage_dir = self.example_dir / "output" / "mappings"
        stage_dir.mkdir(parents=True, exist_ok=True)

        prompt_file = stage_dir / f"{chapter_id}-prompt.md"
        prompt_file.write_text(prompt)
        print(f"        Prompt written to {prompt_file.relative_to(self.example_dir)}")

        artifact_name = f"{chapter_id}-mappings"
        output_file = stage_dir / f"{artifact_name}.md"

        # Reuse an output that is already on disk (manual or LLM-generated).
        if output_file.exists():
            content = output_file.read_text()
            self.store_output_artifact(
                self.spaces["mappings"], artifact_name, content, ArtifactType.GENERATED
            )
            print(f"        Found existing output: {output_file.name}")
            return content

        # Otherwise generate it when an adapter is configured.
        if self.llm_adapter:
            content = self._execute_llm(prompt, output_file, "mappings")
            if content:
                self.store_output_artifact(
                    self.spaces["mappings"], artifact_name, content, ArtifactType.GENERATED
                )
                return content

        print(f"        Awaiting output at: {output_file.relative_to(self.example_dir)}")
        return None

    def stage_synthesize_analysis(
        self, chapter_id: str, chapter_content: str, entities_content: str, mappings_content: str
    ) -> Optional[str]:
        """Stage 3: Synthesize chapter-level VSM analysis.

        Compiles the ``synthesize-analysis`` prompt and writes it to
        ``output/analyses/<chapter_id>-prompt.md``. The stage output is then
        taken from ``<chapter_id>-analysis.md`` if that file already exists,
        otherwise generated through the LLM adapter when one is configured.

        Args:
            chapter_id: Chapter identifier used in file and artifact names.
            chapter_content: Full chapter text bound to ``chapter_text``.
            entities_content: Stage 1 output bound to ``entities``.
            mappings_content: Stage 2 output bound to ``mappings``.

        Returns:
            The analysis text, or ``None`` when the prompt could not be
            compiled or no output is available yet.
        """
        print("  [3/3] Synthesizing analysis...")

        self.bind_macro_artifact(self.spaces["sources"], "chapter_text", chapter_content)
        self.bind_macro_artifact(self.spaces["entities"], "entities", entities_content)
        self.bind_macro_artifact(self.spaces["mappings"], "mappings", mappings_content)

        macros = [
            self._macro("chapter_text"),
            self._macro("entities"),
            self._macro("mappings"),
            self._macro("vsm_framework"),
        ]

        prompt = self.resolve_and_compile(
            "synthesize-analysis",
            macros,
            ["sources", "entities", "mappings", "vsm-reference"],
        )
        if not prompt:
            return None

        # The output folder may not exist on a fresh checkout; create it
        # before writing the prompt so the first run does not fail.
        stage_dir = self.example_dir / "output" / "analyses"
        stage_dir.mkdir(parents=True, exist_ok=True)

        prompt_file = stage_dir / f"{chapter_id}-prompt.md"
        prompt_file.write_text(prompt)
        print(f"        Prompt written to {prompt_file.relative_to(self.example_dir)}")

        artifact_name = f"{chapter_id}-analysis"
        output_file = stage_dir / f"{artifact_name}.md"

        # Reuse an output that is already on disk (manual or LLM-generated).
        if output_file.exists():
            content = output_file.read_text()
            self.store_output_artifact(
                self.spaces["analyses"], artifact_name, content, ArtifactType.GENERATED
            )
            print(f"        Found existing output: {output_file.name}")
            return content

        # Otherwise generate it when an adapter is configured.
        if self.llm_adapter:
            content = self._execute_llm(prompt, output_file, "analysis")
            if content:
                self.store_output_artifact(
                    self.spaces["analyses"], artifact_name, content, ArtifactType.GENERATED
                )
                return content

        print(f"        Awaiting output at: {output_file.relative_to(self.example_dir)}")
        return None

    # ── Metrics ──────────────────────────────────────────────────────

    def assess_metrics(self) -> Optional[str]:
        """Run the assess-metrics template across all completed analyses.

        Every ``*-analysis.md`` file under ``output/analyses`` is concatenated
        (in sorted order, each prefixed with a source comment) and bound to the
        ``all_analyses`` macro. The compiled prompt is written to
        ``output/metrics/metrics-prompt.md``; the report is taken from
        ``metrics-report.md`` if it already exists, otherwise generated through
        the LLM adapter when one is configured.

        Returns:
            The metrics report text, or ``None`` when there are no analyses,
            the prompt could not be compiled, or no output is available yet.
        """
        print("Assessing metrics...")

        analyses_dir = self.example_dir / "output" / "analyses"
        analysis_files = sorted(analyses_dir.glob("*-analysis.md"))

        if not analysis_files:
            print("  No completed analyses found. Process chapters first.")
            return None

        # Concatenate all analyses, separated by horizontal rules.
        combined = "\n\n---\n\n".join(
            f"<!-- Source: {f.name} -->\n{f.read_text()}" for f in analysis_files
        )

        self.bind_macro_artifact(self.spaces["analyses"], "all_analyses", combined)

        macros = [
            self._macro("all_analyses"),
            self._macro("vsm_framework"),
        ]

        prompt = self.resolve_and_compile(
            "assess-metrics", macros, ["analyses", "vsm-reference"]
        )
        if not prompt:
            return None

        # The metrics folder may not exist yet; create it before writing the
        # prompt so the first metrics run does not fail.
        metrics_dir = self.example_dir / "output" / "metrics"
        metrics_dir.mkdir(parents=True, exist_ok=True)

        prompt_file = metrics_dir / "metrics-prompt.md"
        prompt_file.write_text(prompt)
        print(f"  Prompt written to {prompt_file.relative_to(self.example_dir)}")

        output_file = metrics_dir / "metrics-report.md"

        # Reuse a report that is already on disk (manual or LLM-generated).
        if output_file.exists():
            content = output_file.read_text()
            self.store_output_artifact(
                self.spaces["metrics"],
                "metrics-report",
                content,
                ArtifactType.GENERATED,
            )
            print(f"  Found existing output: {output_file.name}")
            return content

        # Otherwise generate it when an adapter is configured.
        if self.llm_adapter:
            content = self._execute_llm(prompt, output_file, "metrics")
            if content:
                self.store_output_artifact(
                    self.spaces["metrics"],
                    "metrics-report",
                    content,
                    ArtifactType.GENERATED,
                )
                return content

        print(f"  Awaiting output at: {output_file.relative_to(self.example_dir)}")
        return None

    # ── Chapter Processing ───────────────────────────────────────────

    def process_chapter(self, chapter_id: str, auto_commit: bool = True):
        """Run the three pipeline stages for one chapter, in order.

        The chapter source is read from ``artifacts/sources/<chapter_id>.md``.
        Processing stops with a "paused" message as soon as a stage yields no
        output. When all three stages complete, dependency edges are recorded
        and, if *auto_commit* is true, the outputs are committed to git.

        Args:
            chapter_id: Stem of the chapter's source file.
            auto_commit: Whether to run the git commit step at the end.
        """
        source_file = self.example_dir / "artifacts" / "sources" / f"{chapter_id}.md"
        if not source_file.exists():
            print(f"ERROR: Source file not found: {source_file}")
            return

        print(f"Processing: {chapter_id}")
        print("=" * 60)

        chapter_content = source_file.read_text()

        # Register the chapter source itself as an artifact.
        self.load_or_create_artifact(
            self.spaces["sources"], source_file, ArtifactType.CONTENT
        )

        paused = "\n  Pipeline paused. Generate {} output and re-run."

        # Stage 1 feeds stage 2, and both feed stage 3.
        entities = self.stage_extract_entities(chapter_id, chapter_content)
        if entities is None:
            print(paused.format("entities"))
            return

        mappings = self.stage_map_to_vsm(chapter_id, entities)
        if mappings is None:
            print(paused.format("mappings"))
            return

        analysis = self.stage_synthesize_analysis(
            chapter_id, chapter_content, entities, mappings
        )
        if analysis is None:
            print(paused.format("analysis"))
            return

        print(f"\n  Chapter {chapter_id} fully processed.")

        self._record_chapter_dependencies(chapter_id)
        if auto_commit:
            self._git_commit_chapter(chapter_id)

    def _record_chapter_dependencies(self, chapter_id: str):
        """Persist the dependency edges linking a chapter to its outputs.

        A manifest for the run id ``run-<chapter_id>`` gets a "requires" edge
        from the chapter's source artifact and a "generates" edge to each of
        the entities, mappings and analysis artifacts that exist in the
        repository. Failures while persisting are reported as a warning.
        """
        run_id = f"run-{chapter_id}"
        manifest = RunManifest.create(
            run_id=run_id,
            template_id="extract-entities",
            template_name="extract-entities",
            template_digest="",
        )

        # Source -> Run
        source_artifact = self.artifact_repo.get_by_name(
            self.spaces["sources"], chapter_id
        )
        if source_artifact:
            manifest.add_dependency_edge(source_artifact.id, run_id, "requires")

        # Run -> Outputs: (space key, artifact-name suffix) per stage.
        stage_outputs = (
            ("entities", "entities"),
            ("mappings", "mappings"),
            ("analyses", "analysis"),
        )
        for space_key, suffix in stage_outputs:
            output_artifact = self.artifact_repo.get_by_name(
                self.spaces[space_key], f"{chapter_id}-{suffix}"
            )
            if output_artifact:
                manifest.add_dependency_edge(run_id, output_artifact.id, "generates")

        try:
            persisted = self.graph_builder.persist_edges(manifest)
            print(f"  Recorded {len(persisted)} dependency edges.")
        except Exception as err:
            print(f"  Warning: Could not record dependencies: {err}")

    def _git_commit_chapter(self, chapter_id: str):
        """Stage the ``output`` directory and commit it to git (best effort).

        Both git commands run from ``self.example_dir`` so that staging and
        committing always target the same repository. Any failure — a
        non-zero git exit status (e.g. nothing to commit) or git not being
        runnable at all — is reported as a warning and never raised.

        Args:
            chapter_id: Chapter identifier used in the commit message.
        """
        output_dir = self.example_dir / "output"
        workdir = str(self.example_dir)
        message = (
            f"infospace: process {chapter_id}\n\n"
            f"Extract entities, map to VSM, and synthesize analysis\n"
            f"for {chapter_id}."
        )
        try:
            subprocess.run(
                ["git", "add", str(output_dir)],
                cwd=workdir,
                check=True,
                capture_output=True,
            )
            subprocess.run(
                ["git", "commit", "-m", message],
                cwd=workdir,
                check=True,
                capture_output=True,
            )
            print(f"  Git commit: infospace: process {chapter_id}")
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing git executable (FileNotFoundError),
            # which subprocess raises instead of CalledProcessError.
            print(f"  Warning: Git commit skipped ({e})")

    # ── Listing ──────────────────────────────────────────────────────

    def list_chapters(self):
        """Print a table of chapters and which stage outputs exist on disk.

        Chapters are the ``*.md`` stems under ``artifacts/sources``; a stage
        is shown as "done" when its output file exists and "-" otherwise.
        """
        sources_dir = self.example_dir / "artifacts" / "sources"
        chapters = sorted(src.stem for src in sources_dir.glob("*.md"))
        output_root = self.example_dir / "output"

        # (output folder, file-name suffix) for each table column.
        stage_outputs = (
            ("entities", "entities"),
            ("mappings", "mappings"),
            ("analyses", "analysis"),
        )

        def row(label, cells):
            # One table line: 30-wide label followed by 12-wide cells.
            return f"  {label:<30} " + " ".join(f"{cell:<12}" for cell in cells)

        print(f"Available chapters ({len(chapters)}):\n")
        print(row("Chapter", ("Entities", "Mappings", "Analysis")))
        print(row("-" * 30, ("-" * 12,) * 3))

        for ch in chapters:
            status = [
                "done" if (output_root / folder / f"{ch}-{suffix}.md").exists() else "-"
                for folder, suffix in stage_outputs
            ]
            print(row(ch, status))

    # ── Statistics ───────────────────────────────────────────────────

    def show_stats(self):
        """Print dependency graph statistics from the query service.

        Any error while fetching or printing the statistics is reported as a
        "No data yet" line instead of being raised.
        """
        print("\nDependency Statistics:")

        # (printed label, key in the stats mapping), in display order.
        fields = (
            ("Nodes", "total_nodes"),
            ("Edges", "total_edges"),
            ("Root artifacts", "root_count"),
            ("Leaf artifacts", "leaf_count"),
            ("Has cycles", "has_cycles"),
        )
        try:
            stats = self.query_service.get_dependency_stats()
            for label, key in fields:
                print(f"  {label}: {stats[key]}")
        except Exception as err:
            print(f"  (No data yet: {err})")


def main():
    """Command-line entry point: parse arguments and run the chosen action.

    Exactly one of ``--chapter``, ``--book``, ``--all``, ``--metrics``,
    ``--list`` or ``--stats`` is required. Static artifacts are loaded before
    any action runs, and dependency statistics are printed at the end (except
    when ``--book`` matches no chapters, which returns early).
    """
    parser = argparse.ArgumentParser(
        description="Process Wealth of Nations chapters through VSM analysis pipeline"
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--chapter", type=str, help="Process a single chapter (e.g., book-1-chapter-01)")
    group.add_argument("--book", type=int, help="Process all chapters in a book (1-5)")
    group.add_argument("--all", action="store_true", help="Process all chapters")
    group.add_argument("--metrics", action="store_true", help="Assess metrics only")
    group.add_argument("--list", action="store_true", help="List available chapters")
    group.add_argument("--stats", action="store_true", help="Show dependency statistics")

    parser.add_argument("--no-commit", action="store_true", help="Skip git commits")
    parser.add_argument(
        "--provider",
        type=str,
        choices=["openrouter", "claude-code"],
        default=None,
        help="LLM provider for auto-generating outputs (omit for manual mode)",
    )
    parser.add_argument("--model", type=str, default=None, help="Model name to pass to the LLM provider")

    args = parser.parse_args()

    # Build optional LLM adapter
    llm_adapter = None
    if args.provider:
        from markitect.llm import create_adapter
        llm_adapter = create_adapter(args.provider, model=args.model)
        print(f"LLM: {args.provider}" + (f" ({args.model})" if args.model else ""))

    example_dir = Path(__file__).parent
    processor = ChapterProcessor(example_dir, llm_adapter=llm_adapter)
    processor.setup()

    auto_commit = not args.no_commit
    sources_dir = example_dir / "artifacts" / "sources"

    if args.list:
        processor.list_chapters()
    elif args.stats:
        processor.show_stats()
    elif args.metrics:
        processor.assess_metrics()
    # Compare against None rather than truthiness: `--chapter ""` and
    # `--book 0` are explicit selections and must get a "not found" message
    # instead of silently doing nothing.
    elif args.chapter is not None:
        processor.process_chapter(args.chapter, auto_commit=auto_commit)
    elif args.book is not None:
        chapters = sorted(
            f.stem for f in sources_dir.glob(f"book-{args.book}-chapter-*.md")
        )
        if not chapters:
            print(f"No chapters found for Book {args.book}")
            return
        print(f"Processing {len(chapters)} chapters from Book {args.book}\n")
        for ch in chapters:
            processor.process_chapter(ch, auto_commit=auto_commit)
            print()
    elif args.all:
        chapters = sorted(f.stem for f in sources_dir.glob("*.md"))
        print(f"Processing all {len(chapters)} chapters\n")
        for ch in chapters:
            processor.process_chapter(ch, auto_commit=auto_commit)
            print()

    processor.show_stats()


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()