Files
markitect-main/examples/infospace-with-history/process_chapters.py
tegwick e806a701ca infospace: process book-1-chapter-03 with LLM integration
Auto-generated mappings and analysis via Claude Code CLI adapter.
Entities were already present from a previous session.

Stats: 5m04s wall time, ~51K estimated tokens, ~$0.35 estimated cost.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 01:32:24 +01:00

707 lines
28 KiB
Python

#!/usr/bin/env python3
"""
Infospace with History — Chapter Processing Pipeline
Processes chapters from Adam Smith's "The Wealth of Nations" through a
three-stage analysis pipeline, mapping economic content to Stafford Beer's
Viable System Model.
Pipeline per chapter:
1. extract-entities — Extract economic entities from chapter text
2. map-to-vsm — Map entities to VSM concepts
3. synthesize-analysis — Produce chapter-level VSM analysis
After all chapters:
4. assess-metrics — Evaluate completeness and consistency
Usage:
# Process a single chapter
python process_chapters.py --chapter book-1-chapter-01
# Process all chapters in Book I
python process_chapters.py --book 1
# Process all chapters
python process_chapters.py --all
# Assess metrics only (after chapters have been processed)
python process_chapters.py --metrics
# List available chapters
python process_chapters.py --list
"""
import argparse
import subprocess
import sys
from pathlib import Path
from typing import Optional
# Add project root to path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
from markitect.prompts.models import Artifact, ArtifactType
from markitect.prompts.repositories.sqlite import SQLiteArtifactRepository
from markitect.prompts.dependencies.repository import SQLiteDependencyRepository
from markitect.prompts.services.artifact_service import ArtifactService
from markitect.prompts.templates.models import PromptTemplate, ContentMacro, MacroKind
from markitect.prompts.resolver.resolver import PromptResolver
from markitect.prompts.resolver.compiler import ContextCompiler
from markitect.prompts.resolver.strategy import ResolutionConfig, MultiSpaceResolutionStrategy
from markitect.prompts.execution.manifest import RunManifest
from markitect.prompts.dependencies.graph import GraphBuilder
from markitect.prompts.traceability.service import TraceabilityService
from markitect.prompts.queries.operations import PromptQueryService
class ChapterProcessor:
    """Drive Wealth of Nations chapters through the VSM analysis pipeline."""

    def __init__(
        self,
        example_dir: Path,
        db_path: Optional[str] = None,
        llm_adapter=None,
    ):
        """Create the repositories, services and space table used by the pipeline.

        Args:
            example_dir: Directory holding the templates, artifacts and output tree.
            db_path: SQLite database path. A falsy value selects
                ``infospace.db`` inside *example_dir*.
            llm_adapter: Optional adapter whose ``execute_prompt`` is called to
                auto-generate stage outputs; ``None`` leaves the pipeline in
                manual mode.
        """
        if not db_path:
            db_path = str(example_dir / "infospace.db")

        self.example_dir = example_dir
        self.db_path = db_path
        self.llm_adapter = llm_adapter

        # Repositories come first; every service below is built on top of them.
        artifacts = SQLiteArtifactRepository(db_path)
        dependencies = SQLiteDependencyRepository(db_path)
        self.artifact_repo = artifacts
        self.dep_repo = dependencies
        self.artifact_service = ArtifactService(artifacts)
        self.graph_builder = GraphBuilder(dependencies)
        self.trace_service = TraceabilityService(
            artifacts, dependencies, db_path=db_path
        )
        self.query_service = PromptQueryService(
            artifacts, dependencies, db_path=db_path
        )

        # Short key -> information-space id used throughout the pipeline.
        self.spaces = {
            "templates": "infospace-templates",
            "sources": "infospace-sources",
            "guidelines": "infospace-guidelines",
            "vsm-reference": "infospace-vsm-reference",
            "entities": "infospace-entities",
            "mappings": "infospace-mappings",
            "analyses": "infospace-analyses",
            "metrics": "infospace-metrics",
        }

        # Artifact id -> text. Kept locally because substitution in
        # resolve_and_compile reads content from here, not from the repository.
        self.artifact_content: dict[str, str] = {}
# ── Artifact Management ──────────────────────────────────────────
def load_or_create_artifact(
    self,
    space: str,
    filepath: Path,
    artifact_type: ArtifactType,
    name: Optional[str] = None,
) -> tuple[Artifact, str]:
    """Return the artifact for *filepath* in *space*, registering it if absent.

    The file is always read, and its text is cached under the artifact's id
    whether the artifact already existed or was just created.

    Args:
        space: Information-space id to look the artifact up in.
        filepath: File whose text becomes the artifact content.
        artifact_type: Type used only when a new artifact is created.
        name: Artifact name; defaults to the file's stem.

    Returns:
        ``(artifact, content)`` where *content* is the text read from disk.
    """
    artifact_name = filepath.stem if name is None else name
    text = filepath.read_text()

    found = self.artifact_repo.get_by_name(space, artifact_name)
    if not found:
        created = self.artifact_repo.create(
            Artifact.create(
                space_id=space,
                name=artifact_name,
                content=text,
                artifact_type=artifact_type,
            )
        )
        self.artifact_content[created.id] = text
        print(f" + {artifact_name} ({created.content_digest[:8]})")
        return created, text

    # Already registered: only refresh the cached text.
    self.artifact_content[found.id] = text
    return found, text
def store_output_artifact(
    self, space: str, name: str, content: str, artifact_type: ArtifactType
) -> Artifact:
    """Replace (or create) the artifact called *name* in *space*.

    Any artifact already registered under that name is deleted first, then a
    new one is created from *content* and its text is cached under the new id.

    Returns:
        The artifact as returned by the repository's ``create``.
    """
    previous = self.artifact_repo.get_by_name(space, name)
    if previous:
        self.artifact_repo.delete(previous.id)

    fresh = Artifact.create(
        space_id=space,
        name=name,
        content=content,
        artifact_type=artifact_type,
    )
    stored = self.artifact_repo.create(fresh)
    self.artifact_content[stored.id] = content
    return stored
def bind_macro_artifact(self, space: str, macro_name: str, content: str) -> Artifact:
    """Bind *content* to *macro_name* in *space* so templates can resolve it.

    This is the delete-then-create sequence of ``store_output_artifact`` with
    the artifact type fixed to ``ArtifactType.CONTENT``.
    """
    return self.store_output_artifact(
        space, macro_name, content, ArtifactType.CONTENT
    )
# ── Setup ────────────────────────────────────────────────────────
def setup(self):
"""Load all static artifacts (templates, guidelines, VSM reference)."""
print("Loading artifacts...")
# Templates
for tmpl_file in (self.example_dir / "templates").glob("*.md"):
self.load_or_create_artifact(
self.spaces["templates"], tmpl_file, ArtifactType.TEMPLATE
)
# VSM reference
for ref_file in (self.example_dir / "artifacts" / "vsm-reference").glob("*.md"):
self.load_or_create_artifact(
self.spaces["vsm-reference"], ref_file, ArtifactType.CONTENT,
name="vsm_framework",
)
# Guidelines
guideline_name_map = {
"extraction-rules.md": "extraction_rules",
"mapping-rules.md": "mapping_rules",
}
for guide_file in (self.example_dir / "artifacts" / "guidelines").glob("*.md"):
name = guideline_name_map.get(guide_file.name, guide_file.stem)
self.load_or_create_artifact(
self.spaces["guidelines"], guide_file, ArtifactType.CONTENT, name=name
)
print(" Done.\n")
# ── Helpers ───────────────────────────────────────────────────────
@staticmethod
def _macro(target: str, kind: MacroKind = MacroKind.REQUIRED) -> ContentMacro:
    """Build a ContentMacro whose raw_text is the ``@{target}`` placeholder."""
    placeholder = "@{{{}}}".format(target)
    return ContentMacro(kind=kind, target=target, raw_text=placeholder)
# ── Template Resolution ──────────────────────────────────────────
def resolve_and_compile(
    self, template_name: str, macros: list[ContentMacro], extra_spaces: list[str]
) -> Optional[str]:
    """Resolve macros and compile a template into a final prompt string.

    The resolver is used for dependency validation; the actual text is then
    substituted from the local content cache, because the resolver returns
    placeholders rather than content (see resolver.py).

    Substitution is a single pass over the template: text injected for one
    macro is never re-scanned for further ``@{...}`` tokens, so the result
    does not depend on macro order or on what the injected content contains.

    Args:
        template_name: Name of the template artifact in the templates space.
        macros: Macros to attach to the template before resolution.
        extra_spaces: Keys of ``self.spaces`` to search in addition to the
            templates space.

    Returns:
        The compiled prompt, or ``None`` when the template is not registered
        or resolution fails (an error line is printed in both cases).
    """
    import re

    template_artifact = self.artifact_repo.get_by_name(
        self.spaces["templates"], template_name
    )
    if not template_artifact:
        print(f" ERROR: Template '{template_name}' not found")
        return None

    template = PromptTemplate.from_artifact(template_artifact)
    template.macros = macros
    template.analyzed = True

    config = ResolutionConfig(
        space_id=self.spaces["templates"],
        included_spaces=[self.spaces[s] for s in extra_spaces],
    )
    strategy = MultiSpaceResolutionStrategy()
    resolver = PromptResolver(self.artifact_service, strategy)
    result = resolver.resolve_template(template, config)
    if not result.success:
        print(f" ERROR: Resolution failed: {result.context.errors}")
        return None

    # Prefer the cached template text; fall back to the file on disk.
    template_content = self.artifact_content.get(template_artifact.id)
    if not template_content:
        template_content = (
            self.example_dir / "templates" / f"{template_name}.md"
        ).read_text()

    # token -> replacement text. The first resolution of a token wins, which
    # is what sequential replacement did for non-nested content.
    replacements: dict[str, str] = {}
    for resolved in result.context.resolved_macros:
        if resolved.resolved and resolved.artifact:
            token = f"@{{{resolved.macro.target}}}"
            replacements.setdefault(
                token, self.artifact_content.get(resolved.artifact.id, "")
            )

    if not replacements:
        # An empty alternation would match everywhere; nothing to substitute.
        return template_content

    token_pattern = re.compile("|".join(re.escape(token) for token in replacements))
    # A callable replacement keeps backslashes in the content literal.
    return token_pattern.sub(
        lambda match: replacements[match.group(0)], template_content
    )
# ── LLM Execution Helper ────────────────────────────────────────
def _execute_llm(self, prompt: str, output_file: Path, stage_label: str) -> Optional[str]:
    """Execute *prompt* via the configured LLM adapter and write the result.

    Args:
        prompt: Fully compiled prompt text.
        output_file: Destination for the generated content; its parent
            directory is created when missing.
        stage_label: Short label used only in progress messages.

    Returns:
        The generated content, or ``None`` when the adapter raises or
        returns empty/whitespace-only content (nothing is written then).
    """
    import time as _time

    from markitect.prompts.execution.models import RunConfig

    print(f" Calling LLM ({stage_label})...")
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments during a multi-minute call.
    started = _time.perf_counter()
    try:
        response = self.llm_adapter.execute_prompt(prompt, RunConfig())
    except Exception as exc:
        print(f" LLM error ({_time.perf_counter() - started:.1f}s): {exc}")
        return None
    elapsed = _time.perf_counter() - started

    # A provider that reports no usage must not cost us the finished
    # response just because the statistics line cannot be formatted.
    usage = response.usage or {}
    print(
        f" LLM done in {elapsed:.1f}s — "
        f"prompt {usage.get('prompt_tokens', '?')} tok, "
        f"completion {usage.get('completion_tokens', '?')} tok, "
        f"total {usage.get('total_tokens', '?')} tok"
    )

    content = response.content
    if not content or not content.strip():
        print(" LLM returned empty content")
        return None

    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text(content)
    print(f" LLM output written to {output_file.name}")
    return content
# ── Pipeline Stages ──────────────────────────────────────────────
def stage_extract_entities(self, chapter_id: str, chapter_content: str) -> Optional[str]:
    """Stage 1: Extract economic entities from a chapter.

    Compiles the ``extract-entities`` prompt and writes it to
    ``output/entities/<chapter_id>-prompt.md``. The stage output is then
    taken from ``<chapter_id>-entities.md`` if that file already exists, or
    generated through the LLM adapter when one is configured.

    Args:
        chapter_id: Chapter identifier used in file and artifact names.
        chapter_content: Chapter text bound to the ``chapter_text`` macro.

    Returns:
        The entities text, or ``None`` when the prompt could not be compiled
        or no output is available yet.
    """
    print(f" [1/3] Extracting entities...")

    # Bind the chapter content to the macro name the template expects.
    self.bind_macro_artifact(self.spaces["sources"], "chapter_text", chapter_content)
    macros = [
        self._macro("chapter_text"),
        self._macro("extraction_rules"),
        self._macro("vsm_framework"),
    ]
    prompt = self.resolve_and_compile(
        "extract-entities", macros, ["sources", "guidelines", "vsm-reference"]
    )
    if not prompt:
        return None

    stage_dir = self.example_dir / "output" / "entities"
    # The output tree may not exist yet (e.g. on a fresh checkout); create it
    # so writing the prompt cannot fail with FileNotFoundError.
    stage_dir.mkdir(parents=True, exist_ok=True)

    # Write compiled prompt for inspection / manual LLM execution.
    prompt_file = stage_dir / f"{chapter_id}-prompt.md"
    prompt_file.write_text(prompt)
    print(f" Prompt written to {prompt_file.relative_to(self.example_dir)}")

    artifact_name = f"{chapter_id}-entities"
    output_file = stage_dir / f"{artifact_name}.md"

    # An existing output (manual or earlier LLM run) always wins.
    if output_file.exists():
        content = output_file.read_text()
        self.store_output_artifact(
            self.spaces["entities"], artifact_name, content, ArtifactType.GENERATED
        )
        print(f" Found existing output: {output_file.name}")
        return content

    # Otherwise auto-generate when an adapter is available.
    if self.llm_adapter:
        content = self._execute_llm(prompt, output_file, "entities")
        if content:
            self.store_output_artifact(
                self.spaces["entities"], artifact_name, content, ArtifactType.GENERATED
            )
            return content

    print(f" Awaiting output at: {output_file.relative_to(self.example_dir)}")
    return None
def stage_map_to_vsm(self, chapter_id: str, entities_content: str) -> Optional[str]:
    """Stage 2: Map extracted entities to VSM concepts.

    Compiles the ``map-to-vsm`` prompt and writes it to
    ``output/mappings/<chapter_id>-prompt.md``. The stage output is then
    taken from ``<chapter_id>-mappings.md`` if that file already exists, or
    generated through the LLM adapter when one is configured.

    Args:
        chapter_id: Chapter identifier used in file and artifact names.
        entities_content: Stage 1 output bound to the ``entities`` macro.

    Returns:
        The mappings text, or ``None`` when the prompt could not be compiled
        or no output is available yet.
    """
    print(f" [2/3] Mapping to VSM...")

    self.bind_macro_artifact(self.spaces["entities"], "entities", entities_content)
    macros = [
        self._macro("entities"),
        self._macro("vsm_framework"),
        self._macro("mapping_rules"),
    ]
    prompt = self.resolve_and_compile(
        "map-to-vsm", macros, ["entities", "vsm-reference", "guidelines"]
    )
    if not prompt:
        return None

    stage_dir = self.example_dir / "output" / "mappings"
    # The output tree may not exist yet (e.g. on a fresh checkout); create it
    # so writing the prompt cannot fail with FileNotFoundError.
    stage_dir.mkdir(parents=True, exist_ok=True)

    prompt_file = stage_dir / f"{chapter_id}-prompt.md"
    prompt_file.write_text(prompt)
    print(f" Prompt written to {prompt_file.relative_to(self.example_dir)}")

    artifact_name = f"{chapter_id}-mappings"
    output_file = stage_dir / f"{artifact_name}.md"

    # An existing output (manual or earlier LLM run) always wins.
    if output_file.exists():
        content = output_file.read_text()
        self.store_output_artifact(
            self.spaces["mappings"], artifact_name, content, ArtifactType.GENERATED
        )
        print(f" Found existing output: {output_file.name}")
        return content

    # Otherwise auto-generate when an adapter is available.
    if self.llm_adapter:
        content = self._execute_llm(prompt, output_file, "mappings")
        if content:
            self.store_output_artifact(
                self.spaces["mappings"], artifact_name, content, ArtifactType.GENERATED
            )
            return content

    print(f" Awaiting output at: {output_file.relative_to(self.example_dir)}")
    return None
def stage_synthesize_analysis(
    self, chapter_id: str, chapter_content: str, entities_content: str, mappings_content: str
) -> Optional[str]:
    """Stage 3: Synthesize chapter-level VSM analysis.

    Compiles the ``synthesize-analysis`` prompt and writes it to
    ``output/analyses/<chapter_id>-prompt.md``. The stage output is then
    taken from ``<chapter_id>-analysis.md`` if that file already exists, or
    generated through the LLM adapter when one is configured.

    Args:
        chapter_id: Chapter identifier used in file and artifact names.
        chapter_content: Chapter text bound to the ``chapter_text`` macro.
        entities_content: Stage 1 output bound to the ``entities`` macro.
        mappings_content: Stage 2 output bound to the ``mappings`` macro.

    Returns:
        The analysis text, or ``None`` when the prompt could not be compiled
        or no output is available yet.
    """
    print(f" [3/3] Synthesizing analysis...")

    self.bind_macro_artifact(self.spaces["sources"], "chapter_text", chapter_content)
    self.bind_macro_artifact(self.spaces["entities"], "entities", entities_content)
    self.bind_macro_artifact(self.spaces["mappings"], "mappings", mappings_content)
    macros = [
        self._macro("chapter_text"),
        self._macro("entities"),
        self._macro("mappings"),
        self._macro("vsm_framework"),
    ]
    prompt = self.resolve_and_compile(
        "synthesize-analysis",
        macros,
        ["sources", "entities", "mappings", "vsm-reference"],
    )
    if not prompt:
        return None

    stage_dir = self.example_dir / "output" / "analyses"
    # The output tree may not exist yet (e.g. on a fresh checkout); create it
    # so writing the prompt cannot fail with FileNotFoundError.
    stage_dir.mkdir(parents=True, exist_ok=True)

    prompt_file = stage_dir / f"{chapter_id}-prompt.md"
    prompt_file.write_text(prompt)
    print(f" Prompt written to {prompt_file.relative_to(self.example_dir)}")

    artifact_name = f"{chapter_id}-analysis"
    output_file = stage_dir / f"{artifact_name}.md"

    # An existing output (manual or earlier LLM run) always wins.
    if output_file.exists():
        content = output_file.read_text()
        self.store_output_artifact(
            self.spaces["analyses"], artifact_name, content, ArtifactType.GENERATED
        )
        print(f" Found existing output: {output_file.name}")
        return content

    # Otherwise auto-generate when an adapter is available.
    if self.llm_adapter:
        content = self._execute_llm(prompt, output_file, "analysis")
        if content:
            self.store_output_artifact(
                self.spaces["analyses"], artifact_name, content, ArtifactType.GENERATED
            )
            return content

    print(f" Awaiting output at: {output_file.relative_to(self.example_dir)}")
    return None
# ── Metrics ──────────────────────────────────────────────────────
def assess_metrics(self) -> Optional[str]:
    """Run the assess-metrics template across all completed analyses.

    Every ``output/analyses/*-analysis.md`` file is concatenated (in sorted
    order, each prefixed with a source comment) and bound to the
    ``all_analyses`` macro. The compiled prompt is written to
    ``output/metrics/metrics-prompt.md``; the report is then taken from
    ``metrics-report.md`` if it exists, or generated through the LLM adapter
    when one is configured.

    Returns:
        The metrics report text, or ``None`` when there are no analyses, the
        prompt could not be compiled, or no output is available yet.
    """
    print("Assessing metrics...")

    analyses_dir = self.example_dir / "output" / "analyses"
    analysis_files = sorted(analyses_dir.glob("*-analysis.md"))
    if not analysis_files:
        print(" No completed analyses found. Process chapters first.")
        return None

    # Concatenate all analyses, tagging each with the file it came from.
    combined = "\n\n---\n\n".join(
        f"<!-- Source: {path.name} -->\n{path.read_text()}" for path in analysis_files
    )
    self.bind_macro_artifact(self.spaces["analyses"], "all_analyses", combined)
    macros = [
        self._macro("all_analyses"),
        self._macro("vsm_framework"),
    ]
    prompt = self.resolve_and_compile(
        "assess-metrics", macros, ["analyses", "vsm-reference"]
    )
    if not prompt:
        return None

    metrics_dir = self.example_dir / "output" / "metrics"
    # The metrics directory may not exist yet; create it so writing the
    # prompt cannot fail with FileNotFoundError.
    metrics_dir.mkdir(parents=True, exist_ok=True)

    prompt_file = metrics_dir / "metrics-prompt.md"
    prompt_file.write_text(prompt)
    print(f" Prompt written to {prompt_file.relative_to(self.example_dir)}")

    output_file = metrics_dir / "metrics-report.md"

    # An existing report (manual or earlier LLM run) always wins.
    if output_file.exists():
        content = output_file.read_text()
        self.store_output_artifact(
            self.spaces["metrics"], "metrics-report", content, ArtifactType.GENERATED
        )
        print(f" Found existing output: {output_file.name}")
        return content

    # Otherwise auto-generate when an adapter is available.
    if self.llm_adapter:
        content = self._execute_llm(prompt, output_file, "metrics")
        if content:
            self.store_output_artifact(
                self.spaces["metrics"], "metrics-report", content, ArtifactType.GENERATED
            )
            return content

    print(f" Awaiting output at: {output_file.relative_to(self.example_dir)}")
    return None
# ── Chapter Processing ───────────────────────────────────────────
def process_chapter(self, chapter_id: str, auto_commit: bool = True):
    """Run the three pipeline stages for one chapter.

    The chapter text is read from ``artifacts/sources/<chapter_id>.md``.
    Processing stops after the first stage that yields no output. When all
    three stages succeed, dependency edges are recorded and, if
    *auto_commit* is true, the outputs are committed to git.
    """
    source_file = self.example_dir / "artifacts" / "sources" / f"{chapter_id}.md"
    if not source_file.exists():
        print(f"ERROR: Source file not found: {source_file}")
        return

    def report_pause(missing: str) -> None:
        # Shared wording for the "stage produced nothing yet" exits.
        print(f"\n Pipeline paused. Generate {missing} output and re-run.")

    print(f"Processing: {chapter_id}")
    print("=" * 60)
    chapter_text = source_file.read_text()

    # Register the source chapter itself as an artifact.
    self.load_or_create_artifact(
        self.spaces["sources"], source_file, ArtifactType.CONTENT
    )

    # Stage 1: entities
    entities = self.stage_extract_entities(chapter_id, chapter_text)
    if entities is None:
        report_pause("entities")
        return

    # Stage 2: mappings
    mappings = self.stage_map_to_vsm(chapter_id, entities)
    if mappings is None:
        report_pause("mappings")
        return

    # Stage 3: analysis
    analysis = self.stage_synthesize_analysis(
        chapter_id, chapter_text, entities, mappings
    )
    if analysis is None:
        report_pause("analysis")
        return

    print(f"\n Chapter {chapter_id} fully processed.")
    self._record_chapter_dependencies(chapter_id)
    if auto_commit:
        self._git_commit_chapter(chapter_id)
def _record_chapter_dependencies(self, chapter_id: str):
    """Persist the dependency edges of a fully processed chapter.

    Builds a run manifest with a ``requires`` edge from the source artifact
    to the run and a ``generates`` edge from the run to each stage output
    found in the repository. Persistence is best-effort: a failure is
    reported as a warning and not raised.
    """
    run_id = f"run-{chapter_id}"
    manifest = RunManifest.create(
        run_id=run_id,
        template_id="extract-entities",
        template_name="extract-entities",
        template_digest="",
    )

    # Source → Run
    source_artifact = self.artifact_repo.get_by_name(
        self.spaces["sources"], chapter_id
    )
    if source_artifact:
        manifest.add_dependency_edge(source_artifact.id, run_id, "requires")

    # Run → Outputs: (space key, artifact-name suffix) per stage.
    stage_outputs = (
        ("entities", "entities"),
        ("mappings", "mappings"),
        ("analyses", "analysis"),
    )
    for space_key, suffix in stage_outputs:
        produced = self.artifact_repo.get_by_name(
            self.spaces[space_key], f"{chapter_id}-{suffix}"
        )
        if produced:
            manifest.add_dependency_edge(run_id, produced.id, "generates")

    try:
        persisted = self.graph_builder.persist_edges(manifest)
        print(f" Recorded {len(persisted)} dependency edges.")
    except Exception as exc:
        print(f" Warning: Could not record dependencies: {exc}")
def _git_commit_chapter(self, chapter_id: str):
"""Commit chapter outputs to git."""
output_dir = self.example_dir / "output"
try:
subprocess.run(
["git", "add", str(output_dir)],
cwd=str(self.example_dir),
check=True,
capture_output=True,
)
subprocess.run(
["git", "commit", "-m", f"infospace: process {chapter_id}\n\n"
f"Extract entities, map to VSM, and synthesize analysis\n"
f"for {chapter_id}."],
cwd=str(project_root),
check=True,
capture_output=True,
)
print(f" Git commit: infospace: process {chapter_id}")
except subprocess.CalledProcessError as e:
print(f" Warning: Git commit skipped ({e})")
# ── Listing ──────────────────────────────────────────────────────
def list_chapters(self):
    """Print a table of source chapters and which stage outputs exist.

    Chapters are the ``*.md`` stems under ``artifacts/sources``, sorted. A
    stage column shows ``done`` when its output file exists, else ``-``.
    """
    sources_dir = self.example_dir / "artifacts" / "sources"
    output_root = self.example_dir / "output"
    chapter_ids = sorted(path.stem for path in sources_dir.glob("*.md"))

    def status(stage_dir: str, filename: str) -> str:
        # "done" when the stage's output file is present on disk.
        if (output_root / stage_dir / filename).exists():
            return "done"
        return "-"

    print(f"Available chapters ({len(chapter_ids)}):\n")
    print(f" {'Chapter':<30} {'Entities':<12} {'Mappings':<12} {'Analysis':<12}")
    print(f" {'-'*30} {'-'*12} {'-'*12} {'-'*12}")
    for chapter_id in chapter_ids:
        entities = status("entities", f"{chapter_id}-entities.md")
        mappings = status("mappings", f"{chapter_id}-mappings.md")
        analysis = status("analyses", f"{chapter_id}-analysis.md")
        print(f" {chapter_id:<30} {entities:<12} {mappings:<12} {analysis:<12}")
# ── Statistics ───────────────────────────────────────────────────
def show_stats(self):
    """Print dependency-graph statistics from the query service.

    Any exception while fetching or printing the statistics is reported as a
    single "(No data yet: ...)" line instead of being raised.
    """
    print("\nDependency Statistics:")
    # (printed label, key in the stats mapping), in output order.
    rows = (
        ("Nodes", "total_nodes"),
        ("Edges", "total_edges"),
        ("Root artifacts", "root_count"),
        ("Leaf artifacts", "leaf_count"),
        ("Has cycles", "has_cycles"),
    )
    try:
        stats = self.query_service.get_dependency_stats()
        for label, key in rows:
            print(f" {label}: {stats[key]}")
    except Exception as exc:
        print(f" (No data yet: {exc})")
def main():
    """Command-line entry point: parse arguments and run the selected mode.

    Exactly one mode flag is required (``--chapter``, ``--book``, ``--all``,
    ``--metrics``, ``--list`` or ``--stats``). Static artifacts are loaded
    before any mode runs. Dependency statistics are printed once at the end
    of every mode, except when ``--book`` matches no chapters.
    """
    parser = argparse.ArgumentParser(
        description="Process Wealth of Nations chapters through VSM analysis pipeline"
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--chapter", type=str, help="Process a single chapter (e.g., book-1-chapter-01)")
    group.add_argument("--book", type=int, help="Process all chapters in a book (1-5)")
    group.add_argument("--all", action="store_true", help="Process all chapters")
    group.add_argument("--metrics", action="store_true", help="Assess metrics only")
    group.add_argument("--list", action="store_true", help="List available chapters")
    group.add_argument("--stats", action="store_true", help="Show dependency statistics")
    parser.add_argument("--no-commit", action="store_true", help="Skip git commits")
    parser.add_argument(
        "--provider",
        type=str,
        choices=["openrouter", "claude-code"],
        default=None,
        help="LLM provider for auto-generating outputs (omit for manual mode)",
    )
    parser.add_argument("--model", type=str, default=None, help="Model name to pass to the LLM provider")
    args = parser.parse_args()

    # Build optional LLM adapter; imported lazily so manual mode does not
    # need the LLM package.
    llm_adapter = None
    if args.provider:
        from markitect.llm import create_adapter

        llm_adapter = create_adapter(args.provider, model=args.model)
        print(f"LLM: {args.provider}" + (f" ({args.model})" if args.model else ""))

    example_dir = Path(__file__).parent
    processor = ChapterProcessor(example_dir, llm_adapter=llm_adapter)
    processor.setup()

    auto_commit = not args.no_commit
    sources_dir = example_dir / "artifacts" / "sources"

    if args.list:
        processor.list_chapters()
    elif args.stats:
        # The statistics are printed by the shared call at the end; printing
        # here as well would show them twice.
        pass
    elif args.metrics:
        processor.assess_metrics()
    elif args.chapter is not None:
        # Compare with None so an empty chapter id is reported as a missing
        # source file rather than silently ignored.
        processor.process_chapter(args.chapter, auto_commit=auto_commit)
    elif args.book is not None:
        # Compare with None so "--book 0" reaches the "No chapters found"
        # message instead of being skipped as falsy.
        chapters = sorted(
            f.stem for f in sources_dir.glob(f"book-{args.book}-chapter-*.md")
        )
        if not chapters:
            print(f"No chapters found for Book {args.book}")
            return
        print(f"Processing {len(chapters)} chapters from Book {args.book}\n")
        for ch in chapters:
            processor.process_chapter(ch, auto_commit=auto_commit)
            print()
    elif args.all:
        chapters = sorted(f.stem for f in sources_dir.glob("*.md"))
        print(f"Processing all {len(chapters)} chapters\n")
        for ch in chapters:
            processor.process_chapter(ch, auto_commit=auto_commit)
            print()

    processor.show_stats()


if __name__ == "__main__":
    main()