Files
markitect-main/examples/infospace-with-history/process_chapters.py
tegwick fa27572f43 fix(example): skip prompt writes when output exists, add quality rubrics
INFRA-TASKS #5 — process_chapters.py now skips writing *-prompt.md files
when the corresponding output file already exists on disk. DB-only rebuilds
no longer dirty the working tree with unchanged prompt content.

INFRA-TASKS #8 — Added '## Quality Metrics' section to the entity and VSM
mapping schemas, defining the five evaluation dimensions (Definition Precision,
Source Grounding, Domain Placement, VSM Relevance, Explanatory Value) with
1–5 rubrics used by the evaluate-entity template.

Also updated INFRA-TASKS.md to reflect current resolution status for tasks
4–19 across S2 and S3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-23 06:04:09 +01:00

1208 lines
49 KiB
Python

#!/usr/bin/env python3
"""
Infospace with History — Chapter Processing Pipeline
Processes chapters from Adam Smith's "The Wealth of Nations" through a
three-stage analysis pipeline, mapping economic content to Stafford Beer's
Viable System Model.
Pipeline per chapter:
1. extract-entities — Extract economic entities from chapter text
2. map-to-vsm — Map entities to VSM concepts
3. synthesize-analysis — Produce chapter-level VSM analysis
After all chapters:
4. assess-metrics — Evaluate completeness and consistency
Usage:
# Process a single chapter
python process_chapters.py --chapter book-1-chapter-01
# Process all chapters in Book I
python process_chapters.py --book 1
# Process all chapters
python process_chapters.py --all
# Assess metrics only (after chapters have been processed)
python process_chapters.py --metrics
# List available chapters
python process_chapters.py --list
"""
import argparse
import re
import subprocess
import sys
from pathlib import Path
from typing import Optional
# Add project root to path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
from markitect.prompts.models import Artifact, ArtifactType
from markitect.prompts.repositories.sqlite import SQLiteArtifactRepository
from markitect.prompts.dependencies.repository import SQLiteDependencyRepository
from markitect.prompts.services.artifact_service import ArtifactService
from markitect.prompts.templates.models import PromptTemplate
from markitect.prompts.templates.analyzer import TemplateAnalyzer
from markitect.prompts.resolver.resolver import PromptResolver
from markitect.prompts.resolver.compiler import ContextCompiler
from markitect.prompts.resolver.strategy import ResolutionConfig, MultiSpaceResolutionStrategy
from markitect.prompts.execution.manifest import RunManifest
from markitect.prompts.dependencies.graph import GraphBuilder
from markitect.prompts.traceability.service import TraceabilityService
from markitect.prompts.queries.operations import PromptQueryService
class ChapterProcessor:
"""Processes Wealth of Nations chapters through the VSM analysis pipeline."""
def __init__(
    self,
    example_dir: Path,
    db_path: Optional[str] = None,
    llm_adapter=None,
):
    """Build the repositories, services and space-name table for one example.

    Args:
        example_dir: Root directory of the example (templates, artifacts, output).
        db_path: Path of the SQLite database; when falsy, ``infospace.db``
            inside *example_dir* is used.
        llm_adapter: Optional adapter used to execute prompts; stored unchanged.
    """
    self.example_dir = example_dir
    self.db_path = db_path if db_path else str(example_dir / "infospace.db")
    self.llm_adapter = llm_adapter

    # Persistence layer: both repositories share the same database file.
    artifacts = SQLiteArtifactRepository(self.db_path)
    dependencies = SQLiteDependencyRepository(self.db_path)
    self.artifact_repo = artifacts
    self.dep_repo = dependencies

    # Services layered on top of the repositories.
    self.artifact_service = ArtifactService(artifacts)
    self.graph_builder = GraphBuilder(dependencies)
    self.trace_service = TraceabilityService(
        artifacts, dependencies, db_path=self.db_path
    )
    self.query_service = PromptQueryService(
        artifacts, dependencies, db_path=self.db_path
    )

    # Template parsing and prompt assembly.
    self.analyzer = TemplateAnalyzer()
    self.compiler = ContextCompiler()

    # Short key -> information-space identifier used throughout the pipeline.
    self.spaces = {
        "templates": "infospace-templates",
        "sources": "infospace-sources",
        "guidelines": "infospace-guidelines",
        "vsm-reference": "infospace-vsm-reference",
        "entities": "infospace-entities",
        "mappings": "infospace-mappings",
        "analyses": "infospace-analyses",
        "metrics": "infospace-metrics",
    }
# ── Artifact Management ──────────────────────────────────────────
def load_or_create_artifact(
    self,
    space: str,
    filepath: Path,
    artifact_type: ArtifactType,
    name: Optional[str] = None,
) -> Artifact:
    """Return the artifact named *name* in *space*, creating it from *filepath* if absent.

    The file is always read first. When an artifact with the same name
    already exists in the repository it is returned unchanged (its content
    is not refreshed from disk). *name* defaults to the file's stem.
    """
    artifact_name = filepath.stem if name is None else name
    file_text = filepath.read_text()

    already_stored = self.artifact_repo.get_by_name(space, artifact_name)
    if already_stored:
        return already_stored

    created = self.artifact_repo.create(
        Artifact.create(
            space_id=space,
            name=artifact_name,
            content=file_text,
            artifact_type=artifact_type,
        )
    )
    print(f" + {artifact_name} ({created.content_digest[:8]})")
    return created
def store_output_artifact(
    self, space: str, name: str, content: str, artifact_type: ArtifactType
) -> Artifact:
    """Persist generated *content* under *name*, replacing any earlier version.

    An existing artifact with the same name is deleted first and a fresh
    one is created in its place; the newly created artifact is returned.
    """
    previous = self.artifact_repo.get_by_name(space, name)
    if previous:
        # Replace rather than update in place: drop the old record first.
        self.artifact_repo.delete(previous.id)

    replacement = Artifact.create(
        space_id=space, name=name, content=content, artifact_type=artifact_type
    )
    return self.artifact_repo.create(replacement)
def bind_macro_artifact(self, space: str, macro_name: str, content: str) -> Artifact:
    """Make *content* resolvable as macro *macro_name* inside *space*.

    Any artifact already stored under that name is deleted, then a new
    ``ArtifactType.CONTENT`` artifact holding *content* is created and returned.
    """
    stale = self.artifact_repo.get_by_name(space, macro_name)
    if stale:
        self.artifact_repo.delete(stale.id)

    binding = Artifact.create(
        space_id=space,
        name=macro_name,
        content=content,
        artifact_type=ArtifactType.CONTENT,
    )
    return self.artifact_repo.create(binding)
# ── Setup ────────────────────────────────────────────────────────
def setup(self):
    """Register the static inputs: templates, VSM reference and guidelines."""
    print("Loading artifacts...")
    artifacts_root = self.example_dir / "artifacts"

    # Templates: one artifact per markdown file, named after the file stem.
    for template_path in (self.example_dir / "templates").glob("*.md"):
        self.load_or_create_artifact(
            self.spaces["templates"], template_path, ArtifactType.TEMPLATE
        )

    # VSM reference: every file is registered under the single name
    # "vsm_framework", so once one exists later files resolve to it.
    for reference_path in (artifacts_root / "vsm-reference").glob("*.md"):
        self.load_or_create_artifact(
            self.spaces["vsm-reference"],
            reference_path,
            ArtifactType.CONTENT,
            name="vsm_framework",
        )

    # Guidelines: known files get their macro name, others use the stem.
    macro_names = {
        "extraction-rules.md": "extraction_rules",
        "mapping-rules.md": "mapping_rules",
    }
    for guideline_path in (artifacts_root / "guidelines").glob("*.md"):
        if guideline_path.name in macro_names:
            macro = macro_names[guideline_path.name]
        else:
            macro = guideline_path.stem
        self.load_or_create_artifact(
            self.spaces["guidelines"], guideline_path, ArtifactType.CONTENT, name=macro
        )

    print(" Done.\n")
# ── Template Resolution ──────────────────────────────────────────
def resolve_and_compile(
    self, template_name: str, extra_spaces: list[str]
) -> Optional[str]:
    """Turn the stored template *template_name* into a final prompt string.

    The template is looked up in the templates space, analyzed for macros,
    resolved against the spaces named by *extra_spaces* (keys of
    ``self.spaces``) and compiled. Returns ``None`` after printing an error
    when the template is missing or resolution fails.
    """
    templates_space = self.spaces["templates"]
    stored = self.artifact_repo.get_by_name(templates_space, template_name)
    if not stored:
        print(f" ERROR: Template '{template_name}' not found")
        return None

    template = PromptTemplate.from_artifact(stored)
    body = stored.content

    # Populate the template's macro information before resolving.
    self.analyzer.analyze(template, body)

    searched_spaces = [self.spaces[key] for key in extra_spaces]
    config = ResolutionConfig(
        space_id=templates_space,
        included_spaces=searched_spaces,
    )
    resolver = PromptResolver(self.artifact_service, MultiSpaceResolutionStrategy())
    resolution = resolver.resolve_template(template, config)
    if not resolution.success:
        print(f" ERROR: Resolution failed: {resolution.context.errors}")
        return None

    # Splice the resolved content into the template text.
    return self.compiler.compile(template, body, resolution).content
# ── LLM Execution Helpers ─────────────────────────────────────────
def _call_llm(self, prompt: str, stage_label: str, max_tokens: int = 8192) -> Optional[str]:
    """Call the LLM and return the content string, or ``None`` on failure.

    Rate-limit errors are retried up to 3 times with a linearly growing
    wait (15s, 30s, 45s). Any other exception aborts immediately. Empty or
    whitespace-only responses are treated as failures.

    Does **not** write any files — callers decide where to persist.

    Args:
        prompt: Fully compiled prompt text.
        stage_label: Short label used only in progress messages.
        max_tokens: Token limit passed to the adapter via ``RunConfig``.
    """
    import time as _time
    from markitect.prompts.execution.models import RunConfig
    from markitect.llm.exceptions import LLMRateLimitError

    print(f" Calling LLM ({stage_label})...")
    t0 = _time.time()
    max_retries = 3
    for attempt in range(max_retries + 1):
        try:
            response = self.llm_adapter.execute_prompt(prompt, RunConfig(max_tokens=max_tokens))
            break  # success
        except LLMRateLimitError as exc:
            if attempt >= max_retries:
                print(f" LLM rate limit after {max_retries} retries ({_time.time() - t0:.1f}s): {exc}")
                return None
            wait = 15 * (attempt + 1)  # 15, 30, 45 seconds
            print(f" Rate limited, retrying in {wait}s (attempt {attempt + 1}/{max_retries})...")
            _time.sleep(wait)
        except Exception as exc:
            # Best-effort boundary: report and let the caller pause the pipeline.
            print(f" LLM error ({_time.time() - t0:.1f}s): {exc}")
            return None

    elapsed = _time.time() - t0
    # An adapter may report no usage at all; fall back to "?" placeholders
    # instead of failing after a successful call.
    usage = response.usage or {}
    print(
        f" LLM done in {elapsed:.1f}s — "
        f"prompt {usage.get('prompt_tokens', '?')} tok, "
        f"completion {usage.get('completion_tokens', '?')} tok, "
        f"total {usage.get('total_tokens', '?')} tok"
    )

    content = response.content
    if not content or not content.strip():
        print(f" LLM returned empty content")
        return None
    return content
def _execute_llm(self, prompt: str, output_file: Path, stage_label: str, max_tokens: int = 8192) -> Optional[str]:
    """Run the prompt through the LLM and persist a non-empty answer to *output_file*.

    Returns whatever ``_call_llm`` returned; nothing is written when that
    value is falsy. The parent directory of *output_file* is created on demand.
    """
    answer = self._call_llm(prompt, stage_label, max_tokens=max_tokens)
    if not answer:
        return answer

    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text(answer)
    print(f" LLM output written to {output_file.name}")
    return answer
# ── Entity Management (flat canonical set) ─────────────────────
@staticmethod
def _normalize_entity_name(name: str) -> str:
"""Normalize an entity name to a kebab-case filename stem."""
slug = name.lower().strip()
slug = slug.replace("_", "-").replace(" ", "-")
slug = re.sub(r"[^a-z0-9-]", "", slug)
slug = re.sub(r"-{2,}", "-", slug)
return slug.strip("-")
def _entities_dir(self) -> Path:
return self.example_dir / "output" / "entities"
def _archive_dir(self) -> Path:
return self._entities_dir() / "archive"
def _list_existing_entity_names(self) -> list[str]:
"""Return sorted slugs of all canonical entity files already on disk."""
return sorted(
f.stem
for f in self._entities_dir().glob("*.md")
if not f.name.endswith("-entities.md")
and not f.name.endswith("-prompt.md")
)
def archive_entity(self, slug: str, reason: str) -> None:
"""Move a canonical entity to the archive with a documented reason.
The entity file is prepended with an archive header explaining why
it was retired, then moved to ``output/entities/archive/<slug>.md``.
Chapter views that reference this entity are **not** updated
automatically — review and update them manually.
"""
src = self._entities_dir() / f"{slug}.md"
if not src.exists():
print(f" Entity not found: {slug}")
return
archive = self._archive_dir()
archive.mkdir(parents=True, exist_ok=True)
dest = archive / f"{slug}.md"
from datetime import date
header = (
f"<!-- ARCHIVED {date.today().isoformat()}\n"
f" Reason: {reason}\n"
f"-->\n\n"
)
content = src.read_text()
dest.write_text(header + content)
src.unlink()
# Report which chapter views still reference this entity
refs = []
for view in self._entities_dir().glob("*-entities.md"):
if f'include "{slug}.md"' in view.read_text():
refs.append(view.name)
print(f" Archived: {slug}.md -> archive/{slug}.md")
print(f" Reason: {reason}")
if refs:
print(f" Referenced by: {', '.join(refs)} (update these views)")
print(f" Canonical set: {len(self._list_existing_entity_names())} entities")
def _split_entities(
self, combined_content: str
) -> list[tuple[str, Path]]:
"""Split combined LLM output into the flat canonical entity directory.
Writes each entity to ``output/entities/<slug>.md``. If a file
with that slug already exists it is **skipped** (first-occurrence
wins), but the entity is still included in the returned list so
the chapter view can reference it.
Returns list of (entity_name, file_path) for every entity in
*combined_content* (new and pre-existing alike).
"""
entities_dir = self._entities_dir()
entities_dir.mkdir(parents=True, exist_ok=True)
parts = re.split(
r"^---\s*ENTITY:\s*(.+?)\s*---\s*$",
combined_content,
flags=re.MULTILINE,
)
entity_files: list[tuple[str, Path]] = []
new_count = 0
skipped_count = 0
for i in range(1, len(parts), 2):
entity_name = parts[i]
entity_content = parts[i + 1].strip() if i + 1 < len(parts) else ""
slug = self._normalize_entity_name(entity_name)
if not slug:
continue
file_path = entities_dir / f"{slug}.md"
if file_path.exists():
skipped_count += 1
else:
file_path.write_text(entity_content + "\n")
new_count += 1
entity_files.append((entity_name, file_path))
msg = f" {new_count} new entities written"
if skipped_count:
msg += f", {skipped_count} pre-existing (skipped)"
print(msg)
return entity_files
def _write_chapter_entity_view(
self, chapter_id: str, entity_files: list[tuple[str, Path]]
) -> Path:
"""Write a per-chapter view file that transcludes individual entities."""
parts = chapter_id.split("-")
book_num = int(parts[1]) if len(parts) >= 2 else 1
ch_num = int(parts[3]) if len(parts) >= 4 else 0
roman = {1: "I", 2: "II", 3: "III", 4: "IV", 5: "V"}.get(book_num, str(book_num))
title = f"# Economic Entities — Book {roman}, Chapter {ch_num}\n"
lines = [title]
for _name, file_path in entity_files:
lines.append(f'{{{{ include "{file_path.name}" }}}}')
lines.append("")
lines.append("---")
lines.append("")
# Remove trailing separator after last entity
if lines and lines[-1] == "" and len(lines) >= 3 and lines[-2] == "---":
lines = lines[:-2]
view_path = self._entities_dir() / f"{chapter_id}-entities.md"
view_path.write_text("\n".join(lines) + "\n")
print(f" Chapter view written to {view_path.name}")
return view_path
def _read_entities_from_view(
    self, chapter_id: str
) -> tuple[str, list[tuple[str, Path]]]:
    """Rebuild the delimited entity text for a chapter from its view file.

    The include directives in ``<chapter_id>-entities.md`` name the
    canonical entity files of the chapter. Each existing file is read and
    emitted as a ``--- ENTITY: <slug> ---`` section; includes pointing at
    missing files are silently ignored.

    Returns:
        ``(combined_text, [(slug, path), ...])``; *combined_text* is empty
        when no included file exists.
    """
    from markitect.packaging.transclusion.directives import DirectiveParser

    base_dir = self._entities_dir()
    view_text = (base_dir / f"{chapter_id}-entities.md").read_text()

    found: list[tuple[str, Path]] = []
    sections: list[str] = []
    for include_target in DirectiveParser.extract_file_includes(view_text):
        entity_path = base_dir / include_target
        if entity_path.exists():
            slug = entity_path.stem
            text = entity_path.read_text().strip()
            sections.append(f"--- ENTITY: {slug} ---\n\n{text}")
            found.append((slug, entity_path))

    if not sections:
        return "", found
    return "\n\n".join(sections) + "\n", found
# ── Pipeline Stages ──────────────────────────────────────────────
def stage_extract_entities(self, chapter_id: str, chapter_content: str) -> Optional[str]:
    """Stage 1: obtain the combined entity text for one chapter.

    Sources are tried in this order:

    1. an existing chapter view containing ``{{ include`` directives,
    2. a per-chapter subdirectory of entity files (older layout), which is
       migrated into the flat directory,
    3. a legacy combined ``<chapter_id>-entities.md`` with
       ``--- ENTITY:`` delimiters, which is split,
    4. a fresh LLM call, when an adapter is configured.

    The compiled prompt is written to ``<chapter_id>-prompt.md`` only when
    no transclusion view exists yet. Returns the combined entity text, or
    ``None`` when the prompt could not be compiled or no output is
    available yet.
    """
    print(f" [1/3] Extracting entities...")

    # Macros consumed by the extract-entities template.
    self.bind_macro_artifact(self.spaces["sources"], "chapter_text", chapter_content)
    known_slugs = self._list_existing_entity_names()
    entity_list = (
        "\n".join(f"- {name}" for name in known_slugs)
        if known_slugs
        else "(none — this is the first chapter)"
    )
    self.bind_macro_artifact(
        self.spaces["entities"], "existing_entities", entity_list
    )

    prompt = self.resolve_and_compile(
        "extract-entities",
        ["sources", "guidelines", "vsm-reference", "entities"],
    )
    if not prompt:
        return None

    def _store(text: str) -> None:
        # Every branch records its result under the same artifact name.
        self.store_output_artifact(
            self.spaces["entities"],
            f"{chapter_id}-entities",
            text,
            ArtifactType.GENERATED,
        )

    view_file = self._entities_dir() / f"{chapter_id}-entities.md"
    prompt_file = self._entities_dir() / f"{chapter_id}-prompt.md"
    has_transclusion_view = view_file.exists() and "{{ include" in view_file.read_text()

    # ── PRIMARY: chapter view with transclusion already on disk ──
    if has_transclusion_view:
        content, entity_files = self._read_entities_from_view(chapter_id)
        _store(content)
        print(f" Found chapter view referencing {len(entity_files)} entities")
        return content

    # No finished view: keep the compiled prompt next to the outputs. It is
    # deliberately not rewritten once a view exists, so rebuilds leave the
    # working tree untouched.
    prompt_file.parent.mkdir(parents=True, exist_ok=True)
    prompt_file.write_text(prompt)
    print(f" Prompt written to {prompt_file.relative_to(self.example_dir)}")

    # ── MIGRATION: per-chapter subdirectory (previous format) ──
    subdir = self._entities_dir() / chapter_id
    if subdir.is_dir() and list(subdir.glob("*.md")):
        print(f" Migrating per-chapter subdir: {chapter_id}/")
        flat_dir = self._entities_dir()
        migrated: list[tuple[str, Path]] = []
        for old_path in sorted(subdir.glob("*.md")):
            new_path = flat_dir / old_path.name
            if not new_path.exists():
                old_path.rename(new_path)
            migrated.append((old_path.stem, new_path))
        # Drop the subdirectory only once nothing is left in it.
        if not list(subdir.glob("*")):
            subdir.rmdir()
        self._write_chapter_entity_view(chapter_id, migrated)
        content = self._read_entities_from_view(chapter_id)[0]
        _store(content)
        return content

    # ── MIGRATION: legacy combined file (pre-split format) ──
    if view_file.exists():
        raw = view_file.read_text()
        if "--- ENTITY:" in raw:
            print(f" Migrating legacy combined file: {view_file.name}")
            self._write_chapter_entity_view(chapter_id, self._split_entities(raw))
            _store(raw)
            return raw

    # ── GENERATE: call LLM, persist individual files first ──
    if self.llm_adapter:
        combined = self._call_llm(prompt, "entities")
        if combined:
            self._write_chapter_entity_view(chapter_id, self._split_entities(combined))
            _store(combined)
            return combined

    print(f" Awaiting entity files in: output/entities/")
    return None
def stage_map_to_vsm(self, chapter_id: str, entities_content: str) -> Optional[str]:
    """Stage 2: produce (or reuse) the entity-to-VSM mapping for a chapter.

    An existing ``output/mappings/<chapter_id>-mappings.md`` is reused and
    re-registered as an artifact. Otherwise the compiled prompt is saved
    alongside and, if an adapter is configured, the LLM generates the
    mapping. Returns the mapping text or ``None`` when none is available.
    """
    print(f" [2/3] Mapping to VSM...")
    self.bind_macro_artifact(self.spaces["entities"], "entities", entities_content)

    prompt = self.resolve_and_compile(
        "map-to-vsm", ["entities", "vsm-reference", "guidelines"]
    )
    if not prompt:
        return None

    mappings_dir = self.example_dir / "output" / "mappings"
    output_file = mappings_dir / f"{chapter_id}-mappings.md"
    artifact_name = f"{chapter_id}-mappings"

    if output_file.exists():
        # Reuse path: the prompt file is left alone so rebuilds stay clean.
        content = output_file.read_text()
        self.store_output_artifact(
            self.spaces["mappings"], artifact_name, content, ArtifactType.GENERATED
        )
        print(f" Found existing output: {output_file.name}")
        return content

    prompt_file = mappings_dir / f"{chapter_id}-prompt.md"
    prompt_file.parent.mkdir(parents=True, exist_ok=True)
    prompt_file.write_text(prompt)
    print(f" Prompt written to {prompt_file.relative_to(self.example_dir)}")

    if self.llm_adapter:
        content = self._execute_llm(prompt, output_file, "mappings")
        if content:
            self.store_output_artifact(
                self.spaces["mappings"], artifact_name, content, ArtifactType.GENERATED
            )
            return content

    print(f" Awaiting output at: {output_file.relative_to(self.example_dir)}")
    return None
def stage_synthesize_analysis(
    self, chapter_id: str, chapter_content: str, entities_content: str, mappings_content: str
) -> Optional[str]:
    """Stage 3: produce (or reuse) the chapter-level VSM analysis.

    An existing ``output/analyses/<chapter_id>-analysis.md`` is reused and
    re-registered as an artifact. Otherwise the compiled prompt is saved
    alongside and, if an adapter is configured, the LLM generates the
    analysis. Returns the analysis text or ``None`` when none is available.
    """
    print(f" [3/3] Synthesizing analysis...")
    macro_bindings = (
        ("sources", "chapter_text", chapter_content),
        ("entities", "entities", entities_content),
        ("mappings", "mappings", mappings_content),
    )
    for space_key, macro_name, text in macro_bindings:
        self.bind_macro_artifact(self.spaces[space_key], macro_name, text)

    prompt = self.resolve_and_compile(
        "synthesize-analysis",
        ["sources", "entities", "mappings", "vsm-reference"],
    )
    if not prompt:
        return None

    analyses_dir = self.example_dir / "output" / "analyses"
    output_file = analyses_dir / f"{chapter_id}-analysis.md"
    artifact_name = f"{chapter_id}-analysis"

    if output_file.exists():
        # Reuse path: the prompt file is left alone so rebuilds stay clean.
        content = output_file.read_text()
        self.store_output_artifact(
            self.spaces["analyses"], artifact_name, content, ArtifactType.GENERATED
        )
        print(f" Found existing output: {output_file.name}")
        return content

    prompt_file = analyses_dir / f"{chapter_id}-prompt.md"
    prompt_file.parent.mkdir(parents=True, exist_ok=True)
    prompt_file.write_text(prompt)
    print(f" Prompt written to {prompt_file.relative_to(self.example_dir)}")

    if self.llm_adapter:
        content = self._execute_llm(prompt, output_file, "analysis")
        if content:
            self.store_output_artifact(
                self.spaces["analyses"], artifact_name, content, ArtifactType.GENERATED
            )
            return content

    print(f" Awaiting output at: {output_file.relative_to(self.example_dir)}")
    return None
# ── Metrics ──────────────────────────────────────────────────────
def assess_metrics(self) -> Optional[str]:
    """Produce (or reuse) the metrics report covering all finished analyses.

    Every ``*-analysis.md`` under ``output/analyses`` is concatenated (each
    prefixed with a source comment) and bound as the ``all_analyses``
    macro. An existing ``output/metrics/metrics-report.md`` is reused;
    otherwise the prompt is saved and, if an adapter is configured, the LLM
    generates the report. Returns the report text, or ``None`` when there
    are no analyses, compilation fails, or no output is available yet.
    """
    print("Assessing metrics...")
    analysis_files = sorted((self.example_dir / "output" / "analyses").glob("*-analysis.md"))
    if not analysis_files:
        print(" No completed analyses found. Process chapters first.")
        return None

    # One block per analysis, tagged with the file it came from.
    tagged = [
        f"<!-- Source: {path.name} -->\n{path.read_text()}" for path in analysis_files
    ]
    self.bind_macro_artifact(
        self.spaces["analyses"], "all_analyses", "\n\n---\n\n".join(tagged)
    )

    prompt = self.resolve_and_compile(
        "assess-metrics", ["analyses", "vsm-reference"]
    )
    if not prompt:
        return None

    metrics_dir = self.example_dir / "output" / "metrics"
    output_file = metrics_dir / "metrics-report.md"

    if output_file.exists():
        # Reuse path: the prompt file is left alone so rebuilds stay clean.
        content = output_file.read_text()
        self.store_output_artifact(
            self.spaces["metrics"], "metrics-report", content, ArtifactType.GENERATED
        )
        print(f" Found existing output: {output_file.name}")
        return content

    prompt_file = metrics_dir / "metrics-prompt.md"
    prompt_file.parent.mkdir(parents=True, exist_ok=True)
    prompt_file.write_text(prompt)
    print(f" Prompt written to {prompt_file.relative_to(self.example_dir)}")

    if self.llm_adapter:
        content = self._execute_llm(prompt, output_file, "metrics")
        if content:
            self.store_output_artifact(
                self.spaces["metrics"], "metrics-report", content, ArtifactType.GENERATED
            )
            return content

    print(f" Awaiting output at: {output_file.relative_to(self.example_dir)}")
    return None
# ── Entity Evaluation (Task 9) ────────────────────────────────────
def _extract_quality_rubric(self) -> str:
"""Extract the Quality Metrics section from the entity schema file."""
schema_file = self.example_dir / "schemas" / "economic-entity-schema-v1.0.md"
text = schema_file.read_text()
# Find the ## Quality Metrics section up to the next ## section
import re as _re
m = _re.search(
r"^## Quality Metrics\n(.*?)^## ",
text,
flags=_re.MULTILINE | _re.DOTALL,
)
if m:
return ("## Quality Metrics\n" + m.group(1)).strip()
return text # fallback: whole schema
def _extract_source_chapter_from_entity(self, entity_text: str) -> str:
"""Extract the Source Chapter field from an entity markdown file."""
import re as _re
m = _re.search(
r"^## Source Chapter\s*\n+(.+?)(?:\n\n|\n##|\Z)",
entity_text,
flags=_re.MULTILINE | _re.DOTALL,
)
if m:
return m.group(1).strip()
return "Unknown chapter"
def evaluate_entities(self, chapter_id: Optional[str] = None) -> None:
    """Run the evaluate-entity template over canonical entities.

    With *chapter_id*, only the entities referenced by that chapter's view
    file are evaluated; without it, every canonical entity on disk is.
    Results go to ``output/evaluations/<slug>-eval.md``; entities that
    already have such a file are skipped. Without an LLM adapter only the
    prompt files are written (and counted as done).
    """
    evaluations_dir = self.example_dir / "output" / "evaluations"
    evaluations_dir.mkdir(parents=True, exist_ok=True)

    # Work out the (slug, path) pairs to process.
    if chapter_id:
        chapter_view = self._entities_dir() / f"{chapter_id}-entities.md"
        if not chapter_view.exists():
            print(f" No chapter view found for {chapter_id}")
            return
        targets = self._read_entities_from_view(chapter_id)[1]
        if not targets:
            print(f" No entities found for chapter {chapter_id}")
            return
        print(f"Evaluating {len(targets)} entities from {chapter_id}...")
    else:
        targets = [
            (slug, self._entities_dir() / f"{slug}.md")
            for slug in self._list_existing_entity_names()
        ]
        print(f"Evaluating {len(targets)} canonical entities...")
        if not targets:
            print(" No entities to evaluate.")
            return

    # The rubric is the same for every entity, so bind it once up front.
    self.bind_macro_artifact(
        self.spaces["guidelines"], "quality_rubric", self._extract_quality_rubric()
    )

    done = skipped = failed = 0
    for slug, entity_path in targets:
        output_file = evaluations_dir / f"{slug}-eval.md"
        if output_file.exists():
            skipped += 1
            continue
        if not entity_path.exists():
            print(f" MISSING: {entity_path.name}")
            failed += 1
            continue

        entity_text = entity_path.read_text()
        # Per-entity macros: the entity itself and the chapter it cites.
        self.bind_macro_artifact(self.spaces["entities"], "entity_content", entity_text)
        self.bind_macro_artifact(
            self.spaces["sources"],
            "source_chapter",
            self._extract_source_chapter_from_entity(entity_text),
        )

        prompt = self.resolve_and_compile(
            "evaluate-entity",
            ["entities", "sources", "vsm-reference", "guidelines"],
        )
        if not prompt:
            print(f" FAILED to compile prompt for {slug}")
            failed += 1
            continue

        # The prompt is saved only while no evaluation output exists.
        if not output_file.exists():
            (evaluations_dir / f"{slug}-eval-prompt.md").write_text(prompt)

        if not self.llm_adapter:
            print(f" {slug}: prompt written, awaiting manual evaluation")
            done += 1
            continue

        print(f" Evaluating: {slug}...")
        if self._execute_llm(prompt, output_file, f"eval:{slug}", max_tokens=1024):
            done += 1
        else:
            failed += 1

    total = done + skipped + failed
    print(f"\nEvaluation complete: {done} done, {skipped} skipped (existing), {failed} failed — {total} total")
# ── Chapter Processing ───────────────────────────────────────────
def process_chapter(self, chapter_id: str, auto_commit: bool = True):
    """Drive one chapter through extraction, mapping and synthesis.

    The source is read from ``artifacts/sources/<chapter_id>.md``. The
    pipeline stops with a message as soon as a stage returns ``None``.
    When all three stages succeed, dependency edges are recorded and, if
    *auto_commit* is true, the outputs are committed to git.
    """
    source_file = self.example_dir / "artifacts" / "sources" / f"{chapter_id}.md"
    if not source_file.exists():
        print(f"ERROR: Source file not found: {source_file}")
        return

    print(f"Processing: {chapter_id}")
    print(f"{'=' * 60}")
    chapter_text = source_file.read_text()

    # Register the chapter source itself as an artifact.
    self.load_or_create_artifact(
        self.spaces["sources"], source_file, ArtifactType.CONTENT
    )

    # Stage 1
    entities = self.stage_extract_entities(chapter_id, chapter_text)
    if entities is None:
        print(f"\n Pipeline paused. Generate entities output and re-run.")
        return

    # Stage 2
    mappings = self.stage_map_to_vsm(chapter_id, entities)
    if mappings is None:
        print(f"\n Pipeline paused. Generate mappings output and re-run.")
        return

    # Stage 3
    analysis = self.stage_synthesize_analysis(
        chapter_id, chapter_text, entities, mappings
    )
    if analysis is None:
        print(f"\n Pipeline paused. Generate analysis output and re-run.")
        return

    print(f"\n Chapter {chapter_id} fully processed.")
    self._record_chapter_dependencies(chapter_id)
    if auto_commit:
        self._git_commit_chapter(chapter_id)
def _record_chapter_dependencies(self, chapter_id: str):
    """Persist source -> run -> output dependency edges for a chapter.

    A manifest for run ``run-<chapter_id>`` gets a "requires" edge from
    the source artifact and a "generates" edge to each stored output
    artifact (entities, mappings, analysis). Artifacts that are not found
    are skipped. Persistence failures are reported as a warning only.
    """
    run_id = f"run-{chapter_id}"
    manifest = RunManifest.create(
        run_id=run_id,
        template_id="extract-entities",
        template_name="extract-entities",
        template_digest="",
    )

    # Source → Run
    source_artifact = self.artifact_repo.get_by_name(self.spaces["sources"], chapter_id)
    if source_artifact:
        manifest.add_dependency_edge(source_artifact.id, run_id, "requires")

    # Run → Outputs; (space key, artifact-name suffix) per stage.
    stage_outputs = (
        ("entities", "entities"),
        ("mappings", "mappings"),
        ("analyses", "analysis"),
    )
    for space_key, suffix in stage_outputs:
        produced = self.artifact_repo.get_by_name(
            self.spaces[space_key], f"{chapter_id}-{suffix}"
        )
        if produced:
            manifest.add_dependency_edge(run_id, produced.id, "generates")

    try:
        persisted = self.graph_builder.persist_edges(manifest)
        print(f" Recorded {len(persisted)} dependency edges.")
    except Exception as e:
        print(f" Warning: Could not record dependencies: {e}")
def _git_commit_chapter(self, chapter_id: str):
    """Commit chapter outputs to git (best effort).

    Stages the example's ``output`` directory and creates a commit whose
    message names *chapter_id*. Committing is optional: a failing git
    command (for example nothing to commit) or a git executable that
    cannot be launched only produces a warning, so a fully processed
    chapter never ends in a crash at this step.
    """
    output_dir = self.example_dir / "output"
    try:
        subprocess.run(
            ["git", "add", str(output_dir)],
            cwd=str(self.example_dir),
            check=True,
            capture_output=True,
        )
        subprocess.run(
            ["git", "commit", "-m", f"infospace: process {chapter_id}\n\n"
             f"Extract entities, map to VSM, and synthesize analysis\n"
             f"for {chapter_id}."],
            cwd=str(project_root),
            check=True,
            capture_output=True,
        )
        print(f" Git commit: infospace: process {chapter_id}")
    except subprocess.CalledProcessError as e:
        # Output was captured, so surface git's own explanation; the bare
        # exception text only contains the command and exit status.
        detail = (e.stderr or e.stdout or b"").decode(errors="replace").strip()
        suffix = f": {detail}" if detail else ""
        print(f" Warning: Git commit skipped ({e}){suffix}")
    except OSError as e:
        # git not installed / not executable, or the working directory is gone.
        print(f" Warning: Git commit skipped ({e})")
# ── Listing ──────────────────────────────────────────────────────
def list_chapters(self):
    """Print a status table of all source chapters.

    For each ``artifacts/sources/*.md`` chapter the table shows whether a
    transclusion view exists (with its include count), and whether the
    mapping and analysis outputs exist. Totals for canonical and archived
    entities follow when non-zero.
    """
    output_root = self.example_dir / "output"
    chapter_ids = sorted(
        path.stem for path in (self.example_dir / "artifacts" / "sources").glob("*.md")
    )

    print(f"Available chapters ({len(chapter_ids)}):\n")
    print(f" {'Chapter':<30} {'Entities':<12} {'Mappings':<12} {'Analysis':<12}")
    print(f" {'-'*30} {'-'*12} {'-'*12} {'-'*12}")

    for ch in chapter_ids:
        include_count = 0
        view_file = self._entities_dir() / f"{ch}-entities.md"
        if view_file.exists():
            view_text = view_file.read_text()
            if "{{ include" in view_text:
                from markitect.packaging.transclusion.directives import DirectiveParser
                include_count = len(DirectiveParser.extract_file_includes(view_text))

        entities = f"done ({include_count})" if include_count else "-"
        has_mappings = (output_root / "mappings" / f"{ch}-mappings.md").exists()
        has_analysis = (output_root / "analyses" / f"{ch}-analysis.md").exists()
        mappings = "done" if has_mappings else "-"
        analysis = "done" if has_analysis else "-"
        print(f" {ch:<30} {entities:<12} {mappings:<12} {analysis:<12}")

    total_entities = len(self._list_existing_entity_names())
    if total_entities:
        print(f"\n Canonical entity set: {total_entities} unique entities")

    archive_dir = self._archive_dir()
    if archive_dir.exists():
        archived_count = sum(1 for _ in archive_dir.glob("*.md"))
        if archived_count:
            print(f" Archived entities: {archived_count}")
# ── Statistics ───────────────────────────────────────────────────
def show_stats(self):
    """Print summary figures of the dependency graph.

    Any failure while fetching or printing the statistics is reported as
    "No data yet" rather than raised.
    """
    print("\nDependency Statistics:")
    report_lines = (
        ("Nodes", "total_nodes"),
        ("Edges", "total_edges"),
        ("Root artifacts", "root_count"),
        ("Leaf artifacts", "leaf_count"),
        ("Has cycles", "has_cycles"),
    )
    try:
        stats = self.query_service.get_dependency_stats()
        for label, key in report_lines:
            print(f" {label}: {stats[key]}")
    except Exception as e:
        print(f" (No data yet: {e})")
# ── Infospace tooling integration ─────────────────────────────────
def _load_infospace(example_dir: Path):
    """Read ``infospace.yaml`` and parse the entity directory it points to.

    Exits the process with status 1 when *example_dir* has no
    ``infospace.yaml``. Returns ``(config, config_path, entities)``;
    *entities* is an empty list when the configured entities directory
    does not exist.
    """
    from markitect.infospace.config import load_infospace_config
    from markitect.infospace.entity_parser import parse_entity_directory

    config_path = example_dir / "infospace.yaml"
    if not config_path.is_file():
        print("Error: No infospace.yaml found. Create one first.")
        sys.exit(1)

    config = load_infospace_config(config_path)
    entities_dir = example_dir / config.entities_dir
    if entities_dir.is_dir():
        entities = parse_entity_directory(entities_dir)
    else:
        entities = []
    return config, config_path, entities
def _run_infospace_status(example_dir: Path):
    """Print a status summary of the infospace rooted at *example_dir*.

    Shows the topic name, domain and entity count, then the domain and
    discipline lists when they are non-empty, and finally how many chapter
    sources have a matching ``*-analysis.md`` file under ``output/analyses``.
    """
    from markitect.infospace.state import build_state

    config, _, entities = _load_infospace(example_dir)
    state = build_state(config, entities=entities)

    print(f"Infospace: {state.topic_name}")
    print(f"Domain: {config.topic.domain}")
    print(f"Entities: {state.entity_count}")
    if state.domains:
        domain_list = ", ".join(state.domains)
        print(f"Domains: {domain_list}")
    if config.disciplines:
        discipline_list = ", ".join(d.name for d in config.disciplines)
        print(f"Disciplines: {discipline_list}")

    # Processing progress: analysed chapters versus available source chapters.
    source_glob = (example_dir / "artifacts" / "sources").glob("*.md")
    analysis_glob = (example_dir / "output" / "analyses").glob("*-analysis.md")
    total_chapters = sum(1 for _ in source_glob)
    processed = sum(1 for _ in analysis_glob)
    print(f"Chapters: {processed}/{total_chapters} processed")
def _run_infospace_check(example_dir: Path):
    """Run the collection-level checks and record the results as a snapshot.

    Prints each concern of the check report with its fields, then a sorted
    metrics summary when the report has metrics, and finally the id of the
    snapshot returned by ``record_check_results``. Returns early, after a
    short message, when there are no entities to check.
    """
    from markitect.infospace.checks import run_all_checks
    from markitect.infospace.history import record_check_results

    config, _, entities = _load_infospace(example_dir)
    if not entities:
        print("No entities to check.")
        return

    entity_total = len(entities)
    print(f"Running collection checks on {entity_total} entities...\n")
    report = run_all_checks(entities=entities)

    for concern_name, concern_data in report.to_dict().items():
        label = concern_data.get("concern", concern_name.upper())
        # NOTE(review): label and name are printed with no separator between
        # them — confirm this is the intended layout.
        print(f" {label}{concern_name}")
        for field, value in concern_data.items():
            # The "concern" entry is already shown as the heading label.
            if field != "concern":
                print(f" {field}: {value}")
        print()

    summary = report.metrics()
    if summary:
        print("Metrics summary:")
        for metric_name, metric_value in sorted(summary.items()):
            print(f" {metric_name}: {metric_value:.4f}")

    snapshot = record_check_results(
        report, config, example_dir, entity_count=entity_total
    )
    print(f"\nRecorded snapshot {snapshot.snapshot_id}")
def _run_infospace_viability(example_dir: Path):
    """Print the viability dashboard for the infospace in *example_dir*.

    Three outcomes are possible:

    * no viability thresholds configured: a one-line notice;
    * thresholds but no recorded metrics: a hint to run ``--infospace-check``
      followed by the configured thresholds;
    * otherwise: one PASS/FAIL row per viability result and an overall
      viable/not-viable verdict with the pass count.
    """
    from markitect.infospace.history import read_metrics_file
    from markitect.infospace.state import build_state

    def describe(threshold):
        """Render the bounds that are set on *threshold* as ``min=…, max=…``."""
        candidates = (("min", threshold.min), ("max", threshold.max))
        return ", ".join(
            f"{label}={limit}" for label, limit in candidates if limit is not None
        )

    config, _, entities = _load_infospace(example_dir)
    if not config.viability:
        print("No viability thresholds configured.")
        return

    metrics = read_metrics_file(example_dir / config.metrics_dir / "metrics.yaml")
    if not metrics:
        # Nothing measured yet: show what would be evaluated.
        print("No metrics available. Run --infospace-check first.")
        print("\nConfigured thresholds:")
        for name, threshold in config.viability.items():
            print(f" {name}: {describe(threshold)}")
        return

    state = build_state(config, entities=entities, metrics=metrics)
    print(f"{'Metric':<30} {'Value':>8} {'Threshold':>15} {'Status':>8}")
    print("-" * 63)
    for result in state.viability_results:
        limits = describe(result.threshold)
        verdict = "PASS" if result.passed else "FAIL"
        print(f"{result.metric:<30} {result.value:>8.4f} {limits:>15} {verdict:>8}")
    print()

    answer = "YES" if state.is_viable else "NO"
    print(f"Viable: {answer} ({state.viability_pass_count}/{state.viability_total_count} thresholds met)")
def main():
    """Command-line entry point: parse arguments and run the selected action.

    Exactly one action flag is required; argparse enforces that the action
    flags are mutually exclusive. The archive, list, metrics and
    chapter-processing actions normally finish by printing the dependency
    statistics. The ``--stats``, ``--infospace-*`` and ``--evaluate`` actions
    return as soon as their own output is done.

    Exits with status 2 (via ``parser.error``) when ``--archive-entity`` is
    given without ``--reason``. That check runs before the LLM adapter is
    created and before ``processor.setup()``.
    """
    parser = argparse.ArgumentParser(
        description="Process Wealth of Nations chapters through VSM analysis pipeline"
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--chapter", type=str, help="Process a single chapter (e.g., book-1-chapter-01)")
    group.add_argument("--book", type=int, help="Process all chapters in a book (1-5)")
    group.add_argument("--all", action="store_true", help="Process all chapters")
    group.add_argument("--metrics", action="store_true", help="Assess metrics only")
    group.add_argument("--list", action="store_true", help="List available chapters")
    group.add_argument("--stats", action="store_true", help="Show dependency statistics")
    group.add_argument("--archive-entity", type=str, metavar="SLUG",
                       help="Archive an entity (move to archive/ with reason)")
    group.add_argument("--infospace-status", action="store_true",
                       help="Show infospace status via infospace tooling")
    group.add_argument("--infospace-check", action="store_true",
                       help="Run collection-level quality checks (C1-C5)")
    group.add_argument("--infospace-viability", action="store_true",
                       help="Show viability dashboard")
    group.add_argument("--evaluate", action="store_true",
                       help="Evaluate entity quality using the evaluate-entity template")
    parser.add_argument("--reason", type=str, default=None,
                        help="Reason for archiving (used with --archive-entity)")
    parser.add_argument("--eval-chapter", type=str, default=None, metavar="CHAPTER_ID",
                        help="Limit --evaluate to entities from a specific chapter")
    parser.add_argument("--no-commit", action="store_true", help="Skip git commits")
    parser.add_argument(
        "--provider",
        type=str,
        choices=["openrouter", "claude-code", "gemini", "openai"],
        default=None,
        help="LLM provider for auto-generating outputs (omit for manual mode)",
    )
    parser.add_argument("--model", type=str, default=None, help="Model name to pass to the LLM provider")
    args = parser.parse_args()

    # Reject invalid flag combinations up front, so a usage error is reported
    # before an LLM adapter is created or the processor is set up.
    if args.archive_entity and not args.reason:
        parser.error("--archive-entity requires --reason")

    # Build optional LLM adapter
    _PROVIDER_DEFAULTS = {
        "openrouter": "arcee-ai/trinity-large-preview:free",
    }
    llm_adapter = None
    if args.provider:
        from markitect.llm import create_adapter
        model = args.model or _PROVIDER_DEFAULTS.get(args.provider)
        llm_adapter = create_adapter(args.provider, model=model)
        print(f"LLM: {args.provider} ({model or 'default'})")

    example_dir = Path(__file__).parent
    processor = ChapterProcessor(example_dir, llm_adapter=llm_adapter)
    processor.setup()
    auto_commit = not args.no_commit
    sources_dir = example_dir / "artifacts" / "sources"

    if args.archive_entity:
        processor.archive_entity(args.archive_entity, args.reason)
    elif args.list:
        processor.list_chapters()
    elif args.stats:
        # Return here: falling through to the trailing show_stats() call
        # would print the statistics a second time.
        processor.show_stats()
        return
    elif args.metrics:
        processor.assess_metrics()
    elif args.chapter:
        processor.process_chapter(args.chapter, auto_commit=auto_commit)
    elif args.book is not None:
        # "is not None" so that "--book 0" reports "no chapters" instead of
        # being treated as if no action had been requested.
        chapters = sorted(
            f.stem for f in sources_dir.glob(f"book-{args.book}-chapter-*.md")
        )
        if not chapters:
            print(f"No chapters found for Book {args.book}")
            return
        print(f"Processing {len(chapters)} chapters from Book {args.book}\n")
        for ch in chapters:
            processor.process_chapter(ch, auto_commit=auto_commit)
            print()
    elif args.all:
        chapters = sorted(f.stem for f in sources_dir.glob("*.md"))
        print(f"Processing all {len(chapters)} chapters\n")
        for ch in chapters:
            processor.process_chapter(ch, auto_commit=auto_commit)
            print()
    elif args.infospace_status:
        _run_infospace_status(example_dir)
        return
    elif args.infospace_check:
        _run_infospace_check(example_dir)
        return
    elif args.infospace_viability:
        _run_infospace_viability(example_dir)
        return
    elif args.evaluate:
        processor.evaluate_entities(chapter_id=args.eval_chapter)
        return

    # Remaining actions end with a dependency-statistics summary.
    processor.show_stats()
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()