feat(infospace): add process command for batch source file processing

- Extend PipelineStage with name, output_dir, output_macro, split_entities, and macros fields for declarative pipeline config
- Add SourcePipeline class (pipeline.py) using simple @{macro} substitution — no SQLite dependency, skip-if-exists per stage, LLM retry on rate limits, git commit per source
- Add `markitect infospace process [GLOB_PATTERN]` CLI command with --all, --provider, --model, --check-after-each, --no-commit flags
- Update infospace.yaml with output_dir, output_macro, split_entities, and macros for each pipeline stage in the WoN example

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-19 13:29:32 +01:00
parent 4e0b27b075
commit c594bc3a38
4 changed files with 654 additions and 1 deletions
--- a/markitect/infospace/cli.py
+++ b/markitect/infospace/cli.py
@@ -479,6 +479,117 @@ def disciplines(config_path: Optional[str]):
            click.echo(f"  Error: {status.error}")


+# ── process ─────────────────────────────────────────────────────
+
+
+@infospace_commands.command()
+@click.argument("glob_pattern", default=None, required=False)
+@click.option("--all", "process_all", is_flag=True, help="Process all source files.")
+@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
+@click.option("--provider", default=None, help="LLM provider (openrouter, openai, etc.).")
+@click.option("--model", default=None, help="LLM model name.")
+@click.option(
+    "--check-after-each",
+    is_flag=True,
+    help="Run collection checks (C1–C5) after each source file.",
+)
+@click.option("--no-commit", is_flag=True, help="Skip git commits.")
+def process(
+    glob_pattern: Optional[str],
+    process_all: bool,
+    config_path: Optional[str],
+    provider: Optional[str],
+    model: Optional[str],
+    check_after_each: bool,
+    no_commit: bool,
+):
+    """Process source files through the pipeline defined in infospace.yaml.
+
+    GLOB_PATTERN is matched against the sources directory declared in
+    infospace.yaml (default ``*.md``).  Use ``--all`` to process every
+    ``*.md`` source file; ``--all`` takes precedence over GLOB_PATTERN.
+
+    \b
+    Examples:
+      # Process chapters 1-3 from book 1
+      markitect infospace process "book-1-chapter-0[1-3].md" --provider openrouter
+
+      # Process all source files and check metrics after each
+      markitect infospace process --all --provider openrouter --check-after-each
+
+      # Dry run — load existing outputs only, no LLM calls
+      markitect infospace process --all
+    """
+    # NOTE: the docstring above doubles as the click --help text, so parameter
+    # documentation lives in the option help strings rather than an Args section.
+    cfg, cfg_path = _load_config_or_exit(config_path)
+    root = cfg_path.parent
+
+    if not cfg.pipeline or not cfg.pipeline.stages:
+        click.echo(
+            "Error: No pipeline stages defined in infospace.yaml.\n"
+            "Add a 'pipeline.stages' section with at least one stage.",
+            err=True,
+        )
+        raise SystemExit(1)
+
+    # Resolve sources directory (falls back to the config file's directory)
+    sources_dir = root / cfg.topic.sources if cfg.topic.sources else root
+    if not sources_dir.is_dir():
+        click.echo(
+            f"Error: Sources directory not found: {sources_dir}\n"
+            "Set 'topic.sources' in infospace.yaml.",
+            err=True,
+        )
+        raise SystemExit(1)
+
+    # --all always wins; tell the user instead of silently dropping the pattern.
+    if process_all and glob_pattern:
+        click.echo(
+            f"Warning: --all given; ignoring pattern {glob_pattern!r}.",
+            err=True,
+        )
+
+    # Collect source files
+    pattern = "*.md" if process_all else (glob_pattern or "*.md")
+    try:
+        # Keep regular files only: a glob can also match directories, which
+        # must not be handed to the pipeline as source files.
+        source_files = sorted(
+            path for path in sources_dir.glob(pattern) if path.is_file()
+        )
+    except (ValueError, NotImplementedError) as exc:
+        # pathlib rejects malformed and absolute (non-relative) patterns.
+        click.echo(f"Error: Invalid glob pattern {pattern!r}: {exc}", err=True)
+        raise SystemExit(1) from None
+
+    if not source_files:
+        if process_all:
+            click.echo(f"No source files found in {sources_dir}")
+        else:
+            click.echo(
+                f"No files matched: {pattern}\n"
+                f"Sources directory: {sources_dir}"
+            )
+        return
+
+    click.echo(f"Found {len(source_files)} source file(s) in {sources_dir.name}/")
+
+    # Create LLM adapter (only when a provider was requested)
+    adapter = None
+    if provider:
+        from markitect.llm import create_adapter
+
+        # Fallback model per provider when --model is not given.
+        provider_defaults = {"openrouter": "arcee-ai/trinity-large-preview:free"}
+        resolved_model = model or provider_defaults.get(provider)
+        adapter = create_adapter(provider, model=resolved_model)
+        click.echo(f"LLM: {provider} ({resolved_model or 'default'})")
+    else:
+        click.echo("No LLM provider — will use existing outputs only (manual mode).")
+
+    # Run pipeline
+    from markitect.infospace.pipeline import SourcePipeline
+
+    pipeline = SourcePipeline(cfg, root, adapter=adapter, no_commit=no_commit)
+
+    total = len(source_files)
+    completed = 0
+    for i, source_file in enumerate(source_files, 1):
+        click.echo(f"\n[{i}/{total}] {source_file.name}")
+        success = pipeline.process_source(source_file)
+        if success:
+            completed += 1
+            # Collection checks run only after a fully processed source.
+            if check_after_each:
+                pipeline.run_collection_check()
+
+    click.echo(f"\nDone: {completed}/{total} source file(s) fully processed.")
+
+
 # ── stale-mappings ──────────────────────────────────────────────────