feat(infospace): add process command for batch source file processing

- Extend PipelineStage with name, output_dir, output_macro,
  split_entities, and macros fields for declarative pipeline config
- Add SourcePipeline class (pipeline.py) using simple @{macro}
  substitution — no SQLite dependency, skip-if-exists per stage,
  LLM retry on rate limits, git commit per source
- Add `markitect infospace process [GLOB_PATTERN]` CLI command with
  --all, --provider, --model, --check-after-each, --no-commit flags
- Update infospace.yaml with output_dir, output_macro, split_entities,
  and macros for each pipeline stage in the WoN example

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-19 13:29:32 +01:00
parent 4e0b27b075
commit c594bc3a38
4 changed files with 654 additions and 1 deletions

View File

@@ -479,6 +479,117 @@ def disciplines(config_path: Optional[str]):
click.echo(f" Error: {status.error}")
# ── process ─────────────────────────────────────────────────────
@infospace_commands.command()
@click.argument("glob_pattern", default=None, required=False)
@click.option("--all", "process_all", is_flag=True, help="Process all source files.")
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
@click.option("--provider", default=None, help="LLM provider (openrouter, openai, etc.).")
@click.option("--model", default=None, help="LLM model name.")
@click.option(
    "--check-after-each",
    is_flag=True,
    help="Run collection checks (C1–C5) after each source file.",
)
@click.option("--no-commit", is_flag=True, help="Skip git commits.")
def process(
    glob_pattern: Optional[str],
    process_all: bool,
    config_path: Optional[str],
    provider: Optional[str],
    model: Optional[str],
    check_after_each: bool,
    no_commit: bool,
) -> None:
    """Process source files through the pipeline defined in infospace.yaml.

    GLOB_PATTERN is matched against the sources directory declared in
    infospace.yaml (default ``*.md``). Use ``--all`` to process every
    source file; ``--all`` takes precedence over GLOB_PATTERN.

    \b
    Examples:
      # Process chapters 1-3 from book 1
      markitect infospace process "book-1-chapter-0[1-3].md" --provider openrouter
      # Process all source files and check metrics after each
      markitect infospace process --all --provider openrouter --check-after-each
      # Dry run — load existing outputs only, no LLM calls
      markitect infospace process --all
    """
    cfg, cfg_path = _load_config_or_exit(config_path)
    root = cfg_path.parent

    # Without at least one stage there is nothing to run.
    if not cfg.pipeline or not cfg.pipeline.stages:
        click.echo(
            "Error: No pipeline stages defined in infospace.yaml.\n"
            "Add a 'pipeline.stages' section with at least one stage.",
            err=True,
        )
        raise SystemExit(1)

    # Resolve sources directory (falls back to the config file's directory).
    sources_dir = root / cfg.topic.sources if cfg.topic.sources else root
    if not sources_dir.is_dir():
        click.echo(
            f"Error: Sources directory not found: {sources_dir}\n"
            f"Set 'topic.sources' in infospace.yaml.",
            err=True,
        )
        raise SystemExit(1)

    # Collect source files. --all always uses the default pattern; otherwise
    # the user's pattern is used, defaulting to "*.md" when omitted or empty.
    pattern = "*.md" if process_all else (glob_pattern or "*.md")
    try:
        # Keep regular files only: a directory that happens to match the
        # pattern is not a source file and must not reach the pipeline.
        source_files = sorted(
            path for path in sources_dir.glob(pattern) if path.is_file()
        )
    except (ValueError, NotImplementedError) as exc:
        # Path.glob rejects malformed and non-relative (absolute) patterns;
        # report that as a normal CLI error instead of a traceback.
        click.echo(f"Error: Invalid glob pattern {pattern!r}: {exc}", err=True)
        raise SystemExit(1) from exc

    if not source_files:
        if process_all:
            click.echo(f"No source files found in {sources_dir}")
        else:
            click.echo(
                f"No files matched: {glob_pattern or '*.md'}\n"
                f"Sources directory: {sources_dir}"
            )
        return

    click.echo(f"Found {len(source_files)} source file(s) in {sources_dir.name}/")

    # Create LLM adapter (only when a provider was requested).
    adapter = None
    if provider:
        # Imported lazily so the LLM module is only loaded when needed.
        from markitect.llm import create_adapter

        _PROVIDER_DEFAULTS = {"openrouter": "arcee-ai/trinity-large-preview:free"}
        # An explicit --model wins; otherwise use the per-provider default,
        # which may be None for providers without an entry.
        resolved_model = model or _PROVIDER_DEFAULTS.get(provider)
        adapter = create_adapter(provider, model=resolved_model)
        click.echo(f"LLM: {provider} ({resolved_model or 'default'})")
    else:
        if model:
            # A model name alone cannot select an adapter; say so rather
            # than silently dropping the option.
            click.echo("Warning: --model is ignored without --provider.", err=True)
        click.echo("No LLM provider — will use existing outputs only (manual mode).")

    # Run pipeline
    from markitect.infospace.pipeline import SourcePipeline

    pipeline = SourcePipeline(cfg, root, adapter=adapter, no_commit=no_commit)
    total = len(source_files)
    completed = 0
    for i, source_file in enumerate(source_files, 1):
        click.echo(f"\n[{i}/{total}] {source_file.name}")
        # process_source's result is treated as a success flag.
        if pipeline.process_source(source_file):
            completed += 1
        # The collection check runs after every file, successful or not.
        if check_after_each:
            pipeline.run_collection_check()

    click.echo(f"\nDone: {completed}/{total} source file(s) fully processed.")
# ── stale-mappings ──────────────────────────────────────────────────