feat(infospace): add process command for batch source file processing
- Extend PipelineStage with name, output_dir, output_macro,
split_entities, and macros fields for declarative pipeline config
- Add SourcePipeline class (pipeline.py) using simple @{macro}
substitution — no SQLite dependency, skip-if-exists per stage,
LLM retry on rate limits, git commit per source
- Add `markitect infospace process [GLOB_PATTERN]` CLI command with
--all, --provider, --model, --check-after-each, --no-commit flags
- Update infospace.yaml with output_dir, output_macro, split_entities,
and macros for each pipeline stage in the WoN example
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -479,6 +479,117 @@ def disciplines(config_path: Optional[str]):
|
||||
click.echo(f" Error: {status.error}")
|
||||
|
||||
|
||||
# ── process ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@infospace_commands.command()
@click.argument("glob_pattern", default=None, required=False)
@click.option("--all", "process_all", is_flag=True, help="Process all source files.")
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
@click.option("--provider", default=None, help="LLM provider (openrouter, openai, etc.).")
@click.option("--model", default=None, help="LLM model name.")
@click.option(
    "--check-after-each",
    is_flag=True,
    help="Run collection checks (C1–C5) after each source file.",
)
@click.option("--no-commit", is_flag=True, help="Skip git commits.")
def process(
    glob_pattern: Optional[str],
    process_all: bool,
    config_path: Optional[str],
    provider: Optional[str],
    model: Optional[str],
    check_after_each: bool,
    no_commit: bool,
) -> None:
    """Process source files through the pipeline defined in infospace.yaml.

    GLOB_PATTERN is matched against the sources directory declared in
    infospace.yaml (default ``*.md``).  Use ``--all`` to process every
    source file.

    \b
    Examples:
      # Process chapters 1-3 from book 1
      markitect infospace process "book-1-chapter-0[1-3].md" --provider openrouter

    \b
      # Process all source files and check metrics after each
      markitect infospace process --all --provider openrouter --check-after-each

    \b
      # Dry run — load existing outputs only, no LLM calls
      markitect infospace process --all
    """
    # The docstring above is rendered verbatim as the command's --help text, so
    # developer notes live in comments instead.  Each example paragraph carries
    # its own ``\b`` marker because Click only disables re-wrapping for the one
    # paragraph that directly follows the marker.
    #
    # Parameters (all supplied by Click):
    #   glob_pattern     -- optional glob relative to the sources directory.
    #   process_all      -- when set, every ``*.md`` source is selected.
    #   config_path      -- explicit path to infospace.yaml, or None.
    #   provider, model  -- LLM selection; without a provider no adapter is built.
    #   check_after_each -- run the pipeline's collection check after each source.
    #   no_commit        -- forwarded to SourcePipeline to skip git commits.
    #
    # Exits with status 1 (SystemExit) when the config has no pipeline stages or
    # the sources directory does not exist.
    cfg, cfg_path = _load_config_or_exit(config_path)
    root = cfg_path.parent

    if not cfg.pipeline or not cfg.pipeline.stages:
        click.echo(
            "Error: No pipeline stages defined in infospace.yaml.\n"
            "Add a 'pipeline.stages' section with at least one stage.",
            err=True,
        )
        raise SystemExit(1)

    # Resolve sources directory (falls back to the config file's directory).
    sources_dir = root / cfg.topic.sources if cfg.topic.sources else root
    if not sources_dir.is_dir():
        click.echo(
            f"Error: Sources directory not found: {sources_dir}\n"
            f"Set 'topic.sources' in infospace.yaml.",
            err=True,
        )
        raise SystemExit(1)

    # Surface option combinations that would otherwise be dropped silently.
    if process_all and glob_pattern:
        click.echo(
            f"Warning: GLOB_PATTERN '{glob_pattern}' is ignored because --all was given.",
            err=True,
        )
    if model and not provider:
        click.echo(
            "Warning: --model has no effect without --provider.",
            err=True,
        )

    # Collect source files.  Only regular files are kept: a glob can also match
    # directories, which are not processable sources.
    pattern = "*.md" if process_all else (glob_pattern or "*.md")
    source_files = sorted(p for p in sources_dir.glob(pattern) if p.is_file())

    if not source_files:
        if process_all:
            click.echo(f"No source files found in {sources_dir}")
        else:
            click.echo(
                f"No files matched: {glob_pattern or '*.md'}\n"
                f"Sources directory: {sources_dir}"
            )
        return

    click.echo(f"Found {len(source_files)} source file(s) in {sources_dir.name}/")

    # Create LLM adapter (only when a provider was requested).
    adapter = None
    if provider:
        # Imported lazily so the command works without the LLM module
        # when no provider is used.
        from markitect.llm import create_adapter

        provider_defaults = {"openrouter": "arcee-ai/trinity-large-preview:free"}
        resolved_model = model or provider_defaults.get(provider)
        adapter = create_adapter(provider, model=resolved_model)
        click.echo(f"LLM: {provider} ({resolved_model or 'default'})")
    else:
        click.echo("No LLM provider — will use existing outputs only (manual mode).")

    # Run pipeline
    from markitect.infospace.pipeline import SourcePipeline

    pipeline = SourcePipeline(cfg, root, adapter=adapter, no_commit=no_commit)

    total = len(source_files)
    completed = 0
    for i, source_file in enumerate(source_files, 1):
        click.echo(f"\n[{i}/{total}] {source_file.name}")
        if pipeline.process_source(source_file):
            completed += 1
        # NOTE(review): the check runs after every source, successful or not,
        # as the --check-after-each help text describes — confirm this is the
        # intended nesting.
        if check_after_each:
            pipeline.run_collection_check()

    click.echo(f"\nDone: {completed}/{total} source file(s) fully processed.")
|
||||
|
||||
|
||||
# ── stale-mappings ──────────────────────────────────────────────────
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user