feat(infospace): add process command for batch source file processing

- Extend PipelineStage with name, output_dir, output_macro,
  split_entities, and macros fields for declarative pipeline config
- Add SourcePipeline class (pipeline.py) using simple @{macro}
  substitution — no SQLite dependency, skip-if-exists per stage,
  LLM retry on rate limits, git commit per source
- Add `markitect infospace process [GLOB_PATTERN]` CLI command with
  --all, --provider, --model, --check-after-each, --no-commit flags
- Update infospace.yaml with output_dir, output_macro, split_entities,
  and macros for each pipeline stage in the WoN example

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-19 13:29:32 +01:00
parent 4e0b27b075
commit c594bc3a38
4 changed files with 654 additions and 1 deletions

View File

@@ -155,13 +155,41 @@ class ViabilityThreshold:
@dataclass
class PipelineStage:
    """A single stage in the processing pipeline.

    Attributes:
        template: Path to the template file (relative to infospace root).
        name: Human-readable stage name used in progress output.
        output_dir: Directory for stage outputs (relative to root).
        output_macro: Macro name for this stage's output, also used as
            the filename suffix (e.g. ``entities`` → ``<id>-entities.md``).
        split_entities: If True, parse ``--- ENTITY: <name> ---`` delimiters
            from LLM output and write individual entity files.
        macros: Static macros loaded from files (macro name → relative path).
        spaces: Legacy space IDs for SQLite-based resolver (unused by
            :class:`SourcePipeline`).
    """

    template: str
    name: str = ""
    output_dir: str = ""
    output_macro: str = ""
    split_entities: bool = False
    macros: Dict[str, str] = field(default_factory=dict)
    spaces: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the stage to a plain dict.

        ``template`` is always present. Every other field is emitted only
        when it is truthy, so empty strings, ``False`` and empty containers
        are left out of the result.

        Returns:
            A new dict. ``macros`` and ``spaces`` are shallow copies, so
            mutating the returned dict never changes this stage.
        """
        d: Dict[str, Any] = {"template": self.template}
        # Scalar optionals, in the order they should appear in the output.
        for key in ("name", "output_dir", "output_macro", "split_entities"):
            value = getattr(self, key)
            if value:
                d[key] = value
        # Copy the containers so the caller cannot alias the stage's state.
        if self.macros:
            d["macros"] = dict(self.macros)
        if self.spaces:
            d["spaces"] = list(self.spaces)
        return d
@@ -170,6 +198,11 @@ class PipelineStage:
def from_dict(cls, data: Dict[str, Any]) -> PipelineStage:
    """Build a stage from its dict form.

    NOTE(review): the decorator for this method is outside the visible
    hunk; it is presumably ``@classmethod`` — confirm.

    Args:
        data: Mapping that must contain ``"template"``. The keys
            ``name``, ``output_dir``, ``output_macro``, ``split_entities``,
            ``macros`` and ``spaces`` are optional.

    Returns:
        A new instance of ``cls``. Optional keys that are missing or
        explicitly ``None`` (for example an entry left empty in a config
        file) fall back to the field defaults. ``macros`` and ``spaces``
        are copied, so the instance does not share containers with
        ``data``.

    Raises:
        KeyError: If ``data`` has no ``"template"`` key.
    """
    return cls(
        template=data["template"],
        # ``or`` also covers keys that are present but set to None, which
        # ``data.get(key, default)`` alone would pass through unchanged.
        name=data.get("name") or "",
        output_dir=data.get("output_dir") or "",
        output_macro=data.get("output_macro") or "",
        split_entities=bool(data.get("split_entities", False)),
        macros=dict(data.get("macros") or {}),
        spaces=list(data.get("spaces") or []),
    )