feat(pipeline): per-stage max_tokens, LLM provenance, processing log

- PipelineStage now supports max_tokens to override the 4096 default
- SourcePipeline records provider/model on each entity file as an HTML comment
- output/processing-log.yaml tracks tokens, cost, duration, retries, errors
- _call_llm returns (content, metadata) for downstream traceability
- _http.py wraps JSON parse errors with body preview for debugging
- infospace.yaml stages: extract/map=6000 tokens, synthesize=3000 tokens

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-19 14:50:49 +01:00
parent 5ede1de4b8
commit df1fdf1842
4 changed files with 191 additions and 32 deletions
--- a/markitect/infospace/config.py
+++ b/markitect/infospace/config.py
@@ -168,6 +168,8 @@ class PipelineStage:
        macros: Static macros loaded from files (macro name → relative path).
        spaces: Legacy space IDs for SQLite-based resolver (unused by
            :class:`SourcePipeline`).
+        max_tokens: Maximum tokens to request from the LLM for this stage.
+            Overrides the pipeline-level default (4096).
    """

    template: str
@@ -177,6 +179,7 @@ class PipelineStage:
    split_entities: bool = False
    macros: Dict[str, str] = field(default_factory=dict)
    spaces: List[str] = field(default_factory=list)
+    max_tokens: Optional[int] = None

    def to_dict(self) -> Dict[str, Any]:
        d: Dict[str, Any] = {"template": self.template}
@@ -192,6 +195,8 @@ class PipelineStage:
            d["macros"] = self.macros
        if self.spaces:
            d["spaces"] = self.spaces
+        if self.max_tokens is not None:
+            d["max_tokens"] = self.max_tokens
        return d

    @classmethod
@@ -204,6 +209,7 @@ class PipelineStage:
            split_entities=data.get("split_entities", False),
            macros=data.get("macros", {}),
            spaces=data.get("spaces", []),
+            max_tokens=data.get("max_tokens"),
        )