feat(pipeline): per-stage max_tokens, LLM provenance, processing log

- PipelineStage now supports max_tokens to override the 4096 default - SourcePipeline records provider/model on each entity file as HTML comment - output/processing-log.yaml tracks tokens, cost, duration, retries, errors - _call_llm returns (content, metadata) for downstream traceability - _http.py wraps JSON parse errors with body preview for debugging - infospace.yaml stages: extract/map=6000 tokens, synthesize=3000 tokens Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-19 14:50:49 +01:00
parent 5ede1de4b8
commit df1fdf1842
4 changed files with 191 additions and 32 deletions
--- a/examples/infospace-with-history/infospace.yaml
+++ b/examples/infospace-with-history/infospace.yaml
@@ -45,6 +45,7 @@ pipeline:
      output_dir: output/entities
      output_macro: entities
      split_entities: true
      max_tokens: 6000
      macros:
        extraction_rules: artifacts/guidelines/extraction-rules.md
        vsm_framework: artifacts/vsm-reference/vsm-framework.md
@@ -52,6 +53,7 @@ pipeline:
      template: templates/map-to-vsm.md
      output_dir: output/mappings
      output_macro: mappings
      max_tokens: 6000
      macros:
        mapping_rules: artifacts/guidelines/mapping-rules.md
        vsm_framework: artifacts/vsm-reference/vsm-framework.md
@@ -59,6 +61,7 @@ pipeline:
      template: templates/synthesize-analysis.md
      output_dir: output/analyses
      output_macro: analysis
      max_tokens: 3000
      macros:
        vsm_framework: artifacts/vsm-reference/vsm-framework.md
  post_batch:
--- a/markitect/infospace/cli.py
+++ b/markitect/infospace/cli.py
@@ -575,7 +575,13 @@ def process(
    # Run pipeline
    from markitect.infospace.pipeline import SourcePipeline
-    pipeline = SourcePipeline(cfg, root, adapter=adapter, no_commit=no_commit)
+    pipeline = SourcePipeline(
        cfg, root,
        adapter=adapter,
        provider=provider or "",
        model=(model or _PROVIDER_DEFAULTS.get(provider or "", "")) if provider else "",
        no_commit=no_commit,
    )
    total = len(source_files)
    completed = 0
--- a/markitect/infospace/config.py
+++ b/markitect/infospace/config.py
@@ -168,6 +168,8 @@ class PipelineStage:
        macros: Static macros loaded from files (macro name → relative path).
        spaces: Legacy space IDs for SQLite-based resolver (unused by
            :class:`SourcePipeline`).
        max_tokens: Maximum tokens to request from the LLM for this stage.
            Overrides the pipeline-level default (4096).
    """
    template: str
@@ -177,6 +179,7 @@ class PipelineStage:
    split_entities: bool = False
    macros: Dict[str, str] = field(default_factory=dict)
    spaces: List[str] = field(default_factory=list)
    max_tokens: Optional[int] = None
    def to_dict(self) -> Dict[str, Any]:
        d: Dict[str, Any] = {"template": self.template}
@@ -192,6 +195,8 @@ class PipelineStage:
            d["macros"] = self.macros
        if self.spaces:
            d["spaces"] = self.spaces
        if self.max_tokens is not None:
            d["max_tokens"] = self.max_tokens
        return d
    @classmethod
@@ -204,6 +209,7 @@ class PipelineStage:
            split_entities=data.get("split_entities", False),
            macros=data.get("macros", {}),
            spaces=data.get("spaces", []),
            max_tokens=data.get("max_tokens"),
        )
--- a/markitect/infospace/pipeline.py
+++ b/markitect/infospace/pipeline.py
@@ -25,11 +25,17 @@ from __future__ import annotations
 import re
 import subprocess
 import time
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 import yaml
 from markitect.infospace.config import InfospaceConfig, PipelineStage
 # Default max_tokens when not specified on a stage
 _DEFAULT_MAX_TOKENS = 4096
 class SourcePipeline:
    """Processes source files through infospace pipeline stages.
@@ -53,11 +59,15 @@ class SourcePipeline:
        config: InfospaceConfig,
        root: Path,
        adapter=None,
        provider: str = "",
        model: str = "",
        no_commit: bool = False,
    ) -> None:
        self.config = config
        self.root = root
        self.adapter = adapter
        self.provider = provider
        self.model = model
        self.no_commit = no_commit
    # ── Public API ────────────────────────────────────────────────────
@@ -98,6 +108,7 @@ class SourcePipeline:
        source_id = source_file.stem
        source_content = source_file.read_text(encoding="utf-8")
        stage_outputs: Dict[str, str] = {}
        stage_logs: List[Dict[str, Any]] = []
        print(f"\nProcessing: {source_id}")
        print("=" * 60)
@@ -107,15 +118,21 @@ class SourcePipeline:
            return False
        for stage in self.config.pipeline.stages:
-            content = self._run_stage(stage, source_id, source_content, stage_outputs)
+            content, stage_meta = self._run_stage(
                stage, source_id, source_content, stage_outputs
            )
            if stage_meta:
                stage_logs.append(stage_meta)
            if content is None:
                stage_label = stage.name or stage.template
                print(f"\n  Pipeline paused at stage '{stage_label}'.")
                self._write_processing_log(source_id, stage_logs, success=False)
                return False
            if stage.output_macro:
                stage_outputs[stage.output_macro] = content
        print(f"\n  {source_id}: all stages complete.")
        self._write_processing_log(source_id, stage_logs, success=True)
        if not self.no_commit:
            self._git_commit(source_id)
@@ -156,10 +173,12 @@ class SourcePipeline:
        source_id: str,
        source_content: str,
        stage_outputs: Dict[str, str],
-    ) -> Optional[str]:
+    ) -> Tuple[Optional[str], Optional[Dict[str, Any]]]:
        """Run a single pipeline stage.
-        Returns the stage's output content, or None on failure.
+        Returns:
            ``(content, metadata)`` where *content* is the stage output
            (or ``None`` on failure) and *metadata* is a log dict.
        """
        stage_label = stage.name or stage.template
        print(f"\n  [{stage_label}]")
@@ -170,8 +189,8 @@ class SourcePipeline:
        if output_file and output_file.exists():
            print(f"    Found existing output: {output_file.name}")
            if stage.split_entities:
-                return self._load_from_view(output_file)
+                return self._load_from_view(output_file), None
-            return output_file.read_text(encoding="utf-8")
+            return output_file.read_text(encoding="utf-8"), None
        # Build macro substitution dict
        macros = self._build_macros(stage, source_content, stage_outputs)
@@ -180,7 +199,7 @@ class SourcePipeline:
        template_path = self.root / stage.template
        if not template_path.exists():
            print(f"    ERROR: Template not found: {stage.template}")
-            return None
+            return None, {"stage": stage_label, "error": f"template not found: {stage.template}"}
        template_content = template_path.read_text(encoding="utf-8")
        prompt = self._resolve_macros(template_content, macros)
@@ -195,17 +214,23 @@ class SourcePipeline:
        # Without an adapter, we cannot generate output
        if self.adapter is None:
            print("    No LLM adapter — skipping generation (manual mode).")
-            return None
+            return None, {"stage": stage_label, "error": "no adapter"}
        # Resolve max_tokens: stage config > pipeline default
        max_tokens = stage.max_tokens if stage.max_tokens is not None else _DEFAULT_MAX_TOKENS
        # Call LLM — with one retry for split_entities stages that return 0 entities
        max_attempts = 2 if stage.split_entities else 1
        entity_files: List[Tuple[str, Path]] = []
-        content = None
+        content: Optional[str] = None
        llm_meta: Dict[str, Any] = {}
        total_retries = 0
        for attempt in range(max_attempts):
-            content = self._call_llm(prompt, stage_label)
+            content, llm_meta = self._call_llm(prompt, stage_label, max_tokens)
            if content is None:
-                return None
+                meta = {"stage": stage_label, "retries": total_retries, **llm_meta}
                return None, meta
            # Save raw response for debugging (overwritten on retry)
            if output_file:
@@ -214,30 +239,35 @@ class SourcePipeline:
                raw_file.write_text(content, encoding="utf-8")
            if stage.split_entities:
-                entity_files = self._split_and_write_entities(stage, content)
+                entity_files = self._split_and_write_entities(stage, content, source_id)
                if entity_files:
                    break  # Got entities — proceed
                if attempt < max_attempts - 1:
                    total_retries += 1
                    print(f"    No entity delimiters found — retrying ({attempt + 2}/{max_attempts})...")
                else:
                    print(
                        f"    WARNING: No '--- ENTITY: ---' markers found after {max_attempts} attempt(s).\n"
                        f"    Check {raw_file.name} to inspect the raw LLM response."
                    )
-                    return None  # Don't write empty view; allow re-run
+                    meta = {"stage": stage_label, "retries": total_retries,
                            "error": "no entity delimiters", **llm_meta}
                    return None, meta  # Don't write empty view; allow re-run
            else:
                break  # Non-split stages don't need retry
        stage_meta: Dict[str, Any] = {"stage": stage_label, "retries": total_retries, **llm_meta}
        # Persist output
        if stage.split_entities:
            self._write_entity_view(source_id, entity_files, output_file)
-            return content
+            return content, stage_meta
        else:
            if output_file:
                output_file.parent.mkdir(parents=True, exist_ok=True)
                output_file.write_text(content, encoding="utf-8")
                print(f"    Output written to {output_file.name}")
-            return content
+            return content, stage_meta
    # ── Output File Resolution ────────────────────────────────────────
@@ -319,6 +349,7 @@ class SourcePipeline:
            for f in entities_dir.glob("*.md")
            if not f.name.endswith("-entities.md")
            and not f.name.endswith("-prompt.md")
            and not f.name.endswith("-raw.md")
        )
    @staticmethod
@@ -334,6 +365,7 @@ class SourcePipeline:
        self,
        stage: PipelineStage,
        combined_content: str,
        source_id: str = "",
    ) -> List[Tuple[str, Path]]:
        """Split ``--- ENTITY: <name> ---`` delimited output into files.
@@ -369,7 +401,11 @@ class SourcePipeline:
            if file_path.exists():
                skipped_count += 1
            else:
-                file_path.write_text(entity_content + "\n", encoding="utf-8")
+                # Prepend provenance comment so the LLM origin is traceable
                provenance = self._provenance_comment(source_id)
                file_path.write_text(
                    provenance + entity_content + "\n", encoding="utf-8"
                )
                new_count += 1
            entity_files.append((entity_name, file_path))
@@ -428,19 +464,99 @@ class SourcePipeline:
        return "\n\n".join(parts) + "\n" if parts else ""
    # ── Provenance & Processing Log ───────────────────────────────────
    def _provenance_comment(self, source_id: str) -> str:
        """Return an HTML comment tagging the LLM that generated this content."""
        date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        parts = [f"date={date}", f"source={source_id}"]
        if self.provider:
            parts.insert(0, f"provider={self.provider}")
        if self.model:
            parts.insert(1, f"model={self.model}")
        return f"<!-- generated: {' '.join(parts)} -->\n\n"
    def _write_processing_log(
        self,
        source_id: str,
        stage_logs: List[Dict[str, Any]],
        success: bool,
    ) -> None:
        """Append a run record to ``output/processing-log.yaml``."""
        log_file = self.root / "output" / "processing-log.yaml"
        log_file.parent.mkdir(parents=True, exist_ok=True)
        # Load existing log
        existing: List[Dict[str, Any]] = []
        if log_file.is_file():
            try:
                raw = yaml.safe_load(log_file.read_text(encoding="utf-8"))
                if isinstance(raw, list):
                    existing = raw
            except Exception:
                pass
        # Build new entry
        total_prompt = sum(s.get("prompt_tokens", 0) for s in stage_logs)
        total_completion = sum(s.get("completion_tokens", 0) for s in stage_logs)
        total_cost = sum(s.get("cost", 0.0) for s in stage_logs)
        total_duration = sum(s.get("duration_seconds", 0.0) for s in stage_logs)
        total_retries = sum(s.get("retries", 0) for s in stage_logs)
        errors = [s["error"] for s in stage_logs if s.get("error")]
        entry: Dict[str, Any] = {
            "source_id": source_id,
            "processed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "provider": self.provider,
            "model": self.model,
            "success": success,
            "total_prompt_tokens": total_prompt,
            "total_completion_tokens": total_completion,
            "total_cost": round(total_cost, 6),
            "total_duration_seconds": round(total_duration, 1),
            "total_retries": total_retries,
            "stages": stage_logs,
        }
        if errors:
            entry["errors"] = errors
        # Remove previous entry for the same source_id (re-run)
        existing = [e for e in existing if e.get("source_id") != source_id]
        existing.append(entry)
        log_file.write_text(
            yaml.safe_dump(existing, default_flow_style=False, sort_keys=False),
            encoding="utf-8",
        )
    # ── LLM Execution ─────────────────────────────────────────────────
    def _call_llm(
-        self, prompt: str, stage_name: str, max_tokens: int = 8192
+        self, prompt: str, stage_name: str, max_tokens: int = _DEFAULT_MAX_TOKENS
-    ) -> Optional[str]:
+    ) -> Tuple[Optional[str], Dict[str, Any]]:
        """Call the LLM adapter with exponential back-off on rate limits.
-        Returns the response content string, or None on failure.
+        Returns:
            ``(content, metadata)`` where *content* is the response string
            (or ``None`` on failure) and *metadata* has provider, model,
            token counts, cost, finish_reason, duration, and error info.
        """
        from markitect.prompts.execution.models import RunConfig
        from markitect.llm.exceptions import LLMRateLimitError
-        print(f"    Calling LLM ({stage_name})...")
+        meta: Dict[str, Any] = {
            "provider": self.provider,
            "model": self.model,
            "prompt_tokens": 0,
            "completion_tokens": 0,
            "cost": 0.0,
            "finish_reason": None,
            "duration_seconds": 0.0,
            "error": None,
        }
        model_label = f"{self.provider}/{self.model}" if self.provider else stage_name
        print(f"    Calling LLM ({model_label})...")
        t0 = time.time()
        max_retries = 3
        response = None
@@ -460,29 +576,57 @@ class SourcePipeline:
                    )
                    time.sleep(wait)
                else:
-                    print(f"    Rate limit exceeded after {max_retries} retries: {exc}")
+                    msg = f"Rate limit exceeded after {max_retries} retries: {exc}"
-                    return None
+                    print(f"    {msg}")
                    meta["error"] = msg
                    meta["duration_seconds"] = round(time.time() - t0, 1)
                    return None, meta
            except Exception as exc:
-                print(f"    LLM error: {exc}")
+                msg = str(exc)
-                return None
+                print(f"    LLM error: {msg}")
                meta["error"] = msg
                meta["duration_seconds"] = round(time.time() - t0, 1)
                return None, meta
        if response is None:
-            return None
+            meta["error"] = "no response"
            return None, meta
-        elapsed = time.time() - t0
+        elapsed = round(time.time() - t0, 1)
        usage = response.usage
        prompt_tok = usage.get("prompt_tokens", 0)
        completion_tok = usage.get("completion_tokens", 0)
        cost = float(usage.get("cost", 0.0))
        finish_reason = getattr(response, "finish_reason", None) or "unknown"
        meta.update({
            "prompt_tokens": prompt_tok,
            "completion_tokens": completion_tok,
            "cost": cost,
            "finish_reason": finish_reason,
            "duration_seconds": elapsed,
        })
        cost_str = f", cost=${cost:.4f}" if cost > 0 else ""
        print(
-            f"    Done in {elapsed:.1f}s — "
+            f"    Done in {elapsed}s — "
-            f"prompt {usage.get('prompt_tokens', '?')} tok, "
+            f"prompt {prompt_tok} tok, completion {completion_tok} tok{cost_str}"
            f"completion {usage.get('completion_tokens', '?')} tok"
        )
        if finish_reason == "length":
            print(
                f"    WARNING: Output truncated at {max_tokens} tokens "
                f"(finish_reason=length). Consider raising max_tokens for "
                f"stage '{stage_name}' in infospace.yaml."
            )
        content = response.content
        if not content or not content.strip():
            print("    LLM returned empty content.")
-            return None
+            meta["error"] = "empty response"
            return None, meta
-        return content
+        return content, meta
    # ── Git Integration ───────────────────────────────────────────────