feat(pipeline): per-stage max_tokens, LLM provenance, processing log

- PipelineStage now supports max_tokens to override the 4096 default
- SourcePipeline records provider/model on each entity file as HTML comment
- output/processing-log.yaml tracks tokens, cost, duration, retries, errors
- _call_llm returns (content, metadata) for downstream traceability
- _http.py wraps JSON parse errors with body preview for debugging
- infospace.yaml stages: extract/map=6000 tokens, synthesize=3000 tokens

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-19 14:50:49 +01:00
parent 5ede1de4b8
commit df1fdf1842
4 changed files with 191 additions and 32 deletions
--- a/examples/infospace-with-history/infospace.yaml
+++ b/examples/infospace-with-history/infospace.yaml
@@ -45,6 +45,7 @@ pipeline:
      output_dir: output/entities
      output_macro: entities
      split_entities: true
+      max_tokens: 6000
      macros:
        extraction_rules: artifacts/guidelines/extraction-rules.md
        vsm_framework: artifacts/vsm-reference/vsm-framework.md
@@ -52,6 +53,7 @@ pipeline:
      template: templates/map-to-vsm.md
      output_dir: output/mappings
      output_macro: mappings
+      max_tokens: 6000
      macros:
        mapping_rules: artifacts/guidelines/mapping-rules.md
        vsm_framework: artifacts/vsm-reference/vsm-framework.md
@@ -59,6 +61,7 @@ pipeline:
      template: templates/synthesize-analysis.md
      output_dir: output/analyses
      output_macro: analysis
+      max_tokens: 3000
      macros:
        vsm_framework: artifacts/vsm-reference/vsm-framework.md
  post_batch:
--- a/markitect/infospace/cli.py
+++ b/markitect/infospace/cli.py
@@ -575,7 +575,13 @@ def process(
    # Run pipeline
    from markitect.infospace.pipeline import SourcePipeline

-    pipeline = SourcePipeline(cfg, root, adapter=adapter, no_commit=no_commit)
+    pipeline = SourcePipeline(
+        cfg, root,
+        adapter=adapter,
+        provider=provider or "",
+        model=(model or _PROVIDER_DEFAULTS.get(provider or "", "")) if provider else "",
+        no_commit=no_commit,
+    )

    total = len(source_files)
    completed = 0
--- a/markitect/infospace/config.py
+++ b/markitect/infospace/config.py
@@ -168,6 +168,8 @@ class PipelineStage:
        macros: Static macros loaded from files (macro name → relative path).
        spaces: Legacy space IDs for SQLite-based resolver (unused by
            :class:`SourcePipeline`).
+        max_tokens: Maximum tokens to request from the LLM for this stage.
+            Overrides the pipeline-level default (4096).
    """

    template: str
@@ -177,6 +179,7 @@ class PipelineStage:
    split_entities: bool = False
    macros: Dict[str, str] = field(default_factory=dict)
    spaces: List[str] = field(default_factory=list)
+    max_tokens: Optional[int] = None

    def to_dict(self) -> Dict[str, Any]:
        d: Dict[str, Any] = {"template": self.template}
@@ -192,6 +195,8 @@ class PipelineStage:
            d["macros"] = self.macros
        if self.spaces:
            d["spaces"] = self.spaces
+        if self.max_tokens is not None:
+            d["max_tokens"] = self.max_tokens
        return d

    @classmethod
@@ -204,6 +209,7 @@ class PipelineStage:
            split_entities=data.get("split_entities", False),
            macros=data.get("macros", {}),
            spaces=data.get("spaces", []),
+            max_tokens=data.get("max_tokens"),
        )


--- a/markitect/infospace/pipeline.py
+++ b/markitect/infospace/pipeline.py
@@ -25,11 +25,17 @@ from __future__ import annotations
 import re
 import subprocess
 import time
+from datetime import datetime, timezone
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
+
+import yaml

 from markitect.infospace.config import InfospaceConfig, PipelineStage

+# Default max_tokens when not specified on a stage
+_DEFAULT_MAX_TOKENS = 4096
+

 class SourcePipeline:
    """Processes source files through infospace pipeline stages.
@@ -53,11 +59,15 @@ class SourcePipeline:
        config: InfospaceConfig,
        root: Path,
        adapter=None,
+        provider: str = "",
+        model: str = "",
        no_commit: bool = False,
    ) -> None:
        self.config = config
        self.root = root
        self.adapter = adapter
+        self.provider = provider
+        self.model = model
        self.no_commit = no_commit

    # ── Public API ────────────────────────────────────────────────────
@@ -98,6 +108,7 @@ class SourcePipeline:
        source_id = source_file.stem
        source_content = source_file.read_text(encoding="utf-8")
        stage_outputs: Dict[str, str] = {}
+        stage_logs: List[Dict[str, Any]] = []

        print(f"\nProcessing: {source_id}")
        print("=" * 60)
@@ -107,15 +118,21 @@ class SourcePipeline:
            return False

        for stage in self.config.pipeline.stages:
-            content = self._run_stage(stage, source_id, source_content, stage_outputs)
+            content, stage_meta = self._run_stage(
+                stage, source_id, source_content, stage_outputs
+            )
+            if stage_meta:
+                stage_logs.append(stage_meta)
            if content is None:
                stage_label = stage.name or stage.template
                print(f"\n  Pipeline paused at stage '{stage_label}'.")
+                self._write_processing_log(source_id, stage_logs, success=False)
                return False
            if stage.output_macro:
                stage_outputs[stage.output_macro] = content

        print(f"\n  {source_id}: all stages complete.")
+        self._write_processing_log(source_id, stage_logs, success=True)
        if not self.no_commit:
            self._git_commit(source_id)

@@ -156,10 +173,12 @@ class SourcePipeline:
        source_id: str,
        source_content: str,
        stage_outputs: Dict[str, str],
-    ) -> Optional[str]:
+    ) -> Tuple[Optional[str], Optional[Dict[str, Any]]]:
        """Run a single pipeline stage.

-        Returns the stage's output content, or None on failure.
+        Returns:
+            ``(content, metadata)`` where *content* is the stage output
+            (or ``None`` on failure) and *metadata* is a log dict.
        """
        stage_label = stage.name or stage.template
        print(f"\n  [{stage_label}]")
@@ -170,8 +189,8 @@ class SourcePipeline:
        if output_file and output_file.exists():
            print(f"    Found existing output: {output_file.name}")
            if stage.split_entities:
-                return self._load_from_view(output_file)
-            return output_file.read_text(encoding="utf-8")
+                return self._load_from_view(output_file), None
+            return output_file.read_text(encoding="utf-8"), None

        # Build macro substitution dict
        macros = self._build_macros(stage, source_content, stage_outputs)
@@ -180,7 +199,7 @@ class SourcePipeline:
        template_path = self.root / stage.template
        if not template_path.exists():
            print(f"    ERROR: Template not found: {stage.template}")
-            return None
+            return None, {"stage": stage_label, "error": f"template not found: {stage.template}"}

        template_content = template_path.read_text(encoding="utf-8")
        prompt = self._resolve_macros(template_content, macros)
@@ -195,17 +214,23 @@ class SourcePipeline:
        # Without an adapter, we cannot generate output
        if self.adapter is None:
            print("    No LLM adapter — skipping generation (manual mode).")
-            return None
+            return None, {"stage": stage_label, "error": "no adapter"}
+
+        # Resolve max_tokens: stage config > pipeline default
+        max_tokens = stage.max_tokens if stage.max_tokens is not None else _DEFAULT_MAX_TOKENS

        # Call LLM — with one retry for split_entities stages that return 0 entities
        max_attempts = 2 if stage.split_entities else 1
        entity_files: List[Tuple[str, Path]] = []
-        content = None
+        content: Optional[str] = None
+        llm_meta: Dict[str, Any] = {}
+        total_retries = 0

        for attempt in range(max_attempts):
-            content = self._call_llm(prompt, stage_label)
+            content, llm_meta = self._call_llm(prompt, stage_label, max_tokens)
            if content is None:
-                return None
+                meta = {"stage": stage_label, "retries": total_retries, **llm_meta}
+                return None, meta

            # Save raw response for debugging (overwritten on retry)
            if output_file:
@@ -214,30 +239,35 @@ class SourcePipeline:
                raw_file.write_text(content, encoding="utf-8")

            if stage.split_entities:
-                entity_files = self._split_and_write_entities(stage, content)
+                entity_files = self._split_and_write_entities(stage, content, source_id)
                if entity_files:
                    break  # Got entities — proceed
                if attempt < max_attempts - 1:
+                    total_retries += 1
                    print(f"    No entity delimiters found — retrying ({attempt + 2}/{max_attempts})...")
                else:
                    print(
                        f"    WARNING: No '--- ENTITY: ---' markers found after {max_attempts} attempt(s).\n"
                        f"    Check {raw_file.name} to inspect the raw LLM response."
                    )
-                    return None  # Don't write empty view; allow re-run
+                    meta = {"stage": stage_label, "retries": total_retries,
+                            "error": "no entity delimiters", **llm_meta}
+                    return None, meta  # Don't write empty view; allow re-run
            else:
                break  # Non-split stages don't need retry

+        stage_meta: Dict[str, Any] = {"stage": stage_label, "retries": total_retries, **llm_meta}
+
        # Persist output
        if stage.split_entities:
            self._write_entity_view(source_id, entity_files, output_file)
-            return content
+            return content, stage_meta
        else:
            if output_file:
                output_file.parent.mkdir(parents=True, exist_ok=True)
                output_file.write_text(content, encoding="utf-8")
                print(f"    Output written to {output_file.name}")
-            return content
+            return content, stage_meta

    # ── Output File Resolution ────────────────────────────────────────

@@ -319,6 +349,7 @@ class SourcePipeline:
            for f in entities_dir.glob("*.md")
            if not f.name.endswith("-entities.md")
            and not f.name.endswith("-prompt.md")
+            and not f.name.endswith("-raw.md")
        )

    @staticmethod
@@ -334,6 +365,7 @@ class SourcePipeline:
        self,
        stage: PipelineStage,
        combined_content: str,
+        source_id: str = "",
    ) -> List[Tuple[str, Path]]:
        """Split ``--- ENTITY: <name> ---`` delimited output into files.

@@ -369,7 +401,11 @@ class SourcePipeline:
            if file_path.exists():
                skipped_count += 1
            else:
-                file_path.write_text(entity_content + "\n", encoding="utf-8")
+                # Prepend provenance comment so the LLM origin is traceable
+                provenance = self._provenance_comment(source_id)
+                file_path.write_text(
+                    provenance + entity_content + "\n", encoding="utf-8"
+                )
                new_count += 1

            entity_files.append((entity_name, file_path))
@@ -428,19 +464,99 @@ class SourcePipeline:

        return "\n\n".join(parts) + "\n" if parts else ""

+    # ── Provenance & Processing Log ───────────────────────────────────
+
+    def _provenance_comment(self, source_id: str) -> str:
+        """Return an HTML comment tagging the LLM that generated this content."""
+        date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+        parts = [f"date={date}", f"source={source_id}"]
+        if self.provider:
+            parts.insert(0, f"provider={self.provider}")
+        if self.model:
+            parts.insert(1, f"model={self.model}")
+        return f"<!-- generated: {' '.join(parts)} -->\n\n"
+
+    def _write_processing_log(
+        self,
+        source_id: str,
+        stage_logs: List[Dict[str, Any]],
+        success: bool,
+    ) -> None:
+        """Append a run record to ``output/processing-log.yaml``."""
+        log_file = self.root / "output" / "processing-log.yaml"
+        log_file.parent.mkdir(parents=True, exist_ok=True)
+
+        # Load existing log
+        existing: List[Dict[str, Any]] = []
+        if log_file.is_file():
+            try:
+                raw = yaml.safe_load(log_file.read_text(encoding="utf-8"))
+                if isinstance(raw, list):
+                    existing = raw
+            except Exception:
+                pass
+
+        # Build new entry
+        total_prompt = sum(s.get("prompt_tokens", 0) for s in stage_logs)
+        total_completion = sum(s.get("completion_tokens", 0) for s in stage_logs)
+        total_cost = sum(s.get("cost", 0.0) for s in stage_logs)
+        total_duration = sum(s.get("duration_seconds", 0.0) for s in stage_logs)
+        total_retries = sum(s.get("retries", 0) for s in stage_logs)
+        errors = [s["error"] for s in stage_logs if s.get("error")]
+
+        entry: Dict[str, Any] = {
+            "source_id": source_id,
+            "processed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+            "provider": self.provider,
+            "model": self.model,
+            "success": success,
+            "total_prompt_tokens": total_prompt,
+            "total_completion_tokens": total_completion,
+            "total_cost": round(total_cost, 6),
+            "total_duration_seconds": round(total_duration, 1),
+            "total_retries": total_retries,
+            "stages": stage_logs,
+        }
+        if errors:
+            entry["errors"] = errors
+
+        # Remove previous entry for the same source_id (re-run)
+        existing = [e for e in existing if e.get("source_id") != source_id]
+        existing.append(entry)
+
+        log_file.write_text(
+            yaml.safe_dump(existing, default_flow_style=False, sort_keys=False),
+            encoding="utf-8",
+        )
+
    # ── LLM Execution ─────────────────────────────────────────────────

    def _call_llm(
-        self, prompt: str, stage_name: str, max_tokens: int = 8192
-    ) -> Optional[str]:
+        self, prompt: str, stage_name: str, max_tokens: int = _DEFAULT_MAX_TOKENS
+    ) -> Tuple[Optional[str], Dict[str, Any]]:
        """Call the LLM adapter with exponential back-off on rate limits.

-        Returns the response content string, or None on failure.
+        Returns:
+            ``(content, metadata)`` where *content* is the response string
+            (or ``None`` on failure) and *metadata* has provider, model,
+            token counts, cost, finish_reason, duration, and error info.
        """
        from markitect.prompts.execution.models import RunConfig
        from markitect.llm.exceptions import LLMRateLimitError

-        print(f"    Calling LLM ({stage_name})...")
+        meta: Dict[str, Any] = {
+            "provider": self.provider,
+            "model": self.model,
+            "prompt_tokens": 0,
+            "completion_tokens": 0,
+            "cost": 0.0,
+            "finish_reason": None,
+            "duration_seconds": 0.0,
+            "error": None,
+        }
+
+        model_label = f"{self.provider}/{self.model}" if self.provider else stage_name
+        print(f"    Calling LLM ({model_label})...")
        t0 = time.time()
        max_retries = 3
        response = None
@@ -460,29 +576,57 @@ class SourcePipeline:
                    )
                    time.sleep(wait)
                else:
-                    print(f"    Rate limit exceeded after {max_retries} retries: {exc}")
-                    return None
+                    msg = f"Rate limit exceeded after {max_retries} retries: {exc}"
+                    print(f"    {msg}")
+                    meta["error"] = msg
+                    meta["duration_seconds"] = round(time.time() - t0, 1)
+                    return None, meta
            except Exception as exc:
-                print(f"    LLM error: {exc}")
-                return None
+                msg = str(exc)
+                print(f"    LLM error: {msg}")
+                meta["error"] = msg
+                meta["duration_seconds"] = round(time.time() - t0, 1)
+                return None, meta

        if response is None:
-            return None
+            meta["error"] = "no response"
+            return None, meta

-        elapsed = time.time() - t0
+        elapsed = round(time.time() - t0, 1)
        usage = response.usage
+        prompt_tok = usage.get("prompt_tokens", 0)
+        completion_tok = usage.get("completion_tokens", 0)
+        cost = float(usage.get("cost", 0.0))
+        finish_reason = getattr(response, "finish_reason", None) or "unknown"
+
+        meta.update({
+            "prompt_tokens": prompt_tok,
+            "completion_tokens": completion_tok,
+            "cost": cost,
+            "finish_reason": finish_reason,
+            "duration_seconds": elapsed,
+        })
+
+        cost_str = f", cost=${cost:.4f}" if cost > 0 else ""
        print(
-            f"    Done in {elapsed:.1f}s — "
-            f"prompt {usage.get('prompt_tokens', '?')} tok, "
-            f"completion {usage.get('completion_tokens', '?')} tok"
+            f"    Done in {elapsed}s — "
+            f"prompt {prompt_tok} tok, completion {completion_tok} tok{cost_str}"
        )

+        if finish_reason == "length":
+            print(
+                f"    WARNING: Output truncated at {max_tokens} tokens "
+                f"(finish_reason=length). Consider raising max_tokens for "
+                f"stage '{stage_name}' in infospace.yaml."
+            )
+
        content = response.content
        if not content or not content.strip():
            print("    LLM returned empty content.")
-            return None
+            meta["error"] = "empty response"
+            return None, meta

-        return content
+        return content, meta

    # ── Git Integration ───────────────────────────────────────────────