feat(pipeline): per-stage max_tokens, LLM provenance, processing log

- PipelineStage now supports max_tokens to override the 4096 default
- SourcePipeline records provider/model on each entity file as HTML comment
- output/processing-log.yaml tracks tokens, cost, duration, retries, errors
- _call_llm returns (content, metadata) for downstream traceability
- _http.py wraps JSON parse errors with body preview for debugging
- infospace.yaml stages: extract/map=6000 tokens, synthesize=3000 tokens

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-19 14:50:49 +01:00
parent 5ede1de4b8
commit df1fdf1842
4 changed files with 191 additions and 32 deletions

View File

@@ -45,6 +45,7 @@ pipeline:
output_dir: output/entities output_dir: output/entities
output_macro: entities output_macro: entities
split_entities: true split_entities: true
max_tokens: 6000
macros: macros:
extraction_rules: artifacts/guidelines/extraction-rules.md extraction_rules: artifacts/guidelines/extraction-rules.md
vsm_framework: artifacts/vsm-reference/vsm-framework.md vsm_framework: artifacts/vsm-reference/vsm-framework.md
@@ -52,6 +53,7 @@ pipeline:
template: templates/map-to-vsm.md template: templates/map-to-vsm.md
output_dir: output/mappings output_dir: output/mappings
output_macro: mappings output_macro: mappings
max_tokens: 6000
macros: macros:
mapping_rules: artifacts/guidelines/mapping-rules.md mapping_rules: artifacts/guidelines/mapping-rules.md
vsm_framework: artifacts/vsm-reference/vsm-framework.md vsm_framework: artifacts/vsm-reference/vsm-framework.md
@@ -59,6 +61,7 @@ pipeline:
template: templates/synthesize-analysis.md template: templates/synthesize-analysis.md
output_dir: output/analyses output_dir: output/analyses
output_macro: analysis output_macro: analysis
max_tokens: 3000
macros: macros:
vsm_framework: artifacts/vsm-reference/vsm-framework.md vsm_framework: artifacts/vsm-reference/vsm-framework.md
post_batch: post_batch:

View File

@@ -575,7 +575,13 @@ def process(
# Run pipeline # Run pipeline
from markitect.infospace.pipeline import SourcePipeline from markitect.infospace.pipeline import SourcePipeline
pipeline = SourcePipeline(cfg, root, adapter=adapter, no_commit=no_commit) pipeline = SourcePipeline(
cfg, root,
adapter=adapter,
provider=provider or "",
model=(model or _PROVIDER_DEFAULTS.get(provider or "", "")) if provider else "",
no_commit=no_commit,
)
total = len(source_files) total = len(source_files)
completed = 0 completed = 0

View File

@@ -168,6 +168,8 @@ class PipelineStage:
macros: Static macros loaded from files (macro name → relative path). macros: Static macros loaded from files (macro name → relative path).
spaces: Legacy space IDs for SQLite-based resolver (unused by spaces: Legacy space IDs for SQLite-based resolver (unused by
:class:`SourcePipeline`). :class:`SourcePipeline`).
max_tokens: Maximum tokens to request from the LLM for this stage.
Overrides the pipeline-level default (4096).
""" """
template: str template: str
@@ -177,6 +179,7 @@ class PipelineStage:
split_entities: bool = False split_entities: bool = False
macros: Dict[str, str] = field(default_factory=dict) macros: Dict[str, str] = field(default_factory=dict)
spaces: List[str] = field(default_factory=list) spaces: List[str] = field(default_factory=list)
max_tokens: Optional[int] = None
def to_dict(self) -> Dict[str, Any]: def to_dict(self) -> Dict[str, Any]:
d: Dict[str, Any] = {"template": self.template} d: Dict[str, Any] = {"template": self.template}
@@ -192,6 +195,8 @@ class PipelineStage:
d["macros"] = self.macros d["macros"] = self.macros
if self.spaces: if self.spaces:
d["spaces"] = self.spaces d["spaces"] = self.spaces
if self.max_tokens is not None:
d["max_tokens"] = self.max_tokens
return d return d
@classmethod @classmethod
@@ -204,6 +209,7 @@ class PipelineStage:
split_entities=data.get("split_entities", False), split_entities=data.get("split_entities", False),
macros=data.get("macros", {}), macros=data.get("macros", {}),
spaces=data.get("spaces", []), spaces=data.get("spaces", []),
max_tokens=data.get("max_tokens"),
) )

View File

@@ -25,11 +25,17 @@ from __future__ import annotations
import re import re
import subprocess import subprocess
import time import time
from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
from typing import Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import yaml
from markitect.infospace.config import InfospaceConfig, PipelineStage from markitect.infospace.config import InfospaceConfig, PipelineStage
# Default max_tokens when not specified on a stage
_DEFAULT_MAX_TOKENS = 4096
class SourcePipeline: class SourcePipeline:
"""Processes source files through infospace pipeline stages. """Processes source files through infospace pipeline stages.
@@ -53,11 +59,15 @@ class SourcePipeline:
config: InfospaceConfig, config: InfospaceConfig,
root: Path, root: Path,
adapter=None, adapter=None,
provider: str = "",
model: str = "",
no_commit: bool = False, no_commit: bool = False,
) -> None: ) -> None:
self.config = config self.config = config
self.root = root self.root = root
self.adapter = adapter self.adapter = adapter
self.provider = provider
self.model = model
self.no_commit = no_commit self.no_commit = no_commit
# ── Public API ──────────────────────────────────────────────────── # ── Public API ────────────────────────────────────────────────────
@@ -98,6 +108,7 @@ class SourcePipeline:
source_id = source_file.stem source_id = source_file.stem
source_content = source_file.read_text(encoding="utf-8") source_content = source_file.read_text(encoding="utf-8")
stage_outputs: Dict[str, str] = {} stage_outputs: Dict[str, str] = {}
stage_logs: List[Dict[str, Any]] = []
print(f"\nProcessing: {source_id}") print(f"\nProcessing: {source_id}")
print("=" * 60) print("=" * 60)
@@ -107,15 +118,21 @@ class SourcePipeline:
return False return False
for stage in self.config.pipeline.stages: for stage in self.config.pipeline.stages:
content = self._run_stage(stage, source_id, source_content, stage_outputs) content, stage_meta = self._run_stage(
stage, source_id, source_content, stage_outputs
)
if stage_meta:
stage_logs.append(stage_meta)
if content is None: if content is None:
stage_label = stage.name or stage.template stage_label = stage.name or stage.template
print(f"\n Pipeline paused at stage '{stage_label}'.") print(f"\n Pipeline paused at stage '{stage_label}'.")
self._write_processing_log(source_id, stage_logs, success=False)
return False return False
if stage.output_macro: if stage.output_macro:
stage_outputs[stage.output_macro] = content stage_outputs[stage.output_macro] = content
print(f"\n {source_id}: all stages complete.") print(f"\n {source_id}: all stages complete.")
self._write_processing_log(source_id, stage_logs, success=True)
if not self.no_commit: if not self.no_commit:
self._git_commit(source_id) self._git_commit(source_id)
@@ -156,10 +173,12 @@ class SourcePipeline:
source_id: str, source_id: str,
source_content: str, source_content: str,
stage_outputs: Dict[str, str], stage_outputs: Dict[str, str],
) -> Optional[str]: ) -> Tuple[Optional[str], Optional[Dict[str, Any]]]:
"""Run a single pipeline stage. """Run a single pipeline stage.
Returns the stage's output content, or None on failure. Returns:
``(content, metadata)`` where *content* is the stage output
(or ``None`` on failure) and *metadata* is a log dict.
""" """
stage_label = stage.name or stage.template stage_label = stage.name or stage.template
print(f"\n [{stage_label}]") print(f"\n [{stage_label}]")
@@ -170,8 +189,8 @@ class SourcePipeline:
if output_file and output_file.exists(): if output_file and output_file.exists():
print(f" Found existing output: {output_file.name}") print(f" Found existing output: {output_file.name}")
if stage.split_entities: if stage.split_entities:
return self._load_from_view(output_file) return self._load_from_view(output_file), None
return output_file.read_text(encoding="utf-8") return output_file.read_text(encoding="utf-8"), None
# Build macro substitution dict # Build macro substitution dict
macros = self._build_macros(stage, source_content, stage_outputs) macros = self._build_macros(stage, source_content, stage_outputs)
@@ -180,7 +199,7 @@ class SourcePipeline:
template_path = self.root / stage.template template_path = self.root / stage.template
if not template_path.exists(): if not template_path.exists():
print(f" ERROR: Template not found: {stage.template}") print(f" ERROR: Template not found: {stage.template}")
return None return None, {"stage": stage_label, "error": f"template not found: {stage.template}"}
template_content = template_path.read_text(encoding="utf-8") template_content = template_path.read_text(encoding="utf-8")
prompt = self._resolve_macros(template_content, macros) prompt = self._resolve_macros(template_content, macros)
@@ -195,17 +214,23 @@ class SourcePipeline:
# Without an adapter, we cannot generate output # Without an adapter, we cannot generate output
if self.adapter is None: if self.adapter is None:
print(" No LLM adapter — skipping generation (manual mode).") print(" No LLM adapter — skipping generation (manual mode).")
return None return None, {"stage": stage_label, "error": "no adapter"}
# Resolve max_tokens: stage config > pipeline default
max_tokens = stage.max_tokens if stage.max_tokens is not None else _DEFAULT_MAX_TOKENS
# Call LLM — with one retry for split_entities stages that return 0 entities # Call LLM — with one retry for split_entities stages that return 0 entities
max_attempts = 2 if stage.split_entities else 1 max_attempts = 2 if stage.split_entities else 1
entity_files: List[Tuple[str, Path]] = [] entity_files: List[Tuple[str, Path]] = []
content = None content: Optional[str] = None
llm_meta: Dict[str, Any] = {}
total_retries = 0
for attempt in range(max_attempts): for attempt in range(max_attempts):
content = self._call_llm(prompt, stage_label) content, llm_meta = self._call_llm(prompt, stage_label, max_tokens)
if content is None: if content is None:
return None meta = {"stage": stage_label, "retries": total_retries, **llm_meta}
return None, meta
# Save raw response for debugging (overwritten on retry) # Save raw response for debugging (overwritten on retry)
if output_file: if output_file:
@@ -214,30 +239,35 @@ class SourcePipeline:
raw_file.write_text(content, encoding="utf-8") raw_file.write_text(content, encoding="utf-8")
if stage.split_entities: if stage.split_entities:
entity_files = self._split_and_write_entities(stage, content) entity_files = self._split_and_write_entities(stage, content, source_id)
if entity_files: if entity_files:
break # Got entities — proceed break # Got entities — proceed
if attempt < max_attempts - 1: if attempt < max_attempts - 1:
total_retries += 1
print(f" No entity delimiters found — retrying ({attempt + 2}/{max_attempts})...") print(f" No entity delimiters found — retrying ({attempt + 2}/{max_attempts})...")
else: else:
print( print(
f" WARNING: No '--- ENTITY: ---' markers found after {max_attempts} attempt(s).\n" f" WARNING: No '--- ENTITY: ---' markers found after {max_attempts} attempt(s).\n"
f" Check {raw_file.name} to inspect the raw LLM response." f" Check {raw_file.name} to inspect the raw LLM response."
) )
return None # Don't write empty view; allow re-run meta = {"stage": stage_label, "retries": total_retries,
"error": "no entity delimiters", **llm_meta}
return None, meta # Don't write empty view; allow re-run
else: else:
break # Non-split stages don't need retry break # Non-split stages don't need retry
stage_meta: Dict[str, Any] = {"stage": stage_label, "retries": total_retries, **llm_meta}
# Persist output # Persist output
if stage.split_entities: if stage.split_entities:
self._write_entity_view(source_id, entity_files, output_file) self._write_entity_view(source_id, entity_files, output_file)
return content return content, stage_meta
else: else:
if output_file: if output_file:
output_file.parent.mkdir(parents=True, exist_ok=True) output_file.parent.mkdir(parents=True, exist_ok=True)
output_file.write_text(content, encoding="utf-8") output_file.write_text(content, encoding="utf-8")
print(f" Output written to {output_file.name}") print(f" Output written to {output_file.name}")
return content return content, stage_meta
# ── Output File Resolution ──────────────────────────────────────── # ── Output File Resolution ────────────────────────────────────────
@@ -319,6 +349,7 @@ class SourcePipeline:
for f in entities_dir.glob("*.md") for f in entities_dir.glob("*.md")
if not f.name.endswith("-entities.md") if not f.name.endswith("-entities.md")
and not f.name.endswith("-prompt.md") and not f.name.endswith("-prompt.md")
and not f.name.endswith("-raw.md")
) )
@staticmethod @staticmethod
@@ -334,6 +365,7 @@ class SourcePipeline:
self, self,
stage: PipelineStage, stage: PipelineStage,
combined_content: str, combined_content: str,
source_id: str = "",
) -> List[Tuple[str, Path]]: ) -> List[Tuple[str, Path]]:
"""Split ``--- ENTITY: <name> ---`` delimited output into files. """Split ``--- ENTITY: <name> ---`` delimited output into files.
@@ -369,7 +401,11 @@ class SourcePipeline:
if file_path.exists(): if file_path.exists():
skipped_count += 1 skipped_count += 1
else: else:
file_path.write_text(entity_content + "\n", encoding="utf-8") # Prepend provenance comment so the LLM origin is traceable
provenance = self._provenance_comment(source_id)
file_path.write_text(
provenance + entity_content + "\n", encoding="utf-8"
)
new_count += 1 new_count += 1
entity_files.append((entity_name, file_path)) entity_files.append((entity_name, file_path))
@@ -428,19 +464,99 @@ class SourcePipeline:
return "\n\n".join(parts) + "\n" if parts else "" return "\n\n".join(parts) + "\n" if parts else ""
# ── Provenance & Processing Log ───────────────────────────────────
def _provenance_comment(self, source_id: str) -> str:
"""Return an HTML comment tagging the LLM that generated this content."""
date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
parts = [f"date={date}", f"source={source_id}"]
if self.provider:
parts.insert(0, f"provider={self.provider}")
if self.model:
parts.insert(1, f"model={self.model}")
return f"<!-- generated: {' '.join(parts)} -->\n\n"
def _write_processing_log(
    self,
    source_id: str,
    stage_logs: List[Dict[str, Any]],
    success: bool,
) -> None:
    """Append a run record to ``output/processing-log.yaml``.

    The log file holds a YAML list with one entry per source. Any earlier
    entry with the same ``source_id`` is replaced, so a re-run leaves a
    single, current record for that source.

    Args:
        source_id: Identifier of the source that was processed.
        stage_logs: Per-stage metadata dicts. The keys ``prompt_tokens``,
            ``completion_tokens``, ``cost``, ``duration_seconds`` and
            ``retries`` are summed into run totals; ``error`` values are
            collected into the entry's ``errors`` list.
        success: Whether every stage completed for this source.
    """
    log_file = self.root / "output" / "processing-log.yaml"
    log_file.parent.mkdir(parents=True, exist_ok=True)

    # Load the existing log. An unreadable or malformed file must not
    # abort the run, but the user is told that its history is being reset.
    existing = []
    if log_file.is_file():
        try:
            raw = yaml.safe_load(log_file.read_text(encoding="utf-8"))
        except (OSError, ValueError, yaml.YAMLError) as exc:
            print(
                f"  WARNING: Could not read {log_file.name} ({exc}); "
                f"starting a new log."
            )
        else:
            if isinstance(raw, list):
                existing = raw

    def _total(key, zero):
        # A stage may omit a counter or report it as None; both count as zero.
        return sum(
            zero if s.get(key) is None else s[key] for s in stage_logs
        )

    errors = [s["error"] for s in stage_logs if s.get("error")]

    entry: Dict[str, Any] = {
        "source_id": source_id,
        "processed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "provider": self.provider,
        "model": self.model,
        "success": success,
        "total_prompt_tokens": _total("prompt_tokens", 0),
        "total_completion_tokens": _total("completion_tokens", 0),
        "total_cost": round(_total("cost", 0.0), 6),
        "total_duration_seconds": round(_total("duration_seconds", 0.0), 1),
        "total_retries": _total("retries", 0),
        "stages": stage_logs,
    }
    if errors:
        entry["errors"] = errors

    # Drop the previous entry for this source (re-run). Entries that are not
    # mappings (e.g. after a hand edit) are kept rather than crashing on .get.
    existing = [
        e for e in existing
        if not (isinstance(e, dict) and e.get("source_id") == source_id)
    ]
    existing.append(entry)

    log_file.write_text(
        yaml.safe_dump(existing, default_flow_style=False, sort_keys=False),
        encoding="utf-8",
    )
# ── LLM Execution ───────────────────────────────────────────────── # ── LLM Execution ─────────────────────────────────────────────────
def _call_llm( def _call_llm(
self, prompt: str, stage_name: str, max_tokens: int = 8192 self, prompt: str, stage_name: str, max_tokens: int = _DEFAULT_MAX_TOKENS
) -> Optional[str]: ) -> Tuple[Optional[str], Dict[str, Any]]:
"""Call the LLM adapter with exponential back-off on rate limits. """Call the LLM adapter with exponential back-off on rate limits.
Returns the response content string, or None on failure. Returns:
``(content, metadata)`` where *content* is the response string
(or ``None`` on failure) and *metadata* has provider, model,
token counts, cost, finish_reason, duration, and error info.
""" """
from markitect.prompts.execution.models import RunConfig from markitect.prompts.execution.models import RunConfig
from markitect.llm.exceptions import LLMRateLimitError from markitect.llm.exceptions import LLMRateLimitError
print(f" Calling LLM ({stage_name})...") meta: Dict[str, Any] = {
"provider": self.provider,
"model": self.model,
"prompt_tokens": 0,
"completion_tokens": 0,
"cost": 0.0,
"finish_reason": None,
"duration_seconds": 0.0,
"error": None,
}
model_label = f"{self.provider}/{self.model}" if self.provider else stage_name
print(f" Calling LLM ({model_label})...")
t0 = time.time() t0 = time.time()
max_retries = 3 max_retries = 3
response = None response = None
@@ -460,29 +576,57 @@ class SourcePipeline:
) )
time.sleep(wait) time.sleep(wait)
else: else:
print(f" Rate limit exceeded after {max_retries} retries: {exc}") msg = f"Rate limit exceeded after {max_retries} retries: {exc}"
return None print(f" {msg}")
meta["error"] = msg
meta["duration_seconds"] = round(time.time() - t0, 1)
return None, meta
except Exception as exc: except Exception as exc:
print(f" LLM error: {exc}") msg = str(exc)
return None print(f" LLM error: {msg}")
meta["error"] = msg
meta["duration_seconds"] = round(time.time() - t0, 1)
return None, meta
if response is None: if response is None:
return None meta["error"] = "no response"
return None, meta
elapsed = time.time() - t0 elapsed = round(time.time() - t0, 1)
usage = response.usage usage = response.usage
prompt_tok = usage.get("prompt_tokens", 0)
completion_tok = usage.get("completion_tokens", 0)
cost = float(usage.get("cost", 0.0))
finish_reason = getattr(response, "finish_reason", None) or "unknown"
meta.update({
"prompt_tokens": prompt_tok,
"completion_tokens": completion_tok,
"cost": cost,
"finish_reason": finish_reason,
"duration_seconds": elapsed,
})
cost_str = f", cost=${cost:.4f}" if cost > 0 else ""
print( print(
f" Done in {elapsed:.1f}s — " f" Done in {elapsed}s — "
f"prompt {usage.get('prompt_tokens', '?')} tok, " f"prompt {prompt_tok} tok, completion {completion_tok} tok{cost_str}"
f"completion {usage.get('completion_tokens', '?')} tok"
) )
if finish_reason == "length":
print(
f" WARNING: Output truncated at {max_tokens} tokens "
f"(finish_reason=length). Consider raising max_tokens for "
f"stage '{stage_name}' in infospace.yaml."
)
content = response.content content = response.content
if not content or not content.strip(): if not content or not content.strip():
print(" LLM returned empty content.") print(" LLM returned empty content.")
return None meta["error"] = "empty response"
return None, meta
return content return content, meta
# ── Git Integration ─────────────────────────────────────────────── # ── Git Integration ───────────────────────────────────────────────