feat(pipeline): per-stage max_tokens, LLM provenance, processing log
- PipelineStage now supports max_tokens to override the 4096 default - SourcePipeline records provider/model on each entity file as HTML comment - output/processing-log.yaml tracks tokens, cost, duration, retries, errors - _call_llm returns (content, metadata) for downstream traceability - _http.py wraps JSON parse errors with body preview for debugging - infospace.yaml stages: extract/map=6000 tokens, synthesize=3000 tokens Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -45,6 +45,7 @@ pipeline:
|
||||
output_dir: output/entities
|
||||
output_macro: entities
|
||||
split_entities: true
|
||||
max_tokens: 6000
|
||||
macros:
|
||||
extraction_rules: artifacts/guidelines/extraction-rules.md
|
||||
vsm_framework: artifacts/vsm-reference/vsm-framework.md
|
||||
@@ -52,6 +53,7 @@ pipeline:
|
||||
template: templates/map-to-vsm.md
|
||||
output_dir: output/mappings
|
||||
output_macro: mappings
|
||||
max_tokens: 6000
|
||||
macros:
|
||||
mapping_rules: artifacts/guidelines/mapping-rules.md
|
||||
vsm_framework: artifacts/vsm-reference/vsm-framework.md
|
||||
@@ -59,6 +61,7 @@ pipeline:
|
||||
template: templates/synthesize-analysis.md
|
||||
output_dir: output/analyses
|
||||
output_macro: analysis
|
||||
max_tokens: 3000
|
||||
macros:
|
||||
vsm_framework: artifacts/vsm-reference/vsm-framework.md
|
||||
post_batch:
|
||||
|
||||
@@ -575,7 +575,13 @@ def process(
|
||||
# Run pipeline
|
||||
from markitect.infospace.pipeline import SourcePipeline
|
||||
|
||||
pipeline = SourcePipeline(cfg, root, adapter=adapter, no_commit=no_commit)
|
||||
pipeline = SourcePipeline(
|
||||
cfg, root,
|
||||
adapter=adapter,
|
||||
provider=provider or "",
|
||||
model=(model or _PROVIDER_DEFAULTS.get(provider or "", "")) if provider else "",
|
||||
no_commit=no_commit,
|
||||
)
|
||||
|
||||
total = len(source_files)
|
||||
completed = 0
|
||||
|
||||
@@ -168,6 +168,8 @@ class PipelineStage:
|
||||
macros: Static macros loaded from files (macro name → relative path).
|
||||
spaces: Legacy space IDs for SQLite-based resolver (unused by
|
||||
:class:`SourcePipeline`).
|
||||
max_tokens: Maximum tokens to request from the LLM for this stage.
|
||||
Overrides the pipeline-level default (4096).
|
||||
"""
|
||||
|
||||
template: str
|
||||
@@ -177,6 +179,7 @@ class PipelineStage:
|
||||
split_entities: bool = False
|
||||
macros: Dict[str, str] = field(default_factory=dict)
|
||||
spaces: List[str] = field(default_factory=list)
|
||||
max_tokens: Optional[int] = None
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
d: Dict[str, Any] = {"template": self.template}
|
||||
@@ -192,6 +195,8 @@ class PipelineStage:
|
||||
d["macros"] = self.macros
|
||||
if self.spaces:
|
||||
d["spaces"] = self.spaces
|
||||
if self.max_tokens is not None:
|
||||
d["max_tokens"] = self.max_tokens
|
||||
return d
|
||||
|
||||
@classmethod
|
||||
@@ -204,6 +209,7 @@ class PipelineStage:
|
||||
split_entities=data.get("split_entities", False),
|
||||
macros=data.get("macros", {}),
|
||||
spaces=data.get("spaces", []),
|
||||
max_tokens=data.get("max_tokens"),
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -25,11 +25,17 @@ from __future__ import annotations
|
||||
import re
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import yaml
|
||||
|
||||
from markitect.infospace.config import InfospaceConfig, PipelineStage
|
||||
|
||||
# Default max_tokens when not specified on a stage
|
||||
_DEFAULT_MAX_TOKENS = 4096
|
||||
|
||||
|
||||
class SourcePipeline:
|
||||
"""Processes source files through infospace pipeline stages.
|
||||
@@ -53,11 +59,15 @@ class SourcePipeline:
|
||||
config: InfospaceConfig,
|
||||
root: Path,
|
||||
adapter=None,
|
||||
provider: str = "",
|
||||
model: str = "",
|
||||
no_commit: bool = False,
|
||||
) -> None:
|
||||
self.config = config
|
||||
self.root = root
|
||||
self.adapter = adapter
|
||||
self.provider = provider
|
||||
self.model = model
|
||||
self.no_commit = no_commit
|
||||
|
||||
# ── Public API ────────────────────────────────────────────────────
|
||||
@@ -98,6 +108,7 @@ class SourcePipeline:
|
||||
source_id = source_file.stem
|
||||
source_content = source_file.read_text(encoding="utf-8")
|
||||
stage_outputs: Dict[str, str] = {}
|
||||
stage_logs: List[Dict[str, Any]] = []
|
||||
|
||||
print(f"\nProcessing: {source_id}")
|
||||
print("=" * 60)
|
||||
@@ -107,15 +118,21 @@ class SourcePipeline:
|
||||
return False
|
||||
|
||||
for stage in self.config.pipeline.stages:
|
||||
content = self._run_stage(stage, source_id, source_content, stage_outputs)
|
||||
content, stage_meta = self._run_stage(
|
||||
stage, source_id, source_content, stage_outputs
|
||||
)
|
||||
if stage_meta:
|
||||
stage_logs.append(stage_meta)
|
||||
if content is None:
|
||||
stage_label = stage.name or stage.template
|
||||
print(f"\n Pipeline paused at stage '{stage_label}'.")
|
||||
self._write_processing_log(source_id, stage_logs, success=False)
|
||||
return False
|
||||
if stage.output_macro:
|
||||
stage_outputs[stage.output_macro] = content
|
||||
|
||||
print(f"\n {source_id}: all stages complete.")
|
||||
self._write_processing_log(source_id, stage_logs, success=True)
|
||||
if not self.no_commit:
|
||||
self._git_commit(source_id)
|
||||
|
||||
@@ -156,10 +173,12 @@ class SourcePipeline:
|
||||
source_id: str,
|
||||
source_content: str,
|
||||
stage_outputs: Dict[str, str],
|
||||
) -> Optional[str]:
|
||||
) -> Tuple[Optional[str], Optional[Dict[str, Any]]]:
|
||||
"""Run a single pipeline stage.
|
||||
|
||||
Returns the stage's output content, or None on failure.
|
||||
Returns:
|
||||
``(content, metadata)`` where *content* is the stage output
|
||||
(or ``None`` on failure) and *metadata* is a log dict.
|
||||
"""
|
||||
stage_label = stage.name or stage.template
|
||||
print(f"\n [{stage_label}]")
|
||||
@@ -170,8 +189,8 @@ class SourcePipeline:
|
||||
if output_file and output_file.exists():
|
||||
print(f" Found existing output: {output_file.name}")
|
||||
if stage.split_entities:
|
||||
return self._load_from_view(output_file)
|
||||
return output_file.read_text(encoding="utf-8")
|
||||
return self._load_from_view(output_file), None
|
||||
return output_file.read_text(encoding="utf-8"), None
|
||||
|
||||
# Build macro substitution dict
|
||||
macros = self._build_macros(stage, source_content, stage_outputs)
|
||||
@@ -180,7 +199,7 @@ class SourcePipeline:
|
||||
template_path = self.root / stage.template
|
||||
if not template_path.exists():
|
||||
print(f" ERROR: Template not found: {stage.template}")
|
||||
return None
|
||||
return None, {"stage": stage_label, "error": f"template not found: {stage.template}"}
|
||||
|
||||
template_content = template_path.read_text(encoding="utf-8")
|
||||
prompt = self._resolve_macros(template_content, macros)
|
||||
@@ -195,17 +214,23 @@ class SourcePipeline:
|
||||
# Without an adapter, we cannot generate output
|
||||
if self.adapter is None:
|
||||
print(" No LLM adapter — skipping generation (manual mode).")
|
||||
return None
|
||||
return None, {"stage": stage_label, "error": "no adapter"}
|
||||
|
||||
# Resolve max_tokens: stage config > pipeline default
|
||||
max_tokens = stage.max_tokens if stage.max_tokens is not None else _DEFAULT_MAX_TOKENS
|
||||
|
||||
# Call LLM — with one retry for split_entities stages that return 0 entities
|
||||
max_attempts = 2 if stage.split_entities else 1
|
||||
entity_files: List[Tuple[str, Path]] = []
|
||||
content = None
|
||||
content: Optional[str] = None
|
||||
llm_meta: Dict[str, Any] = {}
|
||||
total_retries = 0
|
||||
|
||||
for attempt in range(max_attempts):
|
||||
content = self._call_llm(prompt, stage_label)
|
||||
content, llm_meta = self._call_llm(prompt, stage_label, max_tokens)
|
||||
if content is None:
|
||||
return None
|
||||
meta = {"stage": stage_label, "retries": total_retries, **llm_meta}
|
||||
return None, meta
|
||||
|
||||
# Save raw response for debugging (overwritten on retry)
|
||||
if output_file:
|
||||
@@ -214,30 +239,35 @@ class SourcePipeline:
|
||||
raw_file.write_text(content, encoding="utf-8")
|
||||
|
||||
if stage.split_entities:
|
||||
entity_files = self._split_and_write_entities(stage, content)
|
||||
entity_files = self._split_and_write_entities(stage, content, source_id)
|
||||
if entity_files:
|
||||
break # Got entities — proceed
|
||||
if attempt < max_attempts - 1:
|
||||
total_retries += 1
|
||||
print(f" No entity delimiters found — retrying ({attempt + 2}/{max_attempts})...")
|
||||
else:
|
||||
print(
|
||||
f" WARNING: No '--- ENTITY: ---' markers found after {max_attempts} attempt(s).\n"
|
||||
f" Check {raw_file.name} to inspect the raw LLM response."
|
||||
)
|
||||
return None # Don't write empty view; allow re-run
|
||||
meta = {"stage": stage_label, "retries": total_retries,
|
||||
"error": "no entity delimiters", **llm_meta}
|
||||
return None, meta # Don't write empty view; allow re-run
|
||||
else:
|
||||
break # Non-split stages don't need retry
|
||||
|
||||
stage_meta: Dict[str, Any] = {"stage": stage_label, "retries": total_retries, **llm_meta}
|
||||
|
||||
# Persist output
|
||||
if stage.split_entities:
|
||||
self._write_entity_view(source_id, entity_files, output_file)
|
||||
return content
|
||||
return content, stage_meta
|
||||
else:
|
||||
if output_file:
|
||||
output_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
output_file.write_text(content, encoding="utf-8")
|
||||
print(f" Output written to {output_file.name}")
|
||||
return content
|
||||
return content, stage_meta
|
||||
|
||||
# ── Output File Resolution ────────────────────────────────────────
|
||||
|
||||
@@ -319,6 +349,7 @@ class SourcePipeline:
|
||||
for f in entities_dir.glob("*.md")
|
||||
if not f.name.endswith("-entities.md")
|
||||
and not f.name.endswith("-prompt.md")
|
||||
and not f.name.endswith("-raw.md")
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
@@ -334,6 +365,7 @@ class SourcePipeline:
|
||||
self,
|
||||
stage: PipelineStage,
|
||||
combined_content: str,
|
||||
source_id: str = "",
|
||||
) -> List[Tuple[str, Path]]:
|
||||
"""Split ``--- ENTITY: <name> ---`` delimited output into files.
|
||||
|
||||
@@ -369,7 +401,11 @@ class SourcePipeline:
|
||||
if file_path.exists():
|
||||
skipped_count += 1
|
||||
else:
|
||||
file_path.write_text(entity_content + "\n", encoding="utf-8")
|
||||
# Prepend provenance comment so the LLM origin is traceable
|
||||
provenance = self._provenance_comment(source_id)
|
||||
file_path.write_text(
|
||||
provenance + entity_content + "\n", encoding="utf-8"
|
||||
)
|
||||
new_count += 1
|
||||
|
||||
entity_files.append((entity_name, file_path))
|
||||
@@ -428,19 +464,99 @@ class SourcePipeline:
|
||||
|
||||
return "\n\n".join(parts) + "\n" if parts else ""
|
||||
|
||||
# ── Provenance & Processing Log ───────────────────────────────────
|
||||
|
||||
def _provenance_comment(self, source_id: str) -> str:
|
||||
"""Return an HTML comment tagging the LLM that generated this content."""
|
||||
date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
|
||||
parts = [f"date={date}", f"source={source_id}"]
|
||||
if self.provider:
|
||||
parts.insert(0, f"provider={self.provider}")
|
||||
if self.model:
|
||||
parts.insert(1, f"model={self.model}")
|
||||
return f"<!-- generated: {' '.join(parts)} -->\n\n"
|
||||
|
||||
def _write_processing_log(
|
||||
self,
|
||||
source_id: str,
|
||||
stage_logs: List[Dict[str, Any]],
|
||||
success: bool,
|
||||
) -> None:
|
||||
"""Append a run record to ``output/processing-log.yaml``."""
|
||||
log_file = self.root / "output" / "processing-log.yaml"
|
||||
log_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Load existing log
|
||||
existing: List[Dict[str, Any]] = []
|
||||
if log_file.is_file():
|
||||
try:
|
||||
raw = yaml.safe_load(log_file.read_text(encoding="utf-8"))
|
||||
if isinstance(raw, list):
|
||||
existing = raw
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Build new entry
|
||||
total_prompt = sum(s.get("prompt_tokens", 0) for s in stage_logs)
|
||||
total_completion = sum(s.get("completion_tokens", 0) for s in stage_logs)
|
||||
total_cost = sum(s.get("cost", 0.0) for s in stage_logs)
|
||||
total_duration = sum(s.get("duration_seconds", 0.0) for s in stage_logs)
|
||||
total_retries = sum(s.get("retries", 0) for s in stage_logs)
|
||||
errors = [s["error"] for s in stage_logs if s.get("error")]
|
||||
|
||||
entry: Dict[str, Any] = {
|
||||
"source_id": source_id,
|
||||
"processed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||
"provider": self.provider,
|
||||
"model": self.model,
|
||||
"success": success,
|
||||
"total_prompt_tokens": total_prompt,
|
||||
"total_completion_tokens": total_completion,
|
||||
"total_cost": round(total_cost, 6),
|
||||
"total_duration_seconds": round(total_duration, 1),
|
||||
"total_retries": total_retries,
|
||||
"stages": stage_logs,
|
||||
}
|
||||
if errors:
|
||||
entry["errors"] = errors
|
||||
|
||||
# Remove previous entry for the same source_id (re-run)
|
||||
existing = [e for e in existing if e.get("source_id") != source_id]
|
||||
existing.append(entry)
|
||||
|
||||
log_file.write_text(
|
||||
yaml.safe_dump(existing, default_flow_style=False, sort_keys=False),
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
# ── LLM Execution ─────────────────────────────────────────────────
|
||||
|
||||
def _call_llm(
|
||||
self, prompt: str, stage_name: str, max_tokens: int = 8192
|
||||
) -> Optional[str]:
|
||||
self, prompt: str, stage_name: str, max_tokens: int = _DEFAULT_MAX_TOKENS
|
||||
) -> Tuple[Optional[str], Dict[str, Any]]:
|
||||
"""Call the LLM adapter with exponential back-off on rate limits.
|
||||
|
||||
Returns the response content string, or None on failure.
|
||||
Returns:
|
||||
``(content, metadata)`` where *content* is the response string
|
||||
(or ``None`` on failure) and *metadata* has provider, model,
|
||||
token counts, cost, finish_reason, duration, and error info.
|
||||
"""
|
||||
from markitect.prompts.execution.models import RunConfig
|
||||
from markitect.llm.exceptions import LLMRateLimitError
|
||||
|
||||
print(f" Calling LLM ({stage_name})...")
|
||||
meta: Dict[str, Any] = {
|
||||
"provider": self.provider,
|
||||
"model": self.model,
|
||||
"prompt_tokens": 0,
|
||||
"completion_tokens": 0,
|
||||
"cost": 0.0,
|
||||
"finish_reason": None,
|
||||
"duration_seconds": 0.0,
|
||||
"error": None,
|
||||
}
|
||||
|
||||
model_label = f"{self.provider}/{self.model}" if self.provider else stage_name
|
||||
print(f" Calling LLM ({model_label})...")
|
||||
t0 = time.time()
|
||||
max_retries = 3
|
||||
response = None
|
||||
@@ -460,29 +576,57 @@ class SourcePipeline:
|
||||
)
|
||||
time.sleep(wait)
|
||||
else:
|
||||
print(f" Rate limit exceeded after {max_retries} retries: {exc}")
|
||||
return None
|
||||
msg = f"Rate limit exceeded after {max_retries} retries: {exc}"
|
||||
print(f" {msg}")
|
||||
meta["error"] = msg
|
||||
meta["duration_seconds"] = round(time.time() - t0, 1)
|
||||
return None, meta
|
||||
except Exception as exc:
|
||||
print(f" LLM error: {exc}")
|
||||
return None
|
||||
msg = str(exc)
|
||||
print(f" LLM error: {msg}")
|
||||
meta["error"] = msg
|
||||
meta["duration_seconds"] = round(time.time() - t0, 1)
|
||||
return None, meta
|
||||
|
||||
if response is None:
|
||||
return None
|
||||
meta["error"] = "no response"
|
||||
return None, meta
|
||||
|
||||
elapsed = time.time() - t0
|
||||
elapsed = round(time.time() - t0, 1)
|
||||
usage = response.usage
|
||||
prompt_tok = usage.get("prompt_tokens", 0)
|
||||
completion_tok = usage.get("completion_tokens", 0)
|
||||
cost = float(usage.get("cost", 0.0))
|
||||
finish_reason = getattr(response, "finish_reason", None) or "unknown"
|
||||
|
||||
meta.update({
|
||||
"prompt_tokens": prompt_tok,
|
||||
"completion_tokens": completion_tok,
|
||||
"cost": cost,
|
||||
"finish_reason": finish_reason,
|
||||
"duration_seconds": elapsed,
|
||||
})
|
||||
|
||||
cost_str = f", cost=${cost:.4f}" if cost > 0 else ""
|
||||
print(
|
||||
f" Done in {elapsed:.1f}s — "
|
||||
f"prompt {usage.get('prompt_tokens', '?')} tok, "
|
||||
f"completion {usage.get('completion_tokens', '?')} tok"
|
||||
f" Done in {elapsed}s — "
|
||||
f"prompt {prompt_tok} tok, completion {completion_tok} tok{cost_str}"
|
||||
)
|
||||
|
||||
if finish_reason == "length":
|
||||
print(
|
||||
f" WARNING: Output truncated at {max_tokens} tokens "
|
||||
f"(finish_reason=length). Consider raising max_tokens for "
|
||||
f"stage '{stage_name}' in infospace.yaml."
|
||||
)
|
||||
|
||||
content = response.content
|
||||
if not content or not content.strip():
|
||||
print(" LLM returned empty content.")
|
||||
return None
|
||||
meta["error"] = "empty response"
|
||||
return None, meta
|
||||
|
||||
return content
|
||||
return content, meta
|
||||
|
||||
# ── Git Integration ───────────────────────────────────────────────
|
||||
|
||||
|
||||
Reference in New Issue
Block a user