Files
markitect-main/markitect/infospace/pipeline.py
tegwick e3e5b8ecc1
Some checks failed
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
feat(infospace): systematic long-text processing — rich commit bodies, per-source eval/classify, chapters view
Three coordinated changes that let the pipeline produce a clean
chapter-by-chapter git history on long texts without archaeology after
the fact.

1. Richer commit messages. `SourcePipeline._git_commit` now diffs the
   staged changes, buckets added files by output subdirectory (entities,
   evaluations, classifications, mappings, analyses, metrics, logs), and
   includes counts in the commit body. So `git log` reads "entities:
   +23, evaluations: +23" per chapter instead of the same generic blurb
   on every commit. Zero behaviour change when no output changed; falls
   back to the original message if the diff query fails.

2. --eval-after-source / --classify-after-source on `infospace process`.
   After a source's stages succeed, the pipeline identifies which entity
   files are *new* (set diff of entity slugs before vs after), loads
   their EntityMeta, and runs per-entity evaluation and/or
   classification scoped to just those slugs before the per-source git
   commit lands. Result: each chapter's commit is self-contained —
   extraction + evaluation + classification in one atomic unit. Gated
   behind explicit flags because the cost is real (LLM latency per
   chapter rather than amortised across one bulk batch).

3. `markitect infospace chapters` subcommand. Lists source files in
   canonical order with entity count, evaluated count, classified
   count, and mean per-entity score per source. Text or JSON output.
   Natural triage surface for long-text infospaces — spot chapters that
   under-extracted or evaluated poorly.

Also: `docs/advanced-usage.md` gets a new "Systematic processing of
long texts" section with the recommended flag combo and the tradeoff
note on cost.

11 new unit tests cover the chapters command (text/json/no-sources),
the process flag wiring (help + provider requirement), and the
commit-body bucket logic. Full infospace+llm unit suite (315 tests)
green; 3 pre-existing infospace failures unchanged.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-22 08:24:26 +02:00

836 lines
32 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Source file processing pipeline for infospace tooling.
Processes source files through the pipeline stages defined in
``infospace.yaml`` using simple ``@{macro}`` template substitution —
no SQLite or space-resolver dependencies.
Example usage from CLI::
markitect infospace process "book-1-chapter-0[1-3].md" \\
--provider openrouter --check-after-each
Or programmatically::
from markitect.infospace.pipeline import SourcePipeline
from markitect.llm import create_adapter
adapter = create_adapter("openrouter")
pipeline = SourcePipeline(config, root, adapter=adapter)
pipeline.process_sources(source_files)
"""
from __future__ import annotations
import re
import subprocess
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import yaml
from markitect.infospace.config import InfospaceConfig, PipelineStage
# Default max_tokens when not specified on a stage
_DEFAULT_MAX_TOKENS = 4096
class SourcePipeline:
    """Processes source files through infospace pipeline stages.

    Each stage in :attr:`config.pipeline.stages` is run in order for
    every source file. Stage outputs are passed as macros to subsequent
    stages via the ``output_macro`` field.

    Template substitution uses ``@{macro_name}`` tokens. Static macros
    are loaded from files listed under ``stage.macros``; dynamic macros
    include ``chapter_text`` (source file content), ``existing_entities``
    (for split-entities stages), and outputs from prior stages.
    """

    # Maps a provider name to a model identifier. Nothing in the visible
    # part of this file reads it — presumably the fallback model used when
    # a provider is given without a model; TODO confirm against callers.
    _PROVIDER_DEFAULTS: Dict[str, str] = {
        "openrouter": "arcee-ai/trinity-large-preview:free",
    }
def __init__(
    self,
    config: InfospaceConfig,
    root: Path,
    adapter=None,
    provider: str = "",
    model: str = "",
    no_commit: bool = False,
    eval_after_source: bool = False,
    classify_after_source: bool = False,
) -> None:
    """Remember the configuration and run-time switches for this pipeline.

    Args:
        config: Infospace configuration; its ``pipeline.stages`` are the
            stages that get executed.
        root: Directory against which templates, macro files and output
            directories are resolved.
        adapter: LLM adapter used to generate stage output. ``None``
            means no generation is possible (prompts are still written).
        provider: Provider label recorded in logs and provenance comments.
        model: Model label recorded in logs and provenance comments.
        no_commit: When true, skip the per-source git commit.
        eval_after_source: When true, evaluate the entities a source
            newly produced before committing.
        classify_after_source: When true, classify the entities a source
            newly produced before committing.
    """
    # Where we work and what we run.
    self.config, self.root = config, root

    # LLM backend plus the labels used to describe it.
    self.adapter, self.provider, self.model = adapter, provider, model

    # Behaviour switches.
    self.no_commit = no_commit
    self.eval_after_source = eval_after_source
    self.classify_after_source = classify_after_source
# ── Public API ────────────────────────────────────────────────────
def process_sources(
    self,
    source_files: List[Path],
    check_after_each: bool = False,
) -> int:
    """Push every source file through the configured pipeline, in order.

    Args:
        source_files: Ordered list of source files to process.
        check_after_each: If True, run the collection checks (which also
            record a metrics snapshot) after each source.

    Returns:
        How many source files completed all of their stages.
    """
    finished = 0
    for path in source_files:
        finished += 1 if self.process_source(path) else 0
        if check_after_each:
            self.run_collection_check()
    return finished
def process_source(self, source_file: Path) -> bool:
    """Run the full pipeline for a single source file.

    Stages run in configuration order; the first stage that yields no
    content stops the run. After a complete run the optional per-source
    evaluation/classification and the git commit are performed.

    Args:
        source_file: Path to the source markdown file.

    Returns:
        True if all stages completed successfully.
    """
    source_id = source_file.stem
    source_content = source_file.read_text(encoding="utf-8")
    outputs_by_macro: Dict[str, str] = {}
    run_log: List[Dict[str, Any]] = []

    # Remember which entity slugs exist before any stage runs; the
    # difference afterwards is the set of entities this source produced,
    # which is what the per-source follow-ups are scoped to.
    slugs_before = self._current_entity_slugs()

    print(f"\nProcessing: {source_id}")
    print("=" * 60)

    pipeline = self.config.pipeline
    if not (pipeline and pipeline.stages):
        print(" No pipeline stages configured.")
        return False

    for stage in pipeline.stages:
        content, stage_meta = self._run_stage(
            stage, source_id, source_content, outputs_by_macro
        )
        if stage_meta:
            run_log.append(stage_meta)

        if content is None:
            print(f"\n Pipeline paused at stage '{stage.name or stage.template}'.")
            self._write_processing_log(source_id, run_log, success=False)
            return False

        # Make this stage's output available to later stages as a macro.
        if stage.output_macro:
            outputs_by_macro[stage.output_macro] = content

    print(f"\n {source_id}: all stages complete.")
    self._write_processing_log(source_id, run_log, success=True)

    # Evaluate and/or classify only what this source added, so that the
    # commit below contains a fully processed chapter.
    fresh_slugs = self._current_entity_slugs() - slugs_before
    wants_followups = self.eval_after_source or self.classify_after_source
    if fresh_slugs and wants_followups:
        self._run_per_source_followups(fresh_slugs)

    if not self.no_commit:
        self._git_commit(source_id)
    return True
def run_collection_check(self) -> None:
    """Run collection-level quality checks (C1–C5) and print metrics.

    Parses every entity under the configured entities directory, runs all
    checks over them, prints each metric and records the results as a
    snapshot. Prints a short notice and returns early when the directory
    is missing or holds no entities.
    """
    from markitect.infospace.entity_parser import parse_entity_directory
    from markitect.infospace.checks import run_all_checks
    from markitect.infospace.history import record_check_results

    entities_dir = self.root / self.config.entities_dir
    if not entities_dir.is_dir():
        print("\n Collection check: no entities directory.")
        return

    entities = parse_entity_directory(entities_dir)
    if not entities:
        print("\n Collection check: no entities found.")
        return

    total = len(entities)
    print(f"\n Collection check ({total} entities):")
    report = run_all_checks(entities=entities)

    metrics = report.metrics()
    for name in sorted(metrics):
        print(f" {name}: {metrics[name]:.4f}")

    snapshot = record_check_results(
        report, self.config, self.root, entity_count=total
    )
    print(f" Snapshot recorded: {snapshot.snapshot_id}")
# ── Stage Execution ───────────────────────────────────────────────
def _run_stage(
    self,
    stage: PipelineStage,
    source_id: str,
    source_content: str,
    stage_outputs: Dict[str, str],
) -> Tuple[Optional[str], Optional[Dict[str, Any]]]:
    """Run a single pipeline stage.

    If the stage's output file already exists it is loaded from disk and
    no LLM call is made. Otherwise the template is resolved, the prompt
    is written next to the output file for inspection, and the LLM is
    called. Split-entities stages get one extra attempt when the response
    contains no entity delimiters.

    Args:
        stage: Stage definition from the pipeline configuration.
        source_id: Stem of the source file; used in output file names.
        source_content: Full text of the source file.
        stage_outputs: Outputs of earlier stages keyed by macro name.

    Returns:
        ``(content, metadata)`` where *content* is the stage output
        (or ``None`` on failure) and *metadata* is a log dict. Metadata
        is ``None`` when the output was loaded from an existing file.
    """
    stage_label = stage.name or stage.template
    print(f"\n [{stage_label}]")
    output_file = self._get_output_file(stage, source_id)

    # Skip-if-exists — load from disk
    if output_file and output_file.exists():
        print(f" Found existing output: {output_file.name}")
        if stage.split_entities:
            return self._load_from_view(output_file), None
        return output_file.read_text(encoding="utf-8"), None

    # Build macro substitution dict
    macros = self._build_macros(stage, source_content, stage_outputs)

    # Load and resolve template
    template_path = self.root / stage.template
    if not template_path.exists():
        print(f" ERROR: Template not found: {stage.template}")
        return None, {"stage": stage_label, "error": f"template not found: {stage.template}"}
    template_content = template_path.read_text(encoding="utf-8")
    prompt = self._resolve_macros(template_content, macros)

    # Write prompt for inspection
    if output_file:
        prompt_file = output_file.parent / f"{source_id}-prompt.md"
        prompt_file.parent.mkdir(parents=True, exist_ok=True)
        prompt_file.write_text(prompt, encoding="utf-8")
        print(f" Prompt written to {prompt_file.name}")

    # Without an adapter, we cannot generate output
    if self.adapter is None:
        print(" No LLM adapter — skipping generation (manual mode).")
        return None, {"stage": stage_label, "error": "no adapter"}

    # Resolve max_tokens: stage config > pipeline default
    max_tokens = stage.max_tokens if stage.max_tokens is not None else _DEFAULT_MAX_TOKENS

    # Call LLM — with one retry for split_entities stages that return 0 entities
    max_attempts = 2 if stage.split_entities else 1
    entity_files: List[Tuple[str, Path]] = []
    content: Optional[str] = None
    llm_meta: Dict[str, Any] = {}
    total_retries = 0
    # Stays None when the stage has no output file. The warning below must
    # not assume a raw response was saved, otherwise it fails with an
    # unbound-name error instead of reporting the real problem.
    raw_file: Optional[Path] = None

    for attempt in range(max_attempts):
        content, llm_meta = self._call_llm(prompt, stage_label, max_tokens)
        if content is None:
            meta = {"stage": stage_label, "retries": total_retries, **llm_meta}
            return None, meta

        # Save raw response for debugging (overwritten on retry)
        if output_file:
            raw_file = output_file.parent / f"{source_id}-{stage.name or 'stage'}-raw.md"
            raw_file.parent.mkdir(parents=True, exist_ok=True)
            raw_file.write_text(content, encoding="utf-8")

        if not stage.split_entities:
            break  # Non-split stages don't need retry

        entity_files = self._split_and_write_entities(stage, content, source_id)
        if entity_files:
            break  # Got entities — proceed

        if attempt < max_attempts - 1:
            total_retries += 1
            print(f" No entity delimiters found — retrying ({attempt + 2}/{max_attempts})...")
            continue

        if raw_file is not None:
            hint = f" Check {raw_file.name} to inspect the raw LLM response."
        else:
            hint = " No raw response was saved (stage has no output file)."
        print(
            f" WARNING: No '--- ENTITY: ---' markers found after {max_attempts} attempt(s).\n"
            + hint
        )
        meta = {"stage": stage_label, "retries": total_retries,
                "error": "no entity delimiters", **llm_meta}
        return None, meta  # Don't write empty view; allow re-run

    stage_meta: Dict[str, Any] = {"stage": stage_label, "retries": total_retries, **llm_meta}

    # Persist output
    if stage.split_entities:
        self._write_entity_view(source_id, entity_files, output_file)
        return content, stage_meta

    if output_file:
        output_file.parent.mkdir(parents=True, exist_ok=True)
        output_file.write_text(content, encoding="utf-8")
        print(f" Output written to {output_file.name}")
    return content, stage_meta
# ── Output File Resolution ────────────────────────────────────────
def _get_output_file(
    self, stage: PipelineStage, source_id: str
) -> Optional[Path]:
    """Work out where a stage writes its output for *source_id*.

    Returns:
        ``<root>/<output_dir>/<source_id>-<output_macro>.md``, or ``None``
        when the stage lacks an output directory or an output macro.
    """
    if stage.output_dir and stage.output_macro:
        filename = f"{source_id}-{stage.output_macro}.md"
        return self.root / stage.output_dir / filename
    return None
# ── Macro Resolution ──────────────────────────────────────────────
def _build_macros(
    self,
    stage: PipelineStage,
    source_content: str,
    stage_outputs: Dict[str, str],
) -> Dict[str, str]:
    """Collect every macro value available to *stage*'s template.

    Sources, in increasing precedence: the source text (``chapter_text``),
    file-backed macros declared on the stage, the list of existing entity
    slugs (split-entities stages only), and outputs of earlier stages.
    """
    macros: Dict[str, str] = {"chapter_text": source_content}

    # File-backed macros; a missing file yields a visible placeholder
    # in the prompt rather than aborting the stage.
    for macro_name, rel_path in stage.macros.items():
        macro_file = self.root / rel_path
        if not macro_file.exists():
            print(f" WARNING: macro file not found: {rel_path}")
            macros[macro_name] = f"(File not found: {rel_path})"
            continue
        macros[macro_name] = macro_file.read_text(encoding="utf-8")

    # Split-entities stages are told which entities already exist.
    if stage.split_entities:
        slugs = self._list_existing_entities(self._get_entities_dir(stage))
        if slugs:
            listing = "\n".join(f"- {s}" for s in slugs)
        else:
            listing = "(none — this is the first source file)"
        macros["existing_entities"] = listing

    # Earlier stage outputs win over anything with the same name above.
    macros.update(stage_outputs)
    return macros
@staticmethod
def _resolve_macros(template: str, macros: Dict[str, str]) -> str:
    """Expand ``@{macro_name}`` tokens in *template* using *macros*.

    Tokens whose name is not a key of *macros* are left untouched. Values
    are inserted literally (a callable replacement is used, so
    backslashes in a value are not treated as regex escapes).
    """
    token = re.compile(r"@\{([^}]+)\}")
    return token.sub(lambda hit: macros.get(hit.group(1), hit.group(0)), template)
# ── Entity Splitting ──────────────────────────────────────────────
def _get_entities_dir(self, stage: Optional[PipelineStage] = None) -> Path:
    """Locate the directory entity files are written to.

    Resolution order: the given stage's ``output_dir``; otherwise the
    ``output_dir`` of the first split-entities stage in the pipeline;
    otherwise ``<root>/output/entities``.
    """
    if stage and stage.output_dir:
        return self.root / stage.output_dir

    pipeline = self.config.pipeline
    if pipeline:
        first_dir = next(
            (s.output_dir for s in pipeline.stages if s.split_entities and s.output_dir),
            None,
        )
        if first_dir is not None:
            return self.root / first_dir

    return self.root / "output" / "entities"
def _list_existing_entities(self, entities_dir: Path) -> List[str]:
    """List the slugs of the entity files in *entities_dir*, sorted.

    Only ``*.md`` files count, and the per-source helper files that live
    in the same directory (``-entities.md`` views, ``-prompt.md`` and
    ``-raw.md``) are left out. A missing directory gives an empty list.
    """
    if not entities_dir.is_dir():
        return []

    helper_suffixes = ("-entities.md", "-prompt.md", "-raw.md")
    slugs = [
        md.stem
        for md in entities_dir.glob("*.md")
        if not md.name.endswith(helper_suffixes)
    ]
    slugs.sort()
    return slugs
@staticmethod
def _normalize_entity_name(name: str) -> str:
    """Turn an entity name into a kebab-case filename stem.

    The name is lower-cased and trimmed, underscores and spaces become
    hyphens, every character outside ``a-z``, ``0-9`` and ``-`` is
    dropped, hyphen runs collapse to one, and edge hyphens are removed.
    The result may be empty.
    """
    hyphenated = name.lower().strip().translate(str.maketrans({"_": "-", " ": "-"}))
    cleaned = re.sub(r"[^a-z0-9-]", "", hyphenated)
    collapsed = re.sub(r"-{2,}", "-", cleaned)
    return collapsed.strip("-")
def _split_and_write_entities(
    self,
    stage: PipelineStage,
    combined_content: str,
    source_id: str = "",
) -> List[Tuple[str, Path]]:
    """Break ``--- ENTITY: <name> ---`` delimited output into entity files.

    Each entity goes to ``<output_dir>/<slug>.md`` with a provenance
    comment in front. A file that already exists is never overwritten
    (first occurrence wins). Names that normalise to an empty slug are
    ignored.

    Returns:
        ``(entity_name, file_path)`` for every entity found in
        *combined_content*, whether it was written now or already existed.
    """
    entities_dir = self._get_entities_dir(stage)
    entities_dir.mkdir(parents=True, exist_ok=True)

    # With one capture group, re.split yields
    # [preamble, name1, body1, name2, body2, ...].
    pieces = re.split(
        r"^---\s*ENTITY:\s*(.+?)\s*---\s*$",
        combined_content,
        flags=re.MULTILINE,
    )

    found: List[Tuple[str, Path]] = []
    written = 0
    reused = 0
    for entity_name, raw_body in zip(pieces[1::2], pieces[2::2]):
        slug = self._normalize_entity_name(entity_name)
        if not slug:
            continue

        target = entities_dir / f"{slug}.md"
        if target.exists():
            reused += 1
        else:
            # Prepend provenance comment so the LLM origin is traceable
            header = self._provenance_comment(source_id)
            target.write_text(header + raw_body.strip() + "\n", encoding="utf-8")
            written += 1
        found.append((entity_name, target))

    summary = f" {written} new entities written"
    if reused:
        summary += f", {reused} pre-existing (skipped)"
    print(summary)
    return found
def _write_entity_view(
    self,
    source_id: str,
    entity_files: List[Tuple[str, Path]],
    view_file: Optional[Path],
) -> None:
    """Write the per-source view: one ``{{ include }}`` line per entity.

    Consecutive entities are separated by a ``---`` rule; nothing follows
    the last one. Does nothing when *view_file* is ``None``.
    """
    if view_file is None:
        return

    lines = [f"# Entities: {source_id}\n"]
    for position, (_name, entity_path) in enumerate(entity_files):
        # Separator goes between entries, never after the final one.
        if position:
            lines.extend(["---", ""])
        lines.extend([f'{{{{ include "{entity_path.name}" }}}}', ""])

    view_file.parent.mkdir(parents=True, exist_ok=True)
    view_file.write_text("\n".join(lines) + "\n", encoding="utf-8")
    print(f" View written to {view_file.name}")
def _load_from_view(self, view_file: Path) -> str:
    """Rebuild the combined entity text that a view file refers to.

    Every ``{{ include "filename.md" }}`` directive is resolved relative
    to the view's own directory. Files that exist are emitted as
    ``--- ENTITY: <slug> ---`` sections for downstream stages; missing
    ones are skipped.

    Returns:
        The sections joined by blank lines with a trailing newline, or
        an empty string when no included file exists.
    """
    directive = re.compile(r'\{\{\s*include\s+"([^"]+)"\s*\}\}')
    base_dir = view_file.parent

    sections: List[str] = []
    for included in directive.findall(view_file.read_text(encoding="utf-8")):
        entity_path = base_dir / included
        if not entity_path.exists():
            continue
        text = entity_path.read_text(encoding="utf-8").strip()
        sections.append(f"--- ENTITY: {entity_path.stem} ---\n\n{text}")

    if not sections:
        return ""
    return "\n\n".join(sections) + "\n"
# ── Provenance & Processing Log ───────────────────────────────────
def _provenance_comment(self, source_id: str) -> str:
    """Build the HTML comment that records how an entity was generated.

    The comment carries the current UTC date and the source id, plus the
    provider and model labels when they are set. It ends with a blank
    line so it can be prepended directly to the entity body.
    """
    date_tag = f"date={datetime.now(timezone.utc).strftime('%Y-%m-%d')}"
    source_tag = f"source={source_id}"
    model_tag = [f"model={self.model}"] if self.model else []

    # The model always sits in second position: after the provider when
    # there is one, otherwise after the date.
    if self.provider:
        tags = [f"provider={self.provider}", *model_tag, date_tag, source_tag]
    else:
        tags = [date_tag, *model_tag, source_tag]

    return "<!-- generated: " + " ".join(tags) + " -->\n\n"
def _write_processing_log(
    self,
    source_id: str,
    stage_logs: List[Dict[str, Any]],
    success: bool,
) -> None:
    """Append a run record to ``output/processing-log.yaml``.

    The log is a YAML list with one entry per source id; a re-run
    replaces the earlier entry for the same source. Token counts, cost,
    duration and retries are summed over *stage_logs*.

    Args:
        source_id: Identifier of the processed source.
        stage_logs: Per-stage metadata dicts collected during the run.
        success: Whether every stage completed.
    """
    log_file = self.root / "output" / "processing-log.yaml"
    log_file.parent.mkdir(parents=True, exist_ok=True)

    # Load existing log. This stays best-effort, but an unreadable log is
    # reported: the file is rewritten below, so silently ignoring the
    # failure would drop the earlier history without a trace.
    existing: List[Dict[str, Any]] = []
    if log_file.is_file():
        try:
            raw = yaml.safe_load(log_file.read_text(encoding="utf-8"))
        except (OSError, ValueError, yaml.YAMLError) as exc:
            print(
                f" Warning: could not read {log_file.name} ({exc}); "
                "starting a new processing log."
            )
        else:
            if isinstance(raw, list):
                existing = raw

    # Build new entry
    total_prompt = sum(s.get("prompt_tokens", 0) for s in stage_logs)
    total_completion = sum(s.get("completion_tokens", 0) for s in stage_logs)
    total_cost = sum(s.get("cost", 0.0) for s in stage_logs)
    total_duration = sum(s.get("duration_seconds", 0.0) for s in stage_logs)
    total_retries = sum(s.get("retries", 0) for s in stage_logs)
    errors = [s["error"] for s in stage_logs if s.get("error")]

    # Key order is preserved on disk (sort_keys=False below).
    entry: Dict[str, Any] = {
        "source_id": source_id,
        "processed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "provider": self.provider,
        "model": self.model,
        "success": success,
        "total_prompt_tokens": total_prompt,
        "total_completion_tokens": total_completion,
        "total_cost": round(total_cost, 6),
        "total_duration_seconds": round(total_duration, 1),
        "total_retries": total_retries,
        "stages": stage_logs,
    }
    if errors:
        entry["errors"] = errors

    # Remove previous entry for the same source_id (re-run)
    existing = [e for e in existing if e.get("source_id") != source_id]
    existing.append(entry)

    log_file.write_text(
        yaml.safe_dump(existing, default_flow_style=False, sort_keys=False),
        encoding="utf-8",
    )
# ── LLM Execution ─────────────────────────────────────────────────
def _call_llm(
    self, prompt: str, stage_name: str, max_tokens: int = _DEFAULT_MAX_TOKENS
) -> Tuple[Optional[str], Dict[str, Any]]:
    """Call the LLM adapter, retrying failed calls with a growing delay.

    Up to three retries are made. The wait grows linearly with the
    attempt number: 15s, 30s, 45s after a rate-limit error and 5s, 10s,
    15s after any other error.

    Args:
        prompt: Fully resolved prompt text.
        stage_name: Stage label used in progress and warning messages.
        max_tokens: Completion token limit passed to the adapter.

    Returns:
        ``(content, metadata)`` where *content* is the response string
        (or ``None`` on failure) and *metadata* has provider, model,
        token counts, cost, finish_reason, duration, and error info.
    """
    from markitect.prompts.execution.models import RunConfig
    from markitect.llm.exceptions import LLMRateLimitError

    meta: Dict[str, Any] = {
        "provider": self.provider,
        "model": self.model,
        "prompt_tokens": 0,
        "completion_tokens": 0,
        "cost": 0.0,
        "finish_reason": None,
        "duration_seconds": 0.0,
        "error": None,
    }

    model_label = f"{self.provider}/{self.model}" if self.provider else stage_name
    print(f" Calling LLM ({model_label})...")

    t0 = time.time()
    max_retries = 3
    response = None
    for attempt in range(max_retries + 1):
        try:
            response = self.adapter.execute_prompt(
                prompt, RunConfig(max_tokens=max_tokens)
            )
            break
        except LLMRateLimitError as exc:
            if attempt < max_retries:
                wait = 15 * (attempt + 1)
                print(
                    f" Rate limited — retrying in {wait}s "
                    f"(attempt {attempt + 1}/{max_retries})..."
                )
                time.sleep(wait)
            else:
                msg = f"Rate limit exceeded after {max_retries} retries: {exc}"
                print(f" {msg}")
                meta["error"] = msg
                meta["duration_seconds"] = round(time.time() - t0, 1)
                return None, meta
        except Exception as exc:
            # Deliberately broad: any adapter failure is retried, and the
            # last one is reported through the metadata rather than raised.
            msg = str(exc)
            print(f" LLM error: {msg}")
            if attempt < max_retries:
                wait = 5 * (attempt + 1)
                print(f" Retrying in {wait}s (attempt {attempt + 1}/{max_retries})...")
                time.sleep(wait)
            else:
                meta["error"] = msg
                meta["duration_seconds"] = round(time.time() - t0, 1)
                return None, meta

    elapsed = round(time.time() - t0, 1)
    if response is None:
        # The adapter returned nothing without raising; record the time
        # spent so the processing log stays accurate on this path too.
        meta["error"] = "no response"
        meta["duration_seconds"] = elapsed
        return None, meta

    # Tolerate adapters that report no usage, or null counters.
    usage = getattr(response, "usage", None) or {}
    prompt_tok = usage.get("prompt_tokens") or 0
    completion_tok = usage.get("completion_tokens") or 0
    cost = float(usage.get("cost") or 0.0)
    finish_reason = getattr(response, "finish_reason", None) or "unknown"

    meta.update({
        "prompt_tokens": prompt_tok,
        "completion_tokens": completion_tok,
        "cost": cost,
        "finish_reason": finish_reason,
        "duration_seconds": elapsed,
    })

    cost_str = f", cost=${cost:.4f}" if cost > 0 else ""
    print(
        f" Done in {elapsed}s — "
        f"prompt {prompt_tok} tok, completion {completion_tok} tok{cost_str}"
    )

    if finish_reason == "length":
        print(
            f" WARNING: Output truncated at {max_tokens} tokens "
            f"(finish_reason=length). Consider raising max_tokens for "
            f"stage '{stage_name}' in infospace.yaml."
        )

    content = response.content
    if not content or not content.strip():
        print(" LLM returned empty content.")
        meta["error"] = "empty response"
        return None, meta

    return content, meta
# ── Git Integration ───────────────────────────────────────────────
def _git_commit(self, source_id: str) -> None:
    """Stage all output changes and commit them for *source_id*.

    The commit message body summarises what actually changed — counts
    of entities / evaluations / classifications / analyses added — so
    ``git log`` reads like the chapter-by-chapter story of the
    infospace growing, not a wall of identical messages.

    Committing is best-effort: every failure is reported as a warning
    and never raised, so a git problem cannot undo a successful run.
    """
    output_dir = self.root / "output"
    try:
        subprocess.run(
            ["git", "add", str(output_dir)],
            cwd=str(self.root),
            check=True,
            capture_output=True,
        )
        body = self._compose_commit_body(source_id)
        # No check=True here: "nothing to commit" exits non-zero and is
        # an expected outcome, told apart from real failures below.
        result = subprocess.run(
            [
                "git", "commit", "-m",
                f"infospace: process {source_id}\n\n{body}",
            ],
            cwd=str(self.root),
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        stderr = e.stderr.decode() if isinstance(e.stderr, bytes) else (e.stderr or "")
        print(f" Warning: Git error: {stderr.strip()}")
        return
    except OSError as e:
        # git is not installed, or the working directory is unusable.
        print(f" Warning: Git error: {e}")
        return

    if result.returncode == 0:
        print(f" Git commit: process {source_id}")
        return

    output = result.stdout + result.stderr
    if "nothing to commit" in output or "nothing added" in output:
        print(f" Git: nothing to commit for {source_id}")
    else:
        print(f" Warning: Git commit failed: {output.strip()}")
# ── Per-source helpers ────────────────────────────────────────────
def _current_entity_slugs(self) -> set:
    """Return the set of entity file stems currently on disk.

    Per-source helper files that share the entities directory — the
    ``-entities.md`` views and the ``-prompt.md`` / ``-raw.md`` dumps —
    are excluded, using the same suffix rule as
    ``_list_existing_entities``. Otherwise every processed source would
    look as if it had added new entities even when it produced none.

    Returns:
        Stems of the ``*.md`` entity files, or an empty set when the
        configured entities directory does not exist.
    """
    entities_dir = self.root / self.config.entities_dir
    if not entities_dir.is_dir():
        return set()
    helper_suffixes = ("-entities.md", "-prompt.md", "-raw.md")
    return {
        p.stem
        for p in entities_dir.glob("*.md")
        if not p.name.endswith(helper_suffixes)
    }
def _run_per_source_followups(self, new_slugs: set) -> None:
    """Evaluate and/or classify the entities named in *new_slugs*.

    Runs after a source's stages succeed and before its git commit, so
    that the commit carries everything derived from that source. Which
    steps run is controlled by ``eval_after_source`` and
    ``classify_after_source``. A failing step is reported as a warning
    and does not stop the other step.
    """
    from markitect.infospace.entity_parser import parse_entity_directory

    parsed = parse_entity_directory(self.root / self.config.entities_dir)
    new_entities = [entity for entity in parsed if entity.slug in new_slugs]
    if not new_entities:
        return

    if self.adapter is None:
        print(
            " Skipping per-source eval/classify: no LLM adapter "
            "configured (run with --provider)."
        )
        return

    from markitect.prompts.execution.models import RunConfig

    # One run configuration shared by both follow-up steps.
    run_config = RunConfig(
        model_name=self.model or None, temperature=0.3, max_tokens=2000
    )
    count = len(new_entities)

    if self.eval_after_source:
        from markitect.infospace.evaluate import run_entity_evaluation

        print(f" Evaluating {count} new entity/entities…")
        try:
            run_entity_evaluation(
                config=self.config,
                entities=new_entities,
                adapter=self.adapter,
                run_config=run_config,
                output_dir=self.root / self.config.evaluations_dir,
            )
        except Exception as exc:
            print(f" Warning: per-source evaluation failed: {exc}")

    if self.classify_after_source:
        from markitect.infospace.classifier import run_entity_classification

        print(f" Classifying {count} new entity/entities…")
        try:
            run_entity_classification(
                config=self.config,
                entities=new_entities,
                adapter=self.adapter,
                run_config=run_config,
                output_dir=self.root / self.config.classifications_dir,
            )
        except Exception as exc:
            print(f" Warning: per-source classification failed: {exc}")
def _compose_commit_body(self, source_id: str) -> str:
    """Summarise staged output changes into a commit-message body.

    Counts added files per output subdirectory (entities, evaluations,
    classifications, analyses, mappings…) and produces one line per
    bucket that actually saw additions. Modified/deleted files are
    noted separately for auditability.

    Args:
        source_id: Source being committed (not used in the summary).

    Returns:
        The summary lines, or a generic default message when nothing
        relevant is staged or the diff cannot be queried.
    """
    default = "Extract entities, map to VSM, and synthesize analysis."
    try:
        result = subprocess.run(
            # --relative reports paths relative to the working directory
            # instead of the repository root, so bucketing also works
            # when the infospace lives in a subdirectory of the repo.
            ["git", "diff", "--cached", "--name-status", "--relative",
             "--", "output"],
            cwd=str(self.root),
            check=True,
            capture_output=True,
            text=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError: git is not installed or the directory is unusable.
        # Same fallback as a failing diff.
        return default

    added_by_bucket: Dict[str, int] = {}
    modified = 0
    deleted = 0
    for line in result.stdout.splitlines():
        # Each line is "<status>\t<path>"; the last field is the path.
        parts = line.split("\t")
        if len(parts) < 2:
            continue
        status = parts[0]
        path = parts[-1]
        if status.startswith("A"):
            bucket = self._bucket_for(path)
            if bucket:
                added_by_bucket[bucket] = added_by_bucket.get(bucket, 0) + 1
        elif status.startswith("M"):
            modified += 1
        elif status.startswith("D"):
            deleted += 1

    if not added_by_bucket and not modified and not deleted:
        return default

    # Emit buckets in a deterministic, reader-friendly order.
    order = ["entities", "mappings", "analyses", "evaluations",
             "classifications", "metrics", "logs", "other"]
    lines: List[str] = [
        f"- {bucket}: +{added_by_bucket[bucket]}"
        for bucket in order
        if added_by_bucket.get(bucket, 0)
    ]
    if modified:
        lines.append(f"- modified: {modified}")
    if deleted:
        lines.append(f"- deleted: {deleted}")
    return "\n".join(lines) if lines else default
def _bucket_for(self, path: str) -> Optional[str]:
    """Name the commit-summary bucket an ``output/...`` path belongs to.

    The first directory below ``output`` decides the bucket. The
    configured entities/evaluations/classifications directory basenames
    are honoured first (so non-default layouts still bucket correctly),
    then a few well-known directory names; anything else is ``"other"``.

    Returns:
        The bucket name, or ``None`` when *path* is not below ``output``.
    """
    segments = Path(path).parts
    if len(segments) < 2 or segments[0] != "output":
        return None

    # Well-known names map to themselves; configured basenames are added
    # afterwards so that they take precedence on a clash.
    lookup = {name: name for name in ("mappings", "analyses", "metrics", "logs")}
    lookup.update({
        Path(self.config.entities_dir).name: "entities",
        Path(self.config.evaluations_dir).name: "evaluations",
        Path(self.config.classifications_dir).name: "classifications",
    })
    return lookup.get(segments[1], "other")