""" Per-entity evaluation pipeline. Builds prompts from entity metadata and delegates LLM evaluation to the :class:`BatchEvaluator`. Writes structured results to the evaluations directory. """ from __future__ import annotations import hashlib from datetime import datetime from pathlib import Path from typing import Callable, Dict, List, Optional from markitect.infospace.config import InfospaceConfig from markitect.infospace.evaluation import EntityEvaluation, ScoreEntry from markitect.infospace.evaluation_io import write_entity_evaluation from markitect.infospace.models import EntityMeta from markitect.prompts.execution.batch import BatchEvaluator, BatchItem, BatchSummary from markitect.prompts.execution.llm_adapter import LLMAdapter from markitect.prompts.execution.models import RunConfig _DEFAULT_DIMENSIONS = [ "definition_precision", "source_grounding", "domain_placement", "vsm_relevance", "explanatory_value", ] _PROMPT_TEMPLATE = """\ You are evaluating an entity from an infospace about "{topic}". ## Entity: {title} **Slug:** {slug} **Domain:** {domain} **Source chapter:** {source_chapter} ### Definition {definition} ### Context {context} ## Background This infospace maps concepts from the source corpus to Stafford Beer's Viable System Model (VSM). The VSM has five systems: S1 (primary operations), S2 (coordination/anti-oscillation), S3 (internal regulation/audit), S4 (intelligence/environmental adaptation), S5 (identity/policy). Use this to assess whether the entity has a natural VSM home. ## Dimensions - **definition_precision**: Is the definition precise and non-circular? \ Does it capture a distinct concept rather than a vague umbrella term? - **source_grounding**: Is this entity grounded in the actual source text, \ or does it introduce concepts the source does not clearly state? - **domain_placement**: Is the economic/thematic domain assignment correct? \ Does the entity belong in a different conceptual category? - **vsm_relevance**: Does this entity map naturally to one or more VSM \ systems (S1–S5), or is it VSM-neutral/too abstract to place? - **explanatory_value**: Does this entity add genuine explanatory power — \ illuminating a mechanism or structural relation — or does it merely name \ a surface phenomenon? ## Instructions Rate each dimension 1–5 (1 = poor, 5 = excellent). Provide a brief rationale (1–2 sentences) for each score. ## Output format DIMENSION: SCORE: <1-5> RATIONALE: Repeat for each dimension. """ def build_evaluation_prompt( entity: EntityMeta, topic: str, dimensions: Optional[List[str]] = None, ) -> str: """Build an evaluation prompt for a single entity.""" dims = dimensions or _DEFAULT_DIMENSIONS dims_list = "\n".join(f"- {d}" for d in dims) return _PROMPT_TEMPLATE.format( topic=topic, title=entity.title, slug=entity.slug, domain=entity.domain or "(unspecified)", source_chapter=entity.source_chapter or "(unspecified)", definition=entity.definition or "(no definition)", context=entity.context or "(no context)", dimensions_list=dims_list, ) def content_digest(entity: EntityMeta) -> str: """Compute a content digest for incremental evaluation.""" content = f"{entity.slug}:{entity.definition}:{entity.context}:{entity.domain}" return hashlib.sha256(content.encode()).hexdigest()[:16] def parse_evaluation_response( response_text: str, dimensions: Optional[List[str]] = None, ) -> List[ScoreEntry]: """Parse structured dimension scores from LLM response text. Expects blocks of:: DIMENSION: SCORE: <1-5> RATIONALE: """ dims = dimensions or _DEFAULT_DIMENSIONS scores: List[ScoreEntry] = [] current_dim = None current_score = None current_rationale = "" for line in response_text.splitlines(): stripped = line.strip() if stripped.upper().startswith("DIMENSION:"): # Flush previous if current_dim is not None and current_score is not None: scores.append(ScoreEntry( name=current_dim, value=current_score, max_value=5.0, rationale=current_rationale.strip(), )) current_dim = stripped.split(":", 1)[1].strip() current_score = None current_rationale = "" elif stripped.upper().startswith("SCORE:"): try: current_score = float(stripped.split(":", 1)[1].strip()) except ValueError: current_score = None elif stripped.upper().startswith("RATIONALE:"): current_rationale = stripped.split(":", 1)[1].strip() elif current_dim is not None and current_score is not None: # Continuation of rationale if stripped: current_rationale += " " + stripped # Flush last if current_dim is not None and current_score is not None: scores.append(ScoreEntry( name=current_dim, value=current_score, max_value=5.0, rationale=current_rationale.strip(), )) return scores def run_entity_evaluation( config: InfospaceConfig, entities: List[EntityMeta], adapter: LLMAdapter, run_config: Optional[RunConfig] = None, output_dir: Optional[Path] = None, previous_digests: Optional[Dict[str, str]] = None, progress_callback: Optional[Callable] = None, dimensions: Optional[List[str]] = None, ) -> BatchSummary: """Run per-entity evaluation using the batch evaluator. Evaluation files are written **incrementally** after each successful result, so a long run is resumable and safe to interrupt. Args: config: The infospace configuration. entities: Entities to evaluate. adapter: LLM adapter for evaluation. run_config: LLM execution configuration. output_dir: Where to write evaluation results. Defaults to ``config.evaluations_dir`` relative to CWD. previous_digests: ``{slug: digest}`` for incremental skip. progress_callback: Called after each item. dimensions: Custom evaluation dimensions. Returns: A :class:`BatchSummary` with per-entity results. """ topic = config.topic.name evaluations_path = output_dir or Path(config.evaluations_dir) evaluator_name = (run_config.model_name if run_config else "unknown") def _write_and_notify(done: int, total: int, result) -> None: # Write file immediately on success (incremental — run is resumable) if result.status == "success" and result.response is not None: scores = parse_evaluation_response(result.response.content, dimensions) evaluation = EntityEvaluation( entity_slug=result.key, evaluator=evaluator_name, scores=scores, evaluated_at=datetime.utcnow(), ) eval_path = evaluations_path / f"{result.key}.md" write_entity_evaluation(evaluation, eval_path) if progress_callback is not None: progress_callback(done, total, result) items = [ BatchItem( key=entity.slug, prompt=build_evaluation_prompt(entity, topic, dimensions), content_digest=content_digest(entity), metadata={"source_path": entity.source_path}, ) for entity in entities ] evaluator = BatchEvaluator( adapter=adapter, config=run_config, progress_callback=_write_and_notify, previous_digests=previous_digests, ) return evaluator.evaluate(items)