""" Per-entity classification pipeline for L2 typed entities. Builds a concise LLM prompt asking the model to assign an Entity Type and a VSM System to each entity, then parses the structured response. Batch execution mirrors the evaluate.py pattern: incremental file writing makes long runs safe to interrupt and resume. """ from __future__ import annotations from datetime import datetime from pathlib import Path from typing import Callable, List, Optional from markitect.infospace.classification import ( ENTITY_TYPES, VSM_SYSTEMS, EntityClassification, ) from markitect.infospace.classification_io import write_entity_classification from markitect.infospace.config import InfospaceConfig from markitect.infospace.models import EntityMeta from markitect.prompts.execution.batch import BatchEvaluator, BatchItem, BatchSummary from markitect.prompts.execution.llm_adapter import LLMAdapter from markitect.prompts.execution.models import RunConfig # ── Type and VSM system descriptions ───────────────────────────────────────── _TYPE_DEFS = { "Element": ( "a stock, agent, artifact, or institution that persists — a noun, " "something that exists independently (e.g. Capital Stock, Corn, Colony)" ), "Process": ( "a flow, activity, or transformation with duration — something that " "happens rather than exists (e.g. Division of Labour, Credit Extension, Trade)" ), "Relation": ( "a structural dependency or causal link between two entities — a connector " "or mechanism (e.g. Rent determined by Price; Wages bounded by Profit)" ), "Principle": ( "an abstract law or invariant that holds across contexts — a rule or " "theoretical claim (e.g. Comparative Advantage, Diminishing Returns)" ), "Institution": ( "a socially constructed rule system, norm, or governance structure " "(e.g. Banking System, Apprenticeship Law, Taille)" ), } _VSM_DEFS = { "S1": "Primary operations — productive activities (agricultural labour, manufacturing, carrying trade)", "S2": "Coordination — anti-oscillation, price signals (market price, natural price, wages)", "S3": "Management — resource allocation, operational control (capital allocation, taxation, banking)", "S3*": "Audit — inspection, compliance, integrity (customs enforcement, assay, coinage)", "S4": "Intelligence — adaptation, environment scanning (invisible hand, foreign trade analysis)", "S5": "Policy — identity, ultimate authority, purpose (political economy systems, public debt policy)", } _PROMPT_TEMPLATE = """\ You are classifying an entity from an infospace about "{topic}". Your task: assign exactly one **Entity Type** and one **VSM System** to the entity, \ then give a one-sentence rationale for each choice. ## Entity: {title} **Domain:** {domain} **Source chapter:** {source_chapter} ### Definition {definition} ### Context {context} --- ## Entity Types — choose exactly one - **Element** — {type_Element} - **Process** — {type_Process} - **Relation** — {type_Relation} - **Principle** — {type_Principle} - **Institution** — {type_Institution} ## VSM Systems — choose exactly one - **S1** — {vsm_S1} - **S2** — {vsm_S2} - **S3** — {vsm_S3} - **S3*** — {vsm_S3s} - **S4** — {vsm_S4} - **S5** — {vsm_S5} --- ## Instructions 1. Read the definition and context carefully. 2. Choose the **most appropriate** Entity Type. When uncertain between two, \ pick the type that best reflects the entity's primary role in the argument. 3. Choose the **most appropriate** VSM System. An entity may relate to multiple \ systems — assign the one where it does its primary work. 4. Write one sentence of rationale for each, grounded in the definition above. 5. Use **exactly** the output format below — no preamble, no extra lines. ## Output format TYPE: VSM: TYPE_RATIONALE: VSM_RATIONALE: """ # ── Prompt builder ──────────────────────────────────────────────────────────── def build_classification_prompt(entity: EntityMeta, topic: str) -> str: """Build a classification prompt for a single entity.""" return _PROMPT_TEMPLATE.format( topic=topic, title=entity.title, domain=entity.domain or "(unspecified)", source_chapter=entity.source_chapter or "(unspecified)", definition=entity.definition or "(no definition provided)", context=entity.context or "(no context provided)", type_Element=_TYPE_DEFS["Element"], type_Process=_TYPE_DEFS["Process"], type_Relation=_TYPE_DEFS["Relation"], type_Principle=_TYPE_DEFS["Principle"], type_Institution=_TYPE_DEFS["Institution"], vsm_S1=_VSM_DEFS["S1"], vsm_S2=_VSM_DEFS["S2"], vsm_S3=_VSM_DEFS["S3"], vsm_S3s=_VSM_DEFS["S3*"], vsm_S4=_VSM_DEFS["S4"], vsm_S5=_VSM_DEFS["S5"], ) # ── Response parser ─────────────────────────────────────────────────────────── def parse_classification_response(text: str) -> dict: """Parse TYPE/VSM/TYPE_RATIONALE/VSM_RATIONALE from an LLM response. Returns a dict with keys: entity_type, vsm_system, type_rationale, vsm_rationale. Values are None / empty string if not found. """ result: dict = { "entity_type": None, "vsm_system": None, "type_rationale": "", "vsm_rationale": "", } for line in text.splitlines(): stripped = line.strip() upper = stripped.upper() if upper.startswith("TYPE_RATIONALE:"): result["type_rationale"] = stripped.split(":", 1)[1].strip() elif upper.startswith("VSM_RATIONALE:"): result["vsm_rationale"] = stripped.split(":", 1)[1].strip() elif upper.startswith("TYPE:"): raw = stripped.split(":", 1)[1].strip() # Case-insensitive match against controlled vocabulary for t in ENTITY_TYPES: if t.lower() == raw.lower(): result["entity_type"] = t break else: result["entity_type"] = raw # keep raw if unrecognised elif upper.startswith("VSM:"): raw = stripped.split(":", 1)[1].strip() for v in VSM_SYSTEMS: if v.lower() == raw.lower(): result["vsm_system"] = v break else: result["vsm_system"] = raw return result # ── Batch runner ────────────────────────────────────────────────────────────── def run_entity_classification( config: InfospaceConfig, entities: List[EntityMeta], adapter: LLMAdapter, run_config: Optional[RunConfig] = None, output_dir: Optional[Path] = None, progress_callback: Optional[Callable] = None, ) -> BatchSummary: """Run per-entity classification using the batch evaluator. Classification files are written **incrementally** after each successful result, so a long run is resumable and safe to interrupt. Args: config: The infospace configuration. entities: Entities to classify. adapter: LLM adapter. run_config: LLM execution configuration. output_dir: Where to write classification results. Defaults to ``config.classifications_dir`` relative to CWD. progress_callback: Called after each item with (done, total, result). Returns: A :class:`BatchSummary` with per-entity results. """ topic = config.topic.name cls_path = output_dir or Path(config.classifications_dir) classifier_name = (run_config.model_name if run_config else "unknown") def _write_and_notify(done: int, total: int, result) -> None: if result.status == "success" and result.response is not None: parsed = parse_classification_response(result.response.content) entity_type = parsed["entity_type"] or "Unknown" vsm_system = parsed["vsm_system"] or "Unknown" classification = EntityClassification( entity_slug=result.key, entity_type=entity_type, vsm_system=vsm_system, type_rationale=parsed["type_rationale"], vsm_rationale=parsed["vsm_rationale"], classified_by=classifier_name, classified_at=datetime.utcnow(), ) dest = cls_path / f"{result.key}.md" write_entity_classification(classification, dest) if progress_callback is not None: progress_callback(done, total, result) items = [ BatchItem( key=entity.slug, prompt=build_classification_prompt(entity, topic), ) for entity in entities ] evaluator = BatchEvaluator( adapter=adapter, config=run_config, progress_callback=_write_and_notify, ) return evaluator.evaluate(items)