"""Per-entity classification pipeline for L2 typed entities.

Builds a concise LLM prompt asking the model to assign an Entity Type and a
VSM System to each entity, then parses the structured response.

Batch execution mirrors the evaluate.py pattern: incremental file writing
makes long runs safe to interrupt and resume.
"""

from __future__ import annotations

from datetime import datetime, timezone
from pathlib import Path
from typing import Callable, List, Optional

from markitect.infospace.classification import (
    ENTITY_TYPES,
    VSM_SYSTEMS,
    EntityClassification,
)
from markitect.infospace.classification_io import write_entity_classification
from markitect.infospace.config import InfospaceConfig
from markitect.infospace.models import EntityMeta
from markitect.prompts.execution.batch import BatchEvaluator, BatchItem, BatchSummary
from markitect.prompts.execution.llm_adapter import LLMAdapter
from markitect.prompts.execution.models import RunConfig

# ── Type and VSM system descriptions ─────────────────────────────────────────

# One-line gloss per entity type, interpolated into the classification prompt.
_TYPE_DEFS = {
    "Element": (
        "a stock, agent, artifact, or institution that persists — a noun, "
        "something that exists independently (e.g. Capital Stock, Corn, Colony)"
    ),
    "Process": (
        "a flow, activity, or transformation with duration — something that "
        "happens rather than exists (e.g. Division of Labour, Credit Extension, Trade)"
    ),
    "Relation": (
        "a structural dependency or causal link between two entities — a connector "
        "or mechanism (e.g. Rent determined by Price; Wages bounded by Profit)"
    ),
    "Principle": (
        "an abstract law or invariant that holds across contexts — a rule or "
        "theoretical claim (e.g. Comparative Advantage, Diminishing Returns)"
    ),
    "Institution": (
        "a socially constructed rule system, norm, or governance structure "
        "(e.g. Banking System, Apprenticeship Law, Taille)"
    ),
}

# One-line gloss per VSM system, interpolated into the classification prompt.
_VSM_DEFS = {
    "S1": "Primary operations — productive activities (agricultural labour, manufacturing, carrying trade)",
    "S2": "Coordination — anti-oscillation, price signals (market price, natural price, wages)",
    "S3": "Management — resource allocation, operational control (capital allocation, taxation, banking)",
    "S3*": "Audit — inspection, compliance, integrity (customs enforcement, assay, coinage)",
    "S4": "Intelligence — adaptation, environment scanning (invisible hand, foreign trade analysis)",
    "S5": "Policy — identity, ultimate authority, purpose (political economy systems, public debt policy)",
}

# Filled with ``str.format``; any literal brace added here must be doubled.
_PROMPT_TEMPLATE = """\
You are classifying an entity from an infospace about "{topic}".

Your task: assign exactly one **Entity Type** and one **VSM System** to the entity, \
then give a one-sentence rationale for each choice.

## Entity: {title}

**Domain:** {domain}
**Source chapter:** {source_chapter}

### Definition

{definition}

### Context

{context}

---

## Entity Types — choose exactly one

- **Element** — {type_Element}
- **Process** — {type_Process}
- **Relation** — {type_Relation}
- **Principle** — {type_Principle}
- **Institution** — {type_Institution}

## VSM Systems — choose exactly one

- **S1** — {vsm_S1}
- **S2** — {vsm_S2}
- **S3** — {vsm_S3}
- **S3*** — {vsm_S3s}
- **S4** — {vsm_S4}
- **S5** — {vsm_S5}

---

## Instructions

1. Read the definition and context carefully.
2. Choose the **most appropriate** Entity Type. When uncertain between two, \
pick the type that best reflects the entity's primary role in the argument.
3. Choose the **most appropriate** VSM System. An entity may relate to multiple \
systems — assign the one where it does its primary work.
4. Write one sentence of rationale for each, grounded in the definition above.
5. Use **exactly** the output format below — no preamble, no extra lines.

## Output format

TYPE:
VSM:
TYPE_RATIONALE:
VSM_RATIONALE:
"""


# ── Prompt builder ────────────────────────────────────────────────────────────


def build_classification_prompt(entity: EntityMeta, topic: str) -> str:
    """Build a classification prompt for a single entity.

    Args:
        entity: Entity whose title, domain, source chapter, definition and
            context are interpolated. Falsy optional fields are replaced by
            a parenthesised placeholder.
        topic: Infospace topic name shown in the prompt's opening line.

    Returns:
        The fully formatted prompt text.
    """
    return _PROMPT_TEMPLATE.format(
        topic=topic,
        title=entity.title,
        domain=entity.domain or "(unspecified)",
        source_chapter=entity.source_chapter or "(unspecified)",
        definition=entity.definition or "(no definition provided)",
        context=entity.context or "(no context provided)",
        type_Element=_TYPE_DEFS["Element"],
        type_Process=_TYPE_DEFS["Process"],
        type_Relation=_TYPE_DEFS["Relation"],
        type_Principle=_TYPE_DEFS["Principle"],
        type_Institution=_TYPE_DEFS["Institution"],
        vsm_S1=_VSM_DEFS["S1"],
        vsm_S2=_VSM_DEFS["S2"],
        vsm_S3=_VSM_DEFS["S3"],
        # "S3*" is not a valid format-field name, hence the ``vsm_S3s`` alias.
        vsm_S3s=_VSM_DEFS["S3*"],
        vsm_S4=_VSM_DEFS["S4"],
        vsm_S5=_VSM_DEFS["S5"],
    )


# ── Response parser ───────────────────────────────────────────────────────────

# Response label (upper-cased) → key in the parsed classification dict.
_CLASSIFICATION_FIELDS = {
    "TYPE": "entity_type",
    "VSM": "vsm_system",
    "TYPE_RATIONALE": "type_rationale",
    "VSM_RATIONALE": "vsm_rationale",
}

# Response label (upper-cased) → key in the parsed relation-link dict.
_RELATION_LINK_FIELDS = {
    "SUBJECT": "links_subject",
    "SUBJECT_SLUG": "links_subject_slug",
    "OBJECT": "links_object",
    "OBJECT_SLUG": "links_object_slug",
    "MECHANISM": "links_mechanism",
}


def _collect_labelled_values(text: Optional[str], field_for_label: dict) -> dict:
    """Extract ``LABEL: value`` lines from *text*.

    A line matches when, after stripping surrounding whitespace, the text
    before its first colon equals one of the labels case-insensitively (no
    whitespace is allowed between the label and the colon). When a label
    occurs more than once the last occurrence wins.

    Args:
        text: Raw LLM response; ``None`` is treated as empty.
        field_for_label: Upper-cased label → output key.

    Returns:
        A dict holding only the keys whose label was found, each mapped to
        the stripped text after the colon.
    """
    found: dict = {}
    for line in (text or "").splitlines():
        head, colon, tail = line.strip().partition(":")
        if not colon:
            continue
        field = field_for_label.get(head.upper())
        if field is not None:
            found[field] = tail.strip()
    return found


def _canonicalise(raw: str, vocabulary) -> str:
    """Return the *vocabulary* term equal to *raw* ignoring case, else *raw*."""
    lowered = raw.lower()
    for term in vocabulary:
        if term.lower() == lowered:
            return term
    # Keep the raw value when unrecognised so the caller can see what came back.
    return raw


def parse_classification_response(text: str) -> dict:
    """Parse TYPE/VSM/TYPE_RATIONALE/VSM_RATIONALE from an LLM response.

    Args:
        text: Raw response content.

    Returns:
        A dict with keys ``entity_type``, ``vsm_system``, ``type_rationale``
        and ``vsm_rationale``. The first two are ``None`` and the rationales
        are empty strings when the corresponding label is absent. Type and
        VSM values are normalised to the spelling used in ``ENTITY_TYPES`` /
        ``VSM_SYSTEMS`` when they match case-insensitively; otherwise the
        raw value is kept.
    """
    result: dict = {
        "entity_type": None,
        "vsm_system": None,
        "type_rationale": "",
        "vsm_rationale": "",
    }
    result.update(_collect_labelled_values(text, _CLASSIFICATION_FIELDS))

    if result["entity_type"] is not None:
        result["entity_type"] = _canonicalise(result["entity_type"], ENTITY_TYPES)
    if result["vsm_system"] is not None:
        result["vsm_system"] = _canonicalise(result["vsm_system"], VSM_SYSTEMS)
    return result


# ── Batch runner ──────────────────────────────────────────────────────────────


def run_entity_classification(
    config: InfospaceConfig,
    entities: List[EntityMeta],
    adapter: LLMAdapter,
    run_config: Optional[RunConfig] = None,
    output_dir: Optional[Path] = None,
    progress_callback: Optional[Callable] = None,
    delay_seconds: float = 0.0,
) -> BatchSummary:
    """Run per-entity classification using the batch evaluator.

    Classification files are written **incrementally** after each successful
    result, so a long run is resumable and safe to interrupt.

    Args:
        config: The infospace configuration.
        entities: Entities to classify.
        adapter: LLM adapter.
        run_config: LLM execution configuration.
        output_dir: Where to write classification results. Defaults to
            ``Path(config.classifications_dir)``.
        progress_callback: Called after each item with (done, total, result).
        delay_seconds: Seconds to sleep between requests (for rate limiting).

    Returns:
        A :class:`BatchSummary` with per-entity results.
    """
    topic = config.topic.name
    cls_path = output_dir or Path(config.classifications_dir)
    classifier_name = run_config.model_name if run_config else "unknown"

    def _write_and_notify(done: int, total: int, result) -> None:
        if result.status == "success" and result.response is not None:
            parsed = parse_classification_response(result.response.content)
            classification = EntityClassification(
                entity_slug=result.key,
                # A missing or empty value is recorded as "Unknown".
                entity_type=parsed["entity_type"] or "Unknown",
                vsm_system=parsed["vsm_system"] or "Unknown",
                type_rationale=parsed["type_rationale"],
                vsm_rationale=parsed["vsm_rationale"],
                classified_by=classifier_name,
                # Naive UTC timestamp, i.e. the same value the deprecated
                # ``datetime.utcnow()`` produced.
                classified_at=datetime.now(timezone.utc).replace(tzinfo=None),
            )
            write_entity_classification(classification, cls_path / f"{result.key}.md")
        # The caller's callback is notified for every item, successful or not.
        if progress_callback is not None:
            progress_callback(done, total, result)

    items = [
        BatchItem(
            key=entity.slug,
            prompt=build_classification_prompt(entity, topic),
        )
        for entity in entities
    ]

    evaluator = BatchEvaluator(
        adapter=adapter,
        config=run_config,
        progress_callback=_write_and_notify,
        delay_seconds=delay_seconds,
    )
    return evaluator.evaluate(items)


# ── Relation-link prompt and runner ───────────────────────────────────────────

# Filled with ``str.format``; any literal brace added here must be doubled.
_RELATION_LINK_PROMPT_TEMPLATE = """\
You are enriching a Relation-type entity from an infospace about "{topic}".

This entity IS a structural connector — a dependency, mechanism, or causal link \
between two other entities. Your task: identify which two entities it connects \
and describe the linking mechanism in one sentence.

## Entity: {title}

**Domain:** {domain}

### Definition

{definition}

### Context

{context}

---

## Instructions

1. Read the definition and context carefully.
2. Identify **Entity A** (the subject/origin of the relation) and **Entity B** \
(the object/destination).
3. Write a single sentence explaining HOW this entity connects or mediates between A and B.
4. Use **exactly** the output format below — no preamble, no extra lines.
5. For slugs: use lowercase letters and underscores only (same as file names), \
e.g. "division_of_labour", "market_extent".

## Output format

SUBJECT:
SUBJECT_SLUG:
OBJECT:
OBJECT_SLUG:
MECHANISM:
"""


def build_relation_link_prompt(entity: EntityMeta, topic: str) -> str:
    """Build a relation-link enrichment prompt for a Relation-type entity.

    Args:
        entity: Entity whose title, domain, definition and context are
            interpolated. Falsy optional fields are replaced by a
            parenthesised placeholder.
        topic: Infospace topic name shown in the prompt's opening line.

    Returns:
        The fully formatted prompt text.
    """
    return _RELATION_LINK_PROMPT_TEMPLATE.format(
        topic=topic,
        title=entity.title,
        domain=entity.domain or "(unspecified)",
        definition=entity.definition or "(no definition provided)",
        context=entity.context or "(no context provided)",
    )


def parse_relation_link_response(text: str) -> dict:
    """Parse SUBJECT/SUBJECT_SLUG/OBJECT/OBJECT_SLUG/MECHANISM from an LLM response.

    Args:
        text: Raw response content.

    Returns:
        A dict with keys ``links_subject``, ``links_subject_slug``,
        ``links_object``, ``links_object_slug`` and ``links_mechanism``.
        Each value is the stripped text after its label, or an empty string
        when the label is absent.
    """
    result: dict = {
        "links_subject": "",
        "links_subject_slug": "",
        "links_object": "",
        "links_object_slug": "",
        "links_mechanism": "",
    }
    result.update(_collect_labelled_values(text, _RELATION_LINK_FIELDS))
    return result


def run_relation_link_capture(
    config: InfospaceConfig,
    relation_entities: List[EntityMeta],
    classifications: dict,  # slug → EntityClassification
    adapter: LLMAdapter,
    run_config: Optional[RunConfig] = None,
    output_dir: Optional[Path] = None,
    progress_callback: Optional[Callable] = None,
    delay_seconds: Optional[float] = None,
    skip_existing: bool = False,
) -> BatchSummary:
    """Capture relation endpoint data for Relation-type entities.

    Calls the LLM once per entity, copies the parsed subject/object/mechanism
    fields onto the matching pre-loaded classification, and rewrites that
    classification file after each successful result.

    This function does not read classification files itself; it relies on
    the *classifications* map. By default it also does not filter: every
    entity in *relation_entities* is sent to the LLM. Either pre-filter the
    list or pass ``skip_existing=True``.

    Args:
        config: The infospace configuration.
        relation_entities: EntityMeta objects for Relation-type entities only.
        classifications: Slug → EntityClassification map (pre-loaded). A
            successful result whose slug is missing from this map is not
            written anywhere.
        adapter: LLM adapter.
        run_config: LLM execution configuration.
        output_dir: Where classification files live (defaults to
            ``Path(config.classifications_dir)``).
        progress_callback: Called after each item with (done, total, result).
        delay_seconds: Seconds to sleep between requests (for rate
            limiting). ``None`` leaves the evaluator's own default in place.
        skip_existing: When true, entities whose classification already has
            a non-empty ``links_mechanism`` are left out of the batch.

    Returns:
        A :class:`BatchSummary` with per-entity results.
    """
    topic = config.topic.name
    cls_path = output_dir or Path(config.classifications_dir)

    def _write_and_notify(done: int, total: int, result) -> None:
        if result.status == "success" and result.response is not None:
            parsed = parse_relation_link_response(result.response.content)
            existing_cls = classifications.get(result.key)
            if existing_cls is not None:
                existing_cls.links_subject = parsed["links_subject"]
                existing_cls.links_subject_slug = parsed["links_subject_slug"]
                existing_cls.links_object = parsed["links_object"]
                existing_cls.links_object_slug = parsed["links_object_slug"]
                existing_cls.links_mechanism = parsed["links_mechanism"]
                write_entity_classification(
                    existing_cls, cls_path / f"{result.key}.md"
                )
        # The caller's callback is notified for every item, successful or not.
        if progress_callback is not None:
            progress_callback(done, total, result)

    pending = relation_entities
    if skip_existing:
        # Entities without a classification entry have no mechanism yet and
        # therefore stay in the batch.
        pending = [
            entity
            for entity in relation_entities
            if not getattr(classifications.get(entity.slug), "links_mechanism", "")
        ]

    items = [
        BatchItem(
            key=entity.slug,
            prompt=build_relation_link_prompt(entity, topic),
        )
        for entity in pending
    ]

    evaluator_kwargs = {
        "adapter": adapter,
        "config": run_config,
        "progress_callback": _write_and_notify,
    }
    # Only override the evaluator's delay when the caller asked for one.
    if delay_seconds is not None:
        evaluator_kwargs["delay_seconds"] = delay_seconds

    evaluator = BatchEvaluator(**evaluator_kwargs)
    return evaluator.evaluate(items)