# markitect-main/markitect/infospace/classifier.py

"""
Per-entity classification pipeline for L2 typed entities.

Builds a concise LLM prompt asking the model to assign an Entity Type and
a VSM System to each entity, then parses the structured response.  Batch
execution mirrors the evaluate.py pattern: incremental file writing makes
long runs safe to interrupt and resume.
"""

from __future__ import annotations

from datetime import datetime
from pathlib import Path
from typing import Callable, List, Optional

from markitect.infospace.classification import (
    ENTITY_TYPES,
    VSM_SYSTEMS,
    EntityClassification,
)
from markitect.infospace.classification_io import write_entity_classification
from markitect.infospace.config import InfospaceConfig
from markitect.infospace.models import EntityMeta
from markitect.prompts.execution.batch import BatchEvaluator, BatchItem, BatchSummary
from markitect.prompts.execution.llm_adapter import LLMAdapter
from markitect.prompts.execution.models import RunConfig


# ── Type and VSM system descriptions ─────────────────────────────────────────

# Definition text for each entity type, keyed by the type's name.
# build_classification_prompt() looks up exactly these five keys and
# substitutes each text into the matching ``{type_<Name>}`` placeholder of
# _PROMPT_TEMPLATE, so the wording here is sent to the model verbatim.
# NOTE(review): "Element" lists "institution" among the things it covers while
# "Institution" is also a type of its own — possible ambiguity for the model;
# confirm the overlap is intended.
_TYPE_DEFS = {
    "Element": (
        "a stock, agent, artifact, or institution that persists — a noun, "
        "something that exists independently (e.g. Capital Stock, Corn, Colony)"
    ),
    "Process": (
        "a flow, activity, or transformation with duration — something that "
        "happens rather than exists (e.g. Division of Labour, Credit Extension, Trade)"
    ),
    "Relation": (
        "a structural dependency or causal link between two entities — a connector "
        "or mechanism (e.g. Rent determined by Price; Wages bounded by Profit)"
    ),
    "Principle": (
        "an abstract law or invariant that holds across contexts — a rule or "
        "theoretical claim (e.g. Comparative Advantage, Diminishing Returns)"
    ),
    "Institution": (
        "a socially constructed rule system, norm, or governance structure "
        "(e.g. Banking System, Apprenticeship Law, Taille)"
    ),
}

# One-line description of each VSM system, keyed by the system label.
# build_classification_prompt() substitutes these into the ``{vsm_*}``
# placeholders of _PROMPT_TEMPLATE; the "S3*" entry feeds the placeholder
# named ``vsm_S3s``.
# NOTE(review): the parenthesised examples all come from political economy,
# whereas the prompt's topic is a runtime parameter — confirm these examples
# are meant to be used for every infospace.
_VSM_DEFS = {
    "S1": "Primary operations — productive activities (agricultural labour, manufacturing, carrying trade)",
    "S2": "Coordination — anti-oscillation, price signals (market price, natural price, wages)",
    "S3": "Management — resource allocation, operational control (capital allocation, taxation, banking)",
    "S3*": "Audit — inspection, compliance, integrity (customs enforcement, assay, coinage)",
    "S4": "Intelligence — adaptation, environment scanning (invisible hand, foreign trade analysis)",
    "S5": "Policy — identity, ultimate authority, purpose (political economy systems, public debt policy)",
}

# ``str.format`` template for the per-entity classification prompt, filled in
# by build_classification_prompt().
#
# Placeholders:
#   {topic}, {title}, {domain}, {source_chapter}, {definition}, {context}
#       — entity and infospace details;
#   {type_Element} … {type_Institution}
#       — the entity-type definitions from _TYPE_DEFS;
#   {vsm_S1} … {vsm_S5} and {vsm_S3s}
#       — the VSM descriptions from _VSM_DEFS ({vsm_S3s} carries "S3*").
#
# A backslash at the end of a line inside the literal joins it with the next
# line, so those sentences render as a single line in the prompt.
# The "Output format" section lists the four labels that
# parse_classification_response() looks for: TYPE, VSM, TYPE_RATIONALE and
# VSM_RATIONALE.
_PROMPT_TEMPLATE = """\
You are classifying an entity from an infospace about "{topic}".

Your task: assign exactly one **Entity Type** and one **VSM System** to the entity, \
then give a one-sentence rationale for each choice.

## Entity: {title}

**Domain:** {domain}
**Source chapter:** {source_chapter}

### Definition

{definition}

### Context

{context}

---

## Entity Types — choose exactly one

- **Element** — {type_Element}
- **Process** — {type_Process}
- **Relation** — {type_Relation}
- **Principle** — {type_Principle}
- **Institution** — {type_Institution}

## VSM Systems — choose exactly one

- **S1** — {vsm_S1}
- **S2** — {vsm_S2}
- **S3** — {vsm_S3}
- **S3*** — {vsm_S3s}
- **S4** — {vsm_S4}
- **S5** — {vsm_S5}

---

## Instructions

1. Read the definition and context carefully.
2. Choose the **most appropriate** Entity Type. When uncertain between two, \
pick the type that best reflects the entity's primary role in the argument.
3. Choose the **most appropriate** VSM System. An entity may relate to multiple \
systems — assign the one where it does its primary work.
4. Write one sentence of rationale for each, grounded in the definition above.
5. Use **exactly** the output format below — no preamble, no extra lines.

## Output format

TYPE: <one of: Element, Process, Relation, Principle, Institution>
VSM: <one of: S1, S2, S3, S3*, S4, S5>
TYPE_RATIONALE: <one sentence explaining the type choice>
VSM_RATIONALE: <one sentence grounding the VSM assignment in Beer's definitions>
"""


# ── Prompt builder ────────────────────────────────────────────────────────────


def build_classification_prompt(entity: EntityMeta, topic: str) -> str:
    """Render the classification prompt for one entity.

    Args:
        entity: The entity to classify.  Its ``title`` is used as-is; a falsy
            ``domain``, ``source_chapter``, ``definition`` or ``context`` is
            replaced by a parenthesised placeholder text.
        topic: Name of the infospace topic quoted in the prompt's first line.

    Returns:
        ``_PROMPT_TEMPLATE`` with every placeholder substituted.
    """
    substitutions = {
        "topic": topic,
        "title": entity.title,
        "domain": entity.domain or "(unspecified)",
        "source_chapter": entity.source_chapter or "(unspecified)",
        "definition": entity.definition or "(no definition provided)",
        "context": entity.context or "(no context provided)",
    }

    # Entity-type definitions map one-to-one onto ``type_<Name>`` placeholders.
    for type_name in ("Element", "Process", "Relation", "Principle", "Institution"):
        substitutions[f"type_{type_name}"] = _TYPE_DEFS[type_name]

    # VSM descriptions: the template's placeholder for "S3*" is ``vsm_S3s``.
    vsm_placeholders = (
        ("S1", "vsm_S1"),
        ("S2", "vsm_S2"),
        ("S3", "vsm_S3"),
        ("S3*", "vsm_S3s"),
        ("S4", "vsm_S4"),
        ("S5", "vsm_S5"),
    )
    for system, placeholder in vsm_placeholders:
        substitutions[placeholder] = _VSM_DEFS[system]

    return _PROMPT_TEMPLATE.format(**substitutions)


# ── Response parser ───────────────────────────────────────────────────────────


def parse_classification_response(text: str) -> dict:
    """Parse TYPE/VSM/TYPE_RATIONALE/VSM_RATIONALE from an LLM response.

    Returns a dict with keys: entity_type, vsm_system, type_rationale,
    vsm_rationale.  Values are None / empty string if not found.
    """
    result: dict = {
        "entity_type": None,
        "vsm_system": None,
        "type_rationale": "",
        "vsm_rationale": "",
    }

    for line in text.splitlines():
        stripped = line.strip()
        upper = stripped.upper()

        if upper.startswith("TYPE_RATIONALE:"):
            result["type_rationale"] = stripped.split(":", 1)[1].strip()
        elif upper.startswith("VSM_RATIONALE:"):
            result["vsm_rationale"] = stripped.split(":", 1)[1].strip()
        elif upper.startswith("TYPE:"):
            raw = stripped.split(":", 1)[1].strip()
            # Case-insensitive match against controlled vocabulary
            for t in ENTITY_TYPES:
                if t.lower() == raw.lower():
                    result["entity_type"] = t
                    break
            else:
                result["entity_type"] = raw  # keep raw if unrecognised
        elif upper.startswith("VSM:"):
            raw = stripped.split(":", 1)[1].strip()
            for v in VSM_SYSTEMS:
                if v.lower() == raw.lower():
                    result["vsm_system"] = v
                    break
            else:
                result["vsm_system"] = raw

    return result


# ── Batch runner ──────────────────────────────────────────────────────────────


def run_entity_classification(
    config: InfospaceConfig,
    entities: List[EntityMeta],
    adapter: LLMAdapter,
    run_config: Optional[RunConfig] = None,
    output_dir: Optional[Path] = None,
    progress_callback: Optional[Callable] = None,
    delay_seconds: float = 0.0,
) -> BatchSummary:
    """Run per-entity classification using the batch evaluator.

    One prompt is built per entity and handed to a :class:`BatchEvaluator`.
    Each successful response is parsed and written to
    ``<output_dir>/<slug>.md`` from within the evaluator's progress
    callback, i.e. **incrementally**, so results produced before an
    interruption are already on disk.  This function does not itself skip
    entities that already have a classification file; a caller that wants
    to resume must pass only the remaining entities.

    Args:
        config: The infospace configuration; supplies the topic name and the
            default output directory.
        entities: Entities to classify.
        adapter: LLM adapter.
        run_config: LLM execution configuration.  Its ``model_name`` is
            recorded as ``classified_by``; ``"unknown"`` is recorded when no
            run configuration is given.
        output_dir: Where to write classification results.  Defaults to
            ``Path(config.classifications_dir)``.
        progress_callback: Called after each item with (done, total, result),
            after any classification file for that item has been written.
        delay_seconds: Passed through to the batch evaluator (intended for
            rate limiting between requests).

    Returns:
        A :class:`BatchSummary` with per-entity results.
    """
    # Imported locally because the module header only imports ``datetime``.
    from datetime import timezone

    topic = config.topic.name
    cls_path = output_dir or Path(config.classifications_dir)
    classifier_name = run_config.model_name if run_config else "unknown"

    def _write_and_notify(done: int, total: int, result) -> None:
        """Persist a successful result, then forward the progress event."""
        if result.status == "success" and result.response is not None:
            parsed = parse_classification_response(result.response.content)
            classification = EntityClassification(
                entity_slug=result.key,
                # A missing or empty parsed value is recorded as "Unknown".
                entity_type=parsed["entity_type"] or "Unknown",
                vsm_system=parsed["vsm_system"] or "Unknown",
                type_rationale=parsed["type_rationale"],
                vsm_rationale=parsed["vsm_rationale"],
                classified_by=classifier_name,
                # ``datetime.utcnow()`` is deprecated since Python 3.12.  The
                # timestamp is kept naive (tzinfo dropped) so the stored
                # value has the same form as before.
                classified_at=datetime.now(timezone.utc).replace(tzinfo=None),
            )
            write_entity_classification(classification, cls_path / f"{result.key}.md")

        if progress_callback is not None:
            progress_callback(done, total, result)

    items = [
        BatchItem(
            key=entity.slug,
            prompt=build_classification_prompt(entity, topic),
        )
        for entity in entities
    ]

    evaluator = BatchEvaluator(
        adapter=adapter,
        config=run_config,
        progress_callback=_write_and_notify,
        delay_seconds=delay_seconds,
    )
    return evaluator.evaluate(items)


# ── Relation-link prompt and runner ───────────────────────────────────────────

# ``str.format`` template for the relation-link enrichment prompt, filled in
# by build_relation_link_prompt().
#
# Placeholders: {topic}, {title}, {domain}, {definition}, {context}.
#
# A backslash at the end of a line inside the literal joins it with the next
# line, so continuation lines must start at column 0 — any indentation would
# end up as a run of spaces in the middle of the rendered sentence.
# The "Output format" section lists the five labels that
# parse_relation_link_response() looks for: SUBJECT, SUBJECT_SLUG, OBJECT,
# OBJECT_SLUG and MECHANISM.
_RELATION_LINK_PROMPT_TEMPLATE = """\
You are enriching a Relation-type entity from an infospace about "{topic}".

This entity IS a structural connector — a dependency, mechanism, or causal link \
between two other entities. Your task: identify which two entities it connects \
and describe the linking mechanism in one sentence.

## Entity: {title}

**Domain:** {domain}

### Definition

{definition}

### Context

{context}

---

## Instructions

1. Read the definition and context carefully.
2. Identify **Entity A** (the subject/origin of the relation) and **Entity B** \
(the object/destination).
3. Write a single sentence explaining HOW this entity connects or mediates between A and B.
4. Use **exactly** the output format below — no preamble, no extra lines.
5. For slugs: use lowercase letters and underscores only (same as file names), \
e.g. "division_of_labour", "market_extent".

## Output format

SUBJECT: <human-readable title of Entity A>
SUBJECT_SLUG: <slug of Entity A>
OBJECT: <human-readable title of Entity B>
OBJECT_SLUG: <slug of Entity B>
MECHANISM: <one sentence describing how this entity links A to B>
"""


def build_relation_link_prompt(entity: EntityMeta, topic: str) -> str:
    """Render the relation-link enrichment prompt for one entity.

    Args:
        entity: The Relation-type entity to enrich.  Its ``title`` is used
            as-is; a falsy ``domain``, ``definition`` or ``context`` is
            replaced by a parenthesised placeholder text.
        topic: Name of the infospace topic quoted in the prompt's first line.

    Returns:
        ``_RELATION_LINK_PROMPT_TEMPLATE`` with every placeholder substituted.
    """
    substitutions = {
        "topic": topic,
        "title": entity.title,
        "domain": entity.domain or "(unspecified)",
        "definition": entity.definition or "(no definition provided)",
        "context": entity.context or "(no context provided)",
    }
    return _RELATION_LINK_PROMPT_TEMPLATE.format(**substitutions)


def parse_relation_link_response(text: str) -> dict:
    """Parse SUBJECT/SUBJECT_SLUG/OBJECT/OBJECT_SLUG/MECHANISM from an LLM response.

    The response is scanned line by line for ``LABEL: value`` pairs.  Labels
    are matched case-insensitively and may carry Markdown decoration
    (``**SUBJECT:** Rent``, ``- SUBJECT: Rent``), because models frequently
    add it despite the prompt's format instructions.  Lines with any other
    label are ignored.  When a label occurs more than once, the last
    occurrence wins.

    Slug values additionally have surrounding quotes and backticks removed:
    the prompt shows its slug examples in double quotes, and a model that
    echoes them would otherwise produce slugs that contain the quote
    characters.

    Args:
        text: Raw text returned by the model.

    Returns:
        A dict with the keys ``links_subject``, ``links_subject_slug``,
        ``links_object``, ``links_object_slug`` and ``links_mechanism``.
        A field whose line is missing is ``""``.
    """
    field_for_label = {
        "SUBJECT": "links_subject",
        "SUBJECT_SLUG": "links_subject_slug",
        "OBJECT": "links_object",
        "OBJECT_SLUG": "links_object_slug",
        "MECHANISM": "links_mechanism",
    }
    result: dict = dict.fromkeys(field_for_label.values(), "")

    # Characters that Markdown list/quote/emphasis/heading syntax may wrap
    # around a label.
    decoration = " \t*#>-`"

    for line in text.splitlines():
        raw_label, colon, raw_value = line.partition(":")
        if not colon:
            continue

        field = field_for_label.get(raw_label.strip().strip(decoration).upper())
        if field is None:
            continue

        value = raw_value.strip()
        if "*" in raw_label:
            # "**SUBJECT:** Rent" leaves the closing emphasis marker in front
            # of the value.
            value = value.lstrip("*").strip()
        if field.endswith("_slug"):
            value = value.strip("\"'`").strip()

        result[field] = value

    return result


def run_relation_link_capture(
    config: InfospaceConfig,
    relation_entities: List[EntityMeta],
    classifications: dict,  # slug → EntityClassification
    adapter: LLMAdapter,
    run_config: Optional[RunConfig] = None,
    output_dir: Optional[Path] = None,
    progress_callback: Optional[Callable] = None,
    skip_existing: bool = True,
) -> BatchSummary:
    """Capture relation endpoint data for Relation-type entities.

    For each entity an LLM prompt asks which two entities the relation
    connects and by what mechanism.  On a successful response the parsed
    fields are copied onto that entity's object in ``classifications`` (the
    object is mutated in place) and the classification file
    ``<output_dir>/<slug>.md`` is rewritten.

    Entities whose classification already has a non-empty
    ``links_mechanism`` are skipped, so re-running after an interruption
    neither repeats LLM calls nor overwrites link data captured earlier.
    Pass ``skip_existing=False`` to re-capture every entity.

    Entities that have no entry in ``classifications`` are still sent to the
    LLM and appear in the summary, but their response is not stored.

    Args:
        config: The infospace configuration; supplies the topic name and the
            default classification directory.
        relation_entities: EntityMeta objects for Relation-type entities only.
        classifications: Slug → EntityClassification map (pre-loaded).
        adapter: LLM adapter.
        run_config: LLM execution configuration.
        output_dir: Where classification files live (defaults to
            ``Path(config.classifications_dir)``).
        progress_callback: Called after each item with (done, total, result),
            after any classification file for that item has been written.
        skip_existing: When true (the default), entities that already have
            ``links_mechanism`` set are not sent to the LLM.

    Returns:
        A :class:`BatchSummary` with per-entity results for the entities that
        were actually sent to the LLM.
    """
    topic = config.topic.name
    cls_path = output_dir or Path(config.classifications_dir)

    def _already_linked(slug) -> bool:
        """Tell whether the classification for *slug* already has a mechanism."""
        existing = classifications.get(slug)
        return existing is not None and bool(getattr(existing, "links_mechanism", ""))

    def _write_and_notify(done: int, total: int, result) -> None:
        """Store a successful result on its classification, then notify."""
        if result.status == "success" and result.response is not None:
            parsed = parse_relation_link_response(result.response.content)
            existing_cls = classifications.get(result.key)
            if existing_cls is not None:
                existing_cls.links_subject = parsed["links_subject"]
                existing_cls.links_subject_slug = parsed["links_subject_slug"]
                existing_cls.links_object = parsed["links_object"]
                existing_cls.links_object_slug = parsed["links_object_slug"]
                existing_cls.links_mechanism = parsed["links_mechanism"]
                write_entity_classification(existing_cls, cls_path / f"{result.key}.md")

        if progress_callback is not None:
            progress_callback(done, total, result)

    items = [
        BatchItem(
            key=entity.slug,
            prompt=build_relation_link_prompt(entity, topic),
        )
        for entity in relation_entities
        if not (skip_existing and _already_linked(entity.slug))
    ]

    evaluator = BatchEvaluator(
        adapter=adapter,
        config=run_config,
        progress_callback=_write_and_notify,
    )
    return evaluator.evaluate(items)