feat(infospace): add L2 entity classification with type × VSM matrix (S2.9)

Implements the L2 typed-entities layer — each entity is assigned an Entity Type (Element, Process, Relation, Principle, Institution) and a VSM System (S1–S5, plus S3*) by an LLM, with a one-sentence rationale for each.

New modules:
- markitect/infospace/classification.py — EntityClassification dataclass + ENTITY_TYPES / VSM_SYSTEMS controlled vocabularies
- markitect/infospace/classification_io.py — write/read classification files (YAML frontmatter + markdown body, mirrors evaluation_io)
- markitect/infospace/classifier.py — build_classification_prompt(), parse_classification_response(), run_entity_classification(); batch runner writes files incrementally (same resumable pattern as evaluate)

CLI: markitect infospace classify [--entity SLUG] [--provider P] [--model M]
- Incremental skip: checks output/classifications/ for existing files
- Defaults to openrouter provider; 2000 max_tokens (Gemini 2.5 Flash uses ~787 thinking tokens, so 800 was too low)

CLI: markitect infospace classify-summary [--update-metrics]
- Entity type counts + VSM system counts with percentages
- 5 × 6 type × VSM matrix (spots structural blind spots at a glance)
- --update-metrics writes type_distribution, type_entropy, vsm_type_matrix_cells to metrics.yaml

Config: InfospaceConfig gains classifications_dir (default output/classifications)

Schema: schemas/typed-entity-schema-v1.0.md — type/VSM vocabulary tables, rationale format rules, validation rules, metrics enabled at L2

infospace.yaml: schemas.typed_entity references typed-entity-schema-v1.0.md

Seed classifications (3): division_of_labour (Process/S1), natural_price_as_central_price (Principle/S2), invisible_hand_mechanism (Principle/S4)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-23 09:35:58 +01:00
parent 2d45425b25
commit 81a4c8796a
10 changed files with 789 additions and 0 deletions
--- a/markitect/infospace/classifier.py
+++ b/markitect/infospace/classifier.py
@@ -0,0 +1,258 @@
+"""
+Per-entity classification pipeline for L2 typed entities.
+
+Builds a concise LLM prompt asking the model to assign an Entity Type and
+a VSM System to each entity, then parses the structured response.  Batch
+execution mirrors the evaluate.py pattern: incremental file writing makes
+long runs safe to interrupt and resume.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime
+from pathlib import Path
+from typing import Callable, List, Optional
+
+from markitect.infospace.classification import (
+    ENTITY_TYPES,
+    VSM_SYSTEMS,
+    EntityClassification,
+)
+from markitect.infospace.classification_io import write_entity_classification
+from markitect.infospace.config import InfospaceConfig
+from markitect.infospace.models import EntityMeta
+from markitect.prompts.execution.batch import BatchEvaluator, BatchItem, BatchSummary
+from markitect.prompts.execution.llm_adapter import LLMAdapter
+from markitect.prompts.execution.models import RunConfig
+
+
+# ── Type and VSM system descriptions ─────────────────────────────────────────
+
+# One-line description (with examples) for each entity type offered to the
+# model.  build_classification_prompt() looks every key up by its literal
+# name and interpolates the text into the "Entity Types" list of the prompt,
+# so renaming or removing a key here requires a matching change there.
+# NOTE(review): keys presumably mirror ENTITY_TYPES in classification.py —
+# confirm the two stay in sync.
+_TYPE_DEFS = {
+    "Element": (
+        "a stock, agent, artifact, or institution that persists — a noun, "
+        "something that exists independently (e.g. Capital Stock, Corn, Colony)"
+    ),
+    "Process": (
+        "a flow, activity, or transformation with duration — something that "
+        "happens rather than exists (e.g. Division of Labour, Credit Extension, Trade)"
+    ),
+    "Relation": (
+        "a structural dependency or causal link between two entities — a connector "
+        "or mechanism (e.g. Rent determined by Price; Wages bounded by Profit)"
+    ),
+    "Principle": (
+        "an abstract law or invariant that holds across contexts — a rule or "
+        "theoretical claim (e.g. Comparative Advantage, Diminishing Returns)"
+    ),
+    "Institution": (
+        "a socially constructed rule system, norm, or governance structure "
+        "(e.g. Banking System, Apprenticeship Law, Taille)"
+    ),
+}
+
+# One-line description (role plus domain examples) for each of the six VSM
+# system labels offered to the model.  build_classification_prompt() looks
+# every key up by its literal name and interpolates the text into the
+# "VSM Systems" list of the prompt.
+# NOTE(review): keys presumably mirror VSM_SYSTEMS in classification.py —
+# confirm the two stay in sync.
+_VSM_DEFS = {
+    "S1": "Primary operations — productive activities (agricultural labour, manufacturing, carrying trade)",
+    "S2": "Coordination — anti-oscillation, price signals (market price, natural price, wages)",
+    "S3": "Management — resource allocation, operational control (capital allocation, taxation, banking)",
+    "S3*": "Audit — inspection, compliance, integrity (customs enforcement, assay, coinage)",
+    "S4": "Intelligence — adaptation, environment scanning (invisible hand, foreign trade analysis)",
+    "S5": "Policy — identity, ultimate authority, purpose (political economy systems, public debt policy)",
+}
+
+# str.format() template for the per-entity classification prompt, filled in
+# by build_classification_prompt().
+#
+# Placeholders:
+#   {topic}, {title}, {domain}, {source_chapter}, {definition}, {context}
+#       — entity-specific values
+#   {type_Element} … {type_Institution}
+#       — descriptions taken from _TYPE_DEFS
+#   {vsm_S1} … {vsm_S5}, {vsm_S3s}
+#       — descriptions taken from _VSM_DEFS; "S3*" is exposed as vsm_S3s
+#         because "*" cannot appear in a keyword-argument name
+#
+# A trailing backslash inside the literal joins the next source line onto the
+# same prompt line (no newline is emitted at that point).
+#
+# The "Output format" section at the end defines the four labelled lines
+# (TYPE, VSM, TYPE_RATIONALE, VSM_RATIONALE) that
+# parse_classification_response() looks for.
+_PROMPT_TEMPLATE = """\
+You are classifying an entity from an infospace about "{topic}".
+
+Your task: assign exactly one **Entity Type** and one **VSM System** to the entity, \
+then give a one-sentence rationale for each choice.
+
+## Entity: {title}
+
+**Domain:** {domain}
+**Source chapter:** {source_chapter}
+
+### Definition
+
+{definition}
+
+### Context
+
+{context}
+
+---
+
+## Entity Types — choose exactly one
+
+- **Element** — {type_Element}
+- **Process** — {type_Process}
+- **Relation** — {type_Relation}
+- **Principle** — {type_Principle}
+- **Institution** — {type_Institution}
+
+## VSM Systems — choose exactly one
+
+- **S1** — {vsm_S1}
+- **S2** — {vsm_S2}
+- **S3** — {vsm_S3}
+- **S3*** — {vsm_S3s}
+- **S4** — {vsm_S4}
+- **S5** — {vsm_S5}
+
+---
+
+## Instructions
+
+1. Read the definition and context carefully.
+2. Choose the **most appropriate** Entity Type. When uncertain between two, \
+pick the type that best reflects the entity's primary role in the argument.
+3. Choose the **most appropriate** VSM System. An entity may relate to multiple \
+systems — assign the one where it does its primary work.
+4. Write one sentence of rationale for each, grounded in the definition above.
+5. Use **exactly** the output format below — no preamble, no extra lines.
+
+## Output format
+
+TYPE: <one of: Element, Process, Relation, Principle, Institution>
+VSM: <one of: S1, S2, S3, S3*, S4, S5>
+TYPE_RATIONALE: <one sentence explaining the type choice>
+VSM_RATIONALE: <one sentence grounding the VSM assignment in Beer's definitions>
+"""
+
+
+# ── Prompt builder ────────────────────────────────────────────────────────────
+
+
+def build_classification_prompt(entity: EntityMeta, topic: str) -> str:
+    """Render the classification prompt for one entity.
+
+    Args:
+        entity: Entity whose title, domain, source chapter, definition and
+            context are interpolated.  Falsy optional fields are replaced
+            by a parenthesised placeholder text.
+        topic: Name of the infospace topic shown in the opening line.
+
+    Returns:
+        The fully formatted prompt string.
+    """
+    fields = {
+        "topic": topic,
+        "title": entity.title,
+        "domain": entity.domain or "(unspecified)",
+        "source_chapter": entity.source_chapter or "(unspecified)",
+        "definition": entity.definition or "(no definition provided)",
+        "context": entity.context or "(no context provided)",
+    }
+
+    # Glossary text for the entity types: placeholder is type_<Name>.
+    for type_name in ("Element", "Process", "Relation", "Principle", "Institution"):
+        fields[f"type_{type_name}"] = _TYPE_DEFS[type_name]
+
+    # Glossary text for the VSM systems: placeholder is vsm_<System>, except
+    # that "S3*" is exposed under the placeholder name vsm_S3s.
+    vsm_placeholders = (
+        ("S1", "vsm_S1"),
+        ("S2", "vsm_S2"),
+        ("S3", "vsm_S3"),
+        ("S3*", "vsm_S3s"),
+        ("S4", "vsm_S4"),
+        ("S5", "vsm_S5"),
+    )
+    for system, placeholder in vsm_placeholders:
+        fields[placeholder] = _VSM_DEFS[system]
+
+    return _PROMPT_TEMPLATE.format(**fields)
+
+
+# ── Response parser ───────────────────────────────────────────────────────────
+
+
+# Labels recognised at the start of a response line.
+_FIELD_LABELS = ("TYPE_RATIONALE", "VSM_RATIONALE", "TYPE", "VSM")
+
+# Markdown decoration that models commonly put around a label: list bullets,
+# block quotes, headings and emphasis markers.
+_LINE_NOISE = "*_`#>- \t"
+
+# Characters used for markdown emphasis / inline code.
+_EMPHASIS = "*_`"
+
+# Punctuation a model may append to a vocabulary term.
+_TRAILING_PUNCT = ".,;:"
+
+
+def _split_field(line: str) -> Optional[tuple]:
+    """Split ``line`` into ``(LABEL, value)``, or return None if it is not a field line."""
+    body = line.strip().lstrip(_LINE_NOISE)
+    label, sep, value = body.partition(":")
+    if not sep:
+        return None
+    label = label.strip(_LINE_NOISE).upper()
+    if label not in _FIELD_LABELS:
+        return None
+    value = value.strip()
+    # "**TYPE:** Process" leaves the closing "**" at the front of the value.
+    # An opening marker is never followed by whitespace, so a leading token
+    # made only of emphasis characters is safe to drop.
+    parts = value.split(None, 1)
+    if parts and not parts[0].strip(_EMPHASIS):
+        value = parts[1].strip() if len(parts) > 1 else ""
+    return label, value
+
+
+def _strip_emphasis(value: str) -> str:
+    """Remove balanced markdown emphasis wrappers, e.g. ``**S3***`` -> ``S3*``."""
+    value = value.strip()
+    # Longest markers first so "**x**" is not treated as "*" + "*x*" + "*".
+    for marker in ("**", "__", "*", "_", "`"):
+        if (
+            len(value) > 2 * len(marker)
+            and value.startswith(marker)
+            and value.endswith(marker)
+        ):
+            value = value[len(marker):-len(marker)].strip()
+    return value
+
+
+def _match_vocabulary(raw: str, vocabulary) -> str:
+    """Map ``raw`` onto a controlled-vocabulary term, or return ``raw`` unchanged."""
+    # Exact text first, so a literal "S3*" is never mistaken for emphasis.
+    candidates = [raw, _strip_emphasis(raw)]
+    tokens = raw.split(None, 1)
+    if tokens:
+        # Handles answers that repeat the glossary label, e.g. "S3* — Audit".
+        first = _strip_emphasis(tokens[0].rstrip(_TRAILING_PUNCT))
+        candidates.append(first.rstrip(_TRAILING_PUNCT))
+    for candidate in candidates:
+        folded = candidate.lower()
+        for term in vocabulary:
+            if term.lower() == folded:
+                return term
+    return raw  # keep raw if unrecognised
+
+
+def parse_classification_response(text: str) -> dict:
+    """Parse TYPE/VSM/TYPE_RATIONALE/VSM_RATIONALE from an LLM response.
+
+    The response is scanned line by line; any line that does not start with
+    one of the four labels is ignored, so a preamble is harmless.  Labels are
+    matched case-insensitively and may carry markdown decoration
+    (``**TYPE:** Process``, ``- TYPE: Process``).  If a label occurs more
+    than once, the last occurrence wins.
+
+    TYPE and VSM values are matched case-insensitively against
+    ``ENTITY_TYPES`` / ``VSM_SYSTEMS``.  Emphasis wrappers and trailing
+    explanatory text (``S3* — Audit``) are tolerated; a value that still
+    does not match is kept verbatim.
+
+    Args:
+        text: Raw response text.  ``None`` or an empty string yields the
+            defaults below.
+
+    Returns:
+        A dict with keys ``entity_type``, ``vsm_system`` (``None`` if the
+        label was not found) and ``type_rationale``, ``vsm_rationale``
+        (empty string if the label was not found).
+    """
+    result: dict = {
+        "entity_type": None,
+        "vsm_system": None,
+        "type_rationale": "",
+        "vsm_rationale": "",
+    }
+
+    for line in (text or "").splitlines():
+        field = _split_field(line)
+        if field is None:
+            continue
+        label, value = field
+        if label == "TYPE_RATIONALE":
+            result["type_rationale"] = value
+        elif label == "VSM_RATIONALE":
+            result["vsm_rationale"] = value
+        elif label == "TYPE":
+            result["entity_type"] = _match_vocabulary(value, ENTITY_TYPES)
+        else:  # label == "VSM"
+            result["vsm_system"] = _match_vocabulary(value, VSM_SYSTEMS)
+
+    return result
+
+
+# ── Batch runner ──────────────────────────────────────────────────────────────
+
+
+def run_entity_classification(
+    config: InfospaceConfig,
+    entities: List[EntityMeta],
+    adapter: LLMAdapter,
+    run_config: Optional[RunConfig] = None,
+    output_dir: Optional[Path] = None,
+    progress_callback: Optional[Callable] = None,
+) -> BatchSummary:
+    """Run per-entity classification using the batch evaluator.
+
+    One prompt is built per entity and handed to :class:`BatchEvaluator`.
+    A classification file ``<slug>.md`` is written **incrementally** from the
+    evaluator's progress callback after each successful result, so a long
+    run is resumable and safe to interrupt.
+
+    If the response contains no TYPE or VSM line, the corresponding field is
+    stored as ``"Unknown"``.
+    NOTE(review): such a file is still written, so a resume step that skips
+    entities with an existing file would presumably not retry them — confirm
+    against the CLI's skip logic.
+
+    Args:
+        config: The infospace configuration.
+        entities: Entities to classify.
+        adapter: LLM adapter.
+        run_config: LLM execution configuration.  Its ``model_name`` is
+            recorded as ``classified_by``; ``"unknown"`` when omitted.
+        output_dir: Where to write classification results.  Defaults to
+            ``config.classifications_dir``.  The directory is created on
+            the first write if it does not exist.
+        progress_callback: Called after each item with (done, total, result),
+            after any file for that item has been written.
+
+    Returns:
+        A :class:`BatchSummary` with per-entity results.
+    """
+    # Local import keeps this block self-contained; the module-level import
+    # only brings in ``datetime``.
+    from datetime import timezone
+
+    topic = config.topic.name
+    # Path() also accepts an output_dir given as a plain string.
+    cls_path = Path(output_dir or config.classifications_dir)
+    classifier_name = run_config.model_name if run_config else "unknown"
+
+    def _write_and_notify(done: int, total: int, result) -> None:
+        if result.status == "success" and result.response is not None:
+            parsed = parse_classification_response(result.response.content)
+            classification = EntityClassification(
+                entity_slug=result.key,
+                entity_type=parsed["entity_type"] or "Unknown",
+                vsm_system=parsed["vsm_system"] or "Unknown",
+                type_rationale=parsed["type_rationale"],
+                vsm_rationale=parsed["vsm_rationale"],
+                classified_by=classifier_name,
+                # Naive UTC timestamp, identical in value to the deprecated
+                # datetime.utcnow().
+                classified_at=datetime.now(timezone.utc).replace(tzinfo=None),
+            )
+            dest = cls_path / f"{result.key}.md"
+            # Harmless if the writer already creates the directory itself.
+            dest.parent.mkdir(parents=True, exist_ok=True)
+            write_entity_classification(classification, dest)
+
+        if progress_callback is not None:
+            progress_callback(done, total, result)
+
+    items = [
+        BatchItem(
+            key=entity.slug,
+            prompt=build_classification_prompt(entity, topic),
+        )
+        for entity in entities
+    ]
+
+    evaluator = BatchEvaluator(
+        adapter=adapter,
+        config=run_config,
+        progress_callback=_write_and_notify,
+    )
+    return evaluator.evaluate(items)