Batch classification via OpenRouter (claude-sonnet-4). 165 entities
remain unclassified due to credit exhaustion; incremental skip means
a follow-up run will complete them automatically.
Type × VSM matrix (823 entities):
              S1   S2   S3  S3*   S4   S5
Element       86   75   58   21   43   32   (315 total, 38%)
Process       39   42   37   17   67   24   (226 total, 28%)
Institution    4   12   30   24    .   52   (122 total, 15%)
Principle      3    7   15    2   43   32   (102 total, 12%)
Relation       2   14    5    5   22   10   (58 total, 7%)
Matrix fill: 29/30 cells (Institution/S4 empty — expected)
Metrics updated: type_entropy=2.0936, vsm_type_matrix_cells=29
Also:
- BatchEvaluator gains delay_seconds param for rate-limited providers
- classify CLI gains --rpm option (--rpm 10 for Gemini free tier)
- history.write_metrics_file now handles non-float metric values
  (type_distribution is a dict, which previously crashed the round() call)
- run_entity_classification forwards delay_seconds to BatchEvaluator
- classify-links and graph commands added by user (entities --by-type,
graph --format mermaid/dot, classify-links for Relation enrichment)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
403 lines
15 KiB
Python
403 lines
15 KiB
Python
"""
|
|
Per-entity classification pipeline for L2 typed entities.
|
|
|
|
Builds a concise LLM prompt asking the model to assign an Entity Type and
|
|
a VSM System to each entity, then parses the structured response. Batch
|
|
execution mirrors the evaluate.py pattern: incremental file writing makes
|
|
long runs safe to interrupt and resume.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Callable, List, Optional
|
|
|
|
from markitect.infospace.classification import (
|
|
ENTITY_TYPES,
|
|
VSM_SYSTEMS,
|
|
EntityClassification,
|
|
)
|
|
from markitect.infospace.classification_io import write_entity_classification
|
|
from markitect.infospace.config import InfospaceConfig
|
|
from markitect.infospace.models import EntityMeta
|
|
from markitect.prompts.execution.batch import BatchEvaluator, BatchItem, BatchSummary
|
|
from markitect.prompts.execution.llm_adapter import LLMAdapter
|
|
from markitect.prompts.execution.models import RunConfig
|
|
|
|
|
|
# ── Type and VSM system descriptions ─────────────────────────────────────────
|
|
|
|
_TYPE_DEFS = {
|
|
"Element": (
|
|
"a stock, agent, artifact, or institution that persists — a noun, "
|
|
"something that exists independently (e.g. Capital Stock, Corn, Colony)"
|
|
),
|
|
"Process": (
|
|
"a flow, activity, or transformation with duration — something that "
|
|
"happens rather than exists (e.g. Division of Labour, Credit Extension, Trade)"
|
|
),
|
|
"Relation": (
|
|
"a structural dependency or causal link between two entities — a connector "
|
|
"or mechanism (e.g. Rent determined by Price; Wages bounded by Profit)"
|
|
),
|
|
"Principle": (
|
|
"an abstract law or invariant that holds across contexts — a rule or "
|
|
"theoretical claim (e.g. Comparative Advantage, Diminishing Returns)"
|
|
),
|
|
"Institution": (
|
|
"a socially constructed rule system, norm, or governance structure "
|
|
"(e.g. Banking System, Apprenticeship Law, Taille)"
|
|
),
|
|
}
|
|
|
|
_VSM_DEFS = {
|
|
"S1": "Primary operations — productive activities (agricultural labour, manufacturing, carrying trade)",
|
|
"S2": "Coordination — anti-oscillation, price signals (market price, natural price, wages)",
|
|
"S3": "Management — resource allocation, operational control (capital allocation, taxation, banking)",
|
|
"S3*": "Audit — inspection, compliance, integrity (customs enforcement, assay, coinage)",
|
|
"S4": "Intelligence — adaptation, environment scanning (invisible hand, foreign trade analysis)",
|
|
"S5": "Policy — identity, ultimate authority, purpose (political economy systems, public debt policy)",
|
|
}
|
|
|
|
_PROMPT_TEMPLATE = """\
|
|
You are classifying an entity from an infospace about "{topic}".
|
|
|
|
Your task: assign exactly one **Entity Type** and one **VSM System** to the entity, \
|
|
then give a one-sentence rationale for each choice.
|
|
|
|
## Entity: {title}
|
|
|
|
**Domain:** {domain}
|
|
**Source chapter:** {source_chapter}
|
|
|
|
### Definition
|
|
|
|
{definition}
|
|
|
|
### Context
|
|
|
|
{context}
|
|
|
|
---
|
|
|
|
## Entity Types — choose exactly one
|
|
|
|
- **Element** — {type_Element}
|
|
- **Process** — {type_Process}
|
|
- **Relation** — {type_Relation}
|
|
- **Principle** — {type_Principle}
|
|
- **Institution** — {type_Institution}
|
|
|
|
## VSM Systems — choose exactly one
|
|
|
|
- **S1** — {vsm_S1}
|
|
- **S2** — {vsm_S2}
|
|
- **S3** — {vsm_S3}
|
|
- **S3*** — {vsm_S3s}
|
|
- **S4** — {vsm_S4}
|
|
- **S5** — {vsm_S5}
|
|
|
|
---
|
|
|
|
## Instructions
|
|
|
|
1. Read the definition and context carefully.
|
|
2. Choose the **most appropriate** Entity Type. When uncertain between two, \
|
|
pick the type that best reflects the entity's primary role in the argument.
|
|
3. Choose the **most appropriate** VSM System. An entity may relate to multiple \
|
|
systems — assign the one where it does its primary work.
|
|
4. Write one sentence of rationale for each, grounded in the definition above.
|
|
5. Use **exactly** the output format below — no preamble, no extra lines.
|
|
|
|
## Output format
|
|
|
|
TYPE: <one of: Element, Process, Relation, Principle, Institution>
|
|
VSM: <one of: S1, S2, S3, S3*, S4, S5>
|
|
TYPE_RATIONALE: <one sentence explaining the type choice>
|
|
VSM_RATIONALE: <one sentence grounding the VSM assignment in Beer's definitions>
|
|
"""
|
|
|
|
|
|
# ── Prompt builder ────────────────────────────────────────────────────────────
|
|
|
|
|
|
def build_classification_prompt(entity: EntityMeta, topic: str) -> str:
    """Render the Type/VSM classification prompt for one entity.

    Fills ``_PROMPT_TEMPLATE`` with the topic, the entity's title, domain,
    source chapter, definition and context, and the type / VSM descriptions
    from ``_TYPE_DEFS`` and ``_VSM_DEFS``. A falsy domain, source chapter,
    definition or context is replaced by a parenthesised placeholder.
    """
    values = {
        "topic": topic,
        "title": entity.title,
        "domain": entity.domain or "(unspecified)",
        "source_chapter": entity.source_chapter or "(unspecified)",
        "definition": entity.definition or "(no definition provided)",
        "context": entity.context or "(no context provided)",
    }

    for type_name in ("Element", "Process", "Relation", "Principle", "Institution"):
        values[f"type_{type_name}"] = _TYPE_DEFS[type_name]

    # "S3*" is not a valid format-field name, so its placeholder is "vsm_S3s".
    for placeholder, system in (
        ("vsm_S1", "S1"),
        ("vsm_S2", "S2"),
        ("vsm_S3", "S3"),
        ("vsm_S3s", "S3*"),
        ("vsm_S4", "S4"),
        ("vsm_S5", "S5"),
    ):
        values[placeholder] = _VSM_DEFS[system]

    return _PROMPT_TEMPLATE.format(**values)
|
|
|
|
|
|
# ── Response parser ───────────────────────────────────────────────────────────
|
|
|
|
|
|
def parse_classification_response(text: str) -> dict:
    """Extract the four labelled fields from a classification response.

    Each line is stripped and matched case-insensitively against the labels
    ``TYPE:``, ``VSM:``, ``TYPE_RATIONALE:`` and ``VSM_RATIONALE:``; the value
    is the stripped text after the first colon. Lines with any other prefix
    are ignored, and a label that occurs more than once keeps its last value.

    ``TYPE`` / ``VSM`` values are compared case-insensitively with
    ``ENTITY_TYPES`` / ``VSM_SYSTEMS``; a match is replaced by the vocabulary's
    own spelling, anything else is kept exactly as the model wrote it.

    Returns:
        A dict with keys ``entity_type`` and ``vsm_system`` (``None`` when the
        label never appeared) and ``type_rationale`` and ``vsm_rationale``
        (``""`` when the label never appeared).
    """

    def _canonical(value: str, vocabulary) -> str:
        # First vocabulary term equal to *value* ignoring case, else *value*.
        folded = value.lower()
        return next(
            (term for term in vocabulary if term.lower() == folded),
            value,
        )

    parsed: dict = {
        "entity_type": None,
        "vsm_system": None,
        "type_rationale": "",
        "vsm_rationale": "",
    }

    for raw_line in text.splitlines():
        line = raw_line.strip()
        label = line.upper()

        if label.startswith("TYPE_RATIONALE:"):
            parsed["type_rationale"] = line.partition(":")[2].strip()
            continue
        if label.startswith("VSM_RATIONALE:"):
            parsed["vsm_rationale"] = line.partition(":")[2].strip()
            continue
        if label.startswith("TYPE:"):
            parsed["entity_type"] = _canonical(
                line.partition(":")[2].strip(), ENTITY_TYPES
            )
            continue
        if label.startswith("VSM:"):
            parsed["vsm_system"] = _canonical(
                line.partition(":")[2].strip(), VSM_SYSTEMS
            )

    return parsed
|
|
|
|
|
|
# ── Batch runner ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
def run_entity_classification(
    config: InfospaceConfig,
    entities: List[EntityMeta],
    adapter: LLMAdapter,
    run_config: Optional[RunConfig] = None,
    output_dir: Optional[Path] = None,
    progress_callback: Optional[Callable] = None,
    delay_seconds: float = 0.0,
) -> BatchSummary:
    """Run per-entity classification using the batch evaluator.

    One prompt is built per entity and handed to a :class:`BatchEvaluator`.
    The evaluator's progress callback is used to write each successful
    result to ``<output_dir>/<slug>.md`` as soon as it arrives, so files are
    produced incrementally rather than at the end of the run.

    Args:
        config: The infospace configuration; ``config.topic.name`` is used as
            the prompt topic.
        entities: Entities to classify.
        adapter: LLM adapter.
        run_config: LLM execution configuration. Its ``model_name`` is
            recorded as ``classified_by``; ``"unknown"`` is recorded when no
            run configuration is given.
        output_dir: Where to write classification results. Defaults to
            ``Path(config.classifications_dir)``.
        progress_callback: Called after each item with (done, total, result),
            after any classification file for that item has been written.
        delay_seconds: Forwarded to :class:`BatchEvaluator` (seconds to sleep
            between requests, for rate limiting).

    Returns:
        The :class:`BatchSummary` returned by ``BatchEvaluator.evaluate``.
    """
    # Imported locally because the module only imports ``datetime`` itself.
    from datetime import timezone

    topic = config.topic.name
    cls_path = output_dir or Path(config.classifications_dir)
    classifier_name = run_config.model_name if run_config else "unknown"

    def _write_and_notify(done: int, total: int, result) -> None:
        # Persist first, then notify, so the file already exists when the
        # caller's callback runs.
        if result.status == "success" and result.response is not None:
            parsed = parse_classification_response(result.response.content)
            classification = EntityClassification(
                entity_slug=result.key,
                # A missing or empty TYPE/VSM line is recorded as "Unknown"
                # instead of dropping the result.
                entity_type=parsed["entity_type"] or "Unknown",
                vsm_system=parsed["vsm_system"] or "Unknown",
                type_rationale=parsed["type_rationale"],
                vsm_rationale=parsed["vsm_rationale"],
                classified_by=classifier_name,
                # datetime.utcnow() is deprecated since Python 3.12; this is
                # the documented replacement that still yields a naive UTC
                # timestamp, so the stored value keeps its previous form.
                classified_at=datetime.now(timezone.utc).replace(tzinfo=None),
            )
            write_entity_classification(
                classification, cls_path / f"{result.key}.md"
            )

        if progress_callback is not None:
            progress_callback(done, total, result)

    items = [
        BatchItem(
            key=entity.slug,
            prompt=build_classification_prompt(entity, topic),
        )
        for entity in entities
    ]

    evaluator = BatchEvaluator(
        adapter=adapter,
        config=run_config,
        progress_callback=_write_and_notify,
        delay_seconds=delay_seconds,
    )
    return evaluator.evaluate(items)
|
|
|
|
|
|
# ── Relation-link prompt and runner ───────────────────────────────────────────
|
|
|
|
_RELATION_LINK_PROMPT_TEMPLATE = """\
|
|
You are enriching a Relation-type entity from an infospace about "{topic}".
|
|
|
|
This entity IS a structural connector — a dependency, mechanism, or causal link \
|
|
between two other entities. Your task: identify which two entities it connects \
|
|
and describe the linking mechanism in one sentence.
|
|
|
|
## Entity: {title}
|
|
|
|
**Domain:** {domain}
|
|
|
|
### Definition
|
|
|
|
{definition}
|
|
|
|
### Context
|
|
|
|
{context}
|
|
|
|
---
|
|
|
|
## Instructions
|
|
|
|
1. Read the definition and context carefully.
|
|
2. Identify **Entity A** (the subject/origin of the relation) and **Entity B** \
|
|
(the object/destination).
|
|
3. Write a single sentence explaining HOW this entity connects or mediates between A and B.
|
|
4. Use **exactly** the output format below — no preamble, no extra lines.
|
|
5. For slugs: use lowercase letters and underscores only (same as file names), \
|
|
e.g. "division_of_labour", "market_extent".
|
|
|
|
## Output format
|
|
|
|
SUBJECT: <human-readable title of Entity A>
|
|
SUBJECT_SLUG: <slug of Entity A>
|
|
OBJECT: <human-readable title of Entity B>
|
|
OBJECT_SLUG: <slug of Entity B>
|
|
MECHANISM: <one sentence describing how this entity links A to B>
|
|
"""
|
|
|
|
|
|
def build_relation_link_prompt(entity: EntityMeta, topic: str) -> str:
    """Render the relation-link enrichment prompt for one entity.

    Fills ``_RELATION_LINK_PROMPT_TEMPLATE`` with the topic and the entity's
    title, domain, definition and context. A falsy domain, definition or
    context is replaced by a parenthesised placeholder.
    """
    values = {
        "topic": topic,
        "title": entity.title,
        "domain": entity.domain or "(unspecified)",
        "definition": entity.definition or "(no definition provided)",
        "context": entity.context or "(no context provided)",
    }
    return _RELATION_LINK_PROMPT_TEMPLATE.format(**values)
|
|
|
|
|
|
def parse_relation_link_response(text: str) -> dict:
    """Parse SUBJECT/SUBJECT_SLUG/OBJECT/OBJECT_SLUG/MECHANISM from an LLM response.

    Each line is stripped and split at its first colon. The text before the
    colon is matched case-insensitively against the five labels; the stripped
    text after it becomes the value. Lines without a recognised label are
    ignored, and a label that occurs more than once keeps its last value.

    Slug values are additionally unwrapped: surrounding quotes, backticks,
    angle brackets and spaces are removed. The prompt shows slugs quoted in
    its examples and bracketed in its output format, and a slug stored with
    such wrappers cannot match an entity file name.

    Returns:
        A dict with keys ``links_subject``, ``links_subject_slug``,
        ``links_object``, ``links_object_slug`` and ``links_mechanism``;
        each is ``""`` when its label was not found.
    """
    # Response label -> key in the returned dict (insertion order is the
    # order of the keys in the result).
    field_for_label = {
        "SUBJECT": "links_subject",
        "SUBJECT_SLUG": "links_subject_slug",
        "OBJECT": "links_object",
        "OBJECT_SLUG": "links_object_slug",
        "MECHANISM": "links_mechanism",
    }
    result: dict = dict.fromkeys(field_for_label.values(), "")

    for line in text.splitlines():
        label, colon, value = line.strip().partition(":")
        if not colon:
            continue
        field = field_for_label.get(label.upper())
        if field is None:
            continue
        value = value.strip()
        if field.endswith("_slug"):
            # Slugs are letters and underscores only, so these characters
            # can only be wrappers added by the model.
            value = value.strip("\"'`<> ")
        result[field] = value

    return result
|
|
|
|
|
|
def run_relation_link_capture(
    config: InfospaceConfig,
    relation_entities: List[EntityMeta],
    classifications: dict,  # slug → EntityClassification
    adapter: LLMAdapter,
    run_config: Optional[RunConfig] = None,
    output_dir: Optional[Path] = None,
    progress_callback: Optional[Callable] = None,
    delay_seconds: float = 0.0,
) -> BatchSummary:
    """Capture relation endpoint data for Relation-type entities.

    One relation-link prompt is built for **every** entity in
    ``relation_entities`` and sent through a :class:`BatchEvaluator`. After
    each successful response the matching object in ``classifications`` is
    updated in place with the parsed subject, object and mechanism and is
    rewritten to ``<output_dir>/<slug>.md``.

    This function does not filter its input: entities that already have
    ``links_mechanism`` set must be excluded by the caller if they should
    not be re-queried.

    Args:
        config: The infospace configuration; ``config.topic.name`` is used as
            the prompt topic.
        relation_entities: EntityMeta objects for Relation-type entities only.
        classifications: Slug → EntityClassification map (pre-loaded). The
            contained objects are mutated.
        adapter: LLM adapter.
        run_config: LLM execution configuration.
        output_dir: Where classification files live (defaults to
            ``Path(config.classifications_dir)``).
        progress_callback: Called after each item with (done, total, result).
        delay_seconds: Forwarded to :class:`BatchEvaluator` (seconds to sleep
            between requests, for rate limiting), matching
            ``run_entity_classification``.

    Returns:
        The :class:`BatchSummary` returned by ``BatchEvaluator.evaluate``.
    """
    topic = config.topic.name
    cls_path = output_dir or Path(config.classifications_dir)

    def _write_and_notify(done: int, total: int, result) -> None:
        if result.status == "success" and result.response is not None:
            parsed = parse_relation_link_response(result.response.content)
            existing_cls = classifications.get(result.key)
            # A slug with no pre-loaded classification has nothing to
            # enrich; its response is not written anywhere.
            if existing_cls is not None:
                existing_cls.links_subject = parsed["links_subject"]
                existing_cls.links_subject_slug = parsed["links_subject_slug"]
                existing_cls.links_object = parsed["links_object"]
                existing_cls.links_object_slug = parsed["links_object_slug"]
                existing_cls.links_mechanism = parsed["links_mechanism"]
                write_entity_classification(
                    existing_cls, cls_path / f"{result.key}.md"
                )

        if progress_callback is not None:
            progress_callback(done, total, result)

    items = [
        BatchItem(
            key=entity.slug,
            prompt=build_relation_link_prompt(entity, topic),
        )
        for entity in relation_entities
    ]

    evaluator = BatchEvaluator(
        adapter=adapter,
        config=run_config,
        progress_callback=_write_and_notify,
        delay_seconds=delay_seconds,
    )
    return evaluator.evaluate(items)
|