feat(infospace): add L2 entity classification with type × VSM matrix (S2.9)

Implements the L2 typed-entities layer — each entity is assigned an
Entity Type (Element, Process, Relation, Principle, Institution) and a
VSM System (S1–S5, plus S3*) by an LLM, with a one-sentence rationale for each.

New modules:
- markitect/infospace/classification.py — EntityClassification dataclass
  + ENTITY_TYPES / VSM_SYSTEMS controlled vocabularies
- markitect/infospace/classification_io.py — write/read classification
  files (YAML frontmatter + markdown body, mirrors evaluation_io)
- markitect/infospace/classifier.py — build_classification_prompt(),
  parse_classification_response(), run_entity_classification(); batch
  runner writes files incrementally (same resumable pattern as evaluate)

CLI: markitect infospace classify [--entity SLUG] [--provider P] [--model M]
  - Incremental skip: checks output/classifications/ for existing files
  - Defaults to openrouter provider; 2000 max_tokens (Gemini 2.5 Flash
    uses ~787 thinking tokens, so 800 was too low)

CLI: markitect infospace classify-summary [--update-metrics]
  - Entity type counts + VSM system counts with percentages
  - 5 × 6 type × VSM matrix (spots structural blind spots at a glance)
  - --update-metrics writes type_distribution, type_entropy,
    vsm_type_matrix_cells to metrics.yaml

Config: InfospaceConfig gains classifications_dir (default output/classifications)
Schema: schemas/typed-entity-schema-v1.0.md — type/VSM vocabulary tables,
  rationale format rules, validation rules, metrics enabled at L2
infospace.yaml: schemas.typed_entity references typed-entity-schema-v1.0.md

Seed classifications (3): division_of_labour (Process/S1),
  natural_price_as_central_price (Principle/S2),
  invisible_hand_mechanism (Principle/S4)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-23 09:35:58 +01:00
parent 2d45425b25
commit 81a4c8796a
10 changed files with 789 additions and 0 deletions

View File

@@ -419,6 +419,172 @@ def relations(config_path: Optional[str], entity_slug: Optional[str],
click.echo(f"{subj:<35} {pred:<30} {obj:<35} {r.vsm_channel}")
# ── classify ─────────────────────────────────────────────────────────
@infospace_commands.command()
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
@click.option(
    "--entity",
    "entity_slug",
    default=None,
    help="Classify a single entity by slug.",
)
@click.option(
    "--provider",
    default="openrouter",
    help="LLM provider (openrouter, gemini, openai, …).",
)
@click.option("--model", default=None, help="Model name override.")
def classify(config_path: Optional[str], entity_slug: Optional[str],
             provider: str, model: Optional[str]):
    """Classify entities with Entity Type and VSM System (L2)."""
    # NOTE: the docstring above doubles as the Click --help text, so the
    # longer explanation lives in comments.
    #
    # Flow:
    #   1. Load the infospace config and parse every entity under
    #      cfg.entities_dir.
    #   2. With --entity, narrow the work list to that single slug
    #      (re-classifying it even if an output file already exists).
    #      Without it, drop every entity whose slug already has a
    #      "<slug>.md" file in the classifications directory.
    #   3. Hand the remaining entities to run_entity_classification() and
    #      report per-entity progress plus a final succeeded/failed tally.
    cfg, cfg_path = _load_config_or_exit(config_path)
    project_root = cfg_path.parent

    from markitect.infospace.classifier import run_entity_classification
    from markitect.llm import create_adapter
    from markitect.prompts.execution.models import RunConfig

    entities_path = project_root / cfg.entities_dir
    pending = parse_entity_directory(entities_path)
    if not pending:
        click.echo(f"No entities found in {entities_path}", err=True)
        return

    target_dir = project_root / cfg.classifications_dir

    if entity_slug:
        # Explicit single-entity mode: no incremental skip is applied.
        pending = [entity for entity in pending if entity.slug == entity_slug]
        if not pending:
            click.echo(f"Entity '{entity_slug}' not found.", err=True)
            return
    elif target_dir.is_dir():
        # Incremental skip: a classification file named after the slug marks
        # the entity as done. (If the directory does not exist yet, nothing
        # can have been classified, so the work list is left untouched.)
        already_done = {path.stem for path in target_dir.glob("*.md")}
        remaining = [entity for entity in pending if entity.slug not in already_done]
        skipped = len(pending) - len(remaining)
        pending = remaining
        if skipped:
            click.echo(f"Skipping {skipped} already-classified entities.")
        if not pending:
            click.echo("All entities already classified. Nothing to do.")
            return

    click.echo(f"Classifying {len(pending)} entities …")
    target_dir.mkdir(parents=True, exist_ok=True)

    llm_adapter = create_adapter(provider, model=model)
    llm_run_config = RunConfig(model_name=model, temperature=0.1, max_tokens=2000)

    def _report(done: int, total: int, result) -> None:
        # One line per finished entity; failures carry the error text.
        line = f" [{done}/{total}] {result.key}"
        if result.status != "success":
            line += f" — FAILED: {result.error}"
        click.echo(line)

    outcome = run_entity_classification(
        config=cfg,
        entities=pending,
        adapter=llm_adapter,
        run_config=llm_run_config,
        output_dir=target_dir,
        progress_callback=_report,
    )
    click.echo(f"\nDone: {outcome.succeeded} classified, {outcome.failed} failed.")
# ── classify-summary ──────────────────────────────────────────────────
@infospace_commands.command(name="classify-summary")
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
@click.option("--update-metrics", "update_metrics", is_flag=True, default=False,
              help="Write type_distribution metrics to metrics.yaml.")
def classify_summary(config_path: Optional[str], update_metrics: bool):
    """Show type × VSM distribution across all classified entities (L2)."""
    # NOTE: the docstring above doubles as the Click --help text, so the
    # longer explanation lives in comments.
    #
    # Prints, for every classification file found in cfg.classifications_dir:
    #   * entity-type counts with percentages (most frequent first),
    #   * VSM-system counts with percentages (in fixed S1..S5 order),
    #   * the entity-type × VSM-system matrix and how many cells are occupied.
    # With --update-metrics, type_distribution, type_entropy and
    # vsm_type_matrix_cells are merged into <metrics_dir>/metrics.yaml.
    cfg, cfg_path = _load_config_or_exit(config_path)
    root = cfg_path.parent

    from collections import Counter

    from markitect.infospace.classification import ENTITY_TYPES
    from markitect.infospace.classification_io import read_classifications_directory

    cls_dir = root / cfg.classifications_dir
    if not cls_dir.is_dir():
        click.echo("No classifications directory found. Run 'classify' first.")
        return

    all_cls = read_classifications_directory(cls_dir)
    if not all_cls:
        click.echo("No classification files found.")
        return

    n = len(all_cls)  # > 0 here, so the divisions below are safe
    type_counts: Counter = Counter()
    vsm_counts: Counter = Counter()
    matrix: Counter = Counter()  # (entity_type, vsm_system) → count
    for classification in all_cls:
        type_counts[classification.entity_type] += 1
        vsm_counts[classification.vsm_system] += 1
        matrix[(classification.entity_type, classification.vsm_system)] += 1

    click.echo(f"Classification summary — {n} entities\n")

    click.echo("Entity types:")
    # most_common() sorts by descending count and keeps first-seen order
    # among ties.
    for t, count in type_counts.most_common():
        pct = 100 * count / n
        click.echo(f" {t:<15} {count:>4} ({pct:.1f}%)")
    click.echo()

    # Display order of the matrix columns.
    vsm_order = ["S1", "S2", "S3", "S3*", "S4", "S5"]
    click.echo("VSM systems:")
    for v in vsm_order:
        if v in vsm_counts:
            count = vsm_counts[v]
            pct = 100 * count / n
            click.echo(f" {v:<6} {count:>4} ({pct:.1f}%)")
    click.echo()

    # Type × VSM matrix
    header = f"{'':15}" + "".join(f"{v:>7}" for v in vsm_order)
    sep = "-" * (15 + 7 * len(vsm_order))
    click.echo(header)
    click.echo(sep)
    filled_cells = 0  # occupied cells *of the rendered matrix*
    shown = 0         # classifications that landed in a rendered cell
    for t in ENTITY_TYPES:
        row = f"{t:<15}"
        for v in vsm_order:
            cell = matrix[(t, v)]  # Counter yields 0 for absent keys
            if cell:
                filled_cells += 1
                shown += cell
            row += f"{cell or '.':>7}"
        click.echo(row)
    click.echo()

    # Count only cells that exist in the rendered matrix. Counting every
    # distinct (type, vsm) pair from the files would let a classification
    # with an out-of-vocabulary type or system inflate the fill figure —
    # potentially beyond total_cells.
    total_cells = len(ENTITY_TYPES) * len(vsm_order)
    click.echo(f"Matrix fill: {filled_cells}/{total_cells} cells occupied")
    click.echo()

    outside = n - shown
    if outside:
        # Surface, rather than silently drop, entries the matrix cannot show.
        click.echo(
            f"Warning: {outside} classification(s) use an entity type or VSM "
            "system outside the matrix and are not shown in it.",
            err=True,
        )

    if update_metrics:
        import math

        from markitect.infospace.history import read_metrics_file, write_metrics_file

        metrics_dir = root / cfg.metrics_dir
        metrics_dir.mkdir(parents=True, exist_ok=True)
        metrics_path = metrics_dir / "metrics.yaml"

        # Shannon entropy (base 2) of the entity-type distribution. Every
        # count is >= 1, so p > 0 and log2(p) is always defined.
        type_entropy = 0.0
        for count in type_counts.values():
            p = count / n
            type_entropy -= p * math.log2(p)

        # NOTE(review): `or {}` tolerates a None/empty result — confirm what
        # read_metrics_file returns when metrics.yaml does not exist yet.
        existing = read_metrics_file(metrics_path) or {}
        new_metrics = {
            # Plain dict so the metrics writer never sees a Counter subclass.
            "type_distribution": dict(type_counts),
            "vsm_type_matrix_cells": filled_cells,
            "type_entropy": round(type_entropy, 4),
        }
        merged = {**existing, **new_metrics}
        write_metrics_file(merged, metrics_path)
        click.echo(
            f"Updated metrics.yaml: type_entropy={type_entropy:.4f}, "
            f"vsm_type_matrix_cells={filled_cells}"
        )
# ── viability ────────────────────────────────────────────────────────