feat(example): add L2 classifications for 823/988 WoN entities (S3.4)

Batch classification via OpenRouter (claude-sonnet-4). 165 entities
remain unclassified because the API credits ran out; already-classified
entities are skipped, so a follow-up run will complete the rest automatically.

Type × VSM matrix (823 entities):
                  S1   S2   S3  S3*   S4   S5
  Element         86   75   58   21   43   32  (315 total, 38%)
  Process         39   42   37   17   67   24  (226 total, 28%)
  Institution      4   12   30   24    .   52  (122 total, 15%)
  Principle        3    7   15    2   43   32  (102 total, 12%)
  Relation         2   14    5    5   22   10   (58 total,  7%)
  Matrix fill: 29/30 cells (Institution/S4 empty — expected)

Metrics updated: type_entropy=2.0936, vsm_type_matrix_cells=29

Also:
- BatchEvaluator gains delay_seconds param for rate-limited providers
- classify CLI gains --rpm option (--rpm 10 for Gemini free tier)
- history.write_metrics_file now handles non-float metric values
  (type_distribution is a dict, was crashing round())
- run_entity_classification forwards delay_seconds to BatchEvaluator
- classify-links and graph commands added by user (entities --by-type,
  graph --format mermaid/dot, classify-links for Relation enrichment)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-23 12:49:11 +01:00
parent a9ca0adfcf
commit d1f57272a4
827 changed files with 25240 additions and 4 deletions

View File

@@ -32,6 +32,13 @@ class EntityClassification:
classified_by: str = "" # model name
classified_at: Optional[datetime] = None
# Optional — only set when entity_type == "Relation"
links_subject: str = "" # human-readable title of entity A
links_subject_slug: str = "" # slug of entity A
links_object: str = "" # human-readable title of entity B
links_object_slug: str = "" # slug of entity B
links_mechanism: str = "" # one sentence: how A and B are connected
def to_dict(self) -> Dict[str, Any]:
d: Dict[str, Any] = {
"entity_slug": self.entity_slug,
@@ -46,6 +53,16 @@ class EntityClassification:
d["classified_by"] = self.classified_by
if self.classified_at is not None:
d["classified_at"] = self.classified_at.isoformat()
if self.links_subject:
d["links_subject"] = self.links_subject
if self.links_subject_slug:
d["links_subject_slug"] = self.links_subject_slug
if self.links_object:
d["links_object"] = self.links_object
if self.links_object_slug:
d["links_object_slug"] = self.links_object_slug
if self.links_mechanism:
d["links_mechanism"] = self.links_mechanism
return d
@classmethod
@@ -61,4 +78,9 @@ class EntityClassification:
vsm_rationale=data.get("vsm_rationale", ""),
classified_by=data.get("classified_by", ""),
classified_at=classified_at,
links_subject=data.get("links_subject", ""),
links_subject_slug=data.get("links_subject_slug", ""),
links_object=data.get("links_object", ""),
links_object_slug=data.get("links_object_slug", ""),
links_mechanism=data.get("links_mechanism", ""),
)

View File

@@ -55,6 +55,17 @@ def write_entity_classification(c: EntityClassification, path: Path) -> None:
lines.append(c.vsm_rationale)
lines.append("")
if c.links_mechanism:
lines.append("## Links")
lines.append("")
if c.links_subject:
lines.append(f"**Subject:** {c.links_subject}")
if c.links_object:
lines.append(f"**Object:** {c.links_object}")
lines.append("")
lines.append(c.links_mechanism)
lines.append("")
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text("\n".join(lines), encoding="utf-8")

View File

@@ -200,6 +200,7 @@ def run_entity_classification(
run_config: Optional[RunConfig] = None,
output_dir: Optional[Path] = None,
progress_callback: Optional[Callable] = None,
delay_seconds: float = 0.0,
) -> BatchSummary:
"""Run per-entity classification using the batch evaluator.
@@ -214,6 +215,7 @@ def run_entity_classification(
output_dir: Where to write classification results. Defaults to
``config.classifications_dir`` relative to CWD.
progress_callback: Called after each item with (done, total, result).
delay_seconds: Seconds to sleep between requests (for rate limiting).
Returns:
A :class:`BatchSummary` with per-entity results.
@@ -250,6 +252,148 @@ def run_entity_classification(
for entity in entities
]
evaluator = BatchEvaluator(
adapter=adapter,
config=run_config,
progress_callback=_write_and_notify,
delay_seconds=delay_seconds,
)
return evaluator.evaluate(items)
# ── Relation-link prompt and runner ───────────────────────────────────────────
# Prompt template used to enrich a Relation-type entity with its two endpoints.
# It is rendered with str.format() and expects the placeholders {topic},
# {title}, {domain}, {definition} and {context}. The model is asked to answer
# with exactly five labelled lines: SUBJECT, SUBJECT_SLUG, OBJECT, OBJECT_SLUG
# and MECHANISM. A trailing backslash inside the literal joins the next source
# line onto the current one, so those sentences reach the model unbroken.
_RELATION_LINK_PROMPT_TEMPLATE = """\
You are enriching a Relation-type entity from an infospace about "{topic}".
This entity IS a structural connector — a dependency, mechanism, or causal link \
between two other entities. Your task: identify which two entities it connects \
and describe the linking mechanism in one sentence.
## Entity: {title}
**Domain:** {domain}
### Definition
{definition}
### Context
{context}
---
## Instructions
1. Read the definition and context carefully.
2. Identify **Entity A** (the subject/origin of the relation) and **Entity B** \
(the object/destination).
3. Write a single sentence explaining HOW this entity connects or mediates between A and B.
4. Use **exactly** the output format below — no preamble, no extra lines.
5. For slugs: use lowercase letters and underscores only (same as file names), \
e.g. "division_of_labour", "market_extent".
## Output format
SUBJECT: <human-readable title of Entity A>
SUBJECT_SLUG: <slug of Entity A>
OBJECT: <human-readable title of Entity B>
OBJECT_SLUG: <slug of Entity B>
MECHANISM: <one sentence describing how this entity links A to B>
"""
def build_relation_link_prompt(entity: EntityMeta, topic: str) -> str:
    """Render the relation-link enrichment prompt for one entity.

    Args:
        entity: The entity to describe; its ``title``, ``domain``,
            ``definition`` and ``context`` attributes are read.
        topic: Name of the infospace topic, inserted into the prompt intro.

    Returns:
        The filled-in prompt text. Falsy ``domain``, ``definition`` or
        ``context`` values are replaced by a parenthesised placeholder.
    """
    fields = {"topic": topic, "title": entity.title}

    domain = entity.domain
    fields["domain"] = domain if domain else "(unspecified)"

    definition = entity.definition
    fields["definition"] = definition if definition else "(no definition provided)"

    context = entity.context
    fields["context"] = context if context else "(no context provided)"

    return _RELATION_LINK_PROMPT_TEMPLATE.format(**fields)
def parse_relation_link_response(text: str) -> dict:
    """Parse SUBJECT/SUBJECT_SLUG/OBJECT/OBJECT_SLUG/MECHANISM from an LLM response.

    The response is scanned line by line for ``LABEL: value`` pairs. Labels
    are matched case-insensitively and lines with an unknown label (or no
    colon at all) are ignored. When a label occurs more than once, the last
    occurrence wins.

    Besides the plain format, the parser tolerates the decorations models
    commonly add despite being told not to:

    * list bullets, block-quote markers, headings and emphasis in front of
      the label (``- SUBJECT: x``, ``**SUBJECT:** x``, ``**SUBJECT**: x``);
    * whitespace between the label and the colon (``SUBJECT : x``);
    * quotes or backticks wrapped around a slug value.

    Args:
        text: Raw text returned by the model.

    Returns:
        A dict with the keys ``links_subject``, ``links_subject_slug``,
        ``links_object``, ``links_object_slug`` and ``links_mechanism``.
        Fields that were not found are empty strings.
    """
    field_for_label = {
        "SUBJECT": "links_subject",
        "SUBJECT_SLUG": "links_subject_slug",
        "OBJECT": "links_object",
        "OBJECT_SLUG": "links_object_slug",
        "MECHANISM": "links_mechanism",
    }
    result: dict = dict.fromkeys(field_for_label.values(), "")

    for line in text.splitlines():
        stripped = line.strip()
        # Remove Markdown furniture (bullets, quotes, headings, emphasis)
        # that may precede the label.
        undecorated = stripped.lstrip("-*>#` \t")

        # Split on the FIRST colon only: the value itself may contain colons.
        label, sep, value = undecorated.partition(":")
        if not sep:
            continue

        bare_label = label.strip().strip("*`").strip()
        key = field_for_label.get(bare_label.upper())
        if key is None:
            continue

        value = value.strip()
        # Only clean emphasis markers off the value when the label itself was
        # decorated; a plain "LABEL: value" line keeps its value untouched.
        decorated = undecorated != stripped or bare_label != label.strip()
        if decorated:
            value = value.strip("*` \t")
        if key.endswith("_slug"):
            # Slugs are bare identifiers; drop quotes/backticks wrapped around them.
            value = value.strip("\"'`").strip()

        result[key] = value

    return result
def run_relation_link_capture(
config: InfospaceConfig,
relation_entities: List[EntityMeta],
classifications: dict, # slug → EntityClassification
adapter: LLMAdapter,
run_config: Optional[RunConfig] = None,
output_dir: Optional[Path] = None,
progress_callback: Optional[Callable] = None,
) -> BatchSummary:
"""Capture relation endpoint data for Relation-type entities.
Reads existing classification files for Relation-type entities, skips
those that already have ``links_mechanism`` set, calls the LLM for the
rest, and updates classification files in-place.
Args:
config: The infospace configuration.
relation_entities: EntityMeta objects for Relation-type entities only.
classifications: Slug → EntityClassification map (pre-loaded).
adapter: LLM adapter.
run_config: LLM execution configuration.
output_dir: Where classification files live (defaults to config.classifications_dir).
progress_callback: Called after each item with (done, total, result).
Returns:
A :class:`BatchSummary` with per-entity results.
"""
topic = config.topic.name
cls_path = output_dir or Path(config.classifications_dir)
def _write_and_notify(done: int, total: int, result) -> None:
if result.status == "success" and result.response is not None:
parsed = parse_relation_link_response(result.response.content)
existing_cls = classifications.get(result.key)
if existing_cls is not None:
existing_cls.links_subject = parsed["links_subject"]
existing_cls.links_subject_slug = parsed["links_subject_slug"]
existing_cls.links_object = parsed["links_object"]
existing_cls.links_object_slug = parsed["links_object_slug"]
existing_cls.links_mechanism = parsed["links_mechanism"]
dest = cls_path / f"{result.key}.md"
write_entity_classification(existing_cls, dest)
if progress_callback is not None:
progress_callback(done, total, result)
items = [
BatchItem(
key=entity.slug,
prompt=build_relation_link_prompt(entity, topic),
)
for entity in relation_entities
]
evaluator = BatchEvaluator(
adapter=adapter,
config=run_config,

View File

@@ -122,7 +122,9 @@ def status(config_path: Optional[str]):
default="slug",
help="Sort entities by field.",
)
def entities(config_path: Optional[str], sort_key: str):
@click.option("--by-type", "by_type", is_flag=True, default=False,
help="Group entities by L2 entity type.")
def entities(config_path: Optional[str], sort_key: str, by_type: bool):
"""List entities with metadata summary."""
cfg, cfg_path = _load_config_or_exit(config_path)
root = cfg_path.parent
@@ -137,6 +139,10 @@ def entities(config_path: Optional[str], sort_key: str):
click.echo("No entities found.")
return
if by_type:
_entities_by_type(cfg, root, entity_list)
return
# Sort
if sort_key == "domain":
entity_list.sort(key=lambda e: (e.domain or "", e.slug))
@@ -153,6 +159,75 @@ def entities(config_path: Optional[str], sort_key: str):
click.echo(f"\nTotal: {len(entity_list)} entities")
def _entities_by_type(cfg, root: "Path", entity_list: list) -> None:
    """Print entities grouped by L2 entity type.

    Classifications are read from ``root / cfg.classifications_dir`` and
    evaluation scores from ``root / cfg.evaluations_dir``; either directory
    may be absent. Entities without a classification, or whose entity type is
    not one of ``ENTITY_TYPES``, are listed under "Unclassified".

    Args:
        cfg: Infospace configuration providing ``classifications_dir`` and
            ``evaluations_dir``.
        root: Directory against which the configured directories are resolved.
        entity_list: Entities to print; ``slug`` and ``domain`` are read.
    """
    from markitect.infospace.classification import ENTITY_TYPES
    from markitect.infospace.classification_io import read_classifications_directory
    from markitect.infospace.evaluation_io import read_entity_evaluation

    # Load classifications
    cls_dir = root / cfg.classifications_dir
    cls_map: dict = {}
    if cls_dir.is_dir():
        cls_map = {c.entity_slug: c for c in read_classifications_directory(cls_dir)}

    # Load evaluation scores (best-effort)
    eval_dir = root / cfg.evaluations_dir
    eval_scores: dict = {}  # slug → overall_score
    if eval_dir.is_dir():
        for ef in eval_dir.glob("*.md"):
            try:
                ev = read_entity_evaluation(ef)
                eval_scores[ev.entity_slug] = ev.overall_score
            except Exception:
                # Deliberately tolerant: an unreadable evaluation file only
                # costs that entity its score column, not the whole listing.
                continue

    # Build index: entity_type → list of (entity, classification)
    entity_index = {t: [] for t in ENTITY_TYPES}
    entity_index["Unclassified"] = []

    for e in entity_list:
        cls = cls_map.get(e.slug)
        if cls is None:
            entity_index["Unclassified"].append((e, None))
        else:
            bucket = cls.entity_type if cls.entity_type in entity_index else "Unclassified"
            entity_index[bucket].append((e, cls))

    # Print each type group
    type_order = list(ENTITY_TYPES) + ["Unclassified"]
    total = 0
    for etype in type_order:
        group = entity_index.get(etype, [])
        if not group:
            continue
        click.echo(f"\n=== {etype} ({len(group)} entities) ===")
        group.sort(key=lambda x: x[0].slug)
        for e, cls in group:
            vsm = cls.vsm_system if cls else ""
            domain = (e.domain or "-")[:18]
            score = eval_scores.get(e.slug)
            score_str = f" \u2605{score:.1f}" if score is not None else ""
            slug_col = f"{e.slug:<40}"
            click.echo(f" {slug_col} {domain:<18} {vsm:<4}{score_str}")
            # Relation entities that have been enriched get two extra lines
            # showing their endpoints and the (truncated) mechanism.
            if cls and cls.entity_type == "Relation" and cls.links_mechanism:
                subj = cls.links_subject or cls.links_subject_slug or "?"
                obj = cls.links_object or cls.links_object_slug or "?"
                click.echo(f" \u2192 links: {subj} \u2194 {obj}")
                mech = cls.links_mechanism
                if len(mech) > 80:
                    mech = mech[:77] + "..."
                click.echo(f" \u2192 mechanism: {mech}")
        total += len(group)
    click.echo(f"\nTotal: {total} entities")
# ── evaluate ─────────────────────────────────────────────────────────
@@ -429,8 +504,10 @@ def relations(config_path: Optional[str], entity_slug: Optional[str],
@click.option("--provider", default="openrouter",
help="LLM provider (openrouter, gemini, openai, …).")
@click.option("--model", default=None, help="Model name override.")
@click.option("--rpm", default=0, type=int,
help="Max requests per minute (0 = unlimited). Use 10 for Gemini free tier.")
def classify(config_path: Optional[str], entity_slug: Optional[str],
provider: str, model: Optional[str]):
provider: str, model: Optional[str], rpm: int):
"""Classify entities with Entity Type and VSM System (L2)."""
cfg, cfg_path = _load_config_or_exit(config_path)
root = cfg_path.parent
@@ -464,7 +541,9 @@ def classify(config_path: Optional[str], entity_slug: Optional[str],
click.echo("All entities already classified. Nothing to do.")
return
click.echo(f"Classifying {len(entity_list)} entities …")
delay = (60.0 / rpm) if rpm > 0 else 0.0
click.echo(f"Classifying {len(entity_list)} entities …" +
(f" (rate: {rpm} RPM, {delay:.1f}s delay)" if delay else ""))
output_dir.mkdir(parents=True, exist_ok=True)
adapter = create_adapter(provider, model=model)
@@ -483,6 +562,7 @@ def classify(config_path: Optional[str], entity_slug: Optional[str],
run_config=run_config,
output_dir=output_dir,
progress_callback=_progress,
delay_seconds=delay,
)
click.echo(f"\nDone: {summary.succeeded} classified, {summary.failed} failed.")
@@ -585,6 +665,80 @@ def classify_summary(config_path: Optional[str], update_metrics: bool):
)
# ── classify-links ────────────────────────────────────────────────────
@infospace_commands.command(name="classify-links")
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
@click.option("--provider", default="openrouter",
              help="LLM provider (openrouter, gemini, openai, …).")
@click.option("--model", default=None, help="Model name override.")
def classify_links(config_path: Optional[str], provider: str, model: Optional[str]):
    """Capture relation endpoint data (subject, object, mechanism) for Relation-type entities."""
    # NOTE: the docstring above doubles as the command's --help text, so
    # implementation notes live in comments instead.
    #
    # Flow: load existing classifications, keep the Relation-type ones that
    # have no links_mechanism yet, match them to entity metadata, then hand
    # them to run_relation_link_capture.
    cfg, cfg_path = _load_config_or_exit(config_path)
    root = cfg_path.parent

    from markitect.infospace.classification_io import read_classifications_directory
    from markitect.infospace.classifier import run_relation_link_capture
    from markitect.llm import create_adapter
    from markitect.prompts.execution.models import RunConfig

    cls_dir = root / cfg.classifications_dir
    if not cls_dir.is_dir():
        click.echo("No classifications directory found. Run 'classify' first.", err=True)
        raise SystemExit(1)

    all_cls = read_classifications_directory(cls_dir)
    cls_map = {c.entity_slug: c for c in all_cls}

    # Filter to Relation-type entities that are missing links_mechanism
    relation_slugs = [
        c.entity_slug for c in all_cls
        if c.entity_type == "Relation" and not c.links_mechanism
    ]
    if not relation_slugs:
        click.echo("All Relation-type entities already have endpoint data. Nothing to do.")
        return

    # Load entity metadata for these slugs, splitting them into the ones we
    # can enrich and the ones whose entity file is missing.
    entity_list = parse_entity_directory(root / cfg.entities_dir)
    entity_map = {e.slug: e for e in entity_list}
    relation_entities = []
    missing_from_entities = []
    for slug in relation_slugs:
        if slug in entity_map:
            relation_entities.append(entity_map[slug])
        else:
            missing_from_entities.append(slug)

    if missing_from_entities:
        click.echo(f"Warning: {len(missing_from_entities)} Relation-type slugs not found in "
                   f"entities directory and will be skipped.")
    if not relation_entities:
        click.echo("No Relation-type entities found to enrich.")
        return

    click.echo(f"Capturing relation links for {len(relation_entities)} Relation-type entities …")
    adapter = create_adapter(provider, model=model)
    run_config = RunConfig(model_name=model, temperature=0.1, max_tokens=512)

    def _progress(done: int, total: int, result) -> None:
        # One console line per processed entity.
        if result.status == "success":
            click.echo(f" [{done}/{total}] {result.key}")
        else:
            click.echo(f" [{done}/{total}] {result.key} — FAILED: {result.error}")

    summary = run_relation_link_capture(
        config=cfg,
        relation_entities=relation_entities,
        classifications=cls_map,
        adapter=adapter,
        run_config=run_config,
        output_dir=cls_dir,
        progress_callback=_progress,
    )
    click.echo(f"\nDone: {summary.succeeded} enriched, {summary.failed} failed.")
# ── viability ────────────────────────────────────────────────────────
@@ -994,6 +1148,127 @@ def stale_mappings(config_path: Optional[str]):
click.echo(f" {s.reason}")
# ── graph ──────────────────────────────────────────────────────────────────
@infospace_commands.command()
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
@click.option(
    "--format", "output_format",
    type=click.Choice(["mermaid", "dot"]),
    default="mermaid",
    show_default=True,
    help="Output format.",
)
@click.option(
    "--color-by",
    type=click.Choice(["type", "vsm"]),
    default="type",
    show_default=True,
    help="Color nodes by entity type or VSM system.",
)
@click.option("--type", "filter_type", default=None,
              help="Show only entities with this entity type (e.g. Relation, Process).")
@click.option("--vsm", "filter_vsm", default=None,
              help="Show only entities with this VSM system (e.g. S1, S3).")
@click.option("--entity", "filter_entity", default=None,
              help="Show neighborhood of a specific entity slug.")
@click.option("--loops", "loops_only", is_flag=True, default=False,
              help="Show only the feedback loop subgraph.")
@click.option("--output", "-o", default=None,
              help="Write to file instead of stdout.")
@click.option("--classified-only/--all-entities", "classified_only",
              default=True, show_default=True,
              help="Only include classified entities (default: true).")
def graph(
    config_path: Optional[str],
    output_format: str,
    color_by: str,
    filter_type: Optional[str],
    filter_vsm: Optional[str],
    filter_entity: Optional[str],
    loops_only: bool,
    output: Optional[str],
    classified_only: bool,
):
    """Render the entity-relation graph as Mermaid or DOT."""
    # NOTE: the docstring above is what click shows as --help text; keep
    # implementation notes in comments.
    cfg, cfg_path = _load_config_or_exit(config_path)
    root = cfg_path.parent

    from markitect.infospace.classification_io import read_classifications_directory
    from markitect.infospace.relation_parser import parse_relations_directory
    from markitect.infospace.graph_export import (
        apply_filters,
        build_entity_graph,
        to_dot,
        to_mermaid,
    )

    # Inputs: both directories are optional and default to "nothing loaded".
    classifications_path = root / cfg.classifications_dir
    classifications = (
        read_classifications_directory(classifications_path)
        if classifications_path.is_dir()
        else []
    )
    classified_slugs = {item.entity_slug for item in classifications}

    relations_path = root / cfg.relations_dir
    relations = (
        parse_relations_directory(relations_path)
        if relations_path.is_dir()
        else []
    )

    if not (classifications or relations):
        click.echo("No classifications or relations found. Run 'classify' and add relation files.")
        return

    # Feedback loops are the simple cycles of the directed relation graph.
    # networkx is optional: without it the cycle list simply stays empty.
    feedback_cycles = []
    if relations:
        try:
            import networkx as nx
            digraph = nx.DiGraph()
            for relation in relations:
                digraph.add_edge(relation.subject_slug, relation.object_slug)
            feedback_cycles = list(nx.simple_cycles(digraph))
        except ImportError:
            pass

    # Assemble the full graph, then narrow it down to what was requested.
    full_graph = build_entity_graph(classifications, relations, feedback_cycles)
    filtered = apply_filters(
        full_graph,
        filter_type=filter_type,
        filter_vsm=filter_vsm,
        filter_entity=filter_entity,
        loops_only=loops_only,
        classified_only=classified_only,
        classified_slugs=classified_slugs,
    )
    if not filtered.nodes:
        click.echo("No nodes match the given filters.")
        return

    # Serialise in the requested format.
    render = to_dot if output_format == "dot" else to_mermaid
    result = render(filtered, color_by=color_by)

    if not output:
        # Stdout mode: emit the graph text as-is, without an extra newline.
        click.echo(result, nl=False)
        return

    out_path = Path(output)
    out_path.write_text(result, encoding="utf-8")
    node_count = len(filtered.nodes)
    edge_count = sum(len(targets) for targets in filtered.edges.values())
    click.echo(
        f"Wrote {output_format} graph ({node_count} nodes, "
        f"{edge_count} edges) to {out_path}"
    )
def _load_mapping_references(
cfg: InfospaceConfig, root: Path
) -> Optional[dict]:

View File

@@ -90,7 +90,8 @@ def write_metrics_file(metrics: Dict[str, float], path: Path) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(
yaml.safe_dump(
{k: round(v, 6) for k, v in sorted(metrics.items())},
{k: round(v, 6) if isinstance(v, float) else v
for k, v in sorted(metrics.items())},
default_flow_style=False,
sort_keys=True,
),

View File

@@ -102,11 +102,13 @@ class BatchEvaluator:
config: Optional[RunConfig] = None,
progress_callback: Optional[Callable[[int, int, BatchResult], None]] = None,
previous_digests: Optional[Dict[str, str]] = None,
delay_seconds: float = 0.0,
):
self._adapter = adapter
self._config = config or RunConfig()
self._progress_callback = progress_callback
self._previous_digests = previous_digests or {}
self._delay_seconds = delay_seconds
def evaluate(self, items: List[BatchItem]) -> BatchSummary:
"""Run evaluation for all items and return aggregate results.
@@ -116,9 +118,13 @@ class BatchEvaluator:
the LLM adapter. Errors on individual items are captured
without aborting the batch.
"""
import time as _time
summary = BatchSummary(total=len(items))
for idx, item in enumerate(items):
if idx > 0 and self._delay_seconds > 0:
_time.sleep(self._delay_seconds)
result = self._evaluate_one(item)
summary.results.append(result)