Files
markitect-main/markitect/infospace/classification_io.py
tegwick d1f57272a4 feat(example): add L2 classifications for 823/988 WoN entities (S3.4)
Batch classification via OpenRouter (claude-sonnet-4). 165 entities
remain unclassified due to credit exhaustion; incremental skip means
a follow-up run will complete them automatically.

Type × VSM matrix (823 entities):
                  S1   S2   S3  S3*   S4   S5
  Element         86   75   58   21   43   32  (315 total, 38%)
  Process         39   42   37   17   67   24  (226 total, 28%)
  Institution      4   12   30   24    .   52  (122 total, 15%)
  Principle        3    7   15    2   43   32  (102 total, 12%)
  Relation         2   14    5    5   22   10   (58 total,  7%)
  Matrix fill: 29/30 cells (Institution/S4 empty — expected)

Metrics updated: type_entropy=2.0936, vsm_type_matrix_cells=29

Also:
- BatchEvaluator gains delay_seconds param for rate-limited providers
- classify CLI gains --rpm option (--rpm 10 for Gemini free tier)
- history.write_metrics_file now handles non-float metric values
  (type_distribution is a dict, was crashing round())
- run_entity_classification forwards delay_seconds to BatchEvaluator
- classify-links and graph commands added by user (entities --by-type,
  graph --format mermaid/dot, classify-links for Relation enrichment)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-23 12:49:11 +01:00

92 lines
2.6 KiB
Python

"""
Read/write utilities for entity classification files (L2).
Classification files use YAML frontmatter (machine-readable) plus a
markdown body (human-readable), matching the convention used by evaluation
files.
"""
from __future__ import annotations
from pathlib import Path
from typing import List
import yaml
from .classification import EntityClassification
# Delimiter line that opens and closes the YAML frontmatter block of a
# classification file; used by both the writer and the reader below.
_FRONTMATTER_SEP = "---"
def write_entity_classification(c: EntityClassification, path: Path) -> None:
    """Serialise one entity classification to *path*.

    The file starts with a YAML frontmatter block holding ``c.to_dict()``
    (keys kept in their original order), followed by a markdown body with
    the entity type, the VSM system and any optional rationale / links
    sections that are set on *c*.

    Args:
        c: Classification to persist.
        path: Destination file. Missing parent directories are created and
            the file is written as UTF-8.
    """
    frontmatter_yaml = yaml.safe_dump(
        c.to_dict(), default_flow_style=False, sort_keys=False
    ).rstrip()
    heading = c.entity_slug.replace("_", " ").replace("-", " ").title()

    # Mandatory part of the document: frontmatter, title and the two
    # always-present sections.
    out: List[str] = [
        _FRONTMATTER_SEP,
        frontmatter_yaml,
        _FRONTMATTER_SEP,
        "",
        f"# Classification: {heading}",
        "",
        "## Entity Type",
        "",
        c.entity_type,
        "",
        "## VSM System",
        "",
        c.vsm_system,
        "",
    ]

    # Optional rationale sections are emitted only when they have content.
    for section_heading, section_text in (
        ("## Type Rationale", c.type_rationale),
        ("## VSM Rationale", c.vsm_rationale),
    ):
        if section_text:
            out += [section_heading, "", section_text, ""]

    # The links section is keyed on the mechanism text; subject and object
    # lines are added inside it only when present.
    if c.links_mechanism:
        out += ["## Links", ""]
        if c.links_subject:
            out.append(f"**Subject:** {c.links_subject}")
        if c.links_object:
            out.append(f"**Object:** {c.links_object}")
        out += ["", c.links_mechanism, ""]

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(out), encoding="utf-8")
def read_entity_classification(path: Path) -> EntityClassification:
    """Read a classification file (YAML frontmatter + markdown body).

    Only the YAML frontmatter is parsed; the markdown body is ignored. The
    frontmatter is the text between the first two lines that consist solely
    of the delimiter (trailing whitespace allowed).

    Args:
        path: Classification file to read (decoded as UTF-8).

    Returns:
        The object built by ``EntityClassification.from_dict`` from the
        parsed frontmatter mapping.

    Raises:
        ValueError: If the file has fewer than two delimiter lines, or if
            the frontmatter does not parse to a mapping.
    """
    text = path.read_text(encoding="utf-8")

    # Match the delimiter against whole lines rather than as a substring:
    # a YAML value (or a "----" rule) whose line merely ends in "---" must
    # not be mistaken for the end of the frontmatter, and a closing
    # delimiter on the last line without a trailing newline must still count.
    rows = text.split("\n")
    fences = [
        index for index, row in enumerate(rows)
        if row.rstrip() == _FRONTMATTER_SEP
    ]
    if len(fences) < 2:
        raise ValueError(f"No YAML frontmatter found in {path}")

    opening, closing = fences[0], fences[1]
    fm = yaml.safe_load("\n".join(rows[opening + 1:closing]))

    # Empty frontmatter parses to None and a scalar/list is equally unusable;
    # fail with a clear message instead of an opaque error further down.
    if not isinstance(fm, dict):
        raise ValueError(f"YAML frontmatter in {path} is not a mapping")
    return EntityClassification.from_dict(fm)
def read_classifications_directory(directory: Path) -> List[EntityClassification]:
    """Read all classification files from a directory.

    Every ``*.md`` file directly inside *directory* is parsed with
    ``read_entity_classification``, in sorted path order. Reading is
    best-effort: a file that cannot be read or parsed is skipped, and a
    warning naming the file and the error is logged so the omission is
    visible instead of silent.

    Args:
        directory: Directory to scan (not searched recursively).

    Returns:
        The successfully parsed classifications, in sorted path order.
    """
    # Local import keeps this block self-contained without touching the
    # module's import section.
    import logging

    log = logging.getLogger(__name__)
    results: List[EntityClassification] = []
    for p in sorted(directory.glob("*.md")):
        try:
            parsed = read_entity_classification(p)
        except Exception as exc:  # best-effort boundary: skip, but report
            log.warning("Skipping unreadable classification file %s: %s", p, exc)
            continue
        results.append(parsed)
    return results