entity relationship model

2026-05-14 15:06:17 +02:00
parent 5e6c5aed8b
commit 9627d03c1a
8 changed files with 599 additions and 6 deletions
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@ Start with:
 - `docs/reference-pilot-decision.md`
 - `docs/markitect-main-scope-assessment.md`
 - `docs/markitect-tool-adapter.md`
+- `docs/entity-relation-model.md`
 - `docs/orthogonal-successor-roadmap.md`
 - `docs/legacy-infospace-feature-inventory.md`
 - `docs/successor-boundary-interface-map.md`
--- a/docs/entity-relation-model.md
+++ b/docs/entity-relation-model.md
@@ -0,0 +1,54 @@
+# Entity And Relation Model
+
+`infospace-bench` owns the application-level semantic model for infospace
+entities and relation triplets. `markitect-tool` remains the Markdown structure
+provider and is accessed only through `infospace_bench.markdown_adapter`.
+
+## Entity Artifacts
+
+Entity artifacts are registered in `artifacts/index.yaml` with `kind: entity`
+and are stored under `artifacts/entities/`.
+
+The parser extracts:
+
+- `artifact_id`: manifest identity such as `entity/division.md`
+- `slug`, `title`, and `h1_raw`: identity derived from the document H1
+- `definition`, `source_chapter`, `context`, `domain`, `original_wording`,
+  and `modern_interpretation`: legacy-style sections where present
+- `h1_is_title_case`, `has_original_wording`, `definition_word_count`,
+  `total_word_count`, and ordered `section_slugs`: compatibility metrics used by
+  evaluation and inspection flows
+- `source_path`: path to the concrete artifact file
+
+An H1 heading and `## Definition` are required. Missing required sections raise
+`invalid_entity_artifact` with a `missing_sections` detail list.
+
+## Relation Artifacts
+
+Relation artifacts are registered with `kind: relation` and are stored under
+`artifacts/relations/`.
+
+The parser extracts:
+
+- `artifact_id` and `slug`: manifest identity plus a relation slug derived from
+  the H1
+- `subject`, `predicate`, `object`: the relation triplet
+- `subject_slug`, `object_slug`, `subject_entity_id`, and `object_entity_id`:
+  endpoint links back to parsed entity artifacts
+- `relation_type`, `vsm_channel`, `evidence`, and `feedback_role`: semantic and
+  evaluation metadata
+- `is_feedback_member`: derived from whether `feedback_role` is present
+
+An H1 heading, `## Subject`, `## Predicate`, and `## Object` are required;
+missing ones raise `invalid_relation_artifact` with a `missing_sections` detail
+list. When relation listing is performed from an infospace manifest, subject and
+object slugs must resolve to entity artifacts or `unresolved_relation_endpoint`
+is raised.
+
+## CLI
+
+```bash
+python3 -m infospace_bench entities infospaces/bootstrap-pilot
+python3 -m infospace_bench relations infospaces/bootstrap-pilot
+```
+
+These commands emit JSON records for downstream evaluation, graphing, and
+inspection workflows.
--- a/src/infospace_bench/__init__.py
+++ b/src/infospace_bench/__init__.py
@@ -9,6 +9,7 @@ from .models import (
    TopicConfig,
    ViabilityThreshold,
 )
+from .semantics import EntityRecord, RelationRecord, list_entities, list_relations

 __all__ = [
    "DisciplineBinding",
@@ -19,10 +20,14 @@ __all__ = [
    "InfospaceError",
    "KnowledgeArtifact",
    "MetricValue",
+    "EntityRecord",
+    "RelationRecord",
    "ScoreEntry",
    "TopicConfig",
    "ViabilityThreshold",
    "add_artifact",
    "create_infospace",
+    "list_entities",
+    "list_relations",
    "load_infospace",
 ]
--- a/src/infospace_bench/cli.py
+++ b/src/infospace_bench/cli.py
@@ -8,6 +8,7 @@ from pathlib import Path
 from .errors import InfospaceError
 from .lifecycle import add_artifact, create_infospace, load_infospace
 from .markdown_adapter import validate_infospace_artifacts
+from .semantics import list_entities, list_relations


 def build_parser() -> argparse.ArgumentParser:
@@ -35,6 +36,12 @@ def build_parser() -> argparse.ArgumentParser:
    validate = sub.add_parser("validate", help="Validate infospace artifacts")
    validate.add_argument("root")

+    entities = sub.add_parser("entities", help="List parsed entity artifacts")
+    entities.add_argument("root")
+
+    relations = sub.add_parser("relations", help="List parsed relation artifacts")
+    relations.add_argument("root")
+
    return parser


@@ -72,6 +79,23 @@ def main(argv: list[str] | None = None) -> int:
                }
            )
            return 0 if valid else 1
+        elif args.command == "entities":
+            _write_json(
+                {
+                    "entities": [
+                        entity.to_dict() for entity in list_entities(Path(args.root))
+                    ]
+                }
+            )
+        elif args.command == "relations":
+            _write_json(
+                {
+                    "relations": [
+                        relation.to_dict()
+                        for relation in list_relations(Path(args.root))
+                    ]
+                }
+            )
        else:
            parser.error(f"Unhandled command: {args.command}")
    except InfospaceError as exc:
--- a/src/infospace_bench/lifecycle.py
+++ b/src/infospace_bench/lifecycle.py
@@ -15,13 +15,20 @@ CONFIG_FILE = "infospace.yaml"
 ARTIFACT_INDEX = "artifacts/index.yaml"
 LAYOUT_DIRS = (
    "artifacts/sources",
+    "artifacts/entities",
+    "artifacts/relations",
    "artifacts/generated",
    "output/evaluations",
    "output/metrics",
    "reports",
    "exports",
 )
-KIND_DIRS = {"source": "sources", "generated": "generated"}
+KIND_DIRS = {
+    "source": "sources",
+    "entity": "entities",
+    "relation": "relations",
+    "generated": "generated",
+}


 def create_infospace(
--- a/src/infospace_bench/semantics.py
+++ b/src/infospace_bench/semantics.py
@@ -0,0 +1,281 @@
+from __future__ import annotations
+
+import re
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any
+
+from .errors import InfospaceError
+from .lifecycle import load_infospace
+from .markdown_adapter import (
+    ParsedMarkdownArtifact,
+    extract_section_text,
+    parse_markdown_artifact,
+)
+
+# Short function words that `_is_title_case` allows to stay lowercase when they
+# are not the first word of a heading.
+MINOR_TITLE_WORDS = set(
+    "a an and as at but by for if in is nor of on or so the to up yet".split()
+)
+
+
+@dataclass(frozen=True)
+class EntityRecord:
+    """Immutable view of one parsed entity artifact.
+
+    The first four fields are mandatory; every other field defaults to an
+    empty or zero value.
+    """
+
+    # Identity.
+    artifact_id: str
+    slug: str
+    title: str
+    h1_raw: str
+    # Section texts.
+    definition: str = ""
+    source_chapter: str = ""
+    context: str = ""
+    domain: str = ""
+    original_wording: str = ""
+    modern_interpretation: str = ""
+    # Derived metrics.
+    h1_is_title_case: bool = False
+    has_original_wording: bool = False
+    definition_word_count: int = 0
+    total_word_count: int = 0
+    section_slugs: list[str] = field(default_factory=list)
+    # Location of the artifact file.
+    source_path: str = ""
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return the record's fields as a plain dictionary."""
+        as_mapping = asdict(self)
+        return as_mapping
+
+
+@dataclass(frozen=True)
+class RelationRecord:
+    """Immutable view of one parsed relation artifact (a triplet plus metadata)."""
+
+    # Identity and the subject / predicate / object triplet with endpoint links.
+    artifact_id: str
+    slug: str
+    subject: str
+    subject_slug: str
+    subject_entity_id: str
+    predicate: str
+    object: str
+    object_slug: str
+    object_entity_id: str
+    # Optional section texts.
+    relation_type: str = ""
+    vsm_channel: str = ""
+    evidence: str = ""
+    feedback_role: str = ""
+    # Location of the artifact file.
+    source_path: str = ""
+
+    @property
+    def is_feedback_member(self) -> bool:
+        """True when ``feedback_role`` holds anything other than whitespace."""
+        return self.feedback_role.strip() != ""
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return the fields as a dictionary, with ``is_feedback_member`` appended."""
+        return {**asdict(self), "is_feedback_member": self.is_feedback_member}
+
+
+def parse_entity_artifact(artifact_id: str, path: str | Path) -> EntityRecord:
+    """Parse one entity Markdown artifact into an ``EntityRecord``.
+
+    Raises:
+        InfospaceError: code ``invalid_entity_artifact`` when the document has
+            no H1 or no non-empty ``Definition`` section; the detail carries
+            the ``missing_sections`` list.
+    """
+    source = Path(path)
+    document = parse_markdown_artifact(source)
+    heading = _first_heading(document, level=1)
+    definition_text = _section_text(document, "Definition")
+
+    requirements = (
+        ("h1", heading is not None),
+        ("definition", bool(definition_text)),
+    )
+    absent = [label for label, present in requirements if not present]
+    if absent:
+        raise InfospaceError(
+            "invalid_entity_artifact",
+            f"Invalid entity artifact: {artifact_id}",
+            {
+                "artifact_id": artifact_id,
+                "path": str(source),
+                "missing_sections": absent,
+            },
+        )
+
+    # A missing H1 was rejected above, so the heading is available here.
+    heading_text = heading.text
+    wording = _section_text(document, "Original Wording", "Smith's Original Wording")
+    # The first domain heading that yields text wins.
+    domain_headings = (
+        "Economic Domain",
+        "Supply Chain Domain",
+        "Knowledge Domain",
+        "Domain",
+    )
+    values: dict[str, Any] = {
+        "artifact_id": artifact_id,
+        "slug": slugify(heading_text),
+        "title": heading_text,
+        "h1_raw": heading_text,
+        "definition": definition_text,
+        "source_chapter": _section_text(document, "Source Chapter"),
+        "context": _section_text(document, "Context"),
+        "domain": _section_text(document, *domain_headings),
+        "original_wording": wording,
+        "modern_interpretation": _section_text(document, "Modern Interpretation"),
+        "h1_is_title_case": _is_title_case(heading_text),
+        "has_original_wording": bool(wording),
+        "definition_word_count": _word_count(definition_text),
+        # Counted over the raw file text, headings included.
+        "total_word_count": _word_count(source.read_text(encoding="utf-8")),
+        "section_slugs": _section_slugs(document),
+        "source_path": str(source),
+    }
+    return EntityRecord(**values)
+
+
+def parse_relation_artifact(
+    artifact_id: str,
+    path: str | Path,
+    entity_ids: dict[str, str] | None = None,
+) -> RelationRecord:
+    """Parse one relation Markdown artifact into a ``RelationRecord``.
+
+    Args:
+        artifact_id: Identity to store on the record and in error details.
+        path: Location of the Markdown file.
+        entity_ids: Optional mapping of entity slug to entity artifact id.
+            When given, both endpoint slugs must be keys of the mapping; when
+            ``None``, the endpoint entity ids are left empty.
+
+    Raises:
+        InfospaceError: code ``invalid_relation_artifact`` when the H1 or a
+            non-empty ``Subject``, ``Predicate`` or ``Object`` section is
+            missing; code ``unresolved_relation_endpoint`` when an endpoint
+            slug is not a key of ``entity_ids``.
+    """
+    artifact_path = Path(path)
+    parsed = parse_markdown_artifact(artifact_path)
+    h1 = _first_heading(parsed, level=1)
+    missing_sections: list[str] = []
+    if h1 is None:
+        missing_sections.append("h1")
+
+    subject = _section_text(parsed, "Subject")
+    predicate = _section_text(parsed, "Predicate")
+    obj = _section_text(parsed, "Object")
+    for section_slug, value in (
+        ("subject", subject),
+        ("predicate", predicate),
+        ("object", obj),
+    ):
+        if not value:
+            missing_sections.append(section_slug)
+
+    if missing_sections:
+        raise InfospaceError(
+            "invalid_relation_artifact",
+            f"Invalid relation artifact: {artifact_id}",
+            {
+                "artifact_id": artifact_id,
+                "path": str(artifact_path),
+                "missing_sections": missing_sections,
+            },
+        )
+
+    subject_slug = slugify(subject)
+    object_slug = slugify(obj)
+    subject_entity_id = ""
+    object_entity_id = ""
+    if entity_ids is not None:
+        # Empty slugs (endpoint text with no ASCII letters or digits) are
+        # checked like any other slug. Skipping them here would let the
+        # lookups below fail with a bare KeyError instead of the documented
+        # InfospaceError.
+        missing_slugs = [
+            slug for slug in (subject_slug, object_slug) if slug not in entity_ids
+        ]
+        if missing_slugs:
+            raise InfospaceError(
+                "unresolved_relation_endpoint",
+                f"Relation endpoint not found for artifact: {artifact_id}",
+                {
+                    "artifact_id": artifact_id,
+                    "path": str(artifact_path),
+                    "missing_slugs": missing_slugs,
+                },
+            )
+        subject_entity_id = entity_ids[subject_slug]
+        object_entity_id = entity_ids[object_slug]
+
+    # A missing H1 was rejected above, so the heading is always available here.
+    title = h1.text
+    return RelationRecord(
+        artifact_id=artifact_id,
+        slug=slugify(title),
+        subject=subject,
+        subject_slug=subject_slug,
+        subject_entity_id=subject_entity_id,
+        predicate=predicate,
+        object=obj,
+        object_slug=object_slug,
+        object_entity_id=object_entity_id,
+        relation_type=_section_text(parsed, "Relation Type"),
+        vsm_channel=_section_text(parsed, "VSM Channel"),
+        evidence=_section_text(parsed, "Evidence"),
+        feedback_role=_section_text(parsed, "Feedback Role"),
+        source_path=str(artifact_path),
+    )
+
+
+def list_entities(root: str | Path) -> list[EntityRecord]:
+    """Parse every ``kind == "entity"`` artifact of the infospace at ``root``.
+
+    Records are returned in the order of ``infospace.artifacts``.
+    """
+    infospace = load_infospace(root)
+    records: list[EntityRecord] = []
+    for entry in infospace.artifacts:
+        if entry.kind != "entity":
+            continue
+        records.append(parse_entity_artifact(entry.id, infospace.root / entry.path))
+    return records
+
+
+def list_relations(root: str | Path) -> list[RelationRecord]:
+    """Parse every ``kind == "relation"`` artifact of the infospace at ``root``.
+
+    The infospace is loaded once. Its entity artifacts are parsed first to
+    build the slug -> artifact id mapping that relation endpoints are resolved
+    against, so entity and relation parsing see the same manifest snapshot.
+
+    Raises:
+        InfospaceError: propagated from entity or relation parsing, including
+            ``unresolved_relation_endpoint`` when an endpoint slug matches no
+            entity.
+    """
+    infospace = load_infospace(root)
+    entity_ids = {
+        entity.slug: entity.artifact_id
+        for entity in (
+            parse_entity_artifact(artifact.id, infospace.root / artifact.path)
+            for artifact in infospace.artifacts
+            if artifact.kind == "entity"
+        )
+    }
+    return [
+        parse_relation_artifact(artifact.id, infospace.root / artifact.path, entity_ids)
+        for artifact in infospace.artifacts
+        if artifact.kind == "relation"
+    ]
+
+
+def slugify(value: str) -> str:
+    """Lower-case ``value`` and join its ASCII alphanumeric runs with hyphens.
+
+    Every run of characters outside ``a-z0-9`` becomes a single ``-``, and
+    leading or trailing hyphens are dropped.
+    """
+    lowered = value.strip().lower()
+    hyphenated = re.sub(r"[^a-z0-9]+", "-", lowered)
+    return hyphenated.strip("-")
+
+
+def _first_heading(parsed: ParsedMarkdownArtifact, *, level: int) -> Any | None:
+    """Return the first entry of ``parsed.headings`` at ``level``, or ``None``."""
+    for candidate in parsed.headings:
+        if candidate.level == level:
+            return candidate
+    return None
+
+
+def _section_text(parsed: ParsedMarkdownArtifact, *headings: str) -> str:
+    """Return the first non-empty section text among ``headings``.
+
+    Headings are tried in the order given; ``""`` is returned when none of
+    them yields text.
+    """
+    candidates = (extract_section_text(parsed, name) for name in headings)
+    return next((body for body in candidates if body), "")
+
+
+def _section_slugs(parsed: ParsedMarkdownArtifact) -> list[str]:
+    """Return the slugs of all level-2 section headings, in ``parsed.sections`` order."""
+    slugs: list[str] = []
+    for entry in parsed.sections:
+        heading = entry.heading
+        if heading.level == 2:
+            slugs.append(slugify(heading.text))
+    return slugs
+
+
+def _is_title_case(value: str) -> bool:
+    """Report whether ``value`` reads as a title-cased heading.
+
+    Every whitespace-separated word must start with an upper-case character
+    once non-word characters are removed. Words that become empty are ignored,
+    and words listed in ``MINOR_TITLE_WORDS`` are exempt unless they come
+    first. A value with no words is not title case.
+    """
+    tokens = value.split()
+    if not tokens:
+        return False
+
+    def _acceptable(position: int, token: str) -> bool:
+        stripped = re.sub(r"[^\w]", "", token)
+        if not stripped:
+            return True
+        if position > 0 and stripped.lower() in MINOR_TITLE_WORDS:
+            return True
+        return stripped[0].isupper()
+
+    return all(_acceptable(position, token) for position, token in enumerate(tokens))
+
+
+def _word_count(value: str) -> int:
+    """Count the whitespace-separated tokens in ``value``."""
+    return sum(1 for _token in value.split())
--- a/tests/test_semantics.py
+++ b/tests/test_semantics.py
@@ -0,0 +1,221 @@
+import json
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+from infospace_bench import InfospaceError, add_artifact, create_infospace
+from infospace_bench.semantics import (
+    list_entities,
+    list_relations,
+    parse_entity_artifact,
+    parse_relation_artifact,
+)
+
+
+# Well-formed entity fixture: an H1 followed by Definition, Source Chapter,
+# Context, Economic Domain, Original Wording and Modern Interpretation sections.
+ENTITY = """# Division of Labour
+
+## Definition
+
+Increasing productivity by splitting work into specialized tasks.
+
+## Source Chapter
+
+Book I, Chapter 1
+
+## Context
+
+Smith introduces the pin factory as an example of this mechanism.
+
+## Economic Domain
+
+Production
+
+## Original Wording
+
+The greatest improvement in the productive powers of labour.
+
+## Modern Interpretation
+
+Specialization improves throughput by reducing switching costs.
+"""
+
+
+# Entity fixture with an H1 and a Context section but no Definition section.
+INVALID_ENTITY = """# Thin Entity
+
+## Context
+
+This artifact is missing its definition.
+"""
+
+
+# Relation fixture whose Subject and Object match the H1s of the entity
+# fixtures used in this module.
+# NOTE(review): the H1 says "enables" while Predicate is "is limited by" and
+# Relation Type is "constrains"; the expected relation slug is derived from the
+# H1 -- confirm the differing wording is intended.
+RELATION = """# Division of Labour enables Market Extent
+
+## Subject
+
+Division of Labour
+
+## Predicate
+
+is limited by
+
+## Object
+
+Market Extent
+
+## Relation Type
+
+constrains
+
+## VSM Channel
+
+S1 -> S4
+
+## Evidence
+
+Book I, Chapter 3 connects specialization to market size.
+
+## Feedback Role
+
+Part of the market expansion loop.
+"""
+
+
+def cli_env() -> dict[str, str]:
+    """Build the environment for the CLI subprocesses started by these tests.
+
+    ``PYTHONPATH`` is set to this checkout's ``src`` directory (resolved from
+    the test file's location, so it does not depend on the current working
+    directory), followed by the ``markitect-tool`` sources and then any
+    ``PYTHONPATH`` the caller already had. The ``markitect-tool`` location can
+    be overridden with the ``MARKITECT_TOOL_SRC`` environment variable; it
+    defaults to the previously hard-coded developer path.
+    """
+    env = os.environ.copy()
+    repo_src = Path(__file__).resolve().parents[1] / "src"
+    markitect_src = env.get("MARKITECT_TOOL_SRC", "/home/worsch/markitect-tool/src")
+    entries = [str(repo_src), markitect_src]
+    inherited = env.get("PYTHONPATH")
+    if inherited:
+        entries.append(inherited)
+    # os.pathsep keeps the value valid on platforms that do not separate with ":".
+    env["PYTHONPATH"] = os.pathsep.join(entries)
+    return env
+
+
+def test_parse_entity_artifact_extracts_legacy_sections(tmp_path: Path) -> None:
+    """A complete entity yields identity, metrics and its sections in document order."""
+    artifact_file = tmp_path / "division.md"
+    artifact_file.write_text(ENTITY, encoding="utf-8")
+    expected_sections = [
+        "definition",
+        "source-chapter",
+        "context",
+        "economic-domain",
+        "original-wording",
+        "modern-interpretation",
+    ]
+
+    record = parse_entity_artifact("entity/division.md", artifact_file)
+
+    assert record.artifact_id == "entity/division.md"
+    assert record.slug == "division-of-labour"
+    assert record.title == "Division of Labour"
+    assert record.definition_word_count == 8
+    assert record.domain == "Production"
+    assert record.has_original_wording is True
+    assert record.section_slugs == expected_sections
+
+
+def test_parse_entity_artifact_reports_missing_required_sections(tmp_path: Path) -> None:
+    """An entity without a Definition section is rejected and the section is named."""
+    artifact_file = tmp_path / "thin.md"
+    artifact_file.write_text(INVALID_ENTITY, encoding="utf-8")
+
+    with pytest.raises(InfospaceError) as excinfo:
+        parse_entity_artifact("entity/thin.md", artifact_file)
+
+    error = excinfo.value
+    assert error.code == "invalid_entity_artifact"
+    assert error.detail["missing_sections"] == ["definition"]
+
+
+def test_parse_relation_artifact_links_entity_endpoints(tmp_path: Path) -> None:
+    """Endpoint slugs are resolved to entity ids through the supplied mapping."""
+    artifact_file = tmp_path / "relation.md"
+    artifact_file.write_text(RELATION, encoding="utf-8")
+    known_entities = {
+        "division-of-labour": "entity/division.md",
+        "market-extent": "entity/market.md",
+    }
+
+    record = parse_relation_artifact(
+        "relation/division-market.md", artifact_file, known_entities
+    )
+
+    assert record.slug == "division-of-labour-enables-market-extent"
+    assert record.subject_slug == "division-of-labour"
+    assert record.object_slug == "market-extent"
+    assert record.subject_entity_id == "entity/division.md"
+    assert record.object_entity_id == "entity/market.md"
+    assert record.is_feedback_member is True
+
+
+def test_parse_relation_artifact_reports_unresolved_endpoint(tmp_path: Path) -> None:
+    """With an empty entity mapping both endpoint slugs are reported as missing."""
+    artifact_file = tmp_path / "relation.md"
+    artifact_file.write_text(RELATION, encoding="utf-8")
+    no_entities: dict[str, str] = {}
+
+    with pytest.raises(InfospaceError) as excinfo:
+        parse_relation_artifact("relation/division-market.md", artifact_file, no_entities)
+
+    error = excinfo.value
+    assert error.code == "unresolved_relation_endpoint"
+    assert error.detail["missing_slugs"] == ["division-of-labour", "market-extent"]
+
+
+def test_list_entities_and_relations_from_manifest(tmp_path: Path) -> None:
+    """Entities and relations registered in a manifest are listed in registration order."""
+    infospace = create_infospace(tmp_path, "pilot", name="Pilot")
+    market_text = ENTITY.replace("Division of Labour", "Market Extent").replace(
+        "Production", "Exchange"
+    )
+    fixtures = (
+        ("division.md", ENTITY, "entity", "Division"),
+        ("market.md", market_text, "entity", "Market"),
+        ("relation.md", RELATION, "relation", "Relation"),
+    )
+    # Write every source file first, then register them in the same order.
+    for filename, text, _kind, _title in fixtures:
+        (tmp_path / filename).write_text(text, encoding="utf-8")
+    for filename, _text, kind, title in fixtures:
+        add_artifact(infospace.root, tmp_path / filename, kind=kind, title=title)
+
+    entity_slugs = [entity.slug for entity in list_entities(infospace.root)]
+    relation_types = [item.relation_type for item in list_relations(infospace.root)]
+
+    assert entity_slugs == ["division-of-labour", "market-extent"]
+    assert relation_types == ["constrains"]
+
+
+def test_cli_entities_and_relations_output_json(tmp_path: Path) -> None:
+    """The ``entities`` and ``relations`` subcommands exit 0 and print JSON records."""
+    infospace = create_infospace(tmp_path, "pilot", name="Pilot")
+    market_text = ENTITY.replace("Division of Labour", "Market Extent").replace(
+        "Production", "Exchange"
+    )
+    fixtures = (
+        ("division.md", ENTITY, "entity", "Division"),
+        ("market.md", market_text, "entity", "Market"),
+        ("relation.md", RELATION, "relation", "Relation"),
+    )
+    # Write every source file first, then register them in the same order.
+    for filename, text, _kind, _title in fixtures:
+        (tmp_path / filename).write_text(text, encoding="utf-8")
+    for filename, _text, kind, title in fixtures:
+        add_artifact(infospace.root, tmp_path / filename, kind=kind, title=title)
+
+    completed = {
+        command: subprocess.run(
+            [sys.executable, "-m", "infospace_bench", command, str(infospace.root)],
+            check=False,
+            env=cli_env(),
+            text=True,
+            capture_output=True,
+        )
+        for command in ("entities", "relations")
+    }
+    entities = completed["entities"]
+    relations = completed["relations"]
+
+    assert entities.returncode == 0, entities.stderr
+    assert relations.returncode == 0, relations.stderr
+    first_entity = json.loads(entities.stdout)["entities"][0]
+    first_relation = json.loads(relations.stdout)["relations"][0]
+    assert first_entity["slug"] == "division-of-labour"
+    assert first_relation["subject_entity_id"] == "entity/division.md"
--- a/workplans/IB-WP-0007-entity-relation-model-migration.md
+++ b/workplans/IB-WP-0007-entity-relation-model-migration.md
@@ -4,7 +4,7 @@ type: workplan
 title: "Entity And Relation Model Migration"
 domain: markitect
 repo: infospace-bench
-status: planned
+status: completed
 owner: markitect
 topic_slug: markitect
 created: "2026-05-14"
@@ -26,7 +26,7 @@ application-level models on top of `markitect-tool` parsing.

 ```task
 id: IB-WP-0007-T01
-status: todo
+status: done
 priority: high
 state_hub_task_id: "d6c401be-ada6-4684-9186-8ae35101bfa8"
 ```
@@ -39,7 +39,7 @@ state_hub_task_id: "d6c401be-ada6-4684-9186-8ae35101bfa8"

 ```task
 id: IB-WP-0007-T02
-status: todo
+status: done
 priority: high
 state_hub_task_id: "25e42321-33fe-4b84-8e95-c5308d91ad3b"
 ```
@@ -52,7 +52,7 @@ state_hub_task_id: "25e42321-33fe-4b84-8e95-c5308d91ad3b"

 ```task
 id: IB-WP-0007-T03
-status: todo
+status: done
 priority: high
 state_hub_task_id: "845a8ea0-50d8-4dd3-8cc3-23717195ae6f"
 ```
@@ -66,7 +66,7 @@ state_hub_task_id: "845a8ea0-50d8-4dd3-8cc3-23717195ae6f"

 ```task
 id: IB-WP-0007-T04
-status: todo
+status: done
 priority: medium
 state_hub_task_id: "155028a2-4df7-4144-9193-74e95f6e51b1"
 ```