From 9627d03c1a8f9dce9d1b68f64a70dd331d86318d Mon Sep 17 00:00:00 2001 From: tegwick Date: Thu, 14 May 2026 15:06:17 +0200 Subject: [PATCH] entity relationship model --- README.md | 1 + docs/entity-relation-model.md | 54 ++++ src/infospace_bench/__init__.py | 5 + src/infospace_bench/cli.py | 24 ++ src/infospace_bench/lifecycle.py | 9 +- src/infospace_bench/semantics.py | 281 ++++++++++++++++++ tests/test_semantics.py | 221 ++++++++++++++ ...WP-0007-entity-relation-model-migration.md | 10 +- 8 files changed, 599 insertions(+), 6 deletions(-) create mode 100644 docs/entity-relation-model.md create mode 100644 src/infospace_bench/semantics.py create mode 100644 tests/test_semantics.py diff --git a/README.md b/README.md index 44e5503..f9bd9ba 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Start with: - `docs/reference-pilot-decision.md` - `docs/markitect-main-scope-assessment.md` - `docs/markitect-tool-adapter.md` +- `docs/entity-relation-model.md` - `docs/orthogonal-successor-roadmap.md` - `docs/legacy-infospace-feature-inventory.md` - `docs/successor-boundary-interface-map.md` diff --git a/docs/entity-relation-model.md b/docs/entity-relation-model.md new file mode 100644 index 0000000..cbabf10 --- /dev/null +++ b/docs/entity-relation-model.md @@ -0,0 +1,54 @@ +# Entity And Relation Model + +`infospace-bench` owns the application-level semantic model for infospace +entities and relation triplets. `markitect-tool` remains the Markdown structure +provider and is accessed only through `infospace_bench.markdown_adapter`. + +## Entity Artifacts + +Entity artifacts are registered in `artifacts/index.yaml` with `kind: entity` +and are stored under `artifacts/entities/`. 
+ +The parser extracts: + +- `artifact_id`: manifest identity such as `entity/division.md` +- `slug`, `title`, and `h1_raw`: identity derived from the document H1 +- `definition`, `source_chapter`, `context`, `domain`, `original_wording`, + and `modern_interpretation`: legacy-style sections where present +- `h1_is_title_case`, `has_original_wording`, `definition_word_count`, + `total_word_count`, and ordered `section_slugs`: compatibility metrics used by + evaluation and inspection flows +- `source_path`: path to the concrete artifact file + +A document H1 and `## Definition` are required. Missing required parts raise +`invalid_entity_artifact` with a `missing_sections` detail list (`h1`, `definition`). + +## Relation Artifacts + +Relation artifacts are registered with `kind: relation` and are stored under +`artifacts/relations/`. + +The parser extracts: + +- `artifact_id` and `slug`: manifest identity plus a relation slug derived from + the H1 +- `subject`, `predicate`, `object`: the relation triplet +- `subject_slug`, `object_slug`, `subject_entity_id`, and `object_entity_id`: + endpoint links back to parsed entity artifacts +- `relation_type`, `vsm_channel`, `evidence`, and `feedback_role`: semantic and + evaluation metadata +- `is_feedback_member`: derived from whether `feedback_role` is present + +A document H1, `## Subject`, `## Predicate`, and `## Object` are required (otherwise `invalid_relation_artifact` is raised). When relation +listing is performed from an infospace manifest, subject and object slugs must +resolve to entity artifacts or `unresolved_relation_endpoint` is raised. + +## CLI + +```bash +python3 -m infospace_bench entities infospaces/bootstrap-pilot +python3 -m infospace_bench relations infospaces/bootstrap-pilot +``` + +These commands emit JSON records for downstream evaluation, graphing, and +inspection workflows. 
diff --git a/src/infospace_bench/__init__.py b/src/infospace_bench/__init__.py index 67ae208..9a8ffbf 100644 --- a/src/infospace_bench/__init__.py +++ b/src/infospace_bench/__init__.py @@ -9,6 +9,7 @@ from .models import ( TopicConfig, ViabilityThreshold, ) +from .semantics import EntityRecord, RelationRecord, list_entities, list_relations __all__ = [ "DisciplineBinding", @@ -19,10 +20,14 @@ __all__ = [ "InfospaceError", "KnowledgeArtifact", "MetricValue", + "EntityRecord", + "RelationRecord", "ScoreEntry", "TopicConfig", "ViabilityThreshold", "add_artifact", "create_infospace", + "list_entities", + "list_relations", "load_infospace", ] diff --git a/src/infospace_bench/cli.py b/src/infospace_bench/cli.py index d6377eb..7f47df8 100644 --- a/src/infospace_bench/cli.py +++ b/src/infospace_bench/cli.py @@ -8,6 +8,7 @@ from pathlib import Path from .errors import InfospaceError from .lifecycle import add_artifact, create_infospace, load_infospace from .markdown_adapter import validate_infospace_artifacts +from .semantics import list_entities, list_relations def build_parser() -> argparse.ArgumentParser: @@ -35,6 +36,12 @@ def build_parser() -> argparse.ArgumentParser: validate = sub.add_parser("validate", help="Validate infospace artifacts") validate.add_argument("root") + entities = sub.add_parser("entities", help="List parsed entity artifacts") + entities.add_argument("root") + + relations = sub.add_parser("relations", help="List parsed relation artifacts") + relations.add_argument("root") + return parser @@ -72,6 +79,23 @@ def main(argv: list[str] | None = None) -> int: } ) return 0 if valid else 1 + elif args.command == "entities": + _write_json( + { + "entities": [ + entity.to_dict() for entity in list_entities(Path(args.root)) + ] + } + ) + elif args.command == "relations": + _write_json( + { + "relations": [ + relation.to_dict() + for relation in list_relations(Path(args.root)) + ] + } + ) else: parser.error(f"Unhandled command: {args.command}") except 
InfospaceError as exc: diff --git a/src/infospace_bench/lifecycle.py b/src/infospace_bench/lifecycle.py index 733c6e2..ac67cb1 100644 --- a/src/infospace_bench/lifecycle.py +++ b/src/infospace_bench/lifecycle.py @@ -15,13 +15,20 @@ CONFIG_FILE = "infospace.yaml" ARTIFACT_INDEX = "artifacts/index.yaml" LAYOUT_DIRS = ( "artifacts/sources", + "artifacts/entities", + "artifacts/relations", "artifacts/generated", "output/evaluations", "output/metrics", "reports", "exports", ) -KIND_DIRS = {"source": "sources", "generated": "generated"} +KIND_DIRS = { + "source": "sources", + "entity": "entities", + "relation": "relations", + "generated": "generated", +} def create_infospace( diff --git a/src/infospace_bench/semantics.py b/src/infospace_bench/semantics.py new file mode 100644 index 0000000..82cd1e4 --- /dev/null +++ b/src/infospace_bench/semantics.py @@ -0,0 +1,281 @@ +from __future__ import annotations + +import re +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Any + +from .errors import InfospaceError +from .lifecycle import load_infospace +from .markdown_adapter import ( + ParsedMarkdownArtifact, + extract_section_text, + parse_markdown_artifact, +) + +MINOR_TITLE_WORDS = { + "a", + "an", + "and", + "as", + "at", + "but", + "by", + "for", + "if", + "in", + "is", + "nor", + "of", + "on", + "or", + "so", + "the", + "to", + "up", + "yet", +} + + +@dataclass(frozen=True) +class EntityRecord: + artifact_id: str + slug: str + title: str + h1_raw: str + definition: str = "" + source_chapter: str = "" + context: str = "" + domain: str = "" + original_wording: str = "" + modern_interpretation: str = "" + h1_is_title_case: bool = False + has_original_wording: bool = False + definition_word_count: int = 0 + total_word_count: int = 0 + section_slugs: list[str] = field(default_factory=list) + source_path: str = "" + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + +@dataclass(frozen=True) +class RelationRecord: + 
artifact_id: str + slug: str + subject: str + subject_slug: str + subject_entity_id: str + predicate: str + object: str + object_slug: str + object_entity_id: str + relation_type: str = "" + vsm_channel: str = "" + evidence: str = "" + feedback_role: str = "" + source_path: str = "" + + @property + def is_feedback_member(self) -> bool: + return bool(self.feedback_role.strip()) + + def to_dict(self) -> dict[str, Any]: + data = asdict(self) + data["is_feedback_member"] = self.is_feedback_member + return data + + +def parse_entity_artifact(artifact_id: str, path: str | Path) -> EntityRecord: + artifact_path = Path(path) + parsed = parse_markdown_artifact(artifact_path) + h1 = _first_heading(parsed, level=1) + missing_sections: list[str] = [] + if h1 is None: + missing_sections.append("h1") + + definition = _section_text(parsed, "Definition") + if not definition: + missing_sections.append("definition") + + if missing_sections: + raise InfospaceError( + "invalid_entity_artifact", + f"Invalid entity artifact: {artifact_id}", + { + "artifact_id": artifact_id, + "path": str(artifact_path), + "missing_sections": missing_sections, + }, + ) + + title = h1.text if h1 is not None else "" + original_wording = _section_text( + parsed, + "Original Wording", + "Smith's Original Wording", + ) + return EntityRecord( + artifact_id=artifact_id, + slug=slugify(title), + title=title, + h1_raw=title, + definition=definition, + source_chapter=_section_text(parsed, "Source Chapter"), + context=_section_text(parsed, "Context"), + domain=_section_text( + parsed, + "Economic Domain", + "Supply Chain Domain", + "Knowledge Domain", + "Domain", + ), + original_wording=original_wording, + modern_interpretation=_section_text(parsed, "Modern Interpretation"), + h1_is_title_case=_is_title_case(title), + has_original_wording=bool(original_wording), + definition_word_count=_word_count(definition), + total_word_count=_word_count(artifact_path.read_text(encoding="utf-8")), + 
section_slugs=_section_slugs(parsed), + source_path=str(artifact_path), + ) + + +def parse_relation_artifact( + artifact_id: str, + path: str | Path, + entity_ids: dict[str, str] | None = None, +) -> RelationRecord: + artifact_path = Path(path) + parsed = parse_markdown_artifact(artifact_path) + h1 = _first_heading(parsed, level=1) + missing_sections: list[str] = [] + if h1 is None: + missing_sections.append("h1") + + subject = _section_text(parsed, "Subject") + predicate = _section_text(parsed, "Predicate") + obj = _section_text(parsed, "Object") + for section_slug, value in ( + ("subject", subject), + ("predicate", predicate), + ("object", obj), + ): + if not value: + missing_sections.append(section_slug) + + if missing_sections: + raise InfospaceError( + "invalid_relation_artifact", + f"Invalid relation artifact: {artifact_id}", + { + "artifact_id": artifact_id, + "path": str(artifact_path), + "missing_sections": missing_sections, + }, + ) + + subject_slug = slugify(subject) + object_slug = slugify(obj) + subject_entity_id = "" + object_entity_id = "" + if entity_ids is not None: + missing_slugs = [ + slug + for slug in (subject_slug, object_slug) + if slug and slug not in entity_ids + ] + if missing_slugs: + raise InfospaceError( + "unresolved_relation_endpoint", + f"Relation endpoint not found for artifact: {artifact_id}", + { + "artifact_id": artifact_id, + "path": str(artifact_path), + "missing_slugs": missing_slugs, + }, + ) + subject_entity_id = entity_ids[subject_slug] + object_entity_id = entity_ids[object_slug] + + title = h1.text if h1 is not None else artifact_path.stem + return RelationRecord( + artifact_id=artifact_id, + slug=slugify(title), + subject=subject, + subject_slug=subject_slug, + subject_entity_id=subject_entity_id, + predicate=predicate, + object=obj, + object_slug=object_slug, + object_entity_id=object_entity_id, + relation_type=_section_text(parsed, "Relation Type"), + vsm_channel=_section_text(parsed, "VSM Channel"), + 
evidence=_section_text(parsed, "Evidence"), + feedback_role=_section_text(parsed, "Feedback Role"), + source_path=str(artifact_path), + ) + + +def list_entities(root: str | Path) -> list[EntityRecord]: + infospace = load_infospace(root) + return [ + parse_entity_artifact(artifact.id, infospace.root / artifact.path) + for artifact in infospace.artifacts + if artifact.kind == "entity" + ] + + +def list_relations(root: str | Path) -> list[RelationRecord]: + infospace = load_infospace(root) + entity_ids = {entity.slug: entity.artifact_id for entity in list_entities(root)} + return [ + parse_relation_artifact(artifact.id, infospace.root / artifact.path, entity_ids) + for artifact in infospace.artifacts + if artifact.kind == "relation" + ] + + +def slugify(value: str) -> str: + slug = re.sub(r"[^a-z0-9]+", "-", value.strip().lower()) + return slug.strip("-") + + +def _first_heading(parsed: ParsedMarkdownArtifact, *, level: int) -> Any | None: + return next((heading for heading in parsed.headings if heading.level == level), None) + + +def _section_text(parsed: ParsedMarkdownArtifact, *headings: str) -> str: + for heading in headings: + text = extract_section_text(parsed, heading) + if text: + return text + return "" + + +def _section_slugs(parsed: ParsedMarkdownArtifact) -> list[str]: + return [ + slugify(section.heading.text) + for section in parsed.sections + if section.heading.level == 2 + ] + + +def _is_title_case(value: str) -> bool: + words = value.split() + if not words: + return False + for index, word in enumerate(words): + clean = re.sub(r"[^\w]", "", word) + if not clean: + continue + if index > 0 and clean.lower() in MINOR_TITLE_WORDS: + continue + if not clean[0].isupper(): + return False + return True + + +def _word_count(value: str) -> int: + return len(value.split()) diff --git a/tests/test_semantics.py b/tests/test_semantics.py new file mode 100644 index 0000000..ab7d5ba --- /dev/null +++ b/tests/test_semantics.py @@ -0,0 +1,221 @@ +import json +import os 
+import subprocess +import sys +from pathlib import Path + +import pytest + +from infospace_bench import InfospaceError, add_artifact, create_infospace +from infospace_bench.semantics import ( + list_entities, + list_relations, + parse_entity_artifact, + parse_relation_artifact, +) + + +ENTITY = """# Division of Labour + +## Definition + +Increasing productivity by splitting work into specialized tasks. + +## Source Chapter + +Book I, Chapter 1 + +## Context + +Smith introduces the pin factory as an example of this mechanism. + +## Economic Domain + +Production + +## Original Wording + +The greatest improvement in the productive powers of labour. + +## Modern Interpretation + +Specialization improves throughput by reducing switching costs. +""" + + +INVALID_ENTITY = """# Thin Entity + +## Context + +This artifact is missing its definition. +""" + + +RELATION = """# Division of Labour enables Market Extent + +## Subject + +Division of Labour + +## Predicate + +is limited by + +## Object + +Market Extent + +## Relation Type + +constrains + +## VSM Channel + +S1 -> S4 + +## Evidence + +Book I, Chapter 3 connects specialization to market size. + +## Feedback Role + +Part of the market expansion loop. 
+""" + + +def cli_env() -> dict[str, str]: + env = os.environ.copy() + env["PYTHONPATH"] = "src:/home/worsch/markitect-tool/src" + return env + + +def test_parse_entity_artifact_extracts_legacy_sections(tmp_path: Path) -> None: + path = tmp_path / "division.md" + path.write_text(ENTITY, encoding="utf-8") + + entity = parse_entity_artifact("entity/division.md", path) + + assert entity.artifact_id == "entity/division.md" + assert entity.slug == "division-of-labour" + assert entity.title == "Division of Labour" + assert entity.definition_word_count == 8 + assert entity.domain == "Production" + assert entity.has_original_wording is True + assert entity.section_slugs == [ + "definition", + "source-chapter", + "context", + "economic-domain", + "original-wording", + "modern-interpretation", + ] + + +def test_parse_entity_artifact_reports_missing_required_sections(tmp_path: Path) -> None: + path = tmp_path / "thin.md" + path.write_text(INVALID_ENTITY, encoding="utf-8") + + with pytest.raises(InfospaceError) as raised: + parse_entity_artifact("entity/thin.md", path) + + assert raised.value.code == "invalid_entity_artifact" + assert raised.value.detail["missing_sections"] == ["definition"] + + +def test_parse_relation_artifact_links_entity_endpoints(tmp_path: Path) -> None: + path = tmp_path / "relation.md" + path.write_text(RELATION, encoding="utf-8") + entity_ids = { + "division-of-labour": "entity/division.md", + "market-extent": "entity/market.md", + } + + relation = parse_relation_artifact("relation/division-market.md", path, entity_ids) + + assert relation.slug == "division-of-labour-enables-market-extent" + assert relation.subject_slug == "division-of-labour" + assert relation.object_slug == "market-extent" + assert relation.subject_entity_id == "entity/division.md" + assert relation.object_entity_id == "entity/market.md" + assert relation.is_feedback_member is True + + +def test_parse_relation_artifact_reports_unresolved_endpoint(tmp_path: Path) -> None: + path = 
tmp_path / "relation.md" + path.write_text(RELATION, encoding="utf-8") + + with pytest.raises(InfospaceError) as raised: + parse_relation_artifact("relation/division-market.md", path, {}) + + assert raised.value.code == "unresolved_relation_endpoint" + assert raised.value.detail["missing_slugs"] == [ + "division-of-labour", + "market-extent", + ] + + +def test_list_entities_and_relations_from_manifest(tmp_path: Path) -> None: + infospace = create_infospace(tmp_path, "pilot", name="Pilot") + division = tmp_path / "division.md" + division.write_text(ENTITY, encoding="utf-8") + market = tmp_path / "market.md" + market.write_text( + ENTITY.replace("Division of Labour", "Market Extent").replace( + "Production", "Exchange" + ), + encoding="utf-8", + ) + relation = tmp_path / "relation.md" + relation.write_text(RELATION, encoding="utf-8") + + add_artifact(infospace.root, division, kind="entity", title="Division") + add_artifact(infospace.root, market, kind="entity", title="Market") + add_artifact(infospace.root, relation, kind="relation", title="Relation") + + assert [entity.slug for entity in list_entities(infospace.root)] == [ + "division-of-labour", + "market-extent", + ] + assert [item.relation_type for item in list_relations(infospace.root)] == [ + "constrains" + ] + + +def test_cli_entities_and_relations_output_json(tmp_path: Path) -> None: + infospace = create_infospace(tmp_path, "pilot", name="Pilot") + division = tmp_path / "division.md" + division.write_text(ENTITY, encoding="utf-8") + market = tmp_path / "market.md" + market.write_text( + ENTITY.replace("Division of Labour", "Market Extent").replace( + "Production", "Exchange" + ), + encoding="utf-8", + ) + relation = tmp_path / "relation.md" + relation.write_text(RELATION, encoding="utf-8") + add_artifact(infospace.root, division, kind="entity", title="Division") + add_artifact(infospace.root, market, kind="entity", title="Market") + add_artifact(infospace.root, relation, kind="relation", title="Relation") + + 
entities = subprocess.run( + [sys.executable, "-m", "infospace_bench", "entities", str(infospace.root)], + check=False, + env=cli_env(), + text=True, + capture_output=True, + ) + relations = subprocess.run( + [sys.executable, "-m", "infospace_bench", "relations", str(infospace.root)], + check=False, + env=cli_env(), + text=True, + capture_output=True, + ) + + assert entities.returncode == 0, entities.stderr + assert relations.returncode == 0, relations.stderr + assert json.loads(entities.stdout)["entities"][0]["slug"] == "division-of-labour" + assert json.loads(relations.stdout)["relations"][0]["subject_entity_id"] == ( + "entity/division.md" + ) diff --git a/workplans/IB-WP-0007-entity-relation-model-migration.md b/workplans/IB-WP-0007-entity-relation-model-migration.md index 3c56b8a..e7c7396 100644 --- a/workplans/IB-WP-0007-entity-relation-model-migration.md +++ b/workplans/IB-WP-0007-entity-relation-model-migration.md @@ -4,7 +4,7 @@ type: workplan title: "Entity And Relation Model Migration" domain: markitect repo: infospace-bench -status: planned +status: completed owner: markitect topic_slug: markitect created: "2026-05-14" @@ -26,7 +26,7 @@ application-level models on top of `markitect-tool` parsing. ```task id: IB-WP-0007-T01 -status: todo +status: done priority: high state_hub_task_id: "d6c401be-ada6-4684-9186-8ae35101bfa8" ``` @@ -39,7 +39,7 @@ state_hub_task_id: "d6c401be-ada6-4684-9186-8ae35101bfa8" ```task id: IB-WP-0007-T02 -status: todo +status: done priority: high state_hub_task_id: "25e42321-33fe-4b84-8e95-c5308d91ad3b" ``` @@ -52,7 +52,7 @@ state_hub_task_id: "25e42321-33fe-4b84-8e95-c5308d91ad3b" ```task id: IB-WP-0007-T03 -status: todo +status: done priority: high state_hub_task_id: "845a8ea0-50d8-4dd3-8cc3-23717195ae6f" ``` @@ -66,7 +66,7 @@ state_hub_task_id: "845a8ea0-50d8-4dd3-8cc3-23717195ae6f" ```task id: IB-WP-0007-T04 -status: todo +status: done priority: medium state_hub_task_id: "155028a2-4df7-4144-9193-74e95f6e51b1" ```