feat(infospace): add entity metadata parser (S1.1)

Extract section-tree algorithm from SchemaGenerator into standalone core/section_tree.py and build markitect/infospace/ package with EntityMeta dataclass and parse_entity_file/parse_entity_directory. Foundation for schema compliance, coverage, and granularity metrics.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 00:27:45 +01:00
parent b5e994b014
commit 03c6c5e8de
9 changed files with 739 additions and 0 deletions
--- a/tests/unit/core/__init__.py
+++ b/tests/unit/core/__init__.py
--- a/tests/unit/core/test_section_tree.py
+++ b/tests/unit/core/test_section_tree.py
@@ -0,0 +1,137 @@
+"""Tests for markitect.core.section_tree."""
+
+from markitect.core.parser import parse_markdown_to_ast
+from markitect.core.section_tree import (
+    build_section_tree,
+    extract_heading_content,
+    extract_heading_level,
+    extract_section_text,
+    slugify,
+)
+
+
+class TestSlugify:  # pins the slug format produced by slugify()
+    def test_simple_text(self):  # words come back lowercased and joined with "_"
+        assert slugify("Hello World") == "hello_world"
+
+    def test_german_umlauts(self):  # "Ä" -> "ae", "Ü" -> "ue": transliterated, not dropped
+        assert slugify("Ärger mit Über") == "aerger_mit_ueber"
+
+    def test_special_characters(self):  # the apostrophe becomes a "_" separator
+        assert slugify("Smith's Original Wording") == "smith_s_original_wording"
+
+    def test_empty_string(self):  # empty input yields the placeholder slug "feld"
+        assert slugify("") == "feld"
+
+    def test_trailing_underscores_stripped(self):  # no separator is left at either end
+        assert slugify("--hello--") == "hello"
+
+    def test_multiple_spaces(self):  # a run of spaces yields a single "_"
+        assert slugify("a   b") == "a_b"
+
+
+class TestExtractHeadingLevel:  # extract_heading_level(tag) maps an "hN" tag string to the int N
+    def test_h1(self):  # lowest heading level
+        assert extract_heading_level("h1") == 1
+
+    def test_h6(self):  # highest heading level exercised here
+        assert extract_heading_level("h6") == 6
+
+    def test_invalid_tag(self):  # a non-heading tag yields 1 rather than raising
+        assert extract_heading_level("p") == 1
+
+    def test_empty(self):  # an empty tag string also yields 1
+        assert extract_heading_level("") == 1
+
+
+class TestExtractHeadingContent:  # extract_heading_content(tokens, index) on hand-built token dicts
+    def test_finds_inline_token(self):  # the "content" of the inline token is returned
+        tokens = [
+            {"type": "heading_open", "tag": "h1"},
+            {"type": "inline", "content": "Hello"},
+            {"type": "heading_close", "tag": "h1"},
+        ]
+        assert extract_heading_content(tokens, 0) == "Hello"  # tokens[0] is the heading_open
+
+    def test_no_inline(self):  # heading_open directly followed by heading_close
+        tokens = [
+            {"type": "heading_open", "tag": "h1"},
+            {"type": "heading_close", "tag": "h1"},
+        ]
+        assert extract_heading_content(tokens, 0) == ""  # empty string, no exception
+
+
+class TestBuildSectionTree:  # build_section_tree() on tokens from parse_markdown_to_ast()
+    def test_single_heading(self):  # one H1 becomes the only child of the root node
+        md = "# Title\n\nSome text."
+        tokens = parse_markdown_to_ast(md)
+        tree = build_section_tree(tokens)
+
+        assert tree["level"] == 0  # the root node itself sits at level 0
+        assert len(tree["children"]) == 1
+        assert tree["children"][0]["heading"] == "Title"
+        assert tree["children"][0]["level"] == 1
+
+    def test_nested_headings(self):  # both H2s nest under the preceding H1, in document order
+        md = "# Top\n\n## Sub\n\ntext\n\n## Sub2\n\nmore"
+        tokens = parse_markdown_to_ast(md)
+        tree = build_section_tree(tokens)
+
+        top = tree["children"][0]
+        assert top["heading"] == "Top"
+        assert len(top["children"]) == 2
+        assert top["children"][0]["heading"] == "Sub"
+        assert top["children"][1]["heading"] == "Sub2"
+
+    def test_max_depth(self):  # max_depth=2 keeps H1/H2 nodes only
+        md = "# Top\n\n## Sub\n\n### Deep\n\ntext"
+        tokens = parse_markdown_to_ast(md)
+        tree = build_section_tree(tokens, max_depth=2)
+
+        top = tree["children"][0]
+        sub = top["children"][0]
+        # H3 should be excluded from tree
+        assert len(sub["children"]) == 0
+
+    def test_content_tokens_captured(self):  # body tokens are stored under "content_tokens"
+        md = "# Title\n\nParagraph text here."
+        tokens = parse_markdown_to_ast(md)
+        tree = build_section_tree(tokens)
+
+        section = tree["children"][0]
+        inline_tokens = [t for t in section["content_tokens"] if t.get("type") == "inline"]
+        assert len(inline_tokens) == 1  # exactly one inline token for the single paragraph
+        assert "Paragraph text here" in inline_tokens[0]["content"]
+
+    def test_slug_assigned(self):  # each section node carries a "slug" for its heading
+        md = "# Economic Domain\n\ntext"
+        tokens = parse_markdown_to_ast(md)
+        tree = build_section_tree(tokens)
+
+        assert tree["children"][0]["slug"] == "economic_domain"
+
+    def test_empty_document(self):  # empty Markdown still yields a root, with no children
+        tokens = parse_markdown_to_ast("")
+        tree = build_section_tree(tokens)
+        assert tree["children"] == []
+
+
+class TestExtractSectionText:  # extract_section_text(section) returns a section's body as a string
+    def test_simple_paragraph(self):  # one paragraph comes back verbatim; the heading is not included
+        md = "# Title\n\nHello world."
+        tokens = parse_markdown_to_ast(md)
+        tree = build_section_tree(tokens)
+        text = extract_section_text(tree["children"][0])
+        assert text == "Hello world."
+
+    def test_multiple_paragraphs(self):  # both paragraphs appear; the joiner is left unspecified
+        md = "# Title\n\nFirst paragraph.\n\nSecond paragraph."
+        tokens = parse_markdown_to_ast(md)
+        tree = build_section_tree(tokens)
+        text = extract_section_text(tree["children"][0])
+        assert "First paragraph." in text
+        assert "Second paragraph." in text
+
+    def test_empty_section(self):  # a hand-built section with no content tokens gives ""
+        section = {"content_tokens": []}
+        assert extract_section_text(section) == ""
--- a/tests/unit/infospace/__init__.py
+++ b/tests/unit/infospace/__init__.py
--- a/tests/unit/infospace/test_entity_parser.py
+++ b/tests/unit/infospace/test_entity_parser.py
@@ -0,0 +1,230 @@
+"""Tests for markitect.infospace.entity_parser and EntityMeta."""
+
+import logging
+from pathlib import Path
+
+import pytest
+
+from markitect.infospace import EntityMeta, parse_entity_file, parse_entity_directory
+
+
+# ── Fixtures ────────────────────────────────────────────────────────
+
+COMPLETE_ENTITY = """\
+# Division of Labour
+
+## Definition
+
+The separation of a work process into a number of distinct tasks, each performed
+by a specialised worker, resulting in a significant increase in the productive
+powers of labour.
+
+## Source Chapter
+
+Book I, Chapter 1: "Of the Division of Labour"
+
+## Context
+
+The division of labour is the central argument of the chapter.
+
+## Economic Domain
+
+Production
+
+## Smith's Original Wording
+
+"The greatest improvements in the productive powers of labour…"
+
+## Modern Interpretation
+
+The division of labour remains a foundational concept in economics.
+"""  # fixture: title-case H1 plus all six H2 sections; contains a non-ASCII "…"
+
+MINIMAL_ENTITY = """\
+# Minimal Entity
+
+## Definition
+
+A brief definition.
+
+## Source Chapter
+
+Book I, Chapter 1
+
+## Context
+
+Some context.
+
+## Economic Domain
+
+Exchange
+"""  # fixture: H1 plus four H2 sections; no "Smith's Original Wording" / "Modern Interpretation"
+
+SLUG_H1_ENTITY = """\
+# effectual-demand
+
+## Definition
+
+Effectual demand is the demand by consumers who are willing and able to pay.
+
+## Source Chapter
+
+Book 1, Chapter 7
+
+## Context
+
+Context for effectual demand.
+
+## Economic Domain
+
+Exchange
+
+## Smith's Original Wording
+
+"Such people may be called the effectual demanders…"
+
+## Modern Interpretation
+
+Represents the intersection of desire and purchasing power.
+"""  # fixture: all six H2 sections, but the H1 is a hyphenated lowercase slug, not a title
+
+NO_H1 = """\
+## Only H2
+
+Some content.
+"""  # fixture: malformed entity, starts at H2 with no H1 anywhere
+
+
+# ── parse_entity_file ────────────────────────────────────────────────
+
+class TestParseEntityFile:  # parse_entity_file(path) -> EntityMeta; one Markdown file per test
+    def test_complete_entity(self, tmp_path):  # every section present: all fields populated
+        f = tmp_path / "division-of-labour.md"
+        f.write_text(COMPLETE_ENTITY, encoding="utf-8")  # fixture has "…"; never use locale encoding
+        meta = parse_entity_file(f)
+
+        assert meta.slug == "division_of_labour"
+        assert meta.title == "Division of Labour"
+        assert meta.h1_is_title_case is True
+        assert meta.has_original_wording is True
+        assert meta.domain == "Production"
+        assert meta.definition_word_count > 20
+        assert "separation" in meta.definition.lower()
+        assert meta.source_path == str(f)  # the path is recorded as a plain string
+        assert "definition" in meta.section_slugs
+        assert "smith_s_original_wording" in meta.section_slugs
+
+    def test_minimal_entity(self, tmp_path):  # optional sections absent: flag False, text ""
+        f = tmp_path / "minimal-entity.md"
+        f.write_text(MINIMAL_ENTITY, encoding="utf-8")
+        meta = parse_entity_file(f)
+
+        assert meta.slug == "minimal_entity"
+        assert meta.has_original_wording is False
+        assert meta.original_wording == ""
+        assert meta.modern_interpretation == ""
+        assert meta.domain == "Exchange"
+
+    def test_slug_format_h1(self, tmp_path):  # slug-style H1 is kept raw and flagged non-title-case
+        f = tmp_path / "effectual-demand.md"
+        f.write_text(SLUG_H1_ENTITY, encoding="utf-8")  # fixture has "…"; never use locale encoding
+        meta = parse_entity_file(f)
+
+        assert meta.h1_raw == "effectual-demand"
+        assert meta.h1_is_title_case is False
+        assert meta.slug == "effectual_demand"
+        assert meta.has_original_wording is True
+
+    def test_missing_h1_raises(self, tmp_path):  # no H1: ValueError whose message matches "No H1"
+        f = tmp_path / "no-h1.md"
+        f.write_text(NO_H1, encoding="utf-8")
+        with pytest.raises(ValueError, match="No H1"):
+            parse_entity_file(f)
+
+    def test_missing_sections_return_empty(self, tmp_path):  # absent sections are "", not None
+        f = tmp_path / "minimal.md"
+        f.write_text(MINIMAL_ENTITY, encoding="utf-8")
+        meta = parse_entity_file(f)
+
+        # Optional sections not present → empty string
+        assert meta.original_wording == ""
+        assert meta.modern_interpretation == ""
+
+    def test_word_count_accuracy(self, tmp_path):  # five-word definition must count as exactly 5
+        f = tmp_path / "test.md"
+        f.write_text("# Test\n\n## Definition\n\none two three four five\n", encoding="utf-8")
+        meta = parse_entity_file(f)
+        assert meta.definition_word_count == 5
+
+
+# ── parse_entity_directory ──────────────────────────────────────────
+
+class TestParseEntityDirectory:  # parse_entity_directory(dir, exclude_patterns=...) -> list of EntityMeta
+    def _make_dir(self, tmp_path):  # returns the directory holding 2 entity files + 2 non-entity files
+        """Create a temporary entity directory."""
+        d = tmp_path / "entities"
+        d.mkdir()
+        (d / "entity-a.md").write_text(COMPLETE_ENTITY, encoding="utf-8")  # fixture contains "…"
+        (d / "entity-b.md").write_text(MINIMAL_ENTITY, encoding="utf-8")
+        # files that should be excluded by default
+        (d / "book-1-chapter-01-entities.md").write_text("# View\n\nview file", encoding="utf-8")
+        (d / "book-1-chapter-01-prompt.md").write_text("# Prompt\n\nprompt file", encoding="utf-8")
+        return d
+
+    def test_excludes_view_and_prompt(self, tmp_path):  # default patterns drop the two book-* files
+        d = self._make_dir(tmp_path)
+        results = parse_entity_directory(d)
+        slugs = {e.slug for e in results}
+
+        assert "division_of_labour" in slugs
+        assert "minimal_entity" in slugs
+        # Excluded files should not be parsed as entities
+        assert len(results) == 2
+
+    def test_custom_exclude_patterns(self, tmp_path):  # explicit patterns replace the defaults
+        d = self._make_dir(tmp_path)
+        # Only exclude prompt files, allow entity views
+        results = parse_entity_directory(d, exclude_patterns=[r".*-prompt\.md$"])
+        assert len(results) == 3  # entity-a, entity-b, chapter-01-entities
+
+    def test_malformed_skipped_with_warning(self, tmp_path, caplog):  # bad file: skipped and logged
+        d = tmp_path / "entities"
+        d.mkdir()
+        (d / "good.md").write_text(COMPLETE_ENTITY, encoding="utf-8")  # fixture contains "…"
+        (d / "bad.md").write_text(NO_H1, encoding="utf-8")
+
+        with caplog.at_level(logging.WARNING):
+            results = parse_entity_directory(d)
+
+        assert len(results) == 1  # only good.md survives; no exception propagates
+        assert "bad.md" in caplog.text  # the warning names the offending file
+
+
+# ── EntityMeta round-trip ───────────────────────────────────────────
+
+class TestEntityMetaRoundTrip:  # EntityMeta.to_dict() / EntityMeta.from_dict() serialisation
+    def test_to_dict_from_dict(self, tmp_path):  # the compared fields survive a dict round-trip
+        f = tmp_path / "entity.md"
+        f.write_text(COMPLETE_ENTITY, encoding="utf-8")  # fixture has "…"; never use locale encoding
+        original = parse_entity_file(f)
+
+        data = original.to_dict()
+        restored = EntityMeta.from_dict(data)
+
+        assert restored.slug == original.slug
+        assert restored.title == original.title
+        assert restored.definition == original.definition
+        assert restored.h1_is_title_case == original.h1_is_title_case
+        assert restored.section_slugs == original.section_slugs
+        assert restored.definition_word_count == original.definition_word_count
+
+    def test_from_dict_ignores_unknown_keys(self):  # extra keys neither raise nor become attributes
+        data = {
+            "slug": "test",
+            "title": "Test",
+            "h1_raw": "Test",
+            "unknown_field": "should be ignored",
+        }
+        meta = EntityMeta.from_dict(data)  # only three fields given; the rest must have defaults
+        assert meta.slug == "test"
+        assert not hasattr(meta, "unknown_field")  # direct check; old "or __dict__" clause was redundant