From 03c6c5e8de432b33efb69b91efad1fdd0ca10a89 Mon Sep 17 00:00:00 2001
From: tegwick <bernd.worsch@gmail.com>
Date: Thu, 19 Feb 2026 00:27:45 +0100
Subject: [PATCH] feat(infospace): add entity metadata parser (S1.1)

Extract section-tree algorithm from SchemaGenerator into standalone
core/section_tree.py and build markitect/infospace/ package with
EntityMeta dataclass and parse_entity_file/parse_entity_directory.
Foundation for schema compliance, coverage, and granularity metrics.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 markitect/core/__init__.py                 |   4 +
 markitect/core/section_tree.py             | 124 +++++++++++
 markitect/infospace/__init__.py            |  15 ++
 markitect/infospace/entity_parser.py       | 176 ++++++++++++++++
 markitect/infospace/models.py              |  53 +++++
 tests/unit/core/__init__.py                |   0
 tests/unit/core/test_section_tree.py       | 137 ++++++++++++
 tests/unit/infospace/__init__.py           |   0
 tests/unit/infospace/test_entity_parser.py | 230 +++++++++++++++++++++
 9 files changed, 739 insertions(+)
 create mode 100644 markitect/core/section_tree.py
 create mode 100644 markitect/infospace/__init__.py
 create mode 100644 markitect/infospace/entity_parser.py
 create mode 100644 markitect/infospace/models.py
 create mode 100644 tests/unit/core/__init__.py
 create mode 100644 tests/unit/core/test_section_tree.py
 create mode 100644 tests/unit/infospace/__init__.py
 create mode 100644 tests/unit/infospace/test_entity_parser.py

diff --git a/markitect/core/__init__.py b/markitect/core/__init__.py
index c6864e29..4b0aa42a 100644
--- a/markitect/core/__init__.py
+++ b/markitect/core/__init__.py
@@ -9,6 +9,7 @@ This package contains the fundamental building blocks:
 """
 
 from .parser import parse_markdown_to_ast
+from .section_tree import build_section_tree, extract_section_text
 from .serializer import ASTSerializer
 from .document_manager import DocumentManager, CleanDocumentManager
 from .workspace import (
@@ -29,6 +30,9 @@ from .workspace import (
 __all__ = [
     # Parser
     "parse_markdown_to_ast",
+    # Section tree
+    "build_section_tree",
+    "extract_section_text",
     # Serializer
     "ASTSerializer",
     # Document Manager
diff --git a/markitect/core/section_tree.py b/markitect/core/section_tree.py
new file mode 100644
index 00000000..e32c87c9
--- /dev/null
+++ b/markitect/core/section_tree.py
@@ -0,0 +1,124 @@
+"""
+Standalone section-tree utilities extracted from SchemaGenerator.
+
+Builds a hierarchical section tree from flat markdown-it AST tokens and
+provides helpers for navigating heading structure and extracting text.
+These functions are used by both the schema generator and the infospace
+entity parser.
+"""
+
+import re
+from typing import Any, Dict, List, Optional
+
+
+def slugify(text: str) -> str:
+    """Turn heading or label text into a slug usable as a JSON property key.
+
+    German umlauts and ``ß`` are transliterated, the text is lower-cased,
+    every run of characters outside ``[a-z0-9]`` becomes one underscore,
+    and edge underscores are dropped.  An empty result yields ``'feld'``.
+    """
+    umlauts = str.maketrans({
+        'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
+        'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue', 'ß': 'ss',
+    })
+    collapsed = re.sub(r'[^a-z0-9]+', '_', text.translate(umlauts).lower())
+    return collapsed.strip('_') or 'feld'
+
+
+def extract_heading_level(tag: str) -> int:
+    """Return the level of a heading tag (``h1``, ``h2``, …), defaulting to 1.
+
+    Anything other than ``h`` followed by one decimal digit yields ``1``.
+    """
+    if len(tag) == 2 and tag[0] == 'h' and tag[1].isdecimal():
+        return int(tag[1])
+    return 1
+
+
+def extract_heading_content(tokens: List[Dict[str, Any]], start_index: int) -> str:
+    """Return the text of the first ``inline`` token within the three-token
+    window beginning at *start_index*, or ``''`` when none is present."""
+    window_end = min(start_index + 3, len(tokens))
+    window = (tokens[pos] for pos in range(start_index, window_end))
+    inline = next((tok for tok in window if tok.get('type') == 'inline'), None)
+    return '' if inline is None else inline.get('content', '')
+
+
+def build_section_tree(
+    tokens: List[Dict[str, Any]], max_depth: Optional[int] = None
+) -> Dict[str, Any]:
+    """
+    Build a hierarchical section tree from a flat markdown-it token list.
+
+    Returns the root node (top-level sections are its ``children``).
+    Headings deeper than *max_depth* add no node.  Each node carries:
+
+    - ``heading`` – heading text (``None`` for the root)
+    - ``level`` – heading depth (``0`` for the root)
+    - ``slug`` – slugified heading (``''`` for the root)
+    - ``content_tokens`` – non-heading tokens belonging to this section
+    - ``children`` – nested sub-sections
+    """
+    root: Dict[str, Any] = {
+        'heading': None, 'level': 0, 'slug': '',
+        'content_tokens': [], 'children': []
+    }
+    stack = [root]
+
+    i = 0
+    while i < len(tokens):
+        token = tokens[i]
+        if token.get('type') == 'heading_open':
+            level = extract_heading_level(token.get('tag', ''))
+            heading_text = extract_heading_content(tokens, i)
+
+            if max_depth is not None and level > max_depth:
+                # Too deep: skip heading_open..heading_close; later body tokens stay on stack[-1]
+                i += 1
+                while i < len(tokens) and tokens[i].get('type') != 'heading_close':
+                    i += 1
+                i += 1
+                continue
+
+            section: Dict[str, Any] = {
+                'heading': heading_text,
+                'level': level,
+                'slug': slugify(heading_text),
+                'content_tokens': [],
+                'children': []
+            }
+
+            # Unwind to the nearest open section with a strictly smaller level (root never pops)
+            while len(stack) > 1 and stack[-1]['level'] >= level:
+                stack.pop()
+
+            stack[-1]['children'].append(section)
+            stack.append(section)
+
+            # Advance to heading_close; the shared ``i += 1`` below steps past it
+            i += 1
+            while i < len(tokens) and tokens[i].get('type') != 'heading_close':
+                i += 1
+        else:
+            # Body token: attach to the innermost open section (the root if no heading yet)
+            stack[-1]['content_tokens'].append(token)
+
+        i += 1
+
+    return root
+
+
+def extract_section_text(section: Dict[str, Any]) -> str:
+    """
+    Return the plain text content of a section node.
+
+    Joins the ``content`` field of every ``inline`` token found in the
+    section's ``content_tokens`` with newlines; tokens of any other
+    type are skipped.
+    """
+    return '\n'.join(
+        token.get('content', '')
+        for token in section.get('content_tokens', [])
+        if token.get('type') == 'inline'
+    )
diff --git a/markitect/infospace/__init__.py b/markitect/infospace/__init__.py
new file mode 100644
index 00000000..666bc78e
--- /dev/null
+++ b/markitect/infospace/__init__.py
@@ -0,0 +1,15 @@
+"""
+Infospace analysis package.
+
+Provides tooling for extracting structured metadata from entity markdown
+files and analysing infospace collections.
+"""
+
+from .models import EntityMeta
+from .entity_parser import parse_entity_file, parse_entity_directory
+
+__all__ = [
+    "EntityMeta",
+    "parse_entity_file",
+    "parse_entity_directory",
+]
diff --git a/markitect/infospace/entity_parser.py b/markitect/infospace/entity_parser.py
new file mode 100644
index 00000000..888e3490
--- /dev/null
+++ b/markitect/infospace/entity_parser.py
@@ -0,0 +1,176 @@
+"""
+Entity metadata parser.
+
+Extracts structured :class:`EntityMeta` from entity markdown files
+produced by the infospace entity-extraction pipeline.
+"""
+
+import logging
+import re
+from pathlib import Path
+from typing import List, Optional, Sequence
+
+from markitect.core.parser import parse_markdown_to_ast
+from markitect.core.section_tree import (
+    build_section_tree,
+    extract_heading_content,
+    extract_heading_level,
+    extract_section_text,
+    slugify,
+)
+from .models import EntityMeta
+
+logger = logging.getLogger(__name__)
+
+# Known H2 sections (slug → label); NOTE(review): not referenced in this module
+_KNOWN_SECTIONS = {
+    "definition": "Definition",
+    "source_chapter": "Source Chapter",
+    "context": "Context",
+    "economic_domain": "Economic Domain",
+    "smith_s_original_wording": "Smith's Original Wording",
+    "modern_interpretation": "Modern Interpretation",
+}
+
+# Default filename patterns to exclude from directory parsing
+_DEFAULT_EXCLUDE_PATTERNS = (
+    r".*-entities\.md$",
+    r".*-prompt\.md$",
+)
+
+
+def _is_title_case(text: str) -> bool:
+    """Return True if *text* is in title case (minor words may be lowercase)."""
+    # Words that are allowed to be lowercase in title case
+    minor_words = {
+        "a", "an", "the", "and", "but", "or", "nor", "for", "yet", "so",
+        "in", "on", "at", "to", "by", "of", "up", "as", "is", "if",
+    }
+    words = text.split()
+    if not words:
+        return False
+    for i, word in enumerate(words):
+        # Remove every non-word character (anywhere in the word) for the check
+        clean = re.sub(r"[^\w]", "", word)
+        if not clean:
+            continue  # token was pure punctuation; it cannot fail the check
+        # First word must be capitalised
+        if i == 0:
+            if not clean[0].isupper():
+                return False
+        elif clean.lower() in minor_words:
+            continue  # minor words may be lower
+        elif not clean[0].isupper():  # also rejects words starting with a digit
+            return False
+    return True
+
+
+def _word_count(text: str) -> int:
+    """Return the number of whitespace-separated tokens in *text* (0 if blank)."""
+    return len(text.split())
+
+
+def _find_h2_section(tree_root: dict, slug: str) -> Optional[dict]:
+    """Return the first level-2 direct child of *tree_root* matching *slug*."""
+    return next(
+        (n for n in tree_root.get("children", []) if n["level"] == 2 and n["slug"] == slug),
+        None,
+    )
+
+
+def parse_entity_file(path: Path) -> EntityMeta:
+    """Parse a single UTF-8 entity markdown file into :class:`EntityMeta`.
+
+    Raises:
+        ValueError: If the file has no H1 heading.
+    """
+    content = path.read_text(encoding="utf-8")
+    tokens = parse_markdown_to_ast(content)
+    tree = build_section_tree(tokens)
+
+    # --- H1: entity title (first level-1 node directly under the root) ---
+    h1_section = None
+    for child in tree["children"]:
+        if child["level"] == 1:
+            h1_section = child
+            break
+
+    if h1_section is None:
+        raise ValueError(f"No H1 heading found in {path}")
+
+    h1_raw = h1_section["heading"]
+    slug = slugify(h1_raw)
+    title = h1_raw
+    h1_is_title_case = _is_title_case(h1_raw)
+
+    # Use the H1 node as the effective root for H2 look-ups
+    effective_root = h1_section
+
+    # Slugs of the H1's direct H2 children, in document order
+    section_slugs = [c["slug"] for c in effective_root.get("children", []) if c["level"] == 2]
+
+    # --- Extract known sections ("" when a section is absent) ---
+    def _get_section_text(section_slug: str) -> str:
+        node = _find_h2_section(effective_root, section_slug)
+        if node is None:
+            return ""
+        return extract_section_text(node).strip()
+
+    definition = _get_section_text("definition")
+    source_chapter = _get_section_text("source_chapter")
+    context = _get_section_text("context")
+    domain = _get_section_text("economic_domain")
+    original_wording = _get_section_text("smith_s_original_wording")
+    modern_interpretation = _get_section_text("modern_interpretation")
+
+    # --- Derived metrics ---
+    has_original_wording = bool(original_wording)
+    definition_word_count = _word_count(definition)
+    total_word_count = _word_count(content)  # raw file text, markup tokens included
+
+    return EntityMeta(
+        slug=slug,
+        title=title,
+        h1_raw=h1_raw,
+        definition=definition,
+        source_chapter=source_chapter,
+        context=context,
+        domain=domain,
+        original_wording=original_wording,
+        modern_interpretation=modern_interpretation,
+        h1_is_title_case=h1_is_title_case,
+        has_original_wording=has_original_wording,
+        definition_word_count=definition_word_count,
+        total_word_count=total_word_count,
+        section_slugs=section_slugs,
+        source_path=str(path),
+    )
+
+
+def parse_entity_directory(
+    directory: Path,
+    exclude_patterns: Optional[Sequence[str]] = None,
+) -> List[EntityMeta]:
+    """Parse all entity markdown files in *directory*.
+
+    Files matching *exclude_patterns* (regexes applied to the filename
+    with ``re.match``, so anchored at its start) are skipped.  Defaults
+    exclude chapter-view (``*-entities.md``) and prompt (``*-prompt.md``).
+
+    Malformed files are skipped with a warning rather than raising.
+    """
+    if exclude_patterns is None:
+        exclude_patterns = _DEFAULT_EXCLUDE_PATTERNS
+
+    compiled = [re.compile(p) for p in exclude_patterns]
+    entities: List[EntityMeta] = []
+
+    for md_file in sorted(directory.glob("*.md")):  # non-recursive, sorted by path
+        if any(pat.match(md_file.name) for pat in compiled):
+            continue
+        try:
+            entities.append(parse_entity_file(md_file))
+        except Exception as exc:  # deliberately broad: any failure skips the file
+            logger.warning("Skipping %s: %s", md_file.name, exc)
+
+    return entities
diff --git a/markitect/infospace/models.py b/markitect/infospace/models.py
new file mode 100644
index 00000000..38705ecd
--- /dev/null
+++ b/markitect/infospace/models.py
@@ -0,0 +1,53 @@
+"""
+Data models for infospace entity metadata.
+"""
+
+from dataclasses import dataclass, field, asdict
+from typing import Any, Dict, List
+
+
+@dataclass
+class EntityMeta:
+    """Structured metadata extracted from a single entity markdown file.
+
+    The parser populates every field it can find; missing optional
+    sections are left as empty strings (validation is a separate step).
+    """
+
+    # Identity
+    slug: str
+    title: str
+    h1_raw: str  # verbatim H1 text before any normalisation
+
+    # Section contents (plain text, empty string if section missing)
+    definition: str = ""
+    source_chapter: str = ""
+    context: str = ""
+    domain: str = ""
+    original_wording: str = ""
+    modern_interpretation: str = ""
+
+    # Derived flags
+    h1_is_title_case: bool = False
+    has_original_wording: bool = False
+
+    # Metrics-ready numbers
+    definition_word_count: int = 0
+    total_word_count: int = 0
+
+    # All H2 section slugs found (preserves order)
+    section_slugs: List[str] = field(default_factory=list)
+
+    # Source file path (as string for serialisation)
+    source_path: str = ""
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialise to a plain dictionary via :func:`dataclasses.asdict`."""
+        return asdict(self)
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EntityMeta":
+        """Build an instance from *data*, silently dropping unknown keys."""
+        known_fields = {f.name for f in cls.__dataclass_fields__.values()}
+        filtered = {k: v for k, v in data.items() if k in known_fields}
+        return cls(**filtered)
diff --git a/tests/unit/core/__init__.py b/tests/unit/core/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/unit/core/test_section_tree.py b/tests/unit/core/test_section_tree.py
new file mode 100644
index 00000000..83fa3528
--- /dev/null
+++ b/tests/unit/core/test_section_tree.py
@@ -0,0 +1,137 @@
+"""Tests for markitect.core.section_tree."""
+
+from markitect.core.parser import parse_markdown_to_ast
+from markitect.core.section_tree import (
+    build_section_tree,
+    extract_heading_content,
+    extract_heading_level,
+    extract_section_text,
+    slugify,
+)
+
+
+class TestSlugify:
+    def test_simple_text(self):
+        assert slugify("Hello World") == "hello_world"
+
+    def test_german_umlauts(self):
+        assert slugify("Ärger mit Über") == "aerger_mit_ueber"
+
+    def test_special_characters(self):
+        assert slugify("Smith's Original Wording") == "smith_s_original_wording"
+
+    def test_empty_string(self):
+        assert slugify("") == "feld"
+
+    def test_trailing_underscores_stripped(self):
+        assert slugify("--hello--") == "hello"
+
+    def test_multiple_spaces(self):
+        assert slugify("a   b") == "a_b"
+
+
+class TestExtractHeadingLevel:
+    def test_h1(self):
+        assert extract_heading_level("h1") == 1
+
+    def test_h6(self):
+        assert extract_heading_level("h6") == 6
+
+    def test_invalid_tag(self):
+        assert extract_heading_level("p") == 1
+
+    def test_empty(self):
+        assert extract_heading_level("") == 1
+
+
+class TestExtractHeadingContent:
+    def test_finds_inline_token(self):
+        tokens = [
+            {"type": "heading_open", "tag": "h1"},
+            {"type": "inline", "content": "Hello"},
+            {"type": "heading_close", "tag": "h1"},
+        ]
+        assert extract_heading_content(tokens, 0) == "Hello"
+
+    def test_no_inline(self):
+        tokens = [
+            {"type": "heading_open", "tag": "h1"},
+            {"type": "heading_close", "tag": "h1"},
+        ]
+        assert extract_heading_content(tokens, 0) == ""
+
+
+class TestBuildSectionTree:
+    def test_single_heading(self):
+        md = "# Title\n\nSome text."
+        tokens = parse_markdown_to_ast(md)
+        tree = build_section_tree(tokens)
+
+        assert tree["level"] == 0
+        assert len(tree["children"]) == 1
+        assert tree["children"][0]["heading"] == "Title"
+        assert tree["children"][0]["level"] == 1
+
+    def test_nested_headings(self):
+        md = "# Top\n\n## Sub\n\ntext\n\n## Sub2\n\nmore"
+        tokens = parse_markdown_to_ast(md)
+        tree = build_section_tree(tokens)
+
+        top = tree["children"][0]
+        assert top["heading"] == "Top"
+        assert len(top["children"]) == 2
+        assert top["children"][0]["heading"] == "Sub"
+        assert top["children"][1]["heading"] == "Sub2"
+
+    def test_max_depth(self):
+        md = "# Top\n\n## Sub\n\n### Deep\n\ntext"
+        tokens = parse_markdown_to_ast(md)
+        tree = build_section_tree(tokens, max_depth=2)
+
+        top = tree["children"][0]
+        sub = top["children"][0]
+        # H3 should be excluded from tree
+        assert len(sub["children"]) == 0
+
+    def test_content_tokens_captured(self):
+        md = "# Title\n\nParagraph text here."
+        tokens = parse_markdown_to_ast(md)
+        tree = build_section_tree(tokens)
+
+        section = tree["children"][0]
+        inline_tokens = [t for t in section["content_tokens"] if t.get("type") == "inline"]
+        assert len(inline_tokens) == 1
+        assert "Paragraph text here" in inline_tokens[0]["content"]
+
+    def test_slug_assigned(self):
+        md = "# Economic Domain\n\ntext"
+        tokens = parse_markdown_to_ast(md)
+        tree = build_section_tree(tokens)
+
+        assert tree["children"][0]["slug"] == "economic_domain"
+
+    def test_empty_document(self):
+        tokens = parse_markdown_to_ast("")
+        tree = build_section_tree(tokens)
+        assert tree["children"] == []
+
+
+class TestExtractSectionText:
+    def test_simple_paragraph(self):
+        md = "# Title\n\nHello world."
+        tokens = parse_markdown_to_ast(md)
+        tree = build_section_tree(tokens)
+        text = extract_section_text(tree["children"][0])
+        assert text == "Hello world."
+
+    def test_multiple_paragraphs(self):
+        md = "# Title\n\nFirst paragraph.\n\nSecond paragraph."
+        tokens = parse_markdown_to_ast(md)
+        tree = build_section_tree(tokens)
+        text = extract_section_text(tree["children"][0])
+        assert "First paragraph." in text
+        assert "Second paragraph." in text
+
+    def test_empty_section(self):
+        section = {"content_tokens": []}
+        assert extract_section_text(section) == ""
diff --git a/tests/unit/infospace/__init__.py b/tests/unit/infospace/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/unit/infospace/test_entity_parser.py b/tests/unit/infospace/test_entity_parser.py
new file mode 100644
index 00000000..fbcaf9c8
--- /dev/null
+++ b/tests/unit/infospace/test_entity_parser.py
@@ -0,0 +1,230 @@
+"""Tests for markitect.infospace.entity_parser and EntityMeta."""
+
+import logging
+from pathlib import Path
+
+import pytest
+
+from markitect.infospace import EntityMeta, parse_entity_file, parse_entity_directory
+
+
+# ── Fixtures ────────────────────────────────────────────────────────
+
+COMPLETE_ENTITY = """\
+# Division of Labour
+
+## Definition
+
+The separation of a work process into a number of distinct tasks, each performed
+by a specialised worker, resulting in a significant increase in the productive
+powers of labour.
+
+## Source Chapter
+
+Book I, Chapter 1: "Of the Division of Labour"
+
+## Context
+
+The division of labour is the central argument of the chapter.
+
+## Economic Domain
+
+Production
+
+## Smith's Original Wording
+
+"The greatest improvements in the productive powers of labour…"
+
+## Modern Interpretation
+
+The division of labour remains a foundational concept in economics.
+"""
+
+MINIMAL_ENTITY = """\
+# Minimal Entity
+
+## Definition
+
+A brief definition.
+
+## Source Chapter
+
+Book I, Chapter 1
+
+## Context
+
+Some context.
+
+## Economic Domain
+
+Exchange
+"""
+
+SLUG_H1_ENTITY = """\
+# effectual-demand
+
+## Definition
+
+Effectual demand is the demand by consumers who are willing and able to pay.
+
+## Source Chapter
+
+Book 1, Chapter 7
+
+## Context
+
+Context for effectual demand.
+
+## Economic Domain
+
+Exchange
+
+## Smith's Original Wording
+
+"Such people may be called the effectual demanders…"
+
+## Modern Interpretation
+
+Represents the intersection of desire and purchasing power.
+"""
+
+NO_H1 = """\
+## Only H2
+
+Some content.
+"""
+
+
+# ── parse_entity_file ────────────────────────────────────────────────
+
+class TestParseEntityFile:
+    def test_complete_entity(self, tmp_path):
+        f = tmp_path / "division-of-labour.md"
+        f.write_text(COMPLETE_ENTITY, encoding="utf-8")
+        meta = parse_entity_file(f)
+
+        assert meta.slug == "division_of_labour"
+        assert meta.title == "Division of Labour"
+        assert meta.h1_is_title_case is True
+        assert meta.has_original_wording is True
+        assert meta.domain == "Production"
+        assert meta.definition_word_count > 20
+        assert "separation" in meta.definition.lower()
+        assert meta.source_path == str(f)
+        assert "definition" in meta.section_slugs
+        assert "smith_s_original_wording" in meta.section_slugs
+
+    def test_minimal_entity(self, tmp_path):
+        f = tmp_path / "minimal-entity.md"
+        f.write_text(MINIMAL_ENTITY, encoding="utf-8")
+        meta = parse_entity_file(f)
+
+        assert meta.slug == "minimal_entity"
+        assert meta.has_original_wording is False
+        assert meta.original_wording == ""
+        assert meta.modern_interpretation == ""
+        assert meta.domain == "Exchange"
+
+    def test_slug_format_h1(self, tmp_path):
+        f = tmp_path / "effectual-demand.md"
+        f.write_text(SLUG_H1_ENTITY, encoding="utf-8")
+        meta = parse_entity_file(f)
+
+        assert meta.h1_raw == "effectual-demand"
+        assert meta.h1_is_title_case is False
+        assert meta.slug == "effectual_demand"
+        assert meta.has_original_wording is True
+
+    def test_missing_h1_raises(self, tmp_path):
+        f = tmp_path / "no-h1.md"
+        f.write_text(NO_H1, encoding="utf-8")
+        with pytest.raises(ValueError, match="No H1"):
+            parse_entity_file(f)
+
+    def test_missing_sections_return_empty(self, tmp_path):
+        f = tmp_path / "minimal.md"
+        f.write_text(MINIMAL_ENTITY, encoding="utf-8")
+        meta = parse_entity_file(f)
+
+        # Optional sections not present → empty string
+        assert meta.original_wording == ""
+        assert meta.modern_interpretation == ""
+
+    def test_word_count_accuracy(self, tmp_path):
+        f = tmp_path / "test.md"
+        f.write_text("# Test\n\n## Definition\n\none two three four five\n", encoding="utf-8")
+        meta = parse_entity_file(f)
+        assert meta.definition_word_count == 5
+
+
+# ── parse_entity_directory ──────────────────────────────────────────
+
+class TestParseEntityDirectory:
+    def _make_dir(self, tmp_path):
+        """Create a temporary entity directory (files written as UTF-8, as the parser reads)."""
+        d = tmp_path / "entities"
+        d.mkdir()
+        (d / "entity-a.md").write_text(COMPLETE_ENTITY, encoding="utf-8")
+        (d / "entity-b.md").write_text(MINIMAL_ENTITY, encoding="utf-8")
+        # files that should be excluded by default
+        (d / "book-1-chapter-01-entities.md").write_text("# View\n\nview file", encoding="utf-8")
+        (d / "book-1-chapter-01-prompt.md").write_text("# Prompt\n\nprompt file", encoding="utf-8")
+        return d
+
+    def test_excludes_view_and_prompt(self, tmp_path):
+        d = self._make_dir(tmp_path)
+        results = parse_entity_directory(d)
+        slugs = {e.slug for e in results}
+
+        assert "division_of_labour" in slugs
+        assert "minimal_entity" in slugs
+        # Excluded files should not be parsed as entities
+        assert len(results) == 2
+
+    def test_custom_exclude_patterns(self, tmp_path):
+        d = self._make_dir(tmp_path)
+        # Only exclude prompt files, allow entity views
+        results = parse_entity_directory(d, exclude_patterns=[r".*-prompt\.md$"])
+        assert len(results) == 3  # entity-a, entity-b, chapter-01-entities
+
+    def test_malformed_skipped_with_warning(self, tmp_path, caplog):
+        d = tmp_path / "entities"
+        d.mkdir()
+        (d / "good.md").write_text(COMPLETE_ENTITY, encoding="utf-8")
+        (d / "bad.md").write_text(NO_H1, encoding="utf-8")
+
+        with caplog.at_level(logging.WARNING):
+            results = parse_entity_directory(d)
+
+        assert len(results) == 1
+        assert "bad.md" in caplog.text
+
+
+# ── EntityMeta round-trip ───────────────────────────────────────────
+
+class TestEntityMetaRoundTrip:
+    def test_to_dict_from_dict(self, tmp_path):
+        f = tmp_path / "entity.md"
+        f.write_text(COMPLETE_ENTITY, encoding="utf-8")
+        original = parse_entity_file(f)
+
+        data = original.to_dict()
+        restored = EntityMeta.from_dict(data)
+
+        assert restored.slug == original.slug
+        assert restored.title == original.title
+        assert restored.definition == original.definition
+        assert restored.h1_is_title_case == original.h1_is_title_case
+        assert restored.section_slugs == original.section_slugs
+        assert restored.definition_word_count == original.definition_word_count
+
+    def test_from_dict_ignores_unknown_keys(self):
+        data = {
+            "slug": "test",
+            "title": "Test",
+            "h1_raw": "Test",
+            "unknown_field": "should be ignored",
+        }
+        meta = EntityMeta.from_dict(data)
+        assert meta.slug == "test"
+        assert not hasattr(meta, "unknown_field")