feat(infospace): add entity metadata parser (S1.1)

Extract the section-tree algorithm from SchemaGenerator into a standalone core/section_tree.py and build the markitect/infospace/ package with the EntityMeta dataclass and parse_entity_file/parse_entity_directory. Foundation for schema compliance, coverage, and granularity metrics.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 00:27:45 +01:00
parent b5e994b014
commit 03c6c5e8de
9 changed files with 739 additions and 0 deletions
--- a/markitect/core/__init__.py
+++ b/markitect/core/__init__.py
@@ -9,6 +9,7 @@ This package contains the fundamental building blocks:
 """
 from .parser import parse_markdown_to_ast
 from .section_tree import build_section_tree, extract_section_text
 from .serializer import ASTSerializer
 from .document_manager import DocumentManager, CleanDocumentManager
 from .workspace import (
@@ -29,6 +30,9 @@ from .workspace import (
 __all__ = [
    # Parser
    "parse_markdown_to_ast",
    # Section tree
    "build_section_tree",
    "extract_section_text",
    # Serializer
    "ASTSerializer",
    # Document Manager
--- a/markitect/core/section_tree.py
+++ b/markitect/core/section_tree.py
@@ -0,0 +1,124 @@
 """
 Standalone section-tree utilities extracted from SchemaGenerator.
 Builds a hierarchical section tree from flat markdown-it AST tokens and
 provides helpers for navigating heading structure and extracting text.
 These functions are used by both the schema generator and the infospace
 entity parser.
 """
 import re
 from typing import Any, Dict, List, Optional
 def slugify(text: str) -> str:
    """Turn heading or label text into a slug usable as a JSON property key.
    German umlauts and ``ß`` are transliterated first, the text is
    lower-cased, every run of characters outside ``a-z0-9`` collapses to a
    single underscore, and leading/trailing underscores are removed.
    Returns ``'feld'`` when nothing is left (e.g. for an empty string).
    """
    transliteration = str.maketrans({
        'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
        'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue', 'ß': 'ss',
    })
    lowered = text.translate(transliteration).lower()
    collapsed = re.sub(r'[^a-z0-9]+', '_', lowered).strip('_')
    if collapsed:
        return collapsed
    # Fallback key for text that contains no usable characters.
    return 'feld'
 def extract_heading_level(tag: str) -> int:
    """Return the numeric level encoded in a heading tag such as ``h1`` or ``h2``.
    Any tag that is not exactly ``h`` followed by one digit character
    yields the default level ``1``.
    """
    if len(tag) != 2 or not tag.startswith('h'):
        return 1
    try:
        level = int(tag[1])
    except ValueError:
        # Second character is not a digit (e.g. "hr"): use the default.
        return 1
    return level
 def extract_heading_content(tokens: List[Dict[str, Any]], start_index: int) -> str:
    """Return the text of the first ``inline`` token at or shortly after *start_index*.
    Only the three positions starting at *start_index* are inspected
    (fewer if the token list ends earlier).  Returns an empty string when
    none of them is an ``inline`` token or the token has no ``content``.
    """
    stop = min(start_index + 3, len(tokens))
    return next(
        (
            tokens[pos].get('content', '')
            for pos in range(start_index, stop)
            if tokens[pos].get('type') == 'inline'
        ),
        '',
    )
 def build_section_tree(
    tokens: List[Dict[str, Any]], max_depth: Optional[int] = None
 ) -> Dict[str, Any]:
    """
    Turn a flat markdown-it token list into a nested tree of sections.
    The returned root node (``heading`` ``None``, ``level`` ``0``, empty
    ``slug``) holds the top-level sections in ``children``.  Every node
    is a dict with the keys:
    - ``heading`` – heading text
    - ``level`` – heading depth
    - ``slug`` – slugified heading
    - ``content_tokens`` – non-heading tokens belonging to this section
    - ``children`` – nested sub-sections
    When *max_depth* is given, headings deeper than it create no node;
    their heading tokens are dropped and the tokens that follow are
    attached to the section that is open at that point.
    """
    root: Dict[str, Any] = {
        'heading': None, 'level': 0, 'slug': '',
        'content_tokens': [], 'children': []
    }
    # Chain of currently open sections, root first, innermost last.
    open_sections = [root]
    total = len(tokens)
    pos = 0
    while pos < total:
        current = tokens[pos]
        if current.get('type') != 'heading_open':
            # Plain content belongs to the innermost open section.
            open_sections[-1]['content_tokens'].append(current)
            pos += 1
            continue
        depth = extract_heading_level(current.get('tag', ''))
        title = extract_heading_content(tokens, pos)
        if max_depth is None or depth <= max_depth:
            node: Dict[str, Any] = {
                'heading': title,
                'level': depth,
                'slug': slugify(title),
                'content_tokens': [],
                'children': []
            }
            # Close every open section that is not shallower than the new one.
            while len(open_sections) > 1 and open_sections[-1]['level'] >= depth:
                open_sections.pop()
            open_sections[-1]['children'].append(node)
            open_sections.append(node)
        # Kept or skipped, the heading's own tokens are consumed up to and
        # including its heading_close so they never land in content_tokens.
        pos += 1
        while pos < total and tokens[pos].get('type') != 'heading_close':
            pos += 1
        pos += 1
    return root
 def extract_section_text(section: Dict[str, Any]) -> str:
    """
    Return the plain text content of a section node.
    Collects the ``content`` field of every ``inline`` token in the
    section's ``content_tokens`` (tokens of any other type are ignored)
    and joins the pieces with newlines.  A section without
    ``content_tokens`` yields an empty string.
    """
    inline_texts = [
        entry.get('content', '')
        for entry in section.get('content_tokens', [])
        if entry.get('type') == 'inline'
    ]
    return '\n'.join(inline_texts)
--- a/markitect/infospace/__init__.py
+++ b/markitect/infospace/__init__.py
@@ -0,0 +1,15 @@
 """
 Infospace analysis package.
 Provides tooling for extracting structured metadata from entity markdown
 files and analysing infospace collections.
 """
 from .models import EntityMeta
 from .entity_parser import parse_entity_file, parse_entity_directory
 __all__ = [
    "EntityMeta",
    "parse_entity_file",
    "parse_entity_directory",
 ]
--- a/markitect/infospace/entity_parser.py
+++ b/markitect/infospace/entity_parser.py
@@ -0,0 +1,176 @@
 """
 Entity metadata parser.
 Extracts structured :class:`EntityMeta` from entity markdown files
 produced by the infospace entity-extraction pipeline.
 """
 import logging
 import re
 from pathlib import Path
 from typing import List, Optional, Sequence
 from markitect.core.parser import parse_markdown_to_ast
 from markitect.core.section_tree import (
    build_section_tree,
    extract_heading_content,
    extract_heading_level,
    extract_section_text,
    slugify,
 )
 from .models import EntityMeta
 # Module-level logger; parse_entity_directory uses it to warn about skipped files.
 logger = logging.getLogger(__name__)
 # Sections we look for (slug → human-friendly label)
 # NOTE(review): this mapping is not referenced anywhere in the visible code;
 # parse_entity_file spells the same slugs out again — confirm it is used elsewhere.
 _KNOWN_SECTIONS = {
    "definition": "Definition",
    "source_chapter": "Source Chapter",
    "context": "Context",
    "economic_domain": "Economic Domain",
    "smith_s_original_wording": "Smith's Original Wording",
    "modern_interpretation": "Modern Interpretation",
 }
 # Default filename patterns to exclude from directory parsing
 # (regexes; parse_entity_directory applies them to the bare filename with re.match).
 _DEFAULT_EXCLUDE_PATTERNS = (
    r".*-entities\.md$",
    r".*-prompt\.md$",
 )
 def _is_title_case(text: str) -> bool:
    """Tell whether *text* reads as a title-cased phrase.
    Every whitespace-separated word must start with an upper-case
    character once non-word characters are stripped.  Short function
    words (articles, conjunctions, prepositions) may stay lower-case
    unless they are the first word.  Empty or whitespace-only text is
    not title case.
    """
    # Words that may remain lower-case when they are not in first position.
    minor_words = {
        "a", "an", "the", "and", "but", "or", "nor", "for", "yet", "so",
        "in", "on", "at", "to", "by", "of", "up", "as", "is", "if",
    }
    pieces = text.split()
    if not pieces:
        return False
    for position, raw in enumerate(pieces):
        # Judge the word without surrounding or embedded punctuation.
        stripped = re.sub(r"[^\w]", "", raw)
        if not stripped:
            continue
        if position > 0 and stripped.lower() in minor_words:
            continue
        if not stripped[0].isupper():
            return False
    return True
 def _word_count(text: str) -> int:
    """Return how many whitespace-delimited words *text* contains."""
    words = text.split()
    return len(words)
 def _find_h2_section(tree_root: dict, slug: str) -> Optional[dict]:
    """Return the first direct child of *tree_root* that is an H2 with *slug*.
    Only immediate children are inspected; ``None`` is returned when no
    child has level 2 and the requested slug.
    """
    candidates = (
        node
        for node in tree_root.get("children", [])
        if node["level"] == 2 and node["slug"] == slug
    )
    return next(candidates, None)
 def parse_entity_file(path: Path) -> EntityMeta:
    """Read one entity markdown file and turn it into :class:`EntityMeta`.
    The first H1 among the document's top-level sections names the
    entity; the H2 sections nested directly under it supply the section
    texts.  Sections that are absent are reported as empty strings.
    Raises:
        ValueError: If the file has no H1 heading.
    """
    markdown_source = path.read_text(encoding="utf-8")
    tree = build_section_tree(parse_markdown_to_ast(markdown_source))
    # --- H1: entity title ---
    h1_section = next(
        (node for node in tree["children"] if node["level"] == 1),
        None,
    )
    if h1_section is None:
        raise ValueError(f"No H1 heading found in {path}")
    h1_raw = h1_section["heading"]
    # --- H2 sections are looked up beneath the H1 node, in document order ---
    h2_slugs = [
        node["slug"]
        for node in h1_section.get("children", [])
        if node["level"] == 2
    ]
    def _section_body(section_slug: str) -> str:
        match = _find_h2_section(h1_section, section_slug)
        return "" if match is None else extract_section_text(match).strip()
    definition = _section_body("definition")
    original_wording = _section_body("smith_s_original_wording")
    return EntityMeta(
        slug=slugify(h1_raw),
        title=h1_raw,
        h1_raw=h1_raw,
        definition=definition,
        source_chapter=_section_body("source_chapter"),
        context=_section_body("context"),
        domain=_section_body("economic_domain"),
        original_wording=original_wording,
        modern_interpretation=_section_body("modern_interpretation"),
        h1_is_title_case=_is_title_case(h1_raw),
        has_original_wording=bool(original_wording),
        definition_word_count=_word_count(definition),
        # Counted over the raw file text, so markdown markers such as "##"
        # are included in this number.
        total_word_count=_word_count(markdown_source),
        section_slugs=h2_slugs,
        source_path=str(path),
    )
 def parse_entity_directory(
    directory: Path,
    exclude_patterns: Optional[Sequence[str]] = None,
 ) -> List[EntityMeta]:
    """Parse every ``*.md`` file directly inside *directory* into entity metadata.
    Files are visited in sorted order.  A file is skipped when any of
    *exclude_patterns* (regexes applied with ``re.match`` to the bare
    filename) matches; when no patterns are passed, chapter-view
    (``*-entities.md``) and prompt (``*-prompt.md``) files are excluded.
    A file that fails to parse is logged as a warning and left out of the
    result instead of aborting the whole run.
    """
    patterns = _DEFAULT_EXCLUDE_PATTERNS if exclude_patterns is None else exclude_patterns
    matchers = [re.compile(expr) for expr in patterns]
    parsed: List[EntityMeta] = []
    for candidate in sorted(directory.glob("*.md")):
        excluded = any(matcher.match(candidate.name) for matcher in matchers)
        if excluded:
            continue
        try:
            meta = parse_entity_file(candidate)
        except Exception as exc:
            # Deliberately best-effort: one bad file must not stop the batch.
            logger.warning("Skipping %s: %s", candidate.name, exc)
        else:
            parsed.append(meta)
    return parsed
--- a/markitect/infospace/models.py
+++ b/markitect/infospace/models.py
@@ -0,0 +1,53 @@
 """
 Data models for infospace entity metadata.
 """
 from dataclasses import dataclass, field, asdict
 from typing import Any, Dict, List
@dataclass
class EntityMeta:
    """Metadata record describing one entity markdown file.
    Only the three identity fields are required.  Every section field
    defaults to an empty string so that a missing section is represented
    rather than rejected; checking completeness is left to a separate
    validation step.
    """
    # --- Identity ---
    slug: str
    title: str
    h1_raw: str  # H1 text exactly as written, before any normalisation
    # --- Section bodies as plain text ("" when the section is missing) ---
    definition: str = ""
    source_chapter: str = ""
    context: str = ""
    domain: str = ""
    original_wording: str = ""
    modern_interpretation: str = ""
    # --- Flags derived from the fields above ---
    h1_is_title_case: bool = False
    has_original_wording: bool = False
    # --- Numbers prepared for metrics ---
    definition_word_count: int = 0
    total_word_count: int = 0
    # Slugs of all H2 sections found, in document order
    section_slugs: List[str] = field(default_factory=list)
    # Path of the source file, kept as a string so the record serialises cleanly
    source_path: str = ""
    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain dictionary."""
        return asdict(self)
    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EntityMeta":
        """Build a record from a plain dictionary, ignoring unknown keys."""
        accepted = set(cls.__dataclass_fields__)
        kwargs = {key: data[key] for key in data if key in accepted}
        return cls(**kwargs)
--- a/tests/unit/core/__init__.py
+++ b/tests/unit/core/__init__.py
--- a/tests/unit/core/test_section_tree.py
+++ b/tests/unit/core/test_section_tree.py
@@ -0,0 +1,137 @@
 """Tests for markitect.core.section_tree."""
 from markitect.core.parser import parse_markdown_to_ast
 from markitect.core.section_tree import (
    build_section_tree,
    extract_heading_content,
    extract_heading_level,
    extract_section_text,
    slugify,
 )
 class TestSlugify:
    """``slugify`` on plain, accented, punctuated, and empty input."""
    def test_simple_text(self):
        """Spaces become underscores and letters are lower-cased."""
        result = slugify("Hello World")
        assert result == "hello_world"
    def test_german_umlauts(self):
        """Umlauts are transliterated to their two-letter forms."""
        result = slugify("Ärger mit Über")
        assert result == "aerger_mit_ueber"
    def test_special_characters(self):
        """An apostrophe is replaced by an underscore like any other non-alphanumeric."""
        result = slugify("Smith's Original Wording")
        assert result == "smith_s_original_wording"
    def test_empty_string(self):
        """Empty input falls back to the placeholder slug."""
        result = slugify("")
        assert result == "feld"
    def test_trailing_underscores_stripped(self):
        """Leading and trailing separators do not survive in the slug."""
        result = slugify("--hello--")
        assert result == "hello"
    def test_multiple_spaces(self):
        """A run of spaces collapses to a single underscore."""
        result = slugify("a   b")
        assert result == "a_b"
 class TestExtractHeadingLevel:
    """``extract_heading_level`` on valid and invalid tag strings."""
    def test_h1(self):
        """``h1`` maps to level 1."""
        level = extract_heading_level("h1")
        assert level == 1
    def test_h6(self):
        """``h6`` maps to level 6."""
        level = extract_heading_level("h6")
        assert level == 6
    def test_invalid_tag(self):
        """A non-heading tag falls back to level 1."""
        level = extract_heading_level("p")
        assert level == 1
    def test_empty(self):
        """An empty tag falls back to level 1."""
        level = extract_heading_level("")
        assert level == 1
 class TestExtractHeadingContent:
    """``extract_heading_content`` on hand-built token lists."""
    def test_finds_inline_token(self):
        """The inline token after ``heading_open`` supplies the text."""
        heading_tokens = [
            {"type": "heading_open", "tag": "h1"},
            {"type": "inline", "content": "Hello"},
            {"type": "heading_close", "tag": "h1"},
        ]
        found = extract_heading_content(heading_tokens, 0)
        assert found == "Hello"
    def test_no_inline(self):
        """Without an inline token the result is an empty string."""
        heading_tokens = [
            {"type": "heading_open", "tag": "h1"},
            {"type": "heading_close", "tag": "h1"},
        ]
        found = extract_heading_content(heading_tokens, 0)
        assert found == ""
 class TestBuildSectionTree:
    """``build_section_tree`` on token lists produced by the real markdown parser."""
    def test_single_heading(self):
        """One H1 yields a root with exactly one level-1 child."""
        tree = build_section_tree(parse_markdown_to_ast("# Title\n\nSome text."))
        assert tree["level"] == 0
        assert len(tree["children"]) == 1
        only_child = tree["children"][0]
        assert only_child["heading"] == "Title"
        assert only_child["level"] == 1
    def test_nested_headings(self):
        """Two H2s after an H1 become its children, in document order."""
        source = "# Top\n\n## Sub\n\ntext\n\n## Sub2\n\nmore"
        tree = build_section_tree(parse_markdown_to_ast(source))
        top = tree["children"][0]
        assert top["heading"] == "Top"
        assert len(top["children"]) == 2
        first, second = top["children"][0], top["children"][1]
        assert first["heading"] == "Sub"
        assert second["heading"] == "Sub2"
    def test_max_depth(self):
        """Headings deeper than ``max_depth`` create no node."""
        source = "# Top\n\n## Sub\n\n### Deep\n\ntext"
        tree = build_section_tree(parse_markdown_to_ast(source), max_depth=2)
        sub = tree["children"][0]["children"][0]
        # H3 should be excluded from tree
        assert len(sub["children"]) == 0
    def test_content_tokens_captured(self):
        """Paragraph text ends up in the section's ``content_tokens``."""
        tree = build_section_tree(parse_markdown_to_ast("# Title\n\nParagraph text here."))
        section = tree["children"][0]
        inline_tokens = [
            tok for tok in section["content_tokens"] if tok.get("type") == "inline"
        ]
        assert len(inline_tokens) == 1
        assert "Paragraph text here" in inline_tokens[0]["content"]
    def test_slug_assigned(self):
        """Each section carries the slug of its heading."""
        tree = build_section_tree(parse_markdown_to_ast("# Economic Domain\n\ntext"))
        assert tree["children"][0]["slug"] == "economic_domain"
    def test_empty_document(self):
        """An empty document produces a root without children."""
        tree = build_section_tree(parse_markdown_to_ast(""))
        assert tree["children"] == []
 class TestExtractSectionText:
    """``extract_section_text`` on parsed sections and on a bare node."""
    def test_simple_paragraph(self):
        """A single paragraph is returned verbatim."""
        tree = build_section_tree(parse_markdown_to_ast("# Title\n\nHello world."))
        extracted = extract_section_text(tree["children"][0])
        assert extracted == "Hello world."
    def test_multiple_paragraphs(self):
        """Text of every paragraph in the section is present in the result."""
        source = "# Title\n\nFirst paragraph.\n\nSecond paragraph."
        tree = build_section_tree(parse_markdown_to_ast(source))
        extracted = extract_section_text(tree["children"][0])
        assert "First paragraph." in extracted
        assert "Second paragraph." in extracted
    def test_empty_section(self):
        """A node without content tokens yields an empty string."""
        bare_node = {"content_tokens": []}
        assert extract_section_text(bare_node) == ""
--- a/tests/unit/infospace/__init__.py
+++ b/tests/unit/infospace/__init__.py
--- a/tests/unit/infospace/test_entity_parser.py
+++ b/tests/unit/infospace/test_entity_parser.py
@@ -0,0 +1,230 @@
 """Tests for markitect.infospace.entity_parser and EntityMeta."""
 import logging
 from pathlib import Path
 import pytest
 from markitect.infospace import EntityMeta, parse_entity_file, parse_entity_directory
 # ── Fixtures ────────────────────────────────────────────────────────
 # Entity with a title-cased H1 and all six H2 sections the parser extracts.
 COMPLETE_ENTITY = """\
 # Division of Labour
 ## Definition
 The separation of a work process into a number of distinct tasks, each performed
 by a specialised worker, resulting in a significant increase in the productive
 powers of labour.
 ## Source Chapter
 Book I, Chapter 1: "Of the Division of Labour"
 ## Context
 The division of labour is the central argument of the chapter.
 ## Economic Domain
 Production
 ## Smith's Original Wording
 "The greatest improvements in the productive powers of labour…"
 ## Modern Interpretation
 The division of labour remains a foundational concept in economics.
 """
 # Entity without the "Smith's Original Wording" and "Modern Interpretation" sections.
 MINIMAL_ENTITY = """\
 # Minimal Entity
 ## Definition
 A brief definition.
 ## Source Chapter
 Book I, Chapter 1
 ## Context
 Some context.
 ## Economic Domain
 Exchange
 """
 # Entity whose H1 is written as a lower-case, hyphenated slug instead of a title.
 SLUG_H1_ENTITY = """\
 # effectual-demand
 ## Definition
 Effectual demand is the demand by consumers who are willing and able to pay.
 ## Source Chapter
 Book 1, Chapter 7
 ## Context
 Context for effectual demand.
 ## Economic Domain
 Exchange
 ## Smith's Original Wording
 "Such people may be called the effectual demanders…"
 ## Modern Interpretation
 Represents the intersection of desire and purchasing power.
 """
 # Malformed document: it starts at H2 and has no H1 at all.
 NO_H1 = """\
 ## Only H2
 Some content.
 """
 # ── parse_entity_file ────────────────────────────────────────────────
 class TestParseEntityFile:
    """``parse_entity_file`` on complete, minimal, slug-titled, and malformed files.
    Fixtures are written as UTF-8 explicitly: the parser reads UTF-8 and
    some fixtures contain non-ASCII characters, so relying on the locale
    default encoding would break these tests on non-UTF-8 platforms.
    """
    def test_complete_entity(self, tmp_path):
        """All sections and derived fields are filled for a complete entity."""
        f = tmp_path / "division-of-labour.md"
        f.write_text(COMPLETE_ENTITY, encoding="utf-8")
        meta = parse_entity_file(f)
        assert meta.slug == "division_of_labour"
        assert meta.title == "Division of Labour"
        assert meta.h1_is_title_case is True
        assert meta.has_original_wording is True
        assert meta.domain == "Production"
        assert meta.definition_word_count > 20
        assert "separation" in meta.definition.lower()
        assert meta.source_path == str(f)
        assert "definition" in meta.section_slugs
        assert "smith_s_original_wording" in meta.section_slugs
    def test_minimal_entity(self, tmp_path):
        """Absent optional sections leave their fields empty."""
        f = tmp_path / "minimal-entity.md"
        f.write_text(MINIMAL_ENTITY, encoding="utf-8")
        meta = parse_entity_file(f)
        assert meta.slug == "minimal_entity"
        assert meta.has_original_wording is False
        assert meta.original_wording == ""
        assert meta.modern_interpretation == ""
        assert meta.domain == "Exchange"
    def test_slug_format_h1(self, tmp_path):
        """A slug-style H1 is kept verbatim and flagged as not title case."""
        f = tmp_path / "effectual-demand.md"
        f.write_text(SLUG_H1_ENTITY, encoding="utf-8")
        meta = parse_entity_file(f)
        assert meta.h1_raw == "effectual-demand"
        assert meta.h1_is_title_case is False
        assert meta.slug == "effectual_demand"
        assert meta.has_original_wording is True
    def test_missing_h1_raises(self, tmp_path):
        """A file without an H1 is rejected with ``ValueError``."""
        f = tmp_path / "no-h1.md"
        f.write_text(NO_H1, encoding="utf-8")
        with pytest.raises(ValueError, match="No H1"):
            parse_entity_file(f)
    def test_missing_sections_return_empty(self, tmp_path):
        """Sections that are not present come back as empty strings."""
        f = tmp_path / "minimal.md"
        f.write_text(MINIMAL_ENTITY, encoding="utf-8")
        meta = parse_entity_file(f)
        # Optional sections not present → empty string
        assert meta.original_wording == ""
        assert meta.modern_interpretation == ""
    def test_word_count_accuracy(self, tmp_path):
        """The definition word count matches the number of words written."""
        f = tmp_path / "test.md"
        f.write_text(
            "# Test\n\n## Definition\n\none two three four five\n",
            encoding="utf-8",
        )
        meta = parse_entity_file(f)
        assert meta.definition_word_count == 5
 # ── parse_entity_directory ──────────────────────────────────────────
 class TestParseEntityDirectory:
    """``parse_entity_directory``: default/custom exclusions and malformed files.
    Fixtures are written as UTF-8 explicitly.  The parser reads UTF-8 and
    swallows per-file errors, so a locale-encoded fixture containing
    non-ASCII text would be skipped silently and distort the counts.
    """
    def _make_dir(self, tmp_path):
        """Create a temporary entity directory."""
        d = tmp_path / "entities"
        d.mkdir()
        (d / "entity-a.md").write_text(COMPLETE_ENTITY, encoding="utf-8")
        (d / "entity-b.md").write_text(MINIMAL_ENTITY, encoding="utf-8")
        # files that should be excluded by default
        (d / "book-1-chapter-01-entities.md").write_text(
            "# View\n\nview file", encoding="utf-8"
        )
        (d / "book-1-chapter-01-prompt.md").write_text(
            "# Prompt\n\nprompt file", encoding="utf-8"
        )
        return d
    def test_excludes_view_and_prompt(self, tmp_path):
        """With default patterns only the two real entity files are parsed."""
        d = self._make_dir(tmp_path)
        results = parse_entity_directory(d)
        slugs = {e.slug for e in results}
        assert "division_of_labour" in slugs
        assert "minimal_entity" in slugs
        # Excluded files should not be parsed as entities
        assert len(results) == 2
    def test_custom_exclude_patterns(self, tmp_path):
        """Custom patterns replace the defaults instead of extending them."""
        d = self._make_dir(tmp_path)
        # Only exclude prompt files, allow entity views
        results = parse_entity_directory(d, exclude_patterns=[r".*-prompt\.md$"])
        assert len(results) == 3  # entity-a, entity-b, chapter-01-entities
    def test_malformed_skipped_with_warning(self, tmp_path, caplog):
        """A file without H1 is skipped and its name appears in a warning."""
        d = tmp_path / "entities"
        d.mkdir()
        (d / "good.md").write_text(COMPLETE_ENTITY, encoding="utf-8")
        (d / "bad.md").write_text(NO_H1, encoding="utf-8")
        with caplog.at_level(logging.WARNING):
            results = parse_entity_directory(d)
        assert len(results) == 1
        assert "bad.md" in caplog.text
 # ── EntityMeta round-trip ───────────────────────────────────────────
 class TestEntityMetaRoundTrip:
    """``EntityMeta.to_dict`` / ``EntityMeta.from_dict`` round-tripping."""
    def test_to_dict_from_dict(self, tmp_path):
        """A parsed entity survives serialisation and deserialisation."""
        f = tmp_path / "entity.md"
        # Written as UTF-8 explicitly: the parser reads UTF-8 and the
        # fixture contains a non-ASCII character.
        f.write_text(COMPLETE_ENTITY, encoding="utf-8")
        original = parse_entity_file(f)
        data = original.to_dict()
        restored = EntityMeta.from_dict(data)
        assert restored.slug == original.slug
        assert restored.title == original.title
        assert restored.definition == original.definition
        assert restored.h1_is_title_case == original.h1_is_title_case
        assert restored.section_slugs == original.section_slugs
        assert restored.definition_word_count == original.definition_word_count
    def test_from_dict_ignores_unknown_keys(self):
        """Keys that are not dataclass fields are dropped, not set as attributes."""
        data = {
            "slug": "test",
            "title": "Test",
            "h1_raw": "Test",
            "unknown_field": "should be ignored",
        }
        meta = EntityMeta.from_dict(data)
        assert meta.slug == "test"
        assert not hasattr(meta, "unknown_field")