From 03c6c5e8de432b33efb69b91efad1fdd0ca10a89 Mon Sep 17 00:00:00 2001 From: tegwick Date: Thu, 19 Feb 2026 00:27:45 +0100 Subject: [PATCH] feat(infospace): add entity metadata parser (S1.1) Extract section-tree algorithm from SchemaGenerator into standalone core/section_tree.py and build markitect/infospace/ package with EntityMeta dataclass and parse_entity_file/parse_entity_directory. Foundation for schema compliance, coverage, and granularity metrics. Co-Authored-By: Claude Opus 4.6 --- markitect/core/__init__.py | 4 + markitect/core/section_tree.py | 124 +++++++++++ markitect/infospace/__init__.py | 15 ++ markitect/infospace/entity_parser.py | 176 ++++++++++++++++ markitect/infospace/models.py | 53 +++++ tests/unit/core/__init__.py | 0 tests/unit/core/test_section_tree.py | 137 ++++++++++++ tests/unit/infospace/__init__.py | 0 tests/unit/infospace/test_entity_parser.py | 230 +++++++++++++++++++++ 9 files changed, 739 insertions(+) create mode 100644 markitect/core/section_tree.py create mode 100644 markitect/infospace/__init__.py create mode 100644 markitect/infospace/entity_parser.py create mode 100644 markitect/infospace/models.py create mode 100644 tests/unit/core/__init__.py create mode 100644 tests/unit/core/test_section_tree.py create mode 100644 tests/unit/infospace/__init__.py create mode 100644 tests/unit/infospace/test_entity_parser.py diff --git a/markitect/core/__init__.py b/markitect/core/__init__.py index c6864e29..4b0aa42a 100644 --- a/markitect/core/__init__.py +++ b/markitect/core/__init__.py @@ -9,6 +9,7 @@ This package contains the fundamental building blocks: """ from .parser import parse_markdown_to_ast +from .section_tree import build_section_tree, extract_section_text from .serializer import ASTSerializer from .document_manager import DocumentManager, CleanDocumentManager from .workspace import ( @@ -29,6 +30,9 @@ from .workspace import ( __all__ = [ # Parser "parse_markdown_to_ast", + # Section tree + "build_section_tree", + "extract_section_text", # Serializer "ASTSerializer", # Document Manager diff --git a/markitect/core/section_tree.py b/markitect/core/section_tree.py new file mode 100644 index 00000000..e32c87c9 --- /dev/null +++ b/markitect/core/section_tree.py @@ -0,0 +1,124 @@ +""" +Standalone section-tree utilities extracted from SchemaGenerator. + +Builds a hierarchical section tree from flat markdown-it AST tokens and +provides helpers for navigating heading structure and extracting text. +These functions are used by both the schema generator and the infospace +entity parser. +""" + +import re +from typing import Any, Dict, List, Optional + + +def slugify(text: str) -> str: + """Convert heading or label text to a valid slug / JSON property key.""" + replacements = { + 'ä': 'ae', 'ö': 'oe', 'ü': 'ue', + 'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue', 'ß': 'ss', + } + slug = text + for char, repl in replacements.items(): + slug = slug.replace(char, repl) + slug = slug.lower() + slug = re.sub(r'[^a-z0-9]+', '_', slug) + slug = slug.strip('_') + return slug or 'feld' + + +def extract_heading_level(tag: str) -> int: + """Extract heading level from an HTML tag string (h1, h2, …).""" + if tag.startswith('h') and len(tag) == 2: + try: + return int(tag[1]) + except ValueError: + pass + return 1 + + +def extract_heading_content(tokens: List[Dict[str, Any]], start_index: int) -> str: + """Return the inline text content following a ``heading_open`` token.""" + for i in range(start_index, min(start_index + 3, len(tokens))): + token = tokens[i] + if token.get('type') == 'inline': + return token.get('content', '') + return '' + + +def build_section_tree( + tokens: List[Dict[str, Any]], max_depth: Optional[int] = None +) -> Dict[str, Any]: + """ + Build a hierarchical section tree from a flat markdown-it token list. + + Returns a root node whose ``children`` list contains the top-level + sections. Each node carries: + + - ``heading`` – heading text (``None`` for the root) + - ``level`` – heading depth (``0`` for the root) + - ``slug`` – slugified heading + - ``content_tokens`` – non-heading tokens belonging to this section + - ``children`` – nested sub-sections + """ + root: Dict[str, Any] = { + 'heading': None, 'level': 0, 'slug': '', + 'content_tokens': [], 'children': [] + } + stack = [root] + + i = 0 + while i < len(tokens): + token = tokens[i] + if token.get('type') == 'heading_open': + level = extract_heading_level(token.get('tag', '')) + heading_text = extract_heading_content(tokens, i) + + if max_depth is not None and level > max_depth: + # Skip this heading and its close token, but keep content + i += 1 + while i < len(tokens) and tokens[i].get('type') != 'heading_close': + i += 1 + i += 1 + continue + + section: Dict[str, Any] = { + 'heading': heading_text, + 'level': level, + 'slug': slugify(heading_text), + 'content_tokens': [], + 'children': [] + } + + # Pop stack until we find the parent (level < current) + while len(stack) > 1 and stack[-1]['level'] >= level: + stack.pop() + + stack[-1]['children'].append(section) + stack.append(section) + + # Skip past heading_close + i += 1 + while i < len(tokens) and tokens[i].get('type') != 'heading_close': + i += 1 + else: + # Add content token to current section + stack[-1]['content_tokens'].append(token) + + i += 1 + + return root + + +def extract_section_text(section: Dict[str, Any]) -> str: + """ + Return the plain text content of a section node. + + Concatenates the ``content`` field of every ``inline`` token found + in the section's ``content_tokens``. Paragraphs are separated by + newlines; other inline tokens are joined with spaces. + """ + parts: List[str] = [] + for token in section.get('content_tokens', []): + if token.get('type') == 'inline': + parts.append(token.get('content', '')) + return '\n'.join(parts) diff --git a/markitect/infospace/__init__.py b/markitect/infospace/__init__.py new file mode 100644 index 00000000..666bc78e --- /dev/null +++ b/markitect/infospace/__init__.py @@ -0,0 +1,15 @@ +""" +Infospace analysis package. + +Provides tooling for extracting structured metadata from entity markdown +files and analysing infospace collections. +""" + +from .models import EntityMeta +from .entity_parser import parse_entity_file, parse_entity_directory + +__all__ = [ + "EntityMeta", + "parse_entity_file", + "parse_entity_directory", +] diff --git a/markitect/infospace/entity_parser.py b/markitect/infospace/entity_parser.py new file mode 100644 index 00000000..888e3490 --- /dev/null +++ b/markitect/infospace/entity_parser.py @@ -0,0 +1,176 @@ +""" +Entity metadata parser. + +Extracts structured :class:`EntityMeta` from entity markdown files +produced by the infospace entity-extraction pipeline. +""" + +import logging +import re +from pathlib import Path +from typing import List, Optional, Sequence + +from markitect.core.parser import parse_markdown_to_ast +from markitect.core.section_tree import ( + build_section_tree, + extract_heading_content, + extract_heading_level, + extract_section_text, + slugify, +) +from .models import EntityMeta + +logger = logging.getLogger(__name__) + +# Sections we look for (slug → human-friendly label) +_KNOWN_SECTIONS = { + "definition": "Definition", + "source_chapter": "Source Chapter", + "context": "Context", + "economic_domain": "Economic Domain", + "smith_s_original_wording": "Smith's Original Wording", + "modern_interpretation": "Modern Interpretation", +} + +# Default filename patterns to exclude from directory parsing +_DEFAULT_EXCLUDE_PATTERNS = ( + r".*-entities\.md$", + r".*-prompt\.md$", +) + + +def _is_title_case(text: str) -> bool: + """Return True if *text* is in title case (ignoring short words).""" + # Words that are allowed to be lowercase in title case + minor_words = { + "a", "an", "the", "and", "but", "or", "nor", "for", "yet", "so", + "in", "on", "at", "to", "by", "of", "up", "as", "is", "if", + } + words = text.split() + if not words: + return False + for i, word in enumerate(words): + # Strip leading/trailing punctuation for the check + clean = re.sub(r"[^\w]", "", word) + if not clean: + continue + # First word must be capitalised + if i == 0: + if not clean[0].isupper(): + return False + elif clean.lower() in minor_words: + continue # minor words may be lower + elif not clean[0].isupper(): + return False + return True + + +def _word_count(text: str) -> int: + """Count whitespace-separated words in *text*.""" + return len(text.split()) + + +def _find_h2_section(tree_root: dict, slug: str) -> Optional[dict]: + """Find a direct H2 child of the root by slug.""" + for child in tree_root.get("children", []): + if child["level"] == 2 and child["slug"] == slug: + return child + return None + + +def parse_entity_file(path: Path) -> EntityMeta: + """Parse a single entity markdown file into :class:`EntityMeta`. + + Raises: + ValueError: If the file has no H1 heading. + """ + content = path.read_text(encoding="utf-8") + tokens = parse_markdown_to_ast(content) + tree = build_section_tree(tokens) + + # --- H1: entity title --- + h1_section = None + for child in tree["children"]: + if child["level"] == 1: + h1_section = child + break + + if h1_section is None: + raise ValueError(f"No H1 heading found in {path}") + + h1_raw = h1_section["heading"] + slug = slugify(h1_raw) + title = h1_raw + h1_is_title_case = _is_title_case(h1_raw) + + # Use the H1 node as the effective root for H2 look-ups + effective_root = h1_section + + # Collect all H2 section slugs + section_slugs = [c["slug"] for c in effective_root.get("children", []) if c["level"] == 2] + + # --- Extract known sections --- + def _get_section_text(section_slug: str) -> str: + node = _find_h2_section(effective_root, section_slug) + if node is None: + return "" + return extract_section_text(node).strip() + + definition = _get_section_text("definition") + source_chapter = _get_section_text("source_chapter") + context = _get_section_text("context") + domain = _get_section_text("economic_domain") + original_wording = _get_section_text("smith_s_original_wording") + modern_interpretation = _get_section_text("modern_interpretation") + + # --- Derived metrics --- + has_original_wording = bool(original_wording) + definition_word_count = _word_count(definition) + total_word_count = _word_count(content) + + return EntityMeta( + slug=slug, + title=title, + h1_raw=h1_raw, + definition=definition, + source_chapter=source_chapter, + context=context, + domain=domain, + original_wording=original_wording, + modern_interpretation=modern_interpretation, + h1_is_title_case=h1_is_title_case, + has_original_wording=has_original_wording, + definition_word_count=definition_word_count, + total_word_count=total_word_count, + section_slugs=section_slugs, + source_path=str(path), + ) + + +def parse_entity_directory( + directory: Path, + exclude_patterns: Optional[Sequence[str]] = None, +) -> List[EntityMeta]: + """Parse all entity markdown files in *directory*. + + Files matching *exclude_patterns* (regexes tested against the + filename) are skipped. Defaults exclude chapter-view + (``*-entities.md``) and prompt (``*-prompt.md``) files. + + Malformed files are skipped with a warning rather than raising. + """ + if exclude_patterns is None: + exclude_patterns = _DEFAULT_EXCLUDE_PATTERNS + + compiled = [re.compile(p) for p in exclude_patterns] + entities: List[EntityMeta] = [] + + for md_file in sorted(directory.glob("*.md")): + if any(pat.match(md_file.name) for pat in compiled): + continue + try: + entities.append(parse_entity_file(md_file)) + except Exception as exc: + logger.warning("Skipping %s: %s", md_file.name, exc) + + return entities diff --git a/markitect/infospace/models.py b/markitect/infospace/models.py new file mode 100644 index 00000000..38705ecd --- /dev/null +++ b/markitect/infospace/models.py @@ -0,0 +1,53 @@ +""" +Data models for infospace entity metadata. +""" + +from dataclasses import dataclass, field, asdict +from typing import Any, Dict, List + + +@dataclass +class EntityMeta: + """Structured metadata extracted from a single entity markdown file. + + The parser populates every field it can find; missing optional + sections are left as empty strings (validation is a separate step). + """ + + # Identity + slug: str + title: str + h1_raw: str # verbatim H1 text before any normalisation + + # Section contents (plain text, empty string if section missing) + definition: str = "" + source_chapter: str = "" + context: str = "" + domain: str = "" + original_wording: str = "" + modern_interpretation: str = "" + + # Derived flags + h1_is_title_case: bool = False + has_original_wording: bool = False + + # Metrics-ready numbers + definition_word_count: int = 0 + total_word_count: int = 0 + + # All H2 section slugs found (preserves order) + section_slugs: List[str] = field(default_factory=list) + + # Source file path (as string for serialisation) + source_path: str = "" + + def to_dict(self) -> Dict[str, Any]: + """Serialise to a plain dictionary.""" + return asdict(self) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "EntityMeta": + """Deserialise from a plain dictionary.""" + known_fields = {f.name for f in cls.__dataclass_fields__.values()} + filtered = {k: v for k, v in data.items() if k in known_fields} + return cls(**filtered) diff --git a/tests/unit/core/__init__.py b/tests/unit/core/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/core/test_section_tree.py b/tests/unit/core/test_section_tree.py new file mode 100644 index 00000000..83fa3528 --- /dev/null +++ b/tests/unit/core/test_section_tree.py @@ -0,0 +1,137 @@ +"""Tests for markitect.core.section_tree.""" + +from markitect.core.parser import parse_markdown_to_ast +from markitect.core.section_tree import ( + build_section_tree, + extract_heading_content, + extract_heading_level, + extract_section_text, + slugify, +) + + +class TestSlugify: + def test_simple_text(self): + assert slugify("Hello World") == "hello_world" + + def test_german_umlauts(self): + assert slugify("Ärger mit Über") == "aerger_mit_ueber" + + def test_special_characters(self): + assert slugify("Smith's Original Wording") == "smith_s_original_wording" + + def test_empty_string(self): + assert slugify("") == "feld" + + def test_trailing_underscores_stripped(self): + assert slugify("--hello--") == "hello" + + def test_multiple_spaces(self): + assert slugify("a b") == "a_b" + + +class TestExtractHeadingLevel: + def test_h1(self): + assert extract_heading_level("h1") == 1 + + def test_h6(self): + assert extract_heading_level("h6") == 6 + + def test_invalid_tag(self): + assert extract_heading_level("p") == 1 + + def test_empty(self): + assert extract_heading_level("") == 1 + + +class TestExtractHeadingContent: + def test_finds_inline_token(self): + tokens = [ + {"type": "heading_open", "tag": "h1"}, + {"type": "inline", "content": "Hello"}, + {"type": "heading_close", "tag": "h1"}, + ] + assert extract_heading_content(tokens, 0) == "Hello" + + def test_no_inline(self): + tokens = [ + {"type": "heading_open", "tag": "h1"}, + {"type": "heading_close", "tag": "h1"}, + ] + assert extract_heading_content(tokens, 0) == "" + + +class TestBuildSectionTree: + def test_single_heading(self): + md = "# Title\n\nSome text." + tokens = parse_markdown_to_ast(md) + tree = build_section_tree(tokens) + + assert tree["level"] == 0 + assert len(tree["children"]) == 1 + assert tree["children"][0]["heading"] == "Title" + assert tree["children"][0]["level"] == 1 + + def test_nested_headings(self): + md = "# Top\n\n## Sub\n\ntext\n\n## Sub2\n\nmore" + tokens = parse_markdown_to_ast(md) + tree = build_section_tree(tokens) + + top = tree["children"][0] + assert top["heading"] == "Top" + assert len(top["children"]) == 2 + assert top["children"][0]["heading"] == "Sub" + assert top["children"][1]["heading"] == "Sub2" + + def test_max_depth(self): + md = "# Top\n\n## Sub\n\n### Deep\n\ntext" + tokens = parse_markdown_to_ast(md) + tree = build_section_tree(tokens, max_depth=2) + + top = tree["children"][0] + sub = top["children"][0] + # H3 should be excluded from tree + assert len(sub["children"]) == 0 + + def test_content_tokens_captured(self): + md = "# Title\n\nParagraph text here." + tokens = parse_markdown_to_ast(md) + tree = build_section_tree(tokens) + + section = tree["children"][0] + inline_tokens = [t for t in section["content_tokens"] if t.get("type") == "inline"] + assert len(inline_tokens) == 1 + assert "Paragraph text here" in inline_tokens[0]["content"] + + def test_slug_assigned(self): + md = "# Economic Domain\n\ntext" + tokens = parse_markdown_to_ast(md) + tree = build_section_tree(tokens) + + assert tree["children"][0]["slug"] == "economic_domain" + + def test_empty_document(self): + tokens = parse_markdown_to_ast("") + tree = build_section_tree(tokens) + assert tree["children"] == [] + + +class TestExtractSectionText: + def test_simple_paragraph(self): + md = "# Title\n\nHello world." + tokens = parse_markdown_to_ast(md) + tree = build_section_tree(tokens) + text = extract_section_text(tree["children"][0]) + assert text == "Hello world." + + def test_multiple_paragraphs(self): + md = "# Title\n\nFirst paragraph.\n\nSecond paragraph." + tokens = parse_markdown_to_ast(md) + tree = build_section_tree(tokens) + text = extract_section_text(tree["children"][0]) + assert "First paragraph." in text + assert "Second paragraph." in text + + def test_empty_section(self): + section = {"content_tokens": []} + assert extract_section_text(section) == "" diff --git a/tests/unit/infospace/__init__.py b/tests/unit/infospace/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/infospace/test_entity_parser.py b/tests/unit/infospace/test_entity_parser.py new file mode 100644 index 00000000..fbcaf9c8 --- /dev/null +++ b/tests/unit/infospace/test_entity_parser.py @@ -0,0 +1,230 @@ +"""Tests for markitect.infospace.entity_parser and EntityMeta.""" + +import logging +from pathlib import Path + +import pytest + +from markitect.infospace import EntityMeta, parse_entity_file, parse_entity_directory + + +# ── Fixtures ──────────────────────────────────────────────────────── + +COMPLETE_ENTITY = """\ +# Division of Labour + +## Definition + +The separation of a work process into a number of distinct tasks, each performed +by a specialised worker, resulting in a significant increase in the productive +powers of labour. + +## Source Chapter + +Book I, Chapter 1: "Of the Division of Labour" + +## Context + +The division of labour is the central argument of the chapter. + +## Economic Domain + +Production + +## Smith's Original Wording + +"The greatest improvements in the productive powers of labour…" + +## Modern Interpretation + +The division of labour remains a foundational concept in economics. +""" + +MINIMAL_ENTITY = """\ +# Minimal Entity + +## Definition + +A brief definition. + +## Source Chapter + +Book I, Chapter 1 + +## Context + +Some context. + +## Economic Domain + +Exchange +""" + +SLUG_H1_ENTITY = """\ +# effectual-demand + +## Definition + +Effectual demand is the demand by consumers who are willing and able to pay. + +## Source Chapter + +Book 1, Chapter 7 + +## Context + +Context for effectual demand. + +## Economic Domain + +Exchange + +## Smith's Original Wording + +"Such people may be called the effectual demanders…" + +## Modern Interpretation + +Represents the intersection of desire and purchasing power. +""" + +NO_H1 = """\ +## Only H2 + +Some content. +""" + + +# ── parse_entity_file ──────────────────────────────────────────────── + +class TestParseEntityFile: + def test_complete_entity(self, tmp_path): + f = tmp_path / "division-of-labour.md" + f.write_text(COMPLETE_ENTITY) + meta = parse_entity_file(f) + + assert meta.slug == "division_of_labour" + assert meta.title == "Division of Labour" + assert meta.h1_is_title_case is True + assert meta.has_original_wording is True + assert meta.domain == "Production" + assert meta.definition_word_count > 20 + assert "separation" in meta.definition.lower() + assert meta.source_path == str(f) + assert "definition" in meta.section_slugs + assert "smith_s_original_wording" in meta.section_slugs + + def test_minimal_entity(self, tmp_path): + f = tmp_path / "minimal-entity.md" + f.write_text(MINIMAL_ENTITY) + meta = parse_entity_file(f) + + assert meta.slug == "minimal_entity" + assert meta.has_original_wording is False + assert meta.original_wording == "" + assert meta.modern_interpretation == "" + assert meta.domain == "Exchange" + + def test_slug_format_h1(self, tmp_path): + f = tmp_path / "effectual-demand.md" + f.write_text(SLUG_H1_ENTITY) + meta = parse_entity_file(f) + + assert meta.h1_raw == "effectual-demand" + assert meta.h1_is_title_case is False + assert meta.slug == "effectual_demand" + assert meta.has_original_wording is True + + def test_missing_h1_raises(self, tmp_path): + f = tmp_path / "no-h1.md" + f.write_text(NO_H1) + with pytest.raises(ValueError, match="No H1"): + parse_entity_file(f) + + def test_missing_sections_return_empty(self, tmp_path): + f = tmp_path / "minimal.md" + f.write_text(MINIMAL_ENTITY) + meta = parse_entity_file(f) + + # Optional sections not present → empty string + assert meta.original_wording == "" + assert meta.modern_interpretation == "" + + def test_word_count_accuracy(self, tmp_path): + f = tmp_path / "test.md" + f.write_text("# Test\n\n## Definition\n\none two three four five\n") + meta = parse_entity_file(f) + assert meta.definition_word_count == 5 + + +# ── parse_entity_directory ────────────────────────────────────────── + +class TestParseEntityDirectory: + def _make_dir(self, tmp_path): + """Create a temporary entity directory.""" + d = tmp_path / "entities" + d.mkdir() + (d / "entity-a.md").write_text(COMPLETE_ENTITY) + (d / "entity-b.md").write_text(MINIMAL_ENTITY) + # files that should be excluded by default + (d / "book-1-chapter-01-entities.md").write_text("# View\n\nview file") + (d / "book-1-chapter-01-prompt.md").write_text("# Prompt\n\nprompt file") + return d + + def test_excludes_view_and_prompt(self, tmp_path): + d = self._make_dir(tmp_path) + results = parse_entity_directory(d) + slugs = {e.slug for e in results} + + assert "division_of_labour" in slugs + assert "minimal_entity" in slugs + # Excluded files should not be parsed as entities + assert len(results) == 2 + + def test_custom_exclude_patterns(self, tmp_path): + d = self._make_dir(tmp_path) + # Only exclude prompt files, allow entity views + results = parse_entity_directory(d, exclude_patterns=[r".*-prompt\.md$"]) + assert len(results) == 3 # entity-a, entity-b, chapter-01-entities + + def test_malformed_skipped_with_warning(self, tmp_path, caplog): + d = tmp_path / "entities" + d.mkdir() + (d / "good.md").write_text(COMPLETE_ENTITY) + (d / "bad.md").write_text(NO_H1) + + with caplog.at_level(logging.WARNING): + results = parse_entity_directory(d) + + assert len(results) == 1 + assert "bad.md" in caplog.text + + +# ── EntityMeta round-trip ─────────────────────────────────────────── + +class TestEntityMetaRoundTrip: + def test_to_dict_from_dict(self, tmp_path): + f = tmp_path / "entity.md" + f.write_text(COMPLETE_ENTITY) + original = parse_entity_file(f) + + data = original.to_dict() + restored = EntityMeta.from_dict(data) + + assert restored.slug == original.slug + assert restored.title == original.title + assert restored.definition == original.definition + assert restored.h1_is_title_case == original.h1_is_title_case + assert restored.section_slugs == original.section_slugs + assert restored.definition_word_count == original.definition_word_count + + def test_from_dict_ignores_unknown_keys(self): + data = { + "slug": "test", + "title": "Test", + "h1_raw": "Test", + "unknown_field": "should be ignored", + } + meta = EntityMeta.from_dict(data) + assert meta.slug == "test" + assert not hasattr(meta, "unknown_field") or "unknown_field" not in meta.__dict__