feat(infospace): add entity metadata parser (S1.1)

Extract section-tree algorithm from SchemaGenerator into standalone
core/section_tree.py and build markitect/infospace/ package with
EntityMeta dataclass and parse_entity_file/parse_entity_directory.
Foundation for schema compliance, coverage, and granularity metrics.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-19 00:27:45 +01:00
parent b5e994b014
commit 03c6c5e8de
9 changed files with 739 additions and 0 deletions

View File

@@ -9,6 +9,7 @@ This package contains the fundamental building blocks:
"""
from .parser import parse_markdown_to_ast
from .section_tree import build_section_tree, extract_section_text
from .serializer import ASTSerializer
from .document_manager import DocumentManager, CleanDocumentManager
from .workspace import (
@@ -29,6 +30,9 @@ from .workspace import (
__all__ = [
# Parser
"parse_markdown_to_ast",
# Section tree
"build_section_tree",
"extract_section_text",
# Serializer
"ASTSerializer",
# Document Manager

View File

@@ -0,0 +1,124 @@
"""
Standalone section-tree utilities extracted from SchemaGenerator.
Builds a hierarchical section tree from flat markdown-it AST tokens and
provides helpers for navigating heading structure and extracting text.
These functions are used by both the schema generator and the infospace
entity parser.
"""
import re
from typing import Any, Dict, List, Optional
def slugify(text: str) -> str:
    """Turn heading or label text into a slug usable as a JSON property key.

    German umlauts and ``ß`` are transliterated to ASCII digraphs, the
    result is lower-cased, every run of characters outside ``a-z0-9``
    collapses into one underscore, and leading/trailing underscores are
    removed.  When nothing is left, the placeholder ``'feld'`` is
    returned so callers never receive an empty key.
    """
    # Single-pass transliteration.  No replacement string contains a mapped
    # character, so this is equivalent to replacing them one after another.
    transliteration = str.maketrans({
        'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
        'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue', 'ß': 'ss',
    })
    ascii_text = text.translate(transliteration).lower()
    collapsed = re.sub(r'[^a-z0-9]+', '_', ascii_text).strip('_')
    if collapsed:
        return collapsed
    return 'feld'
def extract_heading_level(tag: str) -> int:
    """Return the numeric level encoded in a heading tag such as ``h2``.

    A tag qualifies only when it is exactly two characters long and
    starts with ``h``; the second character is converted with ``int``.
    Every other tag, or a second character ``int`` rejects, yields ``1``.
    """
    looks_like_heading = tag.startswith('h') and len(tag) == 2
    if not looks_like_heading:
        return 1
    try:
        level = int(tag[1])
    except ValueError:
        # e.g. "hr": right shape, but no number after the "h"
        level = 1
    return level
def extract_heading_content(tokens: List[Dict[str, Any]], start_index: int) -> str:
    """Return the text of the ``inline`` token that belongs to a heading.

    Inspects at most three tokens beginning at *start_index* (normally
    the position of a ``heading_open`` token) and returns the ``content``
    of the first one whose type is ``inline``.  Returns ``''`` when no
    such token lies inside that window.
    """
    window_end = min(start_index + 3, len(tokens))
    return next(
        (
            tokens[idx].get('content', '')
            for idx in range(start_index, window_end)
            if tokens[idx].get('type') == 'inline'
        ),
        '',
    )
def build_section_tree(
    tokens: List[Dict[str, Any]], max_depth: Optional[int] = None
) -> Dict[str, Any]:
    """
    Turn a flat markdown-it token list into a nested tree of sections.

    The returned root node (``heading`` ``None``, ``level`` ``0``, empty
    ``slug``) holds the top-level sections in ``children``.  Every node is
    a dict with these keys:

    - ``heading``: heading text (``None`` for the root)
    - ``level``: heading depth (``0`` for the root)
    - ``slug``: slugified heading text
    - ``content_tokens``: non-heading tokens that belong to the section
    - ``children``: nested sub-sections

    When *max_depth* is given, headings deeper than it create no node;
    their heading tokens are dropped and the tokens that follow are
    attached to the most recently opened section that was kept.
    """
    root: Dict[str, Any] = {
        'heading': None, 'level': 0, 'slug': '',
        'content_tokens': [], 'children': []
    }
    # Chain of currently open sections, outermost first; the root is
    # never popped.
    open_sections = [root]
    total = len(tokens)
    pos = 0

    while pos < total:
        current = tokens[pos]

        if current.get('type') != 'heading_open':
            # Ordinary content belongs to the innermost open section.
            open_sections[-1]['content_tokens'].append(current)
            pos += 1
            continue

        depth = extract_heading_level(current.get('tag', ''))
        title = extract_heading_content(tokens, pos)

        if max_depth is None or depth <= max_depth:
            node: Dict[str, Any] = {
                'heading': title,
                'level': depth,
                'slug': slugify(title),
                'content_tokens': [],
                'children': []
            }
            # Close sections at the same or a deeper level so the new node
            # attaches to its nearest shallower ancestor.
            while len(open_sections) > 1 and open_sections[-1]['level'] >= depth:
                open_sections.pop()
            open_sections[-1]['children'].append(node)
            open_sections.append(node)

        # Kept or skipped, the heading's own tokens are consumed here:
        # advance to the matching heading_close and step over it.
        pos += 1
        while pos < total and tokens[pos].get('type') != 'heading_close':
            pos += 1
        pos += 1

    return root
def extract_section_text(section: Dict[str, Any]) -> str:
    """
    Return the plain text held directly by a section node.

    Collects the ``content`` of every ``inline`` token in the section's
    ``content_tokens`` and joins the pieces with newlines.  Tokens of any
    other type are ignored, and the section's ``children`` are not
    visited.  A node without ``content_tokens`` yields ``''``.
    """
    inline_contents: List[str] = [
        tok.get('content', '')
        for tok in section.get('content_tokens', [])
        if tok.get('type') == 'inline'
    ]
    return '\n'.join(inline_contents)

View File

@@ -0,0 +1,15 @@
"""
Infospace analysis package.
Provides tooling for extracting structured metadata from entity markdown
files and analysing infospace collections.
"""
from .models import EntityMeta
from .entity_parser import parse_entity_file, parse_entity_directory
__all__ = [
"EntityMeta",
"parse_entity_file",
"parse_entity_directory",
]

View File

@@ -0,0 +1,176 @@
"""
Entity metadata parser.
Extracts structured :class:`EntityMeta` from entity markdown files
produced by the infospace entity-extraction pipeline.
"""
import logging
import re
from pathlib import Path
from typing import List, Optional, Sequence
from markitect.core.parser import parse_markdown_to_ast
from markitect.core.section_tree import (
build_section_tree,
extract_heading_content,
extract_heading_level,
extract_section_text,
slugify,
)
from .models import EntityMeta
logger = logging.getLogger(__name__)
# Sections we look for (slug → human-friendly label).
# Keys are the slugified form of the H2 heading text shown as the value.
# NOTE(review): this mapping is not referenced in the visible part of this
# module — confirm whether another module consumes it.
_KNOWN_SECTIONS = {
    "definition": "Definition",
    "source_chapter": "Source Chapter",
    "context": "Context",
    "economic_domain": "Economic Domain",
    "smith_s_original_wording": "Smith's Original Wording",
    "modern_interpretation": "Modern Interpretation",
}
# Default filename patterns to exclude from directory parsing.
# Each entry is a regular expression applied to the bare filename;
# these two cover files ending in "-entities.md" and "-prompt.md".
_DEFAULT_EXCLUDE_PATTERNS = (
    r".*-entities\.md$",
    r".*-prompt\.md$",
)
def _is_title_case(text: str) -> bool:
    """Report whether *text* is written in title case.

    Each whitespace-separated word is stripped of non-word characters and
    must then start with an upper-case character.  Common short words
    (articles, conjunctions, prepositions) may stay lower-case unless they
    are the first word.  Words that consist only of punctuation are
    ignored.  Text without any words is not title case.
    """
    # Words that are allowed to be lowercase in title case
    minor_words = {
        "a", "an", "the", "and", "but", "or", "nor", "for", "yet", "so",
        "in", "on", "at", "to", "by", "of", "up", "as", "is", "if",
    }
    candidates = text.split()
    if not candidates:
        return False

    for position, raw_word in enumerate(candidates):
        # Punctuation is removed before the check
        bare = re.sub(r"[^\w]", "", raw_word)
        if not bare:
            continue
        starts_upper = bare[0].isupper()
        # The exemption for minor words never applies to the first word.
        exempt = position != 0 and bare.lower() in minor_words
        if not (starts_upper or exempt):
            return False
    return True
def _word_count(text: str) -> int:
    """Return how many whitespace-separated words *text* contains."""
    words = text.split()
    return len(words)
def _find_h2_section(tree_root: dict, slug: str) -> Optional[dict]:
    """Return the first direct child of *tree_root* that is an H2 with *slug*.

    Only the node's immediate ``children`` are examined; deeper levels are
    not searched.  Returns ``None`` when no child matches.
    """
    candidates = (
        node
        for node in tree_root.get("children", [])
        if node["level"] == 2 and node["slug"] == slug
    )
    return next(candidates, None)
def parse_entity_file(path: Path) -> EntityMeta:
    """Read one entity markdown file and return its :class:`EntityMeta`.

    The file is decoded as UTF-8 and parsed into a section tree.  The
    first H1 supplies ``h1_raw``, ``title`` (the same text, unchanged) and
    ``slug``.  The H2 sections directly beneath that H1 supply the
    section texts; a section that is absent yields an empty string.
    ``total_word_count`` is computed over the raw file text, so markdown
    markers such as ``#`` are counted as words.

    Raises:
        ValueError: If the file has no H1 heading.
    """
    markdown = path.read_text(encoding="utf-8")
    tree = build_section_tree(parse_markdown_to_ast(markdown))

    # --- H1: entity title ---
    title_node = next(
        (node for node in tree["children"] if node["level"] == 1), None
    )
    if title_node is None:
        raise ValueError(f"No H1 heading found in {path}")
    heading = title_node["heading"]

    # The H1 node acts as the root for every H2 look-up below.
    h2_slugs = [
        node["slug"]
        for node in title_node.get("children", [])
        if node["level"] == 2
    ]

    def section_text(wanted_slug: str) -> str:
        """Return the stripped text of the named H2 section, or ``""``."""
        match = _find_h2_section(title_node, wanted_slug)
        return "" if match is None else extract_section_text(match).strip()

    definition_text = section_text("definition")
    wording_text = section_text("smith_s_original_wording")

    return EntityMeta(
        slug=slugify(heading),
        title=heading,
        h1_raw=heading,
        definition=definition_text,
        source_chapter=section_text("source_chapter"),
        context=section_text("context"),
        domain=section_text("economic_domain"),
        original_wording=wording_text,
        modern_interpretation=section_text("modern_interpretation"),
        h1_is_title_case=_is_title_case(heading),
        has_original_wording=bool(wording_text),
        definition_word_count=_word_count(definition_text),
        total_word_count=_word_count(markdown),
        section_slugs=h2_slugs,
        source_path=str(path),
    )
def parse_entity_directory(
    directory: Path,
    exclude_patterns: Optional[Sequence[str]] = None,
) -> List[EntityMeta]:
    """Parse every entity markdown file directly inside *directory*.

    ``*.md`` files are visited in sorted order.  A file is skipped when
    its name matches any regex in *exclude_patterns*; patterns are applied
    with ``re.match``, i.e. anchored at the start of the filename.  When
    *exclude_patterns* is ``None`` the defaults exclude chapter-view
    (``*-entities.md``) and prompt (``*-prompt.md``) files.

    A file that fails to parse is logged as a warning and left out of the
    result instead of aborting the whole run.
    """
    patterns = (
        _DEFAULT_EXCLUDE_PATTERNS if exclude_patterns is None else exclude_patterns
    )
    matchers = [re.compile(pattern).match for pattern in patterns]

    parsed: List[EntityMeta] = []
    for candidate in sorted(directory.glob("*.md")):
        if any(matches(candidate.name) for matches in matchers):
            continue
        try:
            meta = parse_entity_file(candidate)
        except Exception as exc:
            # Deliberate best-effort: one malformed file must not stop the scan.
            logger.warning("Skipping %s: %s", candidate.name, exc)
        else:
            parsed.append(meta)
    return parsed

View File

@@ -0,0 +1,53 @@
"""
Data models for infospace entity metadata.
"""
from dataclasses import dataclass, field, asdict
from typing import Any, Dict, List
@dataclass
class EntityMeta:
    """Structured metadata extracted from a single entity markdown file.

    The parser populates every field it can find; missing optional
    sections are left as empty strings (validation is a separate step).
    """

    # Identity
    slug: str
    title: str
    h1_raw: str  # verbatim H1 text before any normalisation

    # Section contents (plain text, empty string if section missing)
    definition: str = ""
    source_chapter: str = ""
    context: str = ""
    domain: str = ""
    original_wording: str = ""
    modern_interpretation: str = ""

    # Derived flags
    h1_is_title_case: bool = False
    has_original_wording: bool = False

    # Metrics-ready numbers
    definition_word_count: int = 0
    total_word_count: int = 0

    # All H2 section slugs found (preserves order)
    section_slugs: List[str] = field(default_factory=list)

    # Source file path (as string for serialisation)
    source_path: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dictionary.

        Returns:
            A new dict keyed by field name; list values are copies, so
            changing the result does not affect this instance.
        """
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EntityMeta":
        """Deserialise from a plain dictionary.

        Keys that are not dataclass fields are ignored.  List values are
        copied so the new instance never shares a mutable list with
        *data*.

        Raises:
            TypeError: If a required field (``slug``, ``title`` or
                ``h1_raw``) is missing from *data*.
        """
        known_fields = set(cls.__dataclass_fields__)
        filtered = {
            # Copy lists: without this, appending to the instance's
            # ``section_slugs`` would silently modify the caller's dict.
            key: list(value) if isinstance(value, list) else value
            for key, value in data.items()
            if key in known_fields
        }
        return cls(**filtered)

View File

View File

@@ -0,0 +1,137 @@
"""Tests for markitect.core.section_tree."""
from markitect.core.parser import parse_markdown_to_ast
from markitect.core.section_tree import (
build_section_tree,
extract_heading_content,
extract_heading_level,
extract_section_text,
slugify,
)
class TestSlugify:
    """Checks for ``slugify``: transliteration, separators and the fallback."""

    def test_simple_text(self):
        assert slugify("Hello World") == "hello_world"

    def test_german_umlauts(self):
        assert slugify("Ärger mit Über") == "aerger_mit_ueber"

    def test_special_characters(self):
        assert slugify("Smith's Original Wording") == "smith_s_original_wording"

    def test_empty_string(self):
        # Empty input falls back to the placeholder slug.
        assert slugify("") == "feld"

    def test_trailing_underscores_stripped(self):
        assert slugify("--hello--") == "hello"

    def test_multiple_spaces(self):
        # A run of several spaces must collapse into ONE underscore; a
        # single space would not exercise that.
        assert slugify("a   b") == "a_b"
class TestExtractHeadingLevel:
    """``extract_heading_level`` maps ``hN`` to ``N`` and other tags to 1."""

    def test_h1(self):
        level = extract_heading_level("h1")
        assert level == 1

    def test_h6(self):
        level = extract_heading_level("h6")
        assert level == 6

    def test_invalid_tag(self):
        # A non-heading tag falls back to level 1.
        level = extract_heading_level("p")
        assert level == 1

    def test_empty(self):
        level = extract_heading_level("")
        assert level == 1
class TestExtractHeadingContent:
    """``extract_heading_content`` returns the heading's inline text."""

    def test_finds_inline_token(self):
        heading_open = {"type": "heading_open", "tag": "h1"}
        inline = {"type": "inline", "content": "Hello"}
        heading_close = {"type": "heading_close", "tag": "h1"}

        found = extract_heading_content([heading_open, inline, heading_close], 0)

        assert found == "Hello"

    def test_no_inline(self):
        heading_open = {"type": "heading_open", "tag": "h1"}
        heading_close = {"type": "heading_close", "tag": "h1"}

        found = extract_heading_content([heading_open, heading_close], 0)

        # No inline token in the window: empty string.
        assert found == ""
class TestBuildSectionTree:
    """``build_section_tree`` applied to real parser output."""

    @staticmethod
    def _tree_for(markdown, **options):
        """Parse *markdown* and build its section tree."""
        return build_section_tree(parse_markdown_to_ast(markdown), **options)

    def test_single_heading(self):
        root = self._tree_for("# Title\n\nSome text.")
        assert root["level"] == 0
        assert len(root["children"]) == 1
        only_child = root["children"][0]
        assert only_child["heading"] == "Title"
        assert only_child["level"] == 1

    def test_nested_headings(self):
        root = self._tree_for("# Top\n\n## Sub\n\ntext\n\n## Sub2\n\nmore")
        top = root["children"][0]
        assert top["heading"] == "Top"
        assert len(top["children"]) == 2
        first_sub, second_sub = top["children"]
        assert first_sub["heading"] == "Sub"
        assert second_sub["heading"] == "Sub2"

    def test_max_depth(self):
        root = self._tree_for("# Top\n\n## Sub\n\n### Deep\n\ntext", max_depth=2)
        sub = root["children"][0]["children"][0]
        # H3 should be excluded from tree
        assert len(sub["children"]) == 0

    def test_content_tokens_captured(self):
        root = self._tree_for("# Title\n\nParagraph text here.")
        section = root["children"][0]
        inline_tokens = [
            tok for tok in section["content_tokens"] if tok.get("type") == "inline"
        ]
        assert len(inline_tokens) == 1
        assert "Paragraph text here" in inline_tokens[0]["content"]

    def test_slug_assigned(self):
        root = self._tree_for("# Economic Domain\n\ntext")
        assert root["children"][0]["slug"] == "economic_domain"

    def test_empty_document(self):
        root = self._tree_for("")
        assert root["children"] == []
class TestExtractSectionText:
    """``extract_section_text`` on parsed sections and a bare node."""

    @staticmethod
    def _first_section(markdown):
        """Parse *markdown* and return its first top-level section node."""
        root = build_section_tree(parse_markdown_to_ast(markdown))
        return root["children"][0]

    def test_simple_paragraph(self):
        section = self._first_section("# Title\n\nHello world.")
        assert extract_section_text(section) == "Hello world."

    def test_multiple_paragraphs(self):
        section = self._first_section(
            "# Title\n\nFirst paragraph.\n\nSecond paragraph."
        )
        text = extract_section_text(section)
        assert "First paragraph." in text
        assert "Second paragraph." in text

    def test_empty_section(self):
        bare_node = {"content_tokens": []}
        assert extract_section_text(bare_node) == ""

View File

View File

@@ -0,0 +1,230 @@
"""Tests for markitect.infospace.entity_parser and EntityMeta."""
import logging
from pathlib import Path
import pytest
from markitect.infospace import EntityMeta, parse_entity_file, parse_entity_directory
# ── Fixtures ────────────────────────────────────────────────────────
# Entity with a title-case H1 and all six H2 sections the tests read back.
COMPLETE_ENTITY = """\
# Division of Labour
## Definition
The separation of a work process into a number of distinct tasks, each performed
by a specialised worker, resulting in a significant increase in the productive
powers of labour.
## Source Chapter
Book I, Chapter 1: "Of the Division of Labour"
## Context
The division of labour is the central argument of the chapter.
## Economic Domain
Production
## Smith's Original Wording
"The greatest improvements in the productive powers of labour…"
## Modern Interpretation
The division of labour remains a foundational concept in economics.
"""
# Entity without the "Smith's Original Wording" and "Modern Interpretation"
# sections.
MINIMAL_ENTITY = """\
# Minimal Entity
## Definition
A brief definition.
## Source Chapter
Book I, Chapter 1
## Context
Some context.
## Economic Domain
Exchange
"""
# Entity whose H1 is a lower-case, hyphenated slug rather than a title.
SLUG_H1_ENTITY = """\
# effectual-demand
## Definition
Effectual demand is the demand by consumers who are willing and able to pay.
## Source Chapter
Book 1, Chapter 7
## Context
Context for effectual demand.
## Economic Domain
Exchange
## Smith's Original Wording
"Such people may be called the effectual demanders…"
## Modern Interpretation
Represents the intersection of desire and purchasing power.
"""
# Malformed document: starts at H2, so there is no H1 to use as a title.
NO_H1 = """\
## Only H2
Some content.
"""
# ── parse_entity_file ────────────────────────────────────────────────
class TestParseEntityFile:
    """``parse_entity_file`` on individual fixture documents.

    Fixture files are written as UTF-8 explicitly.  The fixtures contain
    the non-ASCII character "…", and ``Path.write_text`` otherwise uses
    the locale encoding, which can produce bytes that are not valid UTF-8.
    """

    def test_complete_entity(self, tmp_path):
        f = tmp_path / "division-of-labour.md"
        f.write_text(COMPLETE_ENTITY, encoding="utf-8")
        meta = parse_entity_file(f)
        assert meta.slug == "division_of_labour"
        assert meta.title == "Division of Labour"
        assert meta.h1_is_title_case is True
        assert meta.has_original_wording is True
        assert meta.domain == "Production"
        assert meta.definition_word_count > 20
        assert "separation" in meta.definition.lower()
        assert meta.source_path == str(f)
        assert "definition" in meta.section_slugs
        assert "smith_s_original_wording" in meta.section_slugs

    def test_minimal_entity(self, tmp_path):
        f = tmp_path / "minimal-entity.md"
        f.write_text(MINIMAL_ENTITY, encoding="utf-8")
        meta = parse_entity_file(f)
        assert meta.slug == "minimal_entity"
        assert meta.has_original_wording is False
        assert meta.original_wording == ""
        assert meta.modern_interpretation == ""
        assert meta.domain == "Exchange"

    def test_slug_format_h1(self, tmp_path):
        f = tmp_path / "effectual-demand.md"
        f.write_text(SLUG_H1_ENTITY, encoding="utf-8")
        meta = parse_entity_file(f)
        # The H1 is kept verbatim; only the slug is normalised.
        assert meta.h1_raw == "effectual-demand"
        assert meta.h1_is_title_case is False
        assert meta.slug == "effectual_demand"
        assert meta.has_original_wording is True

    def test_missing_h1_raises(self, tmp_path):
        f = tmp_path / "no-h1.md"
        f.write_text(NO_H1, encoding="utf-8")
        with pytest.raises(ValueError, match="No H1"):
            parse_entity_file(f)

    def test_missing_sections_return_empty(self, tmp_path):
        f = tmp_path / "minimal.md"
        f.write_text(MINIMAL_ENTITY, encoding="utf-8")
        meta = parse_entity_file(f)
        # Optional sections not present → empty string
        assert meta.original_wording == ""
        assert meta.modern_interpretation == ""

    def test_word_count_accuracy(self, tmp_path):
        f = tmp_path / "test.md"
        f.write_text(
            "# Test\n\n## Definition\n\none two three four five\n",
            encoding="utf-8",
        )
        meta = parse_entity_file(f)
        assert meta.definition_word_count == 5
# ── parse_entity_directory ──────────────────────────────────────────
class TestParseEntityDirectory:
    """``parse_entity_directory``: exclusion patterns and malformed files.

    Fixture files are written as UTF-8 explicitly because the fixtures
    contain "…" and ``Path.write_text`` otherwise uses the locale
    encoding.
    """

    def _make_dir(self, tmp_path):
        """Create a temporary entity directory.

        It holds two entity files plus one chapter-view file and one
        prompt file, both of which the default patterns exclude.
        """
        d = tmp_path / "entities"
        d.mkdir()
        (d / "entity-a.md").write_text(COMPLETE_ENTITY, encoding="utf-8")
        (d / "entity-b.md").write_text(MINIMAL_ENTITY, encoding="utf-8")
        # files that should be excluded by default
        (d / "book-1-chapter-01-entities.md").write_text(
            "# View\n\nview file", encoding="utf-8"
        )
        (d / "book-1-chapter-01-prompt.md").write_text(
            "# Prompt\n\nprompt file", encoding="utf-8"
        )
        return d

    def test_excludes_view_and_prompt(self, tmp_path):
        d = self._make_dir(tmp_path)
        results = parse_entity_directory(d)
        slugs = {e.slug for e in results}
        assert "division_of_labour" in slugs
        assert "minimal_entity" in slugs
        # Excluded files should not be parsed as entities
        assert len(results) == 2

    def test_custom_exclude_patterns(self, tmp_path):
        d = self._make_dir(tmp_path)
        # Only exclude prompt files, allow entity views
        results = parse_entity_directory(d, exclude_patterns=[r".*-prompt\.md$"])
        assert len(results) == 3  # entity-a, entity-b, chapter-01-entities

    def test_malformed_skipped_with_warning(self, tmp_path, caplog):
        d = tmp_path / "entities"
        d.mkdir()
        (d / "good.md").write_text(COMPLETE_ENTITY, encoding="utf-8")
        (d / "bad.md").write_text(NO_H1, encoding="utf-8")
        with caplog.at_level(logging.WARNING):
            results = parse_entity_directory(d)
        # The malformed file is skipped and named in the warning.
        assert len(results) == 1
        assert "bad.md" in caplog.text
# ── EntityMeta round-trip ───────────────────────────────────────────
class TestEntityMetaRoundTrip:
    """``EntityMeta.to_dict`` / ``EntityMeta.from_dict`` round-trips."""

    def test_to_dict_from_dict(self, tmp_path):
        f = tmp_path / "entity.md"
        # Explicit UTF-8: the fixture contains "…" and write_text would
        # otherwise use the locale encoding.
        f.write_text(COMPLETE_ENTITY, encoding="utf-8")
        original = parse_entity_file(f)
        data = original.to_dict()
        restored = EntityMeta.from_dict(data)
        assert restored.slug == original.slug
        assert restored.title == original.title
        assert restored.definition == original.definition
        assert restored.h1_is_title_case == original.h1_is_title_case
        assert restored.section_slugs == original.section_slugs
        assert restored.definition_word_count == original.definition_word_count

    def test_from_dict_ignores_unknown_keys(self):
        data = {
            "slug": "test",
            "title": "Test",
            "h1_raw": "Test",
            "unknown_field": "should be ignored",
        }
        meta = EntityMeta.from_dict(data)
        assert meta.slug == "test"
        # The unknown key must not end up as an instance attribute.
        assert not hasattr(meta, "unknown_field") or "unknown_field" not in meta.__dict__