feat(infospace): add entity metadata parser (S1.1)

Extract section-tree algorithm from SchemaGenerator into standalone
core/section_tree.py and build markitect/infospace/ package with
EntityMeta dataclass and parse_entity_file/parse_entity_directory.
Foundation for schema compliance, coverage, and granularity metrics.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-19 00:27:45 +01:00
parent b5e994b014
commit 03c6c5e8de
9 changed files with 739 additions and 0 deletions

View File

@@ -9,6 +9,7 @@ This package contains the fundamental building blocks:
"""
from .parser import parse_markdown_to_ast
from .section_tree import build_section_tree, extract_section_text
from .serializer import ASTSerializer
from .document_manager import DocumentManager, CleanDocumentManager
from .workspace import (
@@ -29,6 +30,9 @@ from .workspace import (
__all__ = [
# Parser
"parse_markdown_to_ast",
# Section tree
"build_section_tree",
"extract_section_text",
# Serializer
"ASTSerializer",
# Document Manager

View File

@@ -0,0 +1,124 @@
"""
Standalone section-tree utilities extracted from SchemaGenerator.
Builds a hierarchical section tree from flat markdown-it AST tokens and
provides helpers for navigating heading structure and extracting text.
These functions are used by both the schema generator and the infospace
entity parser.
"""
import re
from typing import Any, Dict, List, Optional
def slugify(text: str) -> str:
    """Turn heading or label text into a slug usable as a JSON property key.

    German umlauts and the sharp s are transliterated to their ASCII
    digraphs first, then the text is lower-cased, every run of characters
    outside ``a-z0-9`` collapses to a single underscore, and leading or
    trailing underscores are removed.

    Args:
        text: Arbitrary heading or label text.

    Returns:
        The slug, or the literal ``'feld'`` when nothing usable remains
        (for example for an empty or punctuation-only input).
    """
    # One translation pass is equivalent to the chained replacements because
    # none of the replacement strings contains a character that is itself
    # a key of the table.
    transliteration = str.maketrans({
        'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
        'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue', 'ß': 'ss',
    })
    lowered = text.translate(transliteration).lower()
    candidate = re.sub(r'[^a-z0-9]+', '_', lowered).strip('_')
    if not candidate:
        return 'feld'
    return candidate
def extract_heading_level(tag: str) -> int:
    """Return the numeric heading level encoded in a tag such as ``h2``.

    Args:
        tag: Tag string taken from a heading token.

    Returns:
        The integer after the leading ``h`` when *tag* is exactly two
        characters long and that second character parses as an integer;
        otherwise ``1``.
    """
    fallback_level = 1
    if not tag.startswith('h') or len(tag) != 2:
        return fallback_level
    try:
        parsed_level = int(tag[1])
    except ValueError:
        # Second character is not a digit (e.g. "hr"): use the fallback.
        return fallback_level
    return parsed_level
def extract_heading_content(tokens: List[Dict[str, Any]], start_index: int) -> str:
    """Return the text of the first ``inline`` token near *start_index*.

    Only the three positions ``start_index`` .. ``start_index + 2`` are
    inspected (fewer when the token list ends earlier).

    Args:
        tokens: Flat token list; each token is a dict.
        start_index: Index at which to start looking, normally the
            position of a ``heading_open`` token.

    Returns:
        The ``content`` value of the first ``inline`` token in the window
        (``''`` if that token has no ``content`` key), or ``''`` when the
        window holds no ``inline`` token.
    """
    window_end = min(start_index + 3, len(tokens))
    return next(
        (
            tokens[position].get('content', '')
            for position in range(start_index, window_end)
            if tokens[position].get('type') == 'inline'
        ),
        '',
    )
def build_section_tree(
    tokens: List[Dict[str, Any]], max_depth: Optional[int] = None
) -> Dict[str, Any]:
    """
    Convert a flat token list into a nested tree of heading sections.

    Every ``heading_open`` token starts a new section that is attached to
    the nearest preceding section with a strictly smaller level. All
    tokens between ``heading_open`` and the matching ``heading_close``
    are consumed as part of the heading; every other token is appended to
    the ``content_tokens`` of the section that is currently open.

    Args:
        tokens: Flat list of token dicts (``type``, ``tag``, ``content`` ...).
        max_depth: When given, headings whose level exceeds this value do
            not open a section; the tokens that follow them stay with the
            section that was already open.

    Returns:
        The synthetic root node. Each node is a dict with the keys:

        - ``heading``         heading text (``None`` for the root)
        - ``level``           heading depth (``0`` for the root)
        - ``slug``            slugified heading (``''`` for the root)
        - ``content_tokens``  non-heading tokens belonging to the section
        - ``children``        nested sub-sections
    """
    tree_root: Dict[str, Any] = dict(
        heading=None, level=0, slug='', content_tokens=[], children=[]
    )
    # Chain of currently open sections, outermost first; the root never leaves.
    open_sections = [tree_root]
    token_count = len(tokens)
    position = 0

    while position < token_count:
        current = tokens[position]

        if current.get('type') != 'heading_open':
            # Plain content belongs to the innermost open section.
            open_sections[-1]['content_tokens'].append(current)
            position += 1
            continue

        depth = extract_heading_level(current.get('tag', ''))
        title = extract_heading_content(tokens, position)

        # Advance to the token right after the matching heading_close,
        # whether or not this heading ends up opening a section.
        close_at = position + 1
        while close_at < token_count and tokens[close_at].get('type') != 'heading_close':
            close_at += 1
        position = close_at + 1

        if max_depth is not None and depth > max_depth:
            # Too deep: drop the heading itself but keep collecting the
            # following content in the section that is already open.
            continue

        node: Dict[str, Any] = {
            'heading': title,
            'level': depth,
            'slug': slugify(title),
            'content_tokens': [],
            'children': []
        }

        # Close siblings and deeper sections until the parent is on top.
        while len(open_sections) > 1 and open_sections[-1]['level'] >= depth:
            open_sections.pop()

        open_sections[-1]['children'].append(node)
        open_sections.append(node)

    return tree_root
def extract_section_text(section: Dict[str, Any]) -> str:
    """
    Return the plain text held directly by a section node.

    Collects the ``content`` value of every token of type ``inline`` in
    the node's ``content_tokens`` (an ``inline`` token without ``content``
    contributes an empty string) and joins the pieces with newlines.
    Tokens of any other type are ignored, and child sections are not
    visited.

    Args:
        section: A section node; a missing ``content_tokens`` key is
            treated as an empty list.

    Returns:
        The newline-joined text, or ``''`` when there is no inline token.
    """
    inline_texts = (
        entry.get('content', '')
        for entry in section.get('content_tokens', [])
        if entry.get('type') == 'inline'
    )
    return '\n'.join(inline_texts)

View File

@@ -0,0 +1,15 @@
"""
Infospace analysis package.
Provides tooling for extracting structured metadata from entity markdown
files and analysing infospace collections.
"""
from .models import EntityMeta
from .entity_parser import parse_entity_file, parse_entity_directory
__all__ = [
"EntityMeta",
"parse_entity_file",
"parse_entity_directory",
]

View File

@@ -0,0 +1,176 @@
"""
Entity metadata parser.
Extracts structured :class:`EntityMeta` from entity markdown files
produced by the infospace entity-extraction pipeline.
"""
import logging
import re
from pathlib import Path
from typing import List, Optional, Sequence
from markitect.core.parser import parse_markdown_to_ast
from markitect.core.section_tree import (
build_section_tree,
extract_heading_content,
extract_heading_level,
extract_section_text,
slugify,
)
from .models import EntityMeta
logger = logging.getLogger(__name__)
# Sections we look for (slug → human-friendly label).
# The keys are the slugified forms of the H2 headings on the right.
# NOTE(review): this mapping is not referenced anywhere in the visible code;
# parse_entity_file spells the same slugs out literally — keep both in sync.
_KNOWN_SECTIONS = {
    "definition": "Definition",
    "source_chapter": "Source Chapter",
    "context": "Context",
    "economic_domain": "Economic Domain",
    "smith_s_original_wording": "Smith's Original Wording",
    "modern_interpretation": "Modern Interpretation",
}
# Default filename patterns to exclude from directory parsing.
# These are regular expressions (not globs); parse_entity_directory applies
# them to the bare file name when the caller passes no patterns of its own.
_DEFAULT_EXCLUDE_PATTERNS = (
    r".*-entities\.md$",
    r".*-prompt\.md$",
)
def _is_title_case(text: str) -> bool:
"""Return True if *text* is in title case (ignoring short words)."""
# Words that are allowed to be lowercase in title case
minor_words = {
"a", "an", "the", "and", "but", "or", "nor", "for", "yet", "so",
"in", "on", "at", "to", "by", "of", "up", "as", "is", "if",
}
words = text.split()
if not words:
return False
for i, word in enumerate(words):
# Strip leading/trailing punctuation for the check
clean = re.sub(r"[^\w]", "", word)
if not clean:
continue
# First word must be capitalised
if i == 0:
if not clean[0].isupper():
return False
elif clean.lower() in minor_words:
continue # minor words may be lower
elif not clean[0].isupper():
return False
return True
def _word_count(text: str) -> int:
"""Count whitespace-separated words in *text*."""
return len(text.split())
def _find_h2_section(tree_root: dict, slug: str) -> Optional[dict]:
"""Find a direct H2 child of the root by slug."""
for child in tree_root.get("children", []):
if child["level"] == 2 and child["slug"] == slug:
return child
return None
def parse_entity_file(path: Path) -> EntityMeta:
    """Read one entity markdown file and extract its :class:`EntityMeta`.

    The file is read as UTF-8, parsed into tokens and turned into a
    section tree. The first level-1 section directly under the tree root
    supplies the title; the known sections are looked up among that
    section's direct H2 children. A section that is absent yields an
    empty string.

    Args:
        path: Location of the markdown file.

    Returns:
        The populated :class:`EntityMeta`. ``total_word_count`` counts
        whitespace-separated tokens of the raw file text, markup included.

    Raises:
        ValueError: If the file has no H1 heading.
    """
    raw_markdown = path.read_text(encoding="utf-8")
    section_tree = build_section_tree(parse_markdown_to_ast(raw_markdown))

    # --- H1: entity title ---
    title_node = next(
        (node for node in section_tree["children"] if node["level"] == 1),
        None,
    )
    if title_node is None:
        raise ValueError(f"No H1 heading found in {path}")

    heading_text = title_node["heading"]

    # Slugs of every H2 directly below the H1, in document order
    h2_slugs = [
        node["slug"]
        for node in title_node.get("children", [])
        if node["level"] == 2
    ]

    def _section_body(wanted_slug: str) -> str:
        """Return the stripped text of the named H2, or '' if it is absent."""
        match = _find_h2_section(title_node, wanted_slug)
        return "" if match is None else extract_section_text(match).strip()

    # --- Known sections ---
    definition_text = _section_body("definition")
    wording_text = _section_body("smith_s_original_wording")

    return EntityMeta(
        slug=slugify(heading_text),
        title=heading_text,
        h1_raw=heading_text,
        definition=definition_text,
        source_chapter=_section_body("source_chapter"),
        context=_section_body("context"),
        domain=_section_body("economic_domain"),
        original_wording=wording_text,
        modern_interpretation=_section_body("modern_interpretation"),
        h1_is_title_case=_is_title_case(heading_text),
        has_original_wording=bool(wording_text),
        definition_word_count=_word_count(definition_text),
        total_word_count=_word_count(raw_markdown),
        section_slugs=h2_slugs,
        source_path=str(path),
    )
def parse_entity_directory(
    directory: Path,
    exclude_patterns: Optional[Sequence[str]] = None,
) -> List[EntityMeta]:
    """Parse every ``*.md`` file directly inside *directory*.

    Files are visited in sorted order. A file is skipped when any of the
    *exclude_patterns* matches its name from the start of the name
    (``re.match`` semantics). Files that fail to parse are logged as a
    warning and left out instead of aborting the run.

    Args:
        directory: Directory to scan (not recursive).
        exclude_patterns: Regular expressions tested against each file
            name. ``None`` selects the defaults, which exclude
            chapter-view (``*-entities.md``) and prompt
            (``*-prompt.md``) files.

    Returns:
        The metadata of every file that was parsed successfully.
    """
    patterns = (
        _DEFAULT_EXCLUDE_PATTERNS if exclude_patterns is None else exclude_patterns
    )
    matchers = [re.compile(expression) for expression in patterns]

    def _is_excluded(filename: str) -> bool:
        """Tell whether any exclusion regex matches *filename*."""
        for matcher in matchers:
            if matcher.match(filename):
                return True
        return False

    parsed: List[EntityMeta] = []
    for candidate in sorted(directory.glob("*.md")):
        if _is_excluded(candidate.name):
            continue
        try:
            meta = parse_entity_file(candidate)
        except Exception as exc:
            # Deliberate best-effort: one bad file must not stop the batch.
            logger.warning("Skipping %s: %s", candidate.name, exc)
        else:
            parsed.append(meta)
    return parsed

View File

@@ -0,0 +1,53 @@
"""
Data models for infospace entity metadata.
"""
from dataclasses import dataclass, field, asdict
from typing import Any, Dict, List
@dataclass
class EntityMeta:
    """Metadata record describing one entity markdown file.

    Every field except the three identity fields has a default, so a
    section that is absent from the file simply stays an empty string.
    Checking the values is left to a separate validation step.
    """

    # --- Identity ---
    slug: str
    title: str
    h1_raw: str  # H1 text exactly as written, before any normalisation

    # --- Section bodies as plain text ('' when the section is missing) ---
    definition: str = ""
    source_chapter: str = ""
    context: str = ""
    domain: str = ""
    original_wording: str = ""
    modern_interpretation: str = ""

    # --- Derived flags ---
    h1_is_title_case: bool = False
    has_original_wording: bool = False

    # --- Numbers ready for metrics ---
    definition_word_count: int = 0
    total_word_count: int = 0

    # Slugs of all H2 sections found, in document order
    section_slugs: List[str] = field(default_factory=list)

    # Path of the source file, stored as a string so it serialises cleanly
    source_path: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain dictionary keyed by field name."""
        plain = asdict(self)
        return plain

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EntityMeta":
        """Build a record from a dictionary, ignoring unknown keys.

        Keys that do not name a field of this dataclass are dropped
        before the constructor is called.
        """
        # The keys of __dataclass_fields__ are the declared field names.
        declared = cls.__dataclass_fields__
        accepted = {key: data[key] for key in data if key in declared}
        return cls(**accepted)