feat(infospace): add entity metadata parser (S1.1)

Extract the section-tree algorithm from SchemaGenerator into a standalone core/section_tree.py, and build the markitect/infospace/ package with the EntityMeta dataclass and parse_entity_file/parse_entity_directory. Foundation for schema compliance, coverage, and granularity metrics.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 00:27:45 +01:00
parent b5e994b014
commit 03c6c5e8de
9 changed files with 739 additions and 0 deletions
--- a/markitect/infospace/entity_parser.py
+++ b/markitect/infospace/entity_parser.py
@@ -0,0 +1,176 @@
+"""
+Entity metadata parser.
+
+Extracts structured :class:`EntityMeta` from entity markdown files
+produced by the infospace entity-extraction pipeline.
+"""
+
+import logging
+import re
+from pathlib import Path
+from typing import List, Optional, Sequence
+
+from markitect.core.parser import parse_markdown_to_ast
+from markitect.core.section_tree import (
+    build_section_tree,
+    extract_heading_content,
+    extract_heading_level,
+    extract_section_text,
+    slugify,
+)
+from .models import EntityMeta
+
+logger = logging.getLogger(__name__)
+
+# Sections we look for (slug → human-friendly label).
+# The keys are the same H2 slugs that parse_entity_file() looks up; the
+# values are the corresponding headings in readable form.
+# NOTE(review): this mapping is not referenced anywhere else in this file --
+# presumably it is consumed by another module or kept as documentation;
+# confirm before removing.
+_KNOWN_SECTIONS = {
+    "definition": "Definition",
+    "source_chapter": "Source Chapter",
+    "context": "Context",
+    "economic_domain": "Economic Domain",
+    "smith_s_original_wording": "Smith's Original Wording",
+    "modern_interpretation": "Modern Interpretation",
+}
+
+# Default filename patterns to exclude from directory parsing.
+# Used by parse_entity_directory() when the caller passes no
+# exclude_patterns; each regex is applied with re.match() to the bare
+# filename (not the full path).
+_DEFAULT_EXCLUDE_PATTERNS = (
+    r".*-entities\.md$",  # chapter-view files
+    r".*-prompt\.md$",  # prompt files
+)
+
+
+def _is_title_case(text: str) -> bool:
+    """Return True if *text* is in title case (ignoring short words).
+
+    No whitespace-separated word may start with a lowercase letter,
+    except that minor words (articles, short conjunctions and
+    prepositions) may be lowercase anywhere but in first position.
+    Punctuation and underscores are stripped before the check, and words
+    that start with an uncased character such as a digit ("18th", "5")
+    are accepted rather than counted as a violation.
+
+    Args:
+        text: Heading text to check.
+
+    Returns:
+        False for empty or whitespace-only text, or when any significant
+        word starts with a lowercase letter; True otherwise.
+    """
+    # Words that are allowed to be lowercase in title case
+    minor_words = {
+        "a", "an", "the", "and", "but", "or", "nor", "for", "yet", "so",
+        "in", "on", "at", "to", "by", "of", "up", "as", "is", "if",
+    }
+    words = text.split()
+    if not words:
+        return False
+    for i, word in enumerate(words):
+        # Strip punctuation *and* underscores: "\w" alone keeps "_", which
+        # would make markdown emphasis such as "_Foo_" look uncapitalised.
+        clean = re.sub(r"[\W_]", "", word)
+        if not clean:
+            continue
+        # Minor words may stay lowercase, but never in first position.
+        if i > 0 and clean.lower() in minor_words:
+            continue
+        # Only a lowercase first letter is a violation.  Testing islower()
+        # instead of "not isupper()" lets digits and other uncased
+        # characters through ("Chapter 5", "18th Century").
+        if clean[0].islower():
+            return False
+    return True
+
+
+def _word_count(text: str) -> int:
+    """Return how many whitespace-delimited tokens *text* contains."""
+    return sum(1 for _token in text.split())
+
+
+def _find_h2_section(tree_root: dict, slug: str) -> Optional[dict]:
+    """Look up a level-2 section directly beneath *tree_root*.
+
+    Only the immediate ``"children"`` of *tree_root* are inspected.  The
+    first child whose ``"level"`` is 2 and whose ``"slug"`` equals *slug*
+    is returned; ``None`` is returned when there is no such child or
+    when *tree_root* has no ``"children"`` entry.
+    """
+    h2_matches = (
+        node
+        for node in tree_root.get("children", [])
+        if node["level"] == 2 and node["slug"] == slug
+    )
+    return next(h2_matches, None)
+
+
+def parse_entity_file(path: Path) -> EntityMeta:
+    """Build an :class:`EntityMeta` from one entity markdown file.
+
+    The file is read as UTF-8, parsed into a section tree, and the first
+    H1 found among the tree's top-level children is taken as the entity
+    heading.  The known H2 sections directly under that H1 supply the
+    text fields; a section that is absent yields an empty string.
+
+    Raises:
+        ValueError: If the file has no H1 heading.
+    """
+    raw_markdown = path.read_text(encoding="utf-8")
+    section_tree = build_section_tree(parse_markdown_to_ast(raw_markdown))
+
+    # The first top-level node at heading level 1 is the entity title.
+    h1_node = next(
+        (node for node in section_tree["children"] if node["level"] == 1),
+        None,
+    )
+    if h1_node is None:
+        raise ValueError(f"No H1 heading found in {path}")
+
+    heading = h1_node["heading"]
+    entity_slug = slugify(heading)
+    title_cased = _is_title_case(heading)
+
+    # Slugs of every H2 directly under the H1, in document order.
+    h2_slugs = [
+        node["slug"]
+        for node in h1_node.get("children", [])
+        if node["level"] == 2
+    ]
+
+    # Stripped body text of each known H2 section ("" when missing).
+    texts = {}
+    for wanted in (
+        "definition",
+        "source_chapter",
+        "context",
+        "economic_domain",
+        "smith_s_original_wording",
+        "modern_interpretation",
+    ):
+        found = _find_h2_section(h1_node, wanted)
+        if found is None:
+            texts[wanted] = ""
+        else:
+            texts[wanted] = extract_section_text(found).strip()
+
+    wording = texts["smith_s_original_wording"]
+
+    return EntityMeta(
+        slug=entity_slug,
+        title=heading,
+        h1_raw=heading,
+        definition=texts["definition"],
+        source_chapter=texts["source_chapter"],
+        context=texts["context"],
+        domain=texts["economic_domain"],
+        original_wording=wording,
+        modern_interpretation=texts["modern_interpretation"],
+        h1_is_title_case=title_cased,
+        has_original_wording=bool(wording),
+        definition_word_count=_word_count(texts["definition"]),
+        # Counted over the raw file content, markup included.
+        total_word_count=_word_count(raw_markdown),
+        section_slugs=h2_slugs,
+        source_path=str(path),
+    )
+
+
+def parse_entity_directory(
+    directory: Path,
+    exclude_patterns: Optional[Sequence[str]] = None,
+) -> List[EntityMeta]:
+    """Parse every ``*.md`` entity file found directly in *directory*.
+
+    Each regex in *exclude_patterns* is matched (from the start of the
+    string) against the bare filename; a file matching any of them is
+    skipped.  When *exclude_patterns* is ``None`` the defaults skip
+    chapter-view (``*-entities.md``) and prompt (``*-prompt.md``) files.
+
+    Files are processed in sorted path order.  A file that fails to
+    parse is logged as a warning and left out of the result instead of
+    aborting the whole run.
+    """
+    patterns = (
+        _DEFAULT_EXCLUDE_PATTERNS if exclude_patterns is None else exclude_patterns
+    )
+    excluders = [re.compile(expr) for expr in patterns]
+
+    # Drop excluded filenames up front, then parse what is left.
+    candidates = [
+        md_path
+        for md_path in sorted(directory.glob("*.md"))
+        if not any(rx.match(md_path.name) for rx in excluders)
+    ]
+
+    parsed: List[EntityMeta] = []
+    for md_path in candidates:
+        try:
+            meta = parse_entity_file(md_path)
+        except Exception as exc:
+            # Deliberate best-effort: one malformed file must not stop the batch.
+            logger.warning("Skipping %s: %s", md_path.name, exc)
+        else:
+            parsed.append(meta)
+
+    return parsed