Files
markitect-main/markitect/infospace/entity_parser.py
tegwick 03c6c5e8de feat(infospace): add entity metadata parser (S1.1)
Extract section-tree algorithm from SchemaGenerator into standalone
core/section_tree.py and build markitect/infospace/ package with
EntityMeta dataclass and parse_entity_file/parse_entity_directory.
Foundation for schema compliance, coverage, and granularity metrics.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 00:27:45 +01:00

177 lines
5.4 KiB
Python

"""
Entity metadata parser.
Extracts structured :class:`EntityMeta` from entity markdown files
produced by the infospace entity-extraction pipeline.
"""
import logging
import re
from pathlib import Path
from typing import List, Optional, Sequence
from markitect.core.parser import parse_markdown_to_ast
from markitect.core.section_tree import (
build_section_tree,
extract_heading_content,
extract_heading_level,
extract_section_text,
slugify,
)
from .models import EntityMeta
logger = logging.getLogger(__name__)
# Sections we look for (slug → human-friendly label)
_KNOWN_SECTIONS = {
"definition": "Definition",
"source_chapter": "Source Chapter",
"context": "Context",
"economic_domain": "Economic Domain",
"smith_s_original_wording": "Smith's Original Wording",
"modern_interpretation": "Modern Interpretation",
}
# Default filename patterns to exclude from directory parsing
_DEFAULT_EXCLUDE_PATTERNS = (
r".*-entities\.md$",
r".*-prompt\.md$",
)
def _is_title_case(text: str) -> bool:
"""Return True if *text* is in title case (ignoring short words)."""
# Words that are allowed to be lowercase in title case
minor_words = {
"a", "an", "the", "and", "but", "or", "nor", "for", "yet", "so",
"in", "on", "at", "to", "by", "of", "up", "as", "is", "if",
}
words = text.split()
if not words:
return False
for i, word in enumerate(words):
# Strip leading/trailing punctuation for the check
clean = re.sub(r"[^\w]", "", word)
if not clean:
continue
# First word must be capitalised
if i == 0:
if not clean[0].isupper():
return False
elif clean.lower() in minor_words:
continue # minor words may be lower
elif not clean[0].isupper():
return False
return True
def _word_count(text: str) -> int:
"""Count whitespace-separated words in *text*."""
return len(text.split())
def _find_h2_section(tree_root: dict, slug: str) -> Optional[dict]:
"""Find a direct H2 child of the root by slug."""
for child in tree_root.get("children", []):
if child["level"] == 2 and child["slug"] == slug:
return child
return None
def parse_entity_file(path: Path) -> EntityMeta:
    """Build an :class:`EntityMeta` from one entity markdown file.

    The file is read as UTF-8, parsed, and arranged into a section tree.
    The first level-1 node directly under the tree root supplies the
    title; that node's level-2 children supply the named sections.  A
    named section that is absent yields an empty string.

    Raises:
        ValueError: If the file has no H1 heading.
    """
    raw_markdown = path.read_text(encoding="utf-8")
    section_tree = build_section_tree(parse_markdown_to_ast(raw_markdown))

    # --- H1: entity title ---
    title_node = next(
        (node for node in section_tree["children"] if node["level"] == 1),
        None,
    )
    if title_node is None:
        raise ValueError(f"No H1 heading found in {path}")

    heading = title_node["heading"]
    entity_slug = slugify(heading)
    heading_is_title_case = _is_title_case(heading)

    # Slugs of every H2 directly under the H1, in document order.
    h2_slugs = [
        node["slug"]
        for node in title_node.get("children", [])
        if node["level"] == 2
    ]

    # --- Known sections (looked up beneath the H1 node) ---
    def section_body(section_slug: str) -> str:
        match = _find_h2_section(title_node, section_slug)
        return "" if match is None else extract_section_text(match).strip()

    (
        definition_text,
        chapter_text,
        context_text,
        domain_text,
        wording_text,
        interpretation_text,
    ) = (
        section_body(name)
        for name in (
            "definition",
            "source_chapter",
            "context",
            "economic_domain",
            "smith_s_original_wording",
            "modern_interpretation",
        )
    )

    # --- Derived metrics are computed inline below ---
    return EntityMeta(
        slug=entity_slug,
        title=heading,
        h1_raw=heading,
        definition=definition_text,
        source_chapter=chapter_text,
        context=context_text,
        domain=domain_text,
        original_wording=wording_text,
        modern_interpretation=interpretation_text,
        h1_is_title_case=heading_is_title_case,
        has_original_wording=bool(wording_text),
        definition_word_count=_word_count(definition_text),
        # Counted over the raw file text, markdown markup included.
        total_word_count=_word_count(raw_markdown),
        section_slugs=h2_slugs,
        source_path=str(path),
    )
def parse_entity_directory(
    directory: Path,
    exclude_patterns: Optional[Sequence[str]] = None,
) -> List[EntityMeta]:
    """Parse every ``*.md`` file directly inside *directory*.

    Files are visited in sorted path order.  A file is skipped when any
    regex in *exclude_patterns* matches its filename from the start
    (``re.match`` semantics).  When *exclude_patterns* is ``None`` the
    defaults apply, which exclude chapter-view (``*-entities.md``) and
    prompt (``*-prompt.md``) files.

    A file that fails to parse is logged as a warning and left out of
    the result instead of aborting the whole run.
    """
    patterns = (
        _DEFAULT_EXCLUDE_PATTERNS if exclude_patterns is None else exclude_patterns
    )
    matchers = [re.compile(pattern) for pattern in patterns]

    def is_excluded(filename: str) -> bool:
        for matcher in matchers:
            if matcher.match(filename):
                return True
        return False

    parsed: List[EntityMeta] = []
    for candidate in sorted(directory.glob("*.md")):
        if not is_excluded(candidate.name):
            try:
                parsed.append(parse_entity_file(candidate))
            except Exception as exc:
                # Best-effort: one malformed file must not stop the batch.
                logger.warning("Skipping %s: %s", candidate.name, exc)
    return parsed