"""
Entity metadata parser.

Extracts structured :class:`EntityMeta` from entity markdown files produced by
the infospace entity-extraction pipeline.
"""

import logging
import re
from pathlib import Path
from typing import List, Optional, Sequence

from markitect.core.parser import parse_markdown_to_ast
from markitect.core.section_tree import (
    build_section_tree,
    extract_heading_content,
    extract_heading_level,
    extract_section_text,
    slugify,
)

from .models import EntityMeta

logger = logging.getLogger(__name__)

# Sections we look for (slug → human-friendly label)
# NOTE(review): this mapping is not referenced anywhere in this module, and it
# does not list the "supply_chain_domain" slug that parse_entity_file() uses as
# a fallback for the domain -- confirm whether other modules rely on it.
_KNOWN_SECTIONS = {
    "definition": "Definition",
    "source_chapter": "Source Chapter",
    "context": "Context",
    "economic_domain": "Economic Domain",
    "smith_s_original_wording": "Smith's Original Wording",
    "modern_interpretation": "Modern Interpretation",
}

# Default filename patterns to exclude from directory parsing
_DEFAULT_EXCLUDE_PATTERNS = (
    r".*-entities\.md$",
    r".*-prompt\.md$",
    r".*-raw\.md$",  # LLM raw output stored alongside entity files
)

# Words that are allowed to be lowercase in title case (except as first word)
_MINOR_WORDS = frozenset(
    {
        "a", "an", "the", "and", "but", "or", "nor", "for", "yet", "so",
        "in", "on", "at", "to", "by", "of", "up", "as", "is", "if",
    }
)


def _is_title_case(text: str) -> bool:
    """Return True if *text* is in title case (ignoring short words).

    Rules applied to each whitespace-separated token, after removing every
    non-word character (``[^\\w]``) from it:

    * tokens that are empty after cleaning (pure punctuation) are ignored;
    * the first token must not start with a lowercase letter;
    * later tokens may be lowercase only if they are minor words
      ("a", "of", "the", ...);
    * tokens starting with an uncased character (digit, underscore, ...) are
      accepted, so headings such as ``"Chapter 3 Overview"`` count as title
      case.

    Returns False for empty or whitespace-only *text*.
    """
    words = text.split()
    if not words:
        return False

    for position, word in enumerate(words):
        # Drop punctuation (anywhere in the token) before inspecting it
        cleaned = re.sub(r"[^\w]", "", word)
        if not cleaned:
            continue

        # Minor words may stay lowercase, but never in first position
        if position > 0 and cleaned.lower() in _MINOR_WORDS:
            continue

        # Only an explicitly lowercase initial violates title case; uncased
        # initials such as digits cannot be capitalised and are let through.
        if cleaned[0].islower():
            return False

    return True


def _word_count(text: str) -> int:
    """Count whitespace-separated words in *text*."""
    return len(text.split())


def _find_h2_section(tree_root: dict, slug: str) -> Optional[dict]:
    """Find a direct H2 child of the root by slug.

    Only the immediate ``"children"`` of *tree_root* are inspected; nested
    sections are not searched. Returns the first matching node, or ``None``
    when there is no level-2 child with that slug.
    """
    return next(
        (
            node
            for node in tree_root.get("children", [])
            if node["level"] == 2 and node["slug"] == slug
        ),
        None,
    )


def parse_entity_file(path: Path) -> EntityMeta:
    """Parse a single entity markdown file into :class:`EntityMeta`.

    The file is read as UTF-8, parsed into a section tree, and the first
    level-1 child of the tree root is taken as the entity heading. The known
    H2 sections are then looked up among that heading's direct children; a
    missing section yields an empty string for the corresponding field.

    Args:
        path: Location of the entity markdown file.

    Returns:
        The populated :class:`EntityMeta`. ``total_word_count`` counts
        whitespace-separated tokens of the raw file content (markup included),
        while ``definition_word_count`` counts only the definition text.

    Raises:
        ValueError: If the file has no H1 heading.
    """
    content = path.read_text(encoding="utf-8")
    tokens = parse_markdown_to_ast(content)
    tree = build_section_tree(tokens)

    # --- H1: entity title ---
    # .get() keeps a childless tree on the documented ValueError path instead
    # of leaking a KeyError.
    h1_section = next(
        (node for node in tree.get("children", []) if node["level"] == 1),
        None,
    )
    if h1_section is None:
        raise ValueError(f"No H1 heading found in {path}")

    h1_raw = h1_section["heading"]
    slug = slugify(h1_raw)
    title = h1_raw
    h1_is_title_case = _is_title_case(h1_raw)

    # Use the H1 node as the effective root for H2 look-ups
    effective_root = h1_section

    # Collect all H2 section slugs
    section_slugs = [
        node["slug"]
        for node in effective_root.get("children", [])
        if node["level"] == 2
    ]

    # --- Extract known sections ---
    def _get_section_text(section_slug: str) -> str:
        """Return the stripped text of the named H2 section, or "" if absent."""
        node = _find_h2_section(effective_root, section_slug)
        if node is None:
            return ""
        return extract_section_text(node).strip()

    definition = _get_section_text("definition")
    source_chapter = _get_section_text("source_chapter")
    context = _get_section_text("context")
    # "supply_chain_domain" is only consulted when "economic_domain" is
    # missing or empty.
    domain = (
        _get_section_text("economic_domain")
        or _get_section_text("supply_chain_domain")
    )
    original_wording = _get_section_text("smith_s_original_wording")
    modern_interpretation = _get_section_text("modern_interpretation")

    # --- Derived metrics ---
    has_original_wording = bool(original_wording)
    definition_word_count = _word_count(definition)
    total_word_count = _word_count(content)

    return EntityMeta(
        slug=slug,
        title=title,
        h1_raw=h1_raw,
        definition=definition,
        source_chapter=source_chapter,
        context=context,
        domain=domain,
        original_wording=original_wording,
        modern_interpretation=modern_interpretation,
        h1_is_title_case=h1_is_title_case,
        has_original_wording=has_original_wording,
        definition_word_count=definition_word_count,
        total_word_count=total_word_count,
        section_slugs=section_slugs,
        source_path=str(path),
    )


def parse_entity_directory(
    directory: Path,
    exclude_patterns: Optional[Sequence[str]] = None,
) -> List[EntityMeta]:
    """Parse all entity markdown files in *directory*.

    Only ``*.md`` entries directly inside *directory* are considered (no
    recursion), processed in sorted path order.

    Files matching *exclude_patterns* (regexes tested against the filename)
    are skipped. Patterns are applied with :func:`re.match`, so they are
    anchored at the start of the filename. Defaults exclude chapter-view
    (``*-entities.md``), prompt (``*-prompt.md``) and raw LLM output
    (``*-raw.md``) files.

    Malformed files are skipped with a warning rather than raising.

    Args:
        directory: Directory containing the entity markdown files.
        exclude_patterns: Regex strings for filenames to skip; ``None`` selects
            the module defaults.

    Returns:
        One :class:`EntityMeta` per successfully parsed file.
    """
    if exclude_patterns is None:
        exclude_patterns = _DEFAULT_EXCLUDE_PATTERNS
    compiled = [re.compile(pattern) for pattern in exclude_patterns]

    entities: List[EntityMeta] = []
    for md_file in sorted(directory.glob("*.md")):
        if any(pat.match(md_file.name) for pat in compiled):
            continue
        try:
            entities.append(parse_entity_file(md_file))
        except Exception as exc:
            # Deliberate best-effort boundary: one bad file must not abort the
            # whole directory scan.
            logger.warning("Skipping %s: %s", md_file.name, exc)
    return entities