# markitect-main/markitect/infospace/entity_parser.py

"""
Entity metadata parser.

Extracts structured :class:`EntityMeta` from entity markdown files
produced by the infospace entity-extraction pipeline.
"""

import logging
import re
from pathlib import Path
from typing import List, Optional, Sequence

from markitect.core.parser import parse_markdown_to_ast
from markitect.core.section_tree import (
    build_section_tree,
    extract_heading_content,
    extract_heading_level,
    extract_section_text,
    slugify,
)
from .models import EntityMeta

logger = logging.getLogger(__name__)

# Sections we look for (slug → human-friendly label).
# The keys mirror most of the H2 slugs that parse_entity_file() looks up;
# its fallback slug "supply_chain_domain" is not listed here.
# NOTE(review): this mapping is not referenced in the visible part of this
# module — confirm whether it is consumed elsewhere before changing it.
_KNOWN_SECTIONS = {
    "definition": "Definition",
    "source_chapter": "Source Chapter",
    "context": "Context",
    "economic_domain": "Economic Domain",
    "smith_s_original_wording": "Smith's Original Wording",
    "modern_interpretation": "Modern Interpretation",
}

# Default filename patterns to exclude from directory parsing.
# Each entry is a regular expression; parse_entity_directory() compiles them
# and applies them with ``re.match`` to the bare filename (not the full path)
# whenever the caller does not supply its own ``exclude_patterns``.
_DEFAULT_EXCLUDE_PATTERNS = (
    r".*-entities\.md$",
    r".*-prompt\.md$",
    r".*-raw\.md$",  # LLM raw output stored alongside entity files
)


def _is_title_case(text: str) -> bool:
    """Return True if *text* is in title case (ignoring short words)."""
    # Words that are allowed to be lowercase in title case
    minor_words = {
        "a", "an", "the", "and", "but", "or", "nor", "for", "yet", "so",
        "in", "on", "at", "to", "by", "of", "up", "as", "is", "if",
    }
    words = text.split()
    if not words:
        return False
    for i, word in enumerate(words):
        # Strip leading/trailing punctuation for the check
        clean = re.sub(r"[^\w]", "", word)
        if not clean:
            continue
        # First word must be capitalised
        if i == 0:
            if not clean[0].isupper():
                return False
        elif clean.lower() in minor_words:
            continue  # minor words may be lower
        elif not clean[0].isupper():
            return False
    return True


def _word_count(text: str) -> int:
    """Count whitespace-separated words in *text*."""
    return len(text.split())


def _find_h2_section(tree_root: dict, slug: str) -> Optional[dict]:
    """Find a direct H2 child of the root by slug."""
    for child in tree_root.get("children", []):
        if child["level"] == 2 and child["slug"] == slug:
            return child
    return None


def parse_entity_file(path: Path) -> EntityMeta:
    """Build an :class:`EntityMeta` from one entity markdown file.

    The file is read as UTF-8 and turned into a section tree.  The first
    level-1 section directly under the tree root is taken as the entity;
    the known sections are then looked up among that section's level-2
    children.  A section that is absent yields an empty string.

    Args:
        path: Location of the entity markdown file.

    Returns:
        The populated :class:`EntityMeta`.

    Raises:
        ValueError: If the file has no H1 heading.
    """
    raw_markdown = path.read_text(encoding="utf-8")
    section_tree = build_section_tree(parse_markdown_to_ast(raw_markdown))

    # The entity title is the first H1 directly below the tree root.
    entity_node = next(
        (node for node in section_tree["children"] if node["level"] == 1),
        None,
    )
    if entity_node is None:
        raise ValueError(f"No H1 heading found in {path}")

    heading = entity_node["heading"]
    entity_slug = slugify(heading)
    title_cased = _is_title_case(heading)

    # Slugs of every H2 that sits directly under the entity's H1.
    h2_slugs = [
        sub["slug"]
        for sub in entity_node.get("children", [])
        if sub["level"] == 2
    ]

    def text_of(section_slug: str) -> str:
        """Return the stripped text of the named H2, or "" if absent."""
        section = _find_h2_section(entity_node, section_slug)
        return "" if section is None else extract_section_text(section).strip()

    definition_text = text_of("definition")
    chapter_text = text_of("source_chapter")
    context_text = text_of("context")
    # The domain section may appear under either heading; the second slug
    # is consulted only when the first is missing or empty.
    domain_text = text_of("economic_domain") or text_of("supply_chain_domain")
    wording_text = text_of("smith_s_original_wording")
    interpretation_text = text_of("modern_interpretation")

    return EntityMeta(
        slug=entity_slug,
        title=heading,
        h1_raw=heading,
        definition=definition_text,
        source_chapter=chapter_text,
        context=context_text,
        domain=domain_text,
        original_wording=wording_text,
        modern_interpretation=interpretation_text,
        h1_is_title_case=title_cased,
        has_original_wording=bool(wording_text),
        definition_word_count=_word_count(definition_text),
        # Counts every word in the raw file, headings and markup included.
        total_word_count=_word_count(raw_markdown),
        section_slugs=h2_slugs,
        source_path=str(path),
    )


def parse_entity_directory(
    directory: Path,
    exclude_patterns: Optional[Sequence[str]] = None,
) -> List[EntityMeta]:
    """Parse every ``*.md`` entity file found directly in *directory*.

    Files are visited in sorted path order.  A file is skipped when any of
    *exclude_patterns* matches its filename; each pattern is a regular
    expression applied with ``re.match``, i.e. anchored at the start of the
    name.  When *exclude_patterns* is ``None`` the module defaults apply,
    which exclude chapter-view (``*-entities.md``), prompt
    (``*-prompt.md``) and raw LLM output (``*-raw.md``) files.

    A file that fails to parse is logged as a warning and skipped rather
    than aborting the whole run.

    Args:
        directory: Directory whose immediate ``*.md`` children are parsed.
        exclude_patterns: Filename regexes to skip, or ``None`` for the
            defaults.

    Returns:
        The successfully parsed entities, in sorted file order.
    """
    patterns = (
        _DEFAULT_EXCLUDE_PATTERNS if exclude_patterns is None else exclude_patterns
    )
    matchers = [re.compile(pattern) for pattern in patterns]

    def is_excluded(filename: str) -> bool:
        """Return True if any exclusion regex matches *filename*."""
        return any(matcher.match(filename) for matcher in matchers)

    parsed: List[EntityMeta] = []
    for candidate in sorted(directory.glob("*.md")):
        if is_excluded(candidate.name):
            continue
        try:
            meta = parse_entity_file(candidate)
        except Exception as exc:
            # Best-effort batch parse: one bad file must not stop the rest.
            logger.warning("Skipping %s: %s", candidate.name, exc)
        else:
            parsed.append(meta)

    return parsed