markitect-main/markitect/infospace/relation_parser.py

"""
Relation triplet parser.

Parses relation markdown files in ``output/relations/`` into structured
:class:`RelationMeta` objects.

File format::

    # Subject — predicate — Object

    ## Subject
    Subject Entity Title

    ## Predicate
    predicate phrase

    ## Object
    Object Entity Title

    ## Relation Type
    constrains

    ## VSM Channel
    S1 → S2

    ## Evidence
    Book I, Chapter 3: "..."

    ## Feedback Role
    Part of the Market Expansion loop: ...
"""

from __future__ import annotations

import logging
import re
from pathlib import Path
from typing import List, Optional, Sequence

from markitect.core.parser import parse_markdown_to_ast
from markitect.core.section_tree import (
    build_section_tree,
    extract_section_text,
    slugify,
)
from .relation_models import RelationMeta

logger = logging.getLogger(__name__)


def _find_h2(tree_root: dict, slug: str) -> Optional[dict]:
    for child in tree_root.get("children", []):
        if child["level"] == 2 and child["slug"] == slug:
            return child
    return None


def _section_text(root: dict, slug: str) -> str:
    """Return the whitespace-stripped text of the H2 section *slug* under *root*.

    Returns:
        The section text as produced by ``extract_section_text`` with
        leading/trailing whitespace removed, or ``""`` when *root* has no
        matching level-2 child.
    """
    section = _find_h2(root, slug)
    if not section:
        return ""
    return extract_section_text(section).strip()


def _slug_from_title(title: str) -> str:
    """Return the slug for an entity *title*.

    Thin wrapper that delegates to ``slugify``.

    NOTE(review): this is meant to yield the same slugs as entity_parser —
    confirm that module also goes through ``slugify``.
    """
    entity_slug = slugify(title)
    return entity_slug


def parse_relation_file(path: Path) -> RelationMeta:
    """Build a :class:`RelationMeta` from one relation markdown file.

    The file is read as UTF-8, parsed into a section tree, and the H2
    sections directly under the first H1 are extracted by slug.  The
    relation's own slug is taken from the file stem.

    Args:
        path: Location of the relation markdown file.

    Returns:
        The populated :class:`RelationMeta`.  Optional sections (relation
        type, VSM channel, evidence, feedback role) are empty strings when
        absent.

    Raises:
        ValueError: If the file has no H1 heading, or if the Subject,
            Predicate or Object section is missing or empty.
    """
    tokens = parse_markdown_to_ast(path.read_text(encoding="utf-8"))
    tree = build_section_tree(tokens)

    # The first level-1 heading is the container for every H2 section.
    root = None
    for node in tree["children"]:
        if node["level"] == 1:
            root = node
            break
    if root is None:
        raise ValueError(f"No H1 heading in {path}")

    # NOTE(review): multi-word headings are looked up with underscore slugs
    # ("relation_type", "vsm_channel", "feedback_role"); confirm this matches
    # the slug format build_section_tree assigns to section nodes.
    section_slugs = (
        "subject",
        "predicate",
        "object",
        "relation_type",
        "vsm_channel",
        "evidence",
        "feedback_role",
    )
    fields = {name: _section_text(root, name) for name in section_slugs}

    # The triplet itself is mandatory; everything else may be empty.
    if not fields["subject"]:
        raise ValueError(f"Missing ## Subject in {path}")
    if not fields["predicate"]:
        raise ValueError(f"Missing ## Predicate in {path}")
    if not fields["object"]:
        raise ValueError(f"Missing ## Object in {path}")

    return RelationMeta(
        slug=path.stem,  # canonical slug comes from the file name
        subject=fields["subject"],
        subject_slug=_slug_from_title(fields["subject"]),
        predicate=fields["predicate"],
        object=fields["object"],
        object_slug=_slug_from_title(fields["object"]),
        relation_type=fields["relation_type"],
        vsm_channel=fields["vsm_channel"],
        evidence=fields["evidence"],
        feedback_role=fields["feedback_role"],
        source_path=str(path),
    )


def parse_relations_directory(
    directory: Path,
) -> List[RelationMeta]:
    """Parse every ``*.md`` file directly inside *directory*.

    Files are visited in sorted path order (non-recursive).  Parsing is
    best-effort: any file that raises while being parsed is logged at
    WARNING level and left out of the result.

    Args:
        directory: Folder containing relation markdown files.

    Returns:
        The successfully parsed relations, in sorted file order.
    """
    parsed: List[RelationMeta] = []
    for candidate in sorted(directory.glob("*.md")):
        try:
            meta = parse_relation_file(candidate)
            parsed.append(meta)
        except Exception as err:
            # Deliberately broad: one malformed file must not abort the batch.
            logger.warning("Skipping relation file %s: %s", candidate.name, err)
    return parsed