Demonstrates infospace composition: the Wealth of Nations infospace is used as a discipline, applying Smith's economic framework as a lens to analyse modern supply chain management concepts. New example: examples/supply-chain-vsm/ - infospace.yaml binding WoN as discipline (../infospace-with-history) - 3 source documents: coordination mechanisms, capital & inventory, market structure (~400 words each, original content) - supply-chain-entity-schema-v1.0.md with WoN Concept required section - won-mapping-schema-v1.0.md with Conceptual Continuity rating - artifacts/won-reference/core-entities.md — 12 curated WoN entities for injection as discipline context - 8 hand-crafted entity files demonstrating LLM output format - 3 mapping files with full rationale and VSM inheritance chains - Viable: YES (5/5 thresholds) Key mappings demonstrated: Demand Signal → Effectual Demand (Strong, S2) Vendor-Managed Inventory → Division of Labour (Strong, S1/S2) Just-in-Time Inventory → Circulating Capital (Strong, S1/S3) Bullwhip Effect → Natural Price (Moderate, S2) Platform Intermediary → Merchant Capital (Strong, S2/S4) Monopsony Power → Combination of Masters (Strong, S3*) Platform fix: entity_parser.py now recognises ## Supply Chain Domain as a domain alias for ## Economic Domain, enabling composed infospaces to use their own domain section name. Tutorial §13 rewritten with real commands, real output, and the full mapping table from the demo. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
178 lines
5.5 KiB
Python
178 lines
5.5 KiB
Python
"""
|
|
Entity metadata parser.

Extracts structured :class:`EntityMeta` from entity markdown files
produced by the infospace entity-extraction pipeline.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from pathlib import Path
|
|
from typing import List, Optional, Sequence
|
|
|
|
from markitect.core.parser import parse_markdown_to_ast
|
|
from markitect.core.section_tree import (
|
|
build_section_tree,
|
|
extract_heading_content,
|
|
extract_heading_level,
|
|
extract_section_text,
|
|
slugify,
|
|
)
|
|
from .models import EntityMeta
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
|
|
# H2 sections the parser knows about, keyed by heading slug and mapped to
# the human-friendly heading label.
_KNOWN_SECTIONS = {
    "definition": "Definition",
    "source_chapter": "Source Chapter",
    "context": "Context",
    "economic_domain": "Economic Domain",
    "smith_s_original_wording": "Smith's Original Wording",
    "modern_interpretation": "Modern Interpretation",
}
|
|
|
|
# Filename regexes skipped by default when parsing a whole directory.
_DEFAULT_EXCLUDE_PATTERNS = (
    r".*-entities\.md$",  # chapter-view files
    r".*-prompt\.md$",  # prompt files
    r".*-raw\.md$",  # LLM raw output stored alongside entity files
)
|
|
|
|
|
|
def _is_title_case(text: str) -> bool:
    """Return True if *text* is in title case (ignoring short words).

    Every whitespace-separated word must not start with a lowercase
    letter, except that the minor words listed below may be lowercase
    anywhere but in the first position.  Words that start with a
    character that has no case (a digit, for example "5" or "18th") are
    accepted, so headings such as "Chapter 5" count as title case.

    Args:
        text: The heading text to check.

    Returns:
        ``False`` for empty or whitespace-only text, or when any word
        violates the rule above; ``True`` otherwise.
    """
    # Words that are allowed to be lowercase in title case
    minor_words = {
        "a", "an", "the", "and", "but", "or", "nor", "for", "yet", "so",
        "in", "on", "at", "to", "by", "of", "up", "as", "is", "if",
    }
    words = text.split()
    if not words:
        return False
    for position, word in enumerate(words):
        # Drop every non-word character (quotes, hyphens, apostrophes, ...)
        # so that only letters, digits and underscores are inspected.
        clean = re.sub(r"[^\w]", "", word)
        if not clean:
            continue  # token was pure punctuation
        # Minor words may stay lowercase, but never in the first position.
        if position > 0 and clean.lower() in minor_words:
            continue
        # Reject only a genuinely lowercase initial; caseless initials
        # such as digits cannot be capitalised and must not fail the check.
        if clean[0].islower():
            return False
    return True
|
|
|
|
|
|
def _word_count(text: str) -> int:
    """Return the number of whitespace-delimited tokens in *text*.

    Runs of whitespace count as a single separator, so empty or
    whitespace-only text yields ``0``.
    """
    tokens = text.split()
    return len(tokens)
|
|
|
|
|
|
def _find_h2_section(tree_root: dict, slug: str) -> Optional[dict]:
    """Return the first level-2 child of *tree_root* whose slug is *slug*.

    Only the immediate ``children`` of *tree_root* are inspected; a node
    without a ``children`` key is treated as having none.  Returns
    ``None`` when no child matches.
    """
    candidates = tree_root.get("children", [])
    return next(
        (
            node
            for node in candidates
            if node["level"] == 2 and node["slug"] == slug
        ),
        None,
    )
|
|
|
|
|
|
def parse_entity_file(path: Path) -> EntityMeta:
    """Build an :class:`EntityMeta` from one entity markdown file.

    The file is read as UTF-8 and turned into a section tree.  The first
    level-1 node among the tree's top-level children supplies the entity
    heading; the level-2 children of that node are then looked up by slug
    to fill the individual text fields.  A section that is absent yields
    an empty string.  The ``domain`` field is taken from the
    ``economic_domain`` section and falls back to ``supply_chain_domain``
    when the former is missing or empty.

    Args:
        path: Location of the entity markdown file.

    Returns:
        The populated :class:`EntityMeta`.  ``total_word_count`` is
        computed over the raw file content, ``definition_word_count``
        over the definition section text only.

    Raises:
        ValueError: If the file has no H1 heading.
    """
    raw_markdown = path.read_text(encoding="utf-8")
    section_tree = build_section_tree(parse_markdown_to_ast(raw_markdown))

    # --- H1: entity title ---
    entity_node = next(
        (node for node in section_tree["children"] if node["level"] == 1),
        None,
    )
    if entity_node is None:
        raise ValueError(f"No H1 heading found in {path}")

    heading = entity_node["heading"]
    entity_slug = slugify(heading)
    heading_is_title_case = _is_title_case(heading)

    # Slugs of every H2 under the H1 node, in document order.
    h2_slugs = [
        node["slug"]
        for node in entity_node.get("children", [])
        if node["level"] == 2
    ]

    def _section_body(section_slug: str) -> str:
        """Return the stripped text of the named H2, or "" when absent."""
        found = _find_h2_section(entity_node, section_slug)
        return "" if found is None else extract_section_text(found).strip()

    # --- Known sections ---
    definition_text = _section_body("definition")
    source_chapter_text = _section_body("source_chapter")
    context_text = _section_body("context")
    domain_text = _section_body("economic_domain")
    if not domain_text:
        # Alternative heading name accepted for the domain section.
        domain_text = _section_body("supply_chain_domain")
    wording_text = _section_body("smith_s_original_wording")
    interpretation_text = _section_body("modern_interpretation")

    # --- Derived metrics ---
    wording_present = bool(wording_text)
    definition_words = _word_count(definition_text)
    total_words = _word_count(raw_markdown)

    return EntityMeta(
        slug=entity_slug,
        title=heading,
        h1_raw=heading,
        definition=definition_text,
        source_chapter=source_chapter_text,
        context=context_text,
        domain=domain_text,
        original_wording=wording_text,
        modern_interpretation=interpretation_text,
        h1_is_title_case=heading_is_title_case,
        has_original_wording=wording_present,
        definition_word_count=definition_words,
        total_word_count=total_words,
        section_slugs=h2_slugs,
        source_path=str(path),
    )
|
|
|
|
|
|
def parse_entity_directory(
    directory: Path,
    exclude_patterns: Optional[Sequence[str]] = None,
) -> List[EntityMeta]:
    """Parse every ``*.md`` entity file found directly in *directory*.

    Candidate files are visited in sorted path order.  A file is skipped
    when its filename matches any of *exclude_patterns*; each pattern is a
    regex applied with ``re.match``, i.e. anchored at the start of the
    filename.  When *exclude_patterns* is ``None`` the module defaults are
    used, which exclude chapter-view (``*-entities.md``), prompt
    (``*-prompt.md``) and raw LLM output (``*-raw.md``) files.

    A file that fails to parse is logged as a warning and skipped rather
    than aborting the whole run.

    Args:
        directory: Directory to scan (non-recursive).
        exclude_patterns: Filename regexes to skip, or ``None`` for the
            defaults.

    Returns:
        One :class:`EntityMeta` per successfully parsed file.
    """
    patterns = (
        _DEFAULT_EXCLUDE_PATTERNS if exclude_patterns is None else exclude_patterns
    )
    matchers = [re.compile(pattern) for pattern in patterns]

    parsed: List[EntityMeta] = []
    for candidate in sorted(directory.glob("*.md")):
        filename = candidate.name
        if any(matcher.match(filename) for matcher in matchers):
            continue
        try:
            meta = parse_entity_file(candidate)
        except Exception as exc:
            # Best-effort directory scan: report the failure and move on.
            logger.warning("Skipping %s: %s", candidate.name, exc)
        else:
            parsed.append(meta)

    return parsed
|