feat(infospace): add entity metadata parser (S1.1)
Extract section-tree algorithm from SchemaGenerator into standalone core/section_tree.py and build markitect/infospace/ package with EntityMeta dataclass and parse_entity_file/parse_entity_directory. Foundation for schema compliance, coverage, and granularity metrics. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -9,6 +9,7 @@ This package contains the fundamental building blocks:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
from .parser import parse_markdown_to_ast
|
from .parser import parse_markdown_to_ast
|
||||||
|
from .section_tree import build_section_tree, extract_section_text
|
||||||
from .serializer import ASTSerializer
|
from .serializer import ASTSerializer
|
||||||
from .document_manager import DocumentManager, CleanDocumentManager
|
from .document_manager import DocumentManager, CleanDocumentManager
|
||||||
from .workspace import (
|
from .workspace import (
|
||||||
@@ -29,6 +30,9 @@ from .workspace import (
|
|||||||
__all__ = [
|
__all__ = [
|
||||||
# Parser
|
# Parser
|
||||||
"parse_markdown_to_ast",
|
"parse_markdown_to_ast",
|
||||||
|
# Section tree
|
||||||
|
"build_section_tree",
|
||||||
|
"extract_section_text",
|
||||||
# Serializer
|
# Serializer
|
||||||
"ASTSerializer",
|
"ASTSerializer",
|
||||||
# Document Manager
|
# Document Manager
|
||||||
|
|||||||
124
markitect/core/section_tree.py
Normal file
124
markitect/core/section_tree.py
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
"""
|
||||||
|
Standalone section-tree utilities extracted from SchemaGenerator.
|
||||||
|
|
||||||
|
Builds a hierarchical section tree from flat markdown-it AST tokens and
|
||||||
|
provides helpers for navigating heading structure and extracting text.
|
||||||
|
These functions are used by both the schema generator and the infospace
|
||||||
|
entity parser.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
|
||||||
|
def slugify(text: str) -> str:
    """Turn heading or label text into a slug usable as a JSON property key.

    German umlauts and ``ß`` are transliterated first, the result is
    lower-cased, and every run of characters outside ``a-z0-9`` is collapsed
    into a single underscore. Leading and trailing underscores are removed.

    Returns:
        The slug, or ``'feld'`` when nothing usable remains.
    """
    transliteration = str.maketrans({
        'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
        'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue', 'ß': 'ss',
    })
    lowered = text.translate(transliteration).lower()
    collapsed = re.sub(r'[^a-z0-9]+', '_', lowered).strip('_')
    if not collapsed:
        return 'feld'
    return collapsed
|
||||||
|
|
||||||
|
|
||||||
|
def extract_heading_level(tag: str) -> int:
    """Read the numeric level out of a heading tag such as ``h1`` or ``h2``.

    Any tag that is not exactly ``h`` followed by one digit falls back to
    level ``1``.
    """
    if len(tag) != 2 or not tag.startswith('h'):
        return 1
    try:
        level = int(tag[1])
    except ValueError:
        return 1
    return level
|
||||||
|
|
||||||
|
|
||||||
|
def extract_heading_content(tokens: List[Dict[str, Any]], start_index: int) -> str:
    """Fetch the text of the heading that opens at *start_index*.

    Looks at up to three tokens starting at *start_index* and returns the
    ``content`` of the first one whose type is ``inline``. Returns an empty
    string when no such token is found in that window.
    """
    stop = min(start_index + 3, len(tokens))
    window = (tokens[position] for position in range(start_index, stop))
    return next(
        (tok.get('content', '') for tok in window if tok.get('type') == 'inline'),
        '',
    )
|
||||||
|
|
||||||
|
|
||||||
|
def build_section_tree(
    tokens: List[Dict[str, Any]], max_depth: Optional[int] = None
) -> Dict[str, Any]:
    """
    Arrange a flat markdown-it token list into a nested tree of sections.

    The returned root node holds the top-level sections in ``children``.
    Every node is a dict with these keys:

    - ``heading`` – heading text (``None`` for the root)
    - ``level`` – heading depth (``0`` for the root)
    - ``slug`` – slugified heading (``''`` for the root)
    - ``content_tokens`` – non-heading tokens belonging to this section
    - ``children`` – nested sub-sections

    When *max_depth* is given, headings deeper than it do not create nodes;
    the tokens that follow them are attached to the section that is
    currently open.
    """
    root: Dict[str, Any] = {
        'heading': None, 'level': 0, 'slug': '',
        'content_tokens': [], 'children': []
    }
    # Chain of currently open sections; the last entry receives new content.
    open_sections = [root]
    total = len(tokens)
    pos = 0

    while pos < total:
        current = tokens[pos]

        if current.get('type') != 'heading_open':
            open_sections[-1]['content_tokens'].append(current)
            pos += 1
            continue

        depth = extract_heading_level(current.get('tag', ''))
        text = extract_heading_content(tokens, pos)

        if max_depth is None or depth <= max_depth:
            node: Dict[str, Any] = {
                'heading': text,
                'level': depth,
                'slug': slugify(text),
                'content_tokens': [],
                'children': []
            }
            # Close sections at the same or a deeper level to find the parent.
            while len(open_sections) > 1 and open_sections[-1]['level'] >= depth:
                open_sections.pop()
            open_sections[-1]['children'].append(node)
            open_sections.append(node)

        # Whether or not a node was created, jump past the heading's own
        # tokens (up to and including heading_close).
        pos += 1
        while pos < total and tokens[pos].get('type') != 'heading_close':
            pos += 1
        pos += 1

    return root
|
||||||
|
|
||||||
|
|
||||||
|
def extract_section_text(section: Dict[str, Any]) -> str:
    """
    Collect the plain text held directly by a section node.

    Takes the ``content`` of every ``inline`` token in the section's
    ``content_tokens`` (in order) and joins the pieces with newlines.
    Tokens of any other type are ignored; a section without
    ``content_tokens`` yields an empty string.
    """
    pieces: List[str] = [
        tok.get('content', '')
        for tok in section.get('content_tokens', [])
        if tok.get('type') == 'inline'
    ]
    return '\n'.join(pieces)
|
||||||
15
markitect/infospace/__init__.py
Normal file
15
markitect/infospace/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
"""
|
||||||
|
Infospace analysis package.
|
||||||
|
|
||||||
|
Provides tooling for extracting structured metadata from entity markdown
|
||||||
|
files and analysing infospace collections.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .models import EntityMeta
|
||||||
|
from .entity_parser import parse_entity_file, parse_entity_directory
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"EntityMeta",
|
||||||
|
"parse_entity_file",
|
||||||
|
"parse_entity_directory",
|
||||||
|
]
|
||||||
176
markitect/infospace/entity_parser.py
Normal file
176
markitect/infospace/entity_parser.py
Normal file
@@ -0,0 +1,176 @@
|
|||||||
|
"""
|
||||||
|
Entity metadata parser.
|
||||||
|
|
||||||
|
Extracts structured :class:`EntityMeta` from entity markdown files
|
||||||
|
produced by the infospace entity-extraction pipeline.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Optional, Sequence
|
||||||
|
|
||||||
|
from markitect.core.parser import parse_markdown_to_ast
|
||||||
|
from markitect.core.section_tree import (
|
||||||
|
build_section_tree,
|
||||||
|
extract_heading_content,
|
||||||
|
extract_heading_level,
|
||||||
|
extract_section_text,
|
||||||
|
slugify,
|
||||||
|
)
|
||||||
|
from .models import EntityMeta
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Sections we look for (slug → human-friendly label)
|
||||||
|
_KNOWN_SECTIONS = {
|
||||||
|
"definition": "Definition",
|
||||||
|
"source_chapter": "Source Chapter",
|
||||||
|
"context": "Context",
|
||||||
|
"economic_domain": "Economic Domain",
|
||||||
|
"smith_s_original_wording": "Smith's Original Wording",
|
||||||
|
"modern_interpretation": "Modern Interpretation",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Default filename patterns to exclude from directory parsing
|
||||||
|
_DEFAULT_EXCLUDE_PATTERNS = (
|
||||||
|
r".*-entities\.md$",
|
||||||
|
r".*-prompt\.md$",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_title_case(text: str) -> bool:
|
||||||
|
"""Return True if *text* is in title case (ignoring short words)."""
|
||||||
|
# Words that are allowed to be lowercase in title case
|
||||||
|
minor_words = {
|
||||||
|
"a", "an", "the", "and", "but", "or", "nor", "for", "yet", "so",
|
||||||
|
"in", "on", "at", "to", "by", "of", "up", "as", "is", "if",
|
||||||
|
}
|
||||||
|
words = text.split()
|
||||||
|
if not words:
|
||||||
|
return False
|
||||||
|
for i, word in enumerate(words):
|
||||||
|
# Strip leading/trailing punctuation for the check
|
||||||
|
clean = re.sub(r"[^\w]", "", word)
|
||||||
|
if not clean:
|
||||||
|
continue
|
||||||
|
# First word must be capitalised
|
||||||
|
if i == 0:
|
||||||
|
if not clean[0].isupper():
|
||||||
|
return False
|
||||||
|
elif clean.lower() in minor_words:
|
||||||
|
continue # minor words may be lower
|
||||||
|
elif not clean[0].isupper():
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def _word_count(text: str) -> int:
|
||||||
|
"""Count whitespace-separated words in *text*."""
|
||||||
|
return len(text.split())
|
||||||
|
|
||||||
|
|
||||||
|
def _find_h2_section(tree_root: dict, slug: str) -> Optional[dict]:
|
||||||
|
"""Find a direct H2 child of the root by slug."""
|
||||||
|
for child in tree_root.get("children", []):
|
||||||
|
if child["level"] == 2 and child["slug"] == slug:
|
||||||
|
return child
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_entity_file(path: Path) -> EntityMeta:
    """Read one entity markdown file and turn it into an :class:`EntityMeta`.

    The first level-1 heading at the top of the section tree supplies the
    title and slug. Its level-2 children are looked up by slug to fill the
    section fields; a section that is absent yields an empty string.

    Args:
        path: Location of the markdown file (read as UTF-8).

    Returns:
        The populated metadata record.

    Raises:
        ValueError: If the file has no H1 heading.
    """
    raw_markdown = path.read_text(encoding="utf-8")
    tree = build_section_tree(parse_markdown_to_ast(raw_markdown))

    # The first top-level H1 names the entity and acts as the root for
    # all H2 look-ups below.
    entity_node = next(
        (node for node in tree["children"] if node["level"] == 1), None
    )
    if entity_node is None:
        raise ValueError(f"No H1 heading found in {path}")

    heading = entity_node["heading"]

    def _section_body(section_slug: str) -> str:
        match = _find_h2_section(entity_node, section_slug)
        return "" if match is None else extract_section_text(match).strip()

    definition_text = _section_body("definition")
    wording_text = _section_body("smith_s_original_wording")

    return EntityMeta(
        slug=slugify(heading),
        title=heading,
        h1_raw=heading,
        definition=definition_text,
        source_chapter=_section_body("source_chapter"),
        context=_section_body("context"),
        domain=_section_body("economic_domain"),
        original_wording=wording_text,
        modern_interpretation=_section_body("modern_interpretation"),
        h1_is_title_case=_is_title_case(heading),
        has_original_wording=bool(wording_text),
        definition_word_count=_word_count(definition_text),
        # Counted over the raw file text, markup included.
        total_word_count=_word_count(raw_markdown),
        section_slugs=[
            node["slug"]
            for node in entity_node.get("children", [])
            if node["level"] == 2
        ],
        source_path=str(path),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def parse_entity_directory(
    directory: Path,
    exclude_patterns: Optional[Sequence[str]] = None,
) -> List[EntityMeta]:
    """Parse every ``*.md`` file directly inside *directory*.

    Files are visited in sorted order. A file is skipped when any regex in
    *exclude_patterns* matches its filename from the start; when the
    argument is ``None`` the defaults skip chapter-view
    (``*-entities.md``) and prompt (``*-prompt.md``) files.

    A file that fails to parse is logged as a warning and left out of the
    result instead of aborting the run.
    """
    patterns = (
        _DEFAULT_EXCLUDE_PATTERNS if exclude_patterns is None else exclude_patterns
    )
    matchers = [re.compile(pattern) for pattern in patterns]

    parsed: List[EntityMeta] = []
    for candidate in sorted(directory.glob("*.md")):
        filename = candidate.name
        if any(matcher.match(filename) for matcher in matchers):
            continue
        try:
            meta = parse_entity_file(candidate)
        except Exception as exc:
            # Best effort by design: one bad file must not stop the batch.
            logger.warning("Skipping %s: %s", candidate.name, exc)
        else:
            parsed.append(meta)

    return parsed
|
||||||
53
markitect/infospace/models.py
Normal file
53
markitect/infospace/models.py
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
"""
|
||||||
|
Data models for infospace entity metadata.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field, asdict
|
||||||
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EntityMeta:
    """Metadata record for one entity markdown file.

    Section fields hold plain text and default to an empty string, so a
    record can be built even when sections are absent; checking
    completeness is left to a separate validation step.
    """

    # --- identity ---
    slug: str
    title: str
    h1_raw: str  # H1 text exactly as written, before any normalisation

    # --- section bodies (plain text; "" when the section is missing) ---
    definition: str = ""
    source_chapter: str = ""
    context: str = ""
    domain: str = ""
    original_wording: str = ""
    modern_interpretation: str = ""

    # --- derived booleans ---
    h1_is_title_case: bool = False
    has_original_wording: bool = False

    # --- numeric values for metrics ---
    definition_word_count: int = 0
    total_word_count: int = 0

    # Slugs of every H2 section found, in document order
    section_slugs: List[str] = field(default_factory=list)

    # Path of the source file, kept as a string so it serialises cleanly
    source_path: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain dictionary."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EntityMeta":
        """Build a record from *data*, dropping keys that are not fields."""
        accepted = cls.__dataclass_fields__
        return cls(**{key: value for key, value in data.items() if key in accepted})
|
||||||
0
tests/unit/core/__init__.py
Normal file
0
tests/unit/core/__init__.py
Normal file
137
tests/unit/core/test_section_tree.py
Normal file
137
tests/unit/core/test_section_tree.py
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
"""Tests for markitect.core.section_tree."""
|
||||||
|
|
||||||
|
from markitect.core.parser import parse_markdown_to_ast
|
||||||
|
from markitect.core.section_tree import (
|
||||||
|
build_section_tree,
|
||||||
|
extract_heading_content,
|
||||||
|
extract_heading_level,
|
||||||
|
extract_section_text,
|
||||||
|
slugify,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestSlugify:
    """Behavioural checks for ``slugify``."""

    def test_simple_text(self):
        assert slugify("Hello World") == "hello_world"

    def test_german_umlauts(self):
        assert slugify("Ärger mit Über") == "aerger_mit_ueber"

    def test_special_characters(self):
        assert slugify("Smith's Original Wording") == "smith_s_original_wording"

    def test_empty_string(self):
        # Nothing usable left -> the fallback slug is returned.
        assert slugify("") == "feld"

    def test_trailing_underscores_stripped(self):
        assert slugify("--hello--") == "hello"

    def test_multiple_spaces(self):
        # A run of several spaces must collapse into ONE underscore; a
        # single space would not exercise the collapsing behaviour.
        assert slugify("a   b") == "a_b"
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractHeadingLevel:
    """Checks that heading tags map to the right numeric level."""

    def test_h1(self):
        level = extract_heading_level("h1")
        assert level == 1

    def test_h6(self):
        level = extract_heading_level("h6")
        assert level == 6

    def test_invalid_tag(self):
        # A non-heading tag falls back to level 1.
        level = extract_heading_level("p")
        assert level == 1

    def test_empty(self):
        # So does an empty tag.
        level = extract_heading_level("")
        assert level == 1
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractHeadingContent:
    """Checks the lookup of a heading's inline text."""

    def test_finds_inline_token(self):
        opening = {"type": "heading_open", "tag": "h1"}
        inline = {"type": "inline", "content": "Hello"}
        closing = {"type": "heading_close", "tag": "h1"}
        found = extract_heading_content([opening, inline, closing], 0)
        assert found == "Hello"

    def test_no_inline(self):
        # With no inline token in the window the result is empty.
        opening = {"type": "heading_open", "tag": "h1"}
        closing = {"type": "heading_close", "tag": "h1"}
        found = extract_heading_content([opening, closing], 0)
        assert found == ""
|
||||||
|
|
||||||
|
|
||||||
|
class TestBuildSectionTree:
    """Checks the shape of the tree built from parsed markdown."""

    @staticmethod
    def _tree_for(markdown, **options):
        """Parse *markdown* and build its section tree."""
        return build_section_tree(parse_markdown_to_ast(markdown), **options)

    def test_single_heading(self):
        tree = self._tree_for("# Title\n\nSome text.")
        children = tree["children"]

        assert tree["level"] == 0
        assert len(children) == 1
        assert children[0]["heading"] == "Title"
        assert children[0]["level"] == 1

    def test_nested_headings(self):
        tree = self._tree_for("# Top\n\n## Sub\n\ntext\n\n## Sub2\n\nmore")
        top = tree["children"][0]
        sub_headings = [node["heading"] for node in top["children"]]

        assert top["heading"] == "Top"
        assert len(top["children"]) == 2
        assert sub_headings[0] == "Sub"
        assert sub_headings[1] == "Sub2"

    def test_max_depth(self):
        tree = self._tree_for("# Top\n\n## Sub\n\n### Deep\n\ntext", max_depth=2)
        sub = tree["children"][0]["children"][0]

        # The H3 lies below max_depth, so it must not become a node.
        assert len(sub["children"]) == 0

    def test_content_tokens_captured(self):
        tree = self._tree_for("# Title\n\nParagraph text here.")
        section = tree["children"][0]
        inline_tokens = [
            tok for tok in section["content_tokens"] if tok.get("type") == "inline"
        ]

        assert len(inline_tokens) == 1
        assert "Paragraph text here" in inline_tokens[0]["content"]

    def test_slug_assigned(self):
        tree = self._tree_for("# Economic Domain\n\ntext")
        first = tree["children"][0]

        assert first["slug"] == "economic_domain"

    def test_empty_document(self):
        tree = self._tree_for("")
        assert tree["children"] == []
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractSectionText:
    """Checks the plain-text extraction from a section node."""

    def test_simple_paragraph(self):
        tree = build_section_tree(parse_markdown_to_ast("# Title\n\nHello world."))
        first_section = tree["children"][0]
        assert extract_section_text(first_section) == "Hello world."

    def test_multiple_paragraphs(self):
        source = "# Title\n\nFirst paragraph.\n\nSecond paragraph."
        tree = build_section_tree(parse_markdown_to_ast(source))
        text = extract_section_text(tree["children"][0])
        assert "First paragraph." in text
        assert "Second paragraph." in text

    def test_empty_section(self):
        # A section with no content tokens yields an empty string.
        bare = {"content_tokens": []}
        assert extract_section_text(bare) == ""
|
||||||
0
tests/unit/infospace/__init__.py
Normal file
0
tests/unit/infospace/__init__.py
Normal file
230
tests/unit/infospace/test_entity_parser.py
Normal file
230
tests/unit/infospace/test_entity_parser.py
Normal file
@@ -0,0 +1,230 @@
|
|||||||
|
"""Tests for markitect.infospace.entity_parser and EntityMeta."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from markitect.infospace import EntityMeta, parse_entity_file, parse_entity_directory
|
||||||
|
|
||||||
|
|
||||||
|
# ── Fixtures ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Entity with a title-case H1 and all six sections present.
COMPLETE_ENTITY = """\
# Division of Labour

## Definition

The separation of a work process into a number of distinct tasks, each performed
by a specialised worker, resulting in a significant increase in the productive
powers of labour.

## Source Chapter

Book I, Chapter 1: "Of the Division of Labour"

## Context

The division of labour is the central argument of the chapter.

## Economic Domain

Production

## Smith's Original Wording

"The greatest improvements in the productive powers of labour…"

## Modern Interpretation

The division of labour remains a foundational concept in economics.
"""

# Entity without the "Smith's Original Wording" and
# "Modern Interpretation" sections.
MINIMAL_ENTITY = """\
# Minimal Entity

## Definition

A brief definition.

## Source Chapter

Book I, Chapter 1

## Context

Some context.

## Economic Domain

Exchange
"""

# Entity whose H1 is written as a slug rather than a title.
SLUG_H1_ENTITY = """\
# effectual-demand

## Definition

Effectual demand is the demand by consumers who are willing and able to pay.

## Source Chapter

Book 1, Chapter 7

## Context

Context for effectual demand.

## Economic Domain

Exchange

## Smith's Original Wording

"Such people may be called the effectual demanders…"

## Modern Interpretation

Represents the intersection of desire and purchasing power.
"""

# Document with no H1 at all.
NO_H1 = """\
## Only H2

Some content.
"""
|
||||||
|
|
||||||
|
|
||||||
|
# ── parse_entity_file ────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestParseEntityFile:
    """Checks for ``parse_entity_file`` on single fixture files.

    Fixtures are written explicitly as UTF-8: they contain non-ASCII
    characters and the parser reads files as UTF-8, so relying on the
    platform default encoding would make these tests locale-dependent.
    """

    def test_complete_entity(self, tmp_path):
        f = tmp_path / "division-of-labour.md"
        f.write_text(COMPLETE_ENTITY, encoding="utf-8")
        meta = parse_entity_file(f)

        assert meta.slug == "division_of_labour"
        assert meta.title == "Division of Labour"
        assert meta.h1_is_title_case is True
        assert meta.has_original_wording is True
        assert meta.domain == "Production"
        assert meta.definition_word_count > 20
        assert "separation" in meta.definition.lower()
        assert meta.source_path == str(f)
        assert "definition" in meta.section_slugs
        assert "smith_s_original_wording" in meta.section_slugs

    def test_minimal_entity(self, tmp_path):
        f = tmp_path / "minimal-entity.md"
        f.write_text(MINIMAL_ENTITY, encoding="utf-8")
        meta = parse_entity_file(f)

        assert meta.slug == "minimal_entity"
        assert meta.has_original_wording is False
        assert meta.original_wording == ""
        assert meta.modern_interpretation == ""
        assert meta.domain == "Exchange"

    def test_slug_format_h1(self, tmp_path):
        f = tmp_path / "effectual-demand.md"
        f.write_text(SLUG_H1_ENTITY, encoding="utf-8")
        meta = parse_entity_file(f)

        # The H1 is kept verbatim but is not title case.
        assert meta.h1_raw == "effectual-demand"
        assert meta.h1_is_title_case is False
        assert meta.slug == "effectual_demand"
        assert meta.has_original_wording is True

    def test_missing_h1_raises(self, tmp_path):
        f = tmp_path / "no-h1.md"
        f.write_text(NO_H1, encoding="utf-8")
        with pytest.raises(ValueError, match="No H1"):
            parse_entity_file(f)

    def test_missing_sections_return_empty(self, tmp_path):
        f = tmp_path / "minimal.md"
        f.write_text(MINIMAL_ENTITY, encoding="utf-8")
        meta = parse_entity_file(f)

        # Optional sections not present → empty string
        assert meta.original_wording == ""
        assert meta.modern_interpretation == ""

    def test_word_count_accuracy(self, tmp_path):
        f = tmp_path / "test.md"
        f.write_text(
            "# Test\n\n## Definition\n\none two three four five\n",
            encoding="utf-8",
        )
        meta = parse_entity_file(f)
        assert meta.definition_word_count == 5
|
||||||
|
|
||||||
|
|
||||||
|
# ── parse_entity_directory ──────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestParseEntityDirectory:
    """Checks for ``parse_entity_directory``.

    Fixture files are written explicitly as UTF-8 because they contain
    non-ASCII characters and the parser reads files as UTF-8.
    """

    def _make_dir(self, tmp_path):
        """Create a temporary entity directory."""
        d = tmp_path / "entities"
        d.mkdir()
        (d / "entity-a.md").write_text(COMPLETE_ENTITY, encoding="utf-8")
        (d / "entity-b.md").write_text(MINIMAL_ENTITY, encoding="utf-8")
        # files that should be excluded by default
        (d / "book-1-chapter-01-entities.md").write_text(
            "# View\n\nview file", encoding="utf-8"
        )
        (d / "book-1-chapter-01-prompt.md").write_text(
            "# Prompt\n\nprompt file", encoding="utf-8"
        )
        return d

    def test_excludes_view_and_prompt(self, tmp_path):
        d = self._make_dir(tmp_path)
        results = parse_entity_directory(d)
        slugs = {e.slug for e in results}

        assert "division_of_labour" in slugs
        assert "minimal_entity" in slugs
        # Excluded files should not be parsed as entities
        assert len(results) == 2

    def test_custom_exclude_patterns(self, tmp_path):
        d = self._make_dir(tmp_path)
        # Only exclude prompt files, allow entity views
        results = parse_entity_directory(d, exclude_patterns=[r".*-prompt\.md$"])
        assert len(results) == 3  # entity-a, entity-b, chapter-01-entities

    def test_malformed_skipped_with_warning(self, tmp_path, caplog):
        d = tmp_path / "entities"
        d.mkdir()
        (d / "good.md").write_text(COMPLETE_ENTITY, encoding="utf-8")
        (d / "bad.md").write_text(NO_H1, encoding="utf-8")

        with caplog.at_level(logging.WARNING):
            results = parse_entity_directory(d)

        # The file without an H1 is dropped and named in the warning.
        assert len(results) == 1
        assert "bad.md" in caplog.text
|
||||||
|
|
||||||
|
|
||||||
|
# ── EntityMeta round-trip ───────────────────────────────────────────
|
||||||
|
|
||||||
|
class TestEntityMetaRoundTrip:
    """Checks ``EntityMeta.to_dict`` / ``EntityMeta.from_dict``."""

    def test_to_dict_from_dict(self, tmp_path):
        f = tmp_path / "entity.md"
        # Written as UTF-8 explicitly: the fixture contains non-ASCII
        # characters and the parser reads the file as UTF-8.
        f.write_text(COMPLETE_ENTITY, encoding="utf-8")
        original = parse_entity_file(f)

        data = original.to_dict()
        restored = EntityMeta.from_dict(data)

        assert restored.slug == original.slug
        assert restored.title == original.title
        assert restored.definition == original.definition
        assert restored.h1_is_title_case == original.h1_is_title_case
        assert restored.section_slugs == original.section_slugs
        assert restored.definition_word_count == original.definition_word_count

    def test_from_dict_ignores_unknown_keys(self):
        data = {
            "slug": "test",
            "title": "Test",
            "h1_raw": "Test",
            "unknown_field": "should be ignored",
        }
        meta = EntityMeta.from_dict(data)
        assert meta.slug == "test"
        # The extra key must not end up stored on the instance.
        assert not hasattr(meta, "unknown_field") or "unknown_field" not in meta.__dict__
|
||||||
Reference in New Issue
Block a user