Extract section-tree algorithm from SchemaGenerator into standalone core/section_tree.py and build markitect/infospace/ package with EntityMeta dataclass and parse_entity_file/parse_entity_directory. Foundation for schema compliance, coverage, and granularity metrics. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
54 lines
1.5 KiB
Python
54 lines
1.5 KiB
Python
"""
Data models for infospace entity metadata.
"""
|
|
|
|
from dataclasses import dataclass, field, asdict
|
|
from typing import Any, Dict, List
|
|
|
|
|
|
@dataclass
class EntityMeta:
    """Structured metadata extracted from a single entity markdown file.

    The parser populates every field it can find; missing optional
    sections are left as empty strings (validation is a separate step).

    Only ``slug``, ``title`` and ``h1_raw`` are required; every other
    field has a default so a partially populated entity can still be
    represented.
    """

    # Identity
    slug: str
    title: str
    h1_raw: str  # verbatim H1 text before any normalisation

    # Section contents (plain text, empty string if section missing)
    definition: str = ""
    source_chapter: str = ""
    context: str = ""
    domain: str = ""
    original_wording: str = ""
    modern_interpretation: str = ""

    # Derived flags
    h1_is_title_case: bool = False
    has_original_wording: bool = False

    # Metrics-ready numbers
    definition_word_count: int = 0
    total_word_count: int = 0

    # All H2 section slugs found (preserves order)
    section_slugs: List[str] = field(default_factory=list)

    # Source file path (as string for serialisation)
    source_path: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dictionary.

        Returns:
            A new dict keyed by field name. ``dataclasses.asdict`` copies
            the ``section_slugs`` list, so mutating the result does not
            affect this instance.
        """
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EntityMeta":
        """Deserialise from a plain dictionary.

        Keys that do not correspond to a dataclass field are ignored, so
        dictionaries carrying extra keys can still be loaded.

        Args:
            data: Mapping of field names to values. Must contain at least
                ``slug``, ``title`` and ``h1_raw``.

        Returns:
            A new ``EntityMeta`` built from the recognised keys.

        Raises:
            TypeError: If a required field is missing from ``data``.
        """
        known_fields = set(cls.__dataclass_fields__)
        filtered = {k: v for k, v in data.items() if k in known_fields}

        # Copy the list so the new instance does not alias the caller's
        # data; this mirrors to_dict(), which also returns a copy.
        slugs = filtered.get("section_slugs")
        if isinstance(slugs, list):
            filtered["section_slugs"] = list(slugs)

        return cls(**filtered)
|