Files
markitect-main/markitect/infospace/models.py
tegwick 03c6c5e8de feat(infospace): add entity metadata parser (S1.1)
Extract section-tree algorithm from SchemaGenerator into standalone
core/section_tree.py and build markitect/infospace/ package with
EntityMeta dataclass and parse_entity_file/parse_entity_directory.
Foundation for schema compliance, coverage, and granularity metrics.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 00:27:45 +01:00

54 lines
1.5 KiB
Python

"""
Data models for infospace entity metadata.
"""
from dataclasses import dataclass, field, asdict
from typing import Any, Dict, List
@dataclass
class EntityMeta:
    """Structured metadata extracted from a single entity markdown file.

    The parser populates every field it can find; missing optional
    sections are left as empty strings (validation is a separate step).

    Only ``slug``, ``title`` and ``h1_raw`` are required; every other
    field has a default so a partially populated record can still be built.
    """

    # Identity
    slug: str
    title: str
    h1_raw: str  # verbatim H1 text before any normalisation

    # Section contents (plain text, empty string if section missing)
    definition: str = ""
    source_chapter: str = ""
    context: str = ""
    domain: str = ""
    original_wording: str = ""
    modern_interpretation: str = ""

    # Derived flags
    h1_is_title_case: bool = False
    has_original_wording: bool = False

    # Metrics-ready numbers
    definition_word_count: int = 0
    total_word_count: int = 0

    # All H2 section slugs found (preserves order)
    section_slugs: List[str] = field(default_factory=list)

    # Source file path (as string for serialisation)
    source_path: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dictionary.

        Returns:
            A new dictionary keyed by field name. ``asdict`` builds a fresh
            list for ``section_slugs``, so the result does not share
            mutable state with this instance.
        """
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EntityMeta":
        """Deserialise from a plain dictionary.

        Keys in ``data`` that do not name a dataclass field are ignored.

        Args:
            data: Mapping of field names to values, e.g. the output of
                :meth:`to_dict`. It is not modified.

        Returns:
            A new ``EntityMeta`` built from the recognised keys.

        Raises:
            TypeError: If a required field (``slug``, ``title`` or
                ``h1_raw``) is missing from ``data``.
        """
        # The keys of ``__dataclass_fields__`` are the field names.
        known_fields = cls.__dataclass_fields__
        filtered = {k: v for k, v in data.items() if k in known_fields}

        # Copy the slug list so the new instance does not alias the
        # caller's list; otherwise mutating one would silently change the
        # other. Non-list values are passed through unchanged.
        slugs = filtered.get("section_slugs")
        if isinstance(slugs, list):
            filtered["section_slugs"] = list(slugs)

        return cls(**filtered)