feat(infospace): add entity metadata parser (S1.1)
Extract section-tree algorithm from SchemaGenerator into standalone core/section_tree.py and build markitect/infospace/ package with EntityMeta dataclass and parse_entity_file/parse_entity_directory. Foundation for schema compliance, coverage, and granularity metrics. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
53
markitect/infospace/models.py
Normal file
53
markitect/infospace/models.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""
|
||||
Data models for infospace entity metadata.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from typing import Any, Dict, List
|
||||
|
||||
|
||||
@dataclass
class EntityMeta:
    """Structured metadata extracted from a single entity markdown file.

    The parser populates every field it can find; missing optional
    sections are left as empty strings (validation is a separate step).

    Only ``slug``, ``title`` and ``h1_raw`` are required; every other
    field has a default, so a partially populated instance is valid.
    """

    # Identity
    slug: str
    title: str
    h1_raw: str  # verbatim H1 text before any normalisation

    # Section contents (plain text, empty string if section missing)
    definition: str = ""
    source_chapter: str = ""
    context: str = ""
    domain: str = ""
    original_wording: str = ""
    modern_interpretation: str = ""

    # Derived flags
    h1_is_title_case: bool = False
    has_original_wording: bool = False

    # Metrics-ready numbers
    definition_word_count: int = 0
    total_word_count: int = 0

    # All H2 section slugs found (preserves order)
    section_slugs: List[str] = field(default_factory=list)

    # Source file path (as string for serialisation)
    source_path: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dictionary.

        Returns:
            A new dict keyed by field name. ``asdict`` copies list
            values, so mutating the result does not affect this instance.
        """
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EntityMeta":
        """Deserialise from a plain dictionary.

        Keys that do not correspond to a dataclass field are ignored.

        Args:
            data: Mapping of field names to values, e.g. the output of
                :meth:`to_dict`.

        Returns:
            A new ``EntityMeta`` built from the recognised keys.

        Raises:
            TypeError: If a required field (``slug``, ``title`` or
                ``h1_raw``) is absent from ``data``.
        """
        # __dataclass_fields__ maps field name -> Field, so its keys are
        # exactly the names accepted by the generated __init__.
        known_fields = set(cls.__dataclass_fields__)
        filtered = {k: v for k, v in data.items() if k in known_fields}

        # Copy the list so the new instance does not alias the caller's
        # data; this mirrors to_dict(), which also hands out a copy.
        slugs = filtered.get("section_slugs")
        if isinstance(slugs, list):
            filtered["section_slugs"] = list(slugs)

        return cls(**filtered)
|
||||
Reference in New Issue
Block a user