Files
markitect-main/markitect/schema_loader.py
tegwick b81ce5631d feat: implement Phase 2 - Markdown Schema Loader
Completed Phase 2 of the schema-of-schemas implementation with full
markdown schema support. This enables schemas to be authored as
markdown files with rich documentation and embedded JSON schemas.

Core Implementation (markitect/schema_loader.py):
- MarkdownSchemaLoader class with comprehensive parsing capabilities
- YAML frontmatter extraction with error handling
- JSON code block extraction with section preference (## Schema Definition)
- Metadata merging with x-markitect-source tracking
- Schema saving with template support and round-trip capability
- Helper methods: list_json_blocks(), validate_schema_structure()

Test Coverage (tests/test_schema_loader.py):
- 35 comprehensive unit tests (100% passing)
- Tests for loading, parsing, saving, round-trip conversion
- Edge case handling (empty files, binary files, malformed blocks)
- Fixed binary file test to use invalid UTF-8 sequences

Example Schema (markitect/schemas/manpage-schema-v1.0.md):
- First markdown schema following naming convention
- Complete manpage schema with frontmatter + documentation + JSON
- Demonstrates section classification and content control
- Shows proper structure for future schema authors

Documentation (roadmap/schema-of-schemas/SCHEMA_LOADER_GUIDE.md):
- Comprehensive user guide (600+ lines)
- API reference with examples
- Best practices and troubleshooting
- Integration patterns for CLI and validator

Progress Tracking:
- Updated TODO.md with Phase 2 completion
- Updated CHANGELOG.md with implementation details
- Next: Phase 3 - Schema-for-Schemas Metaschema

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-05 00:02:15 +01:00

504 lines
15 KiB
Python

"""
Schema Loader - Extract JSON schemas from markdown files.

This module provides functionality to load schemas from markdown files that
contain embedded JSON schemas in code blocks, along with YAML frontmatter
metadata and rich documentation.

Markdown Schema Format:

    ---
    schema-id: "https://markitect.dev/schemas/domain/v1"
    version: "1.0.0"
    status: "stable|draft|deprecated"
    ---

    # Schema Title v1.0

    ## Documentation sections...

    ## Schema Definition

    ```json
    {
      "$schema": "http://json-schema.org/draft-07/schema#",
      ...
    }
    ```

This enables:
- Rich documentation alongside schemas
- Version history in same file
- Human-readable schema files
- Markdown-first approach aligned with MarkiTect philosophy
"""
import re
import json
import yaml
from pathlib import Path
from typing import Dict, Any, Optional, List, Tuple
class SchemaLoaderError(Exception):
    """Root of the exception hierarchy raised while loading schemas."""
class InvalidSchemaFormatError(SchemaLoaderError):
    """Raised when a schema file's format is invalid."""
class SchemaNotFoundError(SchemaLoaderError):
    """Raised when a markdown file contains no JSON schema."""
class MarkdownSchemaLoader:
    """
    Load and parse markdown schema files.

    Supports:
    - YAML frontmatter for metadata
    - JSON code blocks for schema definition
    - Validation of schema structure
    - Metadata merging

    Example:
        >>> loader = MarkdownSchemaLoader()
        >>> schema_data = loader.load_schema(Path("manpage-schema-v1.0.md"))
        >>> schema = schema_data['schema']
        >>> metadata = schema_data['metadata']
    """

    def __init__(self):
        """Initialize the schema loader with regex patterns."""
        # Pattern to match YAML frontmatter: --- ... --- at the start of the
        # file. `\A` anchors to the very beginning of the content (an optional
        # BOM and leading blank lines are tolerated) so that a pair of `---`
        # horizontal rules further down the document is never mistaken for
        # frontmatter.
        self.frontmatter_pattern = re.compile(
            r'\A\ufeff?\s*---\s*\n(.*?)\n---\s*\n',
            re.DOTALL
        )
        # Pattern to match JSON code blocks: ```json ... ```
        self.json_code_block_pattern = re.compile(
            r'```json\s*\n(.*?)\n```',
            re.DOTALL | re.MULTILINE
        )
        # Pattern to find the Schema Definition section.
        # This helps pick the right JSON block if there are multiple.
        self.schema_section_pattern = re.compile(
            r'##\s+Schema Definition\s*\n',
            re.MULTILINE
        )

    def load_schema(self, md_path: Path) -> Dict[str, Any]:
        """
        Load schema from markdown file.

        Args:
            md_path: Path to markdown schema file (a plain string path is
                also accepted and converted to ``Path``)

        Returns:
            Dictionary containing:
            - schema: Extracted JSON schema with merged metadata (dict)
            - metadata: Frontmatter metadata (dict)
            - documentation: Full markdown content (str)
            - source_file: Source file path (str)

        Raises:
            FileNotFoundError: If schema file doesn't exist
            InvalidSchemaFormatError: If the file cannot be read or decoded,
                or its frontmatter / JSON is malformed
            SchemaNotFoundError: If no JSON schema found

        Example:
            >>> loader = MarkdownSchemaLoader()
            >>> data = loader.load_schema(Path("manpage-schema-v1.0.md"))
            >>> print(data['schema']['title'])
            Unix Manual Page Schema
        """
        md_path = Path(md_path)
        if not md_path.exists():
            raise FileNotFoundError(f"Schema file not found: {md_path}")

        # Read file content; I/O failures and undecodable (e.g. binary)
        # files are reported as a format error.
        try:
            content = md_path.read_text(encoding='utf-8')
        except (OSError, UnicodeError) as e:
            raise InvalidSchemaFormatError(
                f"Failed to read schema file: {e}"
            ) from e

        metadata = self._extract_frontmatter(content)
        schema = self._extract_json_schema(content)

        # NOTE: an empty JSON object ({}) is falsy and is therefore treated
        # the same as "no JSON block found".
        if not schema:
            raise SchemaNotFoundError(
                f"No JSON schema found in {md_path}. "
                f"Expected a ```json code block with schema definition."
            )

        # Merge metadata into schema
        schema = self._merge_metadata(schema, metadata, md_path)

        return {
            'schema': schema,
            'metadata': metadata,
            'documentation': content,
            'source_file': str(md_path)
        }

    def _extract_frontmatter(self, content: str) -> Dict[str, Any]:
        """
        Extract YAML frontmatter from markdown content.

        Only a ``---`` delimited block at the start of the content counts as
        frontmatter.

        Args:
            content: Markdown file content

        Returns:
            Dictionary of frontmatter metadata (empty if none found)

        Raises:
            InvalidSchemaFormatError: If YAML is malformed or is not a mapping
        """
        match = self.frontmatter_pattern.search(content)
        if not match:
            return {}

        try:
            # Empty frontmatter parses to None; normalise it to {}.
            metadata = yaml.safe_load(match.group(1)) or {}
        except yaml.YAMLError as e:
            raise InvalidSchemaFormatError(
                f"Invalid YAML frontmatter: {e}"
            ) from e

        if not isinstance(metadata, dict):
            raise InvalidSchemaFormatError(
                f"Frontmatter must be a YAML dictionary, got {type(metadata)}"
            )
        return metadata

    def _extract_json_schema(self, content: str) -> Optional[Dict[str, Any]]:
        """
        Extract JSON schema from markdown code blocks.

        Prefers the first JSON block that appears after a
        "## Schema Definition" heading, and otherwise falls back to the first
        JSON block in the document.

        Args:
            content: Markdown file content

        Returns:
            JSON schema dictionary or None if no JSON block is found

        Raises:
            InvalidSchemaFormatError: If JSON is malformed or not an object
        """
        block = self.json_code_block_pattern.search(content)
        if block is None:
            return None

        # If a Schema Definition heading exists, prefer the first JSON block
        # that follows it; keep the document's first block as the fallback.
        heading = self.schema_section_pattern.search(content)
        if heading:
            section_block = self.json_code_block_pattern.search(
                content, heading.end()
            )
            if section_block:
                block = section_block

        try:
            schema = json.loads(block.group(1))
        except json.JSONDecodeError as e:
            raise InvalidSchemaFormatError(f"Invalid JSON schema: {e}") from e

        if not isinstance(schema, dict):
            raise InvalidSchemaFormatError(
                f"Schema must be a JSON object, got {type(schema)}"
            )
        return schema

    def _merge_metadata(
        self,
        schema: Dict[str, Any],
        metadata: Dict[str, Any],
        source_file: Path
    ) -> Dict[str, Any]:
        """
        Merge frontmatter metadata into schema.

        Adds the x-markitect-source extension with file info and metadata,
        and overrides ``version``, ``$id`` and the x-markitect-metadata
        ``status`` with frontmatter values when those are present.

        Args:
            schema: JSON schema dictionary (left unmodified)
            metadata: Frontmatter metadata dictionary
            source_file: Path to source file

        Returns:
            New schema dictionary with merged metadata
        """
        # Shallow copy: top-level keys are replaced, never edited in place.
        merged_schema = schema.copy()

        # Add MarkiTect-specific source metadata
        merged_schema['x-markitect-source'] = {
            'file': str(source_file),
            'filename': source_file.name,
            'format': 'markdown',
            'frontmatter': metadata
        }

        # Override schema fields with frontmatter if present.
        # This allows frontmatter to be the source of truth for metadata.
        if 'version' in metadata:
            merged_schema['version'] = metadata['version']
        if 'schema-id' in metadata:
            merged_schema['$id'] = metadata['schema-id']
        if 'status' in metadata:
            # Copy the nested dict before writing to it; the shallow copy
            # above still shares it with the caller's schema. A missing or
            # non-dict value is replaced by a fresh dict.
            existing = merged_schema.get('x-markitect-metadata')
            markitect_meta = dict(existing) if isinstance(existing, dict) else {}
            markitect_meta['status'] = metadata['status']
            merged_schema['x-markitect-metadata'] = markitect_meta

        return merged_schema

    def save_schema(
        self,
        schema: Dict[str, Any],
        md_path: Path,
        template: Optional[str] = None,
        frontmatter: Optional[Dict[str, Any]] = None
    ) -> None:
        """
        Save schema as markdown file.

        Parent directories of ``md_path`` are created if needed.

        Args:
            schema: JSON schema dictionary to save
            md_path: Output path for markdown file (a plain string path is
                also accepted and converted to ``Path``)
            template: Optional markdown template string
            frontmatter: Optional frontmatter metadata (defaults are derived
                from the schema when no template is used)

        Raises:
            InvalidSchemaFormatError: If the template cannot be rendered or
                the file cannot be written

        Example:
            >>> loader = MarkdownSchemaLoader()
            >>> loader.save_schema(
            ...     schema={'title': 'My Schema', ...},
            ...     md_path=Path('my-schema-v1.0.md')
            ... )
        """
        md_path = Path(md_path)

        if template:
            # Use provided template
            content = self._render_template(template, schema, frontmatter)
        else:
            # Generate basic markdown
            content = self._generate_markdown(schema, frontmatter)

        # Create parent directory if needed
        md_path.parent.mkdir(parents=True, exist_ok=True)

        try:
            md_path.write_text(content, encoding='utf-8')
        except (OSError, UnicodeError) as e:
            raise InvalidSchemaFormatError(
                f"Failed to write schema file: {e}"
            ) from e

    def _generate_markdown(
        self,
        schema: Dict[str, Any],
        frontmatter: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Generate markdown from schema.

        Args:
            schema: JSON schema dictionary
            frontmatter: Optional frontmatter metadata (not modified; missing
                ``schema-id``, ``version`` and ``status`` entries are filled
                in on a copy)

        Returns:
            Markdown content as string
        """
        # Extract metadata from schema
        title = schema.get('title', 'Untitled Schema')
        version = schema.get('version', '1.0.0')
        description = schema.get('description', '')
        schema_id = schema.get('$id', '')

        # Work on a copy so the caller's dict does not pick up the defaults.
        frontmatter = dict(frontmatter) if frontmatter is not None else {}
        if 'schema-id' not in frontmatter and schema_id:
            frontmatter['schema-id'] = schema_id
        frontmatter.setdefault('version', version)
        frontmatter.setdefault('status', 'draft')

        # Generate frontmatter YAML
        frontmatter_yaml = yaml.dump(
            frontmatter,
            default_flow_style=False,
            allow_unicode=True
        ).strip()

        # Generate JSON (pretty-printed)
        schema_json = json.dumps(schema, indent=2, ensure_ascii=False)

        # Build markdown content. The template lines start at column 0 on
        # purpose: they are emitted verbatim into the generated file.
        md_content = f"""---
{frontmatter_yaml}
---
# {title} v{version}
## Overview
{description}
## Usage
```bash
markitect validate document.md --schema {Path(frontmatter.get('schema-id', 'schema')).name}
```
## Schema Definition
```json
{schema_json}
```
## Version History
### v{version}
- Initial version
"""
        return md_content

    def _render_template(
        self,
        template: str,
        schema: Dict[str, Any],
        frontmatter: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Render markdown from template.

        Simple template rendering using ``str.format``; literal braces in the
        template must be doubled (``{{`` / ``}}``). For complex templates,
        consider using Jinja2 or similar.

        Available placeholders: ``title``, ``version``, ``description``,
        ``schema_id``, ``schema_json``, ``frontmatter``.

        Args:
            template: Template string
            schema: JSON schema dictionary
            frontmatter: Optional frontmatter metadata

        Returns:
            Rendered markdown content

        Raises:
            InvalidSchemaFormatError: If the template references an unknown
                key or is not a valid format string
        """
        # Build context for template
        context = {
            'title': schema.get('title', 'Untitled'),
            'version': schema.get('version', '1.0.0'),
            'description': schema.get('description', ''),
            'schema_id': schema.get('$id', ''),
            'schema_json': json.dumps(schema, indent=2, ensure_ascii=False),
            'frontmatter': frontmatter or {},
        }

        try:
            return template.format(**context)
        except KeyError as e:
            raise InvalidSchemaFormatError(f"Template missing key: {e}") from e
        except (IndexError, ValueError) as e:
            # Positional placeholders ({0}) or unbalanced braces.
            raise InvalidSchemaFormatError(f"Invalid template: {e}") from e

    def list_json_blocks(self, content: str) -> List[Tuple[int, str]]:
        """
        List all JSON code blocks in markdown content.

        Useful for debugging or when multiple JSON blocks exist.

        Args:
            content: Markdown file content

        Returns:
            List of (position, json_content) tuples, where position is the
            offset of the opening fence in ``content``

        Example:
            >>> loader = MarkdownSchemaLoader()
            >>> content = Path('schema.md').read_text()
            >>> blocks = loader.list_json_blocks(content)
            >>> print(f"Found {len(blocks)} JSON blocks")
        """
        return [
            (match.start(), match.group(1))
            for match in self.json_code_block_pattern.finditer(content)
        ]

    def validate_schema_structure(self, schema: Dict[str, Any]) -> List[str]:
        """
        Validate basic schema structure.

        Checks for required JSON Schema fields and MarkiTect conventions.

        Args:
            schema: JSON schema dictionary

        Returns:
            List of warning/error messages (empty if valid)

        Example:
            >>> loader = MarkdownSchemaLoader()
            >>> issues = loader.validate_schema_structure(schema)
            >>> if issues:
            ...     print("Schema issues:", issues)
        """
        # (field, message reported when the field is absent), in report order.
        presence_checks = (
            ('$schema', "Missing required field: $schema"),
            ('type', "Missing recommended field: type"),
            ('title', "Missing recommended field: title"),
            ('description', "Missing recommended field: description"),
            ('version', "Missing MarkiTect convention: version field"),
            ('$id', "Missing recommended field: $id"),
        )
        issues = [
            message for field, message in presence_checks
            if field not in schema
        ]

        # Check $id format if present
        if '$id' in schema:
            schema_id = schema['$id']
            if not isinstance(schema_id, str):
                issues.append("$id must be a string")
            elif not schema_id.startswith('https://'):
                issues.append("$id should be a full HTTPS URL")

        return issues