Files
markitect-main/tests/test_schema_loader.py
tegwick b81ce5631d feat: implement Phase 2 - Markdown Schema Loader
Completed Phase 2 of the schema-of-schemas implementation with full
markdown schema support. This enables schemas to be authored as
markdown files with rich documentation and embedded JSON schemas.

Core Implementation (markitect/schema_loader.py):
- MarkdownSchemaLoader class with comprehensive parsing capabilities
- YAML frontmatter extraction with error handling
- JSON code block extraction with section preference (## Schema Definition)
- Metadata merging with x-markitect-source tracking
- Schema saving with template support and round-trip capability
- Helper methods: list_json_blocks(), validate_schema_structure()

Test Coverage (tests/test_schema_loader.py):
- 35 comprehensive unit tests (100% passing)
- Tests for loading, parsing, saving, round-trip conversion
- Edge case handling (empty files, binary files, malformed blocks)
- Fixed binary file test to use invalid UTF-8 sequences

Example Schema (markitect/schemas/manpage-schema-v1.0.md):
- First markdown schema following naming convention
- Complete manpage schema with frontmatter + documentation + JSON
- Demonstrates section classification and content control
- Shows proper structure for future schema authors

Documentation (roadmap/schema-of-schemas/SCHEMA_LOADER_GUIDE.md):
- Comprehensive user guide (600+ lines)
- API reference with examples
- Best practices and troubleshooting
- Integration patterns for CLI and validator

Progress Tracking:
- Updated TODO.md with Phase 2 completion
- Updated CHANGELOG.md with implementation details
- Next: Phase 3 - Schema-for-Schemas Metaschema

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-05 00:02:15 +01:00

689 lines
20 KiB
Python

"""
Unit tests for schema_loader.py - Markdown schema loading.
Tests the markdown schema loader functionality including:
- Frontmatter extraction (YAML)
- JSON schema extraction from code blocks
- Metadata merging
- Schema saving
- Error handling
"""
import pytest
import json
import yaml
from pathlib import Path
from markitect.schema_loader import (
MarkdownSchemaLoader,
SchemaLoaderError,
InvalidSchemaFormatError,
SchemaNotFoundError
)
# Test fixtures
@pytest.fixture
def temp_schema_dir(tmp_path):
    """Return a freshly created ``schemas`` directory under pytest's ``tmp_path``."""
    target = tmp_path.joinpath("schemas")
    target.mkdir()
    return target
@pytest.fixture
def simple_schema_md():
    """Return markdown text for a valid schema: frontmatter, prose, one JSON block."""
    # YAML frontmatter delimited by '---' lines.
    frontmatter = """---
schema-id: "https://markitect.dev/schemas/test/v1"
version: "1.0.0"
status: "stable"
---
"""
    # Title and overview prose.
    intro = """# Test Schema v1.0
## Overview
This is a test schema for validation.
"""
    # The JSON schema itself, under the "Schema Definition" heading.
    definition = """## Schema Definition
```json
{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://markitect.dev/schemas/test/v1",
"version": "1.0.0",
"title": "Test Schema",
"description": "Schema for testing",
"type": "object",
"properties": {
"name": {"type": "string"}
}
}
```
"""
    # Trailing changelog section.
    history = """## Version History
### v1.0.0
- Initial version
"""
    return frontmatter + intro + definition + history
@pytest.fixture
def schema_without_frontmatter():
    """Return markdown text with a single JSON block and no YAML frontmatter."""
    headings = "# Test Schema v1.0\n## Schema Definition\n"
    json_block = """```json
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Test Schema",
"type": "object"
}
```
"""
    return headings + json_block
@pytest.fixture
def schema_multiple_json_blocks():
    """Return markdown text holding three JSON blocks; only the middle one is the schema."""
    sections = [
        # Frontmatter and document title.
        """---
version: "1.0.0"
---
# Test Schema
""",
        # First block: an example, not the schema.
        """## Example Usage
```json
{
"example": "This is not the schema"
}
```
""",
        # Second block: the actual schema, under "Schema Definition".
        """## Schema Definition
```json
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Test Schema",
"type": "object"
}
```
""",
        # Third block: another example.
        """## More Examples
```json
{
"another": "example"
}
```
""",
    ]
    return "".join(sections)
class TestMarkdownSchemaLoader:
    """Exercise loader construction and ``load_schema`` on files written to disk."""

    def test_init(self):
        """A new loader exists and carries both pattern attributes."""
        instance = MarkdownSchemaLoader()
        assert instance is not None
        for attr_name in ('frontmatter_pattern', 'json_code_block_pattern'):
            assert hasattr(instance, attr_name)

    def test_load_simple_schema(self, temp_schema_dir, simple_schema_md):
        """Loading a well-formed file yields schema, metadata and source info."""
        md_path = temp_schema_dir / "test-schema-v1.0.md"
        md_path.write_text(simple_schema_md)

        loaded = MarkdownSchemaLoader().load_schema(md_path)

        # The result must expose all four top-level entries.
        for entry in ('schema', 'metadata', 'documentation', 'source_file'):
            assert entry in loaded

        # Schema content.
        parsed = loaded['schema']
        expected_schema = {
            'title': 'Test Schema',
            'version': '1.0.0',
            'type': 'object',
        }
        for field, wanted in expected_schema.items():
            assert parsed[field] == wanted

        # Frontmatter metadata.
        front = loaded['metadata']
        for field, wanted in {'version': '1.0.0', 'status': 'stable'}.items():
            assert front[field] == wanted

        # Source tracking.
        assert loaded['source_file'] == str(md_path)
        assert 'x-markitect-source' in parsed
        assert parsed['x-markitect-source']['format'] == 'markdown'

    def test_load_schema_file_not_found(self):
        """A path that does not exist makes ``load_schema`` raise FileNotFoundError."""
        missing = Path("/nonexistent/schema.md")
        instance = MarkdownSchemaLoader()
        with pytest.raises(FileNotFoundError, match="Schema file not found"):
            instance.load_schema(missing)

    def test_load_schema_without_json(self, temp_schema_dir):
        """Markdown lacking any JSON block makes ``load_schema`` raise SchemaNotFoundError."""
        md_path = temp_schema_dir / "no-schema.md"
        md_path.write_text("# Just a heading\n\nNo schema here.")
        instance = MarkdownSchemaLoader()
        with pytest.raises(SchemaNotFoundError, match="No JSON schema found"):
            instance.load_schema(md_path)

    def test_load_schema_invalid_json(self, temp_schema_dir):
        """A JSON block that does not parse makes ``load_schema`` raise InvalidSchemaFormatError."""
        broken = (
            "# Test\n"
            "```json\n"
            "{invalid json}\n"
            "```\n"
        )
        md_path = temp_schema_dir / "invalid.md"
        md_path.write_text(broken)
        instance = MarkdownSchemaLoader()
        with pytest.raises(InvalidSchemaFormatError, match="Invalid JSON"):
            instance.load_schema(md_path)
class TestExtractFrontmatter:
    """Tests for ``MarkdownSchemaLoader._extract_frontmatter``.

    Each test feeds a markdown string to the method and checks either the
    returned metadata dict or the ``InvalidSchemaFormatError`` it raises.
    """

    def test_extract_valid_frontmatter(self, simple_schema_md):
        """Valid YAML frontmatter is returned as a dict of its keys."""
        loader = MarkdownSchemaLoader()
        metadata = loader._extract_frontmatter(simple_schema_md)
        assert metadata['schema-id'] == 'https://markitect.dev/schemas/test/v1'
        assert metadata['version'] == '1.0.0'
        assert metadata['status'] == 'stable'

    def test_extract_no_frontmatter(self, schema_without_frontmatter):
        """Content without frontmatter yields an empty dict."""
        loader = MarkdownSchemaLoader()
        metadata = loader._extract_frontmatter(schema_without_frontmatter)
        assert metadata == {}

    def test_extract_invalid_yaml_frontmatter(self):
        """Frontmatter that is not valid YAML raises InvalidSchemaFormatError."""
        content = (
            '---\n'
            'invalid: yaml: syntax: error\n'
            '---\n'
            '# Content\n'
        )
        loader = MarkdownSchemaLoader()
        with pytest.raises(InvalidSchemaFormatError, match="Invalid YAML"):
            loader._extract_frontmatter(content)

    def test_extract_non_dict_frontmatter(self):
        """Frontmatter whose top level is a YAML list raises InvalidSchemaFormatError."""
        content = (
            '---\n'
            '- list\n'
            '- not\n'
            '- dict\n'
            '---\n'
            '# Content\n'
        )
        loader = MarkdownSchemaLoader()
        with pytest.raises(InvalidSchemaFormatError, match="must be a YAML dictionary"):
            loader._extract_frontmatter(content)

    def test_extract_complex_frontmatter(self):
        """Nested lists and mappings in the frontmatter survive extraction."""
        # The children of ``metadata:`` must be indented deeper than the key.
        # At the same column YAML reads them as top-level keys and leaves
        # ``metadata`` null, which would break the nested lookup below.
        content = (
            '---\n'
            'schema-id: "https://example.com/schema"\n'
            'version: "1.0.0"\n'
            'tags:\n'
            '  - documentation\n'
            '  - schema\n'
            'metadata:\n'
            '  author: "Test Author"\n'
            '  created: "2026-01-04"\n'
            '---\n'
            '# Content\n'
        )
        loader = MarkdownSchemaLoader()
        metadata = loader._extract_frontmatter(content)
        assert metadata['tags'] == ['documentation', 'schema']
        assert metadata['metadata']['author'] == 'Test Author'
class TestExtractJsonSchema:
    """Checks for ``_extract_json_schema`` on in-memory markdown strings."""

    def test_extract_single_json_block(self, schema_without_frontmatter):
        """The only JSON block in the document is returned as the schema."""
        extracted = MarkdownSchemaLoader()._extract_json_schema(schema_without_frontmatter)
        assert extracted is not None
        for key, wanted in (('title', 'Test Schema'), ('type', 'object')):
            assert extracted[key] == wanted

    def test_extract_from_schema_definition_section(self, schema_multiple_json_blocks):
        """With several JSON blocks, the one under "Schema Definition" wins."""
        extracted = MarkdownSchemaLoader()._extract_json_schema(schema_multiple_json_blocks)
        assert extracted is not None
        # Should get the schema from Schema Definition section, not the example
        assert extracted['title'] == 'Test Schema'

    def test_extract_no_json_block(self):
        """Markdown without code blocks gives ``None``."""
        plain_text = "# Just text\n\nNo code blocks here."
        extracted = MarkdownSchemaLoader()._extract_json_schema(plain_text)
        assert extracted is None

    def test_extract_invalid_json_block(self):
        """An unparseable JSON block raises InvalidSchemaFormatError."""
        broken = (
            "# Test\n"
            "```json\n"
            "{invalid}\n"
            "```\n"
        )
        instance = MarkdownSchemaLoader()
        with pytest.raises(InvalidSchemaFormatError, match="Invalid JSON"):
            instance._extract_json_schema(broken)

    def test_extract_non_object_json(self):
        """A JSON block holding an array instead of an object raises InvalidSchemaFormatError."""
        array_doc = (
            '# Test\n'
            '```json\n'
            '["array", "not", "object"]\n'
            '```\n'
        )
        instance = MarkdownSchemaLoader()
        with pytest.raises(InvalidSchemaFormatError, match="must be a JSON object"):
            instance._extract_json_schema(array_doc)
class TestMergeMetadata:
    """Checks for ``_merge_metadata``, which combines frontmatter with a schema dict."""

    def test_merge_basic_metadata(self):
        """Frontmatter values land in the expected places of the merged schema."""
        base = {'title': 'Test Schema', 'type': 'object'}
        front = {
            'version': '2.0.0',
            'schema-id': 'https://example.com/v2',
            'status': 'draft',
        }
        result = MarkdownSchemaLoader()._merge_metadata(base, front, Path('test.md'))

        # Version comes from the frontmatter.
        assert result['version'] == '2.0.0'
        # schema-id is exposed as $id.
        assert result['$id'] == 'https://example.com/v2'
        # Status is stored under x-markitect-metadata.
        assert result['x-markitect-metadata']['status'] == 'draft'
        # Source tracking is attached.
        provenance = {'file': 'test.md', 'format': 'markdown'}
        for key, wanted in provenance.items():
            assert result['x-markitect-source'][key] == wanted

    def test_merge_preserves_schema_fields(self):
        """Merging empty frontmatter leaves the schema's own fields in place."""
        base = {
            'title': 'Test',
            'type': 'object',
            'properties': {'name': {'type': 'string'}},
        }
        result = MarkdownSchemaLoader()._merge_metadata(base, {}, Path('test.md'))
        for key, wanted in (('title', 'Test'), ('type', 'object')):
            assert result[key] == wanted
        assert 'properties' in result

    def test_merge_frontmatter_takes_precedence(self):
        """When both sides define a value, the frontmatter one is kept."""
        base = {'version': '1.0.0', '$id': 'old-id'}
        front = {'version': '2.0.0', 'schema-id': 'new-id'}
        result = MarkdownSchemaLoader()._merge_metadata(base, front, Path('test.md'))
        for key, wanted in (('version', '2.0.0'), ('$id', 'new-id')):
            assert result[key] == wanted
class TestSaveSchema:
    """Checks for ``save_schema``: writing schema dicts out as markdown files."""

    def test_save_simple_schema(self, temp_schema_dir):
        """A saved schema file exists and contains frontmatter, title and JSON."""
        source = {
            '$schema': 'http://json-schema.org/draft-07/schema#',
            '$id': 'https://example.com/schema/v1',
            'version': '1.0.0',
            'title': 'Test Schema',
            'description': 'A test schema',
            'type': 'object',
        }
        destination = temp_schema_dir / 'output-schema-v1.0.md'
        MarkdownSchemaLoader().save_schema(source, destination)
        assert destination.exists()

        written = destination.read_text()
        expected_fragments = (
            '---',                      # Frontmatter
            'Test Schema v1.0.0',       # Title
            '```json',                  # JSON block
            '"title": "Test Schema"',
        )
        for fragment in expected_fragments:
            assert fragment in written

    def test_save_creates_parent_directory(self, temp_schema_dir):
        """Missing parent directories of the target path are created on save."""
        source = {'title': 'Test', 'type': 'object'}
        destination = temp_schema_dir.joinpath('nested', 'dir', 'schema.md')
        MarkdownSchemaLoader().save_schema(source, destination)
        assert destination.exists()
        assert destination.parent.exists()

    def test_save_with_custom_frontmatter(self, temp_schema_dir):
        """Values passed via ``frontmatter=`` appear in the written file."""
        source = {'title': 'Test', 'type': 'object'}
        custom = {
            'schema-id': 'https://custom.com/schema',
            'status': 'experimental',
            'tags': ['test', 'custom'],
        }
        destination = temp_schema_dir / 'custom.md'
        MarkdownSchemaLoader().save_schema(source, destination, frontmatter=custom)

        written = destination.read_text()
        for fragment in ('experimental', 'https://custom.com/schema'):
            assert fragment in written

    def test_save_and_reload_roundtrip(self, temp_schema_dir):
        """Saving then loading returns the same title, type and properties."""
        instance = MarkdownSchemaLoader()
        source = {
            '$schema': 'http://json-schema.org/draft-07/schema#',
            'version': '1.0.0',
            'title': 'Roundtrip Test',
            'type': 'object',
            'properties': {
                'name': {'type': 'string'},
                'age': {'type': 'integer'},
            },
        }
        md_path = temp_schema_dir / 'roundtrip-schema-v1.0.md'
        instance.save_schema(source, md_path)

        reloaded = instance.load_schema(md_path)['schema']

        # Only key fields are compared; loading adds x-markitect-source.
        for key in ('title', 'type', 'properties'):
            assert reloaded[key] == source[key]
class TestGenerateMarkdown:
    """Checks for ``_generate_markdown``, which renders a schema dict as markdown text."""

    def test_generate_basic_markdown(self):
        """The rendered text contains title, description, JSON block and frontmatter."""
        source = {
            'title': 'Test Schema',
            'version': '1.0.0',
            'description': 'Test description',
            'type': 'object',
        }
        rendered = MarkdownSchemaLoader()._generate_markdown(source)
        expected_fragments = (
            'Test Schema v1.0.0',
            'Test description',
            '```json',
            '"title": "Test Schema"',
            '---',  # Frontmatter
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_generate_includes_frontmatter(self):
        """The rendered text opens with YAML frontmatter carrying version and schema-id."""
        source = {
            '$id': 'https://example.com/schema',
            'title': 'Test',
            'version': '2.0.0',
            'type': 'object',
        }
        rendered_lines = MarkdownSchemaLoader()._generate_markdown(source).split('\n')

        # The first line opens the frontmatter.
        assert rendered_lines[0] == '---'
        # The next '---' after the opening one closes it.
        closing = rendered_lines.index('---', 1)
        parsed = yaml.safe_load('\n'.join(rendered_lines[1:closing]))

        assert parsed['version'] == '2.0.0'
        assert parsed['schema-id'] == 'https://example.com/schema'
class TestListJsonBlocks:
    """Checks for ``list_json_blocks``; the tests read each block's text at index 1."""

    def test_list_single_block(self, schema_without_frontmatter):
        """A document with one JSON block lists exactly that block."""
        found = MarkdownSchemaLoader().list_json_blocks(schema_without_frontmatter)
        assert len(found) == 1
        assert '"title": "Test Schema"' in found[0][1]

    def test_list_multiple_blocks(self, schema_multiple_json_blocks):
        """Three JSON blocks are listed in document order."""
        found = MarkdownSchemaLoader().list_json_blocks(schema_multiple_json_blocks)
        assert len(found) == 3
        # One marker per block: first example, the schema, second example.
        markers = ('"example"', '"title": "Test Schema"', '"another"')
        for position, marker in enumerate(markers):
            assert marker in found[position][1]

    def test_list_no_blocks(self):
        """A document without JSON blocks lists nothing."""
        plain_text = "# Just text\n\nNo code blocks."
        found = MarkdownSchemaLoader().list_json_blocks(plain_text)
        assert len(found) == 0
class TestValidateSchemaStructure:
    """Checks for ``validate_schema_structure``, which returns a collection of issue strings."""

    def test_validate_complete_schema(self):
        """A schema with every expected field produces no issues."""
        complete = {
            '$schema': 'http://json-schema.org/draft-07/schema#',
            '$id': 'https://example.com/schema',
            'version': '1.0.0',
            'title': 'Test Schema',
            'description': 'Test description',
            'type': 'object',
        }
        problems = MarkdownSchemaLoader().validate_schema_structure(complete)
        assert len(problems) == 0

    def test_validate_missing_required_fields(self):
        """A bare schema is reported as lacking $schema, title and description."""
        problems = MarkdownSchemaLoader().validate_schema_structure({'type': 'object'})
        assert len(problems) > 0
        for field in ('$schema', 'title', 'description'):
            assert any(field in problem for problem in problems)

    def test_validate_missing_version(self):
        """A schema without ``version`` gets an issue mentioning it."""
        unversioned = {
            '$schema': 'http://json-schema.org/draft-07/schema#',
            'title': 'Test',
            'type': 'object',
        }
        problems = MarkdownSchemaLoader().validate_schema_structure(unversioned)
        assert any('version' in problem for problem in problems)

    def test_validate_invalid_id_format(self):
        """An ``$id`` using plain HTTP gets an issue mentioning HTTPS."""
        insecure = {
            '$schema': 'http://json-schema.org/draft-07/schema#',
            '$id': 'http://example.com/schema',  # HTTP not HTTPS
            'version': '1.0.0',
            'title': 'Test',
            'type': 'object',
        }
        problems = MarkdownSchemaLoader().validate_schema_structure(insecure)
        assert any('HTTPS' in problem for problem in problems)
class TestEdgeCases:
    """Checks for unusual inputs to ``load_schema``: empty, binary, malformed, very large."""

    def test_load_empty_file(self, temp_schema_dir):
        """An empty file raises SchemaNotFoundError."""
        md_path = temp_schema_dir / 'empty.md'
        md_path.write_text('')
        instance = MarkdownSchemaLoader()
        with pytest.raises(SchemaNotFoundError):
            instance.load_schema(md_path)

    def test_load_binary_file(self, temp_schema_dir):
        """A file holding bytes that are not valid UTF-8 raises InvalidSchemaFormatError."""
        # Invalid UTF-8 sequences that will trigger UnicodeDecodeError.
        junk = b'\xff\xfe\x00\x00\x80\x81\x82'
        md_path = temp_schema_dir / 'binary.md'
        md_path.write_bytes(junk)
        instance = MarkdownSchemaLoader()
        with pytest.raises(InvalidSchemaFormatError):
            instance.load_schema(md_path)

    def test_malformed_code_block(self, temp_schema_dir):
        """A JSON fence that is never closed raises SchemaNotFoundError."""
        unterminated = (
            '# Test\n'
            '```json\n'
            '{"valid": "json"\n'
            '# Missing closing backticks\n'
        )
        md_path = temp_schema_dir / 'malformed.md'
        md_path.write_text(unterminated)
        instance = MarkdownSchemaLoader()
        with pytest.raises(SchemaNotFoundError):
            instance.load_schema(md_path)

    def test_very_large_schema(self, temp_schema_dir):
        """A schema with 1000 properties loads with all properties present."""
        # Build a schema with many properties.
        many_properties = {
            f'prop_{index}': {'type': 'string'}
            for index in range(1000)
        }
        big_schema = {
            '$schema': 'http://json-schema.org/draft-07/schema#',
            'title': 'Large Schema',
            'type': 'object',
            'properties': many_properties,
        }
        document = (
            "# Large Schema\n"
            "```json\n"
            + json.dumps(big_schema, indent=2)
            + "\n```\n"
        )
        md_path = temp_schema_dir / 'large.md'
        md_path.write_text(document)

        loaded = MarkdownSchemaLoader().load_schema(md_path)
        assert len(loaded['schema']['properties']) == 1000