Files
markitect-main/tests/test_l4_service_schema_generation.py
tegwick 60f33443ae
Some checks failed
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
feat(schema): add semantic schema generation as default mode
schema-generate now builds content-aware schemas from the document's
section hierarchy instead of counting markdown syntax elements. Detects
key-value tables, data tables, link lists, and mixed content patterns
to produce schemas that reflect the actual document outline.

Old behavior preserved via --mode syntactic. Validator and visualization
tools pinned to syntactic mode for compatibility.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 18:49:50 +01:00

269 lines
8.5 KiB
Python

"""
Test for Issue #5: Generate a Schema from a Markdown File.
Tests the schema generation service that creates JSON schemas from markdown
AST structures with configurable depth limitations - critical for arc42
architectural documentation compliance validation.
"""
import json
import pytest
from pathlib import Path
from tempfile import NamedTemporaryFile
from markitect.schema_generator import SchemaGenerator
from markitect.exceptions import FileNotFoundError, InvalidDepthError
class TestIssue5SchemaGeneration:
    """Test suite for schema generation from markdown files (Issue #5).

    Each test that needs a markdown fixture writes it into pytest's per-test
    ``tmp_path`` directory, so fixture files never leak between tests and no
    manual cleanup is required.
    """

    def setup_method(self) -> None:
        """Create a fresh ``SchemaGenerator`` before every test."""
        self.schema_generator = SchemaGenerator()

    @staticmethod
    def _write_markdown(directory: Path, content: str) -> Path:
        """Write *content* to a ``.md`` file inside *directory*.

        Args:
            directory: Existing directory that will hold the fixture file.
            content: Markdown text to store.

        Returns:
            Path of the file that was written.
        """
        markdown_file = directory / "document.md"
        # Explicit encoding keeps the fixture independent of the platform's
        # default locale encoding.
        markdown_file.write_text(content, encoding="utf-8")
        return markdown_file

    def test_generate_schema_from_simple_markdown_creates_valid_json_schema(self, tmp_path: Path) -> None:
        """
        ISSUE #5: Test basic schema generation from simple markdown structure.

        Verifies that a simple markdown file, processed in syntactic mode,
        yields a draft-07 object schema exposing heading levels, paragraphs,
        lists and metadata.
        """
        # Arrange - Simple markdown with clear structure
        markdown_content = """# Main Heading
This is a paragraph.
## Sub Heading
- List item 1
- List item 2
Some text here.
"""
        source_file = self._write_markdown(tmp_path, markdown_content)

        # Act - Generate schema in syntactic mode (element counting)
        result = self.schema_generator.generate_schema_from_file(source_file, mode='syntactic')

        # Assert - Schema should be a JSON-schema style dict
        assert isinstance(result, dict)
        assert "$schema" in result
        assert result["$schema"] == "http://json-schema.org/draft-07/schema#"
        assert "type" in result
        assert result["type"] == "object"

        # Should capture heading structure
        properties = result.get("properties", {})
        assert "headings" in properties

        # Should define heading levels found in the document
        heading_properties = properties["headings"]["properties"]
        assert "level_1" in heading_properties  # "# Main Heading"
        assert "level_2" in heading_properties  # "## Sub Heading"

        # Should capture other structural elements
        assert "paragraphs" in properties
        assert "lists" in properties
        assert "metadata" in properties

    def test_generate_schema_with_depth_limitation_excludes_deep_headings(self, tmp_path: Path) -> None:
        """
        ISSUE #5: Test schema generation with depth limitation.

        Verifies that ``max_depth=2`` in syntactic mode keeps heading levels
        1 and 2 in the schema and leaves levels 3 and 4 out.
        """
        # Arrange - Markdown with four heading levels
        markdown_content = """# Level 1
Content here.
## Level 2
More content.
### Level 3
Deep content.
#### Level 4
Very deep content.
"""
        source_file = self._write_markdown(tmp_path, markdown_content)

        # Act - Generate schema in syntactic mode with depth limit of 2
        result = self.schema_generator.generate_schema_from_file(
            source_file, max_depth=2, mode='syntactic'
        )

        # Assert - Only levels 1 and 2 should be included
        properties = result.get("properties", {})
        heading_properties = properties["headings"]["properties"]
        assert "level_1" in heading_properties
        assert "level_2" in heading_properties
        assert "level_3" not in heading_properties  # Beyond the depth limit
        assert "level_4" not in heading_properties  # Beyond the depth limit

    def test_generate_schema_handles_file_not_found_error(self, tmp_path: Path) -> None:
        """
        ISSUE #5: Test error handling when markdown file doesn't exist.
        """
        # Arrange - A path inside the fresh per-test directory is guaranteed
        # not to exist, unlike a fixed location such as /tmp/<name>.
        non_existent_file = tmp_path / "non_existent_file.md"

        # Act & Assert - NOTE: ``FileNotFoundError`` here is the name imported
        # from ``markitect.exceptions`` at the top of the file, which shadows
        # the builtin of the same name.
        with pytest.raises(FileNotFoundError):
            self.schema_generator.generate_schema_from_file(non_existent_file)

    def test_generate_schema_handles_invalid_depth_parameters(self, tmp_path: Path) -> None:
        """
        ISSUE #5: Test error handling for invalid depth parameters.

        A ``max_depth`` of zero or a negative value must raise
        ``InvalidDepthError``.
        """
        # Arrange - Simple markdown file
        source_file = self._write_markdown(tmp_path, "# Test\n\nContent here.")

        # Act & Assert - Each invalid depth value should raise
        for invalid_depth in (0, -1):
            with pytest.raises(InvalidDepthError):
                self.schema_generator.generate_schema_from_file(
                    source_file, max_depth=invalid_depth
                )

    def test_generated_schema_is_json_serializable_and_valid(self, tmp_path: Path) -> None:
        """
        ISSUE #5: Test that generated schema follows JSON Schema conventions.

        No ``mode`` is passed, so this exercises the generator's default mode.
        The result must carry the draft-07 top-level keys and survive a JSON
        round trip unchanged.
        """
        # Arrange - Standard markdown structure
        markdown_content = """# Title
## Section
Content with **formatting**.
- List item
### Subsection
More content.
"""
        source_file = self._write_markdown(tmp_path, markdown_content)

        # Act - Generate schema
        result = self.schema_generator.generate_schema_from_file(source_file)

        # Assert - Should be valid JSON Schema format
        assert result.get("$schema") == "http://json-schema.org/draft-07/schema#"
        assert result.get("type") == "object"
        assert "properties" in result
        assert "title" in result
        assert "description" in result

        # Should be serializable as JSON
        json_string = json.dumps(result, indent=2)
        assert json_string

        # Should be deserializable back to same structure
        deserialized = json.loads(json_string)
        assert deserialized == result

    def test_schema_generation_captures_structural_metadata(self, tmp_path: Path) -> None:
        """
        ISSUE #5: Test that schema includes comprehensive structural metadata.

        Uses a document mixing headings, lists, a fenced code block, a
        blockquote and a table, and checks (in syntactic mode) that metadata,
        three heading levels, paragraphs and lists are present in the schema.
        """
        # Arrange - Complex document structure
        markdown_content = """# Documentation
## Overview
This document describes the **architecture**.
### Components
- Component A
- Component B
- Sub-component B1
## API
```python
def api_function():
pass
```
> Important architectural decision.
| Service | Purpose |
|---------|---------|
| Auth | Authentication |
"""
        source_file = self._write_markdown(tmp_path, markdown_content)

        # Act - Generate schema in syntactic mode
        result = self.schema_generator.generate_schema_from_file(source_file, mode='syntactic')

        # Assert - Should capture comprehensive structure
        properties = result.get("properties", {})

        # Should have metadata about the document structure
        assert "metadata" in properties
        metadata_props = properties["metadata"]["properties"]
        assert "total_elements" in metadata_props
        assert "structure_types" in metadata_props

        # Should capture heading hierarchy
        assert "headings" in properties
        heading_props = properties["headings"]["properties"]
        assert "level_1" in heading_props
        assert "level_2" in heading_props
        assert "level_3" in heading_props

        # Should identify structural elements present in document.
        # Code blocks, blockquotes and tables may vary in parsing, so only
        # paragraphs and lists are required here.
        expected_elements = ["paragraphs", "lists"]
        for element in expected_elements:
            assert element in properties