Files
markitect-main/tests/test_issue_5_schema_generation.py
tegwick 60f33443ae
Some checks failed
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
feat(schema): add semantic schema generation as default mode
schema-generate now builds content-aware schemas from the document's
section hierarchy instead of counting markdown syntax elements. Detects
key-value tables, data tables, link lists, and mixed content patterns
to produce schemas that reflect the actual document outline.

Old behavior preserved via --mode syntactic. Validator and visualization
tools pinned to syntactic mode for compatibility.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 18:49:50 +01:00

305 lines
9.2 KiB
Python

"""
Test for Issue #5: Generate a Schema from a Markdown File.
Tests the ability to create JSON schemas from markdown file AST structures
with configurable depth limitations for structural analysis.
"""
import json
import pytest
from pathlib import Path
from tempfile import NamedTemporaryFile
from markitect.schema_generator import SchemaGenerator
from markitect.exceptions import FileNotFoundError, InvalidDepthError
class TestIssue5SchemaGeneration:
"""Test suite for schema generation from markdown files."""
def setup_method(self):
    """Build the SchemaGenerator that the test methods reach via ``self.schema_generator``."""
    generator = SchemaGenerator()
    self.schema_generator = generator
def teardown_method(self):
    """Clean up after tests; nothing is held here, so this is a deliberate no-op."""
    return None
def test_generate_schema_from_simple_markdown(self):
    """
    ISSUE #5: Test basic schema generation from simple markdown structure.

    Writes a small markdown document to a temporary ``.md`` file, generates
    a schema for it in ``syntactic`` mode and checks that the result is a
    dict with ``$schema`` and ``type == "object"`` whose ``properties``
    expose ``headings`` (with ``level_1`` and ``level_2``), ``paragraphs``
    and ``lists``.
    """
    # Arrange - fixture text is kept verbatim
    document_text = """# Main Heading
This is a paragraph.
## Sub Heading
- List item 1
- List item 2
Some text here.
"""
    with NamedTemporaryFile(mode="w", suffix=".md", delete=False) as handle:
        handle.write(document_text)
    markdown_path = Path(handle.name)

    try:
        # Act - syntactic mode is the element-counting generator
        schema = self.schema_generator.generate_schema_from_file(
            markdown_path, mode="syntactic"
        )

        # Assert - top-level shape of the schema
        assert isinstance(schema, dict)
        for required_key in ("$schema", "type"):
            assert required_key in schema
        assert schema["type"] == "object"

        # Assert - heading structure is captured
        schema_props = schema.get("properties", {})
        assert "headings" in schema_props
        heading_levels = schema_props["headings"]["properties"]
        for level_key in ("level_1", "level_2"):  # "# Main Heading", "## Sub Heading"
            assert level_key in heading_levels

        # Assert - the other structural elements are captured too
        for element_key in ("paragraphs", "lists"):
            assert element_key in schema_props
    finally:
        # delete=False above means the file has to be removed by hand
        markdown_path.unlink()
def test_generate_schema_with_depth_limitation(self):
    """
    ISSUE #5: Test schema generation with depth limitation.

    Uses a document with heading levels 1 through 4 and generates its
    schema in ``syntactic`` mode with ``max_depth=2``; only ``level_1`` and
    ``level_2`` may then appear under the ``headings`` properties.
    """
    # Arrange - fixture text is kept verbatim
    document_text = """# Level 1
Content here.
## Level 2
More content.
### Level 3
Deep content.
#### Level 4
Very deep content.
"""
    with NamedTemporaryFile(mode="w", suffix=".md", delete=False) as handle:
        handle.write(document_text)
    markdown_path = Path(handle.name)

    try:
        # Act - cap the heading depth at 2
        schema = self.schema_generator.generate_schema_from_file(
            markdown_path, max_depth=2, mode="syntactic"
        )

        # Assert - levels within the limit are present, deeper ones are not
        heading_levels = schema.get("properties", {})["headings"]["properties"]
        for kept_level in ("level_1", "level_2"):
            assert kept_level in heading_levels
        for dropped_level in ("level_3", "level_4"):
            assert dropped_level not in heading_levels
    finally:
        # delete=False above means the file has to be removed by hand
        markdown_path.unlink()
def test_generate_schema_from_complex_document(self):
    """
    ISSUE #5: Test schema generation from complex markdown document.

    The fixture mixes headings (levels 1-4), emphasis, bullet and numbered
    lists, a fenced code block, a blockquote, a link and a table. The
    ``syntactic`` schema must list every major element type under
    ``properties`` and all four heading levels under ``headings``.
    """
    # Arrange - fixture text is kept verbatim
    document_text = """# Documentation
## Overview
This is an **important** document with *emphasis*.
### Features
- Feature 1 with [link](https://example.com)
- Feature 2
- Nested item A
- Nested item B
### Code Examples
```python
def hello():
print("Hello, World!")
```
> This is a blockquote with important information.
## API Reference
| Method | Description |
|--------|-------------|
| GET | Retrieve data |
| POST | Create data |
### Error Handling
1. Check input parameters
2. Validate data types
3. Handle exceptions
#### Implementation Details
Some implementation notes here.
"""
    with NamedTemporaryFile(mode="w", suffix=".md", delete=False) as handle:
        handle.write(document_text)
    markdown_path = Path(handle.name)

    try:
        # Act - syntactic mode is the element-counting generator
        schema = self.schema_generator.generate_schema_from_file(
            markdown_path, mode="syntactic"
        )
        schema_props = schema.get("properties", {})

        # Assert - every major structural element type is represented
        for element in (
            "headings",
            "paragraphs",
            "lists",
            "code_blocks",
            "blockquotes",
            "tables",
        ):
            assert element in schema_props, f"Missing {element} in schema"

        # Assert - the full heading hierarchy (level_1 .. level_4) is captured
        heading_levels = schema_props["headings"]["properties"]
        for depth in range(1, 5):
            assert f"level_{depth}" in heading_levels
    finally:
        # delete=False above means the file has to be removed by hand
        markdown_path.unlink()
def test_generate_schema_file_not_found(self):
    """
    ISSUE #5: Test error handling when markdown file doesn't exist.

    The missing path is built inside a freshly created temporary directory
    rather than hard-coded under ``/tmp``: a fixed ``/tmp`` path is
    POSIX-only and could be created by any other process, which would make
    this test fail for reasons unrelated to the generator.

    ``FileNotFoundError`` here is the name imported from
    ``markitect.exceptions`` at the top of this file.
    """
    # Imported locally so this method does not depend on the file-level imports changing
    from tempfile import TemporaryDirectory

    # Arrange - a name inside a brand-new, empty directory cannot exist yet
    with TemporaryDirectory() as empty_dir:
        non_existent_file = Path(empty_dir) / "non_existent_file.md"

        # Act & Assert - Should raise appropriate exception
        with pytest.raises(FileNotFoundError):
            self.schema_generator.generate_schema_from_file(non_existent_file)
def test_generate_schema_invalid_depth(self):
    """
    ISSUE #5: Test error handling for invalid depth parameters.

    A ``max_depth`` of ``0`` or ``-1`` must each make
    ``generate_schema_from_file`` raise ``InvalidDepthError``, even though
    the markdown file itself exists and is readable.
    """
    # Arrange - a minimal, valid markdown file
    with NamedTemporaryFile(mode="w", suffix=".md", delete=False) as handle:
        handle.write("# Test\n\nContent here.")
    markdown_path = Path(handle.name)

    try:
        # Act & Assert - checked in the same order as before: zero, then negative
        for bad_depth in (0, -1):
            with pytest.raises(InvalidDepthError):
                self.schema_generator.generate_schema_from_file(
                    markdown_path, max_depth=bad_depth
                )
    finally:
        # delete=False above means the file has to be removed by hand
        markdown_path.unlink()
def test_generate_schema_empty_file(self):
    """
    ISSUE #5: Test schema generation from empty markdown file.

    An empty document must still yield a dict carrying ``$schema`` and
    ``type``. A ``headings`` entry is optional, but when one is present it
    must not define any heading levels.
    """
    # Arrange - a zero-length markdown file
    with NamedTemporaryFile(mode="w", suffix=".md", delete=False) as handle:
        handle.write("")
    markdown_path = Path(handle.name)

    try:
        # Act - generate with the default arguments
        schema = self.schema_generator.generate_schema_from_file(markdown_path)

        # Assert - still a minimal, well-formed schema
        assert isinstance(schema, dict)
        for required_key in ("$schema", "type"):
            assert required_key in schema

        # Assert - "headings" may be absent; if present it must be empty
        schema_props = schema.get("properties", {})
        if "headings" in schema_props:
            heading_levels = schema_props["headings"].get("properties", {})
            assert len(heading_levels) == 0  # No headings in empty file
    finally:
        # delete=False above means the file has to be removed by hand
        markdown_path.unlink()
def test_schema_format_compliance(self):
    """
    ISSUE #5: Test that generated schema follows JSON Schema specification.

    Checks the draft-07 ``$schema`` URI, the ``object`` root type and the
    presence of ``properties``, ``title`` and ``description``, then
    confirms that the schema survives a ``json.dumps`` / ``json.loads``
    round trip unchanged.
    """
    # Arrange - fixture text is kept verbatim
    document_text = """# Title
## Section
Content with **formatting**.
- List item
### Subsection
More content.
"""
    with NamedTemporaryFile(mode="w", suffix=".md", delete=False) as handle:
        handle.write(document_text)
    markdown_path = Path(handle.name)

    try:
        # Act - generate with the default arguments
        schema = self.schema_generator.generate_schema_from_file(markdown_path)

        # Assert - JSON Schema envelope
        assert schema.get("$schema") == "http://json-schema.org/draft-07/schema#"
        assert schema.get("type") == "object"
        for required_key in ("properties", "title", "description"):
            assert required_key in schema

        # Assert - serializes to non-empty JSON text
        serialized = json.dumps(schema, indent=2)
        assert len(serialized) > 0

        # Assert - round-trips back to an equal structure
        round_tripped = json.loads(serialized)
        assert round_tripped == schema
    finally:
        # delete=False above means the file has to be removed by hand
        markdown_path.unlink()