""" Test for Issue #5: Generate a Schema from a Markdown File. Tests the ability to create JSON schemas from markdown file AST structures with configurable depth limitations for structural analysis. """ import json import pytest from pathlib import Path from tempfile import NamedTemporaryFile from markitect.schema_generator import SchemaGenerator from markitect.exceptions import FileNotFoundError, InvalidDepthError class TestIssue5SchemaGeneration: """Test suite for schema generation from markdown files.""" def setup_method(self): """Set up test environment.""" self.schema_generator = SchemaGenerator() def teardown_method(self): """Clean up after tests.""" pass def test_generate_schema_from_simple_markdown(self): """ ISSUE #5: Test basic schema generation from simple markdown structure. Verifies that a simple markdown file generates a valid JSON schema that captures heading structure and basic elements. """ # Arrange - Simple markdown with clear structure markdown_content = """# Main Heading This is a paragraph. ## Sub Heading - List item 1 - List item 2 Some text here. """ with NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f: f.write(markdown_content) temp_file = Path(f.name) try: # Act - Generate schema in syntactic mode (element counting) result = self.schema_generator.generate_schema_from_file(temp_file, mode='syntactic') # Assert - Schema should be valid JSON and contain expected structure assert isinstance(result, dict) assert "$schema" in result assert "type" in result assert result["type"] == "object" # Should capture heading structure properties = result.get("properties", {}) assert "headings" in properties # Should define heading levels found in the document heading_properties = properties["headings"]["properties"] assert "level_1" in heading_properties # # Main Heading assert "level_2" in heading_properties # ## Sub Heading # Should capture other structural elements assert "paragraphs" in properties assert "lists" in properties finally: temp_file.unlink() def test_generate_schema_with_depth_limitation(self): """ ISSUE #5: Test schema generation with depth limitation. Verifies that depth parameter correctly limits which heading levels are included in the generated schema. """ # Arrange - Markdown with multiple heading levels markdown_content = """# Level 1 Content here. ## Level 2 More content. ### Level 3 Deep content. #### Level 4 Very deep content. """ with NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f: f.write(markdown_content) temp_file = Path(f.name) try: # Act - Generate schema in syntactic mode with depth limit of 2 result = self.schema_generator.generate_schema_from_file(temp_file, max_depth=2, mode='syntactic') # Assert - Only levels 1 and 2 should be included properties = result.get("properties", {}) heading_properties = properties["headings"]["properties"] assert "level_1" in heading_properties assert "level_2" in heading_properties assert "level_3" not in heading_properties # Should be excluded assert "level_4" not in heading_properties # Should be excluded finally: temp_file.unlink() def test_generate_schema_from_complex_document(self): """ ISSUE #5: Test schema generation from complex markdown document. Verifies handling of complex markdown structures including code blocks, blockquotes, links, and nested lists. """ # Arrange - Complex markdown with various elements markdown_content = """# Documentation ## Overview This is an **important** document with *emphasis*. ### Features - Feature 1 with [link](https://example.com) - Feature 2 - Nested item A - Nested item B ### Code Examples ```python def hello(): print("Hello, World!") ``` > This is a blockquote with important information. ## API Reference | Method | Description | |--------|-------------| | GET | Retrieve data | | POST | Create data | ### Error Handling 1. Check input parameters 2. Validate data types 3. Handle exceptions #### Implementation Details Some implementation notes here. """ with NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f: f.write(markdown_content) temp_file = Path(f.name) try: # Act - Generate schema in syntactic mode result = self.schema_generator.generate_schema_from_file(temp_file, mode='syntactic') # Assert - Schema should capture complex structures properties = result.get("properties", {}) # Should have all major structural elements expected_elements = ["headings", "paragraphs", "lists", "code_blocks", "blockquotes", "tables"] for element in expected_elements: assert element in properties, f"Missing {element} in schema" # Should capture heading hierarchy heading_properties = properties["headings"]["properties"] assert "level_1" in heading_properties assert "level_2" in heading_properties assert "level_3" in heading_properties assert "level_4" in heading_properties finally: temp_file.unlink() def test_generate_schema_file_not_found(self): """ ISSUE #5: Test error handling when markdown file doesn't exist. """ # Arrange - Non-existent file path non_existent_file = Path("/tmp/non_existent_file.md") # Act & Assert - Should raise appropriate exception with pytest.raises(FileNotFoundError): self.schema_generator.generate_schema_from_file(non_existent_file) def test_generate_schema_invalid_depth(self): """ ISSUE #5: Test error handling for invalid depth parameters. """ # Arrange - Simple markdown file markdown_content = "# Test\n\nContent here." with NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f: f.write(markdown_content) temp_file = Path(f.name) try: # Act & Assert - Invalid depth values should raise exceptions with pytest.raises(InvalidDepthError): self.schema_generator.generate_schema_from_file(temp_file, max_depth=0) with pytest.raises(InvalidDepthError): self.schema_generator.generate_schema_from_file(temp_file, max_depth=-1) finally: temp_file.unlink() def test_generate_schema_empty_file(self): """ ISSUE #5: Test schema generation from empty markdown file. """ # Arrange - Empty markdown file with NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f: f.write("") temp_file = Path(f.name) try: # Act - Generate schema from empty file result = self.schema_generator.generate_schema_from_file(temp_file) # Assert - Should generate valid but minimal schema assert isinstance(result, dict) assert "$schema" in result assert "type" in result # Should have empty or minimal structure properties = result.get("properties", {}) if "headings" in properties: heading_properties = properties["headings"].get("properties", {}) assert len(heading_properties) == 0 # No headings in empty file finally: temp_file.unlink() def test_schema_format_compliance(self): """ ISSUE #5: Test that generated schema follows JSON Schema specification. Verifies the output is a valid JSON Schema that could be used for validation by standard JSON Schema validators. """ # Arrange - Standard markdown structure markdown_content = """# Title ## Section Content with **formatting**. - List item ### Subsection More content. """ with NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f: f.write(markdown_content) temp_file = Path(f.name) try: # Act - Generate schema result = self.schema_generator.generate_schema_from_file(temp_file) # Assert - Should be valid JSON Schema format assert result.get("$schema") == "http://json-schema.org/draft-07/schema#" assert result.get("type") == "object" assert "properties" in result assert "title" in result assert "description" in result # Should be serializable as JSON json_string = json.dumps(result, indent=2) assert len(json_string) > 0 # Should be deserializable back to same structure deserialized = json.loads(json_string) assert deserialized == result finally: temp_file.unlink()