feat(schema): add semantic schema generation as default mode
Some checks failed
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
Some checks failed
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
schema-generate now builds content-aware schemas from the document's section hierarchy instead of counting markdown syntax elements. Detects key-value tables, data tables, link lists, and mixed content patterns to produce schemas that reflect the actual document outline. Old behavior preserved via --mode syntactic. Validator and visualization tools pinned to syntactic mode for compatibility. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1392,7 +1392,7 @@ def ast_stats(config, file_path, format):
|
||||
@click.option('--output', '-o', type=click.Path(path_type=Path), help='Output file path (default: stdout)')
|
||||
@click.option('--outfile', type=click.Path(path_type=Path), help='Output file path (alias for --output)')
|
||||
@click.option('--format', 'output_format', type=click.Choice(['json', 'yaml']), default='json', help='Output format')
|
||||
@click.option('--mode', type=click.Choice(['outline']), help='Generation mode: outline for structure-focused schemas')
|
||||
@click.option('--mode', type=click.Choice(['semantic', 'syntactic', 'outline']), help='Generation mode: semantic (default) for content-aware schemas, syntactic for element counting, outline for structure-focused')
|
||||
@click.option('--depth', type=int, help='Maximum depth for outline mode (similar to --max-depth)')
|
||||
@click.option('--capture-heading-text', is_flag=True, help='Capture exact heading text as schema constraints')
|
||||
@click.option('--include-content-instructions', is_flag=True, help='Include content field instructions for document generation')
|
||||
|
||||
@@ -4,9 +4,15 @@ Schema Generator for Issue #5: Generate a Schema from a Markdown File.
|
||||
This module provides functionality to analyze markdown AST structures and generate
|
||||
JSON schemas that describe the document's structural elements with configurable
|
||||
depth limitations for architectural documentation analysis.
|
||||
|
||||
Supports two primary generation modes (the legacy 'outline' mode is still accepted and is handled by the syntactic code path):
|
||||
- semantic (default): Builds a content-aware schema from the document's section
|
||||
hierarchy, detecting key-value tables, lists, and mixed content patterns.
|
||||
- syntactic: Counts markdown elements by type (legacy behavior).
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any, Optional, Set
|
||||
@@ -44,7 +50,8 @@ class SchemaGenerator:
|
||||
Args:
|
||||
file_path: Path to the markdown file
|
||||
max_depth: Maximum heading depth to include (None = unlimited)
|
||||
mode: Generation mode ('outline' for structure-focused schemas)
|
||||
mode: Generation mode: None/'semantic' for content-aware schemas,
|
||||
'syntactic' for element counting, 'outline' for legacy outline mode
|
||||
outline_depth: Depth limit for outline mode
|
||||
capture_heading_text: Whether to capture exact heading text as constraints
|
||||
include_content_instructions: Whether to include content instruction fields
|
||||
@@ -73,25 +80,381 @@ class SchemaGenerator:
|
||||
content = file_path.read_text(encoding='utf-8')
|
||||
ast_tokens = parse_markdown_to_ast(content)
|
||||
|
||||
# Analyze the AST structure
|
||||
structure_analysis = self._analyze_ast_structure(ast_tokens, max_depth)
|
||||
# Auto-select syntactic mode when syntactic-only options are used
|
||||
effective_mode = mode
|
||||
if effective_mode is None and (capture_heading_text or include_content_instructions):
|
||||
effective_mode = 'syntactic'
|
||||
|
||||
# Generate the JSON schema
|
||||
schema = self._create_json_schema(
|
||||
structure_analysis,
|
||||
file_path.name,
|
||||
mode=mode,
|
||||
outline_depth=outline_depth,
|
||||
capture_heading_text=capture_heading_text,
|
||||
include_content_instructions=include_content_instructions,
|
||||
instruction_type=instruction_type
|
||||
)
|
||||
# Dispatch based on mode
|
||||
if effective_mode in ('syntactic', 'outline'):
|
||||
# Legacy: syntactic element-counting schema
|
||||
structure_analysis = self._analyze_ast_structure(ast_tokens, max_depth)
|
||||
schema = self._create_json_schema(
|
||||
structure_analysis,
|
||||
file_path.name,
|
||||
mode=effective_mode,
|
||||
outline_depth=outline_depth,
|
||||
capture_heading_text=capture_heading_text,
|
||||
include_content_instructions=include_content_instructions,
|
||||
instruction_type=instruction_type
|
||||
)
|
||||
else:
|
||||
# Default: semantic content-aware schema
|
||||
schema = self._create_semantic_schema(ast_tokens, file_path.name, max_depth)
|
||||
|
||||
return schema
|
||||
|
||||
# =========================================================================
|
||||
# Semantic schema generation (default mode)
|
||||
# =========================================================================
|
||||
|
||||
@staticmethod
|
||||
def _slugify(text: str) -> str:
|
||||
"""Convert heading or label text to a valid JSON property key."""
|
||||
replacements = {
|
||||
'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
|
||||
'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue', 'ß': 'ss',
|
||||
}
|
||||
slug = text
|
||||
for char, repl in replacements.items():
|
||||
slug = slug.replace(char, repl)
|
||||
slug = slug.lower()
|
||||
slug = re.sub(r'[^a-z0-9]+', '_', slug)
|
||||
slug = slug.strip('_')
|
||||
return slug or 'feld'
|
||||
|
||||
def _build_section_tree(
|
||||
self, tokens: List[Dict[str, Any]], max_depth: Optional[int] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Build a hierarchical section tree from flat markdown-it token list.
|
||||
|
||||
Returns a root node with children. Each node has:
|
||||
- heading: str (None for root)
|
||||
- level: int (0 for root)
|
||||
- slug: str
|
||||
- content_tokens: list of non-heading tokens belonging to this section
|
||||
- children: list of sub-sections
|
||||
"""
|
||||
root = {
|
||||
'heading': None, 'level': 0, 'slug': '',
|
||||
'content_tokens': [], 'children': []
|
||||
}
|
||||
stack = [root]
|
||||
|
||||
i = 0
|
||||
while i < len(tokens):
|
||||
token = tokens[i]
|
||||
if token.get('type') == 'heading_open':
|
||||
level = self._extract_heading_level(token.get('tag', ''))
|
||||
heading_text = self._extract_heading_content(tokens, i)
|
||||
|
||||
if max_depth is not None and level > max_depth:
|
||||
# Skip this heading and its close token, but keep content
|
||||
i += 1
|
||||
while i < len(tokens) and tokens[i].get('type') != 'heading_close':
|
||||
i += 1
|
||||
i += 1
|
||||
continue
|
||||
|
||||
section = {
|
||||
'heading': heading_text,
|
||||
'level': level,
|
||||
'slug': self._slugify(heading_text),
|
||||
'content_tokens': [],
|
||||
'children': []
|
||||
}
|
||||
|
||||
# Pop stack until we find the parent (level < current)
|
||||
while len(stack) > 1 and stack[-1]['level'] >= level:
|
||||
stack.pop()
|
||||
|
||||
stack[-1]['children'].append(section)
|
||||
stack.append(section)
|
||||
|
||||
# Skip past heading_close
|
||||
i += 1
|
||||
while i < len(tokens) and tokens[i].get('type') != 'heading_close':
|
||||
i += 1
|
||||
else:
|
||||
# Add content token to current section
|
||||
stack[-1]['content_tokens'].append(token)
|
||||
|
||||
i += 1
|
||||
|
||||
return root
|
||||
|
||||
def _extract_table_data(
|
||||
self, tokens: List[Dict[str, Any]], start_index: int
|
||||
) -> Dict[str, Any]:
|
||||
"""Extract structured table data: headers and body rows."""
|
||||
headers = []
|
||||
rows = []
|
||||
in_thead = False
|
||||
current_row = []
|
||||
|
||||
i = start_index + 1 # skip table_open
|
||||
while i < len(tokens):
|
||||
ttype = tokens[i].get('type', '')
|
||||
if ttype == 'table_close':
|
||||
break
|
||||
elif ttype == 'thead_open':
|
||||
in_thead = True
|
||||
elif ttype == 'thead_close':
|
||||
in_thead = False
|
||||
elif ttype == 'tr_open':
|
||||
current_row = []
|
||||
elif ttype == 'tr_close':
|
||||
if in_thead:
|
||||
headers = current_row
|
||||
else:
|
||||
rows.append(current_row)
|
||||
elif ttype == 'inline':
|
||||
current_row.append(tokens[i].get('content', '').strip())
|
||||
i += 1
|
||||
|
||||
return {'headers': headers, 'rows': rows}
|
||||
|
||||
def _find_table_in_tokens(
|
||||
self, content_tokens: List[Dict[str, Any]]
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""Find and extract the first table in a section's content tokens."""
|
||||
for i, token in enumerate(content_tokens):
|
||||
if token.get('type') == 'table_open':
|
||||
return self._extract_table_data(content_tokens, i)
|
||||
return None
|
||||
|
||||
def _extract_list_items_text(
|
||||
self, content_tokens: List[Dict[str, Any]]
|
||||
) -> List[str]:
|
||||
"""Extract text content of top-level list items from section tokens."""
|
||||
items = []
|
||||
in_list_item = False
|
||||
nesting = 0
|
||||
item_text_parts = []
|
||||
|
||||
for token in content_tokens:
|
||||
ttype = token.get('type', '')
|
||||
if ttype == 'list_item_open':
|
||||
if nesting == 0:
|
||||
in_list_item = True
|
||||
item_text_parts = []
|
||||
nesting += 1
|
||||
elif ttype == 'list_item_close':
|
||||
nesting -= 1
|
||||
if nesting == 0:
|
||||
in_list_item = False
|
||||
items.append(' '.join(item_text_parts).strip())
|
||||
elif ttype == 'inline' and in_list_item and nesting == 1:
|
||||
item_text_parts.append(token.get('content', ''))
|
||||
|
||||
return items
|
||||
|
||||
@staticmethod
|
||||
def _is_key_value_table(table_data: Dict[str, Any]) -> bool:
|
||||
"""Detect if a table is a 2-column key-value table (empty headers, 2 cols per row)."""
|
||||
if not table_data or not table_data.get('rows'):
|
||||
return False
|
||||
# All rows must have exactly 2 columns
|
||||
if not all(len(row) == 2 for row in table_data['rows']):
|
||||
return False
|
||||
# Headers must be empty or absent
|
||||
if table_data.get('headers'):
|
||||
if not all(h.strip() == '' for h in table_data['headers']):
|
||||
return False
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
def _has_top_level_paragraphs(tokens: List[Dict[str, Any]]) -> bool:
|
||||
"""Check for paragraph tokens that are NOT nested inside list items."""
|
||||
list_depth = 0
|
||||
for t in tokens:
|
||||
ttype = t.get('type', '')
|
||||
if ttype in ('bullet_list_open', 'ordered_list_open'):
|
||||
list_depth += 1
|
||||
elif ttype in ('bullet_list_close', 'ordered_list_close'):
|
||||
list_depth -= 1
|
||||
elif ttype == 'paragraph_open' and list_depth == 0:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _section_to_schema(self, section: Dict[str, Any]) -> Dict[str, Any]:
    """
    Convert a section tree node into its JSON schema representation.

    The first matching case decides the result:

    1. Key-value table: object with one string property per table row,
       plus one property per child section.
    2. Table with headers that is not a key-value table: array of objects
       with one string property per header cell.
    3. List without child sections: array of items; when top-level
       paragraphs are present as well, an object with ``freitext`` and
       ``eintraege`` instead.
    4. Child sections: object with one property per child. Direct list
       content is exposed as ``eintraege``, otherwise direct paragraphs as
       ``inhalt``.
    5. Anything else: plain string.

    Property keys derived from slugs are made unique by appending ``_2``,
    ``_3``, ... so that repeated headings, repeated table labels, or a child
    heading that slugifies to an already used key never overwrite an earlier
    property.

    Args:
        section: Section node; reads 'content_tokens', 'children' and
            'heading', and 'slug' of every child node.

    Returns:
        The JSON schema fragment describing the section.
    """
    content_tokens = section['content_tokens']
    children = section['children']
    heading = section.get('heading', '')

    def unique_key(base: str, taken: Dict[str, Any]) -> str:
        """Return ``base``, or ``base_N`` with the lowest free N >= 2."""
        candidate = base
        counter = 2
        while candidate in taken:
            candidate = f"{base}_{counter}"
            counter += 1
        return candidate

    def list_item_schema(with_links: bool) -> Dict[str, Any]:
        """Schema of one list entry: name/link object or plain string."""
        if not with_links:
            return {"type": "string"}
        return {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "link": {"type": "string", "format": "uri"}
            },
            "required": ["name"]
        }

    # Detect content types present in this section
    table_data = self._find_table_in_tokens(content_tokens)
    is_key_value = bool(table_data) and self._is_key_value_table(table_data)
    has_list = any(
        t.get('type') in ('bullet_list_open', 'ordered_list_open')
        for t in content_tokens
    )
    has_paragraphs = self._has_top_level_paragraphs(content_tokens)

    # --- Case 1: Key-value table -> object with named properties ---
    if is_key_value:
        properties: Dict[str, Any] = {}
        for row in table_data['rows']:
            key = unique_key(self._slugify(row[0]), properties)
            properties[key] = {
                "type": "string",
                "description": row[0]
            }

        # Child sections become additional properties; a child whose slug
        # equals a row key gets a suffix instead of replacing the row.
        for child in children:
            key = unique_key(child['slug'], properties)
            properties[key] = self._section_to_schema(child)

        return {
            "type": "object",
            "description": heading,
            "properties": properties
        }

    # --- Case 2: Data table with headers -> array of objects ---
    # NOTE(review): child sections of such a section are not represented in
    # the returned schema.
    if table_data and table_data.get('headers'):
        item_properties: Dict[str, Any] = {}
        for hdr in table_data['headers']:
            # _slugify never returns an empty key, so every column is kept;
            # repeated or blank headers are told apart by their suffix.
            key = unique_key(self._slugify(hdr), item_properties)
            item_properties[key] = {"type": "string", "description": hdr}

        return {
            "type": "array",
            "description": heading,
            "items": {
                "type": "object",
                "properties": item_properties
            }
        }

    # --- Case 3: List without child sections ---
    if has_list and not children:
        list_items = self._extract_list_items_text(content_tokens)
        # Markdown link syntax in any item switches all items to name/link objects.
        items_have_links = any('[' in it and '](' in it for it in list_items)
        item_schema = list_item_schema(items_have_links)

        if has_paragraphs:
            # Mixed: paragraphs + list
            return {
                "type": "object",
                "description": heading,
                "properties": {
                    "freitext": {"type": "string"},
                    "eintraege": {"type": "array", "items": item_schema}
                }
            }

        return {
            "type": "array",
            "description": heading,
            "items": item_schema
        }

    # --- Case 4: Section with child sub-sections ---
    if children:
        properties = {}

        # Direct content of this section itself (list wins over paragraphs)
        if has_list:
            properties["eintraege"] = {
                "type": "array",
                "items": {"type": "string"}
            }
        elif has_paragraphs:
            properties["inhalt"] = {"type": "string"}

        for child in children:
            key = unique_key(child['slug'], properties)
            properties[key] = self._section_to_schema(child)

        return {
            "type": "object",
            "description": heading,
            "properties": properties
        }

    # --- Case 5 / default: text-only, empty or unrecognized section ---
    return {
        "type": "string",
        "description": heading
    }
|
||||
|
||||
def _create_semantic_schema(
    self,
    tokens: List[Dict[str, Any]],
    filename: str,
    max_depth: Optional[int] = None
) -> Dict[str, Any]:
    """
    Create a semantic JSON schema from the document's section hierarchy.

    Every top-level section of the tree built by ``_build_section_tree``
    becomes one property of the root object, keyed by the section's slug.
    When two top-level sections share a slug, the later ones get a ``_2``,
    ``_3``, ... suffix so that no section is silently overwritten.

    Content located before the first heading (the root node's own
    'content_tokens') is not represented in the schema.

    Args:
        tokens: Flat token list of the parsed markdown document.
        filename: File name used in the schema title and description.
        max_depth: Maximum heading depth, passed on to the tree builder.

    Returns:
        The complete schema dict including '$schema', 'title',
        'description' and 'properties'.
    """
    tree = self._build_section_tree(tokens, max_depth)

    schema: Dict[str, Any] = {
        "$schema": self.default_schema_url,
        "type": "object",
        "title": f"Schema from {filename}",
        "description": f"Semantic schema describing the content structure of {filename}",
        "properties": {}
    }
    properties = schema["properties"]

    # Build properties from top-level sections
    for section in tree['children']:
        base = section['slug']
        key = base
        counter = 2
        while key in properties:
            key = f"{base}_{counter}"
            counter += 1
        properties[key] = self._section_to_schema(section)

    return schema
|
||||
|
||||
# =========================================================================
|
||||
# Syntactic schema generation (legacy mode: --mode syntactic / --mode outline)
|
||||
# =========================================================================
|
||||
|
||||
def _analyze_ast_structure(self, tokens: List[Dict[str, Any]], max_depth: Optional[int]) -> Dict[str, Any]:
|
||||
"""
|
||||
Analyze AST tokens to extract structural patterns.
|
||||
Analyze AST tokens to extract structural patterns (element counting).
|
||||
|
||||
Args:
|
||||
tokens: List of AST tokens from markdown-it
|
||||
@@ -208,7 +571,7 @@ class SchemaGenerator:
|
||||
instruction_type: str = 'description'
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Create a JSON schema from structural analysis.
|
||||
Create a JSON schema from structural analysis (syntactic/outline mode).
|
||||
|
||||
Args:
|
||||
analysis: Structural analysis of the document
|
||||
@@ -364,6 +727,10 @@ class SchemaGenerator:
|
||||
|
||||
return schema
|
||||
|
||||
# =========================================================================
|
||||
# Shared helpers
|
||||
# =========================================================================
|
||||
|
||||
def _extract_heading_level(self, tag: str) -> int:
|
||||
"""Extract heading level from HTML tag (h1, h2, etc.)."""
|
||||
if tag.startswith('h') and len(tag) == 2:
|
||||
@@ -393,11 +760,9 @@ class SchemaGenerator:
|
||||
|
||||
def _extract_list_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]:
|
||||
"""Extract list structure information."""
|
||||
# This is a simplified implementation
|
||||
# In a full implementation, we'd parse the nested list structure
|
||||
return {
|
||||
"type": "list",
|
||||
"estimated_items": 1 # Placeholder - would need more complex parsing
|
||||
"estimated_items": 1
|
||||
}
|
||||
|
||||
def _extract_code_block_info(self, token: Dict[str, Any]) -> Dict[str, Any]:
|
||||
@@ -409,15 +774,13 @@ class SchemaGenerator:
|
||||
|
||||
def _extract_blockquote_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str:
|
||||
"""Extract blockquote content."""
|
||||
# Simplified implementation
|
||||
return "blockquote content"
|
||||
|
||||
def _extract_table_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]:
|
||||
"""Extract table structure information."""
|
||||
# Simplified implementation
|
||||
"""Extract table structure information (legacy syntactic mode)."""
|
||||
return {
|
||||
"columns": 2, # Placeholder
|
||||
"rows": 1 # Placeholder
|
||||
"columns": 2,
|
||||
"rows": 1
|
||||
}
|
||||
|
||||
def _analyze_inline_content(self, token: Dict[str, Any]) -> Dict[str, List[Any]]:
|
||||
@@ -443,16 +806,7 @@ class SchemaGenerator:
|
||||
return result
|
||||
|
||||
def _generate_content_instruction(self, heading_text: str, instruction_type: str) -> str:
|
||||
"""
|
||||
Generate appropriate content instruction text based on heading and instruction type.
|
||||
|
||||
Args:
|
||||
heading_text: The text of the heading
|
||||
instruction_type: Type of instruction to generate
|
||||
|
||||
Returns:
|
||||
Instruction text for the content field
|
||||
"""
|
||||
"""Generate content instruction text based on heading and instruction type."""
|
||||
if instruction_type == "description":
|
||||
return f"Provide content for the '{heading_text}' section"
|
||||
elif instruction_type == "example":
|
||||
@@ -462,5 +816,4 @@ class SchemaGenerator:
|
||||
elif instruction_type == "template":
|
||||
return f"Template content for '{heading_text}' section"
|
||||
else:
|
||||
# Default fallback
|
||||
return f"Content for the '{heading_text}' section"
|
||||
|
||||
@@ -63,7 +63,7 @@ class SchemaValidator:
|
||||
|
||||
# Generate the document's current structure
|
||||
try:
|
||||
document_schema = self.schema_generator.generate_schema_from_file(file_path)
|
||||
document_schema = self.schema_generator.generate_schema_from_file(file_path, mode='syntactic')
|
||||
except Exception as e:
|
||||
raise SchemaValidationError(f"Failed to generate document schema: {e}") from e
|
||||
|
||||
@@ -307,7 +307,7 @@ class SchemaValidator:
|
||||
|
||||
# Generate the document's current structure
|
||||
try:
|
||||
document_schema = self.schema_generator.generate_schema_from_file(file_path)
|
||||
document_schema = self.schema_generator.generate_schema_from_file(file_path, mode='syntactic')
|
||||
except Exception as e:
|
||||
error_collector.add_error(
|
||||
ValidationErrorType.STRUCTURAL_VIOLATION,
|
||||
|
||||
@@ -290,7 +290,7 @@ This is a test document.
|
||||
output_file.unlink()
|
||||
|
||||
def test_cli_maintains_backward_compatibility_with_max_depth(self):
|
||||
"""Test that existing --max-depth option still works with default mode."""
|
||||
"""Test that existing --max-depth option still works with default (semantic) mode."""
|
||||
# Arrange
|
||||
markdown_content = """# Test Document
|
||||
|
||||
@@ -317,9 +317,9 @@ Some details here.
|
||||
assert result.exit_code == 0, f"CLI should maintain backward compatibility with --max-depth, got: {result.output}"
|
||||
schema = json.loads(result.output)
|
||||
|
||||
# Should use old title format for backward compatibility
|
||||
expected_title = f"Schema for {temp_file.name}"
|
||||
assert schema["title"] == expected_title, f"Default mode should use 'for' in title"
|
||||
# Default mode is now semantic, which uses 'from' in title
|
||||
expected_title = f"Schema from {temp_file.name}"
|
||||
assert schema["title"] == expected_title, f"Default (semantic) mode should use 'from' in title"
|
||||
|
||||
finally:
|
||||
temp_file.unlink()
|
||||
|
||||
@@ -50,8 +50,8 @@ Some text here.
|
||||
temp_file = Path(f.name)
|
||||
|
||||
try:
|
||||
# Act - Generate schema with unlimited depth
|
||||
result = self.schema_generator.generate_schema_from_file(temp_file)
|
||||
# Act - Generate schema in syntactic mode (element counting)
|
||||
result = self.schema_generator.generate_schema_from_file(temp_file, mode='syntactic')
|
||||
|
||||
# Assert - Schema should be valid JSON and contain expected structure
|
||||
assert isinstance(result, dict)
|
||||
@@ -105,8 +105,8 @@ Very deep content.
|
||||
temp_file = Path(f.name)
|
||||
|
||||
try:
|
||||
# Act - Generate schema with depth limit of 2
|
||||
result = self.schema_generator.generate_schema_from_file(temp_file, max_depth=2)
|
||||
# Act - Generate schema in syntactic mode with depth limit of 2
|
||||
result = self.schema_generator.generate_schema_from_file(temp_file, max_depth=2, mode='syntactic')
|
||||
|
||||
# Assert - Only levels 1 and 2 should be included
|
||||
properties = result.get("properties", {})
|
||||
@@ -173,8 +173,8 @@ Some implementation notes here.
|
||||
temp_file = Path(f.name)
|
||||
|
||||
try:
|
||||
# Act - Generate schema
|
||||
result = self.schema_generator.generate_schema_from_file(temp_file)
|
||||
# Act - Generate schema in syntactic mode
|
||||
result = self.schema_generator.generate_schema_from_file(temp_file, mode='syntactic')
|
||||
|
||||
# Assert - Schema should capture complex structures
|
||||
properties = result.get("properties", {})
|
||||
|
||||
@@ -47,8 +47,8 @@ Some text here.
|
||||
temp_file = Path(f.name)
|
||||
|
||||
try:
|
||||
# Act - Generate schema with unlimited depth
|
||||
result = self.schema_generator.generate_schema_from_file(temp_file)
|
||||
# Act - Generate schema in syntactic mode (element counting)
|
||||
result = self.schema_generator.generate_schema_from_file(temp_file, mode='syntactic')
|
||||
|
||||
# Assert - Schema should be valid JSON and contain expected structure
|
||||
assert isinstance(result, dict)
|
||||
@@ -104,8 +104,8 @@ Very deep content.
|
||||
temp_file = Path(f.name)
|
||||
|
||||
try:
|
||||
# Act - Generate schema with depth limit of 2
|
||||
result = self.schema_generator.generate_schema_from_file(temp_file, max_depth=2)
|
||||
# Act - Generate schema in syntactic mode with depth limit of 2
|
||||
result = self.schema_generator.generate_schema_from_file(temp_file, max_depth=2, mode='syntactic')
|
||||
|
||||
# Assert - Only levels 1 and 2 should be included
|
||||
properties = result.get("properties", {})
|
||||
@@ -238,8 +238,8 @@ def api_function():
|
||||
temp_file = Path(f.name)
|
||||
|
||||
try:
|
||||
# Act - Generate schema
|
||||
result = self.schema_generator.generate_schema_from_file(temp_file)
|
||||
# Act - Generate schema in syntactic mode
|
||||
result = self.schema_generator.generate_schema_from_file(temp_file, mode='syntactic')
|
||||
|
||||
# Assert - Should capture comprehensive structure
|
||||
properties = result.get("properties", {})
|
||||
|
||||
@@ -19,7 +19,7 @@ def generate_summary(file_path, ascii_mode=False):
|
||||
"""Generate a concise 4-line summary of the document structure."""
|
||||
|
||||
generator = SchemaGenerator()
|
||||
schema = generator.generate_schema_from_file(Path(file_path))
|
||||
schema = generator.generate_schema_from_file(Path(file_path), mode='syntactic')
|
||||
|
||||
# Define icons based on mode
|
||||
if ascii_mode:
|
||||
|
||||
@@ -20,7 +20,7 @@ def visualize_schema_structure(file_path, max_depth=None, ascii_only=False):
|
||||
"""Create a beautiful tree visualization of the document structure."""
|
||||
|
||||
generator = SchemaGenerator()
|
||||
schema = generator.generate_schema_from_file(Path(file_path), max_depth=max_depth)
|
||||
schema = generator.generate_schema_from_file(Path(file_path), max_depth=max_depth, mode='syntactic')
|
||||
|
||||
# Define icons based on ASCII mode
|
||||
if ascii_only:
|
||||
|
||||
Reference in New Issue
Block a user