feat(schema): add semantic schema generation as default mode
Some checks failed
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled

schema-generate now builds content-aware schemas from the document's
section hierarchy instead of counting markdown syntax elements. Detects
key-value tables, data tables, link lists, and mixed content patterns
to produce schemas that reflect the actual document outline.

Old behavior preserved via --mode syntactic. Validator and visualization
tools pinned to syntactic mode for compatibility.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-16 18:49:50 +01:00
parent 120ed89780
commit 60f33443ae
8 changed files with 408 additions and 55 deletions

View File

@@ -1392,7 +1392,7 @@ def ast_stats(config, file_path, format):
@click.option('--output', '-o', type=click.Path(path_type=Path), help='Output file path (default: stdout)')
@click.option('--outfile', type=click.Path(path_type=Path), help='Output file path (alias for --output)')
@click.option('--format', 'output_format', type=click.Choice(['json', 'yaml']), default='json', help='Output format')
@click.option('--mode', type=click.Choice(['outline']), help='Generation mode: outline for structure-focused schemas')
@click.option('--mode', type=click.Choice(['semantic', 'syntactic', 'outline']), help='Generation mode: semantic (default) for content-aware schemas, syntactic for element counting, outline for structure-focused')
@click.option('--depth', type=int, help='Maximum depth for outline mode (similar to --max-depth)')
@click.option('--capture-heading-text', is_flag=True, help='Capture exact heading text as schema constraints')
@click.option('--include-content-instructions', is_flag=True, help='Include content field instructions for document generation')

View File

@@ -4,9 +4,15 @@ Schema Generator for Issue #5: Generate a Schema from a Markdown File.
This module provides functionality to analyze markdown AST structures and generate
JSON schemas that describe the document's structural elements with configurable
depth limitations for architectural documentation analysis.
Supports three generation modes:
- semantic (default): Builds a content-aware schema from the document's section
hierarchy, detecting key-value tables, lists, and mixed content patterns.
- syntactic: Counts markdown elements by type (legacy behavior).
- outline: Structure-focused schema (legacy outline mode).
"""
import json
import re
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Any, Optional, Set
@@ -44,7 +50,8 @@ class SchemaGenerator:
Args:
file_path: Path to the markdown file
max_depth: Maximum heading depth to include (None = unlimited)
mode: Generation mode ('outline' for structure-focused schemas)
mode: Generation mode: None/'semantic' for content-aware schemas,
'syntactic' for element counting, 'outline' for legacy outline mode
outline_depth: Depth limit for outline mode
capture_heading_text: Whether to capture exact heading text as constraints
include_content_instructions: Whether to include content instruction fields
@@ -73,25 +80,381 @@ class SchemaGenerator:
content = file_path.read_text(encoding='utf-8')
ast_tokens = parse_markdown_to_ast(content)
# Analyze the AST structure
structure_analysis = self._analyze_ast_structure(ast_tokens, max_depth)
# Auto-select syntactic mode when syntactic-only options are used
effective_mode = mode
if effective_mode is None and (capture_heading_text or include_content_instructions):
effective_mode = 'syntactic'
# Generate the JSON schema
schema = self._create_json_schema(
structure_analysis,
file_path.name,
mode=mode,
outline_depth=outline_depth,
capture_heading_text=capture_heading_text,
include_content_instructions=include_content_instructions,
instruction_type=instruction_type
)
# Dispatch based on mode
if effective_mode in ('syntactic', 'outline'):
# Legacy: syntactic element-counting schema
structure_analysis = self._analyze_ast_structure(ast_tokens, max_depth)
schema = self._create_json_schema(
structure_analysis,
file_path.name,
mode=effective_mode,
outline_depth=outline_depth,
capture_heading_text=capture_heading_text,
include_content_instructions=include_content_instructions,
instruction_type=instruction_type
)
else:
# Default: semantic content-aware schema
schema = self._create_semantic_schema(ast_tokens, file_path.name, max_depth)
return schema
# =========================================================================
# Semantic schema generation (default mode)
# =========================================================================
@staticmethod
def _slugify(text: str) -> str:
"""Convert heading or label text to a valid JSON property key."""
replacements = {
'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue', 'ß': 'ss',
}
slug = text
for char, repl in replacements.items():
slug = slug.replace(char, repl)
slug = slug.lower()
slug = re.sub(r'[^a-z0-9]+', '_', slug)
slug = slug.strip('_')
return slug or 'feld'
def _build_section_tree(
    self, tokens: List[Dict[str, Any]], max_depth: Optional[int] = None
) -> Dict[str, Any]:
    """
    Turn the flat markdown-it token list into a nested tree of sections.

    The returned root node and every section node share the same layout:
        - heading: heading text (None for the root)
        - level: heading level (0 for the root)
        - slug: slugified heading text ('' for the root)
        - content_tokens: non-heading tokens that belong to the section
        - children: nested sub-sections

    Headings deeper than max_depth do not open a section of their own; the
    tokens that follow them are attached to the section that is currently open.
    """
    root = {
        'heading': None, 'level': 0, 'slug': '',
        'content_tokens': [], 'children': []
    }
    open_sections = [root]
    total = len(tokens)
    pos = 0

    while pos < total:
        current = tokens[pos]

        if current.get('type') != 'heading_open':
            # Ordinary content belongs to the innermost open section
            open_sections[-1]['content_tokens'].append(current)
            pos += 1
            continue

        level = self._extract_heading_level(current.get('tag', ''))
        heading_text = self._extract_heading_content(tokens, pos)

        # Move past the heading's inline tokens and its heading_close
        pos += 1
        while pos < total and tokens[pos].get('type') != 'heading_close':
            pos += 1
        pos += 1

        if max_depth is not None and level > max_depth:
            # Too deep: drop the heading itself, keep its content
            continue

        node = {
            'heading': heading_text,
            'level': level,
            'slug': self._slugify(heading_text),
            'content_tokens': [],
            'children': []
        }

        # Close every open section that is not a strict ancestor of this one
        while len(open_sections) > 1 and open_sections[-1]['level'] >= level:
            open_sections.pop()

        open_sections[-1]['children'].append(node)
        open_sections.append(node)

    return root
def _extract_table_data(
    self, tokens: List[Dict[str, Any]], start_index: int
) -> Dict[str, Any]:
    """
    Read the header cells and body rows of one table.

    Args:
        tokens: Token list containing the table
        start_index: Index of the table's table_open token

    Returns:
        Dict with 'headers' (list of cell texts) and 'rows' (list of lists of
        cell texts); every cell text is stripped of surrounding whitespace
    """
    header_cells = []
    body_rows = []
    inside_head = False
    cells = []

    # Start right after table_open and stop at the matching table_close
    for idx in range(start_index + 1, len(tokens)):
        tok = tokens[idx]
        kind = tok.get('type', '')

        if kind == 'table_close':
            break

        if kind == 'thead_open':
            inside_head = True
        elif kind == 'thead_close':
            inside_head = False
        elif kind == 'tr_open':
            cells = []
        elif kind == 'tr_close':
            # A finished row is either the header row or a body row
            if inside_head:
                header_cells = cells
            else:
                body_rows.append(cells)
        elif kind == 'inline':
            cells.append(tok.get('content', '').strip())

    return {'headers': header_cells, 'rows': body_rows}
def _find_table_in_tokens(
    self, content_tokens: List[Dict[str, Any]]
) -> Optional[Dict[str, Any]]:
    """
    Extract the first table found among a section's content tokens.

    Returns:
        The headers/rows dict produced by _extract_table_data for the first
        table_open token, or None when the section contains no table
    """
    first_table = next(
        (
            idx for idx, tok in enumerate(content_tokens)
            if tok.get('type') == 'table_open'
        ),
        None,
    )
    if first_table is None:
        return None
    return self._extract_table_data(content_tokens, first_table)
def _extract_list_items_text(
    self, content_tokens: List[Dict[str, Any]]
) -> List[str]:
    """
    Collect the text of every top-level list item in a section.

    Inline tokens that sit inside nested list items are ignored; the inline
    texts of one top-level item are joined with a single space and stripped.
    """
    collected = []
    depth = 0            # how many list_item_open tokens are currently open
    inside_item = False
    parts = []

    for tok in content_tokens:
        kind = tok.get('type', '')

        if kind == 'list_item_open':
            if depth == 0:
                # A new top-level item starts with an empty text buffer
                inside_item = True
                parts = []
            depth += 1
            continue

        if kind == 'list_item_close':
            depth -= 1
            if depth == 0:
                inside_item = False
                collected.append(' '.join(parts).strip())
            continue

        if kind == 'inline' and inside_item and depth == 1:
            parts.append(tok.get('content', ''))

    return collected
@staticmethod
def _is_key_value_table(table_data: Dict[str, Any]) -> bool:
"""Detect if a table is a 2-column key-value table (empty headers, 2 cols per row)."""
if not table_data or not table_data.get('rows'):
return False
# All rows must have exactly 2 columns
if not all(len(row) == 2 for row in table_data['rows']):
return False
# Headers must be empty or absent
if table_data.get('headers'):
if not all(h.strip() == '' for h in table_data['headers']):
return False
return True
@staticmethod
def _has_top_level_paragraphs(tokens: List[Dict[str, Any]]) -> bool:
"""Check for paragraph tokens that are NOT nested inside list items."""
list_depth = 0
for t in tokens:
ttype = t.get('type', '')
if ttype in ('bullet_list_open', 'ordered_list_open'):
list_depth += 1
elif ttype in ('bullet_list_close', 'ordered_list_close'):
list_depth -= 1
elif ttype == 'paragraph_open' and list_depth == 0:
return True
return False
def _section_to_schema(self, section: Dict[str, Any]) -> Dict[str, Any]:
    """
    Convert a section tree node into its JSON schema representation.

    The first matching rule decides the shape of the result:
        1. Key-value table: object with one string property per table row,
           plus one property per child section.
        2. Data table with headers: array of objects with one string property
           per column. If the section also has child sections, the result is an
           object holding the table under 'eintraege' next to the children, so
           the sub-sections are not lost.
        3. List without child sections: array of strings or of name/link
           objects; when paragraphs are present too, an object with
           'freitext' and 'eintraege'.
        4. Child sections: object with one property per child, preceded by
           'eintraege' (direct list) or 'inhalt' (direct paragraphs).
        5. Anything else: plain string.

    Property names never overwrite each other: a key that is already taken
    gets a numeric suffix (_2, _3, ...).

    Args:
        section: Node with 'heading', 'slug', 'content_tokens' and 'children'

    Returns:
        JSON schema fragment describing the section
    """
    content_tokens = section['content_tokens']
    children = section['children']
    heading = section.get('heading', '')

    def unique_key(base: str, taken: Set[str]) -> str:
        """Reserve base, or base_2, base_3, ... when base is already taken."""
        key = base
        counter = 2
        while key in taken:
            key = f"{base}_{counter}"
            counter += 1
        taken.add(key)
        return key

    def add_children(properties: Dict[str, Any], taken: Set[str]) -> None:
        """Add every child section under a key that is still free."""
        for child in children:
            properties[unique_key(child['slug'], taken)] = self._section_to_schema(child)

    def link_item_schema() -> Dict[str, Any]:
        """Schema of a list entry written as a markdown link."""
        return {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "link": {"type": "string", "format": "uri"}
            },
            "required": ["name"]
        }

    # Detect content types present in this section
    table_data = self._find_table_in_tokens(content_tokens)
    has_list = any(
        t.get('type') in ('bullet_list_open', 'ordered_list_open')
        for t in content_tokens
    )
    has_paragraphs = self._has_top_level_paragraphs(content_tokens)

    # --- Case 1: Key-value table -> object with named properties ---
    if table_data and self._is_key_value_table(table_data):
        properties: Dict[str, Any] = {}
        taken: Set[str] = set()
        for row in table_data['rows']:
            properties[unique_key(self._slugify(row[0]), taken)] = {
                "type": "string",
                "description": row[0]
            }
        # Children share the namespace of the rows, so they must not collide
        add_children(properties, taken)
        return {
            "type": "object",
            "description": heading,
            "properties": properties
        }

    # --- Case 2: Data table with headers -> array of objects ---
    # (key-value tables were already handled above)
    if table_data and table_data.get('headers'):
        item_properties: Dict[str, Any] = {}
        taken = set()
        for hdr in table_data['headers']:
            # Repeated or blank column titles would otherwise overwrite each other
            item_properties[unique_key(self._slugify(hdr), taken)] = {
                "type": "string",
                "description": hdr
            }
        row_schema = {"type": "object", "properties": item_properties}

        if not children:
            return {
                "type": "array",
                "description": heading,
                "items": row_schema
            }

        # Keep the sub-sections instead of silently dropping them
        properties = {"eintraege": {"type": "array", "items": row_schema}}
        taken = {"eintraege"}
        add_children(properties, taken)
        return {
            "type": "object",
            "description": heading,
            "properties": properties
        }

    # --- Case 3: List without child sections ---
    if has_list and not children:
        list_items = self._extract_list_items_text(content_tokens)
        # An item that contains "[text](" is treated as a markdown link
        items_have_links = any('[' in it and '](' in it for it in list_items)
        item_schema: Dict[str, Any] = (
            link_item_schema() if items_have_links else {"type": "string"}
        )

        if has_paragraphs:
            # Mixed: free text plus list entries
            return {
                "type": "object",
                "description": heading,
                "properties": {
                    "freitext": {"type": "string"},
                    "eintraege": {"type": "array", "items": item_schema}
                }
            }

        return {
            "type": "array",
            "description": heading,
            "items": item_schema
        }

    # --- Case 4: Section with child sub-sections ---
    if children:
        properties = {}
        taken = set()
        # Direct content before the first child
        if has_list:
            properties[unique_key("eintraege", taken)] = {
                "type": "array",
                "items": {"type": "string"}
            }
        elif has_paragraphs:
            properties[unique_key("inhalt", taken)] = {"type": "string"}
        add_children(properties, taken)
        return {
            "type": "object",
            "description": heading,
            "properties": properties
        }

    # --- Case 5: Text-only, empty or unrecognized section ---
    return {
        "type": "string",
        "description": heading
    }
def _create_semantic_schema(
    self,
    tokens: List[Dict[str, Any]],
    filename: str,
    max_depth: Optional[int] = None
) -> Dict[str, Any]:
    """
    Create a semantic JSON schema from the document's section hierarchy.

    Every top-level section becomes one property of the root object, keyed by
    the section's slug. Sections whose headings produce the same slug are kept
    apart with a numeric suffix (_2, _3, ...) instead of overwriting each other.

    Args:
        tokens: List of AST tokens from markdown-it
        filename: File name used in the schema title and description
        max_depth: Maximum heading depth to include (None = unlimited)

    Returns:
        Root JSON schema object for the document
    """
    tree = self._build_section_tree(tokens, max_depth)

    # Build properties from top-level sections
    properties: Dict[str, Any] = {}
    for section in tree['children']:
        base = section['slug']
        key = base
        counter = 2
        while key in properties:
            key = f"{base}_{counter}"
            counter += 1
        properties[key] = self._section_to_schema(section)

    return {
        "$schema": self.default_schema_url,
        "type": "object",
        "title": f"Schema from {filename}",
        "description": f"Semantic schema describing the content structure of {filename}",
        "properties": properties
    }
# =========================================================================
# Syntactic schema generation (legacy mode: --mode syntactic / --mode outline)
# =========================================================================
def _analyze_ast_structure(self, tokens: List[Dict[str, Any]], max_depth: Optional[int]) -> Dict[str, Any]:
"""
Analyze AST tokens to extract structural patterns.
Analyze AST tokens to extract structural patterns (element counting).
Args:
tokens: List of AST tokens from markdown-it
@@ -208,7 +571,7 @@ class SchemaGenerator:
instruction_type: str = 'description'
) -> Dict[str, Any]:
"""
Create a JSON schema from structural analysis.
Create a JSON schema from structural analysis (syntactic/outline mode).
Args:
analysis: Structural analysis of the document
@@ -364,6 +727,10 @@ class SchemaGenerator:
return schema
# =========================================================================
# Shared helpers
# =========================================================================
def _extract_heading_level(self, tag: str) -> int:
"""Extract heading level from HTML tag (h1, h2, etc.)."""
if tag.startswith('h') and len(tag) == 2:
@@ -393,11 +760,9 @@ class SchemaGenerator:
def _extract_list_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]:
"""Extract list structure information."""
# This is a simplified implementation
# In a full implementation, we'd parse the nested list structure
return {
"type": "list",
"estimated_items": 1 # Placeholder - would need more complex parsing
"estimated_items": 1
}
def _extract_code_block_info(self, token: Dict[str, Any]) -> Dict[str, Any]:
@@ -409,15 +774,13 @@ class SchemaGenerator:
def _extract_blockquote_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str:
"""Extract blockquote content."""
# Simplified implementation
return "blockquote content"
def _extract_table_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]:
"""Extract table structure information."""
# Simplified implementation
"""Extract table structure information (legacy syntactic mode)."""
return {
"columns": 2, # Placeholder
"rows": 1 # Placeholder
"columns": 2,
"rows": 1
}
def _analyze_inline_content(self, token: Dict[str, Any]) -> Dict[str, List[Any]]:
@@ -443,16 +806,7 @@ class SchemaGenerator:
return result
def _generate_content_instruction(self, heading_text: str, instruction_type: str) -> str:
"""
Generate appropriate content instruction text based on heading and instruction type.
Args:
heading_text: The text of the heading
instruction_type: Type of instruction to generate
Returns:
Instruction text for the content field
"""
"""Generate content instruction text based on heading and instruction type."""
if instruction_type == "description":
return f"Provide content for the '{heading_text}' section"
elif instruction_type == "example":
@@ -462,5 +816,4 @@ class SchemaGenerator:
elif instruction_type == "template":
return f"Template content for '{heading_text}' section"
else:
# Default fallback
return f"Content for the '{heading_text}' section"

View File

@@ -63,7 +63,7 @@ class SchemaValidator:
# Generate the document's current structure
try:
document_schema = self.schema_generator.generate_schema_from_file(file_path)
document_schema = self.schema_generator.generate_schema_from_file(file_path, mode='syntactic')
except Exception as e:
raise SchemaValidationError(f"Failed to generate document schema: {e}") from e
@@ -307,7 +307,7 @@ class SchemaValidator:
# Generate the document's current structure
try:
document_schema = self.schema_generator.generate_schema_from_file(file_path)
document_schema = self.schema_generator.generate_schema_from_file(file_path, mode='syntactic')
except Exception as e:
error_collector.add_error(
ValidationErrorType.STRUCTURAL_VIOLATION,

View File

@@ -290,7 +290,7 @@ This is a test document.
output_file.unlink()
def test_cli_maintains_backward_compatibility_with_max_depth(self):
"""Test that existing --max-depth option still works with default mode."""
"""Test that existing --max-depth option still works with default (semantic) mode."""
# Arrange
markdown_content = """# Test Document
@@ -317,9 +317,9 @@ Some details here.
assert result.exit_code == 0, f"CLI should maintain backward compatibility with --max-depth, got: {result.output}"
schema = json.loads(result.output)
# Should use old title format for backward compatibility
expected_title = f"Schema for {temp_file.name}"
assert schema["title"] == expected_title, f"Default mode should use 'for' in title"
# Default mode is now semantic, which uses 'from' in title
expected_title = f"Schema from {temp_file.name}"
assert schema["title"] == expected_title, f"Default (semantic) mode should use 'from' in title"
finally:
temp_file.unlink()

View File

@@ -50,8 +50,8 @@ Some text here.
temp_file = Path(f.name)
try:
# Act - Generate schema with unlimited depth
result = self.schema_generator.generate_schema_from_file(temp_file)
# Act - Generate schema in syntactic mode (element counting)
result = self.schema_generator.generate_schema_from_file(temp_file, mode='syntactic')
# Assert - Schema should be valid JSON and contain expected structure
assert isinstance(result, dict)
@@ -105,8 +105,8 @@ Very deep content.
temp_file = Path(f.name)
try:
# Act - Generate schema with depth limit of 2
result = self.schema_generator.generate_schema_from_file(temp_file, max_depth=2)
# Act - Generate schema in syntactic mode with depth limit of 2
result = self.schema_generator.generate_schema_from_file(temp_file, max_depth=2, mode='syntactic')
# Assert - Only levels 1 and 2 should be included
properties = result.get("properties", {})
@@ -173,8 +173,8 @@ Some implementation notes here.
temp_file = Path(f.name)
try:
# Act - Generate schema
result = self.schema_generator.generate_schema_from_file(temp_file)
# Act - Generate schema in syntactic mode
result = self.schema_generator.generate_schema_from_file(temp_file, mode='syntactic')
# Assert - Schema should capture complex structures
properties = result.get("properties", {})

View File

@@ -47,8 +47,8 @@ Some text here.
temp_file = Path(f.name)
try:
# Act - Generate schema with unlimited depth
result = self.schema_generator.generate_schema_from_file(temp_file)
# Act - Generate schema in syntactic mode (element counting)
result = self.schema_generator.generate_schema_from_file(temp_file, mode='syntactic')
# Assert - Schema should be valid JSON and contain expected structure
assert isinstance(result, dict)
@@ -104,8 +104,8 @@ Very deep content.
temp_file = Path(f.name)
try:
# Act - Generate schema with depth limit of 2
result = self.schema_generator.generate_schema_from_file(temp_file, max_depth=2)
# Act - Generate schema in syntactic mode with depth limit of 2
result = self.schema_generator.generate_schema_from_file(temp_file, max_depth=2, mode='syntactic')
# Assert - Only levels 1 and 2 should be included
properties = result.get("properties", {})
@@ -238,8 +238,8 @@ def api_function():
temp_file = Path(f.name)
try:
# Act - Generate schema
result = self.schema_generator.generate_schema_from_file(temp_file)
# Act - Generate schema in syntactic mode
result = self.schema_generator.generate_schema_from_file(temp_file, mode='syntactic')
# Assert - Should capture comprehensive structure
properties = result.get("properties", {})

View File

@@ -19,7 +19,7 @@ def generate_summary(file_path, ascii_mode=False):
"""Generate a concise 4-line summary of the document structure."""
generator = SchemaGenerator()
schema = generator.generate_schema_from_file(Path(file_path))
schema = generator.generate_schema_from_file(Path(file_path), mode='syntactic')
# Define icons based on mode
if ascii_mode:

View File

@@ -20,7 +20,7 @@ def visualize_schema_structure(file_path, max_depth=None, ascii_only=False):
"""Create a beautiful tree visualization of the document structure."""
generator = SchemaGenerator()
schema = generator.generate_schema_from_file(Path(file_path), max_depth=max_depth)
schema = generator.generate_schema_from_file(Path(file_path), max_depth=max_depth, mode='syntactic')
# Define icons based on ASCII mode
if ascii_only: