Add automated schema ingestion from markitect/schemas/ directory: - Create auto_ingest_schemas() function in schema_loader module - Automatically detect and ingest .md schema files from schemas/ - Skip schemas that are already ingested in database - Return detailed results with ingested/skipped/failed lists - Add 'markitect schema-auto-ingest' CLI command - Support verbose mode for detailed progress reporting - Useful for post-install setup and development workflows This eliminates the manual step of running schema-ingest for each bundled schema file, streamlining schema management. Usage: markitect schema-auto-ingest # Ingest all new schemas markitect schema-auto-ingest --verbose # Show detailed progress 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
611 lines
19 KiB
Python
611 lines
19 KiB
Python
"""
Schema Loader - Extract JSON schemas from markdown files.

This module provides functionality to load schemas from markdown files that
contain embedded JSON schemas in code blocks, along with YAML frontmatter
metadata and rich documentation.

Markdown Schema Format:
    ---
    schema-id: "https://markitect.dev/schemas/domain/v1"
    version: "1.0.0"
    status: "stable|draft|deprecated"
    ---

    # Schema Title v1.0

    ## Documentation sections...

    ## Schema Definition

    ```json
    {
      "$schema": "http://json-schema.org/draft-07/schema#",
      ...
    }
    ```

This enables:
- Rich documentation alongside schemas
- Version history in same file
- Human-readable schema files
- Markdown-first approach aligned with MarkiTect philosophy
"""
|
|
|
|
import re
|
|
import json
|
|
import yaml
|
|
from pathlib import Path
|
|
from typing import Dict, Any, Optional, List, Tuple
|
|
|
|
|
|
class SchemaLoaderError(Exception):
    """Root of the schema-loader exception hierarchy.

    Catch this type to handle any error raised while loading or saving a
    markdown schema file.
    """
|
|
|
|
|
|
class InvalidSchemaFormatError(SchemaLoaderError):
    """Raised when a schema file (or part of it) is not in a valid format."""
|
|
|
|
|
|
class SchemaNotFoundError(SchemaLoaderError):
    """Raised when a markdown file contains no JSON schema."""
|
|
|
|
|
|
class MarkdownSchemaLoader:
    """
    Load and parse markdown schema files.

    Supports:
    - YAML frontmatter for metadata
    - JSON code blocks for schema definition
    - Validation of schema structure
    - Metadata merging

    Example:
        >>> loader = MarkdownSchemaLoader()
        >>> schema_data = loader.load_schema(Path("manpage-schema-v1.0.md"))
        >>> schema = schema_data['schema']
        >>> metadata = schema_data['metadata']
    """

    def __init__(self):
        """Compile the regex patterns used to locate frontmatter and JSON blocks."""
        # YAML frontmatter: a `--- ... ---` fenced block that opens the file.
        # The opening fence is pinned to the top of the document (only
        # leading whitespace may precede it) so that a pair of markdown
        # horizontal rules further down is never mistaken for frontmatter.
        self.frontmatter_pattern = re.compile(
            r'\A\s*^---[ \t]*\n(.*?)^---[ \t]*$',
            re.DOTALL | re.MULTILINE
        )

        # Fenced JSON code block: ```json ... ```
        self.json_code_block_pattern = re.compile(
            r'```json\s*\n(.*?)\n```',
            re.DOTALL | re.MULTILINE
        )

        # "## Schema Definition" heading; used to pick the right JSON block
        # when a document contains several.
        self.schema_section_pattern = re.compile(
            r'##\s+Schema Definition\s*\n',
            re.MULTILINE
        )

    def load_schema(self, md_path: Path) -> Dict[str, Any]:
        """
        Load schema from markdown file.

        Args:
            md_path: Path to markdown schema file (a plain string is accepted
                and converted to a Path)

        Returns:
            Dictionary containing:
            - schema: Extracted JSON schema with merged metadata (dict)
            - metadata: Frontmatter metadata (dict)
            - documentation: Full markdown content (str)
            - source_file: Source file path (str)

        Raises:
            FileNotFoundError: If schema file doesn't exist
            InvalidSchemaFormatError: If the file cannot be read or its
                frontmatter/JSON is malformed
            SchemaNotFoundError: If no (non-empty) JSON schema found

        Example:
            >>> loader = MarkdownSchemaLoader()
            >>> data = loader.load_schema(Path("manpage-schema-v1.0.md"))
            >>> print(data['schema']['title'])
            'Unix Manual Page Schema'
        """
        md_path = Path(md_path)
        if not md_path.exists():
            raise FileNotFoundError(f"Schema file not found: {md_path}")

        try:
            content = md_path.read_text(encoding='utf-8')
        except (OSError, ValueError) as e:
            # OSError: permissions, directories, I/O failures.
            # ValueError: includes UnicodeDecodeError for non-UTF-8 content.
            raise InvalidSchemaFormatError(f"Failed to read schema file: {e}") from e

        metadata = self._extract_frontmatter(content)
        schema = self._extract_json_schema(content)

        # An empty JSON object is treated the same as a missing block.
        if not schema:
            raise SchemaNotFoundError(
                f"No JSON schema found in {md_path}. "
                f"Expected a ```json code block with schema definition."
            )

        return {
            'schema': self._merge_metadata(schema, metadata, md_path),
            'metadata': metadata,
            'documentation': content,
            'source_file': str(md_path)
        }

    def _extract_frontmatter(self, content: str) -> Dict[str, Any]:
        """
        Extract YAML frontmatter from markdown content.

        Only a `---` fenced block at the very top of the document counts as
        frontmatter.

        Args:
            content: Markdown file content

        Returns:
            Dictionary of frontmatter metadata (empty if none found)

        Raises:
            InvalidSchemaFormatError: If YAML is malformed or is not a mapping
        """
        match = self.frontmatter_pattern.search(content)
        if not match:
            return {}

        try:
            metadata = yaml.safe_load(match.group(1)) or {}
        except yaml.YAMLError as e:
            raise InvalidSchemaFormatError(f"Invalid YAML frontmatter: {e}") from e

        if not isinstance(metadata, dict):
            raise InvalidSchemaFormatError(
                f"Frontmatter must be a YAML dictionary, got {type(metadata)}"
            )
        return metadata

    def _extract_json_schema(self, content: str) -> Optional[Dict[str, Any]]:
        """
        Extract JSON schema from markdown code blocks.

        Prefers the first JSON block after the "## Schema Definition"
        heading, and falls back to the first JSON block in the document when
        there is no such heading or no block follows it.

        Args:
            content: Markdown file content

        Returns:
            JSON schema dictionary or None if not found

        Raises:
            InvalidSchemaFormatError: If JSON is malformed or not an object
        """
        block_pattern = self.json_code_block_pattern

        block = None
        section = self.schema_section_pattern.search(content)
        if section:
            # Start scanning right after the heading.
            block = block_pattern.search(content, section.end())
        if block is None:
            block = block_pattern.search(content)
        if block is None:
            return None

        try:
            schema = json.loads(block.group(1))
        except json.JSONDecodeError as e:
            raise InvalidSchemaFormatError(f"Invalid JSON schema: {e}") from e

        if not isinstance(schema, dict):
            raise InvalidSchemaFormatError(
                f"Schema must be a JSON object, got {type(schema)}"
            )
        return schema

    def _merge_metadata(
        self,
        schema: Dict[str, Any],
        metadata: Dict[str, Any],
        source_file: Path
    ) -> Dict[str, Any]:
        """
        Merge frontmatter metadata into schema.

        Adds x-markitect-source extension with file info and metadata, and
        lets frontmatter override `version`, `$id` (from `schema-id`) and
        `x-markitect-metadata.status`. Neither input dictionary is modified.

        Args:
            schema: JSON schema dictionary
            metadata: Frontmatter metadata dictionary
            source_file: Path to source file

        Returns:
            New schema dictionary with merged metadata
        """
        merged_schema = dict(schema)

        merged_schema['x-markitect-source'] = {
            'file': str(source_file),
            'filename': source_file.name,
            'format': 'markdown',
            # Copied so later edits to the schema cannot leak into `metadata`.
            'frontmatter': dict(metadata)
        }

        # Frontmatter is the source of truth for these fields.
        if 'version' in metadata:
            merged_schema['version'] = metadata['version']

        if 'schema-id' in metadata:
            merged_schema['$id'] = metadata['schema-id']

        if 'status' in metadata:
            # dict(schema) is only a shallow copy, so the nested mapping must
            # be copied too before writing into it; otherwise the caller's
            # schema would be mutated. A non-mapping value is replaced.
            existing = merged_schema.get('x-markitect-metadata')
            markitect_meta = dict(existing) if isinstance(existing, dict) else {}
            markitect_meta['status'] = metadata['status']
            merged_schema['x-markitect-metadata'] = markitect_meta

        return merged_schema

    def save_schema(
        self,
        schema: Dict[str, Any],
        md_path: Path,
        template: Optional[str] = None,
        frontmatter: Optional[Dict[str, Any]] = None
    ) -> None:
        """
        Save schema as markdown file.

        Args:
            schema: JSON schema dictionary to save
            md_path: Output path for markdown file (a plain string is
                accepted); missing parent directories are created
            template: Optional markdown template string
            frontmatter: Optional frontmatter metadata (extracted from schema if not provided)

        Raises:
            InvalidSchemaFormatError: If the template cannot be rendered or
                the file cannot be written

        Example:
            >>> loader = MarkdownSchemaLoader()
            >>> loader.save_schema(
            ...     schema={'title': 'My Schema', ...},
            ...     md_path=Path('my-schema-v1.0.md')
            ... )
        """
        md_path = Path(md_path)

        if template:
            content = self._render_template(template, schema, frontmatter)
        else:
            content = self._generate_markdown(schema, frontmatter)

        md_path.parent.mkdir(parents=True, exist_ok=True)

        try:
            md_path.write_text(content, encoding='utf-8')
        except (OSError, ValueError) as e:
            # ValueError covers UnicodeEncodeError for unencodable text.
            raise InvalidSchemaFormatError(f"Failed to write schema file: {e}") from e

    def _generate_markdown(
        self,
        schema: Dict[str, Any],
        frontmatter: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Generate markdown from schema.

        Args:
            schema: JSON schema dictionary
            frontmatter: Optional frontmatter metadata; missing `schema-id`,
                `version` and `status` entries are filled in on a copy, the
                caller's dictionary is left untouched

        Returns:
            Markdown content as string
        """
        title = schema.get('title', 'Untitled Schema')
        version = schema.get('version', '1.0.0')
        description = schema.get('description', '')
        schema_id = schema.get('$id', '')

        # Work on a copy so the defaults below never leak into the caller's dict.
        frontmatter = dict(frontmatter) if frontmatter is not None else {}
        if 'schema-id' not in frontmatter and schema_id:
            frontmatter['schema-id'] = schema_id
        frontmatter.setdefault('version', version)
        frontmatter.setdefault('status', 'draft')

        frontmatter_yaml = yaml.dump(
            frontmatter,
            default_flow_style=False,
            allow_unicode=True
        ).strip()

        schema_json = json.dumps(schema, indent=2, ensure_ascii=False)

        # Last path segment of the schema id is used as the CLI schema name.
        schema_name = Path(frontmatter.get('schema-id', 'schema')).name

        lines = [
            "---",
            frontmatter_yaml,
            "---",
            "",
            f"# {title} v{version}",
            "",
            "## Overview",
            "",
            f"{description}",
            "",
            "## Usage",
            "",
            "```bash",
            f"markitect validate document.md --schema {schema_name}",
            "```",
            "",
            "## Schema Definition",
            "",
            "```json",
            schema_json,
            "```",
            "",
            "## Version History",
            "",
            f"### v{version}",
            "- Initial version",
        ]
        return "\n".join(lines) + "\n"

    def _render_template(
        self,
        template: str,
        schema: Dict[str, Any],
        frontmatter: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Render markdown from template.

        Simple template rendering using `str.format`; literal braces in the
        template must therefore be doubled (`{{` / `}}`).
        For complex templates, consider using Jinja2 or similar.

        Available fields: title, version, description, schema_id,
        schema_json, frontmatter.

        Args:
            template: Template string
            schema: JSON schema dictionary
            frontmatter: Optional frontmatter metadata

        Returns:
            Rendered markdown content

        Raises:
            InvalidSchemaFormatError: If the template references an unknown
                field or is not a valid format string
        """
        context = {
            'title': schema.get('title', 'Untitled'),
            'version': schema.get('version', '1.0.0'),
            'description': schema.get('description', ''),
            'schema_id': schema.get('$id', ''),
            'schema_json': json.dumps(schema, indent=2, ensure_ascii=False),
            'frontmatter': frontmatter or {},
        }

        try:
            return template.format(**context)
        except KeyError as e:
            raise InvalidSchemaFormatError(f"Template missing key: {e}") from e
        except (IndexError, ValueError) as e:
            # Positional fields ("{}", "{0}") or unbalanced braces.
            raise InvalidSchemaFormatError(f"Invalid template: {e}") from e

    def list_json_blocks(self, content: str) -> List[Tuple[int, str]]:
        """
        List all JSON code blocks in markdown content.

        Useful for debugging or when multiple JSON blocks exist.

        Args:
            content: Markdown file content

        Returns:
            List of (position, json_content) tuples, where position is the
            offset of the opening fence in `content`

        Example:
            >>> loader = MarkdownSchemaLoader()
            >>> content = Path('schema.md').read_text()
            >>> blocks = loader.list_json_blocks(content)
            >>> print(f"Found {len(blocks)} JSON blocks")
        """
        return [
            (match.start(), match.group(1))
            for match in self.json_code_block_pattern.finditer(content)
        ]

    def validate_schema_structure(self, schema: Dict[str, Any]) -> List[str]:
        """
        Validate basic schema structure.

        Checks for required JSON Schema fields and MarkiTect conventions.

        Args:
            schema: JSON schema dictionary

        Returns:
            List of warning/error messages (empty if valid)

        Example:
            >>> loader = MarkdownSchemaLoader()
            >>> issues = loader.validate_schema_structure(schema)
            >>> if issues:
            ...     print("Schema issues:", issues)
        """
        issues = []

        if '$schema' not in schema:
            issues.append("Missing required field: $schema")

        for field in ('type', 'title', 'description'):
            if field not in schema:
                issues.append(f"Missing recommended field: {field}")

        if 'version' not in schema:
            issues.append("Missing MarkiTect convention: version field")

        if '$id' not in schema:
            issues.append("Missing recommended field: $id")
        else:
            schema_id = schema['$id']
            if not isinstance(schema_id, str):
                issues.append("$id must be a string")
            elif not schema_id.startswith('https://'):
                issues.append("$id should be a full HTTPS URL")

        return issues
|
|
|
|
|
|
def auto_ingest_schemas(db_manager=None, schema_dir: Optional[Path] = None, verbose: bool = False) -> Dict[str, Any]:
    """Automatically ingest schemas from markitect/schemas/ directory.

    This function scans the schemas directory for files matching
    ``*-schema-v*.md`` and ingests any whose file name is not already listed
    by the database. Useful for post-install setup or automatic schema
    registration.

    Args:
        db_manager: DatabaseManager instance (optional; when omitted one is
            created for ``~/.markitect/markitect.db`` and initialized)
        schema_dir: Directory containing schemas (defaults to the ``schemas``
            directory next to this module); a plain string is accepted
        verbose: If True, print detailed progress messages

    Returns:
        Dictionary with ingestion results:
        {
            'ingested': [list of schema names that were ingested],
            'skipped': [list of schema names that were already present],
            'failed': [list of (schema_name, error) tuples for failures]
        }
        If the existing schemas cannot be listed, nothing is ingested and
        ``failed`` holds a single ``('<database>', error)`` entry so the
        failure is visible to callers even without ``verbose``.

    Example:
        >>> from markitect.schema_loader import auto_ingest_schemas
        >>> results = auto_ingest_schemas(verbose=True)
        >>> print(f"Ingested {len(results['ingested'])} schemas")
    """
    results: Dict[str, Any] = {'ingested': [], 'skipped': [], 'failed': []}

    # Resolve the schema directory; Path() also normalizes str arguments.
    if schema_dir is None:
        schema_dir = Path(__file__).parent / "schemas"
    else:
        schema_dir = Path(schema_dir)

    if not schema_dir.exists():
        if verbose:
            print(f"⚠️ Schema directory not found: {schema_dir}")
        return results

    # Create a database manager only when the caller did not supply one.
    if db_manager is None:
        from .database import DatabaseManager
        db_path = Path.home() / '.markitect' / 'markitect.db'
        db_manager = DatabaseManager(str(db_path))
        db_manager.initialize_database()

    # Names already stored; these files are skipped below.
    try:
        existing_schemas = {schema['name'] for schema in db_manager.list_schemas()}
    except Exception as e:
        if verbose:
            print(f"❌ Error listing existing schemas: {e}")
        # Report the failure instead of returning an "all clear" result.
        results['failed'].append(('<database>', str(e)))
        return results

    schema_files = sorted(schema_dir.glob("*-schema-v*.md"))

    if verbose and schema_files:
        print(f"🔍 Found {len(schema_files)} schema file(s) in {schema_dir}")

    loader = MarkdownSchemaLoader()

    for schema_file in schema_files:
        schema_name = schema_file.name

        if schema_name in existing_schemas:
            results['skipped'].append(schema_name)
            if verbose:
                print(f"⏭️ Skipping {schema_name} (already ingested)")
            continue

        # One bad file must not abort the whole run, so every failure is
        # recorded and the loop moves on to the next file.
        try:
            schema_data = loader.load_schema(schema_file)['schema']
            schema_content = json.dumps(schema_data, indent=2)
            record_id = db_manager.store_schema_file(schema_name, schema_content)
        except Exception as e:
            results['failed'].append((schema_name, str(e)))
            if verbose:
                print(f"❌ Failed to ingest {schema_name}: {e}")
            continue

        if record_id:
            results['ingested'].append(schema_name)
            if verbose:
                title = schema_data.get('title', schema_name)
                print(f"✅ Ingested {schema_name} (title: {title})")
        else:
            # A falsy id is treated as a storage failure.
            results['failed'].append((schema_name, "Failed to store in database"))
            if verbose:
                print(f"❌ Failed to store {schema_name} in database")

    if verbose:
        print("\n📊 Auto-ingestion complete:")
        print(f" Ingested: {len(results['ingested'])}")
        print(f" Skipped: {len(results['skipped'])}")
        print(f" Failed: {len(results['failed'])}")

    return results
|