"""
Schema Loader - Extract JSON schemas from markdown files.

This module provides functionality to load schemas from markdown files that
contain embedded JSON schemas in code blocks, along with YAML frontmatter
metadata and rich documentation.

Markdown Schema Format:

    ---
    schema-id: "https://markitect.dev/schemas/domain/v1"
    version: "1.0.0"
    status: "stable|draft|deprecated"
    ---

    # Schema Title v1.0

    ## Documentation sections...

    ## Schema Definition

    ```json
    {
      "$schema": "http://json-schema.org/draft-07/schema#",
      ...
    }
    ```

This enables:
- Rich documentation alongside schemas
- Version history in same file
- Human-readable schema files
- Markdown-first approach aligned with MarkiTect philosophy
"""

import json
import re
from pathlib import Path
from typing import Dict, Any, Optional, List, Tuple

import yaml


class SchemaLoaderError(Exception):
    """Base exception for schema loading errors."""
    pass


class InvalidSchemaFormatError(SchemaLoaderError):
    """Schema file format is invalid."""
    pass


class SchemaNotFoundError(SchemaLoaderError):
    """No JSON schema found in markdown file."""
    pass


class MarkdownSchemaLoader:
    """
    Load and parse markdown schema files.

    Supports:
    - YAML frontmatter for metadata
    - JSON code blocks for schema definition
    - Validation of schema structure
    - Metadata merging

    Example:
        >>> loader = MarkdownSchemaLoader()
        >>> schema_data = loader.load_schema(Path("manpage-schema-v1.0.md"))
        >>> schema = schema_data['schema']
        >>> metadata = schema_data['metadata']
    """

    def __init__(self):
        """Initialize the schema loader with regex patterns."""
        # Pattern to match YAML frontmatter.
        # Matches: --- ... --- at the start of the file only (optionally
        # preceded by blank lines). It is anchored with \A rather than a
        # MULTILINE ^ so that a pair of `---` horizontal rules further down
        # the document is never mistaken for frontmatter.
        self.frontmatter_pattern = re.compile(
            r'\A(?:\s*\n)?---\s*\n(.*?)\n---\s*\n',
            re.DOTALL
        )

        # Pattern to match JSON code blocks
        # Matches: ```json ... ```
        self.json_code_block_pattern = re.compile(
            r'```json\s*\n(.*?)\n```',
            re.DOTALL | re.MULTILINE
        )

        # Pattern to find Schema Definition section
        # This helps us find the right JSON block if there are multiple
        self.schema_section_pattern = re.compile(
            r'##\s+Schema Definition\s*\n',
            re.MULTILINE
        )

    def load_schema(self, md_path: Path) -> Dict[str, Any]:
        """
        Load schema from markdown file.

        Args:
            md_path: Path to markdown schema file

        Returns:
            Dictionary containing:
            - schema: Extracted JSON schema with merged metadata (dict)
            - metadata: Frontmatter metadata (dict)
            - documentation: Full markdown content (str)
            - source_file: Source file path (str)

        Raises:
            FileNotFoundError: If schema file doesn't exist
            InvalidSchemaFormatError: If the file cannot be read or its
                frontmatter / JSON is malformed
            SchemaNotFoundError: If no JSON schema found (an empty JSON
                object is treated as "not found" as well)

        Example:
            >>> loader = MarkdownSchemaLoader()
            >>> data = loader.load_schema(Path("manpage-schema-v1.0.md"))
            >>> print(data['schema']['title'])
            'Unix Manual Page Schema'
        """
        if not md_path.exists():
            raise FileNotFoundError(f"Schema file not found: {md_path}")

        # Read file content
        try:
            content = md_path.read_text(encoding='utf-8')
        except (OSError, UnicodeError) as e:
            raise InvalidSchemaFormatError(
                f"Failed to read schema file: {e}"
            ) from e

        # Extract frontmatter
        metadata = self._extract_frontmatter(content)

        # Extract JSON schema
        schema = self._extract_json_schema(content)
        if not schema:
            raise SchemaNotFoundError(
                f"No JSON schema found in {md_path}. "
                f"Expected a ```json code block with schema definition."
            )

        # Merge metadata into schema
        schema = self._merge_metadata(schema, metadata, md_path)

        return {
            'schema': schema,
            'metadata': metadata,
            'documentation': content,
            'source_file': str(md_path)
        }

    def _extract_frontmatter(self, content: str) -> Dict[str, Any]:
        """
        Extract YAML frontmatter from markdown content.

        Args:
            content: Markdown file content

        Returns:
            Dictionary of frontmatter metadata (empty if none found)

        Raises:
            InvalidSchemaFormatError: If YAML is malformed or is not a mapping
        """
        match = self.frontmatter_pattern.search(content)
        if not match:
            return {}

        # Keep the try body limited to the call that can raise YAMLError.
        try:
            metadata = yaml.safe_load(match.group(1)) or {}
        except yaml.YAMLError as e:
            raise InvalidSchemaFormatError(
                f"Invalid YAML frontmatter: {e}"
            ) from e

        if not isinstance(metadata, dict):
            raise InvalidSchemaFormatError(
                f"Frontmatter must be a YAML dictionary, got {type(metadata)}"
            )
        return metadata

    def _extract_json_schema(self, content: str) -> Optional[Dict[str, Any]]:
        """
        Extract JSON schema from markdown code blocks.

        Prefers the first JSON block after the "## Schema Definition"
        heading, but will use the first JSON block in the document if there
        is no such heading or no JSON block follows it.

        Args:
            content: Markdown file content

        Returns:
            JSON schema dictionary or None if not found

        Raises:
            InvalidSchemaFormatError: If JSON is malformed or not an object
        """
        first_block = self.json_code_block_pattern.search(content)
        if first_block is None:
            return None

        # Default: first JSON block in the entire document
        json_text = first_block.group(1)

        # Prefer the first JSON block that follows the Schema Definition
        # heading, when both the heading and such a block exist.
        section_match = self.schema_section_pattern.search(content)
        if section_match:
            section_block = self.json_code_block_pattern.search(
                content, section_match.end()
            )
            if section_block:
                json_text = section_block.group(1)

        # Parse JSON
        try:
            schema = json.loads(json_text)
        except json.JSONDecodeError as e:
            raise InvalidSchemaFormatError(f"Invalid JSON schema: {e}") from e

        if not isinstance(schema, dict):
            raise InvalidSchemaFormatError(
                f"Schema must be a JSON object, got {type(schema)}"
            )
        return schema

    def _merge_metadata(
        self,
        schema: Dict[str, Any],
        metadata: Dict[str, Any],
        source_file: Path
    ) -> Dict[str, Any]:
        """
        Merge frontmatter metadata into schema.

        Adds x-markitect-source extension with file info and metadata.
        Overrides schema fields with frontmatter values when present.
        The input schema is left untouched, including its nested
        x-markitect-metadata object.

        Args:
            schema: JSON schema dictionary
            metadata: Frontmatter metadata dictionary
            source_file: Path to source file

        Returns:
            Schema with merged metadata

        Raises:
            InvalidSchemaFormatError: If a status has to be merged but the
                schema's existing x-markitect-metadata is not a JSON object
        """
        # Shallow copy: top-level keys can be replaced without touching the
        # caller's dict. Nested values that we modify are copied below.
        merged_schema = schema.copy()

        # Add MarkiTect-specific source metadata
        merged_schema['x-markitect-source'] = {
            'file': str(source_file),
            'filename': source_file.name,
            'format': 'markdown',
            'frontmatter': metadata
        }

        # Override schema fields with frontmatter if present
        # This allows frontmatter to be the source of truth for metadata
        if 'version' in metadata:
            merged_schema['version'] = metadata['version']

        if 'schema-id' in metadata:
            merged_schema['$id'] = metadata['schema-id']

        if 'status' in metadata:
            existing = merged_schema.get('x-markitect-metadata', {})
            if not isinstance(existing, dict):
                raise InvalidSchemaFormatError(
                    "x-markitect-metadata must be a JSON object, "
                    f"got {type(existing)}"
                )
            # Copy before writing so the original schema's nested dict is
            # not mutated through the shallow copy.
            extension = dict(existing)
            extension['status'] = metadata['status']
            merged_schema['x-markitect-metadata'] = extension

        return merged_schema

    def save_schema(
        self,
        schema: Dict[str, Any],
        md_path: Path,
        template: Optional[str] = None,
        frontmatter: Optional[Dict[str, Any]] = None
    ) -> None:
        """
        Save schema as markdown file.

        Parent directories of md_path are created if they do not exist.

        Args:
            schema: JSON schema dictionary to save
            md_path: Output path for markdown file
            template: Optional markdown template string (str.format syntax)
            frontmatter: Optional frontmatter metadata
                (extracted from schema if not provided)

        Raises:
            InvalidSchemaFormatError: If the template references an unknown
                key or the file cannot be written

        Example:
            >>> loader = MarkdownSchemaLoader()
            >>> loader.save_schema(
            ...     schema={'title': 'My Schema'},
            ...     md_path=Path('my-schema-v1.0.md')
            ... )
        """
        if template:
            # Use provided template
            content = self._render_template(template, schema, frontmatter)
        else:
            # Generate basic markdown
            content = self._generate_markdown(schema, frontmatter)

        # Create parent directory if needed
        md_path.parent.mkdir(parents=True, exist_ok=True)

        # Write file
        try:
            md_path.write_text(content, encoding='utf-8')
        except (OSError, UnicodeError) as e:
            raise InvalidSchemaFormatError(
                f"Failed to write schema file: {e}"
            ) from e

    def _generate_markdown(
        self,
        schema: Dict[str, Any],
        frontmatter: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Generate markdown from schema.

        Args:
            schema: JSON schema dictionary
            frontmatter: Optional frontmatter metadata. Missing schema-id,
                version and status entries are filled with defaults in the
                generated output; the dictionary passed in is not modified.

        Returns:
            Markdown content as string
        """
        # Extract metadata from schema
        title = schema.get('title', 'Untitled Schema')
        version = schema.get('version', '1.0.0')
        description = schema.get('description', '')
        schema_id = schema.get('$id', '')

        # Work on a copy so the defaults below never leak into the caller's
        # dictionary.
        frontmatter = dict(frontmatter or {})

        # Set defaults
        if 'schema-id' not in frontmatter and schema_id:
            frontmatter['schema-id'] = schema_id
        frontmatter.setdefault('version', version)
        frontmatter.setdefault('status', 'draft')

        # Generate frontmatter YAML
        frontmatter_yaml = yaml.dump(
            frontmatter,
            default_flow_style=False,
            allow_unicode=True
        ).strip()

        # Generate JSON (pretty-printed)
        schema_json = json.dumps(schema, indent=2, ensure_ascii=False)

        # Last path component of the schema id, used in the usage example
        schema_ref = Path(frontmatter.get('schema-id', 'schema')).name

        # Build markdown content
        md_content = f"""---
{frontmatter_yaml}
---

# {title} v{version}

## Overview

{description}

## Usage

```bash
markitect validate document.md --schema {schema_ref}
```

## Schema Definition

```json
{schema_json}
```

## Version History

### v{version}
- Initial version
"""
        return md_content

    def _render_template(
        self,
        template: str,
        schema: Dict[str, Any],
        frontmatter: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Render markdown from template.

        Simple template rendering using str.format, so literal braces in
        the template must be doubled ({{ and }}). For complex templates,
        consider using Jinja2 or similar.

        Available placeholders: title, version, description, schema_id,
        schema_json, frontmatter.

        Args:
            template: Template string
            schema: JSON schema dictionary
            frontmatter: Optional frontmatter metadata

        Returns:
            Rendered markdown content

        Raises:
            InvalidSchemaFormatError: If the template references a
                placeholder that is not in the context
        """
        # Build context for template
        context = {
            'title': schema.get('title', 'Untitled'),
            'version': schema.get('version', '1.0.0'),
            'description': schema.get('description', ''),
            'schema_id': schema.get('$id', ''),
            'schema_json': json.dumps(schema, indent=2, ensure_ascii=False),
            'frontmatter': frontmatter or {},
        }

        # Simple template rendering
        try:
            return template.format(**context)
        except KeyError as e:
            raise InvalidSchemaFormatError(
                f"Template missing key: {e}"
            ) from e

    def list_json_blocks(self, content: str) -> List[Tuple[int, str]]:
        """
        List all JSON code blocks in markdown content.

        Useful for debugging or when multiple JSON blocks exist.

        Args:
            content: Markdown file content

        Returns:
            List of (position, json_content) tuples, where position is the
            offset of the opening code fence in content

        Example:
            >>> loader = MarkdownSchemaLoader()
            >>> content = Path('schema.md').read_text()
            >>> blocks = loader.list_json_blocks(content)
            >>> print(f"Found {len(blocks)} JSON blocks")
        """
        return [
            (match.start(), match.group(1))
            for match in self.json_code_block_pattern.finditer(content)
        ]

    def validate_schema_structure(self, schema: Dict[str, Any]) -> List[str]:
        """
        Validate basic schema structure.

        Checks for required JSON Schema fields and MarkiTect conventions.

        Args:
            schema: JSON schema dictionary

        Returns:
            List of warning/error messages (empty if valid)

        Example:
            >>> loader = MarkdownSchemaLoader()
            >>> issues = loader.validate_schema_structure(schema)
            >>> if issues:
            ...     print("Schema issues:", issues)
        """
        # (field, message reported when the field is absent), in report order
        presence_checks = (
            ('$schema', "Missing required field: $schema"),
            ('type', "Missing recommended field: type"),
            ('title', "Missing recommended field: title"),
            ('description', "Missing recommended field: description"),
            ('version', "Missing MarkiTect convention: version field"),
            ('$id', "Missing recommended field: $id"),
        )
        issues = [
            message for field, message in presence_checks
            if field not in schema
        ]

        # Check $id format if present
        if '$id' in schema:
            schema_id = schema['$id']
            if not isinstance(schema_id, str):
                issues.append("$id must be a string")
            elif not schema_id.startswith('https://'):
                issues.append("$id should be a full HTTPS URL")

        return issues


def auto_ingest_schemas(
    db_manager=None,
    schema_dir: Optional[Path] = None,
    verbose: bool = False
) -> Dict[str, Any]:
    """Automatically ingest schemas from markitect/schemas/ directory.

    This function scans the schemas directory for files matching
    ``*-schema-v*.md`` and ingests any whose file name is not already
    listed by the database. Useful for post-install setup or automatic
    schema registration.

    Args:
        db_manager: DatabaseManager instance (optional, will create one
            backed by ~/.markitect/markitect.db if not provided). It must
            provide list_schemas() and store_schema_file(name, content).
        schema_dir: Directory containing schemas
            (defaults to the "schemas" directory next to this module)
        verbose: If True, print detailed progress messages

    Returns:
        Dictionary with ingestion results:
        {
            'ingested': [list of schema names that were ingested],
            'skipped': [list of schema names that were already present],
            'failed': [list of (schema_name, error) tuples for failures]
        }
        All three lists are empty when the schema directory does not exist
        or when the existing schemas cannot be listed.

    Example:
        >>> from markitect.schema_loader import auto_ingest_schemas
        >>> results = auto_ingest_schemas(verbose=True)
        >>> print(f"Ingested {len(results['ingested'])} schemas")
    """
    # Determine schema directory
    if schema_dir is None:
        schema_dir = Path(__file__).parent / "schemas"

    if not schema_dir.exists():
        if verbose:
            print(f"⚠️ Schema directory not found: {schema_dir}")
        return {'ingested': [], 'skipped': [], 'failed': []}

    # Initialize database manager if not provided
    if db_manager is None:
        # Imported lazily so the loader class stays usable without the
        # database module.
        from .database import DatabaseManager
        db_path = Path.home() / '.markitect' / 'markitect.db'
        db_manager = DatabaseManager(str(db_path))
        db_manager.initialize_database()

    # Get list of already ingested schemas.
    # Best-effort: a database problem aborts ingestion with empty results
    # instead of raising, so post-install hooks do not crash.
    try:
        existing_schemas = {
            schema['name'] for schema in db_manager.list_schemas()
        }
    except Exception as e:
        if verbose:
            print(f"❌ Error listing existing schemas: {e}")
        return {'ingested': [], 'skipped': [], 'failed': []}

    results = {
        'ingested': [],
        'skipped': [],
        'failed': []
    }

    # Find all schema files (sorted for a deterministic ingestion order)
    schema_files = sorted(schema_dir.glob("*-schema-v*.md"))

    if verbose and schema_files:
        print(f"🔍 Found {len(schema_files)} schema file(s) in {schema_dir}")

    loader = MarkdownSchemaLoader()

    for schema_file in schema_files:
        schema_name = schema_file.name

        # Skip if already ingested
        if schema_name in existing_schemas:
            results['skipped'].append(schema_name)
            if verbose:
                print(f"⏭️ Skipping {schema_name} (already ingested)")
            continue

        # Try to ingest; one bad file must not stop the remaining ones, so
        # any failure is recorded and the loop continues.
        try:
            # Load schema
            schema_data = loader.load_schema(schema_file)['schema']

            # Store in database
            schema_content = json.dumps(schema_data, indent=2)
            record_id = db_manager.store_schema_file(
                schema_name, schema_content
            )

            if record_id:
                results['ingested'].append(schema_name)
                if verbose:
                    title = schema_data.get('title', schema_name)
                    print(f"✅ Ingested {schema_name} (title: {title})")
            else:
                results['failed'].append(
                    (schema_name, "Failed to store in database")
                )
                if verbose:
                    print(f"❌ Failed to store {schema_name} in database")

        except Exception as e:
            results['failed'].append((schema_name, str(e)))
            if verbose:
                print(f"❌ Failed to ingest {schema_name}: {e}")

    if verbose:
        print("\n📊 Auto-ingestion complete:")
        print(f" Ingested: {len(results['ingested'])}")
        print(f" Skipped: {len(results['skipped'])}")
        print(f" Failed: {len(results['failed'])}")

    return results