feat(spaces): implement Phase 0-1 of Information Space Service
Phase 0 - Project Organization: - Create docs/PROJECT_STRUCTURE.md documenting codebase layout - Create markitect/core/ with parser, serializer, document_manager, workspace - Create markitect/schema/ consolidating 6 schema_*.py modules - Create markitect/storage/ with database module - Maintain backward compatibility via re-exports from original locations - Add docs/roadmap/information-space-service/ with README and WORKPLAN Phase 1 - Foundation (Weeks 1-3): - Week 1: Core domain models (InformationSpace, SpaceDocument, SpaceConfig, SpaceMetadata, SpaceVariable, TransclusionReference, SpaceStatus) - Week 2: Repository layer with interfaces (ISpaceRepository, IDocumentAssociationRepository, IVariableRepository, IReferenceRepository) and SQLite implementations with foreign key cascade deletes - Week 3: SpaceService orchestration layer with full CRUD, document, variable, and reference tracking operations Test coverage: 124 tests (25 model + 63 repository + 36 integration) Capabilities delivered: - CAP-001: InformationSpace entity with lifecycle management - CAP-002: SpaceRepository CRUD with SQLite backing - CAP-003: Document-Space associations with path-based organization - CAP-004: Space metadata and configuration schemas - CAP-005: Database schema with migrations Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,610 +1,23 @@
|
||||
"""
|
||||
Schema Loader - Extract JSON schemas from markdown files.
|
||||
Schema Loader - Backward Compatibility Module.
|
||||
|
||||
This module provides functionality to load schemas from markdown files that
|
||||
contain embedded JSON schemas in code blocks, along with YAML frontmatter
|
||||
metadata and rich documentation.
|
||||
|
||||
Markdown Schema Format:
|
||||
---
|
||||
schema-id: "https://markitect.dev/schemas/domain/v1"
|
||||
version: "1.0.0"
|
||||
status: "stable|draft|deprecated"
|
||||
---
|
||||
|
||||
# Schema Title v1.0
|
||||
|
||||
## Documentation sections...
|
||||
|
||||
## Schema Definition
|
||||
|
||||
```json
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
This enables:
|
||||
- Rich documentation alongside schemas
|
||||
- Version history in same file
|
||||
- Human-readable schema files
|
||||
- Markdown-first approach aligned with MarkiTect philosophy
|
||||
This module re-exports from markitect.schema.loader for backward compatibility.
|
||||
New code should import from markitect.schema.loader directly.
|
||||
"""
|
||||
|
||||
import re
|
||||
import json
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional, List, Tuple
|
||||
|
||||
|
||||
class SchemaLoaderError(Exception):
    """Root of the schema-loading exception hierarchy.

    Catch this type to handle every error raised while loading or
    saving markdown schema files.
    """
|
||||
|
||||
|
||||
class InvalidSchemaFormatError(SchemaLoaderError):
    """Raised when a schema file cannot be read, parsed, or written.

    Covers malformed YAML frontmatter, malformed JSON, and file I/O
    failures wrapped by the loader.
    """
|
||||
|
||||
|
||||
class SchemaNotFoundError(SchemaLoaderError):
    """Raised when a markdown file contains no usable JSON schema block."""
|
||||
|
||||
|
||||
class MarkdownSchemaLoader:
    """
    Load and parse markdown schema files.

    Supports:
    - YAML frontmatter for metadata
    - JSON code blocks for schema definition
    - Validation of schema structure
    - Metadata merging

    Example:
        >>> loader = MarkdownSchemaLoader()
        >>> schema_data = loader.load_schema(Path("manpage-schema-v1.0.md"))
        >>> schema = schema_data['schema']
        >>> metadata = schema_data['metadata']
    """

    def __init__(self):
        """Initialize the schema loader with regex patterns."""
        # YAML frontmatter: "--- ... ---" at the very start of the document
        # (leading blank lines tolerated). Anchored with \A rather than a
        # multiline ^ so that horizontal rules further down the document are
        # never mistaken for frontmatter.
        self.frontmatter_pattern = re.compile(
            r'\A\s*---\s*\n(.*?)\n---\s*\n',
            re.DOTALL
        )

        # Fenced JSON code blocks: ```json ... ```
        self.json_code_block_pattern = re.compile(
            r'```json\s*\n(.*?)\n```',
            re.DOTALL | re.MULTILINE
        )

        # "## Schema Definition" heading; used to pick the right JSON block
        # when a document contains several.
        self.schema_section_pattern = re.compile(
            r'##\s+Schema Definition\s*\n',
            re.MULTILINE
        )

    def load_schema(self, md_path: Path) -> Dict[str, Any]:
        """
        Load schema from markdown file.

        Args:
            md_path: Path to markdown schema file

        Returns:
            Dictionary containing:
            - schema: Extracted JSON schema with merged metadata (dict)
            - metadata: Frontmatter metadata (dict)
            - documentation: Full markdown content (str)
            - source_file: Source file path (str)

        Raises:
            FileNotFoundError: If schema file doesn't exist
            InvalidSchemaFormatError: If the file cannot be read or its
                YAML/JSON is malformed
            SchemaNotFoundError: If no JSON schema found (an empty JSON
                object is treated as "not found" as well)

        Example:
            >>> loader = MarkdownSchemaLoader()
            >>> data = loader.load_schema(Path("manpage-schema-v1.0.md"))
            >>> print(data['schema']['title'])
            'Unix Manual Page Schema'
        """
        if not md_path.exists():
            raise FileNotFoundError(f"Schema file not found: {md_path}")

        try:
            content = md_path.read_text(encoding='utf-8')
        except Exception as e:
            # Chain the cause so the underlying I/O or decoding error stays visible.
            raise InvalidSchemaFormatError(f"Failed to read schema file: {e}") from e

        metadata = self._extract_frontmatter(content)
        schema = self._extract_json_schema(content)

        if not schema:
            raise SchemaNotFoundError(
                f"No JSON schema found in {md_path}. "
                f"Expected a ```json code block with schema definition."
            )

        schema = self._merge_metadata(schema, metadata, md_path)

        return {
            'schema': schema,
            'metadata': metadata,
            'documentation': content,
            'source_file': str(md_path)
        }

    def _extract_frontmatter(self, content: str) -> Dict[str, Any]:
        """
        Extract YAML frontmatter from markdown content.

        Args:
            content: Markdown file content

        Returns:
            Dictionary of frontmatter metadata (empty if none found)

        Raises:
            InvalidSchemaFormatError: If YAML is malformed or is not a mapping
        """
        match = self.frontmatter_pattern.search(content)
        if not match:
            return {}

        try:
            metadata = yaml.safe_load(match.group(1)) or {}
        except yaml.YAMLError as e:
            raise InvalidSchemaFormatError(f"Invalid YAML frontmatter: {e}") from e

        if not isinstance(metadata, dict):
            raise InvalidSchemaFormatError(
                f"Frontmatter must be a YAML dictionary, got {type(metadata)}"
            )
        return metadata

    def _extract_json_schema(self, content: str) -> Optional[Dict[str, Any]]:
        """
        Extract JSON schema from markdown code blocks.

        Prefers the first JSON block after a "## Schema Definition" heading,
        and falls back to the first JSON block in the document otherwise.

        Args:
            content: Markdown file content

        Returns:
            JSON schema dictionary or None if not found

        Raises:
            InvalidSchemaFormatError: If JSON is malformed or is not an object
        """
        first_block = self.json_code_block_pattern.search(content)
        if first_block is None:
            return None

        # Default choice: first JSON block anywhere in the document.
        json_text = first_block.group(1)

        section = self.schema_section_pattern.search(content)
        if section:
            section_block = self.json_code_block_pattern.search(
                content[section.end():]
            )
            if section_block is not None:
                json_text = section_block.group(1)

        try:
            schema = json.loads(json_text)
        except json.JSONDecodeError as e:
            raise InvalidSchemaFormatError(f"Invalid JSON schema: {e}") from e

        if not isinstance(schema, dict):
            raise InvalidSchemaFormatError(
                f"Schema must be a JSON object, got {type(schema)}"
            )
        return schema

    def _merge_metadata(
        self,
        schema: Dict[str, Any],
        metadata: Dict[str, Any],
        source_file: Path
    ) -> Dict[str, Any]:
        """
        Merge frontmatter metadata into schema.

        Adds x-markitect-source extension with file info and metadata.
        Frontmatter 'version', 'schema-id' and 'status' override the
        corresponding schema fields when present.

        Args:
            schema: JSON schema dictionary (left unmodified)
            metadata: Frontmatter metadata dictionary
            source_file: Path to source file

        Returns:
            New schema dictionary with merged metadata
        """
        merged_schema = schema.copy()

        merged_schema['x-markitect-source'] = {
            'file': str(source_file),
            'filename': source_file.name,
            'format': 'markdown',
            'frontmatter': metadata
        }

        # Frontmatter is the source of truth for these fields.
        if 'version' in metadata:
            merged_schema['version'] = metadata['version']

        if 'schema-id' in metadata:
            merged_schema['$id'] = metadata['schema-id']

        if 'status' in metadata:
            # schema.copy() is shallow: rebuild the nested extension dict so
            # the caller's schema is not mutated through the shared reference.
            extension = dict(merged_schema.get('x-markitect-metadata') or {})
            extension['status'] = metadata['status']
            merged_schema['x-markitect-metadata'] = extension

        return merged_schema

    def save_schema(
        self,
        schema: Dict[str, Any],
        md_path: Path,
        template: Optional[str] = None,
        frontmatter: Optional[Dict[str, Any]] = None
    ):
        """
        Save schema as markdown file.

        Args:
            schema: JSON schema dictionary to save
            md_path: Output path for markdown file
            template: Optional markdown template string
            frontmatter: Optional frontmatter metadata (defaults are derived
                from the schema when no template is given)

        Raises:
            InvalidSchemaFormatError: If the template cannot be rendered or
                the file cannot be written

        Example:
            >>> loader = MarkdownSchemaLoader()
            >>> loader.save_schema(
            ...     schema={'title': 'My Schema', ...},
            ...     md_path=Path('my-schema-v1.0.md')
            ... )
        """
        if template:
            content = self._render_template(template, schema, frontmatter)
        else:
            content = self._generate_markdown(schema, frontmatter)

        # Create parent directory if needed
        md_path.parent.mkdir(parents=True, exist_ok=True)

        try:
            md_path.write_text(content, encoding='utf-8')
        except Exception as e:
            raise InvalidSchemaFormatError(f"Failed to write schema file: {e}") from e

    def _generate_markdown(
        self,
        schema: Dict[str, Any],
        frontmatter: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Generate markdown from schema.

        Args:
            schema: JSON schema dictionary
            frontmatter: Optional frontmatter metadata; missing 'schema-id',
                'version' and 'status' entries are filled in on a private
                copy, the caller's dictionary is not modified

        Returns:
            Markdown content as string
        """
        title = schema.get('title', 'Untitled Schema')
        version = schema.get('version', '1.0.0')
        description = schema.get('description', '')
        schema_id = schema.get('$id', '')

        # Work on a copy so the defaults below never leak into the caller's dict.
        frontmatter = dict(frontmatter) if frontmatter is not None else {}

        if 'schema-id' not in frontmatter and schema_id:
            frontmatter['schema-id'] = schema_id
        frontmatter.setdefault('version', version)
        frontmatter.setdefault('status', 'draft')

        frontmatter_yaml = yaml.dump(
            frontmatter,
            default_flow_style=False,
            allow_unicode=True
        ).strip()

        schema_json = json.dumps(schema, indent=2, ensure_ascii=False)

        # Name shown in the usage example: last path component of the schema id.
        schema_ref = Path(frontmatter.get('schema-id', 'schema')).name

        lines = [
            "---",
            f"{frontmatter_yaml}",
            "---",
            "",
            f"# {title} v{version}",
            "",
            "## Overview",
            "",
            f"{description}",
            "",
            "## Usage",
            "",
            "```bash",
            f"markitect validate document.md --schema {schema_ref}",
            "```",
            "",
            "## Schema Definition",
            "",
            "```json",
            f"{schema_json}",
            "```",
            "",
            "## Version History",
            "",
            f"### v{version}",
            "- Initial version",
            "",  # trailing newline
        ]
        return "\n".join(lines)

    def _render_template(
        self,
        template: str,
        schema: Dict[str, Any],
        frontmatter: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Render markdown from template.

        Uses ``str.format`` with the keys ``title``, ``version``,
        ``description``, ``schema_id``, ``schema_json`` and ``frontmatter``.
        Literal braces in the template must be doubled (``{{`` / ``}}``).
        For complex templates, consider using Jinja2 or similar.

        Args:
            template: Template string
            schema: JSON schema dictionary
            frontmatter: Optional frontmatter metadata

        Returns:
            Rendered markdown content

        Raises:
            InvalidSchemaFormatError: If the template references an unknown
                key or is not a valid format string
        """
        context = {
            'title': schema.get('title', 'Untitled'),
            'version': schema.get('version', '1.0.0'),
            'description': schema.get('description', ''),
            'schema_id': schema.get('$id', ''),
            'schema_json': json.dumps(schema, indent=2, ensure_ascii=False),
            'frontmatter': frontmatter or {},
        }

        try:
            return template.format(**context)
        except KeyError as e:
            raise InvalidSchemaFormatError(f"Template missing key: {e}") from e
        except (IndexError, ValueError) as e:
            # Positional fields or unbalanced braces (common when a template
            # embeds raw JSON) would otherwise escape as raw format errors.
            raise InvalidSchemaFormatError(f"Invalid template: {e}") from e

    def list_json_blocks(self, content: str) -> List[Tuple[int, str]]:
        """
        List all JSON code blocks in markdown content.

        Useful for debugging or when multiple JSON blocks exist.

        Args:
            content: Markdown file content

        Returns:
            List of (position, json_content) tuples, where position is the
            offset of the opening fence in ``content``

        Example:
            >>> loader = MarkdownSchemaLoader()
            >>> content = Path('schema.md').read_text()
            >>> blocks = loader.list_json_blocks(content)
            >>> print(f"Found {len(blocks)} JSON blocks")
        """
        return [
            (match.start(), match.group(1))
            for match in self.json_code_block_pattern.finditer(content)
        ]

    def validate_schema_structure(self, schema: Dict[str, Any]) -> List[str]:
        """
        Validate basic schema structure.

        Checks for required JSON Schema fields and MarkiTect conventions.

        Args:
            schema: JSON schema dictionary

        Returns:
            List of warning/error messages (empty if valid)

        Example:
            >>> loader = MarkdownSchemaLoader()
            >>> issues = loader.validate_schema_structure(schema)
            >>> if issues:
            ...     print("Schema issues:", issues)
        """
        # (field, message) pairs, reported in this order when the field is absent.
        expected_fields = (
            ('$schema', "Missing required field: $schema"),
            ('type', "Missing recommended field: type"),
            ('title', "Missing recommended field: title"),
            ('description', "Missing recommended field: description"),
            ('version', "Missing MarkiTect convention: version field"),
            ('$id', "Missing recommended field: $id"),
        )
        issues = [
            message for field, message in expected_fields if field not in schema
        ]

        # Check $id format if present
        if '$id' in schema:
            schema_id = schema['$id']
            if not isinstance(schema_id, str):
                issues.append("$id must be a string")
            elif not schema_id.startswith('https://'):
                issues.append("$id should be a full HTTPS URL")

        return issues
|
||||
|
||||
|
||||
def auto_ingest_schemas(db_manager=None, schema_dir: Optional[Path] = None, verbose: bool = False) -> Dict[str, Any]:
    """Ingest markdown schema files that are not yet stored in the database.

    Scans ``schema_dir`` for files matching ``*-schema-v*.md``, loads each one
    with ``MarkdownSchemaLoader`` and stores the extracted schema as JSON via
    ``db_manager.store_schema_file``. Files whose name is already reported by
    ``db_manager.list_schemas()`` are skipped.

    Args:
        db_manager: DatabaseManager instance. When omitted, one is created
            for ``~/.markitect/markitect.db`` and initialized.
        schema_dir: Directory containing schemas. Defaults to the ``schemas``
            directory next to this module.
        verbose: If True, print progress messages.

    Returns:
        Dictionary with ingestion results:
        {
            'ingested': [names of schemas that were stored],
            'skipped': [names of schemas that were already present],
            'failed': [(schema_name, error) tuples for failures]
        }
        All three lists are empty when the directory does not exist or the
        existing schemas cannot be listed.

    Example:
        >>> from markitect.schema_loader import auto_ingest_schemas
        >>> results = auto_ingest_schemas(verbose=True)
        >>> print(f"Ingested {len(results['ingested'])} schemas")
    """
    def _blank_report() -> Dict[str, Any]:
        # Fresh lists on every call so returned reports never share state.
        return {'ingested': [], 'skipped': [], 'failed': []}

    target_dir = Path(__file__).parent / "schemas" if schema_dir is None else schema_dir

    if not target_dir.exists():
        if verbose:
            print(f"⚠️ Schema directory not found: {target_dir}")
        return _blank_report()

    if db_manager is None:
        # Imported lazily: only needed when the caller supplies no manager.
        from .database import DatabaseManager
        default_db = Path.home() / '.markitect' / 'markitect.db'
        db_manager = DatabaseManager(str(default_db))
        db_manager.initialize_database()

    try:
        known_names = {row['name'] for row in db_manager.list_schemas()}
    except Exception as e:
        if verbose:
            print(f"❌ Error listing existing schemas: {e}")
        return _blank_report()

    results = _blank_report()
    candidates = sorted(target_dir.glob("*-schema-v*.md"))

    if verbose and candidates:
        print(f"🔍 Found {len(candidates)} schema file(s) in {target_dir}")

    loader = MarkdownSchemaLoader()

    for schema_file in candidates:
        schema_name = schema_file.name

        if schema_name in known_names:
            results['skipped'].append(schema_name)
            if verbose:
                print(f"⏭️ Skipping {schema_name} (already ingested)")
            continue

        try:
            schema_data = loader.load_schema(schema_file)['schema']
            serialized = json.dumps(schema_data, indent=2)
            record_id = db_manager.store_schema_file(schema_name, serialized)

            if not record_id:
                # A falsy record id is treated as a storage failure.
                results['failed'].append((schema_name, "Failed to store in database"))
                if verbose:
                    print(f"❌ Failed to store {schema_name} in database")
            else:
                results['ingested'].append(schema_name)
                if verbose:
                    title = schema_data.get('title', schema_name)
                    print(f"✅ Ingested {schema_name} (title: {title})")
        except Exception as e:
            # Best effort: one bad file must not abort the remaining ones.
            results['failed'].append((schema_name, str(e)))
            if verbose:
                print(f"❌ Failed to ingest {schema_name}: {e}")

    if verbose:
        print(f"\n📊 Auto-ingestion complete:")
        print(f" Ingested: {len(results['ingested'])}")
        print(f" Skipped: {len(results['skipped'])}")
        print(f" Failed: {len(results['failed'])}")

    return results
|
||||
# Re-export from schema package for backward compatibility
|
||||
from markitect.schema.loader import (
|
||||
MarkdownSchemaLoader,
|
||||
SchemaLoaderError,
|
||||
InvalidSchemaFormatError,
|
||||
SchemaNotFoundError,
|
||||
auto_ingest_schemas,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'MarkdownSchemaLoader',
|
||||
'SchemaLoaderError',
|
||||
'InvalidSchemaFormatError',
|
||||
'SchemaNotFoundError',
|
||||
'auto_ingest_schemas',
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user