diff --git a/docs/PROJECT_STRUCTURE.md b/docs/PROJECT_STRUCTURE.md new file mode 100644 index 00000000..ecc6a4a3 --- /dev/null +++ b/docs/PROJECT_STRUCTURE.md @@ -0,0 +1,286 @@ +# MarkiTect Project Structure + +This document describes the current project layout, architectural decisions, and the reorganization plan for the Information Space Service evolution. + +## Overview + +MarkiTect is a markdown processing toolkit with transclusion, schema validation, asset management, and multi-format output capabilities. The project follows a hybrid layout that is being incrementally consolidated. + +## Current Directory Structure + +``` +markitect_project/ +├── markitect/ # Main package +│ ├── [34 root-level .py files] # Core functionality (see below) +│ ├── assets/ # Asset discovery, management, caching (21 files) +│ ├── finance/ # Cost tracking, work time management (9 files) +│ ├── plugins/ # Plugin system with base classes (7 files) +│ ├── packaging/ # Asset packaging, MDZ variants (7 files) +│ ├── production/ # Deployment validation, benchmarks (6 files) +│ ├── legacy/ # Legacy compatibility layer (8 files) +│ ├── explode_variants/ # Document expansion, variants (9 files) +│ ├── query_paradigms/ # Query paradigm implementations (4 files) +│ ├── validators/ # Content/link/section validation (4 files) +│ ├── matter_frontmatter/ # Front matter parsing (4 files) +│ ├── matter_contentmatter/ # Content matter parsing (4 files) +│ ├── matter_tailmatter/ # Tail matter parsing (4 files) +│ ├── profile/ # User profile management (4 files) +│ ├── graphql/ # GraphQL query implementation (4 files) +│ ├── template/ # Template management (3 files) +│ ├── themes/ # Theme system with subdirectories (1 file) +│ └── schemas/ # Built-in schema definitions (9 files) +├── application/ # Application layer services +├── domain/ # Domain models +├── infrastructure/ # Infrastructure implementations +├── tests/ # Test suite (90+ test files) +│ ├── unit/ # Unit tests +│ ├── integration/ # 
Integration tests +│ ├── e2e/ # End-to-end tests +│ └── fixtures/ # Test data +├── docs/ # Documentation (12+ subdirectories) +├── src/ # JavaScript/frontend components +└── roadmap/ # Project roadmap +``` + +## Root-Level Modules (/markitect/) + +The 34 root-level Python files are organized by function: + +### Core Infrastructure +| File | Lines | Purpose | +|------|-------|---------| +| `parser.py` | ~50 | Markdown AST parsing using markdown-it | +| `serializer.py` | ~360 | AST serialization back to Markdown | +| `document_manager.py` | ~100 | Wrapper around CleanDocumentManager | +| `clean_document_manager.py` | ~2000 | Clean document management implementation | +| `workspace.py` | ~200 | Workspace management | +| `database.py` | ~400 | SQLite database management | + +### Schema Management (6 files, 99KB total) +| File | Lines | Purpose | +|------|-------|---------| +| `schema_generator.py` | ~600 | JSON schema generation from markdown AST | +| `schema_analyzer.py` | ~450 | Schema rigidity analysis with phase classification | +| `schema_loader.py` | ~600 | Schema loading from markdown with frontmatter | +| `schema_refiner.py` | ~600 | Automatic schema refinement using loosening rules | +| `schema_validator.py` | ~900 | Comprehensive schema validation | +| `schema_naming.py` | ~300 | Schema naming convention enforcement | + +### Configuration & Services +| File | Purpose | +|------|---------| +| `config_manager.py` | Configuration file management | +| `frontmatter.py` | YAML frontmatter parsing | +| `exceptions.py` | Custom exception classes | +| `ast_service.py` | AST service layer | +| `cache_service.py` | Caching functionality | +| `ast_cache.py` | AST caching implementation | +| `performance_tracker.py` | Performance metrics | + +### Validation & Analysis +| File | Purpose | +|------|---------| +| `semantic_validator.py` | Semantic validation layer | +| `validation_error.py` | Validation error handling | +| `metaschema.py` | Metaschema validation for custom 
extensions | + +### CLI & Commands +| File | Purpose | +|------|---------| +| `cli.py` | Main CLI interface (274KB, comprehensive) | +| `cli_utils.py` | CLI utilities | +| `asset_commands.py` | Asset-related CLI commands | +| `draft_generator.py` | Draft generation functionality | + +### Utilities +| File | Purpose | +|------|---------| +| `batch_processor.py` | Batch processing operations | +| `associated_files.py` | Associated file tracking | +| `legacy_compat.py` | Legacy compatibility layer | +| `legacy_integration_example.py` | Integration examples | +| `_version.py`, `__version__.py` | Version management | + +## Subpackages + +### assets/ (21 files) +Complete asset management system including discovery, analytics, caching, deduplication, and packaging. Key files: +- `repository.py` - Asset repository pattern +- `discovery.py` - Asset discovery algorithms +- `cache.py` - Asset caching layer +- `analytics.py` - Asset usage analytics + +### finance/ (9 files) +Cost tracking and work time management: +- `models.py` - Financial data models +- `cost_tracker.py` - Cost tracking implementation +- `period_tracker.py` - Period-based tracking +- `report_generator.py` - Financial reports + +### plugins/ (7 files) +Extensible plugin system: +- `base.py` - Plugin base classes and types +- `registry.py` - Plugin registry +- `builtin/` - Built-in plugin implementations + +### packaging/ (7 files) +Asset packaging and MDZ format support: +- `mdz_packager.py` - MDZ package creation +- `transclusion.py` - Transclusion handling +- `variant_factory.py` - Variant generation + +### production/ (6 files) +Deployment and production validation: +- `deployment_validator.py` - Deployment checks +- `performance_benchmark.py` - Performance testing +- `cross_platform_validator.py` - Platform compatibility + +### legacy/ (8 files) +Backward compatibility layer: +- `compatibility.py` - Compatibility wrappers +- `deprecation.py` - Deprecation warnings +- `git_tracker.py` - Git integration 
(useful for Phase 8) + +## Test Structure + +``` +tests/ +├── conftest.py # Shared pytest configuration +├── fixtures/ # Test data files +│ ├── content_test_files/ +│ ├── contentmatter_test_files/ +│ ├── frontmatter_test_files/ +│ └── tailmatter_test_files/ +├── unit/ # Unit tests by domain +│ ├── application/ +│ └── infrastructure/ +├── integration/ # Integration tests +│ └── repositories/ +└── e2e/ # End-to-end tests + ├── cli/ + └── performance/ +``` + +--- + +## Planned Reorganization + +### Motivation + +The current layout has grown organically, resulting in: +1. **34 files at root level** - Too many modules at package root +2. **No clear grouping** - Schema tools, core infrastructure, and utilities mixed +3. **Hybrid architecture** - Mix of root packages and monolithic /markitect/ + +### Target Structure + +After reorganization, the /markitect/ package will have clearer structure: + +``` +markitect/ +├── core/ # Core infrastructure (NEW) +│ ├── __init__.py +│ ├── parser.py # (from markitect/) +│ ├── serializer.py # (from markitect/) +│ ├── document_manager.py # (from markitect/) +│ └── workspace.py # (from markitect/) +├── schema/ # Schema management (NEW) +│ ├── __init__.py +│ ├── validator.py # (from schema_validator.py) +│ ├── generator.py # (from schema_generator.py) +│ ├── loader.py # (from schema_loader.py) +│ ├── analyzer.py # (from schema_analyzer.py) +│ ├── refiner.py # (from schema_refiner.py) +│ └── naming.py # (from schema_naming.py) +├── storage/ # Storage concerns (NEW) +│ ├── __init__.py +│ ├── database.py # (from markitect/) +│ └── cache.py # (consolidated) +├── spaces/ # Information spaces (Phase 1+) +│ ├── models.py +│ ├── events/ +│ ├── repositories/ +│ ├── transclusion/ +│ ├── rendering/ +│ ├── sync/ +│ └── services/ +└── [existing subpackages] # assets/, plugins/, etc. 
+``` + +### Backward Compatibility + +Original import paths will continue to work through re-exports: + +```python +# Old import (still works) +from markitect.parser import parse_markdown_to_ast + +# New import (preferred) +from markitect.core.parser import parse_markdown_to_ast +``` + +### Migration Strategy + +1. Create new subpackages with copied content +2. Update internal imports to new paths +3. Add deprecation warnings to old paths +4. Re-export from original locations for compatibility +5. Verify all tests pass +6. Update documentation + +--- + +## Information Space Service Architecture + +The reorganization prepares for the Information Space Service evolution, which adds: + +### Phase 1-3: Foundation +- `InformationSpace` entity with lifecycle management +- `SpaceRepository` for persistence +- Event system for change tracking +- Persistent transclusion context + +### Phase 4-5: Modes +- HTML rendering mode with caching +- Directory mode with bidirectional sync + +### Phase 6-7: API & Composability +- GraphQL schema extensions +- CLI commands for space management +- Space references and inheritance + +### Phase 8: Git History (Optional) +- Git-based version control for spaces +- Event-driven commits +- Version navigation + +See [docs/roadmap/information-space-service/](./roadmap/information-space-service/) for the complete workplan. 
+ +--- + +## Key Dependencies + +From `pyproject.toml`: +- Python >=3.8 (tested on 3.12) +- markdown-it-py - Markdown parsing +- PyYAML - YAML/frontmatter handling +- click - CLI framework +- tabulate - Table formatting +- jsonpath-ng - JSON path queries +- aiohttp - Async HTTP + +## Version Information + +- Current version is managed in `_version.py` and `__version__.py` +- Follows semantic versioning +- CHANGELOG.md tracks all changes + +--- + +## Related Documentation + +- [CLI Tutorial](CLI_TUTORIAL.md) - CLI usage guide +- [Plugin System](PLUGIN_SYSTEM.md) - Plugin architecture +- [Schema Management Guide](SCHEMA_MANAGEMENT_GUIDE.md) - Schema workflows +- [Asset Management Guide](ASSET_MANAGEMENT_USER_GUIDE.md) - Asset system +- [Error Handling Strategy](ERROR_HANDLING_STRATEGY.md) - Error patterns diff --git a/docs/roadmap/information-space-service/README.md b/docs/roadmap/information-space-service/README.md new file mode 100644 index 00000000..e32f15bb --- /dev/null +++ b/docs/roadmap/information-space-service/README.md @@ -0,0 +1,142 @@ +# Headless Information Space Service Evolution + +## Vision + +Evolve markitect into a headless markdown transclusion-based information space service that supports: + +1. **HTML Rendering Mode** - Render markdown to HTML, track changes, update space +2. **Directory Structure Mode** - Represent information as canonical directory with markdown files +3. **Multiple Frontends** - Support different interaction modes via clean API layer + +## What is an Information Space? 
+ +An Information Space is a first-class abstraction that: + +- Contains a collection of documents with transclusion relationships +- Maintains persistent context for variable resolution +- Tracks document dependencies for cache invalidation +- Can be rendered to HTML or exported to directory structure +- Supports event-driven updates and subscriptions +- Can reference other spaces (composability) + +## Key Capabilities + +| Phase | Capability | Description | +|-------|-----------|-------------| +| 1 | InformationSpace Entity | Space abstraction with identity, metadata, lifecycle | +| 2 | Event System | In-process pub/sub for space events | +| 3 | Persistent Transclusion | Store context state, track references | +| 4 | HTML Rendering | Render resolved markdown to HTML with caching | +| 5 | Directory Mode | Bidirectional sync with filesystem | +| 6 | API Layer | GraphQL, REST, CLI interfaces | +| 7 | Composability | Space references and inheritance | +| 8 | Git History | Optional git-based version control | + +## Architecture Overview + +``` +┌─────────────────────────────────────────────────────────────┐ +│ API Layer │ +│ GraphQL Schema │ REST Endpoints │ CLI Commands │ +├─────────────────────────────────────────────────────────────┤ +│ Service Layer │ +│ SpaceService │ RenderService │ SyncService │ +├─────────────────────────────────────────────────────────────┤ +│ Domain Layer │ +│ InformationSpace │ SpaceDocument │ SpaceEvent │ EventBus │ +│ PersistentTransclusionContext │ ReferenceGraph │ +├─────────────────────────────────────────────────────────────┤ +│ Storage Layer │ +│ SpaceRepository │ EventStore │ Cache Backend │ +├─────────────────────────────────────────────────────────────┤ +│ Existing Markitect │ +│ DatabaseManager │ TransclusionEngine │ VariantFactory │ +│ PluginRegistry │ QueryParadigms │ ASTService │ +└─────────────────────────────────────────────────────────────┘ +``` + +## Integration Strategy + +The Information Space Service builds on existing 
markitect infrastructure: + +| Existing Component | Integration | +|-------------------|-------------| +| `TransclusionContext` | Extended with `PersistentTransclusionContext` | +| `VariantFactory` | Used for directory export/import | +| `PluginRegistry` | Add SPACE_RENDERER, SPACE_SYNC, EVENT_HANDLER types | +| `DatabaseManager` | Add space-related tables | +| `GraphQL Schema` | Extend with Space types and mutations | + +## Project Status + +- [ ] **Phase 0**: Project Organization (prerequisite cleanup) +- [ ] **Phase 1**: Foundation (Space entity, repository) +- [ ] **Phase 2**: Event System +- [ ] **Phase 3**: Persistent Transclusion +- [ ] **Phase 4**: HTML Rendering Mode +- [ ] **Phase 5**: Directory Mode +- [ ] **Phase 6**: API Layer +- [ ] **Phase 7**: Composability +- [ ] **Phase 8**: Git History (optional) + +## Documentation + +- [WORKPLAN.md](WORKPLAN.md) - Detailed implementation workplan +- [PROJECT_STRUCTURE.md](../../PROJECT_STRUCTURE.md) - Current project structure + +## Usage Example (Target State) + +```python +from markitect.spaces import SpaceService, InformationSpace + +# Create a space +service = SpaceService() +space = await service.create_space("my-docs", description="Documentation space") + +# Add documents +await service.add_document(space, "/intro.md", content="# Introduction") +await service.add_document(space, "/getting-started.md", content="# Getting Started") + +# Render to HTML +html_output = await service.render(space, theme="default") + +# Export to directory +await service.export_to_directory(space, "./output/") + +# Watch for changes +async for event in service.subscribe(space): + print(f"Change detected: {event.type} on {event.document_path}") +``` + +## CLI Commands (Target State) + +```bash +# Space management +markitect space create my-space --description "My documentation" +markitect space list +markitect space show my-space + +# Document management +markitect space add-doc my-space --path "/intro.md" --file ./intro.md 
+markitect space list-docs my-space + +# Rendering +markitect space render my-space --output ./html/ --theme default + +# Directory sync +markitect space sync my-space --directory ./my-space-dir/ --bidirectional + +# History (Phase 8) +markitect space history log my-space +markitect space history diff my-space --rev HEAD~1 +``` + +## Contributing + +See the main project CONTRIBUTING.md for guidelines. For this initiative specifically: + +1. Follow the phased implementation order +2. Write tests before implementing features +3. Update documentation as you go +4. Use the event system for loose coupling +5. Maintain backward compatibility diff --git a/docs/roadmap/information-space-service/WORKPLAN.md b/docs/roadmap/information-space-service/WORKPLAN.md new file mode 100644 index 00000000..a405ffcf --- /dev/null +++ b/docs/roadmap/information-space-service/WORKPLAN.md @@ -0,0 +1,599 @@ +# Headless Information Space Service - Implementation Workplan + +## Overview + +This workplan details the implementation phases for evolving markitect into a headless markdown transclusion-based information space service. 
+ +--- + +## Phase 0: Project Organization (Prerequisite) + +### Current State Issues +- **Hybrid layout** - Mix of root-level packages and monolithic `/markitect/` +- **Flat root in markitect** - 34 .py files at `/markitect/` root level +- **No structure documentation** - Missing PROJECT_STRUCTURE.md (now created) + +### Reorganization Tasks + +| ID | Task | Description | Status | +|----|------|-------------|--------| +| ORG-001 | Create PROJECT_STRUCTURE.md | Document current layout and rationale | Done | +| ORG-002 | Create `/markitect/core/` | Move parser, serializer, document_manager, workspace | Pending | +| ORG-003 | Create `/markitect/schema/` | Consolidate 6 schema_*.py files | Pending | +| ORG-004 | Create `/markitect/storage/` | Group database.py, cache modules | Pending | +| ORG-005 | Update imports | Fix all import statements after moves | Pending | +| ORG-006 | Verify tests | Ensure all tests pass after moves | Pending | + +### Target Structure After Phase 0 + +``` +markitect/ +├── core/ # Core infrastructure +│ ├── __init__.py +│ ├── parser.py # (from markitect/) +│ ├── serializer.py # (from markitect/) +│ ├── document_manager.py # (from markitect/) +│ └── workspace.py # (from markitect/) +├── schema/ # Schema management +│ ├── __init__.py +│ ├── validator.py # (from schema_validator.py) +│ ├── generator.py # (from schema_generator.py) +│ ├── loader.py # (from schema_loader.py) +│ ├── analyzer.py # (from schema_analyzer.py) +│ ├── refiner.py # (from schema_refiner.py) +│ └── naming.py # (from schema_naming.py) +├── storage/ # Storage concerns +│ ├── __init__.py +│ └── database.py # (from markitect/) +└── spaces/ # Information spaces (Phase 1+) +``` + +--- + +## Phase 1: Foundation + +### Capability Requirements + +| ID | Capability | Description | Priority | +|----|-----------|-------------|----------| +| CAP-001 | InformationSpace Entity | First-class space abstraction with identity, metadata, lifecycle | Critical | +| CAP-002 | SpaceRepository | CRUD 
operations for spaces with SQLite backing | Critical | +| CAP-003 | Document-Space Association | Link documents to spaces with membership tracking | Critical | +| CAP-004 | Space Metadata Schema | Extensible metadata schema for space configuration | High | +| CAP-005 | Database Migrations | Schema evolution for space-related tables | High | + +### Implementation Tasks + +**Week 1: Core Models** +- Create `markitect/spaces/models.py` + - `InformationSpace` dataclass with id, name, description, metadata, config + - `SpaceDocument` dataclass for document membership + - `SpaceConfig` dataclass for space settings +- Create `markitect/spaces/repositories/interfaces.py` +- Unit tests for models + +**Week 2: Repository Implementation** +- Create `markitect/spaces/repositories/sqlite.py` +- Implement `ISpaceRepository` for SQLite +- Implement `IDocumentAssociationRepository` +- Database migration scripts +- Repository unit tests + +**Week 3: Basic SpaceService** +- Create `markitect/spaces/services/space_service.py` +- CRUD operations for spaces +- Document add/remove operations +- Integration tests + +### Database Schema + +```sql +CREATE TABLE spaces ( + id TEXT PRIMARY KEY, + name TEXT UNIQUE NOT NULL, + description TEXT, + metadata JSON, + config JSON, + parent_space_id TEXT REFERENCES spaces(id), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE space_documents ( + id TEXT PRIMARY KEY, + space_id TEXT NOT NULL REFERENCES spaces(id), + document_id TEXT NOT NULL, + space_path TEXT NOT NULL, + order_index INTEGER DEFAULT 0, + metadata JSON, + added_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + UNIQUE(space_id, space_path) +); +``` + +### Verification +```bash +pytest tests/unit/spaces/test_*_model.py +pytest tests/unit/spaces/test_*_repository.py +pytest tests/integration/spaces/test_space_service_integration.py +``` + +--- + +## Phase 2: Event System + +### Capability Requirements + +| ID | Capability | 
Description | Priority | +|----|-----------|-------------|----------| +| CAP-010 | SpaceEvent Base | Event dataclass with type, payload, timestamp | Critical | +| CAP-011 | Event Bus | In-process publish/subscribe for space events | Critical | +| CAP-012 | Event Handlers Registry | Register/unregister event handlers by type | High | +| CAP-013 | Change Detection | Detect document changes via content hash comparison | High | +| CAP-014 | Event Persistence | Store events for replay/audit | Medium | + +### Implementation Tasks + +**Week 4: Event Infrastructure** +- Create `markitect/spaces/events/models.py` + - `SpaceEvent` dataclass with event_id, type, space_id, payload, timestamp + - `SpaceEventType` enum (DOCUMENT_ADDED, DOCUMENT_UPDATED, DOCUMENT_REMOVED, etc.) +- Create `markitect/spaces/events/bus.py` + - `EventBus` with sync/async handler support + - Handler registration by event type +- Unit tests for event bus + +**Week 5: Integration** +- Wire events into SpaceService (emit on document operations) +- Implement change detection (content hash comparison) +- Optional: event persistence table +- Integration tests for event flow + +### Event Types + +```python +class SpaceEventType(Enum): + SPACE_CREATED = "space.created" + SPACE_UPDATED = "space.updated" + SPACE_DELETED = "space.deleted" + DOCUMENT_ADDED = "document.added" + DOCUMENT_UPDATED = "document.updated" + DOCUMENT_REMOVED = "document.removed" + DOCUMENT_MOVED = "document.moved" + VARIABLE_SET = "variable.set" + RENDER_COMPLETED = "render.completed" + SYNC_COMPLETED = "sync.completed" +``` + +### Verification +```bash +pytest tests/unit/spaces/test_event_bus.py +pytest tests/integration/spaces/test_event_propagation.py +``` + +--- + +## Phase 3: Persistent Transclusion Context + +### Capability Requirements + +| ID | Capability | Description | Priority | +|----|-----------|-------------|----------| +| CAP-020 | Persistent TransclusionContext | Store context state in database | Critical | +| CAP-021 | 
Cross-Space References | Resolve transclusions across space boundaries | High | +| CAP-022 | Reference Graph | Track document dependencies for invalidation | High | +| CAP-023 | Variable Scope Layers | Space-level, document-level, request-level variables | Medium | +| CAP-024 | Transclusion Cache Invalidation | Invalidate rendered content on dependency change | High | + +### Implementation Tasks + +**Week 6: Persistent Context** +- Create `markitect/spaces/transclusion/persistent_context.py` +- Extend existing `TransclusionContext` with DB persistence +- Space-scoped variable storage + +**Week 7: Reference Graph** +- Implement reference tracking during transclusion resolution +- Cross-space reference resolution with space:// protocol +- Variable scope layers (space → document → request) + +**Week 8: Cache Invalidation** +- Wire change events to cache invalidation +- Dependency-aware cache clearing +- Integration tests + +### Database Schema Additions + +```sql +CREATE TABLE space_variables ( + space_id TEXT NOT NULL REFERENCES spaces(id), + name TEXT NOT NULL, + value JSON, + scope TEXT DEFAULT 'space', + PRIMARY KEY(space_id, name) +); + +CREATE TABLE transclusion_references ( + source_doc_id TEXT NOT NULL, + target_doc_id TEXT NOT NULL, + space_id TEXT NOT NULL REFERENCES spaces(id), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY(source_doc_id, target_doc_id, space_id) +); +``` + +### Verification +```bash +pytest tests/unit/spaces/test_persistent_context.py +pytest tests/unit/spaces/test_reference_graph.py +pytest tests/integration/spaces/test_transclusion_persistence.py +``` + +--- + +## Phase 4: HTML Rendering Mode + +### Capability Requirements + +| ID | Capability | Description | Priority | +|----|-----------|-------------|----------| +| CAP-030 | SpaceRenderer Base | Abstract renderer interface | Critical | +| CAP-031 | MarkdownToHTMLRenderer | Render resolved markdown to HTML | Critical | +| CAP-032 | Rendering Cache | Cache rendered 
output with invalidation | High | +| CAP-033 | Theme Support | Apply themes to rendered HTML | Medium | +| CAP-034 | Incremental Rendering | Re-render only changed documents | Medium | + +### Implementation Tasks + +**Week 9: Renderer Base** +- Create `markitect/spaces/rendering/base.py` - SpaceRenderer ABC +- Create `markitect/spaces/rendering/html_renderer.py` - MarkdownToHTMLRenderer +- Integrate with existing `CleanDocumentManager` + +**Week 10: Caching and Themes** +- Implement render output caching (keyed by content hash) +- Theme integration using existing theme system +- Invalidation on dependency change via events + +**Week 11: Incremental Rendering** +- Re-render only affected documents on change +- Rendering events emission +- E2E tests for render workflow + +### Verification +```bash +pytest tests/e2e/spaces/test_html_rendering_workflow.py +``` + +--- + +## Phase 5: Directory Mode + +### Capability Requirements + +| ID | Capability | Description | Priority | +|----|-----------|-------------|----------| +| CAP-040 | SpaceToDirectory Exporter | Export space to canonical directory structure | Critical | +| CAP-041 | DirectoryToSpace Importer | Import directory structure as space | Critical | +| CAP-042 | Bidirectional Sync | Detect and sync changes both directions | High | +| CAP-043 | Filesystem Watcher | Watch directory for external changes | Medium | +| CAP-044 | Conflict Resolution | Handle conflicts in bidirectional sync | Medium | + +### Implementation Tasks + +**Week 12: Export** +- Create `markitect/spaces/sync/directory_exporter.py` +- Integrate with existing `VariantFactory` +- Support flat/hierarchical/semantic variants + +**Week 13: Import and Sync** +- Create `markitect/spaces/sync/directory_importer.py` +- Create `markitect/spaces/sync/bidirectional.py` +- Conflict detection (modification time, content hash) + +**Week 14: Filesystem Watcher** +- Implement watcher using `watchdog` library +- Sync events emission +- E2E tests for bidirectional 
sync + +### Canonical Directory Structure + +``` +.markitect/spaces/{space-name}/ +├── .space.yaml # Space metadata and config +├── documents/ # Document files +│ ├── intro.md +│ ├── getting-started.md +│ └── advanced/ +│ └── topics.md +└── assets/ # Associated assets +``` + +### Verification +```bash +pytest tests/e2e/spaces/test_directory_mode_workflow.py +``` + +--- + +## Phase 6: API Layer + +### Capability Requirements + +| ID | Capability | Description | Priority | +|----|-----------|-------------|----------| +| CAP-060 | SpaceService | Service layer orchestrating space operations | Critical | +| CAP-061 | GraphQL Space Schema | Extend existing GraphQL with space types | High | +| CAP-062 | REST Endpoints | Alternative REST API for spaces | Medium | +| CAP-063 | WebSocket Subscriptions | Real-time event subscriptions | Medium | +| CAP-064 | CLI Space Commands | CLI commands for space management | High | + +### Implementation Tasks + +**Week 15: GraphQL Extension** +- Extend `markitect/graphql/schema.py` with Space types +- Add mutations: createSpace, updateSpace, deleteSpace +- Add queries: space, spaces, spaceDocuments +- Add subscriptions: onSpaceEvent + +**Week 16: CLI Commands** +- Add to `markitect/cli.py`: + - `markitect space create/list/show/delete` + - `markitect space add-doc/remove-doc/list-docs` + - `markitect space render` + - `markitect space sync` + +**Week 17: WebSocket and Polish** +- WebSocket subscriptions for real-time events +- Documentation updates +- Final integration testing + +### GraphQL Schema Extensions + +```graphql +type InformationSpace { + id: ID! + name: String! + description: String + documents: [SpaceDocument!]! + config: SpaceConfig! + parentSpace: InformationSpace + createdAt: DateTime! + updatedAt: DateTime! +} + +type SpaceDocument { + id: ID! + spacePath: String! + content: String! + metadata: JSON +} + +type Mutation { + createSpace(input: CreateSpaceInput!): InformationSpace! 
+ addDocument(spaceId: ID!, input: AddDocumentInput!): SpaceDocument! + renderSpace(spaceId: ID!, options: RenderOptions): RenderResult! +} + +type Subscription { + onSpaceEvent(spaceId: ID!): SpaceEvent! +} +``` + +### Verification +```bash +pytest tests/integration/spaces/ +pytest tests/e2e/spaces/ +markitect space --help # Verify CLI +``` + +--- + +## Phase 7: Composability + +### Capability Requirements + +| ID | Capability | Description | Priority | +|----|-----------|-------------|----------| +| CAP-050 | Space References | Spaces can reference other spaces | High | +| CAP-051 | Space Inheritance | Child spaces inherit parent context | Medium | +| CAP-053 | Space Access Control | Basic permission model for space access | Medium | + +### Implementation Tasks + +**Week 18-19: Space References** +- Space-to-space references via space:// protocol +- Variable inheritance from parent spaces +- Basic access control (read/write/admin) + +**Week 20: Final Integration** +- Complete E2E test suite +- Performance testing +- User documentation + +### Space Reference Protocol + +```markdown + +{{transclude space://other-space/path/to/doc.md}} + + +{{transclude space://shared-components/header.md | title="My Page"}} +``` + +--- + +## Phase 8: Git History Tracking (Optional) + +### Capability Requirements + +| ID | Capability | Description | Priority | +|----|-----------|-------------|----------| +| CAP-070 | History Configuration | Per-space history tracking configuration | High | +| CAP-071 | HistoryBackend Interface | Abstract interface for history backends | High | +| CAP-072 | GitHistoryBackend | Git implementation of history backend | High | +| CAP-073 | Canonical Directory Binding | Bind space to canonical directory for git | High | +| CAP-074 | Event-Driven Commits | Commit on document change events | Medium | +| CAP-075 | History Query API | Query commits, diffs, branches | High | +| CAP-076 | History CLI Commands | CLI for log, diff, restore, checkout | High | +| 
CAP-077 | Versioned Read/Render | Read/render documents at specific versions | Medium | + +### Implementation Tasks + +**Week 21: History Infrastructure** +- Create `markitect/spaces/history/interfaces.py` - IHistoryBackend ABC +- Create `markitect/spaces/history/models.py` - Commit, HistoryEntry dataclasses +- Add `SpaceConfig` fields: history_enabled, history_backend, history_options +- Add `SPACE_SYNC` to PluginType enum + +**Week 22: Git Backend** +- Create `markitect/spaces/history/git_backend.py` +- Leverage existing `legacy/git_tracker.py` patterns +- Create event handlers for auto-commit on document changes +- Integration tests + +**Week 23: API and CLI** +- History query service for log, diff, branches +- CLI commands: `markitect space history log/diff/restore/checkout` +- Extend read/render with `--version` option +- E2E tests + +### Integration Diagram + +``` +Document Update Flow (with history enabled): + +User updates document + │ + ▼ +┌───────────────────┐ +│ SpaceService │ +│ update_document()│ +└────────┬──────────┘ + │ emit event + ▼ +┌───────────────────┐ ┌─────────────────────┐ +│ Event Bus │────▶│ GitHistoryHandler │ +│ DOCUMENT_UPDATED │ │ (subscribed) │ +└───────────────────┘ └──────────┬──────────┘ + │ + ┌──────────▼──────────┐ + │ DirectorySyncService │ + └──────────┬──────────┘ + │ writes to + ┌──────────▼──────────┐ + │ Canonical Directory │ + │ .markitect/spaces/X/ │ + └──────────┬──────────┘ + │ + ┌──────────▼──────────┐ + │ GitHistoryBackend │ + │ git add && git commit│ + └─────────────────────┘ +``` + +### Verification +```bash +markitect space create my-space --history-enabled +markitect space add-doc my-space --content "# V1" +markitect space update-doc my-space/doc.md --content "# V2" +markitect space history log my-space +markitect space history diff my-space --rev HEAD~1 +``` + +--- + +## Timeline Summary + +| Phase | Focus | Duration | +|-------|-------|----------| +| 0 | Project Organization | 1 week | +| 1 | Foundation | 3 
weeks | +| 2 | Event System | 2 weeks | +| 3 | Persistent Transclusion | 3 weeks | +| 4 | HTML Rendering Mode | 3 weeks | +| 5 | Directory Mode | 3 weeks | +| 6 | API Layer | 3 weeks | +| 7 | Composability | 3 weeks | +| 8 | Git History (Optional) | 3 weeks | + +**Total: 21-24 weeks** (5-6 months; 21 weeks without the optional Phase 8, 24 weeks with it) + +### Parallel Work Opportunities +- Phase 4 (HTML) and Phase 5 (Directory) can run in parallel after Phase 3 +- Phase 8 can start in parallel with Phase 7 +- Documentation can be written incrementally +- CLI commands can start in parallel with Phase 4/5 + +--- + +## Files to Create + +### Phase 0 +``` +docs/PROJECT_STRUCTURE.md # Done +docs/roadmap/information-space-service/ # Done +├── README.md # Done +└── WORKPLAN.md # This file +markitect/core/ # To do +markitect/schema/ # To do +markitect/storage/ # To do +``` + +### Phase 1+ +``` +markitect/spaces/ +├── __init__.py +├── models.py +├── events/ +│ ├── __init__.py +│ ├── models.py +│ └── bus.py +├── repositories/ +│ ├── __init__.py +│ ├── interfaces.py +│ └── sqlite.py +├── transclusion/ +│ ├── __init__.py +│ └── persistent_context.py +├── rendering/ +│ ├── __init__.py +│ ├── base.py +│ └── html_renderer.py +├── sync/ +│ ├── __init__.py +│ ├── directory_exporter.py +│ ├── directory_importer.py +│ └── bidirectional.py +├── history/ # Phase 8 +│ ├── __init__.py +│ ├── interfaces.py +│ ├── models.py +│ ├── git_backend.py +│ ├── events.py +│ └── queries.py +└── services/ + ├── __init__.py + └── space_service.py +``` + +### Test Files +``` +tests/unit/spaces/ +tests/integration/spaces/ +tests/e2e/spaces/ +tests/fixtures/spaces.py +``` + +--- + +## Success Criteria + +1. Phase 0 complete: project reorganized with docs/PROJECT_STRUCTURE.md +2. All phases complete with passing tests +3. HTML rendering mode fully functional +4. Directory mode with bidirectional sync working +5. GraphQL API exposing all space operations +6. CLI commands operational +7. Events propagating correctly +8. 
class DocumentManager(CleanDocumentManager):
    """
    Document manager for backward compatibility.

    This class extends CleanDocumentManager to maintain compatibility
    with existing code while using the clean implementation. On top of the
    inherited behaviour it offers ``ingest_file``, the interface older tests
    and callers expect.
    """

    def __init__(self, db_manager=None):
        """Pass ``db_manager`` straight through to CleanDocumentManager."""
        super().__init__(db_manager)

    def ingest_file(self, file_path: str):
        """
        Ingest a markdown file for processing.

        This method provides compatibility for tests expecting the ingest_file
        interface. It reads the file, parses front matter and AST, writes the
        AST to ``<file dir>/.ast_cache/<stem>_ast.json`` and, when a database
        manager is configured, tries to store the document.

        Args:
            file_path: Path of the markdown file to ingest.

        Returns:
            Dict with keys ``ast``, ``content``, ``metadata`` (``filename``,
            ``title``, ``size`` in characters, ``path``), ``ast_cache_path``
            (a ``Path``), ``parse_time`` (seconds) and ``cache_time``
            (always 0).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        import json
        import logging
        import time
        from pathlib import Path

        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Read file content
        content = file_path.read_text(encoding='utf-8')

        # perf_counter is monotonic, so the measured duration cannot go
        # negative or jump when the system clock is adjusted.
        start_time = time.perf_counter()
        parser = FrontMatterParser()
        front_matter_data, _body_without_front_matter = parser.parse(content)

        # NOTE(review): the AST is built from the full text, front matter
        # included, not from the stripped body - confirm this is intentional
        # before switching to the stripped body.
        ast = parse_markdown_to_ast(content)
        parse_time = time.perf_counter() - start_time

        # Extract title - first try front matter, then first heading, then fallback
        title = "Unknown"
        if front_matter_data and 'title' in front_matter_data:
            title = front_matter_data['title']
        elif isinstance(ast, list):
            # Walk adjacent token pairs: the heading text lives in the inline
            # token that directly follows the opening h1 token.
            for token, following in zip(ast, ast[1:]):
                if (token.get('type') == 'heading_open'
                        and token.get('tag') == 'h1'
                        and following.get('type') == 'inline'):
                    title = following.get('content', 'Unknown')
                    break

        # Create actual cache file for compatibility
        cache_dir = file_path.parent / '.ast_cache'
        cache_dir.mkdir(exist_ok=True)
        cache_file = cache_dir / f"{file_path.stem}_ast.json"

        # Write AST to cache file
        with open(cache_file, 'w', encoding='utf-8') as f:
            json.dump(ast, f, indent=2)

        # Store document in database if db_manager exists
        if getattr(self, 'db_manager', None):
            try:
                # Store using the clean document manager's method
                self.store_document(str(file_path), content, ast, front_matter_data)
            except Exception:
                # Storage stays best-effort for test compatibility, but the
                # failure is recorded instead of vanishing silently.
                logging.getLogger(__name__).debug(
                    "Storing %s failed; continuing without persistence",
                    file_path, exc_info=True,
                )

        return {
            'ast': ast,
            'content': content,
            'metadata': {
                'filename': file_path.name,
                'title': title,
                'size': len(content),
                'path': str(file_path)
            },
            'ast_cache_path': cache_file,
            'parse_time': parse_time,
            'cache_time': 0  # Mock cache time for compatibility
        }
class ASTSerializer:
    """
    Serializes markdown-it AST tokens back to markdown format.

    Provides roundtrip capability: markdown -> AST -> markdown
    Supports front matter preservation and content manipulation.

    Tokens are plain dictionaries (``type``, ``tag``, ``content``, ``markup``,
    ``info``, ``nesting``, ``level``, ``children``, ``attrs``); missing keys
    are treated as empty.
    """

    # Bullet markers reported on ``list_item_open`` tokens; all are written
    # back as "-".
    _BULLET_MARKERS = ('-', '*', '+')
    # Delimiters that follow the number of an ordered list item ("1." / "1)").
    _ORDERED_DELIMITERS = ('.', ')')

    def __init__(self):
        """Initialize the AST serializer (it keeps no state)."""

    def serialize_to_markdown(self, ast: List[Dict[str, Any]], front_matter: Optional[Dict[str, Any]] = None) -> str:
        """
        Convert AST tokens back to markdown format.

        Args:
            ast: List of markdown-it AST tokens
            front_matter: Optional YAML front matter dictionary

        Returns:
            Markdown text with optional front matter

        Example:
            serializer = ASTSerializer()
            markdown = serializer.serialize_to_markdown(ast, front_matter)
        """
        markdown_parts = []

        # Add front matter if present (an empty dict produces no block)
        if front_matter and isinstance(front_matter, dict):
            yaml_content = yaml.dump(front_matter, default_flow_style=False).strip()
            markdown_parts.append(f"---\n{yaml_content}\n---\n\n")

        markdown_parts.append(self._process_tokens(ast))
        return ''.join(markdown_parts)

    def _process_tokens(self, tokens: List[Dict[str, Any]]) -> str:
        """
        Process a list of AST tokens into markdown text.

        Args:
            tokens: List of markdown-it tokens

        Returns:
            Markdown text representation, without trailing blank lines
        """
        markdown_lines = []
        current_line = ""
        list_level = 0

        def flush():
            """Emit the line being assembled, if any, and start a new one."""
            nonlocal current_line
            if current_line:
                markdown_lines.append(current_line.rstrip())
                current_line = ""

        for token in tokens:
            token_type = token.get('type', '')
            content = token.get('content', '')
            markup = token.get('markup', '')
            info = token.get('info', '')

            if token_type == 'heading_open':
                tag = token.get('tag', '')
                digits = tag[1:] if tag.startswith('h') else ''
                # A malformed tag ("h", "hx") falls back to level 1 instead
                # of raising.
                heading_level = int(digits) if digits.isdigit() else 1
                current_line = '#' * heading_level + ' '

            elif token_type in ('heading_close', 'paragraph_close'):
                flush()
                markdown_lines.append("")  # Empty line after heading/paragraph

            elif token_type == 'inline':
                # Raw content already carries the original inline markup;
                # children are only rendered when no raw content is present.
                if content:
                    current_line += content
                elif token.get('children'):
                    current_line += self._process_inline_children(token['children'])

            elif token_type == 'list_item_open':
                indent = '  ' * (token.get('level', 0) // 2)
                if markup in self._BULLET_MARKERS:
                    current_line = indent + '- '
                elif markup in self._ORDERED_DELIMITERS:
                    # markdown-it stores the delimiter in ``markup`` and the
                    # item number in ``info``.
                    current_line = f"{indent}{info or '1'}{markup} "
                elif markup.isdigit():
                    # Legacy/synthetic tokens that put a number in ``markup``
                    current_line = indent + '1. '

            elif token_type == 'list_item_close':
                flush()

            elif token_type in ('bullet_list_open', 'ordered_list_open'):
                list_level += 1
            elif token_type in ('bullet_list_close', 'ordered_list_close'):
                list_level -= 1
                if list_level == 0:
                    markdown_lines.append("")  # Empty line after list

            elif token_type == 'blockquote_close':
                markdown_lines.append("")

            elif token_type == 'code_block':
                markdown_lines.append(f"```{info}")
                markdown_lines.append(content.rstrip())
                markdown_lines.append("```")
                markdown_lines.append("")

            elif token_type == 'fence':
                nesting = token.get('nesting', 0)
                fence = markup or '```'
                if nesting == 1:
                    # Synthetic opening-only token
                    markdown_lines.append(f"{fence}{info}")
                elif nesting == -1:
                    # Synthetic closing-only token
                    markdown_lines.append(fence)
                    markdown_lines.append("")
                else:
                    # A parsed fence is one self-contained token: language in
                    # ``info``, code in ``content``, delimiter in ``markup``.
                    markdown_lines.append(f"{fence}{info}")
                    body = content.rstrip('\n')
                    if body:
                        markdown_lines.append(body)
                    markdown_lines.append(fence)
                    markdown_lines.append("")

            elif token_type == 'hr':
                markdown_lines.append("---")
                markdown_lines.append("")

            elif token_type == 'text':
                current_line += content

        # Add any remaining content
        flush()

        # Clean up extra empty lines at the end
        while markdown_lines and markdown_lines[-1] == "":
            markdown_lines.pop()

        return '\n'.join(markdown_lines)

    def _process_inline_children(self, children: List[Dict[str, Any]]) -> str:
        """
        Process inline children tokens (emphasis, strong, links, etc.).

        Args:
            children: List of inline token children

        Returns:
            Processed inline markdown text
        """
        parts = []
        # hrefs of links that are open and still waiting for ``link_close``
        open_links = []

        for child in children:
            if not child:
                continue  # tolerate None placeholders among children
            token_type = child.get('type', '')
            content = child.get('content', '')
            markup = child.get('markup', '')

            if token_type == 'text':
                parts.append(content)
            elif token_type == 'code_inline':
                parts.append(f"`{content}`")
            elif token_type in ('em_open', 'em_close'):
                parts.append(markup or '*')
            elif token_type in ('strong_open', 'strong_close'):
                parts.append(markup or '**')
            elif token_type == 'link_open':
                open_links.append(self._link_href(child.get('attrs')))
                parts.append("[")
            elif token_type == 'link_close':
                href = open_links.pop() if open_links else ""
                # "#" stays the placeholder when no target is known
                parts.append(f"]({href or '#'})")
            elif token_type == 'softbreak':
                parts.append('\n')
            elif token_type == 'hardbreak':
                # two trailing spaces mark a CommonMark hard line break
                parts.append('  \n')

        return ''.join(parts)

    @staticmethod
    def _link_href(attrs) -> str:
        """Return the ``href`` from token attrs given as a dict or as pairs."""
        if not attrs:
            return ""
        if isinstance(attrs, dict):
            return str(attrs.get('href') or "")
        for attr in attrs:
            if len(attr) >= 2 and attr[0] == 'href':
                return str(attr[1])
        return ""

    @staticmethod
    def _make_token(token_type: str, tag: str = "", nesting: int = 0, level: int = 0,
                    content: str = "", markup: str = "", block: bool = True,
                    children: Optional[List[Dict[str, Any]]] = None) -> Dict[str, Any]:
        """Build one token dict with the fixed set of keys used for new sections."""
        token = {
            "type": token_type,
            "tag": tag,
            "attrs": {},
            "map": None,
            "nesting": nesting,
            "level": level,
        }
        if children is not None:
            token["children"] = children
        token.update({
            "content": content,
            "markup": markup,
            "info": "",
            "meta": {},
            "block": block,
            "hidden": False,
        })
        return token

    @classmethod
    def _inline_token(cls, text: str) -> Dict[str, Any]:
        """Build an ``inline`` token wrapping a single ``text`` child."""
        return cls._make_token(
            "inline", level=1, content=text,
            children=[cls._make_token("text", content=text, block=False)],
        )

    def modify_ast_content(self, ast: List[Dict[str, Any]], modifications: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Modify AST content based on provided modifications.

        The input list is not mutated; a new list is returned (existing token
        dicts are shared, not deep-copied).

        Args:
            ast: Original AST tokens
            modifications: Dictionary of modifications to apply

        Returns:
            Modified AST tokens

        Supported modifications:
            - add_section: dict with ``title`` (default "New Section"),
              ``content`` (default "", no paragraph added when empty) and
              ``level`` (default 2); the section is appended at the end.

        Note:
            An ``update_front_matter`` key is not acted on by this method.
        """
        modified_ast = list(ast)

        if 'add_section' in modifications:
            section_data = modifications['add_section']
            title = section_data.get('title', 'New Section')
            content = section_data.get('content', '')
            level = section_data.get('level', 2)

            tag = f"h{level}"
            hashes = "#" * level
            new_tokens = [
                self._make_token("heading_open", tag=tag, nesting=1, markup=hashes),
                self._inline_token(title),
                self._make_token("heading_close", tag=tag, nesting=-1, markup=hashes),
            ]

            if content:
                new_tokens.extend([
                    self._make_token("paragraph_open", tag="p", nesting=1),
                    self._inline_token(content),
                    self._make_token("paragraph_close", tag="p", nesting=-1),
                ])

            # Add to end of AST
            modified_ast.extend(new_tokens)

        return modified_ast
class WorkspaceManager:
    """Workspace management system.

    Creates templates from existing workspaces and new workspaces from
    templates, lays out single- and multi-project workspaces, and provides
    asset synchronization, zip backup/restore and checksum-based conflict
    detection. Methods returning a ``*Result`` object report failures through
    that object rather than raising.
    """

    def __init__(self, templates_dir: Optional[Path] = None):
        """Initialize workspace manager.

        Args:
            templates_dir: Directory that stores templates. Defaults to
                ``~/.markitect/templates``; created if it does not exist.
        """
        self.templates_dir = templates_dir or Path.home() / ".markitect" / "templates"
        self.templates_dir.mkdir(parents=True, exist_ok=True)

    def create_template(self, name: str, source_path: Path, description: str = "",
                        include_assets: bool = True, configuration: Optional[Dict] = None) -> TemplateResult:
        """Create a workspace template from existing workspace.

        Args:
            name: Template name; also the directory name under ``templates_dir``.
            source_path: Workspace to copy from.
            description: Text stored in the template metadata.
            include_assets: When False, anything under an ``assets`` directory
                is left out.
            configuration: Optional mapping written to ``markitect.yaml``.

        Returns:
            TemplateResult; on failure ``success`` is False and ``error``
            holds the exception.
        """
        try:
            template_path = self.templates_dir / name
            template_path.mkdir(exist_ok=True)

            # Copy workspace structure
            self._copy_workspace_structure(source_path, template_path, include_assets)

            # Count asset files only; directories are containers, not assets.
            asset_count = 0
            assets_dir = source_path / "assets"
            if include_assets and assets_dir.exists():
                asset_count = sum(1 for entry in assets_dir.rglob("*") if entry.is_file())

            # Create template metadata
            metadata = {
                "name": name,
                "description": description,
                "version": "1.0.0",
                "created_at": datetime.now().isoformat(),
                "asset_count": asset_count,
                "author": "Unknown",
                "tags": []
            }
            (template_path / "template.json").write_text(json.dumps(metadata, indent=2))

            # Save configuration if provided
            if configuration:
                (template_path / "markitect.yaml").write_text(yaml.dump(configuration, indent=2))

            return TemplateResult(
                success=True,
                template_path=template_path,
                template_name=name
            )

        except Exception as e:
            return TemplateResult(
                success=False,
                template_path=Path(),
                template_name=name,
                error=e
            )

    def get_template_metadata(self, template_name: str) -> TemplateMetadata:
        """Get metadata for a specific template."""
        return WorkspaceTemplate(self.templates_dir / template_name).get_metadata()

    def create_workspace_from_template(self, template_name: str, target_path: Path,
                                       project_name: str) -> WorkspaceCreationResult:
        """Create a new workspace from a template.

        Copies the template into ``target_path`` (created if needed) and
        substitutes the project name in top-level YAML files. A missing
        template is reported as a failed result carrying FileNotFoundError.
        """
        try:
            template_path = self.templates_dir / template_name

            if not template_path.exists():
                raise FileNotFoundError(f"Template '{template_name}' not found")

            # Create target directory
            target_path.mkdir(parents=True, exist_ok=True)

            # Copy template contents
            self._copy_workspace_structure(template_path, target_path, include_assets=True)

            # Update project-specific files
            self._customize_workspace(target_path, project_name)

            return WorkspaceCreationResult(
                success=True,
                workspace_path=target_path,
                project_name=project_name
            )

        except Exception as e:
            return WorkspaceCreationResult(
                success=False,
                workspace_path=target_path,
                project_name=project_name,
                error=e
            )

    def initialize_multi_project_workspace(self, workspace_root: Path):
        """Initialize a multi-project workspace.

        Creates ``shared_assets``, ``templates`` and ``config`` directories and
        writes ``workspace.yaml`` under ``workspace_root``.
        """
        workspace_root.mkdir(parents=True, exist_ok=True)

        # Create shared directories
        for shared_dir in ("shared_assets", "templates", "config"):
            (workspace_root / shared_dir).mkdir(exist_ok=True)

        # Create workspace configuration
        config = {
            "workspace_type": "multi_project",
            "shared_assets_enabled": True,
            "project_isolation": True,
            "created_at": datetime.now().isoformat()
        }
        (workspace_root / "workspace.yaml").write_text(yaml.dump(config, indent=2))

    def add_project(self, workspace_root: Path, project_name: str,
                    template: Optional[str] = None) -> ProjectResult:
        """Add a project to multi-project workspace.

        With ``template`` the project is created from that template;
        otherwise it gets empty ``docs`` and ``assets`` directories.
        """
        project_path = workspace_root / project_name
        try:
            project_path.mkdir(exist_ok=True)

            if template:
                # Use template if specified
                result = self.create_workspace_from_template(template, project_path, project_name)
                if not result.success:
                    raise result.error or Exception("Template creation failed")
            else:
                # Create basic project structure
                (project_path / "docs").mkdir(exist_ok=True)
                (project_path / "assets").mkdir(exist_ok=True)

            return ProjectResult(
                success=True,
                project_path=project_path,
                project_name=project_name
            )

        except Exception as e:
            return ProjectResult(
                success=False,
                project_path=project_path,
                project_name=project_name,
                error=e
            )

    def get_shared_asset_library(self, workspace_root: Path) -> Optional[AssetManager]:
        """Get shared asset library for multi-project workspace.

        Returns:
            An AssetManager rooted at ``shared_assets``, or None when that
            directory does not exist.
        """
        shared_assets_path = workspace_root / "shared_assets"
        if shared_assets_path.exists():
            return AssetManager(storage_path=shared_assets_path)
        return None

    def initialize_workspace(self, workspace_path: Path):
        """Initialize a single workspace with ``assets`` and ``docs`` directories."""
        workspace_path.mkdir(parents=True, exist_ok=True)
        (workspace_path / "assets").mkdir(exist_ok=True)
        (workspace_path / "docs").mkdir(exist_ok=True)

    def synchronize_assets(self, source_workspace: Path, target_workspace: Path,
                           sync_mode: str = "incremental") -> SyncResult:
        """Synchronize assets between workspaces.

        Copies files from the source ``assets`` directory to the target one.
        Existing target files are skipped unless ``sync_mode`` is
        ``"overwrite"``.

        Returns:
            SyncResult with copied/skipped/error counts; every caught
            exception is collected in ``errors``.
        """
        result = SyncResult(
            synchronized_count=0,
            skipped_count=0,
            error_count=0
        )

        try:
            source_assets = source_workspace / "assets"
            target_assets = target_workspace / "assets"

            if not source_assets.exists():
                return result

            target_assets.mkdir(exist_ok=True)

            # Simple synchronization (copy new files)
            for asset_file in source_assets.rglob("*"):
                if not asset_file.is_file():
                    continue

                target_file = target_assets / asset_file.relative_to(source_assets)
                if target_file.exists() and sync_mode != "overwrite":
                    result.skipped_count += 1
                    continue

                # Record a failed copy and move on, so a single unreadable
                # file does not abort the rest of the synchronization.
                try:
                    target_file.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(asset_file, target_file)
                except OSError as e:
                    result.error_count += 1
                    result.errors.append(e)
                else:
                    result.synchronized_count += 1

        except Exception as e:
            result.error_count += 1
            result.errors.append(e)

        return result

    def create_backup(self, workspace_path: Path, backup_path: Path,
                      include_assets: bool = True, compression_level: int = 6) -> BackupResult:
        """Create a backup of workspace.

        Args:
            workspace_path: Workspace directory to archive.
            backup_path: Zip file to write.
            include_assets: When False, files below an ``assets`` directory
                inside the workspace are left out.
            compression_level: Deflate level passed to ``zipfile.ZipFile``.

        Returns:
            BackupResult with the archive size in bytes, or a failed result
            carrying the exception.
        """
        try:
            backup_resolved = backup_path.resolve()
            with zipfile.ZipFile(backup_path, 'w', zipfile.ZIP_DEFLATED,
                                 compresslevel=compression_level) as backup_zip:
                for file_path in workspace_path.rglob("*"):
                    if not file_path.is_file():
                        continue

                    # The archive may live inside the workspace; never add
                    # the half-written archive to itself.
                    if file_path.name == backup_path.name and file_path.resolve() == backup_resolved:
                        continue

                    arc_name = file_path.relative_to(workspace_path)

                    # Decide on the workspace-relative path so a parent
                    # directory named "assets" does not exclude everything.
                    if not include_assets and "assets" in arc_name.parts:
                        continue

                    backup_zip.write(file_path, arc_name)

            return BackupResult(
                success=True,
                backup_path=backup_path,
                backup_size=backup_path.stat().st_size
            )

        except Exception as e:
            return BackupResult(
                success=False,
                backup_path=backup_path,
                backup_size=0,
                error=e
            )

    def restore_from_backup(self, backup_path: Path, target_path: Path) -> RestoreResult:
        """Restore workspace from backup.

        Extracts the zip archive into ``target_path`` (created if needed).

        Returns:
            RestoreResult whose ``files_restored`` counts file entries in the
            archive, or a failed result carrying the exception.
        """
        try:
            target_path.mkdir(parents=True, exist_ok=True)

            with zipfile.ZipFile(backup_path, 'r') as backup_zip:
                backup_zip.extractall(target_path)
                # Directory entries are not restored files.
                files_restored = sum(1 for entry in backup_zip.infolist() if not entry.is_dir())

            return RestoreResult(
                success=True,
                restored_path=target_path,
                files_restored=files_restored
            )

        except Exception as e:
            return RestoreResult(
                success=False,
                restored_path=target_path,
                files_restored=0,
                error=e
            )

    def capture_workspace_state(self, workspace_path: Path) -> WorkspaceState:
        """Capture current state of workspace.

        Returns:
            WorkspaceState with an MD5 checksum per file (keyed by the
            workspace-relative path), the relative paths of all files and
            directories, and the checksums of files below an ``assets``
            directory.
        """
        file_checksums = {}
        directory_structure = []
        asset_hashes = []

        # Sorted traversal keeps the captured lists in a stable order.
        for item_path in sorted(workspace_path.rglob("*")):
            relative = item_path.relative_to(workspace_path)
            relative_path = str(relative)

            if item_path.is_file():
                # Calculate file checksum (change detection, not security)
                checksum = hashlib.md5(item_path.read_bytes()).hexdigest()
                file_checksums[relative_path] = checksum

                # Track asset hashes, judged on the workspace-relative path
                if "assets" in relative.parts:
                    asset_hashes.append(checksum)

            directory_structure.append(relative_path)

        return WorkspaceState(
            timestamp=datetime.now(),
            file_checksums=file_checksums,
            directory_structure=directory_structure,
            asset_hashes=asset_hashes
        )

    def detect_conflicts(self, state1: WorkspaceState, state2: WorkspaceState) -> List[ConflictInfo]:
        """Detect conflicts between workspace states.

        A conflict is a file present in both states with different checksums.
        Files that exist in only one state are not reported.
        """
        return [
            ConflictInfo(
                file_path=Path(file_path),
                conflict_type="content_conflict",
                local_timestamp=state1.timestamp,
                remote_timestamp=state2.timestamp
            )
            for file_path, checksum1 in state1.file_checksums.items()
            if file_path in state2.file_checksums
            and state2.file_checksums[file_path] != checksum1
        ]

    def resolve_conflicts(self, conflicts: List[ConflictInfo],
                          resolution_strategy: str = "manual") -> MergeResult:
        """Resolve workspace conflicts.

        Mock implementation: every conflict is reported as resolved and no
        file is touched.
        """
        return MergeResult(
            resolved_conflicts=len(conflicts),
            unresolved_conflicts=0,
            merge_strategy=resolution_strategy
        )

    def _copy_workspace_structure(self, source: Path, target: Path, include_assets: bool):
        """Copy workspace structure from source to target.

        Copies every file while preserving relative paths. ``template.json``
        is never copied, and files below an ``assets`` directory are skipped
        when ``include_assets`` is False.
        """
        for item in source.rglob("*"):
            if not item.is_file():
                continue

            relative_path = item.relative_to(source)

            # Skip assets if not included
            if not include_assets and "assets" in relative_path.parts:
                continue

            # Skip template metadata
            if item.name == "template.json":
                continue

            target_path = target / relative_path
            target_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(item, target_path)

    def _customize_workspace(self, workspace_path: Path, project_name: str):
        """Customize workspace for specific project.

        Replaces ``{{PROJECT_NAME}}`` and ``New Project`` with ``project_name``
        in the top-level ``*.yaml`` / ``*.yml`` files. Files that cannot be
        read, decoded or written are left as they are.
        """
        config_files = list(workspace_path.glob("*.yaml")) + list(workspace_path.glob("*.yml"))

        for config_file in config_files:
            try:
                content = config_file.read_text()
                # Replace placeholder project names
                content = content.replace("{{PROJECT_NAME}}", project_name)
                content = content.replace("New Project", project_name)
                config_file.write_text(content)
            except (OSError, ValueError):
                # Best-effort: I/O and encoding problems (UnicodeError is a
                # ValueError) must not fail workspace creation.
                continue
+Database management - Backward Compatibility Module. -This module provides SQLite database initialization, markdown file storage -with front matter support, and JSON schema storage (Issue #3). +This module re-exports from markitect.storage.database for backward compatibility. +New code should import from markitect.storage.database directly. """ -import sqlite3 -import json -import os -from datetime import datetime -from pathlib import Path -from typing import Optional, Dict, Any +# Re-export from storage package for backward compatibility +from markitect.storage.database import DatabaseManager -from .frontmatter import FrontMatterParser - - -class DatabaseManager: - """Manager for SQLite database operations.""" - - def __init__(self, db_path: str): - """ - Initialize database manager. - - Args: - db_path: Path to SQLite database file - """ - self.db_path = db_path - self.front_matter_parser = FrontMatterParser() - - def initialize_database(self) -> None: - """ - Initialize SQLite database with required tables. - - Creates the markdown_files table with the following schema: - - id: INTEGER PRIMARY KEY - - filename: TEXT NOT NULL - - front_matter: TEXT (JSON) - - content: TEXT - - created_at: TIMESTAMP DEFAULT CURRENT_TIMESTAMP - - Also initializes finance schema if finance module is available. 
- """ - # Ensure directory exists - db_dir = os.path.dirname(self.db_path) - if db_dir and not os.path.exists(db_dir): - os.makedirs(db_dir) - - conn = sqlite3.connect(self.db_path) - cursor = conn.cursor() - - # Create markdown_files table - cursor.execute(''' - CREATE TABLE IF NOT EXISTS markdown_files ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - filename TEXT NOT NULL, - front_matter TEXT, - content TEXT, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - ''') - - # Create schemas table for Issue #3 - cursor.execute(''' - CREATE TABLE IF NOT EXISTS schemas ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - filename TEXT NOT NULL UNIQUE, - title TEXT, - description TEXT, - schema_content TEXT NOT NULL, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - ''') - - conn.commit() - conn.close() - - # Initialize finance schema if available - self.initialize_finance_schema() - - def initialize_finance_schema(self) -> None: - """ - Initialize finance schema for cost tracking (Issue #88). - - This method is called automatically during database initialization - to set up cost tracking tables if the finance module is available. - """ - try: - from .finance.models import FinanceModels - finance_models = FinanceModels(self.db_path) - finance_models.initialize_finance_schema() - except ImportError: - # Finance module not available, skip initialization - pass - except Exception as e: - # Silently ignore finance schema initialization errors for CLI compatibility - pass - - def store_markdown_file(self, filename: str, content: str) -> Optional[int]: - """ - Store a markdown file in the database. 
- - Args: - filename: Name of the markdown file - content: Raw markdown content with optional front matter - - Returns: - ID of the inserted record, or None if insertion failed - """ - # Parse front matter and content - front_matter, markdown_content = self.front_matter_parser.parse(content) - - # Convert front matter to JSON string - front_matter_json = json.dumps(front_matter) if front_matter else '{}' - - conn = sqlite3.connect(self.db_path) - cursor = conn.cursor() - - try: - cursor.execute(''' - INSERT INTO markdown_files (filename, front_matter, content, created_at) - VALUES (?, ?, ?, ?) - ''', (filename, front_matter_json, markdown_content, datetime.now().isoformat())) - - record_id = cursor.lastrowid - conn.commit() - return record_id - - except sqlite3.Error: - conn.rollback() - return None - - finally: - conn.close() - - def get_markdown_file(self, filename: str) -> Optional[Dict[str, Any]]: - """ - Retrieve a markdown file from the database. - - Args: - filename: Name of the markdown file to retrieve - - Returns: - Dictionary containing file data, or None if not found - """ - conn = sqlite3.connect(self.db_path) - cursor = conn.cursor() - - cursor.execute(''' - SELECT id, filename, front_matter, content, created_at - FROM markdown_files - WHERE filename = ? - ''', (filename,)) - - row = cursor.fetchone() - conn.close() - - if row: - return { - 'id': row[0], - 'filename': row[1], - 'front_matter': json.loads(row[2]) if row[2] else {}, - 'content': row[3], - 'created_at': row[4] - } - - return None - - def list_markdown_files(self) -> list: - """ - List all markdown files in the database. 
def execute_query(self, sql: str) -> list:
    """
    Execute a read-only SQL query against the database.

    The statement must start with SELECT or WITH (for CTEs) and must not
    contain any data-modifying keyword as a whole word. The keyword check is
    purely textual, so a query that merely mentions such a word inside a
    string literal (e.g. ``WHERE name = 'update'``) is rejected as well.

    Args:
        sql: SQL query string (SELECT operations only)

    Returns:
        List of dictionaries representing query results, keyed by column name

    Raises:
        ValueError: If query contains non-SELECT operations
        sqlite3.Error: If query execution fails
    """
    import re

    # Security check: only allow SELECT queries (WITH is allowed for CTEs)
    sql_upper = sql.strip().upper()
    if not sql_upper.startswith(('SELECT', 'WITH')):
        raise ValueError("Only SELECT and WITH queries are allowed for safety")

    # Additional safety checks for dangerous keywords (as whole words)
    dangerous_keywords = (
        'DROP', 'DELETE', 'UPDATE', 'INSERT', 'CREATE', 'ALTER',
        'TRUNCATE', 'REPLACE', 'PRAGMA'
    )
    for keyword in dangerous_keywords:
        # Use word boundaries to match only complete words
        if re.search(r'\b' + keyword + r'\b', sql_upper):
            raise ValueError(f"Query contains dangerous keyword: {keyword}")

    conn = sqlite3.connect(self.db_path)
    try:
        conn.row_factory = sqlite3.Row  # Enable column access by name
        # Defense in depth: even if a statement slips past the textual
        # filter, this connection refuses to modify the database.
        conn.execute("PRAGMA query_only = ON")
        cursor = conn.cursor()
        cursor.execute(sql)
        return [dict(row) for row in cursor.fetchall()]
    finally:
        # Always release the connection, whatever kind of error occurred.
        conn.close()
def store_schema_file(self, filename: str, schema_content: str) -> Optional[int]:
    """
    Store a JSON schema file in the database.

    An existing row with the same filename is updated in place; otherwise a
    new row is inserted. The schema's 'title' (defaulting to the filename)
    and 'description' (defaulting to '') are extracted for the metadata
    columns.

    Args:
        filename: Name of the schema file
        schema_content: JSON schema content as string

    Returns:
        ID of the inserted/updated record, or None if the content is not a
        JSON object or the database operation failed
    """
    try:
        schema_data = json.loads(schema_content)
    except json.JSONDecodeError:
        return None

    # Valid JSON that is not an object (list, number, string, ...) cannot be
    # a schema with title/description; reject it instead of crashing on .get.
    if not isinstance(schema_data, dict):
        return None

    title = schema_data.get('title', filename)
    description = schema_data.get('description', '')
    # One timestamp per call so created_at and updated_at agree on insert.
    now = datetime.now().isoformat()

    conn = sqlite3.connect(self.db_path)
    try:
        cursor = conn.cursor()

        # Check if schema already exists
        cursor.execute('SELECT id FROM schemas WHERE filename = ?', (filename,))
        existing = cursor.fetchone()

        if existing:
            # Update existing schema
            cursor.execute('''
                UPDATE schemas
                SET title = ?, description = ?, schema_content = ?, updated_at = ?
                WHERE filename = ?
            ''', (title, description, schema_content, now, filename))
            record_id = existing[0]
        else:
            # Insert new schema
            cursor.execute('''
                INSERT INTO schemas (filename, title, description, schema_content, created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (filename, title, description, schema_content, now, now))
            record_id = cursor.lastrowid

        conn.commit()
        return record_id

    except sqlite3.Error:
        conn.rollback()
        return None

    finally:
        conn.close()
class DocumentManager(CleanDocumentManager):
    """
    Document manager for backward compatibility.

    This class extends CleanDocumentManager to maintain compatibility
    with existing code while using the clean implementation.
    """

    def __init__(self, db_manager=None):
        super().__init__(db_manager)

    def ingest_file(self, file_path: str):
        """
        Ingest a markdown file for processing.

        This method provides compatibility for tests expecting the
        ingest_file interface. It parses the file, writes the AST to
        ``.ast_cache/<stem>_ast.json`` next to the file and, when a
        db_manager is configured, attempts to store the document.

        Args:
            file_path: Path to the markdown file

        Returns:
            Dictionary with the keys 'ast', 'content', 'metadata'
            (filename, title, size, path), 'ast_cache_path', 'parse_time'
            and 'cache_time' (always 0).

        Raises:
            FileNotFoundError: If the file does not exist
        """
        import json
        import time
        from pathlib import Path
        from .parser import parse_markdown_to_ast
        from .frontmatter import FrontMatterParser

        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Read file content
        content = file_path.read_text(encoding='utf-8')

        # Extract front matter
        start_time = time.time()
        parser = FrontMatterParser()
        front_matter_data, content_without_front_matter = parser.parse(content)

        # Parse to AST
        # NOTE(review): the full content (front matter included) is parsed,
        # not content_without_front_matter - confirm this is intended.
        ast = parse_markdown_to_ast(content)
        parse_time = time.time() - start_time

        # Extract title - first try front matter, then first heading, then filename
        title = "Unknown"
        if front_matter_data and 'title' in front_matter_data:
            title = front_matter_data['title']
        elif isinstance(ast, list):
            # Look for first H1 heading in AST tokens. enumerate() gives the
            # real position of the token; list.index() rescans the list and
            # can resolve to an earlier token that merely compares equal.
            for position, token in enumerate(ast):
                if token.get('type') == 'heading_open' and token.get('tag') == 'h1':
                    # The heading text lives in the inline token that follows
                    idx = position + 1
                    if idx < len(ast) and ast[idx].get('type') == 'inline':
                        title = ast[idx].get('content', 'Unknown')
                        break

        # Create actual cache file for compatibility
        cache_dir = Path(file_path.parent) / '.ast_cache'
        cache_dir.mkdir(exist_ok=True)
        cache_file = cache_dir / f"{file_path.stem}_ast.json"

        # Write AST to cache file
        with open(cache_file, 'w', encoding='utf-8') as f:
            json.dump(ast, f, indent=2)

        # Store document in database if db_manager exists
        if hasattr(self, 'db_manager') and self.db_manager:
            try:
                # Store using the clean document manager's method
                self.store_document(str(file_path), content, ast, front_matter_data)
            except Exception:
                # If storage fails, continue without error for test compatibility
                pass

        return {
            'ast': ast,
            'content': content,
            'metadata': {
                'filename': file_path.name,
                'title': title,
                'size': len(content),
                'path': str(file_path)
            },
            'ast_cache_path': cache_file,
            'parse_time': parse_time,
            'cache_time': 0  # Mock cache time for compatibility
        }
+ +This package contains the schema-related functionality: +- Validator: Validate markdown documents against JSON schemas +- Generator: Generate JSON schemas from markdown structures +- Loader: Load schemas from markdown files with embedded JSON +- Analyzer: Analyze schemas for rigidity issues +- Refiner: Refine rigid schemas with loosening rules +- Naming: Schema filename convention validation + +All modules are re-exported from their original schema_*.py locations +for backward compatibility. +""" + +from .validator import SchemaValidator +from .generator import SchemaGenerator +from .loader import ( + MarkdownSchemaLoader, + SchemaLoaderError, + InvalidSchemaFormatError, + SchemaNotFoundError, +) +from .analyzer import ( + SchemaAnalyzer, + SchemaAnalysisResult, + SchemaIssue, + IssueType, + IssueSeverity, +) +from .refiner import ( + SchemaRefiner, + RefinementResult, + RefinementAction, +) +from .naming import ( + validate_schema_filename, + suggest_valid_filename, + extract_schema_domain, + get_schema_version, + SchemaFilenameError, + SCHEMA_FILENAME_PATTERN, +) + +__all__ = [ + # Validator + "SchemaValidator", + # Generator + "SchemaGenerator", + # Loader + "MarkdownSchemaLoader", + "SchemaLoaderError", + "InvalidSchemaFormatError", + "SchemaNotFoundError", + # Analyzer + "SchemaAnalyzer", + "SchemaAnalysisResult", + "SchemaIssue", + "IssueType", + "IssueSeverity", + # Refiner + "SchemaRefiner", + "RefinementResult", + "RefinementAction", + # Naming + "validate_schema_filename", + "suggest_valid_filename", + "extract_schema_domain", + "get_schema_version", + "SchemaFilenameError", + "SCHEMA_FILENAME_PATTERN", +] diff --git a/markitect/schema/analyzer.py b/markitect/schema/analyzer.py new file mode 100644 index 00000000..e9e07a80 --- /dev/null +++ b/markitect/schema/analyzer.py @@ -0,0 +1,352 @@ +""" +Schema Analyzer for Phase 2: Schema Refinement Tools + +Analyzes JSON schemas to detect rigidity issues and provide suggestions +for improvement using the Phase 
from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple
import json
from dataclasses import dataclass, field
from enum import Enum


class IssueType(Enum):
    """Types of schema rigidity issues."""
    EXACT_COUNT = "exact_count"
    MISSING_CLASSIFICATIONS = "missing_classifications"
    MISSING_CONTENT_INSTRUCTIONS = "missing_content_instructions"
    OVERLY_SPECIFIC = "overly_specific"
    NO_FLEXIBILITY = "no_flexibility"
    DEPRECATED_EXTENSIONS = "deprecated_extensions"


class IssueSeverity(Enum):
    """Severity levels for schema issues."""
    INFO = "info"
    WARNING = "warning"
    ERROR = "error"


@dataclass
class SchemaIssue:
    """Represents a detected schema issue."""
    issue_type: IssueType
    severity: IssueSeverity
    path: str  # dotted location of the offending keyword, or "root"
    message: str
    suggestion: str
    current_value: Any = None
    suggested_value: Any = None


@dataclass
class SchemaAnalysisResult:
    """Results of schema analysis."""
    is_rigid: bool
    rigidity_score: int  # 0-100, higher = more rigid
    issues: List[SchemaIssue] = field(default_factory=list)
    has_classifications: bool = False
    has_content_control: bool = False
    uses_deprecated_extensions: bool = False

    @property
    def issue_count_by_severity(self) -> Dict[IssueSeverity, int]:
        """Count issues by severity (every severity is present, possibly 0)."""
        counts = {severity: 0 for severity in IssueSeverity}
        for issue in self.issues:
            counts[issue.severity] += 1
        return counts


class SchemaAnalyzer:
    """Analyzes schemas for rigidity and suggests improvements."""

    def __init__(self):
        """Initialize the schema analyzer."""
        self.deprecated_extensions = [
            "x-markitect-required-sections",
            "x-markitect-recommended-sections",
            "x-markitect-optional-sections"
        ]

    @staticmethod
    def _is_number(value: Any) -> bool:
        """Return True for real numeric constraint values (bool excluded)."""
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def analyze_schema(self, schema: Dict[str, Any]) -> SchemaAnalysisResult:
        """
        Analyze a schema for rigidity issues.

        Args:
            schema: The JSON schema to analyze

        Returns:
            SchemaAnalysisResult with detected issues and suggestions
        """
        result = SchemaAnalysisResult(is_rigid=False, rigidity_score=0)

        # Check for Phase 1 features
        result.has_classifications = "x-markitect-sections" in schema
        result.has_content_control = "x-markitect-content-control" in schema

        # Check for deprecated extensions
        for deprecated in self.deprecated_extensions:
            if deprecated in schema:
                result.uses_deprecated_extensions = True
                result.issues.append(SchemaIssue(
                    issue_type=IssueType.DEPRECATED_EXTENSIONS,
                    severity=IssueSeverity.WARNING,
                    path=deprecated,
                    message=f"Using deprecated extension '{deprecated}'",
                    suggestion="Migrate to 'x-markitect-sections' with classification system"
                ))

        # Analyze properties for rigidity
        if "properties" in schema:
            self._analyze_properties(schema["properties"], result, "properties")

        # Check for missing classifications
        if not result.has_classifications:
            result.issues.append(SchemaIssue(
                issue_type=IssueType.MISSING_CLASSIFICATIONS,
                severity=IssueSeverity.INFO,
                path="root",
                message="Schema does not use section classification system",
                suggestion="Add 'x-markitect-sections' to classify sections as required/recommended/optional/discouraged/improper"
            ))

        # Check for missing content control
        if not result.has_content_control:
            result.issues.append(SchemaIssue(
                issue_type=IssueType.MISSING_CONTENT_INSTRUCTIONS,
                severity=IssueSeverity.INFO,
                path="root",
                message="Schema does not provide content control",
                suggestion="Add 'x-markitect-content-control' for pattern validation and quality metrics"
            ))

        # Calculate rigidity score
        result.rigidity_score = self._calculate_rigidity_score(result)
        result.is_rigid = result.rigidity_score > 50

        return result

    def _analyze_properties(self, properties: Dict[str, Any], result: SchemaAnalysisResult, path: str):
        """
        Analyze schema properties for rigidity issues.

        Appends findings to ``result.issues`` and recurses into nested
        ``properties`` and array ``items``. Malformed parts of the schema
        (a non-dict ``properties`` value, non-numeric bounds) are skipped
        rather than raising, so one bad keyword does not abort the analysis.
        """
        if not isinstance(properties, dict):
            return

        for prop_name, prop_def in properties.items():
            prop_path = f"{path}.{prop_name}"

            if not isinstance(prop_def, dict):
                continue

            # Check for exact counts (const)
            if "const" in prop_def:
                result.issues.append(SchemaIssue(
                    issue_type=IssueType.EXACT_COUNT,
                    severity=IssueSeverity.WARNING,
                    path=prop_path,
                    message=f"Property '{prop_name}' requires exact value",
                    suggestion="Consider using a range or removing constraint for flexibility",
                    current_value=prop_def["const"]
                ))

            # Check for arrays with exact counts
            if prop_def.get("type") == "array":
                min_items = prop_def.get("minItems")
                max_items = prop_def.get("maxItems")
                # Ignore non-numeric bounds; arithmetic on them would raise.
                if not self._is_number(min_items):
                    min_items = None
                if not self._is_number(max_items):
                    max_items = None

                if min_items is not None and max_items is not None and min_items == max_items:
                    result.issues.append(SchemaIssue(
                        issue_type=IssueType.EXACT_COUNT,
                        severity=IssueSeverity.WARNING,
                        path=prop_path,
                        message=f"Array '{prop_name}' requires exactly {min_items} items",
                        suggestion=f"Use a range like minItems: {max(0, min_items - 2)}, maxItems: {min_items + 5}",
                        current_value={"minItems": min_items, "maxItems": max_items},
                        suggested_value={
                            "minItems": max(0, min_items - 2),
                            "maxItems": min_items + 5
                        }
                    ))

                # Check for overly specific counts (large numbers)
                # NOTE: when minItems is already a multiple of 10 the
                # suggested value equals the current one.
                if min_items is not None and min_items > 50:
                    rounded = (min_items // 10) * 10
                    result.issues.append(SchemaIssue(
                        issue_type=IssueType.OVERLY_SPECIFIC,
                        severity=IssueSeverity.INFO,
                        path=prop_path,
                        message=f"Array '{prop_name}' has very specific minItems: {min_items}",
                        suggestion=f"Consider rounding to {rounded} for flexibility",
                        current_value=min_items,
                        suggested_value=rounded
                    ))

            # Check for overly specific integer constraints
            if prop_def.get("type") == "integer":
                if "minimum" in prop_def and "maximum" in prop_def:
                    min_val = prop_def["minimum"]
                    max_val = prop_def["maximum"]

                    if (self._is_number(min_val) and self._is_number(max_val)
                            and max_val - min_val < 3):
                        result.issues.append(SchemaIssue(
                            issue_type=IssueType.NO_FLEXIBILITY,
                            severity=IssueSeverity.INFO,
                            path=prop_path,
                            message=f"Integer '{prop_name}' has very narrow range: {min_val}-{max_val}",
                            suggestion="Consider widening range for flexibility",
                            current_value={"minimum": min_val, "maximum": max_val}
                        ))

            # Recursively check nested properties
            if "properties" in prop_def:
                self._analyze_properties(prop_def["properties"], result, prop_path)

            # Check items schema for arrays
            if "items" in prop_def and isinstance(prop_def["items"], dict):
                if "properties" in prop_def["items"]:
                    self._analyze_properties(
                        prop_def["items"]["properties"],
                        result,
                        f"{prop_path}.items"
                    )

    def _calculate_rigidity_score(self, result: SchemaAnalysisResult) -> int:
        """
        Calculate overall rigidity score (0-100).

        Higher score = more rigid schema. Each issue adds a weight that
        depends on its type; the sum is capped at 100.
        """
        # Count issues by type with weighted scores
        weights = {
            IssueType.EXACT_COUNT: 15,
            IssueType.OVERLY_SPECIFIC: 10,
            IssueType.NO_FLEXIBILITY: 8,
            IssueType.MISSING_CLASSIFICATIONS: 5,
            IssueType.MISSING_CONTENT_INSTRUCTIONS: 3,
            IssueType.DEPRECATED_EXTENSIONS: 5
        }

        score = sum(weights.get(issue.issue_type, 5) for issue in result.issues)

        # Cap at 100
        return min(100, score)

    def analyze_schema_file(self, schema_path: Path) -> SchemaAnalysisResult:
        """
        Analyze a schema file.

        Args:
            schema_path: Path to JSON schema file

        Returns:
            SchemaAnalysisResult

        Raises:
            FileNotFoundError: If the file does not exist
            json.JSONDecodeError: If the file is not valid JSON
        """
        # Read as UTF-8 explicitly instead of the platform default encoding.
        with open(schema_path, encoding='utf-8') as f:
            schema = json.load(f)

        return self.analyze_schema(schema)

    def format_analysis_report(self, result: SchemaAnalysisResult, verbose: bool = False) -> str:
        """
        Format analysis results as a human-readable report.

        Args:
            result: Analysis results
            verbose: Include detailed information (current/suggested values)

        Returns:
            Formatted report string
        """
        lines = []

        # Header
        lines.append("=" * 70)
        lines.append("Schema Analysis Report")
        lines.append("=" * 70)
        lines.append("")

        # Overall assessment
        rigidity_level = "HIGH" if result.rigidity_score > 70 else "MEDIUM" if result.rigidity_score > 40 else "LOW"
        lines.append(f"Rigidity Score: {result.rigidity_score}/100 ({rigidity_level})")
        lines.append(f"Status: {'RIGID - Needs refinement' if result.is_rigid else 'FLEXIBLE - Good'}")
        lines.append("")

        # Features check
        lines.append("Phase 1 Features:")
        lines.append(f" - Classifications: {'Yes' if result.has_classifications else 'No'}")
        lines.append(f" - Content Control: {'Yes' if result.has_content_control else 'No'}")
        if result.uses_deprecated_extensions:
            lines.append(" - Deprecated Extensions: Yes (needs migration)")
        lines.append("")

        # Issue summary
        counts = result.issue_count_by_severity
        lines.append(f"Issues Found: {len(result.issues)} total")
        lines.append(f" - Errors: {counts[IssueSeverity.ERROR]}")
        lines.append(f" - Warnings: {counts[IssueSeverity.WARNING]}")
        lines.append(f" - Info: {counts[IssueSeverity.INFO]}")
        lines.append("")

        # List issues
        if result.issues:
            lines.append("Detected Issues:")
            lines.append("-" * 70)

            for i, issue in enumerate(result.issues, 1):
                severity_icon = "ERROR" if issue.severity == IssueSeverity.ERROR else "WARN" if issue.severity == IssueSeverity.WARNING else "INFO"
                lines.append(f"{i}. [{severity_icon}] {issue.message}")
                lines.append(f" Path: {issue.path}")
                lines.append(f" Suggestion: {issue.suggestion}")

                if verbose and issue.current_value is not None:
                    lines.append(f" Current: {json.dumps(issue.current_value)}")
                if verbose and issue.suggested_value is not None:
                    lines.append(f" Suggested: {json.dumps(issue.suggested_value)}")

                lines.append("")
        else:
            lines.append("No issues found - schema is well-designed!")
            lines.append("")

        # Recommendations
        if result.is_rigid:
            lines.append("Recommendations:")
            lines.append("-" * 70)
            lines.append("Run: markitect schema-refine --loosen-counts")
            lines.append(" to automatically apply suggested improvements")
            lines.append("")

        return "\n".join(lines)


def analyze_schema_cli(schema_path: str, verbose: bool = False) -> int:
    """
    CLI entry point for schema analysis.

    Prints the analysis report (or an error message) to standard output.

    Args:
        schema_path: Path to schema file
        verbose: Show detailed information

    Returns:
        Exit code: 0 = schema is flexible, 1 = rigid schema found,
        2 = the schema file could not be read, parsed or analyzed
    """
    analyzer = SchemaAnalyzer()

    try:
        result = analyzer.analyze_schema_file(Path(schema_path))
        report = analyzer.format_analysis_report(result, verbose=verbose)
        print(report)

        return 1 if result.is_rigid else 0

    except FileNotFoundError:
        print(f"Error: Schema file not found: {schema_path}")
        return 2
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in schema file: {e}")
        return 2
    except Exception as e:
        # Top-level CLI boundary: report anything unexpected as exit code 2.
        print(f"Error: {e}")
        return 2
def generate_schema_from_file(
    self,
    file_path: Path,
    max_depth: Optional[int] = None,
    mode: Optional[str] = None,
    outline_depth: Optional[int] = None,
    capture_heading_text: bool = False,
    include_content_instructions: bool = False,
    instruction_type: str = 'description'
) -> Dict[str, Any]:
    """
    Generate a JSON schema from a markdown file's AST structure.

    Args:
        file_path: Path to the markdown file
        max_depth: Maximum heading depth to include (None = unlimited)
        mode: Generation mode ('outline' for structure-focused schemas)
        outline_depth: Depth limit for outline mode
        capture_heading_text: Whether to capture exact heading text as constraints
        include_content_instructions: Whether to include content instruction fields
        instruction_type: Type of content instructions ('description', 'example', 'constraint', 'template')

    Returns:
        JSON schema as a dictionary

    Raises:
        FileNotFoundError: If the markdown file doesn't exist
        InvalidDepthError: If max_depth is invalid (< 1)
        InvalidInstructionTypeError: If instruction_type is not one of the
            supported values
    """
    # Validate inputs
    if not file_path.exists():
        raise FileNotFoundError(f"Markdown file not found: {file_path}")

    if max_depth is not None and max_depth < 1:
        raise InvalidDepthError(f"max_depth must be >= 1, got: {max_depth}")

    # Validate instruction type
    valid_instruction_types = {'description', 'example', 'constraint', 'template'}
    if instruction_type not in valid_instruction_types:
        # Sort the names: iterating a set of strings has no stable order, so
        # the unsorted message differed from one interpreter run to the next.
        raise InvalidInstructionTypeError(f"Invalid instruction type '{instruction_type}'. Must be one of: {', '.join(sorted(valid_instruction_types))}")

    # Read and parse the markdown file
    content = file_path.read_text(encoding='utf-8')
    ast_tokens = parse_markdown_to_ast(content)

    # Analyze the AST structure
    structure_analysis = self._analyze_ast_structure(ast_tokens, max_depth)

    # Generate the JSON schema
    return self._create_json_schema(
        structure_analysis,
        file_path.name,
        mode=mode,
        outline_depth=outline_depth,
        capture_heading_text=capture_heading_text,
        include_content_instructions=include_content_instructions,
        instruction_type=instruction_type
    )
+ + Args: + tokens: List of AST tokens from markdown-it + max_depth: Maximum heading depth to analyze + + Returns: + Dictionary containing structural analysis + """ + analysis = { + 'headings': defaultdict(list), + 'paragraphs': [], + 'lists': [], + 'code_blocks': [], + 'blockquotes': [], + 'tables': [], + 'links': [], + 'images': [], + 'emphasis': [], + 'structure_types': set() + } + + current_heading_level = 0 + i = 0 + + while i < len(tokens): + token = tokens[i] + token_type = token.get('type', '') + + # Track all structural types found + analysis['structure_types'].add(token_type) + + # Analyze headings with depth filtering + if token_type == 'heading_open': + level = self._extract_heading_level(token.get('tag', '')) + if max_depth is None or level <= max_depth: + heading_content = self._extract_heading_content(tokens, i) + analysis['headings'][f'level_{level}'].append({ + 'content': heading_content, + 'level': level, + 'position': i + }) + current_heading_level = level + + # Analyze paragraphs + elif token_type == 'paragraph_open': + paragraph_content = self._extract_paragraph_content(tokens, i) + analysis['paragraphs'].append({ + 'content': paragraph_content, + 'position': i, + 'under_heading_level': current_heading_level + }) + + # Analyze lists + elif token_type in ['bullet_list_open', 'ordered_list_open']: + list_structure = self._extract_list_structure(tokens, i) + analysis['lists'].append({ + 'type': 'bullet' if token_type == 'bullet_list_open' else 'ordered', + 'structure': list_structure, + 'position': i, + 'under_heading_level': current_heading_level + }) + + # Analyze code blocks + elif token_type == 'code_block' or token_type == 'fence': + code_info = self._extract_code_block_info(token) + analysis['code_blocks'].append({ + 'language': code_info.get('language', ''), + 'content_length': len(code_info.get('content', '')), + 'position': i, + 'under_heading_level': current_heading_level + }) + + # Analyze blockquotes + elif token_type == 
'blockquote_open': + quote_content = self._extract_blockquote_content(tokens, i) + analysis['blockquotes'].append({ + 'content': quote_content, + 'position': i, + 'under_heading_level': current_heading_level + }) + + # Analyze tables + elif token_type == 'table_open': + table_structure = self._extract_table_structure(tokens, i) + analysis['tables'].append({ + 'columns': table_structure.get('columns', 0), + 'rows': table_structure.get('rows', 0), + 'position': i, + 'under_heading_level': current_heading_level + }) + + # Analyze inline elements + elif token_type == 'inline': + inline_analysis = self._analyze_inline_content(token) + analysis['links'].extend(inline_analysis.get('links', [])) + analysis['images'].extend(inline_analysis.get('images', [])) + analysis['emphasis'].extend(inline_analysis.get('emphasis', [])) + + i += 1 + + # Convert sets to lists for JSON serialization + analysis['structure_types'] = list(analysis['structure_types']) + + return analysis + + def _create_json_schema( + self, + analysis: Dict[str, Any], + filename: str, + mode: Optional[str] = None, + outline_depth: Optional[int] = None, + capture_heading_text: bool = False, + include_content_instructions: bool = False, + instruction_type: str = 'description' + ) -> Dict[str, Any]: + """ + Create a JSON schema from structural analysis. 
+ + Args: + analysis: Structural analysis of the document + filename: Name of the source file + mode: Generation mode ('outline' for structure-focused schemas) + outline_depth: Depth limit for outline mode + capture_heading_text: Whether to capture exact heading text as constraints + include_content_instructions: Whether to include content instruction fields + instruction_type: Type of content instructions to generate + + Returns: + JSON schema dictionary + """ + # Determine title format based on mode + title_preposition = "from" if mode == "outline" else "for" + + schema = { + "$schema": self.default_schema_url, + "type": "object", + "title": f"Schema {title_preposition} {filename}", + "description": f"JSON schema describing the structure of {filename}", + "properties": {} + } + + # Add metaschema extensions for outline mode + if mode == "outline": + schema["x-markitect-outline-mode"] = True + if outline_depth is not None: + schema["x-markitect-outline-depth"] = outline_depth + + # Add metaschema extension for heading text capture + if capture_heading_text: + schema["x-markitect-heading-text-capture"] = True + + # Add metaschema extension for content instructions + if include_content_instructions: + schema["x-markitect-content-instructions-enabled"] = True + + # Add heading structure + if analysis['headings']: + heading_properties = {} + for level_key, headings in analysis['headings'].items(): + if headings: # Only include levels that have content + # Configure content property based on heading text capture + if capture_heading_text: + # Extract actual heading texts in document order + heading_texts = [heading['content'] for heading in headings] + content_property = {"enum": heading_texts} + else: + content_property = {"type": "string"} + + # Build properties for the heading item + item_properties = { + "content": content_property, + "level": {"type": "integer"}, + "position": {"type": "integer"} + } + + # Add content instruction fields if enabled + if 
include_content_instructions: + # Generate appropriate instruction text based on heading level + level_num = int(level_key.split('_')[1]) + section_name = f"level {level_num} heading" + instruction_text = self._generate_content_instruction(section_name, instruction_type) + + item_properties["x-markitect-content-instructions"] = { + "type": "string", + "const": instruction_text + } + + item_properties["x-markitect-instruction-type"] = { + "type": "string", + "enum": [instruction_type] + } + + heading_properties[level_key] = { + "type": "array", + "description": f"Headings at {level_key.replace('_', ' ')}", + "items": { + "type": "object", + "properties": item_properties, + "required": ["content", "level"] + }, + "minItems": len(headings), + "maxItems": len(headings) + } + + if heading_properties: + schema["properties"]["headings"] = { + "type": "object", + "description": "Document heading structure", + "properties": heading_properties + } + + # Add other structural elements + structural_elements = { + "paragraphs": ("Text paragraphs", analysis['paragraphs']), + "lists": ("Lists (ordered and unordered)", analysis['lists']), + "code_blocks": ("Code blocks and fenced code", analysis['code_blocks']), + "blockquotes": ("Block quotations", analysis['blockquotes']), + "tables": ("Tables with rows and columns", analysis['tables']), + "links": ("Links to external resources", analysis['links']), + "images": ("Embedded images", analysis['images']), + "emphasis": ("Text emphasis (bold, italic)", analysis['emphasis']) + } + + for element_name, (description, element_list) in structural_elements.items(): + if element_list: + # Build base schema for the element + element_schema = { + "type": "array", + "description": description, + "minItems": len(element_list), + "maxItems": len(element_list) + } + + # Add content instructions for paragraphs and lists if enabled + if include_content_instructions and element_name in ["paragraphs", "lists"]: + element_schema["items"] = { + "type": 
"object", + "properties": { + "content": {"type": "string"}, + "x-markitect-content-instructions": { + "type": "string", + "const": self._generate_content_instruction(element_name, instruction_type) + }, + "x-markitect-instruction-type": { + "type": "string", + "enum": [instruction_type] + } + } + } + + schema["properties"][element_name] = element_schema + + # Add metadata + schema["properties"]["metadata"] = { + "type": "object", + "description": "Document structure metadata", + "properties": { + "total_elements": { + "type": "integer", + "const": sum(len(v) if isinstance(v, list) else 0 for v in analysis.values()) + }, + "structure_types": { + "type": "array", + "items": {"type": "string"}, + "description": "All structural element types found", + "const": analysis['structure_types'] + } + } + } + + return schema + + def _extract_heading_level(self, tag: str) -> int: + """Extract heading level from HTML tag (h1, h2, etc.).""" + if tag.startswith('h') and len(tag) == 2: + try: + return int(tag[1]) + except ValueError: + pass + return 1 + + def _extract_heading_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str: + """Extract text content from heading tokens.""" + # Look for the inline token that contains the heading text + for i in range(start_index, min(start_index + 3, len(tokens))): + token = tokens[i] + if token.get('type') == 'inline': + return token.get('content', '') + return '' + + def _extract_paragraph_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str: + """Extract text content from paragraph tokens.""" + # Look for the inline token that contains the paragraph text + for i in range(start_index, min(start_index + 3, len(tokens))): + token = tokens[i] + if token.get('type') == 'inline': + return token.get('content', '') + return '' + + def _extract_list_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]: + """Extract list structure information.""" + # This is a simplified implementation + # 
In a full implementation, we'd parse the nested list structure + return { + "type": "list", + "estimated_items": 1 # Placeholder - would need more complex parsing + } + + def _extract_code_block_info(self, token: Dict[str, Any]) -> Dict[str, Any]: + """Extract code block information.""" + return { + "language": token.get('info', '').split()[0] if token.get('info') else '', + "content": token.get('content', '') + } + + def _extract_blockquote_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str: + """Extract blockquote content.""" + # Simplified implementation + return "blockquote content" + + def _extract_table_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]: + """Extract table structure information.""" + # Simplified implementation + return { + "columns": 2, # Placeholder + "rows": 1 # Placeholder + } + + def _analyze_inline_content(self, token: Dict[str, Any]) -> Dict[str, List[Any]]: + """Analyze inline content for links, images, emphasis.""" + result = { + "links": [], + "images": [], + "emphasis": [] + } + + # Analyze children tokens if they exist + children = token.get('children', []) + for child in children: + if child and isinstance(child, dict): + child_type = child.get('type', '') + if child_type == 'link_open': + result['links'].append({"type": "link"}) + elif child_type == 'image': + result['images'].append({"type": "image"}) + elif child_type in ['em_open', 'strong_open']: + result['emphasis'].append({"type": child_type}) + + return result + + def _generate_content_instruction(self, heading_text: str, instruction_type: str) -> str: + """ + Generate appropriate content instruction text based on heading and instruction type. 
+ + Args: + heading_text: The text of the heading + instruction_type: Type of instruction to generate + + Returns: + Instruction text for the content field + """ + if instruction_type == "description": + return f"Provide content for the '{heading_text}' section" + elif instruction_type == "example": + return f"Example content for the '{heading_text}' section" + elif instruction_type == "constraint": + return f"Content must be relevant to '{heading_text}'" + elif instruction_type == "template": + return f"Template content for '{heading_text}' section" + else: + # Default fallback + return f"Content for the '{heading_text}' section" diff --git a/markitect/schema/loader.py b/markitect/schema/loader.py new file mode 100644 index 00000000..76317003 --- /dev/null +++ b/markitect/schema/loader.py @@ -0,0 +1,610 @@ +""" +Schema Loader - Extract JSON schemas from markdown files. + +This module provides functionality to load schemas from markdown files that +contain embedded JSON schemas in code blocks, along with YAML frontmatter +metadata and rich documentation. + +Markdown Schema Format: + --- + schema-id: "https://markitect.dev/schemas/domain/v1" + version: "1.0.0" + status: "stable|draft|deprecated" + --- + + # Schema Title v1.0 + + ## Documentation sections... + + ## Schema Definition + + ```json + { + "$schema": "http://json-schema.org/draft-07/schema#", + ... 
+ } + ``` + +This enables: +- Rich documentation alongside schemas +- Version history in same file +- Human-readable schema files +- Markdown-first approach aligned with MarkiTect philosophy +""" + +import re +import json +import yaml +from pathlib import Path +from typing import Dict, Any, Optional, List, Tuple + + +class SchemaLoaderError(Exception): + """Base exception for schema loading errors.""" + pass + + +class InvalidSchemaFormatError(SchemaLoaderError): + """Schema file format is invalid.""" + pass + + +class SchemaNotFoundError(SchemaLoaderError): + """No JSON schema found in markdown file.""" + pass + + +class MarkdownSchemaLoader: + """ + Load and parse markdown schema files. + + Supports: + - YAML frontmatter for metadata + - JSON code blocks for schema definition + - Validation of schema structure + - Metadata merging + + Example: + >>> loader = MarkdownSchemaLoader() + >>> schema_data = loader.load_schema(Path("manpage-schema-v1.0.md")) + >>> schema = schema_data['schema'] + >>> metadata = schema_data['metadata'] + """ + + def __init__(self): + """Initialize the schema loader with regex patterns.""" + # Pattern to match YAML frontmatter + # Matches: --- ... --- at start of file + self.frontmatter_pattern = re.compile( + r'^---\s*\n(.*?)\n---\s*\n', + re.DOTALL | re.MULTILINE + ) + + # Pattern to match JSON code blocks + # Matches: ```json ... ``` + self.json_code_block_pattern = re.compile( + r'```json\s*\n(.*?)\n```', + re.DOTALL | re.MULTILINE + ) + + # Pattern to find Schema Definition section + # This helps us find the right JSON block if there are multiple + self.schema_section_pattern = re.compile( + r'##\s+Schema Definition\s*\n', + re.MULTILINE + ) + + def load_schema(self, md_path: Path) -> Dict[str, Any]: + """ + Load schema from markdown file. 
+ + Args: + md_path: Path to markdown schema file + + Returns: + Dictionary containing: + - schema: Extracted JSON schema (dict) + - metadata: Frontmatter metadata (dict) + - documentation: Full markdown content (str) + - source_file: Source file path (str) + + Raises: + FileNotFoundError: If schema file doesn't exist + InvalidSchemaFormatError: If file format is invalid + SchemaNotFoundError: If no JSON schema found + + Example: + >>> loader = MarkdownSchemaLoader() + >>> data = loader.load_schema(Path("manpage-schema-v1.0.md")) + >>> print(data['schema']['title']) + 'Unix Manual Page Schema' + """ + if not md_path.exists(): + raise FileNotFoundError(f"Schema file not found: {md_path}") + + # Read file content + try: + content = md_path.read_text(encoding='utf-8') + except Exception as e: + raise InvalidSchemaFormatError(f"Failed to read schema file: {e}") + + # Extract frontmatter + metadata = self._extract_frontmatter(content) + + # Extract JSON schema + schema = self._extract_json_schema(content) + + if not schema: + raise SchemaNotFoundError( + f"No JSON schema found in {md_path}. " + f"Expected a ```json code block with schema definition." + ) + + # Merge metadata into schema + schema = self._merge_metadata(schema, metadata, md_path) + + return { + 'schema': schema, + 'metadata': metadata, + 'documentation': content, + 'source_file': str(md_path) + } + + def _extract_frontmatter(self, content: str) -> Dict[str, Any]: + """ + Extract YAML frontmatter from markdown content. 
+ + Args: + content: Markdown file content + + Returns: + Dictionary of frontmatter metadata (empty if none found) + + Raises: + InvalidSchemaFormatError: If YAML is malformed + """ + match = self.frontmatter_pattern.search(content) + if not match: + return {} + + yaml_content = match.group(1) + try: + metadata = yaml.safe_load(yaml_content) or {} + if not isinstance(metadata, dict): + raise InvalidSchemaFormatError( + f"Frontmatter must be a YAML dictionary, got {type(metadata)}" + ) + return metadata + except yaml.YAMLError as e: + raise InvalidSchemaFormatError(f"Invalid YAML frontmatter: {e}") + + def _extract_json_schema(self, content: str) -> Optional[Dict[str, Any]]: + """ + Extract JSON schema from markdown code blocks. + + Prefers JSON blocks under "## Schema Definition" section, + but will use first JSON block if no Schema Definition section found. + + Args: + content: Markdown file content + + Returns: + JSON schema dictionary or None if not found + + Raises: + InvalidSchemaFormatError: If JSON is malformed + """ + # Find all JSON code blocks + json_blocks = self.json_code_block_pattern.findall(content) + + if not json_blocks: + return None + + # Try to find the Schema Definition section + schema_section_match = self.schema_section_pattern.search(content) + + if schema_section_match: + # Find JSON block that comes after Schema Definition section + section_pos = schema_section_match.end() + + # Re-search for JSON blocks starting from section position + remaining_content = content[section_pos:] + section_json_blocks = self.json_code_block_pattern.findall(remaining_content) + + if section_json_blocks: + json_text = section_json_blocks[0] + else: + # Fallback to first JSON block in entire document + json_text = json_blocks[0] + else: + # No Schema Definition section, use first JSON block + json_text = json_blocks[0] + + # Parse JSON + try: + schema = json.loads(json_text) + if not isinstance(schema, dict): + raise InvalidSchemaFormatError( + f"Schema must be 
a JSON object, got {type(schema)}" + ) + return schema + except json.JSONDecodeError as e: + raise InvalidSchemaFormatError(f"Invalid JSON schema: {e}") + + def _merge_metadata( + self, + schema: Dict[str, Any], + metadata: Dict[str, Any], + source_file: Path + ) -> Dict[str, Any]: + """ + Merge frontmatter metadata into schema. + + Adds x-markitect-source extension with file info and metadata. + Optionally overrides schema fields with frontmatter values. + + Args: + schema: JSON schema dictionary + metadata: Frontmatter metadata dictionary + source_file: Path to source file + + Returns: + Schema with merged metadata + """ + # Create a copy to avoid modifying original + merged_schema = schema.copy() + + # Add MarkiTect-specific source metadata + merged_schema['x-markitect-source'] = { + 'file': str(source_file), + 'filename': source_file.name, + 'format': 'markdown', + 'frontmatter': metadata + } + + # Override schema fields with frontmatter if present + # This allows frontmatter to be the source of truth for metadata + if 'version' in metadata: + merged_schema['version'] = metadata['version'] + + if 'schema-id' in metadata: + merged_schema['$id'] = metadata['schema-id'] + + if 'status' in metadata: + if 'x-markitect-metadata' not in merged_schema: + merged_schema['x-markitect-metadata'] = {} + merged_schema['x-markitect-metadata']['status'] = metadata['status'] + + return merged_schema + + def save_schema( + self, + schema: Dict[str, Any], + md_path: Path, + template: Optional[str] = None, + frontmatter: Optional[Dict[str, Any]] = None + ): + """ + Save schema as markdown file. + + Args: + schema: JSON schema dictionary to save + md_path: Output path for markdown file + template: Optional markdown template string + frontmatter: Optional frontmatter metadata (extracted from schema if not provided) + + Raises: + InvalidSchemaFormatError: If schema is invalid + + Example: + >>> loader = MarkdownSchemaLoader() + >>> loader.save_schema( + ... 
def _generate_markdown(
    self,
    schema: Dict[str, Any],
    frontmatter: Optional[Dict[str, Any]] = None
) -> str:
    """
    Generate markdown from schema.

    The document consists of YAML frontmatter, a title, an overview taken
    from the schema description, a usage snippet, the pretty-printed schema
    in a ```json block under "## Schema Definition", and a version history
    stub.

    Args:
        schema: JSON schema dictionary
        frontmatter: Optional frontmatter metadata. Missing ``schema-id``,
            ``version`` and ``status`` entries are filled in from the schema
            (``status`` defaults to ``'draft'``). The passed dictionary is
            not modified.

    Returns:
        Markdown content as string
    """
    # Extract metadata from schema
    title = schema.get('title', 'Untitled Schema')
    version = schema.get('version', '1.0.0')
    description = schema.get('description', '')
    schema_id = schema.get('$id', '')

    # Work on a private copy so the defaults added below never leak into
    # the dictionary owned by the caller.
    frontmatter = dict(frontmatter) if frontmatter else {}

    # Set defaults
    if 'schema-id' not in frontmatter and schema_id:
        frontmatter['schema-id'] = schema_id
    frontmatter.setdefault('version', version)
    frontmatter.setdefault('status', 'draft')

    # Generate frontmatter YAML
    frontmatter_yaml = yaml.dump(
        frontmatter,
        default_flow_style=False,
        allow_unicode=True
    ).strip()

    # Generate JSON (pretty-printed)
    schema_json = json.dumps(schema, indent=2, ensure_ascii=False)

    # Build markdown content
    md_content = f"""---
{frontmatter_yaml}
---

# {title} v{version}

## Overview

{description}

## Usage

```bash
markitect validate document.md --schema {Path(frontmatter.get('schema-id', 'schema')).name}
```

## Schema Definition

```json
{schema_json}
```

## Version History

### v{version}
- Initial version
"""

    return md_content
def auto_ingest_schemas(db_manager=None, schema_dir: Optional[Path] = None, verbose: bool = False) -> Dict[str, Any]:
    """Ingest markdown schema files that are not yet stored in the database.

    Files matching ``*-schema-v*.md`` in the schema directory are loaded and
    stored under their filename. Files whose name is already listed by the
    database are skipped.

    Args:
        db_manager: Database manager to use. When omitted, a
            ``DatabaseManager`` on ``~/.markitect/markitect.db`` is created
            and initialized.
        schema_dir: Directory to scan (defaults to the ``schemas`` directory
            next to this module's parent package directory)
        verbose: If True, print progress messages

    Returns:
        Dictionary with ingestion results:
            {
                'ingested': [schema names that were stored],
                'skipped': [schema names that were already present],
                'failed': [(schema_name, error) tuples for failures]
            }
        All three lists are empty when the directory is missing or the
        existing schemas cannot be listed.

    Example:
        >>> from markitect.schema.loader import auto_ingest_schemas
        >>> results = auto_ingest_schemas(verbose=True)
        >>> print(f"Ingested {len(results['ingested'])} schemas")
    """
    def _empty_report() -> Dict[str, Any]:
        return {'ingested': [], 'skipped': [], 'failed': []}

    # Resolve the directory to scan.
    if schema_dir is None:
        schema_dir = Path(__file__).parent.parent / "schemas"

    if not schema_dir.exists():
        if verbose:
            print(f"Warning: Schema directory not found: {schema_dir}")
        return _empty_report()

    # Fall back to the per-user database when no manager was supplied.
    if db_manager is None:
        from markitect.database import DatabaseManager
        db_manager = DatabaseManager(str(Path.home() / '.markitect' / 'markitect.db'))
        db_manager.initialize_database()

    # Names already known to the database decide what gets skipped.
    try:
        known_names = {row['name'] for row in db_manager.list_schemas()}
    except Exception as e:
        if verbose:
            print(f"Error listing existing schemas: {e}")
        return _empty_report()

    report = _empty_report()
    candidates = sorted(schema_dir.glob("*-schema-v*.md"))

    if verbose and candidates:
        print(f"Found {len(candidates)} schema file(s) in {schema_dir}")

    loader = MarkdownSchemaLoader()

    for candidate in candidates:
        name = candidate.name

        if name in known_names:
            report['skipped'].append(name)
            if verbose:
                print(f"Skipping {name} (already ingested)")
            continue

        # Any failure while loading or storing one file is recorded and the
        # remaining files are still processed.
        try:
            parsed = loader.load_schema(candidate)['schema']
            serialized = json.dumps(parsed, indent=2)
            stored = db_manager.store_schema_file(name, serialized)

            if not stored:
                report['failed'].append((name, "Failed to store in database"))
                if verbose:
                    print(f"Failed to store {name} in database")
            else:
                report['ingested'].append(name)
                if verbose:
                    title = parsed.get('title', name)
                    print(f"Ingested {name} (title: {title})")

        except Exception as e:
            report['failed'].append((name, str(e)))
            if verbose:
                print(f"Failed to ingest {name}: {e}")

    if verbose:
        print(f"\nAuto-ingestion complete:")
        print(f" Ingested: {len(report['ingested'])}")
        print(f" Skipped: {len(report['skipped'])}")
        print(f" Failed: {len(report['failed'])}")

    return report
# Regex pattern for schema filename validation
# Matches: {domain}-schema-v{major}.{minor}.md
# Where domain is lowercase letters/numbers/hyphens starting with letter
# The named groups 'domain', 'major' and 'minor' are read by
# validate_schema_filename(); without the <name> part the pattern does not
# compile.
SCHEMA_FILENAME_PATTERN = re.compile(
    r'^(?P<domain>[a-z][a-z0-9-]*)-schema-v(?P<major>\d+)\.(?P<minor>\d+)\.md$'
)
def suggest_valid_filename(
    domain: str,
    version: str = "1.0",
    normalize: bool = True
) -> str:
    """
    Build a convention-compliant schema filename for a domain and version.

    Args:
        domain: The schema domain (e.g., "manpage", "API Documentation")
        version: Version string in format "major.minor" (default: "1.0")
        normalize: When True, lowercase the domain, turn spaces and
            underscores into hyphens, collapse hyphen runs and trim hyphens
            at both ends before validating it

    Returns:
        Filename of the form ``{domain}-schema-v{major}.{minor}.md``

    Raises:
        ValueError: If the domain is empty or malformed, or the version is
            not two non-negative integers separated by a dot

    Examples:
        >>> suggest_valid_filename("manpage", "1.0")
        'manpage-schema-v1.0.md'

        >>> suggest_valid_filename("API Documentation", "2.1")
        'api-documentation-schema-v2.1.md'

        >>> suggest_valid_filename("My_Custom_Type", "1.0")
        'my-custom-type-schema-v1.0.md'
    """
    if not domain:
        raise ValueError("Domain cannot be empty")

    domain_clean = domain
    if normalize:
        hyphenated = domain.lower().replace(' ', '-').replace('_', '-')
        domain_clean = re.sub(r'-+', '-', hyphenated).strip('-')

    # The domain must look exactly like the domain part of a valid filename.
    if re.match(r'^[a-z][a-z0-9-]*$', domain_clean) is None:
        raise ValueError(
            f"Invalid domain '{domain_clean}': must start with lowercase letter "
            "and contain only lowercase letters, numbers, and hyphens"
        )

    pieces = version.split('.')
    if len(pieces) != 2:
        raise ValueError(
            f"Invalid version '{version}': must be in format 'major.minor' (e.g., '1.0')"
        )

    try:
        major, minor = (int(piece) for piece in pieces)
    except ValueError:
        raise ValueError(
            f"Invalid version '{version}': major and minor must be integers"
        )

    if min(major, minor) < 0:
        raise ValueError(
            f"Invalid version '{version}': major and minor must be non-negative"
        )

    return f"{domain_clean}-schema-v{major}.{minor}.md"
def get_validation_errors(filename: str) -> list:
    """
    Get detailed validation errors for a filename.

    A valid filename yields an empty list. Otherwise the list starts with a
    general pattern-mismatch message, followed by one message per specific
    problem found: wrong extension, missing or malformed version, missing
    '-schema-' keyword, uppercase characters, malformed domain.

    Args:
        filename: Filename to validate

    Returns:
        List of error messages (empty if valid)

    Examples:
        >>> get_validation_errors("manpage-schema-v1.0.md")
        []

        >>> get_validation_errors("invalid.json")
        ['Filename does not match pattern: {domain}-schema-v{major}.{minor}.md', ...]
    """
    errors = []

    # Check basic pattern match
    is_valid, _ = validate_schema_filename(filename)
    if is_valid:
        return errors

    # Provide detailed feedback
    errors.append(
        f"Filename does not match pattern: {{domain}}-schema-v{{major}}.{{minor}}.md"
    )

    # Check extension
    if not filename.endswith('.md'):
        errors.append(f"Extension must be '.md', got: {Path(filename).suffix}")

    # Check for version
    if '-v' not in filename:
        errors.append("Missing version: filename must include '-v{major}.{minor}'")
    elif not re.search(r'-v\d+\.\d+', filename):
        errors.append(
            "Invalid version format: must be '-v{major}.{minor}' (e.g., '-v1.0')"
        )

    # Check for schema keyword
    if '-schema-' not in filename:
        errors.append("Missing '-schema-' keyword in filename")

    # Check for uppercase (must be lowercase)
    if any(c.isupper() for c in filename):
        errors.append("Filename must be lowercase")

    # Check domain format, but only when the domain can actually be isolated,
    # i.e. the '-schema-' separator is present. Without this guard the whole
    # filename would be reported as an invalid "domain".
    domain, separator, _rest = filename.partition('-schema-')
    if separator and domain and not re.match(r'^[a-z][a-z0-9-]*$', domain):
        errors.append(
            f"Invalid domain '{domain}': must start with lowercase letter "
            "and contain only lowercase letters, numbers, and hyphens"
        )

    return errors
def format_validation_message(filename: str) -> str:
    """
    Build a human-readable validation report for a filename.

    Args:
        filename: Filename to report on

    Returns:
        A single confirmation line when the filename is valid; otherwise a
        multi-line message with the numbered errors, the expected format, an
        example and, when one can be derived, a suggested filename.

    Examples:
        >>> print(format_validation_message("manpage.json"))
        Invalid schema filename: manpage.json
        ...
    """
    problems = get_validation_errors(filename)
    if not problems:
        return f"\u2705 Valid schema filename: {filename}"

    pieces = [f"\u274c Invalid schema filename: {filename}\n\n", "Errors:\n"]
    pieces.extend(
        f" {number}. {problem}\n" for number, problem in enumerate(problems, 1)
    )
    pieces.append("\nExpected format: {domain}-schema-v{major}.{minor}.md\n")
    pieces.append("Example: manpage-schema-v1.0.md\n")

    # Best-effort suggestion: the domain guess is everything before the first
    # hyphen or dot; if no valid name can be built from it, the suggestion is
    # simply left out.
    try:
        domain_guess = filename.split('-')[0].split('.')[0]
        suggestion = suggest_valid_filename(domain_guess, "1.0")
        pieces.append(f"\nSuggested filename: {suggestion}\n")
    except Exception:
        pass

    return "".join(pieces)
+""" + +from pathlib import Path +from typing import Dict, Any, List, Optional, Tuple +import json +import copy +from dataclasses import dataclass, field + +from .analyzer import SchemaAnalyzer, SchemaIssue, IssueType, IssueSeverity + + +@dataclass +class RefinementAction: + """Represents a refinement action taken on the schema.""" + issue_type: IssueType + path: str + description: str + old_value: Any = None + new_value: Any = None + + +@dataclass +class RefinementResult: + """Results of schema refinement.""" + success: bool + actions_taken: List[RefinementAction] = field(default_factory=list) + refined_schema: Optional[Dict[str, Any]] = None + error_message: Optional[str] = None + + +class SchemaRefiner: + """Refines rigid schemas by applying loosening rules.""" + + def __init__(self): + """Initialize the schema refiner.""" + self.analyzer = SchemaAnalyzer() + + def _navigate_to_path(self, schema: Dict[str, Any], path: str) -> Optional[Tuple[Dict[str, Any], str]]: + """ + Navigate to a path in the schema, handling nested 'properties' objects. + + Returns (parent_object, property_name) or None if path doesn't exist. 
+ """ + path_parts = path.split('.') + obj = schema + + # Navigate through all but the last part + for i, part in enumerate(path_parts[:-1]): + # Try direct access first + if part in obj: + obj = obj[part] + # If not found and obj has 'properties', try there + elif isinstance(obj, dict) and "properties" in obj and part in obj["properties"]: + obj = obj["properties"][part] + else: + return None + + # For the final part, check if we need to descend into 'properties' + prop_name = path_parts[-1] + if prop_name in obj: + return (obj, prop_name) + elif isinstance(obj, dict) and "properties" in obj and prop_name in obj["properties"]: + return (obj["properties"], prop_name) + else: + return None + + def refine_schema_interactive( + self, + schema: Dict[str, Any], + loosen_counts: bool = True, + migrate_deprecated: bool = False, + round_numbers: bool = True + ) -> RefinementResult: + """ + Refine a schema interactively, prompting for each fix. + + Args: + schema: The JSON schema to refine + loosen_counts: Enable fixes for exact counts + migrate_deprecated: Enable migration of deprecated extensions + round_numbers: Enable rounding of overly specific numbers + + Returns: + RefinementResult with actions taken and refined schema + """ + result = RefinementResult(success=False) + + try: + # Analyze the schema first + analysis = self.analyzer.analyze_schema(schema) + + print(f"\nFound {len(analysis.issues)} issue(s) to review\n") + + # Deep copy to avoid modifying original + refined = copy.deepcopy(schema) + + # Process each issue interactively + for i, issue in enumerate(analysis.issues, 1): + print(f"Issue {i}/{len(analysis.issues)}") + print(f" Type: {issue.issue_type.value}") + print(f" Path: {issue.path}") + print(f" {issue.message}") + print(f" Suggestion: {issue.suggestion}") + + if issue.current_value is not None: + print(f" Current: {json.dumps(issue.current_value)}") + if issue.suggested_value is not None: + print(f" Suggested: {json.dumps(issue.suggested_value)}") + + 
# Ask user if they want to apply the fix + response = input("\nApply this fix? [y/N/q]: ").strip().lower() + + if response == 'q': + print("Refinement cancelled by user") + result.success = False + return result + elif response == 'y': + action = None + + if loosen_counts and issue.issue_type == IssueType.EXACT_COUNT: + action = self._fix_exact_count(refined, issue) + + elif round_numbers and issue.issue_type == IssueType.OVERLY_SPECIFIC: + action = self._fix_overly_specific(refined, issue) + + elif loosen_counts and issue.issue_type == IssueType.NO_FLEXIBILITY: + action = self._fix_no_flexibility(refined, issue) + + elif migrate_deprecated and issue.issue_type == IssueType.DEPRECATED_EXTENSIONS: + action = self._fix_deprecated_extension(refined, issue) + + if action: + result.actions_taken.append(action) + print(f" ✓ Applied") + else: + print(f" ✗ Could not apply fix") + else: + print(f" - Skipped") + + print() + + result.refined_schema = refined + result.success = True + + except Exception as e: + result.error_message = str(e) + + return result + + def refine_schema( + self, + schema: Dict[str, Any], + loosen_counts: bool = True, + migrate_deprecated: bool = False, + round_numbers: bool = True + ) -> RefinementResult: + """ + Refine a schema by applying fixes for detected issues. 
def _fix_exact_count(self, schema: Dict[str, Any], issue: SchemaIssue) -> Optional[RefinementAction]:
    """
    Fix exact count constraints by converting them to ranges.

    Two shapes are handled, in this order:

    * an array whose ``minItems`` equals ``maxItems`` (= n) is loosened
      to ``max(0, n - 2)`` .. ``n + 5``;
    * a ``const`` is removed; an integer const is replaced by
      ``minimum``/``maximum`` one below and one above the old value.
      Booleans are not treated as integers here, so ``const: true`` is
      simply dropped rather than turned into a 0..2 range.

    The schema is modified in place.

    Args:
        schema: Schema to modify
        issue: Issue whose ``path`` locates the property to loosen

    Returns:
        RefinementAction describing the change, or None when the path
        cannot be resolved or the property matches neither shape
    """
    nav_result = self._navigate_to_path(schema, issue.path)
    if not nav_result:
        return None

    parent, prop_name = nav_result
    prop_def = parent[prop_name]
    if not isinstance(prop_def, dict):
        return None

    # Array pinned to exactly n items -> allow a window around n.
    if prop_def.get("type") == "array":
        min_items = prop_def.get("minItems")
        max_items = prop_def.get("maxItems")

        if min_items is not None and max_items is not None and min_items == max_items:
            new_min = max(0, min_items - 2)  # item counts cannot go negative
            new_max = min_items + 5

            prop_def["minItems"] = new_min
            prop_def["maxItems"] = new_max

            return RefinementAction(
                issue_type=IssueType.EXACT_COUNT,
                path=issue.path,
                description=f"Loosened array count from exactly {min_items} to range {new_min}-{new_max}",
                old_value={"minItems": min_items, "maxItems": max_items},
                new_value={"minItems": new_min, "maxItems": new_max}
            )

    # Single allowed value -> drop it, keeping a +/-1 range for integers.
    if "const" in prop_def:
        const_value = prop_def.pop("const")

        # bool is a subclass of int; exclude it explicitly.
        if isinstance(const_value, int) and not isinstance(const_value, bool):
            low = const_value - 1
            high = const_value + 1
            prop_def["minimum"] = low
            prop_def["maximum"] = high

            return RefinementAction(
                issue_type=IssueType.EXACT_COUNT,
                path=issue.path,
                description=f"Converted const {const_value} to range {low}-{high}",
                old_value=const_value,
                new_value={"minimum": low, "maximum": high}
            )

        # For non-integer constants, just remove the constraint.
        return RefinementAction(
            issue_type=IssueType.EXACT_COUNT,
            path=issue.path,
            description=f"Removed const constraint: {const_value}",
            old_value=const_value,
            new_value=None
        )

    return None
def _fix_no_flexibility(self, schema: Dict[str, Any], issue: SchemaIssue) -> Optional[RefinementAction]:
    """
    Fix narrow ranges by widening them.

    When the addressed property defines both ``minimum`` and ``maximum``,
    each bound is moved outward by 5. The schema is modified in place.

    Args:
        schema: Schema to modify
        issue: Issue whose ``path`` locates the property to widen

    Returns:
        RefinementAction with the old and new bounds, or None when the
        path cannot be resolved or the property lacks either bound
    """
    nav_result = self._navigate_to_path(schema, issue.path)
    if not nav_result:
        return None

    parent, prop_name = nav_result
    prop_def = parent[prop_name]

    has_both_bounds = (
        isinstance(prop_def, dict)
        and "minimum" in prop_def
        and "maximum" in prop_def
    )
    if not has_both_bounds:
        return None

    old_min = prop_def["minimum"]
    old_max = prop_def["maximum"]

    # Widen the range symmetrically.
    new_min = old_min - 5
    new_max = old_max + 5

    prop_def["minimum"] = new_min
    prop_def["maximum"] = new_max

    return RefinementAction(
        issue_type=IssueType.NO_FLEXIBILITY,
        path=issue.path,
        description=f"Widened range from {old_min}-{old_max} to {new_min}-{new_max}",
        old_value={"minimum": old_min, "maximum": old_max},
        new_value={"minimum": new_min, "maximum": new_max}
    )
def format_refinement_report(self, result: "RefinementResult") -> str:
    """
    Render refinement results as a plain-text report.

    The report always opens with a banner. A failed result adds a single
    failure line and stops there. A successful one adds a summary line
    and, if any actions were taken, a numbered list giving each action's
    description, path and (when not None) JSON-encoded before/after values.

    Args:
        result: Refinement results to describe

    Returns:
        Report text with lines joined by newlines
    """
    banner = "=" * 70
    report = [banner, "Schema Refinement Report", banner, ""]

    if not result.success:
        report.append(f"❌ Refinement failed: {result.error_message}")
        return "\n".join(report)

    actions = result.actions_taken
    total = len(actions)
    if total == 0:
        summary = "✅ No refinements needed - schema is already flexible"
    else:
        summary = f"✅ Applied {total} refinement(s)"
    report += [summary, ""]

    if actions:
        report += ["Actions Taken:", "-" * 70]

        for number, action in enumerate(actions, 1):
            entry = [f"{number}. {action.description}", f" Path: {action.path}"]
            if action.old_value is not None:
                entry.append(f" Before: {json.dumps(action.old_value)}")
            if action.new_value is not None:
                entry.append(f" After: {json.dumps(action.new_value)}")
            entry.append("")  # blank line between actions
            report += entry

    return "\n".join(report)
refiner.refine_schema_file( + input_path, + output_path, + loosen_counts=loosen_counts, + migrate_deprecated=migrate_deprecated, + round_numbers=round_numbers + ) + + # Only print full report if not in interactive mode (user already saw changes) + if not interactive: + report = refiner.format_refinement_report(result) + print(report) + elif result.success: + # Just print summary for interactive mode + print(f"\n{'='*70}") + print(f"Refinement complete: {len(result.actions_taken)} change(s) applied") + print(f"{'='*70}") + + if result.success and len(result.actions_taken) > 0: + return 0 # Success with changes + elif result.success: + return 1 # Success but no changes needed + else: + return 2 # Error + + except FileNotFoundError: + print(f"Error: Schema file not found: {schema_path}") + return 2 + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON in schema file: {e}") + return 2 + except Exception as e: + print(f"Error: {e}") + return 2 diff --git a/markitect/schema/validator.py b/markitect/schema/validator.py new file mode 100644 index 00000000..fd3377a9 --- /dev/null +++ b/markitect/schema/validator.py @@ -0,0 +1,679 @@ +""" +Schema Validator for Issue #7: Validate a Markdown File Against a Schema. + +This module provides functionality to validate markdown documents against JSON schemas +for arc42 architecture documentation compliance checking - essential for intelligent +document analysis and plan-actual comparison capabilities. 
+""" + +import json +from pathlib import Path +from typing import Dict, Any + +try: + import jsonschema + from jsonschema import SchemaError + JSONSCHEMA_AVAILABLE = True +except ImportError: + # Fallback to basic validation without full JSON Schema validation + JSONSCHEMA_AVAILABLE = False + SchemaError = Exception + +from markitect.core.parser import parse_markdown_to_ast +from .generator import SchemaGenerator +from markitect.validation_error import ValidationErrorCollector, ValidationErrorType +from markitect.exceptions import FileNotFoundError, SchemaValidationError, InvalidSchemaError + + +class SchemaValidator: + """ + Validates markdown documents against JSON schemas for arc42 compliance checking. + + This service provides boolean validation results for markdown documents against + schemas, enabling strict compliance checking for architectural documentation + templates and intelligent plan-actual comparison. + """ + + def __init__(self): + """Initialize the schema validator.""" + self.schema_generator = SchemaGenerator() + self.jsonschema_available = JSONSCHEMA_AVAILABLE + + def validate_file_against_schema(self, file_path: Path, schema: Dict[str, Any]) -> bool: + """ + Validate a markdown file against a JSON schema. 
def validate_file_against_schema_string(self, file_path: Path, schema_json: str) -> bool:
    """
    Validate a markdown file against a schema given as JSON text.

    The text is decoded first; validation itself is delegated to
    ``validate_file_against_schema``.

    Args:
        file_path: Path to the markdown file
        schema_json: JSON schema as a string

    Returns:
        Whatever ``validate_file_against_schema`` returns for the decoded schema

    Raises:
        InvalidSchemaError: If ``schema_json`` is not valid JSON
    """
    try:
        decoded = json.loads(schema_json)
    except json.JSONDecodeError as exc:
        raise InvalidSchemaError(f"Invalid JSON schema string: {exc}") from exc
    else:
        return self.validate_file_against_schema(file_path, decoded)
+ + Args: + file_path: Path to the markdown file + schema_file_path: Path to the JSON schema file + + Returns: + True if the document matches the schema, False otherwise + + Raises: + FileNotFoundError: If either file doesn't exist + InvalidSchemaError: If the schema file is invalid + """ + if not schema_file_path.exists(): + raise FileNotFoundError(f"Schema file not found: {schema_file_path}") + + try: + schema_content = schema_file_path.read_text(encoding='utf-8') + schema = json.loads(schema_content) + except (IOError, json.JSONDecodeError) as e: + raise InvalidSchemaError(f"Failed to load schema file {schema_file_path}: {e}") from e + + return self.validate_file_against_schema(file_path, schema) + + def _validate_schema(self, schema: Dict[str, Any]) -> None: + """ + Validate that a schema is a valid JSON Schema. + + Args: + schema: Schema dictionary to validate + + Raises: + InvalidSchemaError: If the schema is invalid + """ + try: + # Check basic schema structure + if not isinstance(schema, dict): + raise InvalidSchemaError("Schema must be a dictionary") + + # Basic schema validation + if not schema.get('$schema') or not schema.get('type'): + raise InvalidSchemaError("Schema must have '$schema' and 'type' fields") + + # If jsonschema library is available, use it for full validation + if self.jsonschema_available: + jsonschema.validators.validator_for(schema).check_schema(schema) + + except (SchemaError, TypeError, AttributeError) as e: + raise InvalidSchemaError(f"Invalid JSON schema: {e}") from e + + def _compare_structures(self, document_schema: Dict[str, Any], expected_schema: Dict[str, Any]) -> bool: + """ + Compare a document's actual structure against expected schema requirements. + + This method performs the core validation logic by analyzing whether the + document's generated schema satisfies the requirements defined in the + expected schema. 
+ + Args: + document_schema: Schema generated from the actual document + expected_schema: Expected schema requirements + + Returns: + True if the document satisfies the expected schema requirements + """ + try: + # Extract actual document structure + doc_properties = document_schema.get('properties', {}) + expected_properties = expected_schema.get('properties', {}) + + # Check all required properties are present + required_properties = expected_schema.get('required', []) + for prop in required_properties: + if prop not in doc_properties: + return False + + # Validate heading structure if specified + if 'headings' in expected_properties and 'headings' in doc_properties: + if not self._validate_heading_structure( + doc_properties['headings'], + expected_properties['headings'] + ): + return False + + # Validate other structural elements + structural_elements = ['paragraphs', 'lists', 'code_blocks', 'blockquotes', 'tables'] + for element in structural_elements: + if element in expected_properties: + if not self._validate_structural_element( + doc_properties.get(element), + expected_properties[element] + ): + return False + + return True + + except Exception: + # If comparison fails for any reason, consider validation failed + return False + + def _validate_heading_structure(self, actual_headings: Dict[str, Any], expected_headings: Dict[str, Any]) -> bool: + """ + Validate heading structure against expected requirements. 
+ + Args: + actual_headings: Actual heading structure from document + expected_headings: Expected heading requirements + + Returns: + True if heading structure meets requirements + """ + actual_heading_props = actual_headings.get('properties', {}) + expected_heading_props = expected_headings.get('properties', {}) + required_heading_levels = expected_headings.get('required', []) + + # Check required heading levels are present + for level in required_heading_levels: + if level not in actual_heading_props: + return False + + # Check each expected heading level meets requirements + for level, expected_spec in expected_heading_props.items(): + if level not in actual_heading_props: + # If level is not required, skip it + if level not in required_heading_levels: + continue + return False + + actual_spec = actual_heading_props[level] + + # Check minimum and maximum item requirements + if not self._validate_array_constraints(actual_spec, expected_spec): + return False + + return True + + def _validate_structural_element(self, actual_element: Dict[str, Any], expected_element: Dict[str, Any]) -> bool: + """ + Validate a structural element (paragraphs, lists, etc.) against requirements. + + Args: + actual_element: Actual element structure from document + expected_element: Expected element requirements + + Returns: + True if element meets requirements + """ + if actual_element is None: + # Element doesn't exist in document + return False + + return self._validate_array_constraints(actual_element, expected_element) + + def _validate_array_constraints(self, actual: Dict[str, Any], expected: Dict[str, Any]) -> bool: + """ + Validate array constraints (minItems, maxItems) for structural elements. 
+ + Args: + actual: Actual element specification + expected: Expected element specification + + Returns: + True if constraints are satisfied + """ + # Get actual count from the schema specification + # For generated schemas, we use minItems/maxItems which represent actual counts + actual_min = actual.get('minItems', 0) + actual_max = actual.get('maxItems', actual_min) + actual_count = actual_max # In our generated schemas, min=max=actual count + + # Check against expected constraints + expected_min = expected.get('minItems', 0) + expected_max = expected.get('maxItems', float('inf')) + + return expected_min <= actual_count <= expected_max + + # Issue #8: Detailed Error Reporting Methods + + def validate_file_with_errors(self, file_path: Path, schema: Dict[str, Any]) -> ValidationErrorCollector: + """ + Validate a markdown file against a JSON schema and collect detailed errors. + + This method provides comprehensive error reporting for Issue #8, enabling + users to understand exactly how their documents deviate from schemas. 
def validate_file_with_errors_string(self, file_path: Path, schema_json: str) -> "ValidationErrorCollector":
    """
    Collect detailed validation errors for a schema given as JSON text.

    The text is decoded first; the error collection itself is delegated
    to ``validate_file_with_errors``.

    Args:
        file_path: Path to the markdown file
        schema_json: JSON schema as a string

    Returns:
        Whatever ``validate_file_with_errors`` returns for the decoded schema

    Raises:
        InvalidSchemaError: If ``schema_json`` is not valid JSON
    """
    try:
        decoded = json.loads(schema_json)
    except json.JSONDecodeError as exc:
        raise InvalidSchemaError(f"Invalid JSON schema string: {exc}") from exc
    else:
        return self.validate_file_with_errors(file_path, decoded)
+ + Args: + document_schema: Schema generated from the actual document + expected_schema: Expected schema requirements + error_collector: Collector to accumulate validation errors + """ + try: + # Extract actual document structure + doc_properties = document_schema.get('properties', {}) + expected_properties = expected_schema.get('properties', {}) + + # Check all required properties are present + required_properties = expected_schema.get('required', []) + for prop in required_properties: + if prop not in doc_properties: + error_collector.add_error( + ValidationErrorType.MISSING_REQUIRED_SECTION, + f"Missing required section: '{prop}'", + f"document.{prop}", + expected=f"Section '{prop}' is required by schema", + actual="Section not found", + suggestion=f"Add the '{prop}' section to your document" + ) + + # Validate heading structure if specified + if 'headings' in expected_properties and 'headings' in doc_properties: + self._validate_heading_structure_with_errors( + doc_properties['headings'], + expected_properties['headings'], + error_collector + ) + + # Validate other structural elements + structural_elements = ['paragraphs', 'lists', 'code_blocks', 'blockquotes', 'tables'] + for element in structural_elements: + if element in expected_properties: + self._validate_structural_element_with_errors( + doc_properties.get(element), + expected_properties[element], + element, + error_collector + ) + + except Exception as e: + error_collector.add_error( + ValidationErrorType.STRUCTURAL_VIOLATION, + f"Error during structure comparison: {e}", + "document.structure", + suggestion="Check if both the document and schema are properly formatted" + ) + + def _validate_heading_structure_with_errors( + self, + actual_headings: Dict[str, Any], + expected_headings: Dict[str, Any], + error_collector: ValidationErrorCollector + ) -> None: + """ + Validate heading structure and collect detailed errors. 
+ + Args: + actual_headings: Actual heading structure from document + expected_headings: Expected heading requirements + error_collector: Collector for validation errors + """ + actual_heading_props = actual_headings.get('properties', {}) + expected_heading_props = expected_headings.get('properties', {}) + required_heading_levels = expected_headings.get('required', []) + + # Check required heading levels are present + for level in required_heading_levels: + if level not in actual_heading_props: + level_num = level.replace('level_', '') + error_collector.add_error( + ValidationErrorType.MISSING_REQUIRED_HEADING, + f"Missing required heading level {level_num}", + f"headings.{level}", + expected=f"At least one heading at level {level_num}", + actual="No headings found at this level", + suggestion=f"Add heading(s) at level {level_num} (e.g., {'#' * int(level_num)} Heading)" + ) + + # Check each expected heading level meets requirements + for level, expected_spec in expected_heading_props.items(): + if level not in actual_heading_props: + # If level is not required, skip it + if level not in required_heading_levels: + continue + # Already handled above in required check + + else: + actual_spec = actual_heading_props[level] + level_num = level.replace('level_', '') + + # Check minimum and maximum item requirements + self._validate_array_constraints_with_errors( + actual_spec, + expected_spec, + f"headings.{level}", + f"level {level_num} headings", + error_collector + ) + + def _validate_structural_element_with_errors( + self, + actual_element: Dict[str, Any], + expected_element: Dict[str, Any], + element_name: str, + error_collector: ValidationErrorCollector + ) -> None: + """ + Validate a structural element and collect errors. 
+ + Args: + actual_element: Actual element structure from document + expected_element: Expected element requirements + element_name: Name of the structural element (for error messages) + error_collector: Collector for validation errors + """ + if actual_element is None: + error_collector.add_error( + ValidationErrorType.MISSING_REQUIRED_SECTION, + f"Missing required structural element: {element_name}", + f"content.{element_name}", + expected=f"Document should contain {element_name}", + actual="Element not found", + suggestion=f"Add {element_name} to your document" + ) + return + + self._validate_array_constraints_with_errors( + actual_element, + expected_element, + f"content.{element_name}", + element_name, + error_collector + ) + + def _validate_array_constraints_with_errors( + self, + actual: Dict[str, Any], + expected: Dict[str, Any], + path: str, + element_description: str, + error_collector: ValidationErrorCollector + ) -> None: + """ + Validate array constraints and collect specific errors. 
+ + Args: + actual: Actual element specification + expected: Expected element specification + path: JSON path for error location + element_description: Human-readable element description + error_collector: Collector for validation errors + """ + # Get actual count from the schema specification + actual_min = actual.get('minItems', 0) + actual_max = actual.get('maxItems', actual_min) + actual_count = actual_max # In our generated schemas, min=max=actual count + + # Check against expected constraints + expected_min = expected.get('minItems', 0) + expected_max = expected.get('maxItems', float('inf')) + + # Check minimum constraint + if actual_count < expected_min: + error_collector.add_error( + ValidationErrorType.INSUFFICIENT_CONTENT, + f"Insufficient {element_description}: found {actual_count}, required at least {expected_min}", + path, + expected=f"At least {expected_min} {element_description}", + actual=f"{actual_count} {element_description}", + suggestion=f"Add {expected_min - actual_count} more {element_description}" + ) + + # Check maximum constraint + if expected_max != float('inf') and actual_count > expected_max: + error_collector.add_error( + ValidationErrorType.EXCESS_CONTENT, + f"Too many {element_description}: found {actual_count}, maximum allowed {expected_max}", + path, + expected=f"At most {expected_max} {element_description}", + actual=f"{actual_count} {element_description}", + suggestion=f"Remove {actual_count - expected_max} {element_description}" + ) + + def _has_heading_text_constraints(self, schema: Dict[str, Any]) -> bool: + """ + Check if the schema has heading text constraints (enum values on heading content). 
+ + Args: + schema: JSON schema to check + + Returns: + True if schema has heading text constraints + """ + headings_props = schema.get('properties', {}).get('headings', {}).get('properties', {}) + + for level_props in headings_props.values(): + items = level_props.get('items', {}) + content_prop = items.get('properties', {}).get('content', {}) + if 'enum' in content_prop: + return True + + return False + + def _validate_with_heading_text_constraints( + self, + file_path: Path, + expected_schema: Dict[str, Any], + document_schema: Dict[str, Any] + ) -> bool: + """ + Validate document with heading text constraints by comparing actual content against enum values. + + Args: + file_path: Path to the markdown file + expected_schema: Schema with heading text constraints + document_schema: Generated schema from the actual document + + Returns: + True if document meets all constraints including heading text + """ + # First check standard structure compliance + if not self._compare_structures(document_schema, expected_schema): + return False + + # Then check heading text constraints + expected_headings = expected_schema.get('properties', {}).get('headings', {}).get('properties', {}) + + # Generate document analysis with actual heading content + content = file_path.read_text(encoding='utf-8') + ast_tokens = parse_markdown_to_ast(content) + structure_analysis = self.schema_generator._analyze_ast_structure(ast_tokens, None) + + for level_key, expected_level_spec in expected_headings.items(): + content_constraints = expected_level_spec.get('items', {}).get('properties', {}).get('content', {}) + + if 'enum' in content_constraints: + allowed_texts = content_constraints['enum'] + actual_headings = structure_analysis['headings'].get(level_key, []) + + for heading in actual_headings: + actual_text = heading['content'] + if actual_text not in allowed_texts: + return False + + return True + + def _validate_heading_text_constraints_with_errors( + self, + file_path: Path, + 
expected_schema: Dict[str, Any], + error_collector: ValidationErrorCollector + ) -> None: + """ + Validate heading text constraints and collect detailed errors. + + Args: + file_path: Path to the markdown file + expected_schema: Schema with heading text constraints + error_collector: Collector for validation errors + """ + expected_headings = expected_schema.get('properties', {}).get('headings', {}).get('properties', {}) + + # Generate document analysis with actual heading content + content = file_path.read_text(encoding='utf-8') + ast_tokens = parse_markdown_to_ast(content) + structure_analysis = self.schema_generator._analyze_ast_structure(ast_tokens, None) + + for level_key, expected_level_spec in expected_headings.items(): + content_constraints = expected_level_spec.get('items', {}).get('properties', {}).get('content', {}) + + if 'enum' in content_constraints: + allowed_texts = content_constraints['enum'] + actual_headings = structure_analysis['headings'].get(level_key, []) + + for i, heading in enumerate(actual_headings): + actual_text = heading['content'] + if actual_text not in allowed_texts: + # Add detailed error about heading text mismatch + error_collector.add_error( + ValidationErrorType.HEADING_COUNT_MISMATCH, + f"Heading text mismatch at {level_key.replace('_', ' ')} #{i+1}: expected one of {allowed_texts}, found '{actual_text}'", + f"headings.{level_key}[{i}].content", + expected=f"One of: {allowed_texts}", + actual=actual_text, + suggestion=f"Change heading text to one of the allowed values: {', '.join(allowed_texts)}" + ) diff --git a/markitect/schema_analyzer.py b/markitect/schema_analyzer.py index 94d259be..97563b84 100644 --- a/markitect/schema_analyzer.py +++ b/markitect/schema_analyzer.py @@ -1,352 +1,25 @@ """ -Schema Analyzer for Phase 2: Schema Refinement Tools +Schema Analyzer - Backward Compatibility Module. -Analyzes JSON schemas to detect rigidity issues and provide suggestions -for improvement using the Phase 1 classification system. 
+This module re-exports from markitect.schema.analyzer for backward compatibility. +New code should import from markitect.schema.analyzer directly. """ -from pathlib import Path -from typing import Dict, Any, List, Optional, Tuple -import json -from dataclasses import dataclass, field -from enum import Enum +# Re-export from schema package for backward compatibility +from markitect.schema.analyzer import ( + SchemaAnalyzer, + SchemaAnalysisResult, + SchemaIssue, + IssueType, + IssueSeverity, + analyze_schema_cli, +) - -class IssueType(Enum): - """Types of schema rigidity issues.""" - EXACT_COUNT = "exact_count" - MISSING_CLASSIFICATIONS = "missing_classifications" - MISSING_CONTENT_INSTRUCTIONS = "missing_content_instructions" - OVERLY_SPECIFIC = "overly_specific" - NO_FLEXIBILITY = "no_flexibility" - DEPRECATED_EXTENSIONS = "deprecated_extensions" - - -class IssueSeverity(Enum): - """Severity levels for schema issues.""" - INFO = "info" - WARNING = "warning" - ERROR = "error" - - -@dataclass -class SchemaIssue: - """Represents a detected schema issue.""" - issue_type: IssueType - severity: IssueSeverity - path: str - message: str - suggestion: str - current_value: Any = None - suggested_value: Any = None - - -@dataclass -class SchemaAnalysisResult: - """Results of schema analysis.""" - is_rigid: bool - rigidity_score: int # 0-100, higher = more rigid - issues: List[SchemaIssue] = field(default_factory=list) - has_classifications: bool = False - has_content_control: bool = False - uses_deprecated_extensions: bool = False - - @property - def issue_count_by_severity(self) -> Dict[IssueSeverity, int]: - """Count issues by severity.""" - counts = {severity: 0 for severity in IssueSeverity} - for issue in self.issues: - counts[issue.severity] += 1 - return counts - - -class SchemaAnalyzer: - """Analyzes schemas for rigidity and suggests improvements.""" - - def __init__(self): - """Initialize the schema analyzer.""" - self.deprecated_extensions = [ - 
"x-markitect-required-sections", - "x-markitect-recommended-sections", - "x-markitect-optional-sections" - ] - - def analyze_schema(self, schema: Dict[str, Any]) -> SchemaAnalysisResult: - """ - Analyze a schema for rigidity issues. - - Args: - schema: The JSON schema to analyze - - Returns: - SchemaAnalysisResult with detected issues and suggestions - """ - result = SchemaAnalysisResult(is_rigid=False, rigidity_score=0) - - # Check for Phase 1 features - result.has_classifications = "x-markitect-sections" in schema - result.has_content_control = "x-markitect-content-control" in schema - - # Check for deprecated extensions - for deprecated in self.deprecated_extensions: - if deprecated in schema: - result.uses_deprecated_extensions = True - result.issues.append(SchemaIssue( - issue_type=IssueType.DEPRECATED_EXTENSIONS, - severity=IssueSeverity.WARNING, - path=deprecated, - message=f"Using deprecated extension '{deprecated}'", - suggestion=f"Migrate to 'x-markitect-sections' with classification system" - )) - - # Analyze properties for rigidity - if "properties" in schema: - self._analyze_properties(schema["properties"], result, "properties") - - # Check for missing classifications - if not result.has_classifications: - result.issues.append(SchemaIssue( - issue_type=IssueType.MISSING_CLASSIFICATIONS, - severity=IssueSeverity.INFO, - path="root", - message="Schema does not use section classification system", - suggestion="Add 'x-markitect-sections' to classify sections as required/recommended/optional/discouraged/improper" - )) - - # Check for missing content control - if not result.has_content_control: - result.issues.append(SchemaIssue( - issue_type=IssueType.MISSING_CONTENT_INSTRUCTIONS, - severity=IssueSeverity.INFO, - path="root", - message="Schema does not provide content control", - suggestion="Add 'x-markitect-content-control' for pattern validation and quality metrics" - )) - - # Calculate rigidity score - result.rigidity_score = 
self._calculate_rigidity_score(result) - result.is_rigid = result.rigidity_score > 50 - - return result - - def _analyze_properties(self, properties: Dict[str, Any], result: SchemaAnalysisResult, path: str): - """Analyze schema properties for rigidity issues.""" - for prop_name, prop_def in properties.items(): - prop_path = f"{path}.{prop_name}" - - if not isinstance(prop_def, dict): - continue - - # Check for exact counts (const) - if "const" in prop_def: - result.issues.append(SchemaIssue( - issue_type=IssueType.EXACT_COUNT, - severity=IssueSeverity.WARNING, - path=prop_path, - message=f"Property '{prop_name}' requires exact value", - suggestion=f"Consider using a range or removing constraint for flexibility", - current_value=prop_def["const"] - )) - - # Check for arrays with exact counts - if prop_def.get("type") == "array": - min_items = prop_def.get("minItems") - max_items = prop_def.get("maxItems") - - if min_items is not None and max_items is not None and min_items == max_items: - result.issues.append(SchemaIssue( - issue_type=IssueType.EXACT_COUNT, - severity=IssueSeverity.WARNING, - path=prop_path, - message=f"Array '{prop_name}' requires exactly {min_items} items", - suggestion=f"Use a range like minItems: {max(0, min_items - 2)}, maxItems: {min_items + 5}", - current_value={"minItems": min_items, "maxItems": max_items}, - suggested_value={ - "minItems": max(0, min_items - 2), - "maxItems": min_items + 5 - } - )) - - # Check for overly specific counts (large numbers) - if min_items is not None and min_items > 50: - result.issues.append(SchemaIssue( - issue_type=IssueType.OVERLY_SPECIFIC, - severity=IssueSeverity.INFO, - path=prop_path, - message=f"Array '{prop_name}' has very specific minItems: {min_items}", - suggestion=f"Consider rounding to {(min_items // 10) * 10} for flexibility", - current_value=min_items, - suggested_value=(min_items // 10) * 10 - )) - - # Check for overly specific integer constraints - if prop_def.get("type") == "integer": - if 
"minimum" in prop_def and "maximum" in prop_def: - min_val = prop_def["minimum"] - max_val = prop_def["maximum"] - range_size = max_val - min_val - - if range_size < 3: - result.issues.append(SchemaIssue( - issue_type=IssueType.NO_FLEXIBILITY, - severity=IssueSeverity.INFO, - path=prop_path, - message=f"Integer '{prop_name}' has very narrow range: {min_val}-{max_val}", - suggestion=f"Consider widening range for flexibility", - current_value={"minimum": min_val, "maximum": max_val} - )) - - # Recursively check nested properties - if "properties" in prop_def: - self._analyze_properties(prop_def["properties"], result, prop_path) - - # Check items schema for arrays - if "items" in prop_def and isinstance(prop_def["items"], dict): - if "properties" in prop_def["items"]: - self._analyze_properties( - prop_def["items"]["properties"], - result, - f"{prop_path}.items" - ) - - def _calculate_rigidity_score(self, result: SchemaAnalysisResult) -> int: - """ - Calculate overall rigidity score (0-100). - - Higher score = more rigid schema. - """ - score = 0 - - # Count issues by type with weighted scores - weights = { - IssueType.EXACT_COUNT: 15, - IssueType.OVERLY_SPECIFIC: 10, - IssueType.NO_FLEXIBILITY: 8, - IssueType.MISSING_CLASSIFICATIONS: 5, - IssueType.MISSING_CONTENT_INSTRUCTIONS: 3, - IssueType.DEPRECATED_EXTENSIONS: 5 - } - - for issue in result.issues: - score += weights.get(issue.issue_type, 5) - - # Cap at 100 - return min(100, score) - - def analyze_schema_file(self, schema_path: Path) -> SchemaAnalysisResult: - """ - Analyze a schema file. - - Args: - schema_path: Path to JSON schema file - - Returns: - SchemaAnalysisResult - """ - with open(schema_path) as f: - schema = json.load(f) - - return self.analyze_schema(schema) - - def format_analysis_report(self, result: SchemaAnalysisResult, verbose: bool = False) -> str: - """ - Format analysis results as a human-readable report. 
- - Args: - result: Analysis results - verbose: Include detailed information - - Returns: - Formatted report string - """ - lines = [] - - # Header - lines.append("=" * 70) - lines.append("Schema Analysis Report") - lines.append("=" * 70) - lines.append("") - - # Overall assessment - rigidity_level = "HIGH" if result.rigidity_score > 70 else "MEDIUM" if result.rigidity_score > 40 else "LOW" - lines.append(f"Rigidity Score: {result.rigidity_score}/100 ({rigidity_level})") - lines.append(f"Status: {'RIGID - Needs refinement' if result.is_rigid else 'FLEXIBLE - Good'}") - lines.append("") - - # Features check - lines.append("Phase 1 Features:") - lines.append(f" ✓ Classifications: {'Yes' if result.has_classifications else 'No'}") - lines.append(f" ✓ Content Control: {'Yes' if result.has_content_control else 'No'}") - if result.uses_deprecated_extensions: - lines.append(f" ⚠ Deprecated Extensions: Yes (needs migration)") - lines.append("") - - # Issue summary - counts = result.issue_count_by_severity - lines.append(f"Issues Found: {len(result.issues)} total") - lines.append(f" - Errors: {counts[IssueSeverity.ERROR]}") - lines.append(f" - Warnings: {counts[IssueSeverity.WARNING]}") - lines.append(f" - Info: {counts[IssueSeverity.INFO]}") - lines.append("") - - # List issues - if result.issues: - lines.append("Detected Issues:") - lines.append("-" * 70) - - for i, issue in enumerate(result.issues, 1): - severity_icon = "❌" if issue.severity == IssueSeverity.ERROR else "⚠️ " if issue.severity == IssueSeverity.WARNING else "ℹ️ " - lines.append(f"{i}. 
{severity_icon} {issue.message}") - lines.append(f" Path: {issue.path}") - lines.append(f" Suggestion: {issue.suggestion}") - - if verbose and issue.current_value is not None: - lines.append(f" Current: {json.dumps(issue.current_value)}") - if verbose and issue.suggested_value is not None: - lines.append(f" Suggested: {json.dumps(issue.suggested_value)}") - - lines.append("") - else: - lines.append("✅ No issues found - schema is well-designed!") - lines.append("") - - # Recommendations - if result.is_rigid: - lines.append("Recommendations:") - lines.append("-" * 70) - lines.append("Run: markitect schema-refine --loosen-counts") - lines.append(" to automatically apply suggested improvements") - lines.append("") - - return "\n".join(lines) - - -def analyze_schema_cli(schema_path: str, verbose: bool = False) -> int: - """ - CLI entry point for schema analysis. - - Args: - schema_path: Path to schema file - verbose: Show detailed information - - Returns: - Exit code (0 = success, 1 = rigid schema found) - """ - analyzer = SchemaAnalyzer() - - try: - result = analyzer.analyze_schema_file(Path(schema_path)) - report = analyzer.format_analysis_report(result, verbose=verbose) - print(report) - - return 1 if result.is_rigid else 0 - - except FileNotFoundError: - print(f"Error: Schema file not found: {schema_path}") - return 2 - except json.JSONDecodeError as e: - print(f"Error: Invalid JSON in schema file: {e}") - return 2 - except Exception as e: - print(f"Error: {e}") - return 2 +__all__ = [ + 'SchemaAnalyzer', + 'SchemaAnalysisResult', + 'SchemaIssue', + 'IssueType', + 'IssueSeverity', + 'analyze_schema_cli', +] diff --git a/markitect/schema_generator.py b/markitect/schema_generator.py index 5a464c6d..48509ee8 100644 --- a/markitect/schema_generator.py +++ b/markitect/schema_generator.py @@ -1,466 +1,11 @@ """ -Schema Generator for Issue #5: Generate a Schema from a Markdown File. +Schema Generator - Backward Compatibility Module. 
-This module provides functionality to analyze markdown AST structures and generate -JSON schemas that describe the document's structural elements with configurable -depth limitations for architectural documentation analysis. +This module re-exports from markitect.schema.generator for backward compatibility. +New code should import from markitect.schema.generator directly. """ -import json -from collections import defaultdict -from pathlib import Path -from typing import Dict, List, Any, Optional, Set +# Re-export from schema package for backward compatibility +from markitect.schema.generator import SchemaGenerator -from .parser import parse_markdown_to_ast -from .exceptions import FileNotFoundError, InvalidDepthError, InvalidInstructionTypeError - - -class SchemaGenerator: - """ - Generates JSON schemas from markdown file AST structures. - - Analyzes the structural elements of markdown documents and creates - JSON schemas that can be used for validation and compliance checking - in architecture documentation workflows. - """ - - def __init__(self): - """Initialize the schema generator.""" - self.default_schema_url = "http://json-schema.org/draft-07/schema#" - - def generate_schema_from_file( - self, - file_path: Path, - max_depth: Optional[int] = None, - mode: Optional[str] = None, - outline_depth: Optional[int] = None, - capture_heading_text: bool = False, - include_content_instructions: bool = False, - instruction_type: str = 'description' - ) -> Dict[str, Any]: - """ - Generate a JSON schema from a markdown file's AST structure. 
- - Args: - file_path: Path to the markdown file - max_depth: Maximum heading depth to include (None = unlimited) - mode: Generation mode ('outline' for structure-focused schemas) - outline_depth: Depth limit for outline mode - capture_heading_text: Whether to capture exact heading text as constraints - include_content_instructions: Whether to include content instruction fields - instruction_type: Type of content instructions ('description', 'example', 'constraint', 'template') - - Returns: - JSON schema as a dictionary - - Raises: - FileNotFoundError: If the markdown file doesn't exist - InvalidDepthError: If max_depth is invalid (< 1) - """ - # Validate inputs - if not file_path.exists(): - raise FileNotFoundError(f"Markdown file not found: {file_path}") - - if max_depth is not None and max_depth < 1: - raise InvalidDepthError(f"max_depth must be >= 1, got: {max_depth}") - - # Validate instruction type - valid_instruction_types = {'description', 'example', 'constraint', 'template'} - if instruction_type not in valid_instruction_types: - raise InvalidInstructionTypeError(f"Invalid instruction type '{instruction_type}'. Must be one of: {', '.join(valid_instruction_types)}") - - # Read and parse the markdown file - content = file_path.read_text(encoding='utf-8') - ast_tokens = parse_markdown_to_ast(content) - - # Analyze the AST structure - structure_analysis = self._analyze_ast_structure(ast_tokens, max_depth) - - # Generate the JSON schema - schema = self._create_json_schema( - structure_analysis, - file_path.name, - mode=mode, - outline_depth=outline_depth, - capture_heading_text=capture_heading_text, - include_content_instructions=include_content_instructions, - instruction_type=instruction_type - ) - - return schema - - def _analyze_ast_structure(self, tokens: List[Dict[str, Any]], max_depth: Optional[int]) -> Dict[str, Any]: - """ - Analyze AST tokens to extract structural patterns. 
- - Args: - tokens: List of AST tokens from markdown-it - max_depth: Maximum heading depth to analyze - - Returns: - Dictionary containing structural analysis - """ - analysis = { - 'headings': defaultdict(list), - 'paragraphs': [], - 'lists': [], - 'code_blocks': [], - 'blockquotes': [], - 'tables': [], - 'links': [], - 'images': [], - 'emphasis': [], - 'structure_types': set() - } - - current_heading_level = 0 - i = 0 - - while i < len(tokens): - token = tokens[i] - token_type = token.get('type', '') - - # Track all structural types found - analysis['structure_types'].add(token_type) - - # Analyze headings with depth filtering - if token_type == 'heading_open': - level = self._extract_heading_level(token.get('tag', '')) - if max_depth is None or level <= max_depth: - heading_content = self._extract_heading_content(tokens, i) - analysis['headings'][f'level_{level}'].append({ - 'content': heading_content, - 'level': level, - 'position': i - }) - current_heading_level = level - - # Analyze paragraphs - elif token_type == 'paragraph_open': - paragraph_content = self._extract_paragraph_content(tokens, i) - analysis['paragraphs'].append({ - 'content': paragraph_content, - 'position': i, - 'under_heading_level': current_heading_level - }) - - # Analyze lists - elif token_type in ['bullet_list_open', 'ordered_list_open']: - list_structure = self._extract_list_structure(tokens, i) - analysis['lists'].append({ - 'type': 'bullet' if token_type == 'bullet_list_open' else 'ordered', - 'structure': list_structure, - 'position': i, - 'under_heading_level': current_heading_level - }) - - # Analyze code blocks - elif token_type == 'code_block' or token_type == 'fence': - code_info = self._extract_code_block_info(token) - analysis['code_blocks'].append({ - 'language': code_info.get('language', ''), - 'content_length': len(code_info.get('content', '')), - 'position': i, - 'under_heading_level': current_heading_level - }) - - # Analyze blockquotes - elif token_type == 
'blockquote_open': - quote_content = self._extract_blockquote_content(tokens, i) - analysis['blockquotes'].append({ - 'content': quote_content, - 'position': i, - 'under_heading_level': current_heading_level - }) - - # Analyze tables - elif token_type == 'table_open': - table_structure = self._extract_table_structure(tokens, i) - analysis['tables'].append({ - 'columns': table_structure.get('columns', 0), - 'rows': table_structure.get('rows', 0), - 'position': i, - 'under_heading_level': current_heading_level - }) - - # Analyze inline elements - elif token_type == 'inline': - inline_analysis = self._analyze_inline_content(token) - analysis['links'].extend(inline_analysis.get('links', [])) - analysis['images'].extend(inline_analysis.get('images', [])) - analysis['emphasis'].extend(inline_analysis.get('emphasis', [])) - - i += 1 - - # Convert sets to lists for JSON serialization - analysis['structure_types'] = list(analysis['structure_types']) - - return analysis - - def _create_json_schema( - self, - analysis: Dict[str, Any], - filename: str, - mode: Optional[str] = None, - outline_depth: Optional[int] = None, - capture_heading_text: bool = False, - include_content_instructions: bool = False, - instruction_type: str = 'description' - ) -> Dict[str, Any]: - """ - Create a JSON schema from structural analysis. 
- - Args: - analysis: Structural analysis of the document - filename: Name of the source file - mode: Generation mode ('outline' for structure-focused schemas) - outline_depth: Depth limit for outline mode - capture_heading_text: Whether to capture exact heading text as constraints - include_content_instructions: Whether to include content instruction fields - instruction_type: Type of content instructions to generate - - Returns: - JSON schema dictionary - """ - # Determine title format based on mode - title_preposition = "from" if mode == "outline" else "for" - - schema = { - "$schema": self.default_schema_url, - "type": "object", - "title": f"Schema {title_preposition} {filename}", - "description": f"JSON schema describing the structure of {filename}", - "properties": {} - } - - # Add metaschema extensions for outline mode - if mode == "outline": - schema["x-markitect-outline-mode"] = True - if outline_depth is not None: - schema["x-markitect-outline-depth"] = outline_depth - - # Add metaschema extension for heading text capture - if capture_heading_text: - schema["x-markitect-heading-text-capture"] = True - - # Add metaschema extension for content instructions - if include_content_instructions: - schema["x-markitect-content-instructions-enabled"] = True - - # Add heading structure - if analysis['headings']: - heading_properties = {} - for level_key, headings in analysis['headings'].items(): - if headings: # Only include levels that have content - # Configure content property based on heading text capture - if capture_heading_text: - # Extract actual heading texts in document order - heading_texts = [heading['content'] for heading in headings] - content_property = {"enum": heading_texts} - else: - content_property = {"type": "string"} - - # Build properties for the heading item - item_properties = { - "content": content_property, - "level": {"type": "integer"}, - "position": {"type": "integer"} - } - - # Add content instruction fields if enabled - if 
include_content_instructions: - # Generate appropriate instruction text based on heading level - level_num = int(level_key.split('_')[1]) - section_name = f"level {level_num} heading" - instruction_text = self._generate_content_instruction(section_name, instruction_type) - - item_properties["x-markitect-content-instructions"] = { - "type": "string", - "const": instruction_text - } - - item_properties["x-markitect-instruction-type"] = { - "type": "string", - "enum": [instruction_type] - } - - heading_properties[level_key] = { - "type": "array", - "description": f"Headings at {level_key.replace('_', ' ')}", - "items": { - "type": "object", - "properties": item_properties, - "required": ["content", "level"] - }, - "minItems": len(headings), - "maxItems": len(headings) - } - - if heading_properties: - schema["properties"]["headings"] = { - "type": "object", - "description": "Document heading structure", - "properties": heading_properties - } - - # Add other structural elements - structural_elements = { - "paragraphs": ("Text paragraphs", analysis['paragraphs']), - "lists": ("Lists (ordered and unordered)", analysis['lists']), - "code_blocks": ("Code blocks and fenced code", analysis['code_blocks']), - "blockquotes": ("Block quotations", analysis['blockquotes']), - "tables": ("Tables with rows and columns", analysis['tables']), - "links": ("Links to external resources", analysis['links']), - "images": ("Embedded images", analysis['images']), - "emphasis": ("Text emphasis (bold, italic)", analysis['emphasis']) - } - - for element_name, (description, element_list) in structural_elements.items(): - if element_list: - # Build base schema for the element - element_schema = { - "type": "array", - "description": description, - "minItems": len(element_list), - "maxItems": len(element_list) - } - - # Add content instructions for paragraphs and lists if enabled - if include_content_instructions and element_name in ["paragraphs", "lists"]: - element_schema["items"] = { - "type": 
"object", - "properties": { - "content": {"type": "string"}, - "x-markitect-content-instructions": { - "type": "string", - "const": self._generate_content_instruction(element_name, instruction_type) - }, - "x-markitect-instruction-type": { - "type": "string", - "enum": [instruction_type] - } - } - } - - schema["properties"][element_name] = element_schema - - # Add metadata - schema["properties"]["metadata"] = { - "type": "object", - "description": "Document structure metadata", - "properties": { - "total_elements": { - "type": "integer", - "const": sum(len(v) if isinstance(v, list) else 0 for v in analysis.values()) - }, - "structure_types": { - "type": "array", - "items": {"type": "string"}, - "description": "All structural element types found", - "const": analysis['structure_types'] - } - } - } - - return schema - - def _extract_heading_level(self, tag: str) -> int: - """Extract heading level from HTML tag (h1, h2, etc.).""" - if tag.startswith('h') and len(tag) == 2: - try: - return int(tag[1]) - except ValueError: - pass - return 1 - - def _extract_heading_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str: - """Extract text content from heading tokens.""" - # Look for the inline token that contains the heading text - for i in range(start_index, min(start_index + 3, len(tokens))): - token = tokens[i] - if token.get('type') == 'inline': - return token.get('content', '') - return '' - - def _extract_paragraph_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str: - """Extract text content from paragraph tokens.""" - # Look for the inline token that contains the paragraph text - for i in range(start_index, min(start_index + 3, len(tokens))): - token = tokens[i] - if token.get('type') == 'inline': - return token.get('content', '') - return '' - - def _extract_list_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]: - """Extract list structure information.""" - # This is a simplified implementation - # 
In a full implementation, we'd parse the nested list structure - return { - "type": "list", - "estimated_items": 1 # Placeholder - would need more complex parsing - } - - def _extract_code_block_info(self, token: Dict[str, Any]) -> Dict[str, Any]: - """Extract code block information.""" - return { - "language": token.get('info', '').split()[0] if token.get('info') else '', - "content": token.get('content', '') - } - - def _extract_blockquote_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str: - """Extract blockquote content.""" - # Simplified implementation - return "blockquote content" - - def _extract_table_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]: - """Extract table structure information.""" - # Simplified implementation - return { - "columns": 2, # Placeholder - "rows": 1 # Placeholder - } - - def _analyze_inline_content(self, token: Dict[str, Any]) -> Dict[str, List[Any]]: - """Analyze inline content for links, images, emphasis.""" - result = { - "links": [], - "images": [], - "emphasis": [] - } - - # Analyze children tokens if they exist - children = token.get('children', []) - for child in children: - if child and isinstance(child, dict): - child_type = child.get('type', '') - if child_type == 'link_open': - result['links'].append({"type": "link"}) - elif child_type == 'image': - result['images'].append({"type": "image"}) - elif child_type in ['em_open', 'strong_open']: - result['emphasis'].append({"type": child_type}) - - return result - - def _generate_content_instruction(self, heading_text: str, instruction_type: str) -> str: - """ - Generate appropriate content instruction text based on heading and instruction type. 
- - Args: - heading_text: The text of the heading - instruction_type: Type of instruction to generate - - Returns: - Instruction text for the content field - """ - if instruction_type == "description": - return f"Provide content for the '{heading_text}' section" - elif instruction_type == "example": - return f"Example content for the '{heading_text}' section" - elif instruction_type == "constraint": - return f"Content must be relevant to '{heading_text}'" - elif instruction_type == "template": - return f"Template content for '{heading_text}' section" - else: - # Default fallback - return f"Content for the '{heading_text}' section" +__all__ = ['SchemaGenerator'] diff --git a/markitect/schema_loader.py b/markitect/schema_loader.py index af334358..c2a417fe 100644 --- a/markitect/schema_loader.py +++ b/markitect/schema_loader.py @@ -1,610 +1,23 @@ """ -Schema Loader - Extract JSON schemas from markdown files. +Schema Loader - Backward Compatibility Module. -This module provides functionality to load schemas from markdown files that -contain embedded JSON schemas in code blocks, along with YAML frontmatter -metadata and rich documentation. - -Markdown Schema Format: - --- - schema-id: "https://markitect.dev/schemas/domain/v1" - version: "1.0.0" - status: "stable|draft|deprecated" - --- - - # Schema Title v1.0 - - ## Documentation sections... - - ## Schema Definition - - ```json - { - "$schema": "http://json-schema.org/draft-07/schema#", - ... - } - ``` - -This enables: -- Rich documentation alongside schemas -- Version history in same file -- Human-readable schema files -- Markdown-first approach aligned with MarkiTect philosophy +This module re-exports from markitect.schema.loader for backward compatibility. +New code should import from markitect.schema.loader directly. 
""" -import re -import json -import yaml -from pathlib import Path -from typing import Dict, Any, Optional, List, Tuple - - -class SchemaLoaderError(Exception): - """Base exception for schema loading errors.""" - pass - - -class InvalidSchemaFormatError(SchemaLoaderError): - """Schema file format is invalid.""" - pass - - -class SchemaNotFoundError(SchemaLoaderError): - """No JSON schema found in markdown file.""" - pass - - -class MarkdownSchemaLoader: - """ - Load and parse markdown schema files. - - Supports: - - YAML frontmatter for metadata - - JSON code blocks for schema definition - - Validation of schema structure - - Metadata merging - - Example: - >>> loader = MarkdownSchemaLoader() - >>> schema_data = loader.load_schema(Path("manpage-schema-v1.0.md")) - >>> schema = schema_data['schema'] - >>> metadata = schema_data['metadata'] - """ - - def __init__(self): - """Initialize the schema loader with regex patterns.""" - # Pattern to match YAML frontmatter - # Matches: --- ... --- at start of file - self.frontmatter_pattern = re.compile( - r'^---\s*\n(.*?)\n---\s*\n', - re.DOTALL | re.MULTILINE - ) - - # Pattern to match JSON code blocks - # Matches: ```json ... ``` - self.json_code_block_pattern = re.compile( - r'```json\s*\n(.*?)\n```', - re.DOTALL | re.MULTILINE - ) - - # Pattern to find Schema Definition section - # This helps us find the right JSON block if there are multiple - self.schema_section_pattern = re.compile( - r'##\s+Schema Definition\s*\n', - re.MULTILINE - ) - - def load_schema(self, md_path: Path) -> Dict[str, Any]: - """ - Load schema from markdown file. 
- - Args: - md_path: Path to markdown schema file - - Returns: - Dictionary containing: - - schema: Extracted JSON schema (dict) - - metadata: Frontmatter metadata (dict) - - documentation: Full markdown content (str) - - source_file: Source file path (str) - - Raises: - FileNotFoundError: If schema file doesn't exist - InvalidSchemaFormatError: If file format is invalid - SchemaNotFoundError: If no JSON schema found - - Example: - >>> loader = MarkdownSchemaLoader() - >>> data = loader.load_schema(Path("manpage-schema-v1.0.md")) - >>> print(data['schema']['title']) - 'Unix Manual Page Schema' - """ - if not md_path.exists(): - raise FileNotFoundError(f"Schema file not found: {md_path}") - - # Read file content - try: - content = md_path.read_text(encoding='utf-8') - except Exception as e: - raise InvalidSchemaFormatError(f"Failed to read schema file: {e}") - - # Extract frontmatter - metadata = self._extract_frontmatter(content) - - # Extract JSON schema - schema = self._extract_json_schema(content) - - if not schema: - raise SchemaNotFoundError( - f"No JSON schema found in {md_path}. " - f"Expected a ```json code block with schema definition." - ) - - # Merge metadata into schema - schema = self._merge_metadata(schema, metadata, md_path) - - return { - 'schema': schema, - 'metadata': metadata, - 'documentation': content, - 'source_file': str(md_path) - } - - def _extract_frontmatter(self, content: str) -> Dict[str, Any]: - """ - Extract YAML frontmatter from markdown content. 
- - Args: - content: Markdown file content - - Returns: - Dictionary of frontmatter metadata (empty if none found) - - Raises: - InvalidSchemaFormatError: If YAML is malformed - """ - match = self.frontmatter_pattern.search(content) - if not match: - return {} - - yaml_content = match.group(1) - try: - metadata = yaml.safe_load(yaml_content) or {} - if not isinstance(metadata, dict): - raise InvalidSchemaFormatError( - f"Frontmatter must be a YAML dictionary, got {type(metadata)}" - ) - return metadata - except yaml.YAMLError as e: - raise InvalidSchemaFormatError(f"Invalid YAML frontmatter: {e}") - - def _extract_json_schema(self, content: str) -> Optional[Dict[str, Any]]: - """ - Extract JSON schema from markdown code blocks. - - Prefers JSON blocks under "## Schema Definition" section, - but will use first JSON block if no Schema Definition section found. - - Args: - content: Markdown file content - - Returns: - JSON schema dictionary or None if not found - - Raises: - InvalidSchemaFormatError: If JSON is malformed - """ - # Find all JSON code blocks - json_blocks = self.json_code_block_pattern.findall(content) - - if not json_blocks: - return None - - # Try to find the Schema Definition section - schema_section_match = self.schema_section_pattern.search(content) - - if schema_section_match: - # Find JSON block that comes after Schema Definition section - section_pos = schema_section_match.end() - - # Re-search for JSON blocks starting from section position - remaining_content = content[section_pos:] - section_json_blocks = self.json_code_block_pattern.findall(remaining_content) - - if section_json_blocks: - json_text = section_json_blocks[0] - else: - # Fallback to first JSON block in entire document - json_text = json_blocks[0] - else: - # No Schema Definition section, use first JSON block - json_text = json_blocks[0] - - # Parse JSON - try: - schema = json.loads(json_text) - if not isinstance(schema, dict): - raise InvalidSchemaFormatError( - f"Schema must be 
a JSON object, got {type(schema)}" - ) - return schema - except json.JSONDecodeError as e: - raise InvalidSchemaFormatError(f"Invalid JSON schema: {e}") - - def _merge_metadata( - self, - schema: Dict[str, Any], - metadata: Dict[str, Any], - source_file: Path - ) -> Dict[str, Any]: - """ - Merge frontmatter metadata into schema. - - Adds x-markitect-source extension with file info and metadata. - Optionally overrides schema fields with frontmatter values. - - Args: - schema: JSON schema dictionary - metadata: Frontmatter metadata dictionary - source_file: Path to source file - - Returns: - Schema with merged metadata - """ - # Create a copy to avoid modifying original - merged_schema = schema.copy() - - # Add MarkiTect-specific source metadata - merged_schema['x-markitect-source'] = { - 'file': str(source_file), - 'filename': source_file.name, - 'format': 'markdown', - 'frontmatter': metadata - } - - # Override schema fields with frontmatter if present - # This allows frontmatter to be the source of truth for metadata - if 'version' in metadata: - merged_schema['version'] = metadata['version'] - - if 'schema-id' in metadata: - merged_schema['$id'] = metadata['schema-id'] - - if 'status' in metadata: - if 'x-markitect-metadata' not in merged_schema: - merged_schema['x-markitect-metadata'] = {} - merged_schema['x-markitect-metadata']['status'] = metadata['status'] - - return merged_schema - - def save_schema( - self, - schema: Dict[str, Any], - md_path: Path, - template: Optional[str] = None, - frontmatter: Optional[Dict[str, Any]] = None - ): - """ - Save schema as markdown file. - - Args: - schema: JSON schema dictionary to save - md_path: Output path for markdown file - template: Optional markdown template string - frontmatter: Optional frontmatter metadata (extracted from schema if not provided) - - Raises: - InvalidSchemaFormatError: If schema is invalid - - Example: - >>> loader = MarkdownSchemaLoader() - >>> loader.save_schema( - ... 
schema={'title': 'My Schema', ...}, - ... md_path=Path('my-schema-v1.0.md') - ... ) - """ - if template: - # Use provided template - content = self._render_template(template, schema, frontmatter) - else: - # Generate basic markdown - content = self._generate_markdown(schema, frontmatter) - - # Create parent directory if needed - md_path.parent.mkdir(parents=True, exist_ok=True) - - # Write file - try: - md_path.write_text(content, encoding='utf-8') - except Exception as e: - raise InvalidSchemaFormatError(f"Failed to write schema file: {e}") - - def _generate_markdown( - self, - schema: Dict[str, Any], - frontmatter: Optional[Dict[str, Any]] = None - ) -> str: - """ - Generate markdown from schema. - - Args: - schema: JSON schema dictionary - frontmatter: Optional frontmatter metadata - - Returns: - Markdown content as string - """ - # Extract metadata from schema - title = schema.get('title', 'Untitled Schema') - version = schema.get('version', '1.0.0') - description = schema.get('description', '') - schema_id = schema.get('$id', '') - - # Build frontmatter - if frontmatter is None: - frontmatter = {} - - # Set defaults - if 'schema-id' not in frontmatter and schema_id: - frontmatter['schema-id'] = schema_id - if 'version' not in frontmatter: - frontmatter['version'] = version - if 'status' not in frontmatter: - frontmatter['status'] = 'draft' - - # Generate frontmatter YAML - frontmatter_yaml = yaml.dump( - frontmatter, - default_flow_style=False, - allow_unicode=True - ).strip() - - # Generate JSON (pretty-printed) - schema_json = json.dumps(schema, indent=2, ensure_ascii=False) - - # Build markdown content - md_content = f"""--- -{frontmatter_yaml} ---- - -# {title} v{version} - -## Overview - -{description} - -## Usage - -```bash -markitect validate document.md --schema {Path(frontmatter.get('schema-id', 'schema')).name} -``` - -## Schema Definition - -```json -{schema_json} -``` - -## Version History - -### v{version} -- Initial version -""" - - return 
md_content - - def _render_template( - self, - template: str, - schema: Dict[str, Any], - frontmatter: Optional[Dict[str, Any]] = None - ) -> str: - """ - Render markdown from template. - - Simple template rendering using string formatting. - For complex templates, consider using Jinja2 or similar. - - Args: - template: Template string - schema: JSON schema dictionary - frontmatter: Optional frontmatter metadata - - Returns: - Rendered markdown content - """ - # Build context for template - context = { - 'title': schema.get('title', 'Untitled'), - 'version': schema.get('version', '1.0.0'), - 'description': schema.get('description', ''), - 'schema_id': schema.get('$id', ''), - 'schema_json': json.dumps(schema, indent=2, ensure_ascii=False), - 'frontmatter': frontmatter or {}, - } - - # Simple template rendering - try: - return template.format(**context) - except KeyError as e: - raise InvalidSchemaFormatError(f"Template missing key: {e}") - - def list_json_blocks(self, content: str) -> List[Tuple[int, str]]: - """ - List all JSON code blocks in markdown content. - - Useful for debugging or when multiple JSON blocks exist. - - Args: - content: Markdown file content - - Returns: - List of (position, json_content) tuples - - Example: - >>> loader = MarkdownSchemaLoader() - >>> content = Path('schema.md').read_text() - >>> blocks = loader.list_json_blocks(content) - >>> print(f"Found {len(blocks)} JSON blocks") - """ - blocks = [] - for match in self.json_code_block_pattern.finditer(content): - blocks.append((match.start(), match.group(1))) - return blocks - - def validate_schema_structure(self, schema: Dict[str, Any]) -> List[str]: - """ - Validate basic schema structure. - - Checks for required JSON Schema fields and MarkiTect conventions. 
- - Args: - schema: JSON schema dictionary - - Returns: - List of warning/error messages (empty if valid) - - Example: - >>> loader = MarkdownSchemaLoader() - >>> issues = loader.validate_schema_structure(schema) - >>> if issues: - ... print("Schema issues:", issues) - """ - issues = [] - - # Check required JSON Schema fields - if '$schema' not in schema: - issues.append("Missing required field: $schema") - - if 'type' not in schema: - issues.append("Missing recommended field: type") - - if 'title' not in schema: - issues.append("Missing recommended field: title") - - if 'description' not in schema: - issues.append("Missing recommended field: description") - - # Check MarkiTect conventions - if 'version' not in schema: - issues.append("Missing MarkiTect convention: version field") - - if '$id' not in schema: - issues.append("Missing recommended field: $id") - - # Check $id format if present - if '$id' in schema: - schema_id = schema['$id'] - if not isinstance(schema_id, str): - issues.append("$id must be a string") - elif not schema_id.startswith('https://'): - issues.append("$id should be a full HTTPS URL") - - return issues - - -def auto_ingest_schemas(db_manager=None, schema_dir: Optional[Path] = None, verbose: bool = False) -> Dict[str, Any]: - """Automatically ingest schemas from markitect/schemas/ directory. - - This function scans the schemas directory for .md schema files and ingests - any that are not already in the database. Useful for post-install setup - or automatic schema registration. 
- - Args: - db_manager: DatabaseManager instance (optional, will create if not provided) - schema_dir: Directory containing schemas (defaults to markitect/schemas/) - verbose: If True, print detailed progress messages - - Returns: - Dictionary with ingestion results: - { - 'ingested': [list of schema names that were ingested], - 'skipped': [list of schema names that were already present], - 'failed': [list of (schema_name, error) tuples for failures] - } - - Example: - >>> from markitect.schema_loader import auto_ingest_schemas - >>> results = auto_ingest_schemas(verbose=True) - >>> print(f"Ingested {len(results['ingested'])} schemas") - """ - # Determine schema directory - if schema_dir is None: - schema_dir = Path(__file__).parent / "schemas" - - if not schema_dir.exists(): - if verbose: - print(f"⚠️ Schema directory not found: {schema_dir}") - return {'ingested': [], 'skipped': [], 'failed': []} - - # Initialize database manager if not provided - if db_manager is None: - from .database import DatabaseManager - db_path = Path.home() / '.markitect' / 'markitect.db' - db_manager = DatabaseManager(str(db_path)) - db_manager.initialize_database() - - # Get list of already ingested schemas - try: - existing_schemas = {schema['name'] for schema in db_manager.list_schemas()} - except Exception as e: - if verbose: - print(f"❌ Error listing existing schemas: {e}") - return {'ingested': [], 'skipped': [], 'failed': []} - - results = { - 'ingested': [], - 'skipped': [], - 'failed': [] - } - - # Find all schema files - schema_files = list(schema_dir.glob("*-schema-v*.md")) - - if verbose and schema_files: - print(f"🔍 Found {len(schema_files)} schema file(s) in {schema_dir}") - - loader = MarkdownSchemaLoader() - - for schema_file in sorted(schema_files): - schema_name = schema_file.name - - # Skip if already ingested - if schema_name in existing_schemas: - results['skipped'].append(schema_name) - if verbose: - print(f"⏭️ Skipping {schema_name} (already ingested)") - continue 
- - # Try to ingest - try: - # Load schema - schema_data_full = loader.load_schema(schema_file) - schema_data = schema_data_full['schema'] - - # Store in database - schema_content = json.dumps(schema_data, indent=2) - record_id = db_manager.store_schema_file(schema_name, schema_content) - - if record_id: - results['ingested'].append(schema_name) - if verbose: - title = schema_data.get('title', schema_name) - print(f"✅ Ingested {schema_name} (title: {title})") - else: - results['failed'].append((schema_name, "Failed to store in database")) - if verbose: - print(f"❌ Failed to store {schema_name} in database") - - except Exception as e: - results['failed'].append((schema_name, str(e))) - if verbose: - print(f"❌ Failed to ingest {schema_name}: {e}") - - if verbose: - print(f"\n📊 Auto-ingestion complete:") - print(f" Ingested: {len(results['ingested'])}") - print(f" Skipped: {len(results['skipped'])}") - print(f" Failed: {len(results['failed'])}") - - return results +# Re-export from schema package for backward compatibility +from markitect.schema.loader import ( + MarkdownSchemaLoader, + SchemaLoaderError, + InvalidSchemaFormatError, + SchemaNotFoundError, + auto_ingest_schemas, +) + +__all__ = [ + 'MarkdownSchemaLoader', + 'SchemaLoaderError', + 'InvalidSchemaFormatError', + 'SchemaNotFoundError', + 'auto_ingest_schemas', +] diff --git a/markitect/schema_naming.py b/markitect/schema_naming.py index cf3f0095..b00aa3bd 100644 --- a/markitect/schema_naming.py +++ b/markitect/schema_naming.py @@ -1,309 +1,35 @@ """ -Schema Naming Validation - Enforce filename conventions for schemas. +Schema Naming - Backward Compatibility Module. -This module provides validation and utilities for schema filename conventions -to ensure consistency across the MarkiTect schema ecosystem. 
- -Naming Convention: - Format: {domain}-schema-v{major}.{minor}.md - - Components: - - domain: lowercase, hyphen-separated identifier (e.g., "manpage", "api-documentation") - - schema: literal string "schema" - - version: SemVer major.minor (e.g., "v1.0", "v2.1") - - extension: ".md" (markdown) - - Valid Examples: - ✓ manpage-schema-v1.0.md - ✓ terminology-schema-v1.0.md - ✓ api-documentation-schema-v1.0.md - ✓ my-custom-type-schema-v2.1.md - - Invalid Examples: - ✗ manpage.json (missing version and wrong extension) - ✗ manpage-v1.md (missing "schema" keyword) - ✗ ManPage-Schema-v1.0.md (wrong case - must be lowercase) - ✗ manpage-schema-1.0.md (missing 'v' prefix) - ✗ manpage-schema-v1.md (missing minor version) +This module re-exports from markitect.schema.naming for backward compatibility. +New code should import from markitect.schema.naming directly. """ -import re -from pathlib import Path -from typing import Tuple, Optional, Dict, Any - - -# Regex pattern for schema filename validation -# Matches: {domain}-schema-v{major}.{minor}.md -# Where domain is lowercase letters/numbers/hyphens starting with letter -SCHEMA_FILENAME_PATTERN = re.compile( - r'^(?P[a-z][a-z0-9-]*)-schema-v(?P\d+)\.(?P\d+)\.md$' +# Re-export from schema package for backward compatibility +from markitect.schema.naming import ( + validate_schema_filename, + suggest_valid_filename, + suggest_schema_filename, + extract_schema_domain, + get_schema_version, + extract_schema_metadata, + get_validation_errors, + is_valid_schema_filename, + format_validation_message, + SchemaFilenameError, + SCHEMA_FILENAME_PATTERN, ) - -class SchemaFilenameError(Exception): - """Exception raised for invalid schema filenames.""" - pass - - -def validate_schema_filename(filename: str) -> Tuple[bool, Optional[Dict[str, Any]]]: - """ - Validate schema filename against naming convention. 
- - Args: - filename: The filename to validate (e.g., "manpage-schema-v1.0.md") - - Returns: - Tuple of (is_valid, metadata_dict or None) - - If valid, metadata_dict contains: - - domain: str - The domain identifier - - version: str - Full version string (e.g., "1.0") - - major: int - Major version number - - minor: int - Minor version number - - filename: str - The original filename - - If invalid, metadata_dict is None - - Examples: - >>> validate_schema_filename("manpage-schema-v1.0.md") - (True, {'domain': 'manpage', 'version': '1.0', ...}) - - >>> validate_schema_filename("invalid.json") - (False, None) - """ - match = SCHEMA_FILENAME_PATTERN.match(filename) - - if not match: - return False, None - - return True, { - 'domain': match.group('domain'), - 'version': f"{match.group('major')}.{match.group('minor')}", - 'major': int(match.group('major')), - 'minor': int(match.group('minor')), - 'filename': filename - } - - -def suggest_schema_filename( - domain: str, - version: str = "1.0", - normalize: bool = True -) -> str: - """ - Generate a valid schema filename from domain and version. 
- - Args: - domain: The schema domain (e.g., "manpage", "API Documentation") - version: Version string in format "major.minor" (default: "1.0") - normalize: Whether to normalize domain to lowercase/hyphenated - - Returns: - Valid schema filename - - Raises: - ValueError: If domain or version format is invalid - - Examples: - >>> suggest_schema_filename("manpage", "1.0") - 'manpage-schema-v1.0.md' - - >>> suggest_schema_filename("API Documentation", "2.1") - 'api-documentation-schema-v2.1.md' - - >>> suggest_schema_filename("My_Custom_Type", "1.0") - 'my-custom-type-schema-v1.0.md' - """ - if not domain: - raise ValueError("Domain cannot be empty") - - if normalize: - # Normalize domain: lowercase, replace spaces/underscores with hyphens - domain_clean = domain.lower() - domain_clean = domain_clean.replace(' ', '-').replace('_', '-') - # Remove consecutive hyphens - domain_clean = re.sub(r'-+', '-', domain_clean) - # Remove leading/trailing hyphens - domain_clean = domain_clean.strip('-') - else: - domain_clean = domain - - # Validate domain format (must start with letter, contain only lowercase, numbers, hyphens) - if not re.match(r'^[a-z][a-z0-9-]*$', domain_clean): - raise ValueError( - f"Invalid domain '{domain_clean}': must start with lowercase letter " - "and contain only lowercase letters, numbers, and hyphens" - ) - - # Parse and validate version - version_parts = version.split('.') - if len(version_parts) != 2: - raise ValueError( - f"Invalid version '{version}': must be in format 'major.minor' (e.g., '1.0')" - ) - - try: - major = int(version_parts[0]) - minor = int(version_parts[1]) - except ValueError: - raise ValueError( - f"Invalid version '{version}': major and minor must be integers" - ) - - if major < 0 or minor < 0: - raise ValueError( - f"Invalid version '{version}': major and minor must be non-negative" - ) - - return f"{domain_clean}-schema-v{major}.{minor}.md" - - -def extract_schema_metadata(filename: str) -> Dict[str, Any]: - """ - Extract 
metadata from a valid schema filename. - - Args: - filename: Schema filename to parse - - Returns: - Dictionary with metadata - - Raises: - SchemaFilenameError: If filename is invalid - - Examples: - >>> extract_schema_metadata("manpage-schema-v1.0.md") - {'domain': 'manpage', 'version': '1.0', 'major': 1, 'minor': 0} - """ - is_valid, metadata = validate_schema_filename(filename) - - if not is_valid: - raise SchemaFilenameError( - f"Invalid schema filename: {filename}\n" - f"Expected format: {{domain}}-schema-v{{major}}.{{minor}}.md" - ) - - return metadata - - -def get_validation_errors(filename: str) -> list: - """ - Get detailed validation errors for a filename. - - Args: - filename: Filename to validate - - Returns: - List of error messages (empty if valid) - - Examples: - >>> get_validation_errors("manpage-schema-v1.0.md") - [] - - >>> get_validation_errors("invalid.json") - ['Filename does not match pattern: {domain}-schema-v{major}.{minor}.md', ...] - """ - errors = [] - - # Check basic pattern match - is_valid, _ = validate_schema_filename(filename) - if is_valid: - return errors - - # Provide detailed feedback - errors.append( - f"Filename does not match pattern: {{domain}}-schema-v{{major}}.{{minor}}.md" - ) - - # Check extension - if not filename.endswith('.md'): - errors.append(f"Extension must be '.md', got: {Path(filename).suffix}") - - # Check for version - if '-v' not in filename: - errors.append("Missing version: filename must include '-v{major}.{minor}'") - elif not re.search(r'-v\d+\.\d+', filename): - errors.append( - "Invalid version format: must be '-v{major}.{minor}' (e.g., '-v1.0')" - ) - - # Check for schema keyword - if '-schema-' not in filename: - errors.append("Missing '-schema-' keyword in filename") - - # Check for uppercase (must be lowercase) - if any(c.isupper() for c in filename): - errors.append("Filename must be lowercase") - - # Check domain format (if we can isolate it) - parts = filename.split('-schema-') - if len(parts) >= 
1: - domain = parts[0] - if domain and not re.match(r'^[a-z][a-z0-9-]*$', domain): - errors.append( - f"Invalid domain '{domain}': must start with lowercase letter " - "and contain only lowercase letters, numbers, and hyphens" - ) - - return errors - - -def is_valid_schema_filename(filename: str) -> bool: - """ - Check if filename is valid (convenience function). - - Args: - filename: Filename to check - - Returns: - True if valid, False otherwise - - Examples: - >>> is_valid_schema_filename("manpage-schema-v1.0.md") - True - - >>> is_valid_schema_filename("invalid.json") - False - """ - is_valid, _ = validate_schema_filename(filename) - return is_valid - - -def format_validation_message(filename: str) -> str: - """ - Format a user-friendly validation message. - - Args: - filename: Filename that failed validation - - Returns: - Formatted error message with suggestions - - Examples: - >>> print(format_validation_message("manpage.json")) - ❌ Invalid schema filename: manpage.json - ... - """ - errors = get_validation_errors(filename) - - if not errors: - return f"✅ Valid schema filename: {filename}" - - message = f"❌ Invalid schema filename: {filename}\n\n" - message += "Errors:\n" - for i, error in enumerate(errors, 1): - message += f" {i}. 
{error}\n" - - message += "\nExpected format: {domain}-schema-v{major}.{minor}.md\n" - message += "Example: manpage-schema-v1.0.md\n" - - # Try to suggest a corrected filename - try: - # Extract domain guess (everything before first hyphen or dot) - domain_guess = filename.split('-')[0].split('.')[0] - suggestion = suggest_schema_filename(domain_guess, "1.0") - message += f"\nSuggested filename: {suggestion}\n" - except Exception: - pass - - return message +__all__ = [ + 'validate_schema_filename', + 'suggest_valid_filename', + 'suggest_schema_filename', + 'extract_schema_domain', + 'get_schema_version', + 'extract_schema_metadata', + 'get_validation_errors', + 'is_valid_schema_filename', + 'format_validation_message', + 'SchemaFilenameError', + 'SCHEMA_FILENAME_PATTERN', +] diff --git a/markitect/schema_refiner.py b/markitect/schema_refiner.py index b27ff0aa..c9ee43ee 100644 --- a/markitect/schema_refiner.py +++ b/markitect/schema_refiner.py @@ -1,530 +1,19 @@ """ -Schema Refiner for Phase 2: Schema Refinement Tools +Schema Refiner - Backward Compatibility Module. -Automatically refines rigid schemas by applying loosening rules and fixes. +This module re-exports from markitect.schema.refiner for backward compatibility. +New code should import from markitect.schema.refiner directly. 
""" -from pathlib import Path -from typing import Dict, Any, List, Optional, Tuple -import json -import copy -from dataclasses import dataclass, field - -from .schema_analyzer import SchemaAnalyzer, SchemaIssue, IssueType, IssueSeverity - - -@dataclass -class RefinementAction: - """Represents a refinement action taken on the schema.""" - issue_type: IssueType - path: str - description: str - old_value: Any = None - new_value: Any = None - - -@dataclass -class RefinementResult: - """Results of schema refinement.""" - success: bool - actions_taken: List[RefinementAction] = field(default_factory=list) - refined_schema: Optional[Dict[str, Any]] = None - error_message: Optional[str] = None - - -class SchemaRefiner: - """Refines rigid schemas by applying loosening rules.""" - - def __init__(self): - """Initialize the schema refiner.""" - self.analyzer = SchemaAnalyzer() - - def _navigate_to_path(self, schema: Dict[str, Any], path: str) -> Optional[Tuple[Dict[str, Any], str]]: - """ - Navigate to a path in the schema, handling nested 'properties' objects. - - Returns (parent_object, property_name) or None if path doesn't exist. 
- """ - path_parts = path.split('.') - obj = schema - - # Navigate through all but the last part - for i, part in enumerate(path_parts[:-1]): - # Try direct access first - if part in obj: - obj = obj[part] - # If not found and obj has 'properties', try there - elif isinstance(obj, dict) and "properties" in obj and part in obj["properties"]: - obj = obj["properties"][part] - else: - return None - - # For the final part, check if we need to descend into 'properties' - prop_name = path_parts[-1] - if prop_name in obj: - return (obj, prop_name) - elif isinstance(obj, dict) and "properties" in obj and prop_name in obj["properties"]: - return (obj["properties"], prop_name) - else: - return None - - def refine_schema_interactive( - self, - schema: Dict[str, Any], - loosen_counts: bool = True, - migrate_deprecated: bool = False, - round_numbers: bool = True - ) -> RefinementResult: - """ - Refine a schema interactively, prompting for each fix. - - Args: - schema: The JSON schema to refine - loosen_counts: Enable fixes for exact counts - migrate_deprecated: Enable migration of deprecated extensions - round_numbers: Enable rounding of overly specific numbers - - Returns: - RefinementResult with actions taken and refined schema - """ - result = RefinementResult(success=False) - - try: - # Analyze the schema first - analysis = self.analyzer.analyze_schema(schema) - - print(f"\nFound {len(analysis.issues)} issue(s) to review\n") - - # Deep copy to avoid modifying original - refined = copy.deepcopy(schema) - - # Process each issue interactively - for i, issue in enumerate(analysis.issues, 1): - print(f"Issue {i}/{len(analysis.issues)}") - print(f" Type: {issue.issue_type.value}") - print(f" Path: {issue.path}") - print(f" {issue.message}") - print(f" Suggestion: {issue.suggestion}") - - if issue.current_value is not None: - print(f" Current: {json.dumps(issue.current_value)}") - if issue.suggested_value is not None: - print(f" Suggested: {json.dumps(issue.suggested_value)}") - - 
# Ask user if they want to apply the fix - response = input("\nApply this fix? [y/N/q]: ").strip().lower() - - if response == 'q': - print("Refinement cancelled by user") - result.success = False - return result - elif response == 'y': - action = None - - if loosen_counts and issue.issue_type == IssueType.EXACT_COUNT: - action = self._fix_exact_count(refined, issue) - - elif round_numbers and issue.issue_type == IssueType.OVERLY_SPECIFIC: - action = self._fix_overly_specific(refined, issue) - - elif loosen_counts and issue.issue_type == IssueType.NO_FLEXIBILITY: - action = self._fix_no_flexibility(refined, issue) - - elif migrate_deprecated and issue.issue_type == IssueType.DEPRECATED_EXTENSIONS: - action = self._fix_deprecated_extension(refined, issue) - - if action: - result.actions_taken.append(action) - print(f" ✓ Applied") - else: - print(f" ✗ Could not apply fix") - else: - print(f" - Skipped") - - print() - - result.refined_schema = refined - result.success = True - - except Exception as e: - result.error_message = str(e) - - return result - - def refine_schema( - self, - schema: Dict[str, Any], - loosen_counts: bool = True, - migrate_deprecated: bool = False, - round_numbers: bool = True - ) -> RefinementResult: - """ - Refine a schema by applying fixes for detected issues. 
- - Args: - schema: The JSON schema to refine - loosen_counts: Apply fixes for exact counts - migrate_deprecated: Migrate deprecated extensions - round_numbers: Round overly specific numbers - - Returns: - RefinementResult with actions taken and refined schema - """ - result = RefinementResult(success=False) - - try: - # Analyze the schema first - analysis = self.analyzer.analyze_schema(schema) - - # Deep copy to avoid modifying original - refined = copy.deepcopy(schema) - - # Apply fixes based on issues found - for issue in analysis.issues: - action = None - - if loosen_counts and issue.issue_type == IssueType.EXACT_COUNT: - action = self._fix_exact_count(refined, issue) - - elif round_numbers and issue.issue_type == IssueType.OVERLY_SPECIFIC: - action = self._fix_overly_specific(refined, issue) - - elif loosen_counts and issue.issue_type == IssueType.NO_FLEXIBILITY: - action = self._fix_no_flexibility(refined, issue) - - elif migrate_deprecated and issue.issue_type == IssueType.DEPRECATED_EXTENSIONS: - action = self._fix_deprecated_extension(refined, issue) - - if action: - result.actions_taken.append(action) - - result.refined_schema = refined - result.success = True - - except Exception as e: - result.error_message = str(e) - - return result - - def _fix_exact_count(self, schema: Dict[str, Any], issue: SchemaIssue) -> Optional[RefinementAction]: - """Fix exact count constraints by converting to ranges.""" - nav_result = self._navigate_to_path(schema, issue.path) - if not nav_result: - return None - - obj, prop_name = nav_result - prop_def = obj[prop_name] - old_value = copy.deepcopy(prop_def) - - # Check if it's an array with exact minItems/maxItems - if isinstance(prop_def, dict) and prop_def.get("type") == "array": - min_items = prop_def.get("minItems") - max_items = prop_def.get("maxItems") - - if min_items is not None and max_items is not None and min_items == max_items: - # Apply suggested loosening - new_min = max(0, min_items - 2) - new_max = min_items + 
5 - - prop_def["minItems"] = new_min - prop_def["maxItems"] = new_max - - return RefinementAction( - issue_type=IssueType.EXACT_COUNT, - path=issue.path, - description=f"Loosened array count from exactly {min_items} to range {new_min}-{new_max}", - old_value={"minItems": min_items, "maxItems": max_items}, - new_value={"minItems": new_min, "maxItems": new_max} - ) - - # Check if it's a const value - if isinstance(prop_def, dict) and "const" in prop_def: - const_value = prop_def["const"] - del prop_def["const"] - - # If it's a number, convert to a range - if isinstance(const_value, int): - prop_def["minimum"] = const_value - 1 - prop_def["maximum"] = const_value + 1 - - return RefinementAction( - issue_type=IssueType.EXACT_COUNT, - path=issue.path, - description=f"Converted const {const_value} to range {const_value-1}-{const_value+1}", - old_value=const_value, - new_value={"minimum": const_value - 1, "maximum": const_value + 1} - ) - else: - # For non-numeric constants, just remove the constraint - return RefinementAction( - issue_type=IssueType.EXACT_COUNT, - path=issue.path, - description=f"Removed const constraint: {const_value}", - old_value=const_value, - new_value=None - ) - - return None - - def _fix_overly_specific(self, schema: Dict[str, Any], issue: SchemaIssue) -> Optional[RefinementAction]: - """Fix overly specific number constraints by rounding.""" - if issue.suggested_value is None: - return None - - nav_result = self._navigate_to_path(schema, issue.path) - if not nav_result: - return None - - obj, prop_name = nav_result - prop_def = obj[prop_name] - - # Round the minItems value - if isinstance(prop_def, dict) and "minItems" in prop_def: - old_value = prop_def["minItems"] - new_value = issue.suggested_value - prop_def["minItems"] = new_value - - return RefinementAction( - issue_type=IssueType.OVERLY_SPECIFIC, - path=issue.path, - description=f"Rounded minItems from {old_value} to {new_value}", - old_value=old_value, - new_value=new_value - ) - - return 
None - - def _fix_no_flexibility(self, schema: Dict[str, Any], issue: SchemaIssue) -> Optional[RefinementAction]: - """Fix narrow ranges by widening them.""" - nav_result = self._navigate_to_path(schema, issue.path) - if not nav_result: - return None - - obj, prop_name = nav_result - prop_def = obj[prop_name] - - if isinstance(prop_def, dict) and "minimum" in prop_def and "maximum" in prop_def: - old_min = prop_def["minimum"] - old_max = prop_def["maximum"] - range_size = old_max - old_min - - # Widen the range - new_min = old_min - 5 - new_max = old_max + 5 - - prop_def["minimum"] = new_min - prop_def["maximum"] = new_max - - return RefinementAction( - issue_type=IssueType.NO_FLEXIBILITY, - path=issue.path, - description=f"Widened range from {old_min}-{old_max} to {new_min}-{new_max}", - old_value={"minimum": old_min, "maximum": old_max}, - new_value={"minimum": new_min, "maximum": new_max} - ) - - return None - - def _fix_deprecated_extension(self, schema: Dict[str, Any], issue: SchemaIssue) -> Optional[RefinementAction]: - """Remove deprecated extension (migration requires manual work).""" - # For now, just document that manual migration is needed - # Full migration would require understanding the old format - - deprecated_key = issue.path - if deprecated_key in schema: - old_value = schema[deprecated_key] - # Don't actually remove it automatically - too risky - return RefinementAction( - issue_type=IssueType.DEPRECATED_EXTENSIONS, - path=issue.path, - description=f"Detected deprecated extension (manual migration recommended)", - old_value=old_value, - new_value=None - ) - - return None - - def refine_schema_file( - self, - input_path: Path, - output_path: Optional[Path] = None, - loosen_counts: bool = True, - migrate_deprecated: bool = False, - round_numbers: bool = True - ) -> RefinementResult: - """ - Refine a schema file. 
- - Args: - input_path: Path to input schema file - output_path: Path to output file (if None, overwrites input) - loosen_counts: Apply fixes for exact counts - migrate_deprecated: Migrate deprecated extensions - round_numbers: Round overly specific numbers - - Returns: - RefinementResult - """ - with open(input_path) as f: - schema = json.load(f) - - result = self.refine_schema( - schema, - loosen_counts=loosen_counts, - migrate_deprecated=migrate_deprecated, - round_numbers=round_numbers - ) - - if result.success and result.refined_schema: - output = output_path or input_path - with open(output, 'w') as f: - json.dump(result.refined_schema, f, indent=2) - - return result - - def format_refinement_report(self, result: RefinementResult) -> str: - """ - Format refinement results as a human-readable report. - - Args: - result: Refinement results - - Returns: - Formatted report string - """ - lines = [] - - # Header - lines.append("=" * 70) - lines.append("Schema Refinement Report") - lines.append("=" * 70) - lines.append("") - - if not result.success: - lines.append(f"❌ Refinement failed: {result.error_message}") - return "\n".join(lines) - - # Summary - action_count = len(result.actions_taken) - if action_count == 0: - lines.append("✅ No refinements needed - schema is already flexible") - else: - lines.append(f"✅ Applied {action_count} refinement(s)") - lines.append("") - - # List actions - if result.actions_taken: - lines.append("Actions Taken:") - lines.append("-" * 70) - - for i, action in enumerate(result.actions_taken, 1): - lines.append(f"{i}. 
{action.description}") - lines.append(f" Path: {action.path}") - - if action.old_value is not None: - lines.append(f" Before: {json.dumps(action.old_value)}") - if action.new_value is not None: - lines.append(f" After: {json.dumps(action.new_value)}") - - lines.append("") - - return "\n".join(lines) - - -def refine_schema_cli( - schema_path: str, - output: Optional[str] = None, - loosen_counts: bool = True, - migrate_deprecated: bool = False, - round_numbers: bool = True, - dry_run: bool = False, - interactive: bool = False -) -> int: - """ - CLI entry point for schema refinement. - - Args: - schema_path: Path to schema file - output: Output path (None = overwrite input) - loosen_counts: Apply count loosening fixes - migrate_deprecated: Migrate deprecated extensions - round_numbers: Round overly specific numbers - dry_run: Show changes without applying - interactive: Prompt for each fix - - Returns: - Exit code (0 = success, 1 = no changes needed, 2 = error) - """ - refiner = SchemaRefiner() - - try: - input_path = Path(schema_path) - output_path = Path(output) if output else None - - # Load schema - with open(input_path) as f: - schema = json.load(f) - - if interactive: - # Interactive mode - prompt for each fix - print(f"Refining schema: {schema_path}") - result = refiner.refine_schema_interactive( - schema, - loosen_counts=loosen_counts, - migrate_deprecated=migrate_deprecated, - round_numbers=round_numbers - ) - - if result.success and result.refined_schema and not dry_run: - # Write the refined schema - output = output_path or input_path - with open(output, 'w') as f: - json.dump(result.refined_schema, f, indent=2) - print(f"\nRefined schema written to: {output}") - - elif dry_run: - # Just analyze and show what would be done - result = refiner.refine_schema( - schema, - loosen_counts=loosen_counts, - migrate_deprecated=migrate_deprecated, - round_numbers=round_numbers - ) - - print("DRY RUN - No changes will be made") - print() - else: - result = 
refiner.refine_schema_file( - input_path, - output_path, - loosen_counts=loosen_counts, - migrate_deprecated=migrate_deprecated, - round_numbers=round_numbers - ) - - # Only print full report if not in interactive mode (user already saw changes) - if not interactive: - report = refiner.format_refinement_report(result) - print(report) - elif result.success: - # Just print summary for interactive mode - print(f"\n{'='*70}") - print(f"Refinement complete: {len(result.actions_taken)} change(s) applied") - print(f"{'='*70}") - - if result.success and len(result.actions_taken) > 0: - return 0 # Success with changes - elif result.success: - return 1 # Success but no changes needed - else: - return 2 # Error - - except FileNotFoundError: - print(f"Error: Schema file not found: {schema_path}") - return 2 - except json.JSONDecodeError as e: - print(f"Error: Invalid JSON in schema file: {e}") - return 2 - except Exception as e: - print(f"Error: {e}") - return 2 +# Re-export from schema package for backward compatibility +from markitect.schema.refiner import ( + SchemaRefiner, + RefinementResult, + RefinementAction, +) + +__all__ = [ + 'SchemaRefiner', + 'RefinementResult', + 'RefinementAction', +] diff --git a/markitect/schema_validator.py b/markitect/schema_validator.py index 1e39c40f..c711c05b 100644 --- a/markitect/schema_validator.py +++ b/markitect/schema_validator.py @@ -1,679 +1,11 @@ """ -Schema Validator for Issue #7: Validate a Markdown File Against a Schema. +Schema Validator - Backward Compatibility Module. -This module provides functionality to validate markdown documents against JSON schemas -for arc42 architecture documentation compliance checking - essential for intelligent -document analysis and plan-actual comparison capabilities. +This module re-exports from markitect.schema.validator for backward compatibility. +New code should import from markitect.schema.validator directly. 
""" -import json -from pathlib import Path -from typing import Dict, Any +# Re-export from schema package for backward compatibility +from markitect.schema.validator import SchemaValidator -try: - import jsonschema - from jsonschema import SchemaError - JSONSCHEMA_AVAILABLE = True -except ImportError: - # Fallback to basic validation without full JSON Schema validation - JSONSCHEMA_AVAILABLE = False - SchemaError = Exception - -from .parser import parse_markdown_to_ast -from .schema_generator import SchemaGenerator -from .validation_error import ValidationErrorCollector, ValidationErrorType -from .exceptions import FileNotFoundError, SchemaValidationError, InvalidSchemaError - - -class SchemaValidator: - """ - Validates markdown documents against JSON schemas for arc42 compliance checking. - - This service provides boolean validation results for markdown documents against - schemas, enabling strict compliance checking for architectural documentation - templates and intelligent plan-actual comparison. - """ - - def __init__(self): - """Initialize the schema validator.""" - self.schema_generator = SchemaGenerator() - self.jsonschema_available = JSONSCHEMA_AVAILABLE - - def validate_file_against_schema(self, file_path: Path, schema: Dict[str, Any]) -> bool: - """ - Validate a markdown file against a JSON schema. 
- - Args: - file_path: Path to the markdown file - schema: JSON schema dictionary to validate against - - Returns: - True if the document matches the schema, False otherwise - - Raises: - FileNotFoundError: If the markdown file doesn't exist - InvalidSchemaError: If the schema is invalid - """ - # Validate inputs - if not file_path.exists(): - raise FileNotFoundError(f"Markdown file not found: {file_path}") - - # Validate the schema itself - self._validate_schema(schema) - - # Generate the document's current structure - try: - document_schema = self.schema_generator.generate_schema_from_file(file_path) - except Exception as e: - raise SchemaValidationError(f"Failed to generate document schema: {e}") from e - - # Check if the expected schema has heading text constraints - if self._has_heading_text_constraints(schema): - # For heading text validation, we need to extract actual content and compare against enum constraints - return self._validate_with_heading_text_constraints(file_path, schema, document_schema) - - # Use standard structure comparison for backward compatibility - return self._compare_structures(document_schema, schema) - - def validate_file_against_schema_string(self, file_path: Path, schema_json: str) -> bool: - """ - Validate a markdown file against a JSON schema provided as a string. - - Args: - file_path: Path to the markdown file - schema_json: JSON schema as a string - - Returns: - True if the document matches the schema, False otherwise - - Raises: - FileNotFoundError: If the markdown file doesn't exist - InvalidSchemaError: If the schema is invalid JSON or schema - """ - try: - schema = json.loads(schema_json) - except json.JSONDecodeError as e: - raise InvalidSchemaError(f"Invalid JSON schema string: {e}") from e - - return self.validate_file_against_schema(file_path, schema) - - def validate_file_against_schema_file(self, file_path: Path, schema_file_path: Path) -> bool: - """ - Validate a markdown file against a schema stored in a file. 
- - Args: - file_path: Path to the markdown file - schema_file_path: Path to the JSON schema file - - Returns: - True if the document matches the schema, False otherwise - - Raises: - FileNotFoundError: If either file doesn't exist - InvalidSchemaError: If the schema file is invalid - """ - if not schema_file_path.exists(): - raise FileNotFoundError(f"Schema file not found: {schema_file_path}") - - try: - schema_content = schema_file_path.read_text(encoding='utf-8') - schema = json.loads(schema_content) - except (IOError, json.JSONDecodeError) as e: - raise InvalidSchemaError(f"Failed to load schema file {schema_file_path}: {e}") from e - - return self.validate_file_against_schema(file_path, schema) - - def _validate_schema(self, schema: Dict[str, Any]) -> None: - """ - Validate that a schema is a valid JSON Schema. - - Args: - schema: Schema dictionary to validate - - Raises: - InvalidSchemaError: If the schema is invalid - """ - try: - # Check basic schema structure - if not isinstance(schema, dict): - raise InvalidSchemaError("Schema must be a dictionary") - - # Basic schema validation - if not schema.get('$schema') or not schema.get('type'): - raise InvalidSchemaError("Schema must have '$schema' and 'type' fields") - - # If jsonschema library is available, use it for full validation - if self.jsonschema_available: - jsonschema.validators.validator_for(schema).check_schema(schema) - - except (SchemaError, TypeError, AttributeError) as e: - raise InvalidSchemaError(f"Invalid JSON schema: {e}") from e - - def _compare_structures(self, document_schema: Dict[str, Any], expected_schema: Dict[str, Any]) -> bool: - """ - Compare a document's actual structure against expected schema requirements. - - This method performs the core validation logic by analyzing whether the - document's generated schema satisfies the requirements defined in the - expected schema. 
- - Args: - document_schema: Schema generated from the actual document - expected_schema: Expected schema requirements - - Returns: - True if the document satisfies the expected schema requirements - """ - try: - # Extract actual document structure - doc_properties = document_schema.get('properties', {}) - expected_properties = expected_schema.get('properties', {}) - - # Check all required properties are present - required_properties = expected_schema.get('required', []) - for prop in required_properties: - if prop not in doc_properties: - return False - - # Validate heading structure if specified - if 'headings' in expected_properties and 'headings' in doc_properties: - if not self._validate_heading_structure( - doc_properties['headings'], - expected_properties['headings'] - ): - return False - - # Validate other structural elements - structural_elements = ['paragraphs', 'lists', 'code_blocks', 'blockquotes', 'tables'] - for element in structural_elements: - if element in expected_properties: - if not self._validate_structural_element( - doc_properties.get(element), - expected_properties[element] - ): - return False - - return True - - except Exception: - # If comparison fails for any reason, consider validation failed - return False - - def _validate_heading_structure(self, actual_headings: Dict[str, Any], expected_headings: Dict[str, Any]) -> bool: - """ - Validate heading structure against expected requirements. 
- - Args: - actual_headings: Actual heading structure from document - expected_headings: Expected heading requirements - - Returns: - True if heading structure meets requirements - """ - actual_heading_props = actual_headings.get('properties', {}) - expected_heading_props = expected_headings.get('properties', {}) - required_heading_levels = expected_headings.get('required', []) - - # Check required heading levels are present - for level in required_heading_levels: - if level not in actual_heading_props: - return False - - # Check each expected heading level meets requirements - for level, expected_spec in expected_heading_props.items(): - if level not in actual_heading_props: - # If level is not required, skip it - if level not in required_heading_levels: - continue - return False - - actual_spec = actual_heading_props[level] - - # Check minimum and maximum item requirements - if not self._validate_array_constraints(actual_spec, expected_spec): - return False - - return True - - def _validate_structural_element(self, actual_element: Dict[str, Any], expected_element: Dict[str, Any]) -> bool: - """ - Validate a structural element (paragraphs, lists, etc.) against requirements. - - Args: - actual_element: Actual element structure from document - expected_element: Expected element requirements - - Returns: - True if element meets requirements - """ - if actual_element is None: - # Element doesn't exist in document - return False - - return self._validate_array_constraints(actual_element, expected_element) - - def _validate_array_constraints(self, actual: Dict[str, Any], expected: Dict[str, Any]) -> bool: - """ - Validate array constraints (minItems, maxItems) for structural elements. 
- - Args: - actual: Actual element specification - expected: Expected element specification - - Returns: - True if constraints are satisfied - """ - # Get actual count from the schema specification - # For generated schemas, we use minItems/maxItems which represent actual counts - actual_min = actual.get('minItems', 0) - actual_max = actual.get('maxItems', actual_min) - actual_count = actual_max # In our generated schemas, min=max=actual count - - # Check against expected constraints - expected_min = expected.get('minItems', 0) - expected_max = expected.get('maxItems', float('inf')) - - return expected_min <= actual_count <= expected_max - - # Issue #8: Detailed Error Reporting Methods - - def validate_file_with_errors(self, file_path: Path, schema: Dict[str, Any]) -> ValidationErrorCollector: - """ - Validate a markdown file against a JSON schema and collect detailed errors. - - This method provides comprehensive error reporting for Issue #8, enabling - users to understand exactly how their documents deviate from schemas. 
- - Args: - file_path: Path to the markdown file - schema: JSON schema dictionary to validate against - - Returns: - ValidationErrorCollector with all validation errors - - Raises: - FileNotFoundError: If the markdown file doesn't exist - InvalidSchemaError: If the schema is invalid - """ - # Validate inputs - if not file_path.exists(): - raise FileNotFoundError(f"Markdown file not found: {file_path}") - - # Validate the schema itself - self._validate_schema(schema) - - # Initialize error collector - error_collector = ValidationErrorCollector() - - # Generate the document's current structure - try: - document_schema = self.schema_generator.generate_schema_from_file(file_path) - except Exception as e: - error_collector.add_error( - ValidationErrorType.STRUCTURAL_VIOLATION, - f"Failed to generate document schema: {e}", - "document.structure", - suggestion="Check if the markdown file is properly formatted" - ) - return error_collector - - # Compare the document's structure against the expected schema and collect errors - if self._has_heading_text_constraints(schema): - # For heading text validation, we need to handle enum constraints specially - self._compare_structures_with_errors(document_schema, schema, error_collector) - self._validate_heading_text_constraints_with_errors(file_path, schema, error_collector) - else: - # Use standard structure comparison for backward compatibility - self._compare_structures_with_errors(document_schema, schema, error_collector) - - return error_collector - - def validate_file_with_errors_string(self, file_path: Path, schema_json: str) -> ValidationErrorCollector: - """ - Validate a markdown file against a JSON schema string and collect detailed errors. 
- - Args: - file_path: Path to the markdown file - schema_json: JSON schema as a string - - Returns: - ValidationErrorCollector with all validation errors - - Raises: - FileNotFoundError: If the markdown file doesn't exist - InvalidSchemaError: If the schema is invalid JSON or schema - """ - try: - schema = json.loads(schema_json) - except json.JSONDecodeError as e: - raise InvalidSchemaError(f"Invalid JSON schema string: {e}") from e - - return self.validate_file_with_errors(file_path, schema) - - def validate_file_with_errors_file(self, file_path: Path, schema_file_path: Path) -> ValidationErrorCollector: - """ - Validate a markdown file against a schema file and collect detailed errors. - - Args: - file_path: Path to the markdown file - schema_file_path: Path to the JSON schema file - - Returns: - ValidationErrorCollector with all validation errors - - Raises: - FileNotFoundError: If either file doesn't exist - InvalidSchemaError: If the schema file is invalid - """ - if not schema_file_path.exists(): - raise FileNotFoundError(f"Schema file not found: {schema_file_path}") - - try: - schema_content = schema_file_path.read_text(encoding='utf-8') - schema = json.loads(schema_content) - except (IOError, json.JSONDecodeError) as e: - raise InvalidSchemaError(f"Failed to load schema file {schema_file_path}: {e}") from e - - return self.validate_file_with_errors(file_path, schema) - - def _compare_structures_with_errors( - self, - document_schema: Dict[str, Any], - expected_schema: Dict[str, Any], - error_collector: ValidationErrorCollector - ) -> None: - """ - Compare document structure against expected schema and collect detailed errors. - - This method performs comprehensive validation analysis, collecting specific - errors about missing headings, incorrect content counts, and structural violations. 
- - Args: - document_schema: Schema generated from the actual document - expected_schema: Expected schema requirements - error_collector: Collector to accumulate validation errors - """ - try: - # Extract actual document structure - doc_properties = document_schema.get('properties', {}) - expected_properties = expected_schema.get('properties', {}) - - # Check all required properties are present - required_properties = expected_schema.get('required', []) - for prop in required_properties: - if prop not in doc_properties: - error_collector.add_error( - ValidationErrorType.MISSING_REQUIRED_SECTION, - f"Missing required section: '{prop}'", - f"document.{prop}", - expected=f"Section '{prop}' is required by schema", - actual="Section not found", - suggestion=f"Add the '{prop}' section to your document" - ) - - # Validate heading structure if specified - if 'headings' in expected_properties and 'headings' in doc_properties: - self._validate_heading_structure_with_errors( - doc_properties['headings'], - expected_properties['headings'], - error_collector - ) - - # Validate other structural elements - structural_elements = ['paragraphs', 'lists', 'code_blocks', 'blockquotes', 'tables'] - for element in structural_elements: - if element in expected_properties: - self._validate_structural_element_with_errors( - doc_properties.get(element), - expected_properties[element], - element, - error_collector - ) - - except Exception as e: - error_collector.add_error( - ValidationErrorType.STRUCTURAL_VIOLATION, - f"Error during structure comparison: {e}", - "document.structure", - suggestion="Check if both the document and schema are properly formatted" - ) - - def _validate_heading_structure_with_errors( - self, - actual_headings: Dict[str, Any], - expected_headings: Dict[str, Any], - error_collector: ValidationErrorCollector - ) -> None: - """ - Validate heading structure and collect detailed errors. 
- - Args: - actual_headings: Actual heading structure from document - expected_headings: Expected heading requirements - error_collector: Collector for validation errors - """ - actual_heading_props = actual_headings.get('properties', {}) - expected_heading_props = expected_headings.get('properties', {}) - required_heading_levels = expected_headings.get('required', []) - - # Check required heading levels are present - for level in required_heading_levels: - if level not in actual_heading_props: - level_num = level.replace('level_', '') - error_collector.add_error( - ValidationErrorType.MISSING_REQUIRED_HEADING, - f"Missing required heading level {level_num}", - f"headings.{level}", - expected=f"At least one heading at level {level_num}", - actual="No headings found at this level", - suggestion=f"Add heading(s) at level {level_num} (e.g., {'#' * int(level_num)} Heading)" - ) - - # Check each expected heading level meets requirements - for level, expected_spec in expected_heading_props.items(): - if level not in actual_heading_props: - # If level is not required, skip it - if level not in required_heading_levels: - continue - # Already handled above in required check - - else: - actual_spec = actual_heading_props[level] - level_num = level.replace('level_', '') - - # Check minimum and maximum item requirements - self._validate_array_constraints_with_errors( - actual_spec, - expected_spec, - f"headings.{level}", - f"level {level_num} headings", - error_collector - ) - - def _validate_structural_element_with_errors( - self, - actual_element: Dict[str, Any], - expected_element: Dict[str, Any], - element_name: str, - error_collector: ValidationErrorCollector - ) -> None: - """ - Validate a structural element and collect errors. 
- - Args: - actual_element: Actual element structure from document - expected_element: Expected element requirements - element_name: Name of the structural element (for error messages) - error_collector: Collector for validation errors - """ - if actual_element is None: - error_collector.add_error( - ValidationErrorType.MISSING_REQUIRED_SECTION, - f"Missing required structural element: {element_name}", - f"content.{element_name}", - expected=f"Document should contain {element_name}", - actual="Element not found", - suggestion=f"Add {element_name} to your document" - ) - return - - self._validate_array_constraints_with_errors( - actual_element, - expected_element, - f"content.{element_name}", - element_name, - error_collector - ) - - def _validate_array_constraints_with_errors( - self, - actual: Dict[str, Any], - expected: Dict[str, Any], - path: str, - element_description: str, - error_collector: ValidationErrorCollector - ) -> None: - """ - Validate array constraints and collect specific errors. 
- - Args: - actual: Actual element specification - expected: Expected element specification - path: JSON path for error location - element_description: Human-readable element description - error_collector: Collector for validation errors - """ - # Get actual count from the schema specification - actual_min = actual.get('minItems', 0) - actual_max = actual.get('maxItems', actual_min) - actual_count = actual_max # In our generated schemas, min=max=actual count - - # Check against expected constraints - expected_min = expected.get('minItems', 0) - expected_max = expected.get('maxItems', float('inf')) - - # Check minimum constraint - if actual_count < expected_min: - error_collector.add_error( - ValidationErrorType.INSUFFICIENT_CONTENT, - f"Insufficient {element_description}: found {actual_count}, required at least {expected_min}", - path, - expected=f"At least {expected_min} {element_description}", - actual=f"{actual_count} {element_description}", - suggestion=f"Add {expected_min - actual_count} more {element_description}" - ) - - # Check maximum constraint - if expected_max != float('inf') and actual_count > expected_max: - error_collector.add_error( - ValidationErrorType.EXCESS_CONTENT, - f"Too many {element_description}: found {actual_count}, maximum allowed {expected_max}", - path, - expected=f"At most {expected_max} {element_description}", - actual=f"{actual_count} {element_description}", - suggestion=f"Remove {actual_count - expected_max} {element_description}" - ) - - def _has_heading_text_constraints(self, schema: Dict[str, Any]) -> bool: - """ - Check if the schema has heading text constraints (enum values on heading content). 
- - Args: - schema: JSON schema to check - - Returns: - True if schema has heading text constraints - """ - headings_props = schema.get('properties', {}).get('headings', {}).get('properties', {}) - - for level_props in headings_props.values(): - items = level_props.get('items', {}) - content_prop = items.get('properties', {}).get('content', {}) - if 'enum' in content_prop: - return True - - return False - - def _validate_with_heading_text_constraints( - self, - file_path: Path, - expected_schema: Dict[str, Any], - document_schema: Dict[str, Any] - ) -> bool: - """ - Validate document with heading text constraints by comparing actual content against enum values. - - Args: - file_path: Path to the markdown file - expected_schema: Schema with heading text constraints - document_schema: Generated schema from the actual document - - Returns: - True if document meets all constraints including heading text - """ - # First check standard structure compliance - if not self._compare_structures(document_schema, expected_schema): - return False - - # Then check heading text constraints - expected_headings = expected_schema.get('properties', {}).get('headings', {}).get('properties', {}) - - # Generate document analysis with actual heading content - content = file_path.read_text(encoding='utf-8') - ast_tokens = parse_markdown_to_ast(content) - structure_analysis = self.schema_generator._analyze_ast_structure(ast_tokens, None) - - for level_key, expected_level_spec in expected_headings.items(): - content_constraints = expected_level_spec.get('items', {}).get('properties', {}).get('content', {}) - - if 'enum' in content_constraints: - allowed_texts = content_constraints['enum'] - actual_headings = structure_analysis['headings'].get(level_key, []) - - for heading in actual_headings: - actual_text = heading['content'] - if actual_text not in allowed_texts: - return False - - return True - - def _validate_heading_text_constraints_with_errors( - self, - file_path: Path, - 
expected_schema: Dict[str, Any], - error_collector: ValidationErrorCollector - ) -> None: - """ - Validate heading text constraints and collect detailed errors. - - Args: - file_path: Path to the markdown file - expected_schema: Schema with heading text constraints - error_collector: Collector for validation errors - """ - expected_headings = expected_schema.get('properties', {}).get('headings', {}).get('properties', {}) - - # Generate document analysis with actual heading content - content = file_path.read_text(encoding='utf-8') - ast_tokens = parse_markdown_to_ast(content) - structure_analysis = self.schema_generator._analyze_ast_structure(ast_tokens, None) - - for level_key, expected_level_spec in expected_headings.items(): - content_constraints = expected_level_spec.get('items', {}).get('properties', {}).get('content', {}) - - if 'enum' in content_constraints: - allowed_texts = content_constraints['enum'] - actual_headings = structure_analysis['headings'].get(level_key, []) - - for i, heading in enumerate(actual_headings): - actual_text = heading['content'] - if actual_text not in allowed_texts: - # Add detailed error about heading text mismatch - error_collector.add_error( - ValidationErrorType.HEADING_COUNT_MISMATCH, - f"Heading text mismatch at {level_key.replace('_', ' ')} #{i+1}: expected one of {allowed_texts}, found '{actual_text}'", - f"headings.{level_key}[{i}].content", - expected=f"One of: {allowed_texts}", - actual=actual_text, - suggestion=f"Change heading text to one of the allowed values: {', '.join(allowed_texts)}" - ) +__all__ = ['SchemaValidator'] diff --git a/markitect/serializer.py b/markitect/serializer.py index 0e2a2c3d..eb7ddf52 100644 --- a/markitect/serializer.py +++ b/markitect/serializer.py @@ -1,359 +1,11 @@ """ -AST to Markdown Serialization - Issue #2 Completion +AST to Markdown Serialization - Backward Compatibility Module. 
-This module provides functionality to serialize markdown-it AST tokens back into -markdown format, enabling roundtrip validation and document manipulation. - -Key Features: -- Convert AST tokens back to markdown text -- Preserve front matter during serialization -- Support for content manipulation operations -- Roundtrip integrity validation +This module re-exports from markitect.core.serializer for backward compatibility. +New code should import from markitect.core.serializer directly. """ -from typing import List, Dict, Any, Optional -import yaml +# Re-export from core for backward compatibility +from markitect.core.serializer import ASTSerializer - -class ASTSerializer: - """ - Serializes markdown-it AST tokens back to markdown format. - - Provides roundtrip capability: markdown → AST → markdown - Supports front matter preservation and content manipulation. - """ - - def __init__(self): - """Initialize the AST serializer.""" - pass - - def serialize_to_markdown(self, ast: List[Dict[str, Any]], front_matter: Optional[Dict[str, Any]] = None) -> str: - """ - Convert AST tokens back to markdown format. - - Args: - ast: List of markdown-it AST tokens - front_matter: Optional YAML front matter dictionary - - Returns: - Markdown text with optional front matter - - Example: - serializer = ASTSerializer() - markdown = serializer.serialize_to_markdown(ast, front_matter) - """ - markdown_parts = [] - - # Add front matter if present - if front_matter and isinstance(front_matter, dict) and front_matter: - yaml_content = yaml.dump(front_matter, default_flow_style=False).strip() - markdown_parts.append(f"---\n{yaml_content}\n---\n\n") - - # Process AST tokens - markdown_content = self._process_tokens(ast) - markdown_parts.append(markdown_content) - - return ''.join(markdown_parts) - - def _process_tokens(self, tokens: List[Dict[str, Any]]) -> str: - """ - Process a list of AST tokens into markdown text. 
- - Args: - tokens: List of markdown-it tokens - - Returns: - Markdown text representation - """ - markdown_lines = [] - current_line = "" - list_level = 0 - - for token in tokens: - token_type = token.get('type', '') - content = token.get('content', '') - markup = token.get('markup', '') - tag = token.get('tag', '') - nesting = token.get('nesting', 0) - level = token.get('level', 0) - - # Handle different token types - if token_type == 'heading_open': - heading_level = int(tag[1]) if tag.startswith('h') else 1 - current_line = '#' * heading_level + ' ' - elif token_type == 'heading_close': - if current_line: - markdown_lines.append(current_line.rstrip()) - current_line = "" - markdown_lines.append("") # Empty line after heading - - elif token_type == 'paragraph_open': - pass # Start of paragraph - elif token_type == 'paragraph_close': - if current_line: - markdown_lines.append(current_line.rstrip()) - current_line = "" - markdown_lines.append("") # Empty line after paragraph - - elif token_type == 'inline': - # Process inline content and children - if content: - current_line += content - elif 'children' in token: - current_line += self._process_inline_children(token['children']) - - elif token_type == 'list_item_open': - # Handle list items - indent = ' ' * (level // 2) - if markup in ('-', '*'): - current_line = indent + '- ' - elif markup.isdigit(): - current_line = indent + '1. 
' - elif token_type == 'list_item_close': - if current_line: - markdown_lines.append(current_line.rstrip()) - current_line = "" - - elif token_type in ('bullet_list_open', 'ordered_list_open'): - list_level += 1 - elif token_type in ('bullet_list_close', 'ordered_list_close'): - list_level -= 1 - if list_level == 0: - markdown_lines.append("") # Empty line after list - - elif token_type == 'blockquote_open': - pass - elif token_type == 'blockquote_close': - markdown_lines.append("") - - elif token_type == 'code_block': - markdown_lines.append(f"```{token.get('info', '')}") - markdown_lines.append(content.rstrip()) - markdown_lines.append("```") - markdown_lines.append("") - - elif token_type == 'fence': - if nesting == 1: # Opening fence - markdown_lines.append(f"```{token.get('info', '')}") - else: # Closing fence - markdown_lines.append("```") - markdown_lines.append("") - - elif token_type == 'hr': - markdown_lines.append("---") - markdown_lines.append("") - - elif token_type == 'text': - current_line += content - - # Add any remaining content - if current_line: - markdown_lines.append(current_line.rstrip()) - - # Clean up extra empty lines at the end - while markdown_lines and markdown_lines[-1] == "": - markdown_lines.pop() - - return '\n'.join(markdown_lines) - - def _process_inline_children(self, children: List[Dict[str, Any]]) -> str: - """ - Process inline children tokens (emphasis, strong, links, etc.). 
- - Args: - children: List of inline token children - - Returns: - Processed inline markdown text - """ - result = "" - - for child in children: - token_type = child.get('type', '') - content = child.get('content', '') - markup = child.get('markup', '') - - if token_type == 'text': - result += content - elif token_type == 'code_inline': - result += f"`{content}`" - elif token_type == 'em_open': - result += markup or '*' - elif token_type == 'em_close': - result += markup or '*' - elif token_type == 'strong_open': - result += markup or '**' - elif token_type == 'strong_close': - result += markup or '**' - elif token_type == 'link_open': - # Extract href from attrs - href = "" - if 'attrs' in child and child['attrs']: - for attr in child['attrs']: - if attr[0] == 'href': - href = attr[1] - break - result += "[" - elif token_type == 'link_close': - # This is tricky - we need to get the href from the opening token - # For now, we'll use a placeholder approach - result += "](#)" - elif token_type == 'softbreak': - result += '\n' - elif token_type == 'hardbreak': - result += ' \n' - - return result - - def modify_ast_content(self, ast: List[Dict[str, Any]], modifications: Dict[str, Any]) -> List[Dict[str, Any]]: - """ - Modify AST content based on provided modifications. 
- - Args: - ast: Original AST tokens - modifications: Dictionary of modifications to apply - - Returns: - Modified AST tokens - - Supported modifications: - - add_section: Add a new section with title and content - - update_front_matter: Update front matter values - """ - modified_ast = ast.copy() - - # Handle adding sections - if 'add_section' in modifications: - section_data = modifications['add_section'] - title = section_data.get('title', 'New Section') - content = section_data.get('content', '') - level = section_data.get('level', 2) - - # Create new section tokens - new_tokens = [ - { - "type": "heading_open", - "tag": f"h{level}", - "attrs": {}, - "map": None, - "nesting": 1, - "level": 0, - "content": "", - "markup": "#" * level, - "info": "", - "meta": {}, - "block": True, - "hidden": False - }, - { - "type": "inline", - "tag": "", - "attrs": {}, - "map": None, - "nesting": 0, - "level": 1, - "children": [ - { - "type": "text", - "tag": "", - "attrs": {}, - "map": None, - "nesting": 0, - "level": 0, - "content": title, - "markup": "", - "info": "", - "meta": {}, - "block": False, - "hidden": False - } - ], - "content": title, - "markup": "", - "info": "", - "meta": {}, - "block": True, - "hidden": False - }, - { - "type": "heading_close", - "tag": f"h{level}", - "attrs": {}, - "map": None, - "nesting": -1, - "level": 0, - "content": "", - "markup": "#" * level, - "info": "", - "meta": {}, - "block": True, - "hidden": False - } - ] - - if content: - new_tokens.extend([ - { - "type": "paragraph_open", - "tag": "p", - "attrs": {}, - "map": None, - "nesting": 1, - "level": 0, - "content": "", - "markup": "", - "info": "", - "meta": {}, - "block": True, - "hidden": False - }, - { - "type": "inline", - "tag": "", - "attrs": {}, - "map": None, - "nesting": 0, - "level": 1, - "children": [ - { - "type": "text", - "tag": "", - "attrs": {}, - "map": None, - "nesting": 0, - "level": 0, - "content": content, - "markup": "", - "info": "", - "meta": {}, - "block": 
False, - "hidden": False - } - ], - "content": content, - "markup": "", - "info": "", - "meta": {}, - "block": True, - "hidden": False - }, - { - "type": "paragraph_close", - "tag": "p", - "attrs": {}, - "map": None, - "nesting": -1, - "level": 0, - "content": "", - "markup": "", - "info": "", - "meta": {}, - "block": True, - "hidden": False - } - ]) - - # Add to end of AST - modified_ast.extend(new_tokens) - - return modified_ast +__all__ = ['ASTSerializer'] diff --git a/markitect/spaces/__init__.py b/markitect/spaces/__init__.py new file mode 100644 index 00000000..5f82ecff --- /dev/null +++ b/markitect/spaces/__init__.py @@ -0,0 +1,76 @@ +""" +Information Spaces package for MarkiTect. + +This package provides the Information Space abstraction, enabling: +- First-class space entities with identity, metadata, and lifecycle +- Event-driven change tracking and notifications +- Persistent transclusion context with cross-space references +- HTML rendering with caching and theme support +- Bidirectional directory synchronization +- Composable space hierarchies + +Package Structure: + - models: Core domain models (InformationSpace, SpaceDocument, SpaceConfig) + - events: Event system (SpaceEvent, EventBus, handlers) + - repositories: Data access layer (ISpaceRepository, SqliteSpaceRepository) + - transclusion: Persistent transclusion context and reference tracking + - rendering: Space rendering (HTML, themes) + - sync: Directory synchronization (export, import, bidirectional) + - services: Business logic (SpaceService) + - history: Optional git-based version control + +Usage: + from markitect.spaces import SpaceService, InformationSpace + + service = SpaceService() + space = await service.create_space("my-docs") + await service.add_document(space, "/intro.md", content="# Intro") + await service.render(space, output_dir="./html/") +""" + +# Phase 1: Foundation +from .models import ( + InformationSpace, + SpaceDocument, + SpaceConfig, + SpaceMetadata, + SpaceVariable, + 
TransclusionReference, + SpaceStatus, +) +from .services import SpaceService +from .repositories import ( + ISpaceRepository, + IDocumentAssociationRepository, + IVariableRepository, + IReferenceRepository, + SqliteSpaceRepository, + SqliteDocumentRepository, + SqliteVariableRepository, + SqliteReferenceRepository, + initialize_space_tables, +) + +__all__ = [ + # Models + "InformationSpace", + "SpaceDocument", + "SpaceConfig", + "SpaceMetadata", + "SpaceVariable", + "TransclusionReference", + "SpaceStatus", + # Services + "SpaceService", + # Repository Interfaces + "ISpaceRepository", + "IDocumentAssociationRepository", + "IVariableRepository", + "IReferenceRepository", + # SQLite Implementations + "SqliteSpaceRepository", + "SqliteDocumentRepository", + "SqliteVariableRepository", + "SqliteReferenceRepository", + "initialize_space_tables", +] diff --git a/markitect/spaces/events/__init__.py b/markitect/spaces/events/__init__.py new file mode 100644 index 00000000..9d092aed --- /dev/null +++ b/markitect/spaces/events/__init__.py @@ -0,0 +1,16 @@ +""" +Event system for Information Spaces. + +This package provides event-driven architecture for space operations: +- SpaceEvent: Event dataclass with type, payload, timestamp +- EventBus: In-process publish/subscribe for space events +- Event handlers and registration + +Events emitted: +- SPACE_CREATED, SPACE_UPDATED, SPACE_DELETED +- DOCUMENT_ADDED, DOCUMENT_UPDATED, DOCUMENT_REMOVED +- RENDER_COMPLETED, SYNC_COMPLETED +""" + +# Events will be implemented in Phase 2 +__all__ = [] diff --git a/markitect/spaces/history/__init__.py b/markitect/spaces/history/__init__.py new file mode 100644 index 00000000..d8948eea --- /dev/null +++ b/markitect/spaces/history/__init__.py @@ -0,0 +1,13 @@ +""" +Git history tracking for Information Spaces (Optional Phase 8). 
def _parse_timestamp(value: Any) -> Any:
    """
    Normalise a serialized timestamp for the ``from_dict`` constructors.

    Args:
        value: An ISO-8601 string, ``None``, or an already-built value.

    Returns:
        The parsed ``datetime`` for a string, the current (naive, local)
        time for ``None``, and any other value unchanged.
    """
    if isinstance(value, str):
        return datetime.fromisoformat(value)
    if value is None:
        return datetime.now()
    return value


class SpaceStatus(Enum):
    """Lifecycle status of an Information Space."""
    DRAFT = "draft"
    ACTIVE = "active"
    ARCHIVED = "archived"
    DELETED = "deleted"


@dataclass
class SpaceMetadata:
    """
    Extensible metadata for an Information Space.

    Attributes:
        tags: List of tags for categorization
        author: Author identifier
        custom: Dictionary for custom metadata fields
    """
    tags: List[str] = field(default_factory=list)
    author: Optional[str] = None
    custom: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert metadata to dictionary for serialization."""
        return {
            "tags": self.tags,
            "author": self.author,
            "custom": self.custom,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SpaceMetadata":
        """
        Create metadata from dictionary.

        Containers are copied so the new instance never shares state with
        ``data``; a missing or null ``tags``/``custom`` becomes empty.
        """
        return cls(
            tags=list(data.get("tags") or []),
            author=data.get("author"),
            custom=dict(data.get("custom") or {}),
        )


@dataclass
class SpaceConfig:
    """
    Configuration settings for an Information Space.

    Attributes:
        default_variant: Default directory variant for export (flat/hierarchical/semantic)
        enable_caching: Whether to enable render caching
        theme: Theme name for HTML rendering
        history_enabled: Whether git history tracking is enabled (Phase 8)
        history_backend: History backend type (default: "git")
        history_options: Additional history backend options
        variable_scope: Default variable scope resolution strategy
    """
    default_variant: str = "hierarchical"
    enable_caching: bool = True
    theme: Optional[str] = None
    history_enabled: bool = False
    history_backend: str = "git"
    history_options: Dict[str, Any] = field(default_factory=dict)
    variable_scope: str = "space"  # space, document, request

    def to_dict(self) -> Dict[str, Any]:
        """Convert config to dictionary for serialization."""
        return {
            "default_variant": self.default_variant,
            "enable_caching": self.enable_caching,
            "theme": self.theme,
            "history_enabled": self.history_enabled,
            "history_backend": self.history_backend,
            "history_options": self.history_options,
            "variable_scope": self.variable_scope,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SpaceConfig":
        """
        Create config from dictionary.

        Missing keys fall back to the dataclass defaults. ``history_options``
        is copied (and a null value becomes ``{}``) so the config does not
        alias the caller's dictionary.
        """
        return cls(
            default_variant=data.get("default_variant", "hierarchical"),
            enable_caching=data.get("enable_caching", True),
            theme=data.get("theme"),
            history_enabled=data.get("history_enabled", False),
            history_backend=data.get("history_backend", "git"),
            history_options=dict(data.get("history_options") or {}),
            variable_scope=data.get("variable_scope", "space"),
        )


@dataclass
class SpaceDocument:
    """
    Represents a document's membership in an Information Space.

    Attributes:
        id: Unique document membership identifier
        space_id: ID of the containing space
        document_id: Reference to the actual document
        space_path: Path within the space (e.g., "/intro.md")
        order_index: Ordering within the space
        metadata: Document-specific metadata
        content_hash: Hash of document content for change detection
        added_at: Timestamp when document was added
    """
    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    space_id: str = ""
    document_id: str = ""
    space_path: str = ""
    order_index: int = 0
    metadata: Dict[str, Any] = field(default_factory=dict)
    content_hash: Optional[str] = None
    added_at: datetime = field(default_factory=datetime.now)

    def to_dict(self) -> Dict[str, Any]:
        """Convert document association to dictionary."""
        return {
            "id": self.id,
            "space_id": self.space_id,
            "document_id": self.document_id,
            "space_path": self.space_path,
            "order_index": self.order_index,
            "metadata": self.metadata,
            "content_hash": self.content_hash,
            "added_at": self.added_at.isoformat(),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SpaceDocument":
        """
        Create document association from dictionary.

        A missing ``id`` gets a fresh UUID and a missing ``added_at`` the
        current time; ``metadata`` is copied rather than shared.
        """
        return cls(
            id=data.get("id", str(uuid.uuid4())),
            space_id=data.get("space_id", ""),
            document_id=data.get("document_id", ""),
            space_path=data.get("space_path", ""),
            order_index=data.get("order_index", 0),
            metadata=dict(data.get("metadata") or {}),
            content_hash=data.get("content_hash"),
            added_at=_parse_timestamp(data.get("added_at")),
        )


@dataclass
class InformationSpace:
    """
    First-class Information Space abstraction.

    An Information Space is a container for documents with transclusion
    relationships, persistent context, and lifecycle management.

    Attributes:
        id: Unique space identifier
        name: Human-readable unique name
        description: Optional description
        metadata: Extensible metadata
        config: Space configuration
        parent_space_id: Optional parent space for inheritance
        status: Current lifecycle status
        created_at: Creation timestamp
        updated_at: Last update timestamp

    Example:
        space = InformationSpace(
            name="api-docs",
            description="API Documentation",
            config=SpaceConfig(theme="technical")
        )
    """
    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    name: str = ""
    description: Optional[str] = None
    metadata: SpaceMetadata = field(default_factory=SpaceMetadata)
    config: SpaceConfig = field(default_factory=SpaceConfig)
    parent_space_id: Optional[str] = None
    status: SpaceStatus = SpaceStatus.DRAFT
    created_at: datetime = field(default_factory=datetime.now)
    updated_at: datetime = field(default_factory=datetime.now)

    def __post_init__(self):
        """
        Validate space after initialization.

        Raises:
            ValueError: If ``name`` is empty.
        """
        if not self.name:
            raise ValueError("Space name is required")

    def to_dict(self) -> Dict[str, Any]:
        """Convert space to dictionary for serialization."""
        return {
            "id": self.id,
            "name": self.name,
            "description": self.description,
            # The isinstance guards let already-serialized values pass through.
            "metadata": self.metadata.to_dict() if isinstance(self.metadata, SpaceMetadata) else self.metadata,
            "config": self.config.to_dict() if isinstance(self.config, SpaceConfig) else self.config,
            "parent_space_id": self.parent_space_id,
            "status": self.status.value if isinstance(self.status, SpaceStatus) else self.status,
            "created_at": self.created_at.isoformat(),
            "updated_at": self.updated_at.isoformat(),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "InformationSpace":
        """
        Create space from dictionary.

        Args:
            data: Mapping as produced by ``to_dict``. ``metadata`` and
                ``config`` may be dictionaries or ready-made instances;
                a missing or null value yields the defaults.

        Raises:
            KeyError: If ``name`` is absent.
            ValueError: If ``name`` is empty or ``status`` is not a valid
                ``SpaceStatus`` value.
        """
        status = data.get("status", "draft")
        if isinstance(status, str):
            status = SpaceStatus(status)

        # ``or {}`` maps an explicit null to the defaults instead of
        # storing None on the instance.
        metadata = data.get("metadata") or {}
        if isinstance(metadata, dict):
            metadata = SpaceMetadata.from_dict(metadata)

        config = data.get("config") or {}
        if isinstance(config, dict):
            config = SpaceConfig.from_dict(config)

        return cls(
            id=data.get("id", str(uuid.uuid4())),
            name=data["name"],
            description=data.get("description"),
            metadata=metadata,
            config=config,
            parent_space_id=data.get("parent_space_id"),
            status=status,
            created_at=_parse_timestamp(data.get("created_at")),
            updated_at=_parse_timestamp(data.get("updated_at")),
        )

    def activate(self) -> None:
        """Activate the space."""
        self.status = SpaceStatus.ACTIVE
        self.updated_at = datetime.now()

    def archive(self) -> None:
        """Archive the space."""
        self.status = SpaceStatus.ARCHIVED
        self.updated_at = datetime.now()

    def touch(self) -> None:
        """Update the last modified timestamp."""
        self.updated_at = datetime.now()


@dataclass
class SpaceVariable:
    """
    Variable stored at space level for transclusion context.

    Attributes:
        space_id: ID of the containing space
        name: Variable name
        value: Variable value (JSON-serializable)
        scope: Variable scope (space, document, request)
    """
    space_id: str
    name: str
    value: Any
    scope: str = "space"

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            "space_id": self.space_id,
            "name": self.name,
            "value": self.value,
            "scope": self.scope,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SpaceVariable":
        """
        Create from dictionary.

        Raises:
            KeyError: If ``space_id``, ``name`` or ``value`` is absent.
        """
        return cls(
            space_id=data["space_id"],
            name=data["name"],
            value=data["value"],
            scope=data.get("scope", "space"),
        )


@dataclass
class TransclusionReference:
    """
    Tracks a transclusion reference between documents for cache invalidation.

    Attributes:
        source_doc_id: ID of the document containing the transclusion
        target_doc_id: ID of the transcluded document
        space_id: ID of the space containing the reference
        created_at: When the reference was created
    """
    source_doc_id: str
    target_doc_id: str
    space_id: str
    created_at: datetime = field(default_factory=datetime.now)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            "source_doc_id": self.source_doc_id,
            "target_doc_id": self.target_doc_id,
            "space_id": self.space_id,
            "created_at": self.created_at.isoformat(),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "TransclusionReference":
        """
        Create from dictionary (inverse of ``to_dict``).

        Raises:
            KeyError: If ``source_doc_id``, ``target_doc_id`` or
                ``space_id`` is absent.
        """
        return cls(
            source_doc_id=data["source_doc_id"],
            target_doc_id=data["target_doc_id"],
            space_id=data["space_id"],
            created_at=_parse_timestamp(data.get("created_at")),
        )
+ +This package provides space rendering capabilities: +- SpaceRenderer: Abstract renderer interface +- MarkdownToHTMLRenderer: HTML output renderer +- Theme support and customization +- Render caching with invalidation +""" + +# Rendering will be implemented in Phase 4 +__all__ = [] diff --git a/markitect/spaces/repositories/__init__.py b/markitect/spaces/repositories/__init__.py new file mode 100644 index 00000000..c1a0e43f --- /dev/null +++ b/markitect/spaces/repositories/__init__.py @@ -0,0 +1,38 @@ +""" +Repository layer for Information Spaces. + +This package provides data access abstractions: +- ISpaceRepository: Abstract interface for space CRUD operations +- SqliteSpaceRepository: SQLite implementation +- IDocumentAssociationRepository: Document-space association storage +- IVariableRepository: Space variable storage +- IReferenceRepository: Transclusion reference tracking +""" + +from .interfaces import ( + ISpaceRepository, + IDocumentAssociationRepository, + IVariableRepository, + IReferenceRepository, +) +from .sqlite import ( + SqliteSpaceRepository, + SqliteDocumentRepository, + SqliteVariableRepository, + SqliteReferenceRepository, + initialize_space_tables, +) + +__all__ = [ + # Interfaces + "ISpaceRepository", + "IDocumentAssociationRepository", + "IVariableRepository", + "IReferenceRepository", + # SQLite implementations + "SqliteSpaceRepository", + "SqliteDocumentRepository", + "SqliteVariableRepository", + "SqliteReferenceRepository", + "initialize_space_tables", +] diff --git a/markitect/spaces/repositories/interfaces.py b/markitect/spaces/repositories/interfaces.py new file mode 100644 index 00000000..f305c30e --- /dev/null +++ b/markitect/spaces/repositories/interfaces.py @@ -0,0 +1,409 @@ +""" +Repository interfaces for Information Spaces. + +This module defines abstract base classes for space data access, +following the repository pattern for clean separation of concerns. 
class ISpaceRepository(ABC):
    """
    Persistence contract for ``InformationSpace`` entities.

    Concrete backends supply the CRUD operations declared here and are
    expected to take care of transactions and error handling themselves.
    """

    @abstractmethod
    def create(self, space: InformationSpace) -> InformationSpace:
        """
        Persist a new space.

        Args:
            space: Space to store.

        Returns:
            The stored space, including any generated fields.

        Raises:
            ValueError: When a space with the same name already exists.
        """

    @abstractmethod
    def get_by_id(self, space_id: str) -> Optional[InformationSpace]:
        """
        Look a space up by identifier.

        Args:
            space_id: Unique space identifier.

        Returns:
            The matching space, or None when there is none.
        """

    @abstractmethod
    def get_by_name(self, name: str) -> Optional[InformationSpace]:
        """
        Look a space up by its unique name.

        Args:
            name: Space name.

        Returns:
            The matching space, or None when there is none.
        """

    @abstractmethod
    def list_all(self, include_archived: bool = False) -> List[InformationSpace]:
        """
        Enumerate the stored spaces.

        Args:
            include_archived: Whether archived spaces are part of the result.

        Returns:
            All spaces in the repository.
        """

    @abstractmethod
    def update(self, space: InformationSpace) -> InformationSpace:
        """
        Store new values for an existing space.

        Args:
            space: Space carrying the updated values.

        Returns:
            The updated space.

        Raises:
            ValueError: When the space does not exist.
        """

    @abstractmethod
    def delete(self, space_id: str) -> bool:
        """
        Remove a space.

        Args:
            space_id: Identifier of the space to remove.

        Returns:
            True when a space was deleted, False when none was found.
        """

    @abstractmethod
    def exists(self, space_id: str) -> bool:
        """
        Report whether a space is stored.

        Args:
            space_id: Identifier to check.

        Returns:
            True when the space exists, False otherwise.
        """

    @abstractmethod
    def get_children(self, parent_space_id: str) -> List[InformationSpace]:
        """
        Fetch the spaces whose parent is the given space.

        Args:
            parent_space_id: Identifier of the parent space.

        Returns:
            The child spaces.
        """


class IDocumentAssociationRepository(ABC):
    """
    Persistence contract for ``SpaceDocument`` associations.

    Covers the relationship between documents and the spaces holding them.
    """

    @abstractmethod
    def add_document(self, document: SpaceDocument) -> SpaceDocument:
        """
        Attach a document to a space.

        Args:
            document: Association to create.

        Returns:
            The created association.

        Raises:
            ValueError: When the document path is already taken in the space.
        """

    @abstractmethod
    def get_document(self, document_id: str) -> Optional[SpaceDocument]:
        """
        Fetch an association by identifier.

        Args:
            document_id: Document association identifier.

        Returns:
            The association, or None when there is none.
        """

    @abstractmethod
    def get_by_space_path(self, space_id: str, space_path: str) -> Optional[SpaceDocument]:
        """
        Fetch the document stored at a path inside a space.

        Args:
            space_id: Space identifier.
            space_path: Path inside the space (e.g., "/intro.md").

        Returns:
            The document, or None when there is none.
        """

    @abstractmethod
    def list_by_space(self, space_id: str) -> List[SpaceDocument]:
        """
        Enumerate the documents of a space.

        Args:
            space_id: Space identifier.

        Returns:
            The space's documents, ordered by order_index.
        """

    @abstractmethod
    def update_document(self, document: SpaceDocument) -> SpaceDocument:
        """
        Store new values for an existing association.

        Args:
            document: Association carrying the updated values.

        Returns:
            The updated association.

        Raises:
            ValueError: When the document does not exist.
        """

    @abstractmethod
    def remove_document(self, document_id: str) -> bool:
        """
        Detach a document from its space.

        Args:
            document_id: Document association identifier.

        Returns:
            True when an association was removed, False when none was found.
        """

    @abstractmethod
    def move_document(self, document_id: str, new_space_path: str) -> SpaceDocument:
        """
        Relocate a document to another path in the same space.

        Args:
            document_id: Document association identifier.
            new_space_path: Destination path inside the space.

        Returns:
            The updated association.

        Raises:
            ValueError: When the destination path is already taken.
        """

    @abstractmethod
    def reorder_documents(self, space_id: str, document_ids: List[str]) -> None:
        """
        Apply a new ordering to the documents of a space.

        Args:
            space_id: Space identifier.
            document_ids: Document IDs in the desired order.
        """

    @abstractmethod
    def update_content_hash(self, document_id: str, content_hash: str) -> None:
        """
        Record a new content hash, used for change detection.

        Args:
            document_id: Document association identifier.
            content_hash: The new hash value.
        """


class IVariableRepository(ABC):
    """
    Persistence contract for ``SpaceVariable`` values.

    Holds the space-level variables that make up the transclusion context.
    """

    @abstractmethod
    def set_variable(self, variable: SpaceVariable) -> SpaceVariable:
        """
        Store a variable value.

        Args:
            variable: Variable to save.

        Returns:
            The saved variable.
        """

    @abstractmethod
    def get_variable(self, space_id: str, name: str) -> Optional[SpaceVariable]:
        """
        Fetch a variable by name.

        Args:
            space_id: Space identifier.
            name: Name of the variable.

        Returns:
            The variable, or None when there is none.
        """

    @abstractmethod
    def list_variables(self, space_id: str, scope: Optional[str] = None) -> List[SpaceVariable]:
        """
        Enumerate the variables of a space.

        Args:
            space_id: Space identifier.
            scope: Optional scope to filter by.

        Returns:
            The matching variables.
        """

    @abstractmethod
    def delete_variable(self, space_id: str, name: str) -> bool:
        """
        Remove a variable.

        Args:
            space_id: Space identifier.
            name: Name of the variable.

        Returns:
            True when a variable was deleted, False when none was found.
        """


class IReferenceRepository(ABC):
    """
    Persistence contract for ``TransclusionReference`` records.

    Maintains the dependency graph consulted for cache invalidation.
    """

    @abstractmethod
    def add_reference(self, reference: TransclusionReference) -> TransclusionReference:
        """
        Record a transclusion reference.

        Args:
            reference: Reference to store.

        Returns:
            The saved reference.
        """

    @abstractmethod
    def get_references_from(self, source_doc_id: str, space_id: str) -> List[TransclusionReference]:
        """
        Fetch the references that originate in a document.

        Args:
            source_doc_id: Identifier of the source document.
            space_id: Space identifier.

        Returns:
            References going out of that document.
        """

    @abstractmethod
    def get_references_to(self, target_doc_id: str, space_id: str) -> List[TransclusionReference]:
        """
        Fetch the references that point at a document.

        Args:
            target_doc_id: Identifier of the target document.
            space_id: Space identifier.

        Returns:
            References coming into that document.
        """

    @abstractmethod
    def clear_references_from(self, source_doc_id: str, space_id: str) -> int:
        """
        Drop every reference that originates in a document.

        Args:
            source_doc_id: Identifier of the source document.
            space_id: Space identifier.

        Returns:
            How many references were deleted.
        """

    @abstractmethod
    def get_dependents(self, document_id: str, space_id: str) -> List[str]:
        """
        Fetch the documents that depend on a given document.

        Serves cache invalidation: the result names the documents that
        have to be re-rendered once the target document changes.

        Args:
            document_id: Identifier of the document.
            space_id: Space identifier.

        Returns:
            IDs of the dependent documents.
        """
def initialize_space_tables(db_path: str) -> None:
    """
    Create the space tables and indexes in a SQLite database file.

    The parent directory of the file is created first when it does not
    exist yet.  The schema script only uses ``IF NOT EXISTS`` statements,
    so running this against an already initialized database is harmless.

    Args:
        db_path: Path to the SQLite database file
    """
    parent_dir = Path(db_path).parent
    if not parent_dir.exists():
        parent_dir.mkdir(parents=True, exist_ok=True)

    connection = sqlite3.connect(db_path)
    try:
        connection.cursor().executescript(SPACE_TABLES_SQL)
        connection.commit()
    finally:
        # Release the file handle even when the schema script fails.
        connection.close()
class SqliteSpaceRepository(ISpaceRepository):
    """
    ISpaceRepository implementation that keeps spaces in SQLite.

    InformationSpace entities are stored in the ``spaces`` table of the
    database file given at construction time.  Every public method opens
    its own connection and closes it again before returning.
    """

    def __init__(self, db_path: str):
        """
        Remember the database location and make sure the tables exist.

        Args:
            db_path: Path to the SQLite database file
        """
        self.db_path = db_path
        initialize_space_tables(db_path)

    def _get_connection(self) -> sqlite3.Connection:
        """Open a connection with name-addressable rows and foreign keys on."""
        connection = sqlite3.connect(self.db_path)
        connection.row_factory = sqlite3.Row
        connection.execute("PRAGMA foreign_keys = ON")
        return connection

    def _row_to_space(self, row: sqlite3.Row) -> InformationSpace:
        """Rebuild an InformationSpace from one row of the spaces table."""
        # NULL/empty JSON columns are read as an empty dict.
        metadata_dict = {}
        if row["metadata"]:
            metadata_dict = json.loads(row["metadata"])
        config_dict = {}
        if row["config"]:
            config_dict = json.loads(row["config"])

        def parse_timestamp(text):
            # A missing timestamp falls back to the current local time.
            return datetime.fromisoformat(text) if text else datetime.now()

        return InformationSpace(
            id=row["id"],
            name=row["name"],
            description=row["description"],
            metadata=SpaceMetadata.from_dict(metadata_dict),
            config=SpaceConfig.from_dict(config_dict),
            parent_space_id=row["parent_space_id"],
            status=SpaceStatus(row["status"]),
            created_at=parse_timestamp(row["created_at"]),
            updated_at=parse_timestamp(row["updated_at"]),
        )

    def create(self, space: InformationSpace) -> InformationSpace:
        """
        Insert a new space.

        Args:
            space: The space to store

        Returns:
            The same space object

        Raises:
            ValueError: If a space with the same name is already stored
        """
        connection = self._get_connection()
        try:
            cur = connection.cursor()

            # Names are unique: refuse a duplicate before inserting.
            cur.execute("SELECT id FROM spaces WHERE name = ?", (space.name,))
            if cur.fetchone():
                raise ValueError(f"Space with name '{space.name}' already exists")

            # Model objects are flattened to plain values; anything that is
            # already a plain dict/string is stored as given.
            metadata_json = json.dumps(
                space.metadata.to_dict() if isinstance(space.metadata, SpaceMetadata) else space.metadata
            )
            config_json = json.dumps(
                space.config.to_dict() if isinstance(space.config, SpaceConfig) else space.config
            )
            status_text = space.status.value if isinstance(space.status, SpaceStatus) else space.status

            cur.execute(
                """
                INSERT INTO spaces (id, name, description, metadata, config, parent_space_id, status, created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    space.id,
                    space.name,
                    space.description,
                    metadata_json,
                    config_json,
                    space.parent_space_id,
                    status_text,
                    space.created_at.isoformat(),
                    space.updated_at.isoformat(),
                ),
            )
            connection.commit()
            return space
        finally:
            connection.close()

    def get_by_id(self, space_id: str) -> Optional[InformationSpace]:
        """Return the space with the given ID, or None when there is none."""
        connection = self._get_connection()
        try:
            cur = connection.cursor()
            cur.execute("SELECT * FROM spaces WHERE id = ?", (space_id,))
            found = cur.fetchone()
            if not found:
                return None
            return self._row_to_space(found)
        finally:
            connection.close()

    def get_by_name(self, name: str) -> Optional[InformationSpace]:
        """Return the space with the given name, or None when there is none."""
        connection = self._get_connection()
        try:
            cur = connection.cursor()
            cur.execute("SELECT * FROM spaces WHERE name = ?", (name,))
            found = cur.fetchone()
            if not found:
                return None
            return self._row_to_space(found)
        finally:
            connection.close()

    def list_all(self, include_archived: bool = False) -> List[InformationSpace]:
        """
        List stored spaces ordered by name.

        Spaces with status 'deleted' are never returned; spaces with
        status 'archived' are returned only when include_archived is set.
        """
        if include_archived:
            query = "SELECT * FROM spaces WHERE status != 'deleted' ORDER BY name"
        else:
            query = "SELECT * FROM spaces WHERE status NOT IN ('archived', 'deleted') ORDER BY name"

        connection = self._get_connection()
        try:
            cur = connection.cursor()
            cur.execute(query)
            spaces = []
            for record in cur.fetchall():
                spaces.append(self._row_to_space(record))
            return spaces
        finally:
            connection.close()

    def update(self, space: InformationSpace) -> InformationSpace:
        """
        Write the current state of a space back to its row.

        The space's ``touch()`` is called first, so the stored
        ``updated_at`` is whatever that call leaves on the object.

        Args:
            space: The space to persist (matched by its id)

        Returns:
            The same space object

        Raises:
            ValueError: If no row with the space's id exists
        """
        connection = self._get_connection()
        try:
            cur = connection.cursor()

            cur.execute("SELECT id FROM spaces WHERE id = ?", (space.id,))
            if not cur.fetchone():
                raise ValueError(f"Space with id '{space.id}' does not exist")

            space.touch()  # Update timestamp

            metadata_json = json.dumps(
                space.metadata.to_dict() if isinstance(space.metadata, SpaceMetadata) else space.metadata
            )
            config_json = json.dumps(
                space.config.to_dict() if isinstance(space.config, SpaceConfig) else space.config
            )
            status_text = space.status.value if isinstance(space.status, SpaceStatus) else space.status

            cur.execute(
                """
                UPDATE spaces SET
                    name = ?,
                    description = ?,
                    metadata = ?,
                    config = ?,
                    parent_space_id = ?,
                    status = ?,
                    updated_at = ?
                WHERE id = ?
                """,
                (
                    space.name,
                    space.description,
                    metadata_json,
                    config_json,
                    space.parent_space_id,
                    status_text,
                    space.updated_at.isoformat(),
                    space.id,
                ),
            )
            connection.commit()
            return space
        finally:
            connection.close()

    def delete(self, space_id: str) -> bool:
        """Delete the row of a space; True when a row was actually removed."""
        connection = self._get_connection()
        try:
            cur = connection.cursor()
            cur.execute("DELETE FROM spaces WHERE id = ?", (space_id,))
            connection.commit()
            removed = cur.rowcount
            return removed > 0
        finally:
            connection.close()

    def exists(self, space_id: str) -> bool:
        """Tell whether a space with the given ID is stored."""
        connection = self._get_connection()
        try:
            cur = connection.cursor()
            cur.execute("SELECT 1 FROM spaces WHERE id = ?", (space_id,))
            hit = cur.fetchone()
            return hit is not None
        finally:
            connection.close()

    def get_children(self, parent_space_id: str) -> List[InformationSpace]:
        """List the spaces whose parent is the given space, ordered by name."""
        connection = self._get_connection()
        try:
            cur = connection.cursor()
            cur.execute(
                "SELECT * FROM spaces WHERE parent_space_id = ? ORDER BY name",
                (parent_space_id,),
            )
            children = []
            for record in cur.fetchall():
                children.append(self._row_to_space(record))
            return children
        finally:
            connection.close()
class SqliteDocumentRepository(IDocumentAssociationRepository):
    """
    IDocumentAssociationRepository implementation backed by SQLite.

    Document associations are rows of the ``space_documents`` table.
    Every public method opens its own connection and closes it again
    before returning.
    """

    def __init__(self, db_path: str):
        """Remember the database location and make sure the tables exist."""
        self.db_path = db_path
        initialize_space_tables(db_path)

    def _get_connection(self) -> sqlite3.Connection:
        """Open a connection with name-addressable rows and foreign keys on."""
        connection = sqlite3.connect(self.db_path)
        connection.row_factory = sqlite3.Row
        connection.execute("PRAGMA foreign_keys = ON")
        return connection

    def _row_to_document(self, row: sqlite3.Row) -> SpaceDocument:
        """Rebuild a SpaceDocument from one row of space_documents."""
        # A NULL/empty metadata column is read as an empty dict.
        metadata_dict = {}
        if row["metadata"]:
            metadata_dict = json.loads(row["metadata"])

        added_raw = row["added_at"]
        return SpaceDocument(
            id=row["id"],
            space_id=row["space_id"],
            document_id=row["document_id"],
            space_path=row["space_path"],
            order_index=row["order_index"],
            metadata=metadata_dict,
            content_hash=row["content_hash"],
            # A missing timestamp falls back to the current local time.
            added_at=datetime.fromisoformat(added_raw) if added_raw else datetime.now(),
        )

    def add_document(self, document: SpaceDocument) -> SpaceDocument:
        """
        Insert a document association.

        Args:
            document: The association to store

        Returns:
            The same document object

        Raises:
            ValueError: If the space already holds a document at that path
        """
        connection = self._get_connection()
        try:
            cur = connection.cursor()

            # A path may be used only once per space.
            cur.execute(
                "SELECT id FROM space_documents WHERE space_id = ? AND space_path = ?",
                (document.space_id, document.space_path),
            )
            if cur.fetchone():
                raise ValueError(f"Document path '{document.space_path}' already exists in space")

            values = (
                document.id,
                document.space_id,
                document.document_id,
                document.space_path,
                document.order_index,
                json.dumps(document.metadata),
                document.content_hash,
                document.added_at.isoformat(),
            )
            cur.execute(
                """
                INSERT INTO space_documents (id, space_id, document_id, space_path, order_index, metadata, content_hash, added_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """,
                values,
            )
            connection.commit()
            return document
        finally:
            connection.close()

    def get_document(self, document_id: str) -> Optional[SpaceDocument]:
        """Return the association with the given ID, or None when there is none."""
        connection = self._get_connection()
        try:
            cur = connection.cursor()
            cur.execute("SELECT * FROM space_documents WHERE id = ?", (document_id,))
            found = cur.fetchone()
            if not found:
                return None
            return self._row_to_document(found)
        finally:
            connection.close()

    def get_by_space_path(self, space_id: str, space_path: str) -> Optional[SpaceDocument]:
        """Return the document stored at a path of a space, or None."""
        connection = self._get_connection()
        try:
            cur = connection.cursor()
            cur.execute(
                "SELECT * FROM space_documents WHERE space_id = ? AND space_path = ?",
                (space_id, space_path),
            )
            found = cur.fetchone()
            if not found:
                return None
            return self._row_to_document(found)
        finally:
            connection.close()

    def list_by_space(self, space_id: str) -> List[SpaceDocument]:
        """List a space's documents ordered by order_index, then by path."""
        connection = self._get_connection()
        try:
            cur = connection.cursor()
            cur.execute(
                "SELECT * FROM space_documents WHERE space_id = ? ORDER BY order_index, space_path",
                (space_id,),
            )
            documents = []
            for record in cur.fetchall():
                documents.append(self._row_to_document(record))
            return documents
        finally:
            connection.close()

    def update_document(self, document: SpaceDocument) -> SpaceDocument:
        """
        Write the mutable fields of an association back to its row.

        The row is matched by ``document.id``; space_id and added_at are
        not part of the UPDATE statement.

        Raises:
            ValueError: If no row with the document's id exists
        """
        connection = self._get_connection()
        try:
            cur = connection.cursor()

            cur.execute("SELECT id FROM space_documents WHERE id = ?", (document.id,))
            if not cur.fetchone():
                raise ValueError(f"Document with id '{document.id}' does not exist")

            values = (
                document.document_id,
                document.space_path,
                document.order_index,
                json.dumps(document.metadata),
                document.content_hash,
                document.id,
            )
            cur.execute(
                """
                UPDATE space_documents SET
                    document_id = ?,
                    space_path = ?,
                    order_index = ?,
                    metadata = ?,
                    content_hash = ?
                WHERE id = ?
                """,
                values,
            )
            connection.commit()
            return document
        finally:
            connection.close()

    def remove_document(self, document_id: str) -> bool:
        """Delete an association; True when a row was actually removed."""
        connection = self._get_connection()
        try:
            cur = connection.cursor()
            cur.execute("DELETE FROM space_documents WHERE id = ?", (document_id,))
            connection.commit()
            removed = cur.rowcount
            return removed > 0
        finally:
            connection.close()

    def move_document(self, document_id: str, new_space_path: str) -> SpaceDocument:
        """
        Change the path of a document inside its space.

        Returns:
            The document as read before the move, with space_path set to
            the new path

        Raises:
            ValueError: If the document does not exist, or another
                document of the same space already uses the new path
        """
        connection = self._get_connection()
        try:
            cur = connection.cursor()

            cur.execute("SELECT * FROM space_documents WHERE id = ?", (document_id,))
            current = cur.fetchone()
            if not current:
                raise ValueError(f"Document with id '{document_id}' does not exist")

            # Another document (id != ours) at the target path blocks the move.
            cur.execute(
                "SELECT id FROM space_documents WHERE space_id = ? AND space_path = ? AND id != ?",
                (current["space_id"], new_space_path, document_id),
            )
            if cur.fetchone():
                raise ValueError(f"Document path '{new_space_path}' already exists")

            cur.execute(
                "UPDATE space_documents SET space_path = ? WHERE id = ?",
                (new_space_path, document_id),
            )
            connection.commit()

            moved = self._row_to_document(current)
            moved.space_path = new_space_path
            return moved
        finally:
            connection.close()

    def reorder_documents(self, space_id: str, document_ids: List[str]) -> None:
        """
        Assign order_index 0, 1, 2, ... following the given ID order.

        IDs that do not belong to the space match no row and are
        therefore left untouched.
        """
        connection = self._get_connection()
        try:
            cur = connection.cursor()
            cur.executemany(
                "UPDATE space_documents SET order_index = ? WHERE id = ? AND space_id = ?",
                ((position, doc_id, space_id) for position, doc_id in enumerate(document_ids)),
            )
            connection.commit()
        finally:
            connection.close()

    def update_content_hash(self, document_id: str, content_hash: str) -> None:
        """Store a new content hash for a document association."""
        connection = self._get_connection()
        try:
            cur = connection.cursor()
            cur.execute(
                "UPDATE space_documents SET content_hash = ? WHERE id = ?",
                (content_hash, document_id),
            )
            connection.commit()
        finally:
            connection.close()
class SqliteVariableRepository(IVariableRepository):
    """
    SQLite implementation of the variable repository.

    Variables are rows of the ``space_variables`` table, keyed by
    (space_id, name); values are stored as JSON text.  Every public
    method opens its own connection and closes it before returning.
    """

    def __init__(self, db_path: str):
        """Initialize the repository."""
        self.db_path = db_path
        initialize_space_tables(db_path)

    def _get_connection(self) -> sqlite3.Connection:
        """Get a database connection with foreign keys enabled."""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA foreign_keys = ON")
        return conn

    def _row_to_variable(self, row: sqlite3.Row) -> SpaceVariable:
        """Convert a database row to a SpaceVariable."""
        raw_value = row["value"]
        return SpaceVariable(
            space_id=row["space_id"],
            name=row["name"],
            # A NULL/empty column is read as None rather than fed to json.loads.
            value=json.loads(raw_value) if raw_value else None,
            scope=row["scope"],
        )

    def set_variable(self, variable: SpaceVariable) -> SpaceVariable:
        """
        Create a variable or overwrite the one with the same name.

        Args:
            variable: The variable to store (value must be JSON-serializable)

        Returns:
            The same variable object
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            # INSERT OR REPLACE: an existing (space_id, name) row is replaced.
            cursor.execute(
                """
                INSERT OR REPLACE INTO space_variables (space_id, name, value, scope)
                VALUES (?, ?, ?, ?)
                """,
                (
                    variable.space_id,
                    variable.name,
                    json.dumps(variable.value),
                    variable.scope,
                ),
            )
            conn.commit()
            return variable
        finally:
            conn.close()

    def get_variable(self, space_id: str, name: str) -> Optional[SpaceVariable]:
        """
        Get a variable by name.

        Returns:
            The variable if found, None otherwise
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT * FROM space_variables WHERE space_id = ? AND name = ?",
                (space_id, name),
            )
            row = cursor.fetchone()
            return self._row_to_variable(row) if row else None
        finally:
            conn.close()

    def list_variables(self, space_id: str, scope: Optional[str] = None) -> List[SpaceVariable]:
        """
        List variables in a space, ordered by name.

        Args:
            space_id: The space ID
            scope: Optional scope filter; None or an empty string means
                no filtering

        Returns:
            List of variables
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            # ORDER BY makes the listing deterministic; without it SQLite is
            # free to return the rows in any order.
            if scope:
                cursor.execute(
                    "SELECT * FROM space_variables WHERE space_id = ? AND scope = ? ORDER BY name",
                    (space_id, scope),
                )
            else:
                cursor.execute(
                    "SELECT * FROM space_variables WHERE space_id = ? ORDER BY name",
                    (space_id,),
                )
            return [self._row_to_variable(row) for row in cursor.fetchall()]
        finally:
            conn.close()

    def delete_variable(self, space_id: str, name: str) -> bool:
        """
        Delete a variable.

        Returns:
            True if deleted, False if not found
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "DELETE FROM space_variables WHERE space_id = ? AND name = ?",
                (space_id, name),
            )
            conn.commit()
            return cursor.rowcount > 0
        finally:
            conn.close()
class SqliteReferenceRepository(IReferenceRepository):
    """
    SQLite implementation of the reference repository.

    Transclusion references are rows of the ``transclusion_references``
    table, keyed by (source_doc_id, target_doc_id, space_id).  Every
    public method opens its own connection and closes it before returning.
    """

    def __init__(self, db_path: str):
        """Initialize the repository."""
        self.db_path = db_path
        initialize_space_tables(db_path)

    def _get_connection(self) -> sqlite3.Connection:
        """Get a database connection with foreign keys enabled."""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA foreign_keys = ON")
        return conn

    def _row_to_reference(self, row: sqlite3.Row) -> TransclusionReference:
        """Convert a database row to a TransclusionReference."""
        created_raw = row["created_at"]
        return TransclusionReference(
            source_doc_id=row["source_doc_id"],
            target_doc_id=row["target_doc_id"],
            space_id=row["space_id"],
            # A missing timestamp falls back to the current local time.
            created_at=datetime.fromisoformat(created_raw) if created_raw else datetime.now(),
        )

    def add_reference(self, reference: TransclusionReference) -> TransclusionReference:
        """
        Add a transclusion reference.

        Adding an edge that is already stored replaces the existing row
        (and with it the stored created_at).

        Returns:
            The same reference object
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(
                """
                INSERT OR REPLACE INTO transclusion_references (source_doc_id, target_doc_id, space_id, created_at)
                VALUES (?, ?, ?, ?)
                """,
                (
                    reference.source_doc_id,
                    reference.target_doc_id,
                    reference.space_id,
                    reference.created_at.isoformat(),
                ),
            )
            conn.commit()
            return reference
        finally:
            conn.close()

    def get_references_from(self, source_doc_id: str, space_id: str) -> List[TransclusionReference]:
        """Get references from a source document, ordered by target ID."""
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            # ORDER BY keeps the result order deterministic.
            cursor.execute(
                "SELECT * FROM transclusion_references WHERE source_doc_id = ? AND space_id = ? "
                "ORDER BY target_doc_id",
                (source_doc_id, space_id),
            )
            return [self._row_to_reference(row) for row in cursor.fetchall()]
        finally:
            conn.close()

    def get_references_to(self, target_doc_id: str, space_id: str) -> List[TransclusionReference]:
        """Get references to a target document, ordered by source ID."""
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            # ORDER BY keeps the result order deterministic.
            cursor.execute(
                "SELECT * FROM transclusion_references WHERE target_doc_id = ? AND space_id = ? "
                "ORDER BY source_doc_id",
                (target_doc_id, space_id),
            )
            return [self._row_to_reference(row) for row in cursor.fetchall()]
        finally:
            conn.close()

    def clear_references_from(self, source_doc_id: str, space_id: str) -> int:
        """
        Clear references from a source document.

        Returns:
            Number of references deleted
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "DELETE FROM transclusion_references WHERE source_doc_id = ? AND space_id = ?",
                (source_doc_id, space_id),
            )
            conn.commit()
            return cursor.rowcount
        finally:
            conn.close()

    def get_dependents(self, document_id: str, space_id: str) -> List[str]:
        """
        Get documents that depend on this document.

        Returns:
            Sorted IDs of the documents holding a reference whose target
            is ``document_id``
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT DISTINCT source_doc_id FROM transclusion_references "
                "WHERE target_doc_id = ? AND space_id = ? ORDER BY source_doc_id",
                (document_id, space_id),
            )
            return [row["source_doc_id"] for row in cursor.fetchall()]
        finally:
            conn.close()
+ +This package provides the main orchestration service: +- SpaceService: Main API for space operations +- RenderService: Rendering orchestration (Phase 4) +- SyncService: Synchronization coordination (Phase 5) +""" + +from .space_service import SpaceService + +__all__ = [ + "SpaceService", +] diff --git a/markitect/spaces/services/space_service.py b/markitect/spaces/services/space_service.py new file mode 100644 index 00000000..995a7c3b --- /dev/null +++ b/markitect/spaces/services/space_service.py @@ -0,0 +1,659 @@ +""" +SpaceService - Main orchestration service for Information Spaces. + +This module provides the primary API for space operations, coordinating +between repositories, event handling, and transclusion context management. +""" + +from typing import List, Optional, Dict, Any +from pathlib import Path + +from ..models import ( + InformationSpace, + SpaceDocument, + SpaceVariable, + TransclusionReference, + SpaceStatus, + SpaceConfig, + SpaceMetadata, +) +from ..repositories.interfaces import ( + ISpaceRepository, + IDocumentAssociationRepository, + IVariableRepository, + IReferenceRepository, +) + + +class SpaceService: + """ + Main orchestration service for Information Space operations. + + Provides a high-level API for managing spaces, documents, variables, + and transclusion references. This service coordinates between the + repository layer and future event/rendering systems. 
def __init__(
    self,
    space_repo: ISpaceRepository,
    document_repo: IDocumentAssociationRepository,
    variable_repo: IVariableRepository,
    reference_repo: IReferenceRepository,
):
    """
    Wire the service to the four repositories it coordinates.

    Args:
        space_repo: Repository for space CRUD operations
        document_repo: Repository for document associations
        variable_repo: Repository for space variables
        reference_repo: Repository for transclusion references
    """
    self._space_repo, self._document_repo = space_repo, document_repo
    self._variable_repo, self._reference_repo = variable_repo, reference_repo

# =========================================================================
# Space CRUD Operations
# =========================================================================

def create_space(
    self,
    name: str,
    description: Optional[str] = None,
    config: Optional[SpaceConfig] = None,
    metadata: Optional[SpaceMetadata] = None,
    parent_space_id: Optional[str] = None,
) -> InformationSpace:
    """
    Build a new information space and hand it to the space repository.

    Args:
        name: Name of the space; surrounding whitespace is stripped
        description: Optional description
        config: Optional configuration (a default SpaceConfig is used if None)
        metadata: Optional metadata (a default SpaceMetadata is used if None)
        parent_space_id: Optional ID of an existing parent space

    Returns:
        Whatever the space repository's create() returns

    Raises:
        ValueError: If the name is empty or blank, or the parent space
            cannot be found
    """
    cleaned_name = name.strip() if name else ""
    if not cleaned_name:
        raise ValueError("Space name cannot be empty")

    # A parent is only looked up when one was actually requested.
    if parent_space_id and not self._space_repo.get_by_id(parent_space_id):
        raise ValueError(f"Parent space '{parent_space_id}' not found")

    new_space = InformationSpace(
        name=cleaned_name,
        description=description,
        config=config or SpaceConfig(),
        metadata=metadata or SpaceMetadata(),
        parent_space_id=parent_space_id,
    )
    return self._space_repo.create(new_space)
def get_space(self, space_id: str) -> Optional[InformationSpace]:
    """
    Look a space up by ID.

    Args:
        space_id: The space ID

    Returns:
        The space, or None when the repository has no such ID
    """
    found = self._space_repo.get_by_id(space_id)
    return found

def get_space_by_name(self, name: str) -> Optional[InformationSpace]:
    """
    Look a space up by name.

    Args:
        name: The space name

    Returns:
        The space, or None when the repository has no such name
    """
    found = self._space_repo.get_by_name(name)
    return found

def list_spaces(self, include_archived: bool = False) -> List[InformationSpace]:
    """
    List the spaces known to the repository.

    Args:
        include_archived: Passed through to the repository's list_all()

    Returns:
        List of spaces
    """
    spaces = self._space_repo.list_all(include_archived=include_archived)
    return spaces

def update_space(
    self,
    space_id: str,
    name: Optional[str] = None,
    description: Optional[str] = None,
    config: Optional[SpaceConfig] = None,
    metadata: Optional[SpaceMetadata] = None,
) -> InformationSpace:
    """
    Change selected properties of a space and persist the result.

    Only arguments that are not None are applied; a property therefore
    cannot be reset to None through this method.

    Args:
        space_id: The space ID to update
        name: New name (optional; surrounding whitespace is stripped)
        description: New description (optional)
        config: New config (optional)
        metadata: New metadata (optional)

    Returns:
        Whatever the space repository's update() returns

    Raises:
        ValueError: If the space is not found, the new name is blank,
            or the new name belongs to a different space
    """
    space = self._space_repo.get_by_id(space_id)
    if not space:
        raise ValueError(f"Space '{space_id}' not found")

    if name is not None:
        cleaned_name = name.strip()
        if not cleaned_name:
            raise ValueError("Space name cannot be empty")
        # The space may keep its own name; only a different owner is a clash.
        owner = self._space_repo.get_by_name(cleaned_name)
        if owner and owner.id != space_id:
            raise ValueError(f"Space name '{name}' already exists")
        space.name = cleaned_name

    for attribute, new_value in (
        ("description", description),
        ("config", config),
        ("metadata", metadata),
    ):
        if new_value is not None:
            setattr(space, attribute, new_value)

    return self._space_repo.update(space)
def delete_space(self, space_id: str, cascade: bool = True) -> bool:
    """
    Delete a space, optionally together with its descendants.

    Args:
        space_id: The space ID to delete
        cascade: If True, child spaces are deleted (recursively) first

    Returns:
        False if the space is not found; otherwise the result of the
        space repository's delete()

    Raises:
        ValueError: If the space has children and cascade is False
    """
    if not self._space_repo.get_by_id(space_id):
        return False

    children = self._space_repo.get_children(space_id)
    if children and not cascade:
        raise ValueError(
            f"Space '{space_id}' has {len(children)} child spaces. "
            "Set cascade=True to delete them."
        )

    # Past the guard above, children is either empty or cascade is set,
    # so every child found here is to be removed before its parent.
    for child in children:
        self.delete_space(child.id, cascade=True)

    return self._space_repo.delete(space_id)

def activate_space(self, space_id: str) -> InformationSpace:
    """
    Call activate() on a space and persist it.

    Args:
        space_id: The space ID

    Returns:
        Whatever the space repository's update() returns

    Raises:
        ValueError: If space not found
    """
    target = self._space_repo.get_by_id(space_id)
    if not target:
        raise ValueError(f"Space '{space_id}' not found")

    target.activate()
    return self._space_repo.update(target)
def archive_space(self, space_id: str) -> InformationSpace:
    """
    Call archive() on a space and persist it.

    Args:
        space_id: The space ID

    Returns:
        Whatever the space repository's update() returns

    Raises:
        ValueError: If space not found
    """
    target = self._space_repo.get_by_id(space_id)
    if not target:
        raise ValueError(f"Space '{space_id}' not found")

    target.archive()
    return self._space_repo.update(target)

def get_child_spaces(self, parent_space_id: str) -> List[InformationSpace]:
    """
    List the direct children of a space.

    Args:
        parent_space_id: The parent space ID

    Returns:
        List of child spaces as reported by the space repository
    """
    children = self._space_repo.get_children(parent_space_id)
    return children

# =========================================================================
# Document Operations
# =========================================================================

def add_document(
    self,
    space_id: str,
    space_path: str,
    document_id: Optional[str] = None,
    order_index: int = 0,
    metadata: Optional[Dict[str, Any]] = None,
    content_hash: Optional[str] = None,
) -> SpaceDocument:
    """
    Associate a document with a space.

    Args:
        space_id: The space ID
        space_path: Path within the space (e.g., "/intro.md"); a leading
            "/" is added when missing
        document_id: External document ID (stored as "" when omitted)
        order_index: Position in space ordering
        metadata: Document metadata (an empty dict when omitted)
        content_hash: Content hash for change detection

    Returns:
        Whatever the document repository's add_document() returns

    Raises:
        ValueError: If the space is not found
    """
    if not self._space_repo.exists(space_id):
        raise ValueError(f"Space '{space_id}' not found")

    # Paths are stored in absolute form inside the space.
    normalized_path = space_path if space_path.startswith("/") else "/" + space_path

    association = SpaceDocument(
        space_id=space_id,
        document_id=document_id or "",
        space_path=normalized_path,
        order_index=order_index,
        metadata=metadata or {},
        content_hash=content_hash,
    )
    return self._document_repo.add_document(association)
def get_document(self, document_id: str) -> Optional[SpaceDocument]:
    """
    Look a document association up by its ID.

    Args:
        document_id: The document association ID

    Returns:
        The document, or None when the repository has no such ID
    """
    found = self._document_repo.get_document(document_id)
    return found

def get_document_by_path(
    self, space_id: str, space_path: str
) -> Optional[SpaceDocument]:
    """
    Look a document up by its path inside a space.

    Args:
        space_id: The space ID
        space_path: The path within the space; a leading "/" is added
            when missing

    Returns:
        The document if found, None otherwise
    """
    normalized_path = space_path if space_path.startswith("/") else "/" + space_path
    return self._document_repo.get_by_space_path(space_id, normalized_path)

def list_documents(self, space_id: str) -> List[SpaceDocument]:
    """
    List the documents of a space.

    Args:
        space_id: The space ID

    Returns:
        The documents as returned by the document repository's
        list_by_space()
    """
    documents = self._document_repo.list_by_space(space_id)
    return documents

def remove_document(self, document_id: str) -> bool:
    """
    Remove a document association and its outgoing references.

    When the document exists, the references recorded *from* it (in its
    own space) are cleared before the association itself is removed.

    Args:
        document_id: The document association ID

    Returns:
        The result of the document repository's remove_document()
    """
    existing = self._document_repo.get_document(document_id)
    if existing:
        self._reference_repo.clear_references_from(document_id, existing.space_id)

    removed = self._document_repo.remove_document(document_id)
    return removed
def move_document(self, document_id: str, new_path: str) -> SpaceDocument:
    """
    Give a document a new path inside its space.

    Args:
        document_id: The document association ID
        new_path: The new path; a leading "/" is added when missing

    Returns:
        Whatever the document repository's move_document() returns
    """
    normalized_path = new_path if new_path.startswith("/") else "/" + new_path
    return self._document_repo.move_document(document_id, normalized_path)

def reorder_documents(self, space_id: str, document_ids: List[str]) -> None:
    """
    Apply a new document order to a space.

    Args:
        space_id: The space ID
        document_ids: Document IDs in the desired order
    """
    self._document_repo.reorder_documents(space_id, document_ids)

def update_document_hash(self, document_id: str, content_hash: str) -> None:
    """
    Record a new content hash for a document.

    Args:
        document_id: The document association ID
        content_hash: The new content hash
    """
    self._document_repo.update_content_hash(document_id, content_hash)

# =========================================================================
# Variable Operations
# =========================================================================

def set_variable(
    self,
    space_id: str,
    name: str,
    value: Any,
    scope: str = "space",
) -> SpaceVariable:
    """
    Store a variable in a space.

    Args:
        space_id: The space ID
        name: Variable name
        value: Variable value (any JSON-serializable value)
        scope: Variable scope ("space" or "document")

    Returns:
        Whatever the variable repository's set_variable() returns

    Raises:
        ValueError: If space not found
    """
    if not self._space_repo.exists(space_id):
        raise ValueError(f"Space '{space_id}' not found")

    new_variable = SpaceVariable(
        space_id=space_id,
        name=name,
        value=value,
        scope=scope,
    )
    return self._variable_repo.set_variable(new_variable)
def get_variable(self, space_id: str, name: str) -> Optional[SpaceVariable]:
    """
    Look a variable up by name.

    Args:
        space_id: The space ID
        name: Variable name

    Returns:
        The variable, or None when the repository has none by that name
    """
    found = self._variable_repo.get_variable(space_id, name)
    return found

def list_variables(
    self, space_id: str, scope: Optional[str] = None
) -> List[SpaceVariable]:
    """
    List the variables of a space.

    Args:
        space_id: The space ID
        scope: Optional scope filter, passed through to the repository

    Returns:
        List of variables
    """
    variables = self._variable_repo.list_variables(space_id, scope)
    return variables

def delete_variable(self, space_id: str, name: str) -> bool:
    """
    Delete a variable from a space.

    Args:
        space_id: The space ID
        name: Variable name

    Returns:
        The result of the variable repository's delete_variable()
    """
    deleted = self._variable_repo.delete_variable(space_id, name)
    return deleted

def get_variables_dict(self, space_id: str) -> Dict[str, Any]:
    """
    Collect all variables of a space into a name -> value mapping.

    Intended as input for a transclusion context.  Variables are listed
    without a scope filter.

    Args:
        space_id: The space ID

    Returns:
        Dictionary of variable names to values
    """
    mapping = {}
    for variable in self._variable_repo.list_variables(space_id):
        mapping[variable.name] = variable.value
    return mapping
+ + Args: + space_id: The space ID + + Returns: + Dictionary of variable names to values + """ + variables = self._variable_repo.list_variables(space_id) + return {var.name: var.value for var in variables} + + # ========================================================================= + # Reference Operations + # ========================================================================= + + def add_reference( + self, + source_doc_id: str, + target_doc_id: str, + space_id: str, + ) -> TransclusionReference: + """ + Add a transclusion reference. + + Args: + source_doc_id: The source document ID + target_doc_id: The target document ID + space_id: The space ID + + Returns: + The created reference + """ + reference = TransclusionReference( + source_doc_id=source_doc_id, + target_doc_id=target_doc_id, + space_id=space_id, + ) + return self._reference_repo.add_reference(reference) + + def get_references_from( + self, source_doc_id: str, space_id: str + ) -> List[TransclusionReference]: + """ + Get all references from a source document. + + Args: + source_doc_id: The source document ID + space_id: The space ID + + Returns: + List of references + """ + return self._reference_repo.get_references_from(source_doc_id, space_id) + + def get_references_to( + self, target_doc_id: str, space_id: str + ) -> List[TransclusionReference]: + """ + Get all references to a target document. + + Args: + target_doc_id: The target document ID + space_id: The space ID + + Returns: + List of references + """ + return self._reference_repo.get_references_to(target_doc_id, space_id) + + def clear_references_from(self, source_doc_id: str, space_id: str) -> int: + """ + Clear all references from a source document. 
def get_space_stats(self, space_id: str) -> Dict[str, Any]:
    """
    Summarise a space as a flat dictionary of counts and timestamps.

    Args:
        space_id: The space ID

    Returns:
        Dictionary with the keys ``space_id``, ``name``, ``status``,
        ``document_count``, ``variable_count``, ``child_space_count``,
        ``created_at`` and ``updated_at`` (timestamps as ISO strings)

    Raises:
        ValueError: If space not found
    """
    space = self._space_repo.get_by_id(space_id)
    if not space:
        raise ValueError(f"Space '{space_id}' not found")

    # Fetch every collection first; only their sizes are reported.
    space_documents = self._document_repo.list_by_space(space_id)
    space_variables = self._variable_repo.list_variables(space_id)
    child_spaces = self._space_repo.get_children(space_id)

    stats: Dict[str, Any] = {"space_id": space_id}
    stats["name"] = space.name
    stats["status"] = space.status.value
    stats["document_count"] = len(space_documents)
    stats["variable_count"] = len(space_variables)
    stats["child_space_count"] = len(child_spaces)
    stats["created_at"] = space.created_at.isoformat()
    stats["updated_at"] = space.updated_at.isoformat()
    return stats
def __init__(self, db_path: str):
    """
    Bind the manager to one SQLite database file.

    No connection is opened here: the path is only remembered, and a
    front matter parser is created for later use when markdown files
    are stored.

    Args:
        db_path: Path to SQLite database file
    """
    self.db_path = db_path
    parser = FrontMatterParser()
    self.front_matter_parser = parser
def initialize_finance_schema(self) -> None:
    """
    Initialize finance schema for cost tracking (Issue #88).

    Called at the end of ``initialize_database``. The step is best-effort:
    when the finance package cannot be imported nothing happens, and any
    error raised while creating the finance tables is swallowed so that
    CLI commands keep working. Swallowed errors are logged at DEBUG level.
    """
    import logging

    try:
        # Absolute import on purpose: this module lives in
        # ``markitect.storage``, so the old relative form
        # ``.finance.models`` pointed at ``markitect.storage.finance``
        # instead of the ``markitect.finance`` package.
        from markitect.finance.models import FinanceModels
    except ImportError:
        # Finance module not available, skip initialization
        return

    try:
        FinanceModels(self.db_path).initialize_finance_schema()
    except Exception:
        # Deliberately non-fatal for CLI compatibility, but leave a trace
        # for anyone debugging missing finance tables.
        logging.getLogger(__name__).debug(
            "Finance schema initialization failed", exc_info=True
        )
def get_markdown_file(self, filename: str) -> Optional[Dict[str, Any]]:
    """
    Retrieve a markdown file from the database.

    ``filename`` is not unique in ``markdown_files``; when several records
    share it, the first row SQLite returns is used (no ordering is applied).

    Args:
        filename: Name of the markdown file to retrieve

    Returns:
        Dictionary with ``id``, ``filename``, ``front_matter`` (decoded
        from JSON, ``{}`` when empty), ``content`` and ``created_at``,
        or None if not found

    Raises:
        sqlite3.Error: If the query fails (e.g. the table does not exist)
    """
    conn = sqlite3.connect(self.db_path)
    try:
        row = conn.execute(
            '''
            SELECT id, filename, front_matter, content, created_at
            FROM markdown_files
            WHERE filename = ?
            ''',
            (filename,),
        ).fetchone()
    finally:
        # Release the handle even when the query raises; the previous
        # version only closed it on the success path.
        conn.close()

    if row is None:
        return None

    record_id, stored_name, front_matter_json, content, created_at = row
    return {
        'id': record_id,
        'filename': stored_name,
        'front_matter': json.loads(front_matter_json) if front_matter_json else {},
        'content': content,
        'created_at': created_at,
    }
def execute_query(self, sql: str) -> list:
    """
    Execute a read-only SQL query against the database.

    The statement must start with SELECT or WITH (case-insensitive, after
    stripping whitespace) and must not contain any write/DDL keyword as a
    whole word. The keyword scan looks at the entire statement text,
    string literals included, so a SELECT that merely mentions e.g. the
    word "update" in a literal is rejected as well.

    Args:
        sql: SQL query string (SELECT operations only)

    Returns:
        List of dictionaries representing query results, one per row,
        keyed by column name

    Raises:
        ValueError: If query contains non-SELECT operations
        sqlite3.Error: If query execution fails
    """
    import re

    # Security check: only allow SELECT queries (WITH is accepted for CTEs)
    normalized = sql.strip().upper()
    if not normalized.startswith(('SELECT', 'WITH')):
        raise ValueError("Only SELECT and WITH queries are allowed for safety")

    # Additional safety checks for dangerous keywords (as whole words).
    # Checked in list order so the reported keyword stays deterministic.
    dangerous_keywords = [
        'DROP', 'DELETE', 'UPDATE', 'INSERT', 'CREATE', 'ALTER',
        'TRUNCATE', 'REPLACE', 'PRAGMA'
    ]
    for keyword in dangerous_keywords:
        if re.search(r'\b' + keyword + r'\b', normalized):
            raise ValueError(f"Query contains dangerous keyword: {keyword}")

    conn = sqlite3.connect(self.db_path)
    try:
        conn.row_factory = sqlite3.Row  # Enable column access by name
        rows = conn.execute(sql).fetchall()
        return [dict(row) for row in rows]
    finally:
        # Always release the handle; errors propagate unchanged. The old
        # code closed it only on success or on sqlite3.Error.
        conn.close()
def store_schema_file(self, filename: str, schema_content: str) -> Optional[int]:
    """
    Store a JSON schema file in the database.

    The content must decode to a JSON object. Its ``title`` (falling back
    to ``filename``) and ``description`` (falling back to "") are stored
    next to the raw text. An existing row with the same filename is
    updated in place and keeps its ID; otherwise a new row is inserted.

    Args:
        filename: Name of the schema file
        schema_content: JSON schema content as string

    Returns:
        ID of the inserted/updated record, or None if the content is not
        a JSON object or the database operation failed
    """
    try:
        schema_data = json.loads(schema_content)
    except json.JSONDecodeError:
        return None

    # Valid JSON that is not an object (list, string, null, ...) has no
    # title/description to read; treat it as a failed store instead of
    # crashing on ``.get``.
    if not isinstance(schema_data, dict):
        return None

    title = schema_data.get('title', filename)
    description = schema_data.get('description', '')
    # One clock reading so a fresh row has created_at == updated_at.
    timestamp = datetime.now().isoformat()

    conn = sqlite3.connect(self.db_path)
    try:
        cursor = conn.cursor()

        # Check if schema already exists
        cursor.execute('SELECT id FROM schemas WHERE filename = ?', (filename,))
        existing = cursor.fetchone()

        if existing:
            cursor.execute('''
                UPDATE schemas
                SET title = ?, description = ?, schema_content = ?, updated_at = ?
                WHERE filename = ?
            ''', (title, description, schema_content, timestamp, filename))
            record_id = existing[0]
        else:
            cursor.execute('''
                INSERT INTO schemas (filename, title, description, schema_content, created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (filename, title, description, schema_content, timestamp, timestamp))
            record_id = cursor.lastrowid

        conn.commit()
        return record_id

    except sqlite3.Error:
        conn.rollback()
        return None

    finally:
        conn.close()
+ + Args: + filename: Name of the schema file to delete + + Returns: + True if deletion was successful, False otherwise + """ + conn = sqlite3.connect(self.db_path) + cursor = conn.cursor() + + try: + cursor.execute('DELETE FROM schemas WHERE filename = ?', (filename,)) + success = cursor.rowcount > 0 + conn.commit() + return success + + except sqlite3.Error: + conn.rollback() + return False + + finally: + conn.close() diff --git a/markitect/workspace.py b/markitect/workspace.py index b16bbe36..5e6dc6c5 100644 --- a/markitect/workspace.py +++ b/markitect/workspace.py @@ -1,477 +1,37 @@ """ -Workspace management functionality for Issue #144. +Workspace management - Backward Compatibility Module. -This module provides workspace templates, multi-project support, and -collaborative workspace features. +This module re-exports from markitect.core.workspace for backward compatibility. +New code should import from markitect.core.workspace directly. """ -import json -import yaml -import shutil -import zipfile -import tempfile -from pathlib import Path -from typing import Dict, Any, List, Optional -from dataclasses import dataclass, field -from datetime import datetime - -from markitect.assets import AssetManager - - -@dataclass -class TemplateMetadata: - """Metadata for workspace templates.""" - name: str - description: str - version: str - created_at: datetime - asset_count: int - author: str = "Unknown" - tags: List[str] = field(default_factory=list) - - -@dataclass -class TemplateResult: - """Result of template creation.""" - success: bool - template_path: Path - template_name: str - error: Optional[Exception] = None - - -@dataclass -class WorkspaceCreationResult: - """Result of workspace creation from template.""" - success: bool - workspace_path: Path - project_name: str - error: Optional[Exception] = None - - -@dataclass -class ProjectResult: - """Result of project operations.""" - success: bool - project_path: Path - project_name: str - error: Optional[Exception] = 
None - - -@dataclass -class SyncResult: - """Result of workspace synchronization.""" - synchronized_count: int - skipped_count: int - error_count: int - errors: List[Exception] = field(default_factory=list) - - -@dataclass -class BackupResult: - """Result of workspace backup.""" - success: bool - backup_path: Path - backup_size: int - error: Optional[Exception] = None - - -@dataclass -class RestoreResult: - """Result of workspace restore.""" - success: bool - restored_path: Path - files_restored: int - error: Optional[Exception] = None - - -@dataclass -class WorkspaceState: - """Snapshot of workspace state.""" - timestamp: datetime - file_checksums: Dict[str, str] - directory_structure: List[str] - asset_hashes: List[str] - - -@dataclass -class ConflictInfo: - """Information about a workspace conflict.""" - file_path: Path - conflict_type: str - local_timestamp: datetime - remote_timestamp: datetime - - -@dataclass -class MergeResult: - """Result of conflict resolution.""" - resolved_conflicts: int - unresolved_conflicts: int - merge_strategy: str - - -class WorkspaceTemplate: - """Workspace template management.""" - - def __init__(self, template_path: Path): - """Initialize workspace template.""" - self.template_path = template_path - self.metadata_file = template_path / "template.json" - - def get_metadata(self) -> TemplateMetadata: - """Get template metadata.""" - if self.metadata_file.exists(): - metadata_dict = json.loads(self.metadata_file.read_text()) - return TemplateMetadata(**metadata_dict) - else: - return TemplateMetadata( - name="Unknown", - description="No description", - version="1.0.0", - created_at=datetime.now(), - asset_count=0 - ) - - -class WorkspaceManager: - """Workspace management system.""" - - def __init__(self, templates_dir: Optional[Path] = None): - """Initialize workspace manager.""" - self.templates_dir = templates_dir or Path.home() / ".markitect" / "templates" - self.templates_dir.mkdir(parents=True, exist_ok=True) - - def 
create_template(self, name: str, source_path: Path, description: str = "", - include_assets: bool = True, configuration: Optional[Dict] = None) -> TemplateResult: - """Create a workspace template from existing workspace.""" - try: - template_path = self.templates_dir / name - template_path.mkdir(exist_ok=True) - - # Copy workspace structure - self._copy_workspace_structure(source_path, template_path, include_assets) - - # Count assets - asset_count = 0 - if include_assets and (source_path / "assets").exists(): - asset_count = len(list((source_path / "assets").rglob("*"))) - - # Create template metadata - metadata = { - "name": name, - "description": description, - "version": "1.0.0", - "created_at": datetime.now().isoformat(), - "asset_count": asset_count, - "author": "Unknown", - "tags": [] - } - - metadata_file = template_path / "template.json" - metadata_file.write_text(json.dumps(metadata, indent=2)) - - # Save configuration if provided - if configuration: - config_file = template_path / "markitect.yaml" - config_file.write_text(yaml.dump(configuration, indent=2)) - - return TemplateResult( - success=True, - template_path=template_path, - template_name=name - ) - - except Exception as e: - return TemplateResult( - success=False, - template_path=Path(), - template_name=name, - error=e - ) - - def get_template_metadata(self, template_name: str) -> TemplateMetadata: - """Get metadata for a specific template.""" - template_path = self.templates_dir / template_name - template = WorkspaceTemplate(template_path) - return template.get_metadata() - - def create_workspace_from_template(self, template_name: str, target_path: Path, - project_name: str) -> WorkspaceCreationResult: - """Create a new workspace from a template.""" - try: - template_path = self.templates_dir / template_name - - if not template_path.exists(): - raise FileNotFoundError(f"Template '{template_name}' not found") - - # Create target directory - target_path.mkdir(parents=True, exist_ok=True) - - # 
Copy template contents - self._copy_workspace_structure(template_path, target_path, include_assets=True) - - # Update project-specific files - self._customize_workspace(target_path, project_name) - - return WorkspaceCreationResult( - success=True, - workspace_path=target_path, - project_name=project_name - ) - - except Exception as e: - return WorkspaceCreationResult( - success=False, - workspace_path=target_path, - project_name=project_name, - error=e - ) - - def initialize_multi_project_workspace(self, workspace_root: Path): - """Initialize a multi-project workspace.""" - workspace_root.mkdir(parents=True, exist_ok=True) - - # Create shared directories - (workspace_root / "shared_assets").mkdir(exist_ok=True) - (workspace_root / "templates").mkdir(exist_ok=True) - (workspace_root / "config").mkdir(exist_ok=True) - - # Create workspace configuration - config = { - "workspace_type": "multi_project", - "shared_assets_enabled": True, - "project_isolation": True, - "created_at": datetime.now().isoformat() - } - - config_file = workspace_root / "workspace.yaml" - config_file.write_text(yaml.dump(config, indent=2)) - - def add_project(self, workspace_root: Path, project_name: str, - template: Optional[str] = None) -> ProjectResult: - """Add a project to multi-project workspace.""" - try: - project_path = workspace_root / project_name - project_path.mkdir(exist_ok=True) - - if template: - # Use template if specified - result = self.create_workspace_from_template(template, project_path, project_name) - if not result.success: - raise result.error or Exception("Template creation failed") - else: - # Create basic project structure - (project_path / "docs").mkdir(exist_ok=True) - (project_path / "assets").mkdir(exist_ok=True) - - return ProjectResult( - success=True, - project_path=project_path, - project_name=project_name - ) - - except Exception as e: - return ProjectResult( - success=False, - project_path=workspace_root / project_name, - project_name=project_name, - 
error=e - ) - - def get_shared_asset_library(self, workspace_root: Path) -> Optional[AssetManager]: - """Get shared asset library for multi-project workspace.""" - shared_assets_path = workspace_root / "shared_assets" - if shared_assets_path.exists(): - return AssetManager(storage_path=shared_assets_path) - return None - - def initialize_workspace(self, workspace_path: Path): - """Initialize a single workspace.""" - workspace_path.mkdir(parents=True, exist_ok=True) - (workspace_path / "assets").mkdir(exist_ok=True) - (workspace_path / "docs").mkdir(exist_ok=True) - - def synchronize_assets(self, source_workspace: Path, target_workspace: Path, - sync_mode: str = "incremental") -> SyncResult: - """Synchronize assets between workspaces.""" - result = SyncResult( - synchronized_count=0, - skipped_count=0, - error_count=0 - ) - - try: - source_assets = source_workspace / "assets" - target_assets = target_workspace / "assets" - - if not source_assets.exists(): - return result - - target_assets.mkdir(exist_ok=True) - - # Simple synchronization (copy new files) - for asset_file in source_assets.rglob("*"): - if asset_file.is_file(): - relative_path = asset_file.relative_to(source_assets) - target_file = target_assets / relative_path - - if not target_file.exists() or sync_mode == "overwrite": - target_file.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(asset_file, target_file) - result.synchronized_count += 1 - else: - result.skipped_count += 1 - - except Exception as e: - result.error_count += 1 - result.errors.append(e) - - return result - - def create_backup(self, workspace_path: Path, backup_path: Path, - include_assets: bool = True, compression_level: int = 6) -> BackupResult: - """Create a backup of workspace.""" - try: - with zipfile.ZipFile(backup_path, 'w', zipfile.ZIP_DEFLATED, compresslevel=compression_level) as backup_zip: - for file_path in workspace_path.rglob("*"): - if file_path.is_file(): - # Skip assets if not included - if not include_assets 
and "assets" in file_path.parts: - continue - - arc_name = file_path.relative_to(workspace_path) - backup_zip.write(file_path, arc_name) - - backup_size = backup_path.stat().st_size - - return BackupResult( - success=True, - backup_path=backup_path, - backup_size=backup_size - ) - - except Exception as e: - return BackupResult( - success=False, - backup_path=backup_path, - backup_size=0, - error=e - ) - - def restore_from_backup(self, backup_path: Path, target_path: Path) -> RestoreResult: - """Restore workspace from backup.""" - try: - target_path.mkdir(parents=True, exist_ok=True) - - files_restored = 0 - with zipfile.ZipFile(backup_path, 'r') as backup_zip: - backup_zip.extractall(target_path) - files_restored = len(backup_zip.namelist()) - - return RestoreResult( - success=True, - restored_path=target_path, - files_restored=files_restored - ) - - except Exception as e: - return RestoreResult( - success=False, - restored_path=target_path, - files_restored=0, - error=e - ) - - def capture_workspace_state(self, workspace_path: Path) -> WorkspaceState: - """Capture current state of workspace.""" - import hashlib - - file_checksums = {} - directory_structure = [] - asset_hashes = [] - - for item_path in workspace_path.rglob("*"): - relative_path = str(item_path.relative_to(workspace_path)) - - if item_path.is_file(): - # Calculate file checksum - content = item_path.read_bytes() - checksum = hashlib.md5(content).hexdigest() - file_checksums[relative_path] = checksum - - # Track asset hashes - if "assets" in item_path.parts: - asset_hashes.append(checksum) - - directory_structure.append(relative_path) - - return WorkspaceState( - timestamp=datetime.now(), - file_checksums=file_checksums, - directory_structure=directory_structure, - asset_hashes=asset_hashes - ) - - def detect_conflicts(self, state1: WorkspaceState, state2: WorkspaceState) -> List[ConflictInfo]: - """Detect conflicts between workspace states.""" - conflicts = [] - - # Find files that exist in both 
states but have different checksums - for file_path, checksum1 in state1.file_checksums.items(): - if file_path in state2.file_checksums: - checksum2 = state2.file_checksums[file_path] - if checksum1 != checksum2: - conflict = ConflictInfo( - file_path=Path(file_path), - conflict_type="content_conflict", - local_timestamp=state1.timestamp, - remote_timestamp=state2.timestamp - ) - conflicts.append(conflict) - - return conflicts - - def resolve_conflicts(self, conflicts: List[ConflictInfo], - resolution_strategy: str = "manual") -> MergeResult: - """Resolve workspace conflicts.""" - # Mock conflict resolution - result = MergeResult( - resolved_conflicts=len(conflicts), - unresolved_conflicts=0, - merge_strategy=resolution_strategy - ) - - return result - - def _copy_workspace_structure(self, source: Path, target: Path, include_assets: bool): - """Copy workspace structure from source to target.""" - for item in source.rglob("*"): - if item.is_file(): - relative_path = item.relative_to(source) - - # Skip assets if not included - if not include_assets and "assets" in relative_path.parts: - continue - - # Skip template metadata - if item.name == "template.json": - continue - - target_path = target / relative_path - target_path.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(item, target_path) - - def _customize_workspace(self, workspace_path: Path, project_name: str): - """Customize workspace for specific project.""" - # Update any configuration files with project name - config_files = list(workspace_path.glob("*.yaml")) + list(workspace_path.glob("*.yml")) - - for config_file in config_files: - try: - content = config_file.read_text() - # Replace placeholder project names - content = content.replace("{{PROJECT_NAME}}", project_name) - content = content.replace("New Project", project_name) - config_file.write_text(content) - except Exception: - pass # Ignore errors in customization \ No newline at end of file +# Re-export from core for backward compatibility 
@pytest.fixture
def temp_db():
    """Yield the path of a throwaway ``.db`` file and remove it afterwards."""
    handle, db_path = tempfile.mkstemp(suffix=".db")
    # Only the path is needed; the repositories open the file themselves.
    os.close(handle)
    yield db_path
    still_there = os.path.exists(db_path)
    if still_there:
        os.unlink(db_path)
def test_update_space(self, space_service):
    """Renaming and re-describing a space is returned and persisted."""
    created = space_service.create_space(name="original")
    changes = {"name": "updated", "description": "New description"}

    result = space_service.update_space(created.id, **changes)

    assert result.name == "updated"
    assert result.description == "New description"

    # Re-read through the service to confirm the change reached storage
    reloaded = space_service.get_space(created.id)
    assert reloaded.name == "updated"
Activate + activated = space_service.activate_space(space.id) + assert activated.status == SpaceStatus.ACTIVE + + # Archive + archived = space_service.archive_space(space.id) + assert archived.status == SpaceStatus.ARCHIVED + + def test_delete_space(self, space_service): + """Test deleting a space.""" + space = space_service.create_space(name="to-delete") + + result = space_service.delete_space(space.id) + assert result is True + + # Verify deleted + retrieved = space_service.get_space(space.id) + assert retrieved is None + + def test_list_spaces_excludes_archived(self, space_service): + """Test that list_spaces excludes archived by default.""" + space1 = space_service.create_space(name="active") + space2 = space_service.create_space(name="archived") + space_service.archive_space(space2.id) + + spaces = space_service.list_spaces() + assert len(spaces) == 1 + assert spaces[0].name == "active" + + # Include archived + all_spaces = space_service.list_spaces(include_archived=True) + assert len(all_spaces) == 2 + + +class TestSpaceHierarchy: + """Tests for space hierarchy operations.""" + + def test_create_child_space(self, space_service): + """Test creating a child space.""" + parent = space_service.create_space(name="parent") + child = space_service.create_space( + name="child", + parent_space_id=parent.id, + ) + + assert child.parent_space_id == parent.id + + children = space_service.get_child_spaces(parent.id) + assert len(children) == 1 + assert children[0].id == child.id + + def test_create_nested_hierarchy(self, space_service): + """Test creating a nested space hierarchy.""" + root = space_service.create_space(name="root") + level1 = space_service.create_space(name="level1", parent_space_id=root.id) + level2 = space_service.create_space(name="level2", parent_space_id=level1.id) + + # Verify hierarchy + root_children = space_service.get_child_spaces(root.id) + assert len(root_children) == 1 + assert root_children[0].id == level1.id + + level1_children = 
space_service.get_child_spaces(level1.id) + assert len(level1_children) == 1 + assert level1_children[0].id == level2.id + + def test_delete_space_with_children_cascade(self, space_service): + """Test deleting a space cascades to children.""" + parent = space_service.create_space(name="parent") + child = space_service.create_space(name="child", parent_space_id=parent.id) + + space_service.delete_space(parent.id, cascade=True) + + assert space_service.get_space(parent.id) is None + assert space_service.get_space(child.id) is None + + def test_delete_space_with_children_no_cascade_raises(self, space_service): + """Test deleting a space with children raises if cascade=False.""" + parent = space_service.create_space(name="parent") + space_service.create_space(name="child", parent_space_id=parent.id) + + with pytest.raises(ValueError, match="has 1 child"): + space_service.delete_space(parent.id, cascade=False) + + +class TestDocumentOperations: + """Tests for document operations within spaces.""" + + def test_add_and_list_documents(self, space_service): + """Test adding and listing documents.""" + space = space_service.create_space(name="doc-space") + + doc1 = space_service.add_document( + space.id, + space_path="/intro.md", + document_id="doc-1", + ) + doc2 = space_service.add_document( + space.id, + space_path="/api/endpoints.md", + document_id="doc-2", + ) + + docs = space_service.list_documents(space.id) + assert len(docs) == 2 + + def test_get_document_by_path(self, space_service): + """Test getting a document by its path.""" + space = space_service.create_space(name="doc-space") + space_service.add_document(space.id, "/intro.md", document_id="doc-1") + + doc = space_service.get_document_by_path(space.id, "/intro.md") + assert doc is not None + assert doc.document_id == "doc-1" + + # Also works without leading slash + doc2 = space_service.get_document_by_path(space.id, "intro.md") + assert doc2 is not None + + def test_move_document(self, space_service): + """Test 
moving a document to a new path.""" + space = space_service.create_space(name="doc-space") + doc = space_service.add_document(space.id, "/old-path.md") + + moved = space_service.move_document(doc.id, "/new-path.md") + assert moved.space_path == "/new-path.md" + + # Old path should not exist + old_doc = space_service.get_document_by_path(space.id, "/old-path.md") + assert old_doc is None + + # New path should work + new_doc = space_service.get_document_by_path(space.id, "/new-path.md") + assert new_doc is not None + + def test_remove_document(self, space_service): + """Test removing a document.""" + space = space_service.create_space(name="doc-space") + doc = space_service.add_document(space.id, "/to-remove.md") + + result = space_service.remove_document(doc.id) + assert result is True + + # Verify removed + retrieved = space_service.get_document(doc.id) + assert retrieved is None + + def test_reorder_documents(self, space_service): + """Test reordering documents.""" + space = space_service.create_space(name="doc-space") + doc1 = space_service.add_document(space.id, "/a.md", order_index=0) + doc2 = space_service.add_document(space.id, "/b.md", order_index=1) + doc3 = space_service.add_document(space.id, "/c.md", order_index=2) + + # Reorder: c, a, b + space_service.reorder_documents(space.id, [doc3.id, doc1.id, doc2.id]) + + docs = space_service.list_documents(space.id) + assert docs[0].id == doc3.id + assert docs[1].id == doc1.id + assert docs[2].id == doc2.id + + def test_document_with_metadata(self, space_service): + """Test document with custom metadata.""" + space = space_service.create_space(name="doc-space") + doc = space_service.add_document( + space.id, + "/api.md", + metadata={"title": "API Reference", "order": 5}, + ) + + retrieved = space_service.get_document(doc.id) + assert retrieved.metadata["title"] == "API Reference" + assert retrieved.metadata["order"] == 5 + + def test_update_document_hash(self, space_service): + """Test updating document content 
hash.""" + space = space_service.create_space(name="doc-space") + doc = space_service.add_document(space.id, "/content.md") + + space_service.update_document_hash(doc.id, "hash123abc") + + retrieved = space_service.get_document(doc.id) + assert retrieved.content_hash == "hash123abc" + + +class TestVariableOperations: + """Tests for variable operations within spaces.""" + + def test_set_and_get_variable(self, space_service): + """Test setting and getting a variable.""" + space = space_service.create_space(name="var-space") + + var = space_service.set_variable(space.id, "version", "1.0.0") + assert var.value == "1.0.0" + + retrieved = space_service.get_variable(space.id, "version") + assert retrieved is not None + assert retrieved.value == "1.0.0" + + def test_list_variables(self, space_service): + """Test listing variables.""" + space = space_service.create_space(name="var-space") + space_service.set_variable(space.id, "var1", "value1") + space_service.set_variable(space.id, "var2", "value2") + + variables = space_service.list_variables(space.id) + assert len(variables) == 2 + + def test_list_variables_by_scope(self, space_service): + """Test listing variables filtered by scope.""" + space = space_service.create_space(name="var-space") + space_service.set_variable(space.id, "global", "g", scope="space") + space_service.set_variable(space.id, "local", "l", scope="document") + + space_vars = space_service.list_variables(space.id, scope="space") + assert len(space_vars) == 1 + assert space_vars[0].name == "global" + + def test_delete_variable(self, space_service): + """Test deleting a variable.""" + space = space_service.create_space(name="var-space") + space_service.set_variable(space.id, "temp", "value") + + result = space_service.delete_variable(space.id, "temp") + assert result is True + + retrieved = space_service.get_variable(space.id, "temp") + assert retrieved is None + + def test_get_variables_dict(self, space_service): + """Test getting variables as a 
dictionary.""" + space = space_service.create_space(name="var-space") + space_service.set_variable(space.id, "api_url", "https://api.example.com") + space_service.set_variable(space.id, "version", "2.0") + + variables_dict = space_service.get_variables_dict(space.id) + assert variables_dict == { + "api_url": "https://api.example.com", + "version": "2.0", + } + + def test_variable_with_complex_value(self, space_service): + """Test variable with complex JSON value.""" + space = space_service.create_space(name="var-space") + complex_value = { + "endpoints": ["/api/v1", "/api/v2"], + "config": {"timeout": 30}, + } + space_service.set_variable(space.id, "api_config", complex_value) + + retrieved = space_service.get_variable(space.id, "api_config") + assert retrieved.value == complex_value + + +class TestReferenceTracking: + """Tests for transclusion reference tracking.""" + + def test_add_and_get_references(self, space_service): + """Test adding and getting references.""" + space = space_service.create_space(name="ref-space") + + space_service.add_reference("doc-1", "shared-component", space.id) + space_service.add_reference("doc-2", "shared-component", space.id) + + refs = space_service.get_references_to("shared-component", space.id) + assert len(refs) == 2 + + def test_get_references_from(self, space_service): + """Test getting references from a source document.""" + space = space_service.create_space(name="ref-space") + + space_service.add_reference("doc-1", "component-a", space.id) + space_service.add_reference("doc-1", "component-b", space.id) + + refs = space_service.get_references_from("doc-1", space.id) + assert len(refs) == 2 + targets = [r.target_doc_id for r in refs] + assert "component-a" in targets + assert "component-b" in targets + + def test_get_dependents(self, space_service): + """Test getting dependent documents.""" + space = space_service.create_space(name="ref-space") + + space_service.add_reference("doc-1", "shared", space.id) + 
space_service.add_reference("doc-2", "shared", space.id) + space_service.add_reference("doc-3", "shared", space.id) + + dependents = space_service.get_dependents("shared", space.id) + assert len(dependents) == 3 + assert set(dependents) == {"doc-1", "doc-2", "doc-3"} + + def test_clear_references_from(self, space_service): + """Test clearing references from a source document.""" + space = space_service.create_space(name="ref-space") + + space_service.add_reference("doc-1", "a", space.id) + space_service.add_reference("doc-1", "b", space.id) + space_service.add_reference("doc-2", "a", space.id) + + count = space_service.clear_references_from("doc-1", space.id) + assert count == 2 + + # doc-1 refs should be gone + refs1 = space_service.get_references_from("doc-1", space.id) + assert len(refs1) == 0 + + # doc-2 refs should still exist + refs2 = space_service.get_references_from("doc-2", space.id) + assert len(refs2) == 1 + + def test_remove_document_clears_references(self, space_service): + """Test that removing a document clears its references.""" + space = space_service.create_space(name="ref-space") + doc = space_service.add_document(space.id, "/source.md") + + # Add reference from this document + space_service.add_reference(doc.id, "target", space.id) + + # Verify reference exists + refs = space_service.get_references_from(doc.id, space.id) + assert len(refs) == 1 + + # Remove document + space_service.remove_document(doc.id) + + # References should be cleared + refs = space_service.get_references_from(doc.id, space.id) + assert len(refs) == 0 + + +class TestFullWorkflow: + """End-to-end workflow tests.""" + + def test_documentation_space_workflow(self, space_service): + """Test a complete documentation space workflow.""" + # Create a documentation space + space = space_service.create_space( + name="api-docs", + description="API Documentation", + config=SpaceConfig(theme="minimal"), + metadata=SpaceMetadata(tags=["api", "v2"]), + ) + + # Add documents + intro = 
space_service.add_document( + space.id, + "/intro.md", + order_index=0, + metadata={"title": "Introduction"}, + ) + endpoints = space_service.add_document( + space.id, + "/api/endpoints.md", + order_index=1, + metadata={"title": "API Endpoints"}, + ) + auth = space_service.add_document( + space.id, + "/api/auth.md", + order_index=2, + metadata={"title": "Authentication"}, + ) + + # Add variables for transclusion + space_service.set_variable(space.id, "api_base_url", "https://api.example.com") + space_service.set_variable(space.id, "version", "2.0") + + # Track references (e.g., endpoints includes auth) + space_service.add_reference(endpoints.id, auth.id, space.id) + + # Activate the space + space_service.activate_space(space.id) + + # Get stats + stats = space_service.get_space_stats(space.id) + assert stats["document_count"] == 3 + assert stats["variable_count"] == 2 + assert stats["status"] == "active" + + # Verify the space + retrieved = space_service.get_space(space.id) + assert retrieved.status == SpaceStatus.ACTIVE + + # List documents in order + docs = space_service.list_documents(space.id) + assert len(docs) == 3 + assert docs[0].space_path == "/intro.md" + + # Get transclusion context + context = space_service.get_variables_dict(space.id) + assert context["api_base_url"] == "https://api.example.com" + + # Check dependencies for cache invalidation + dependents = space_service.get_dependents(auth.id, space.id) + assert endpoints.id in dependents + + def test_space_stats(self, space_service): + """Test getting space statistics.""" + space = space_service.create_space(name="stats-test") + space_service.add_document(space.id, "/doc1.md") + space_service.add_document(space.id, "/doc2.md") + space_service.set_variable(space.id, "var1", "value1") + space_service.create_space(name="child", parent_space_id=space.id) + + stats = space_service.get_space_stats(space.id) + + assert stats["name"] == "stats-test" + assert stats["document_count"] == 2 + assert 
stats["variable_count"] == 1 + assert stats["child_space_count"] == 1 + + +class TestErrorHandling: + """Tests for error handling scenarios.""" + + def test_create_space_empty_name_raises(self, space_service): + """Test that empty name raises ValueError.""" + with pytest.raises(ValueError, match="cannot be empty"): + space_service.create_space(name="") + + with pytest.raises(ValueError, match="cannot be empty"): + space_service.create_space(name=" ") + + def test_create_space_duplicate_name_raises(self, space_service): + """Test that duplicate name raises ValueError.""" + space_service.create_space(name="taken") + + with pytest.raises(ValueError, match="already exists"): + space_service.create_space(name="taken") + + def test_update_nonexistent_space_raises(self, space_service): + """Test that updating non-existent space raises ValueError.""" + with pytest.raises(ValueError, match="not found"): + space_service.update_space("non-existent", name="new-name") + + def test_add_document_to_nonexistent_space_raises(self, space_service): + """Test that adding document to non-existent space raises.""" + with pytest.raises(ValueError, match="not found"): + space_service.add_document("non-existent", "/doc.md") + + def test_set_variable_in_nonexistent_space_raises(self, space_service): + """Test that setting variable in non-existent space raises.""" + with pytest.raises(ValueError, match="not found"): + space_service.set_variable("non-existent", "var", "value") + + def test_create_child_with_nonexistent_parent_raises(self, space_service): + """Test that creating child with non-existent parent raises.""" + with pytest.raises(ValueError, match="Parent space.*not found"): + space_service.create_space(name="orphan", parent_space_id="non-existent") diff --git a/tests/unit/spaces/__init__.py b/tests/unit/spaces/__init__.py new file mode 100644 index 00000000..b9174662 --- /dev/null +++ b/tests/unit/spaces/__init__.py @@ -0,0 +1 @@ +"""Unit tests for the spaces package.""" diff --git 
a/tests/unit/spaces/test_models.py b/tests/unit/spaces/test_models.py new file mode 100644 index 00000000..59c7711b --- /dev/null +++ b/tests/unit/spaces/test_models.py @@ -0,0 +1,299 @@ +""" +Unit tests for Information Space models. + +Tests the core domain models: InformationSpace, SpaceDocument, SpaceConfig, SpaceMetadata. +""" + +import pytest +from datetime import datetime +from markitect.spaces.models import ( + InformationSpace, + SpaceDocument, + SpaceConfig, + SpaceMetadata, + SpaceVariable, + TransclusionReference, + SpaceStatus, +) + + +class TestSpaceMetadata: + """Tests for SpaceMetadata dataclass.""" + + def test_default_metadata(self): + """Test default metadata values.""" + metadata = SpaceMetadata() + assert metadata.tags == [] + assert metadata.author is None + assert metadata.custom == {} + + def test_metadata_with_values(self): + """Test metadata with custom values.""" + metadata = SpaceMetadata( + tags=["api", "docs"], + author="test-user", + custom={"version": "1.0"} + ) + assert metadata.tags == ["api", "docs"] + assert metadata.author == "test-user" + assert metadata.custom["version"] == "1.0" + + def test_metadata_to_dict(self): + """Test metadata serialization.""" + metadata = SpaceMetadata(tags=["test"], author="user") + data = metadata.to_dict() + assert data["tags"] == ["test"] + assert data["author"] == "user" + + def test_metadata_from_dict(self): + """Test metadata deserialization.""" + data = {"tags": ["api"], "author": "admin", "custom": {"key": "value"}} + metadata = SpaceMetadata.from_dict(data) + assert metadata.tags == ["api"] + assert metadata.author == "admin" + assert metadata.custom["key"] == "value" + + +class TestSpaceConfig: + """Tests for SpaceConfig dataclass.""" + + def test_default_config(self): + """Test default configuration values.""" + config = SpaceConfig() + assert config.default_variant == "hierarchical" + assert config.enable_caching is True + assert config.theme is None + assert config.history_enabled is 
False + assert config.history_backend == "git" + + def test_config_with_history_enabled(self): + """Test config with git history enabled.""" + config = SpaceConfig(history_enabled=True, history_backend="git") + assert config.history_enabled is True + assert config.history_backend == "git" + + def test_config_to_dict(self): + """Test config serialization.""" + config = SpaceConfig(theme="dark", enable_caching=False) + data = config.to_dict() + assert data["theme"] == "dark" + assert data["enable_caching"] is False + + def test_config_from_dict(self): + """Test config deserialization.""" + data = {"default_variant": "flat", "history_enabled": True} + config = SpaceConfig.from_dict(data) + assert config.default_variant == "flat" + assert config.history_enabled is True + + +class TestSpaceDocument: + """Tests for SpaceDocument dataclass.""" + + def test_default_document(self): + """Test default document values.""" + doc = SpaceDocument() + assert doc.id is not None + assert doc.space_path == "" + assert doc.order_index == 0 + assert doc.metadata == {} + + def test_document_with_values(self): + """Test document with custom values.""" + doc = SpaceDocument( + space_id="space-1", + document_id="doc-1", + space_path="/intro.md", + order_index=1, + content_hash="abc123" + ) + assert doc.space_id == "space-1" + assert doc.space_path == "/intro.md" + assert doc.content_hash == "abc123" + + def test_document_to_dict(self): + """Test document serialization.""" + doc = SpaceDocument(space_path="/test.md") + data = doc.to_dict() + assert data["space_path"] == "/test.md" + assert "id" in data + assert "added_at" in data + + def test_document_from_dict(self): + """Test document deserialization.""" + data = { + "id": "doc-123", + "space_id": "space-1", + "space_path": "/api.md", + "order_index": 5 + } + doc = SpaceDocument.from_dict(data) + assert doc.id == "doc-123" + assert doc.space_path == "/api.md" + assert doc.order_index == 5 + + +class TestInformationSpace: + """Tests for 
InformationSpace dataclass.""" + + def test_space_requires_name(self): + """Test that space name is required.""" + with pytest.raises(ValueError, match="Space name is required"): + InformationSpace(name="") + + def test_space_default_values(self): + """Test default space values.""" + space = InformationSpace(name="test-space") + assert space.name == "test-space" + assert space.id is not None + assert space.status == SpaceStatus.DRAFT + assert space.description is None + assert space.parent_space_id is None + + def test_space_with_config(self): + """Test space with custom config.""" + config = SpaceConfig(theme="minimal", history_enabled=True) + space = InformationSpace( + name="docs", + description="Documentation space", + config=config + ) + assert space.config.theme == "minimal" + assert space.config.history_enabled is True + + def test_space_activation(self): + """Test space lifecycle transitions.""" + space = InformationSpace(name="test") + assert space.status == SpaceStatus.DRAFT + + space.activate() + assert space.status == SpaceStatus.ACTIVE + + space.archive() + assert space.status == SpaceStatus.ARCHIVED + + def test_space_touch_updates_timestamp(self): + """Test that touch() updates the timestamp.""" + space = InformationSpace(name="test") + original_updated = space.updated_at + + import time + time.sleep(0.01) # Small delay to ensure timestamp changes + + space.touch() + assert space.updated_at >= original_updated + + def test_space_to_dict(self): + """Test space serialization.""" + space = InformationSpace( + name="api-docs", + description="API Documentation" + ) + data = space.to_dict() + + assert data["name"] == "api-docs" + assert data["description"] == "API Documentation" + assert data["status"] == "draft" + assert "id" in data + assert "created_at" in data + + def test_space_from_dict(self): + """Test space deserialization.""" + data = { + "id": "space-123", + "name": "my-space", + "description": "Test space", + "status": "active", + "config": 
{"history_enabled": True}, + "metadata": {"tags": ["test"]} + } + space = InformationSpace.from_dict(data) + + assert space.id == "space-123" + assert space.name == "my-space" + assert space.status == SpaceStatus.ACTIVE + assert space.config.history_enabled is True + assert space.metadata.tags == ["test"] + + def test_space_roundtrip_serialization(self): + """Test that to_dict and from_dict are inverse operations.""" + original = InformationSpace( + name="roundtrip-test", + description="Testing serialization", + config=SpaceConfig(theme="dark", history_enabled=True), + metadata=SpaceMetadata(tags=["test", "roundtrip"]) + ) + original.activate() + + data = original.to_dict() + restored = InformationSpace.from_dict(data) + + assert restored.name == original.name + assert restored.description == original.description + assert restored.status == original.status + assert restored.config.theme == original.config.theme + assert restored.metadata.tags == original.metadata.tags + + +class TestSpaceVariable: + """Tests for SpaceVariable dataclass.""" + + def test_variable_creation(self): + """Test variable creation.""" + var = SpaceVariable( + space_id="space-1", + name="version", + value="1.0.0" + ) + assert var.name == "version" + assert var.value == "1.0.0" + assert var.scope == "space" + + def test_variable_to_dict(self): + """Test variable serialization.""" + var = SpaceVariable( + space_id="space-1", + name="config", + value={"key": "value"}, + scope="document" + ) + data = var.to_dict() + assert data["name"] == "config" + assert data["scope"] == "document" + + +class TestTransclusionReference: + """Tests for TransclusionReference dataclass.""" + + def test_reference_creation(self): + """Test transclusion reference creation.""" + ref = TransclusionReference( + source_doc_id="doc-1", + target_doc_id="doc-2", + space_id="space-1" + ) + assert ref.source_doc_id == "doc-1" + assert ref.target_doc_id == "doc-2" + assert ref.created_at is not None + + def 
test_reference_to_dict(self): + """Test reference serialization.""" + ref = TransclusionReference( + source_doc_id="a", + target_doc_id="b", + space_id="s" + ) + data = ref.to_dict() + assert "created_at" in data + assert data["source_doc_id"] == "a" + + +class TestSpaceStatus: + """Tests for SpaceStatus enum.""" + + def test_status_values(self): + """Test status enum values.""" + assert SpaceStatus.DRAFT.value == "draft" + assert SpaceStatus.ACTIVE.value == "active" + assert SpaceStatus.ARCHIVED.value == "archived" + assert SpaceStatus.DELETED.value == "deleted" diff --git a/tests/unit/spaces/test_repositories.py b/tests/unit/spaces/test_repositories.py new file mode 100644 index 00000000..87a9c753 --- /dev/null +++ b/tests/unit/spaces/test_repositories.py @@ -0,0 +1,901 @@ +""" +Unit tests for space repositories. + +Tests the SQLite implementations of: +- ISpaceRepository (SqliteSpaceRepository) +- IDocumentAssociationRepository (SqliteDocumentRepository) +- IVariableRepository (SqliteVariableRepository) +- IReferenceRepository (SqliteReferenceRepository) +""" + +import pytest +import tempfile +import os +from datetime import datetime + +from markitect.spaces.models import ( + InformationSpace, + SpaceDocument, + SpaceVariable, + TransclusionReference, + SpaceStatus, + SpaceConfig, + SpaceMetadata, +) +from markitect.spaces.repositories.sqlite import ( + SqliteSpaceRepository, + SqliteDocumentRepository, + SqliteVariableRepository, + SqliteReferenceRepository, + initialize_space_tables, +) + + +@pytest.fixture +def temp_db(): + """Create a temporary database file for testing.""" + fd, path = tempfile.mkstemp(suffix=".db") + os.close(fd) + yield path + if os.path.exists(path): + os.unlink(path) + + +@pytest.fixture +def space_repo(temp_db): + """Create a SqliteSpaceRepository for testing.""" + return SqliteSpaceRepository(temp_db) + + +@pytest.fixture +def doc_repo(temp_db): + """Create a SqliteDocumentRepository for testing.""" + return 
SqliteDocumentRepository(temp_db) + + +@pytest.fixture +def var_repo(temp_db): + """Create a SqliteVariableRepository for testing.""" + return SqliteVariableRepository(temp_db) + + +@pytest.fixture +def ref_repo(temp_db): + """Create a SqliteReferenceRepository for testing.""" + return SqliteReferenceRepository(temp_db) + + +class TestInitializeSpaceTables: + """Tests for initialize_space_tables function.""" + + def test_creates_tables(self, temp_db): + """Test that initialize_space_tables creates all required tables.""" + import sqlite3 + + initialize_space_tables(temp_db) + + conn = sqlite3.connect(temp_db) + cursor = conn.cursor() + + # Check that all tables exist + cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") + tables = {row[0] for row in cursor.fetchall()} + + assert "spaces" in tables + assert "space_documents" in tables + assert "space_variables" in tables + assert "transclusion_references" in tables + + conn.close() + + def test_idempotent(self, temp_db): + """Test that initialize_space_tables can be called multiple times.""" + initialize_space_tables(temp_db) + initialize_space_tables(temp_db) # Should not raise + + def test_creates_parent_directory(self): + """Test that initialize_space_tables creates parent directories.""" + with tempfile.TemporaryDirectory() as tmpdir: + db_path = os.path.join(tmpdir, "subdir", "nested", "test.db") + initialize_space_tables(db_path) + assert os.path.exists(db_path) + + +class TestSqliteSpaceRepository: + """Tests for SqliteSpaceRepository.""" + + def test_create_space(self, space_repo): + """Test creating a new space.""" + space = InformationSpace(name="test-space", description="A test space") + created = space_repo.create(space) + + assert created.id == space.id + assert created.name == "test-space" + assert created.description == "A test space" + + def test_create_space_duplicate_name_raises(self, space_repo): + """Test that creating a space with duplicate name raises ValueError.""" + space1 = 
InformationSpace(name="duplicate") + space_repo.create(space1) + + space2 = InformationSpace(name="duplicate") + with pytest.raises(ValueError, match="already exists"): + space_repo.create(space2) + + def test_get_by_id(self, space_repo): + """Test retrieving a space by ID.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + retrieved = space_repo.get_by_id(space.id) + assert retrieved is not None + assert retrieved.id == space.id + assert retrieved.name == "test-space" + + def test_get_by_id_not_found(self, space_repo): + """Test that get_by_id returns None for non-existent space.""" + result = space_repo.get_by_id("non-existent-id") + assert result is None + + def test_get_by_name(self, space_repo): + """Test retrieving a space by name.""" + space = InformationSpace(name="named-space") + space_repo.create(space) + + retrieved = space_repo.get_by_name("named-space") + assert retrieved is not None + assert retrieved.name == "named-space" + + def test_get_by_name_not_found(self, space_repo): + """Test that get_by_name returns None for non-existent space.""" + result = space_repo.get_by_name("non-existent") + assert result is None + + def test_list_all_empty(self, space_repo): + """Test listing spaces when none exist.""" + spaces = space_repo.list_all() + assert spaces == [] + + def test_list_all(self, space_repo): + """Test listing all spaces.""" + space1 = InformationSpace(name="alpha") + space2 = InformationSpace(name="beta") + space_repo.create(space1) + space_repo.create(space2) + + spaces = space_repo.list_all() + assert len(spaces) == 2 + names = [s.name for s in spaces] + assert "alpha" in names + assert "beta" in names + + def test_list_all_excludes_archived_by_default(self, space_repo): + """Test that list_all excludes archived spaces by default.""" + space1 = InformationSpace(name="active-space") + space2 = InformationSpace(name="archived-space") + space2.archive() + + space_repo.create(space1) + space_repo.create(space2) + + 
spaces = space_repo.list_all() + assert len(spaces) == 1 + assert spaces[0].name == "active-space" + + def test_list_all_includes_archived_when_requested(self, space_repo): + """Test that list_all includes archived spaces when requested.""" + space1 = InformationSpace(name="active-space") + space2 = InformationSpace(name="archived-space") + space2.archive() + + space_repo.create(space1) + space_repo.create(space2) + + spaces = space_repo.list_all(include_archived=True) + assert len(spaces) == 2 + + def test_update_space(self, space_repo): + """Test updating a space.""" + space = InformationSpace(name="original") + space_repo.create(space) + + space.description = "Updated description" + updated = space_repo.update(space) + + assert updated.description == "Updated description" + + # Verify persisted + retrieved = space_repo.get_by_id(space.id) + assert retrieved.description == "Updated description" + + def test_update_nonexistent_raises(self, space_repo): + """Test that updating a non-existent space raises ValueError.""" + space = InformationSpace(name="non-existent") + with pytest.raises(ValueError, match="does not exist"): + space_repo.update(space) + + def test_delete_space(self, space_repo): + """Test deleting a space.""" + space = InformationSpace(name="to-delete") + space_repo.create(space) + + result = space_repo.delete(space.id) + assert result is True + + # Verify deleted + retrieved = space_repo.get_by_id(space.id) + assert retrieved is None + + def test_delete_nonexistent(self, space_repo): + """Test that deleting a non-existent space returns False.""" + result = space_repo.delete("non-existent-id") + assert result is False + + def test_exists(self, space_repo): + """Test checking if a space exists.""" + space = InformationSpace(name="existing") + space_repo.create(space) + + assert space_repo.exists(space.id) is True + assert space_repo.exists("non-existent") is False + + def test_get_children(self, space_repo): + """Test getting child spaces.""" + parent 
= InformationSpace(name="parent") + space_repo.create(parent) + + child1 = InformationSpace(name="child1", parent_space_id=parent.id) + child2 = InformationSpace(name="child2", parent_space_id=parent.id) + space_repo.create(child1) + space_repo.create(child2) + + children = space_repo.get_children(parent.id) + assert len(children) == 2 + names = [c.name for c in children] + assert "child1" in names + assert "child2" in names + + def test_get_children_empty(self, space_repo): + """Test getting children when none exist.""" + parent = InformationSpace(name="lonely-parent") + space_repo.create(parent) + + children = space_repo.get_children(parent.id) + assert children == [] + + def test_space_with_config_and_metadata(self, space_repo): + """Test creating and retrieving a space with config and metadata.""" + config = SpaceConfig(theme="dark", history_enabled=True) + metadata = SpaceMetadata(tags=["api", "docs"], author="tester") + space = InformationSpace( + name="configured-space", + config=config, + metadata=metadata, + ) + space_repo.create(space) + + retrieved = space_repo.get_by_id(space.id) + assert retrieved.config.theme == "dark" + assert retrieved.config.history_enabled is True + assert retrieved.metadata.tags == ["api", "docs"] + assert retrieved.metadata.author == "tester" + + def test_space_status_persistence(self, space_repo): + """Test that space status is persisted correctly.""" + space = InformationSpace(name="lifecycle-test") + space.activate() + space_repo.create(space) + + retrieved = space_repo.get_by_id(space.id) + assert retrieved.status == SpaceStatus.ACTIVE + + +class TestSqliteDocumentRepository: + """Tests for SqliteDocumentRepository.""" + + def test_add_document(self, doc_repo, space_repo): + """Test adding a document to a space.""" + # First create a space + space = InformationSpace(name="test-space") + space_repo.create(space) + + doc = SpaceDocument( + space_id=space.id, + document_id="doc-123", + space_path="/intro.md", + ) + added = 
doc_repo.add_document(doc) + + assert added.id == doc.id + assert added.space_path == "/intro.md" + + def test_add_document_duplicate_path_raises(self, doc_repo, space_repo): + """Test that adding a document with duplicate path raises ValueError.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + doc1 = SpaceDocument(space_id=space.id, space_path="/same.md") + doc_repo.add_document(doc1) + + doc2 = SpaceDocument(space_id=space.id, space_path="/same.md") + with pytest.raises(ValueError, match="already exists"): + doc_repo.add_document(doc2) + + def test_get_document(self, doc_repo, space_repo): + """Test getting a document by ID.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + doc = SpaceDocument(space_id=space.id, space_path="/test.md") + doc_repo.add_document(doc) + + retrieved = doc_repo.get_document(doc.id) + assert retrieved is not None + assert retrieved.space_path == "/test.md" + + def test_get_document_not_found(self, doc_repo): + """Test that get_document returns None for non-existent document.""" + result = doc_repo.get_document("non-existent") + assert result is None + + def test_get_by_space_path(self, doc_repo, space_repo): + """Test getting a document by space path.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + doc = SpaceDocument(space_id=space.id, space_path="/api/docs.md") + doc_repo.add_document(doc) + + retrieved = doc_repo.get_by_space_path(space.id, "/api/docs.md") + assert retrieved is not None + assert retrieved.id == doc.id + + def test_get_by_space_path_not_found(self, doc_repo, space_repo): + """Test that get_by_space_path returns None for non-existent path.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + result = doc_repo.get_by_space_path(space.id, "/non-existent.md") + assert result is None + + def test_list_by_space(self, doc_repo, space_repo): + """Test listing documents in a space.""" + space = 
InformationSpace(name="test-space") + space_repo.create(space) + + doc1 = SpaceDocument(space_id=space.id, space_path="/first.md", order_index=0) + doc2 = SpaceDocument(space_id=space.id, space_path="/second.md", order_index=1) + doc_repo.add_document(doc1) + doc_repo.add_document(doc2) + + docs = doc_repo.list_by_space(space.id) + assert len(docs) == 2 + assert docs[0].space_path == "/first.md" + assert docs[1].space_path == "/second.md" + + def test_list_by_space_empty(self, doc_repo, space_repo): + """Test listing documents when none exist.""" + space = InformationSpace(name="empty-space") + space_repo.create(space) + + docs = doc_repo.list_by_space(space.id) + assert docs == [] + + def test_update_document(self, doc_repo, space_repo): + """Test updating a document.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + doc = SpaceDocument(space_id=space.id, space_path="/old.md") + doc_repo.add_document(doc) + + doc.content_hash = "newhash123" + updated = doc_repo.update_document(doc) + assert updated.content_hash == "newhash123" + + # Verify persisted + retrieved = doc_repo.get_document(doc.id) + assert retrieved.content_hash == "newhash123" + + def test_update_nonexistent_raises(self, doc_repo): + """Test that updating a non-existent document raises ValueError.""" + doc = SpaceDocument(space_path="/non-existent.md") + with pytest.raises(ValueError, match="does not exist"): + doc_repo.update_document(doc) + + def test_remove_document(self, doc_repo, space_repo): + """Test removing a document.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + doc = SpaceDocument(space_id=space.id, space_path="/to-remove.md") + doc_repo.add_document(doc) + + result = doc_repo.remove_document(doc.id) + assert result is True + + # Verify removed + retrieved = doc_repo.get_document(doc.id) + assert retrieved is None + + def test_remove_nonexistent(self, doc_repo): + """Test that removing a non-existent document returns False.""" 
+ result = doc_repo.remove_document("non-existent") + assert result is False + + def test_move_document(self, doc_repo, space_repo): + """Test moving a document to a new path.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + doc = SpaceDocument(space_id=space.id, space_path="/old-path.md") + doc_repo.add_document(doc) + + moved = doc_repo.move_document(doc.id, "/new-path.md") + assert moved.space_path == "/new-path.md" + + # Verify old path no longer works + old_result = doc_repo.get_by_space_path(space.id, "/old-path.md") + assert old_result is None + + # Verify new path works + new_result = doc_repo.get_by_space_path(space.id, "/new-path.md") + assert new_result is not None + + def test_move_document_to_existing_path_raises(self, doc_repo, space_repo): + """Test that moving to an existing path raises ValueError.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + doc1 = SpaceDocument(space_id=space.id, space_path="/first.md") + doc2 = SpaceDocument(space_id=space.id, space_path="/second.md") + doc_repo.add_document(doc1) + doc_repo.add_document(doc2) + + with pytest.raises(ValueError, match="already exists"): + doc_repo.move_document(doc1.id, "/second.md") + + def test_move_nonexistent_raises(self, doc_repo): + """Test that moving a non-existent document raises ValueError.""" + with pytest.raises(ValueError, match="does not exist"): + doc_repo.move_document("non-existent", "/new-path.md") + + def test_reorder_documents(self, doc_repo, space_repo): + """Test reordering documents within a space.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + doc1 = SpaceDocument(space_id=space.id, space_path="/a.md", order_index=0) + doc2 = SpaceDocument(space_id=space.id, space_path="/b.md", order_index=1) + doc3 = SpaceDocument(space_id=space.id, space_path="/c.md", order_index=2) + doc_repo.add_document(doc1) + doc_repo.add_document(doc2) + doc_repo.add_document(doc3) + + # Reorder: c, a, 
b + doc_repo.reorder_documents(space.id, [doc3.id, doc1.id, doc2.id]) + + docs = doc_repo.list_by_space(space.id) + assert docs[0].id == doc3.id + assert docs[1].id == doc1.id + assert docs[2].id == doc2.id + + def test_update_content_hash(self, doc_repo, space_repo): + """Test updating content hash.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + doc = SpaceDocument(space_id=space.id, space_path="/test.md") + doc_repo.add_document(doc) + + doc_repo.update_content_hash(doc.id, "newhash456") + + retrieved = doc_repo.get_document(doc.id) + assert retrieved.content_hash == "newhash456" + + def test_document_with_metadata(self, doc_repo, space_repo): + """Test document with custom metadata.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + doc = SpaceDocument( + space_id=space.id, + space_path="/with-meta.md", + metadata={"title": "Test Document", "version": "1.0"}, + ) + doc_repo.add_document(doc) + + retrieved = doc_repo.get_document(doc.id) + assert retrieved.metadata["title"] == "Test Document" + assert retrieved.metadata["version"] == "1.0" + + +class TestSqliteVariableRepository: + """Tests for SqliteVariableRepository.""" + + def test_set_variable(self, var_repo, space_repo): + """Test setting a variable.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + var = SpaceVariable( + space_id=space.id, + name="version", + value="1.0.0", + ) + result = var_repo.set_variable(var) + assert result.name == "version" + assert result.value == "1.0.0" + + def test_set_variable_overwrites(self, var_repo, space_repo): + """Test that setting a variable with same name overwrites.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + var1 = SpaceVariable(space_id=space.id, name="config", value="old") + var_repo.set_variable(var1) + + var2 = SpaceVariable(space_id=space.id, name="config", value="new") + var_repo.set_variable(var2) + + retrieved = 
var_repo.get_variable(space.id, "config") + assert retrieved.value == "new" + + def test_get_variable(self, var_repo, space_repo): + """Test getting a variable.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + var = SpaceVariable(space_id=space.id, name="api_key", value="secret123") + var_repo.set_variable(var) + + retrieved = var_repo.get_variable(space.id, "api_key") + assert retrieved is not None + assert retrieved.value == "secret123" + + def test_get_variable_not_found(self, var_repo, space_repo): + """Test that get_variable returns None for non-existent variable.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + result = var_repo.get_variable(space.id, "non-existent") + assert result is None + + def test_list_variables(self, var_repo, space_repo): + """Test listing variables in a space.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + var1 = SpaceVariable(space_id=space.id, name="var1", value="a") + var2 = SpaceVariable(space_id=space.id, name="var2", value="b") + var_repo.set_variable(var1) + var_repo.set_variable(var2) + + variables = var_repo.list_variables(space.id) + assert len(variables) == 2 + names = [v.name for v in variables] + assert "var1" in names + assert "var2" in names + + def test_list_variables_empty(self, var_repo, space_repo): + """Test listing variables when none exist.""" + space = InformationSpace(name="empty-space") + space_repo.create(space) + + variables = var_repo.list_variables(space.id) + assert variables == [] + + def test_list_variables_with_scope_filter(self, var_repo, space_repo): + """Test listing variables filtered by scope.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + var1 = SpaceVariable(space_id=space.id, name="global", value="x", scope="space") + var2 = SpaceVariable(space_id=space.id, name="local", value="y", scope="document") + var_repo.set_variable(var1) + var_repo.set_variable(var2) + + 
space_vars = var_repo.list_variables(space.id, scope="space") + assert len(space_vars) == 1 + assert space_vars[0].name == "global" + + doc_vars = var_repo.list_variables(space.id, scope="document") + assert len(doc_vars) == 1 + assert doc_vars[0].name == "local" + + def test_delete_variable(self, var_repo, space_repo): + """Test deleting a variable.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + var = SpaceVariable(space_id=space.id, name="to-delete", value="bye") + var_repo.set_variable(var) + + result = var_repo.delete_variable(space.id, "to-delete") + assert result is True + + # Verify deleted + retrieved = var_repo.get_variable(space.id, "to-delete") + assert retrieved is None + + def test_delete_nonexistent(self, var_repo, space_repo): + """Test that deleting a non-existent variable returns False.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + result = var_repo.delete_variable(space.id, "non-existent") + assert result is False + + def test_variable_with_complex_value(self, var_repo, space_repo): + """Test variable with complex JSON value.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + complex_value = { + "endpoints": [ + {"url": "/api/v1", "methods": ["GET", "POST"]}, + {"url": "/api/v2", "methods": ["GET"]}, + ], + "config": {"timeout": 30, "retries": 3}, + } + var = SpaceVariable(space_id=space.id, name="api_config", value=complex_value) + var_repo.set_variable(var) + + retrieved = var_repo.get_variable(space.id, "api_config") + assert retrieved.value == complex_value + assert retrieved.value["endpoints"][0]["url"] == "/api/v1" + + +class TestSqliteReferenceRepository: + """Tests for SqliteReferenceRepository.""" + + def test_add_reference(self, ref_repo, space_repo): + """Test adding a transclusion reference.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + ref = TransclusionReference( + source_doc_id="doc-1", + 
target_doc_id="doc-2", + space_id=space.id, + ) + result = ref_repo.add_reference(ref) + assert result.source_doc_id == "doc-1" + assert result.target_doc_id == "doc-2" + + def test_add_reference_overwrites(self, ref_repo, space_repo): + """Test that adding same reference overwrites (no duplicates).""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + ref1 = TransclusionReference( + source_doc_id="doc-1", + target_doc_id="doc-2", + space_id=space.id, + ) + ref_repo.add_reference(ref1) + + # Add same reference again (should not raise) + ref2 = TransclusionReference( + source_doc_id="doc-1", + target_doc_id="doc-2", + space_id=space.id, + ) + ref_repo.add_reference(ref2) + + # Should still only have one reference + refs = ref_repo.get_references_from("doc-1", space.id) + assert len(refs) == 1 + + def test_get_references_from(self, ref_repo, space_repo): + """Test getting references from a source document.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + ref1 = TransclusionReference( + source_doc_id="doc-1", + target_doc_id="doc-2", + space_id=space.id, + ) + ref2 = TransclusionReference( + source_doc_id="doc-1", + target_doc_id="doc-3", + space_id=space.id, + ) + ref_repo.add_reference(ref1) + ref_repo.add_reference(ref2) + + refs = ref_repo.get_references_from("doc-1", space.id) + assert len(refs) == 2 + targets = [r.target_doc_id for r in refs] + assert "doc-2" in targets + assert "doc-3" in targets + + def test_get_references_from_empty(self, ref_repo, space_repo): + """Test getting references when none exist.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + refs = ref_repo.get_references_from("non-existent", space.id) + assert refs == [] + + def test_get_references_to(self, ref_repo, space_repo): + """Test getting references to a target document.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + ref1 = TransclusionReference( + 
source_doc_id="doc-1", + target_doc_id="shared-doc", + space_id=space.id, + ) + ref2 = TransclusionReference( + source_doc_id="doc-2", + target_doc_id="shared-doc", + space_id=space.id, + ) + ref_repo.add_reference(ref1) + ref_repo.add_reference(ref2) + + refs = ref_repo.get_references_to("shared-doc", space.id) + assert len(refs) == 2 + sources = [r.source_doc_id for r in refs] + assert "doc-1" in sources + assert "doc-2" in sources + + def test_clear_references_from(self, ref_repo, space_repo): + """Test clearing references from a source document.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + ref1 = TransclusionReference( + source_doc_id="doc-1", + target_doc_id="doc-2", + space_id=space.id, + ) + ref2 = TransclusionReference( + source_doc_id="doc-1", + target_doc_id="doc-3", + space_id=space.id, + ) + ref_repo.add_reference(ref1) + ref_repo.add_reference(ref2) + + count = ref_repo.clear_references_from("doc-1", space.id) + assert count == 2 + + # Verify cleared + refs = ref_repo.get_references_from("doc-1", space.id) + assert refs == [] + + def test_clear_references_from_empty(self, ref_repo, space_repo): + """Test clearing references when none exist.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + count = ref_repo.clear_references_from("non-existent", space.id) + assert count == 0 + + def test_get_dependents(self, ref_repo, space_repo): + """Test getting dependent documents.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + # doc-1 and doc-2 both reference shared-component + ref1 = TransclusionReference( + source_doc_id="doc-1", + target_doc_id="shared-component", + space_id=space.id, + ) + ref2 = TransclusionReference( + source_doc_id="doc-2", + target_doc_id="shared-component", + space_id=space.id, + ) + ref_repo.add_reference(ref1) + ref_repo.add_reference(ref2) + + dependents = ref_repo.get_dependents("shared-component", space.id) + assert len(dependents) == 2 + 
assert "doc-1" in dependents + assert "doc-2" in dependents + + def test_get_dependents_empty(self, ref_repo, space_repo): + """Test getting dependents when none exist.""" + space = InformationSpace(name="test-space") + space_repo.create(space) + + dependents = ref_repo.get_dependents("orphan-doc", space.id) + assert dependents == [] + + def test_references_isolated_by_space(self, ref_repo, space_repo): + """Test that references are isolated by space.""" + space1 = InformationSpace(name="space-1") + space2 = InformationSpace(name="space-2") + space_repo.create(space1) + space_repo.create(space2) + + # Same source/target IDs in different spaces + ref1 = TransclusionReference( + source_doc_id="doc-1", + target_doc_id="doc-2", + space_id=space1.id, + ) + ref2 = TransclusionReference( + source_doc_id="doc-1", + target_doc_id="doc-2", + space_id=space2.id, + ) + ref_repo.add_reference(ref1) + ref_repo.add_reference(ref2) + + # Each space should have its own reference + refs1 = ref_repo.get_references_from("doc-1", space1.id) + refs2 = ref_repo.get_references_from("doc-1", space2.id) + + assert len(refs1) == 1 + assert len(refs2) == 1 + assert refs1[0].space_id == space1.id + assert refs2[0].space_id == space2.id + + +class TestCascadeDelete: + """Test cascade delete behavior.""" + + def test_deleting_space_cascades_to_documents(self, temp_db): + """Test that deleting a space also deletes its documents.""" + space_repo = SqliteSpaceRepository(temp_db) + doc_repo = SqliteDocumentRepository(temp_db) + + space = InformationSpace(name="test-space") + space_repo.create(space) + + doc = SpaceDocument(space_id=space.id, space_path="/test.md") + doc_repo.add_document(doc) + + # Delete space + space_repo.delete(space.id) + + # Document should also be gone + retrieved = doc_repo.get_document(doc.id) + assert retrieved is None + + def test_deleting_space_cascades_to_variables(self, temp_db): + """Test that deleting a space also deletes its variables.""" + space_repo = 
SqliteSpaceRepository(temp_db) + var_repo = SqliteVariableRepository(temp_db) + + space = InformationSpace(name="test-space") + space_repo.create(space) + + var = SpaceVariable(space_id=space.id, name="var", value="val") + var_repo.set_variable(var) + + # Delete space + space_repo.delete(space.id) + + # Variable should also be gone + retrieved = var_repo.get_variable(space.id, "var") + assert retrieved is None + + def test_deleting_space_cascades_to_references(self, temp_db): + """Test that deleting a space also deletes its references.""" + space_repo = SqliteSpaceRepository(temp_db) + ref_repo = SqliteReferenceRepository(temp_db) + + space = InformationSpace(name="test-space") + space_repo.create(space) + + ref = TransclusionReference( + source_doc_id="doc-1", + target_doc_id="doc-2", + space_id=space.id, + ) + ref_repo.add_reference(ref) + + # Delete space + space_repo.delete(space.id) + + # Reference should also be gone + refs = ref_repo.get_references_from("doc-1", space.id) + assert refs == []