Files
markitect-main/markitect/assets/packager.py
tegwick 81d3da5fe7
Some checks failed
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
feat: comprehensive asset management system and testing improvements
Asset Management System (Issue #142):
- Add complete asset management framework with deduplication
- Implement AssetManager, AssetRegistry, and AssetDeduplicator classes
- Add AssetPackager for markdown document packaging
- Create comprehensive test suite for all asset management components
- Add asset constants and custom exceptions for robust error handling

Markdown Processing Enhancements:
- Update markdown_commands.py with improved functionality
- Enhanced parsing and content aggregation capabilities
- Improved filename encoding/decoding for special characters

Test Suite Improvements:
- Add comprehensive tests for Issue #138 markdown parsing
- Enhance Issue #139 content aggregation and end-to-end testing
- Complete test coverage for new asset management features

Examples and Documentation:
- Update BildungsKanonJon.md example with enhanced content
- Generate corresponding HTML output for documentation
- Add asset registry configuration

Development Tools:
- Add install script for simplified setup

This commit represents a major enhancement to MarkiTect's asset handling
capabilities with full test coverage and improved markdown processing.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-12 19:57:31 +02:00

412 lines
15 KiB
Python

"""
MarkdownPackager class for .mdpkg ZIP package creation and extraction.
This module implements the MarkdownPackager class that provides .mdpkg ZIP package
creation, package extraction with symlink restoration, manifest generation and
validation, and asset resolution during packaging.
"""
import json
import re
import zipfile
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Set, Optional, Any
from .exceptions import PackagingError
from .registry import AssetRegistry
from .deduplicator import AssetDeduplicator
from .constants import (
DEFAULT_MANIFEST_FILENAME, DEFAULT_EXCLUDE_PATTERNS,
MANIFEST_FORMAT_VERSION, PACKAGE_EXTENSION
)
class MarkdownPackager:
"""ZIP-based packager for markdown documents with embedded assets."""
def __init__(self, registry: AssetRegistry, deduplicator: AssetDeduplicator,
             manifest_filename: str = DEFAULT_MANIFEST_FILENAME):
    """Wire the packager to its collaborators.

    Nothing is validated or touched on disk here; the arguments are only
    stored for later use by the packaging and extraction methods.

    Args:
        registry: AssetRegistry instance used for asset metadata.
        deduplicator: AssetDeduplicator used for asset storage and linking.
        manifest_filename: Name of the manifest entry inside a package.
    """
    self.manifest_filename = manifest_filename
    self.deduplicator = deduplicator
    self.registry = registry
def create_package(self, source_dir: Path, package_path: Path,
                   description: Optional[str] = None,
                   exclude_patterns: Optional[List[str]] = None,
                   metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Create a .mdpkg package from a source directory.

    The files below ``source_dir`` are collected (minus excluded ones),
    text files are scanned for asset references, every referenced asset
    that exists is stored through the deduplicator, and finally a ZIP
    archive containing a JSON manifest plus all collected files is written.

    Args:
        source_dir: Directory containing files to package.
        package_path: Path for the output package file. Parent directories
            are created when missing. If this path lies inside
            ``source_dir`` the package file itself is never packaged.
        description: Optional package description stored in the manifest.
        exclude_patterns: fnmatch-style patterns to exclude from packaging.
            Defaults to a copy of ``DEFAULT_EXCLUDE_PATTERNS``.
        metadata: Optional metadata to include in the manifest.

    Returns:
        Dictionary with the keys ``package_path``, ``files`` (relative paths
        with forward slashes), ``assets``, ``assets_processed`` and
        ``manifest``.

    Raises:
        PackagingError: If the source directory is missing or package
            creation fails for any other reason.
    """
    # Function-scope imports, matching how this file imports fnmatch.
    import logging
    import os

    logger = logging.getLogger(__name__)

    if not source_dir.exists() or not source_dir.is_dir():
        raise PackagingError(f"Source directory does not exist: {source_dir}")
    if exclude_patterns is None:
        exclude_patterns = DEFAULT_EXCLUDE_PATTERNS.copy()

    def _canonical(path) -> str:
        # Lexical normalisation only: symlinks are deliberately not resolved
        # so linked assets inside source_dir still count as "inside".
        return os.path.normcase(os.path.abspath(path))

    try:
        # A package left inside source_dir by an earlier run would otherwise
        # be added to the very archive that is being (re)written.
        package_key = _canonical(package_path)
        files_to_package = [
            candidate
            for candidate in self._collect_files(source_dir, exclude_patterns)
            if _canonical(candidate) != package_key
        ]
        # Forward slashes keep the manifest identical across platforms and
        # consistent with the names stored in the ZIP archive.
        relative_files = [
            candidate.relative_to(source_dir).as_posix()
            for candidate in files_to_package
        ]

        # Scan text files for asset references.
        asset_references = set()
        for file_path in files_to_package:
            if self._is_text_file(file_path):
                content = file_path.read_text(encoding='utf-8', errors='ignore')
                asset_references.update(
                    self.resolve_asset_references(content, source_dir)
                )

        # Process referenced assets through the deduplicator. Sorting makes
        # the manifest deterministic (set iteration order is not).
        source_root = os.path.abspath(source_dir)
        assets_info = []
        for asset_ref in sorted(asset_references):
            try:
                asset_abs = os.path.abspath(os.path.join(source_root, asset_ref))
                if os.path.commonpath([source_root, asset_abs]) != source_root:
                    # Absolute or "../" references must not pull files from
                    # outside the directory being packaged.
                    logger.warning(
                        "Skipping asset outside source directory: %s", asset_ref
                    )
                    continue
                asset_path = source_dir / asset_ref
                if not asset_path.exists():
                    continue
                asset_info = self.deduplicator.store_asset(asset_path)
                assets_info.append({
                    "path": asset_ref,
                    "content_hash": asset_info["content_hash"],
                    "mime_type": self.registry.detect_mime_type(asset_path),
                    "size": asset_path.stat().st_size
                })
            except Exception as exc:
                # Best effort: a single bad asset must not abort packaging,
                # but the failure is no longer silent.
                logger.warning("Skipping asset %s: %s", asset_ref, exc)

        manifest = self.generate_manifest(
            relative_files,
            assets_info,
            description=description,
            metadata=metadata
        )

        # Write the ZIP package: manifest first, then every collected file.
        package_path.parent.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(package_path, 'w', zipfile.ZIP_DEFLATED) as zf:
            zf.writestr(self.manifest_filename, json.dumps(manifest, indent=2))
            for file_path, arcname in zip(files_to_package, relative_files):
                zf.write(file_path, arcname)

        return {
            "package_path": str(package_path),
            "files": relative_files,
            "assets": assets_info,
            "assets_processed": len(assets_info),
            "manifest": manifest
        }
    except PackagingError:
        raise
    except Exception as e:
        raise PackagingError(f"Failed to create package: {e}", cause=e) from e
def extract_package(self, package_path: Path, extract_dir: Path,
                    restore_symlinks: bool = False,
                    missing_asset_handling: str = "warn") -> Dict[str, Any]:
    """Extract a .mdpkg package into a directory.

    The manifest is read and validated first, then every archive member
    except the manifest is extracted. When ``restore_symlinks`` is set,
    each asset listed in the manifest is replaced by a link to the copy
    held by the deduplicator.

    Args:
        package_path: Path to the package file.
        extract_dir: Directory to extract files to (created when missing).
        restore_symlinks: Whether to create links to stored assets.
        missing_asset_handling: What to do when an asset cannot be
            restored: ``"error"`` raises, ``"ignore"`` skips it silently,
            ``"warn"`` (and any other value) records a message in the
            ``warnings`` list of the result.

    Returns:
        Dictionary with the keys ``extracted_files`` (number of files
        listed in the manifest), ``asset_links_created``, ``warnings`` and
        ``manifest``.

    Raises:
        PackagingError: If the package is missing, corrupted, has no valid
            manifest, extraction fails, or an asset cannot be restored
            while ``missing_asset_handling`` is ``"error"``.
    """
    # Function-scope import, matching how this file imports fnmatch.
    import os

    if not package_path.exists():
        raise PackagingError(f"Package file does not exist: {package_path}")
    try:
        with zipfile.ZipFile(package_path, 'r') as zf:
            # Read and validate the manifest before touching the target.
            try:
                manifest_data = zf.read(self.manifest_filename)
            except KeyError:
                raise PackagingError("Package missing manifest file") from None
            manifest = json.loads(manifest_data)
            if not self.validate_manifest(manifest):
                raise PackagingError("Invalid manifest structure")

            extract_dir.mkdir(parents=True, exist_ok=True)
            # Skip the manifest instead of extracting and deleting it, so a
            # file of the same name already present in extract_dir survives.
            members = [
                name for name in zf.namelist()
                if name != self.manifest_filename
            ]
            zf.extractall(extract_dir, members=members)

        warnings = []
        asset_links_created = 0
        if restore_symlinks and "assets" in manifest:
            extract_root = os.path.abspath(extract_dir)
            for asset in manifest["assets"]:
                asset_path = extract_dir / asset["path"]
                content_hash = asset["content_hash"]
                try:
                    # The manifest comes from the package and is untrusted:
                    # refuse paths that would unlink or link files outside
                    # the extraction directory (absolute paths, "../").
                    target = os.path.abspath(asset_path)
                    if os.path.commonpath([extract_root, target]) != extract_root:
                        raise ValueError("path escapes the extraction directory")
                    stored_path = self.deduplicator.get_asset_path(content_hash)
                    if asset_path.exists():
                        asset_path.unlink()  # Remove extracted copy
                    self.deduplicator.create_asset_link(stored_path, asset_path)
                    asset_links_created += 1
                except Exception as e:
                    warning_msg = f"Could not restore asset {asset['path']}: {e}"
                    if missing_asset_handling == "error":
                        raise PackagingError(warning_msg) from e
                    if missing_asset_handling != "ignore":
                        warnings.append(warning_msg)

        return {
            "extracted_files": len(manifest.get("files", [])),
            "asset_links_created": asset_links_created,
            "warnings": warnings,
            "manifest": manifest
        }
    except zipfile.BadZipFile as exc:
        raise PackagingError(
            f"Invalid or corrupted package file: {package_path}"
        ) from exc
    except PackagingError:
        raise
    except Exception as e:
        raise PackagingError(f"Failed to extract package: {e}", cause=e) from e
def _collect_files(self, source_dir: Path, exclude_patterns: List[str]) -> List[Path]:
    """Collect the files to package, applying exclude patterns.

    A file is excluded when any pattern matches (fnmatch semantics) its
    path relative to ``source_dir``, its file name, or the name of any
    directory it is nested in. The last rule makes directory patterns such
    as ``__pycache__`` or ``.git`` exclude everything below them.

    Args:
        source_dir: Source directory to scan recursively.
        exclude_patterns: fnmatch-style patterns to exclude.

    Returns:
        Sorted list of file paths (still prefixed with ``source_dir``) to
        include in the package. Sorting keeps the result independent of
        filesystem traversal order.
    """
    import fnmatch

    def is_excluded(relative: Path) -> bool:
        # Whole relative path first, then every component (directories and
        # the file name itself).
        candidates = (str(relative),) + relative.parts
        return any(
            fnmatch.fnmatch(candidate, pattern)
            for pattern in exclude_patterns
            for candidate in candidates
        )

    return sorted(
        entry
        for entry in source_dir.rglob("*")
        if entry.is_file() and not is_excluded(entry.relative_to(source_dir))
    )
def _is_text_file(self, file_path: Path) -> bool:
    """Tell whether a file should be scanned for asset references.

    The decision is based solely on the (case-insensitive) file extension;
    the file content is never read.

    Args:
        file_path: Path to the file.

    Returns:
        True if the extension is one of the known text-based formats.
    """
    suffix = file_path.suffix.lower()
    scannable = (
        '.md', '.markdown', '.txt', '.html', '.htm',
        '.css', '.js', '.json', '.yaml', '.yml',
    )
    return any(suffix == extension for extension in scannable)
def resolve_asset_references(self, content: str, base_dir: Path) -> Set[str]:
    """Find local asset references in text content.

    Recognised forms are Markdown images and links, HTML ``<img src>``,
    ``<link href>`` / ``<script src>`` and ``<a href>``. External URLs are
    dropped; for links, stylesheets, scripts and anchors the target must
    additionally look like a file. Markdown targets are cleaned first, so
    an optional title (``![alt](pic.png "Title")``) or angle brackets
    (``![alt](<my pic.png>)``) do not end up in the returned path.

    Args:
        content: Text content to scan for asset references.
        base_dir: Base directory for resolving relative paths. It is
            currently not consulted: paths are returned as written in
            ``content`` (normalised), not resolved against the filesystem.

    Returns:
        Set of normalised relative asset paths found in content.
    """
    def clean_markdown_target(target: str) -> str:
        # Drop surrounding whitespace, a trailing quoted title, and the
        # optional <...> wrapper; spaces inside the path are kept.
        target = target.strip()
        titled = re.match(r'^(.*?)\s+(?:"[^"]*"|\'[^\']*\')$', target, re.DOTALL)
        if titled:
            target = titled.group(1)
        if len(target) >= 2 and target.startswith('<') and target.endswith('>'):
            target = target[1:-1]
        return target

    # (pattern, regex flags, is Markdown syntax, target must look like a file)
    reference_patterns = (
        # Markdown image references: ![alt](path) and ![](path)
        (r'!\[.*?\]\(([^)]+)\)', 0, True, False),
        # Markdown link references: [text](path)
        (r'(?<!\!)\[.*?\]\(([^)]+)\)', 0, True, True),
        # HTML img src attributes
        (r'<img[^>]+src=["\']([^"\']+)["\']', re.IGNORECASE, False, False),
        # HTML link href / script src attributes
        (r'<(?:link|script)[^>]+(?:href|src)=["\']([^"\']+)["\']',
         re.IGNORECASE, False, True),
        # HTML anchor href attributes (for downloadable files)
        (r'<a[^>]+href=["\']([^"\']+)["\']', re.IGNORECASE, False, True),
    )

    asset_paths = set()
    for pattern, flags, is_markdown, must_look_like_file in reference_patterns:
        for match in re.finditer(pattern, content, flags):
            path = match.group(1)
            if is_markdown:
                path = clean_markdown_target(path)
            if not path or self._is_external_url(path):
                continue
            if must_look_like_file and not self._looks_like_file(path):
                continue
            asset_paths.add(self._normalize_path(path))
    return asset_paths
def _is_external_url(self, path: str) -> bool:
    """Check whether a reference points outside the local file tree.

    A reference is external when it is a pure fragment (``#section``), a
    protocol-relative URL (``//host/file``) or starts with a URI scheme
    such as ``http:``, ``https:``, ``ftp:``, ``mailto:``, ``data:`` or
    ``tel:``. Scheme matching is case-insensitive. A single letter before
    the colon is not treated as a scheme so that Windows drive paths like
    ``C:\\docs\\a.png`` stay local.

    Args:
        path: Path string to check.

    Returns:
        True if path looks like an external URL.
    """
    candidate = path.strip()
    if candidate.startswith(('#', '//')):
        return True
    # URI scheme per RFC 3986, but requiring at least two characters.
    return re.match(r'[A-Za-z][A-Za-z0-9+.\-]+:', candidate) is not None
def _looks_like_file(self, path: str) -> bool:
    """Heuristically decide whether a link target names a file.

    Targets carrying a fragment (``#``) or a query string (``?``) are
    rejected. Otherwise the target qualifies when it contains a dot
    anywhere, or ends with ``/``, ``README`` or ``LICENSE``.

    Args:
        path: Path string to check.

    Returns:
        True if path looks like a file.
    """
    has_anchor_or_query = any(marker in path for marker in ('#', '?'))
    if has_anchor_or_query:
        return False
    if '.' in path:
        return True
    return path.endswith(('/', 'README', 'LICENSE'))
def _normalize_path(self, path: str) -> str:
    """Normalize a reference to forward slashes without a leading ``./``.

    Backslashes are converted before the prefix is stripped, so Windows
    style references such as ``.\\img\\a.png`` normalise to ``img/a.png``
    as well. Repeated ``./`` prefixes are all removed. Parent references
    (``../``) are left untouched.

    Args:
        path: Path string to normalize.

    Returns:
        Normalized path string.
    """
    normalized = path.replace('\\', '/')
    while normalized.startswith('./'):
        normalized = normalized[2:]
    return normalized
def generate_manifest(self, files: List[str], assets: List[Dict[str, Any]],
                      description: Optional[str] = None,
                      metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Build the manifest dictionary stored inside a package.

    The manifest has three top-level entries: ``package_info`` (format
    version, creation timestamp from ``datetime.now()``, description and
    metadata), ``files`` and ``assets``. The ``files`` and ``assets``
    arguments are embedded as given, without copying.

    Args:
        files: List of files in the package.
        assets: List of asset information dictionaries.
        description: Optional package description.
        metadata: Optional additional metadata; a falsy value is stored as
            an empty dictionary.

    Returns:
        Manifest dictionary.
    """
    package_info = {
        "format_version": MANIFEST_FORMAT_VERSION,
        "created_at": datetime.now().isoformat(),
        "description": description,
        "metadata": metadata if metadata else {},
    }
    return {"package_info": package_info, "files": files, "assets": assets}
def validate_manifest(self, manifest: Dict[str, Any]) -> bool:
    """Validate the structure of a manifest.

    A valid manifest is a dictionary with the keys ``package_info``,
    ``files`` and ``assets`` where ``package_info`` is a dictionary
    containing ``format_version``, ``files`` and ``assets`` are lists, and
    every asset is a dictionary with ``path``, ``content_hash`` and
    ``mime_type``. Container types are checked explicitly because a bare
    ``in`` test would also succeed on strings (substring match) and lists.

    Args:
        manifest: Manifest dictionary to validate (typically parsed JSON,
            so any JSON value may actually arrive here).

    Returns:
        True if manifest is valid, False otherwise. Never raises.
    """
    if not isinstance(manifest, dict):
        return False
    if any(key not in manifest for key in ("package_info", "files", "assets")):
        return False

    package_info = manifest["package_info"]
    if not isinstance(package_info, dict) or "format_version" not in package_info:
        return False

    if not isinstance(manifest["files"], list):
        return False
    if not isinstance(manifest["assets"], list):
        return False

    required_asset_keys = ("path", "content_hash", "mime_type")
    return all(
        isinstance(asset, dict) and all(key in asset for key in required_asset_keys)
        for asset in manifest["assets"]
    )