""" MarkdownPackager class for .mdpkg ZIP package creation and extraction. This module implements the MarkdownPackager class that provides .mdpkg ZIP package creation, package extraction with symlink restoration, manifest generation and validation, and asset resolution during packaging. """ import json import re import zipfile from datetime import datetime from pathlib import Path from typing import Dict, List, Set, Optional, Any from .exceptions import PackagingError from .registry import AssetRegistry from .deduplicator import AssetDeduplicator from .constants import ( DEFAULT_MANIFEST_FILENAME, DEFAULT_EXCLUDE_PATTERNS, MANIFEST_FORMAT_VERSION, PACKAGE_EXTENSION ) class MarkdownPackager: """ZIP-based packager for markdown documents with embedded assets.""" def __init__(self, registry: AssetRegistry, deduplicator: AssetDeduplicator, manifest_filename: str = DEFAULT_MANIFEST_FILENAME): """Initialize MarkdownPackager with dependencies. Args: registry: AssetRegistry instance for metadata management. deduplicator: AssetDeduplicator for asset storage and linking. manifest_filename: Name of manifest file in package. """ self.registry = registry self.deduplicator = deduplicator self.manifest_filename = manifest_filename def create_package(self, source_dir: Path, package_path: Path, description: Optional[str] = None, exclude_patterns: Optional[List[str]] = None, metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: """Create .mdpkg package from source directory. Args: source_dir: Directory containing files to package. package_path: Path for the output package file. description: Optional package description. exclude_patterns: File patterns to exclude from packaging. metadata: Optional metadata to include in manifest. Returns: Dictionary containing packaging results. Raises: PackagingError: If package creation fails. """ if not source_dir.exists() or not source_dir.is_dir(): raise PackagingError(f"Source directory does not exist: {source_dir}") if exclude_patterns is None: exclude_patterns = DEFAULT_EXCLUDE_PATTERNS.copy() try: # Collect files to package files_to_package = self._collect_files(source_dir, exclude_patterns) # Identify and process assets assets_info = [] asset_references = set() for file_path in files_to_package: if self._is_text_file(file_path): # Scan for asset references content = file_path.read_text(encoding='utf-8', errors='ignore') file_assets = self.resolve_asset_references(content, source_dir) asset_references.update(file_assets) # Process referenced assets through deduplicator for asset_ref in asset_references: asset_path = source_dir / asset_ref if asset_path.exists(): try: asset_info = self.deduplicator.store_asset(asset_path) assets_info.append({ "path": asset_ref, "content_hash": asset_info["content_hash"], "mime_type": self.registry.detect_mime_type(asset_path), "size": asset_path.stat().st_size }) except Exception as e: # Log warning but continue packaging pass # Create manifest manifest = self.generate_manifest( [str(f.relative_to(source_dir)) for f in files_to_package], assets_info, description=description, metadata=metadata ) # Create ZIP package package_path.parent.mkdir(parents=True, exist_ok=True) with zipfile.ZipFile(package_path, 'w', zipfile.ZIP_DEFLATED) as zf: # Add manifest zf.writestr(self.manifest_filename, json.dumps(manifest, indent=2)) # Add all files for file_path in files_to_package: arcname = str(file_path.relative_to(source_dir)) zf.write(file_path, arcname) return { "package_path": str(package_path), "files": [str(f.relative_to(source_dir)) for f in files_to_package], "assets": assets_info, "assets_processed": len(assets_info), "manifest": manifest } except Exception as e: if isinstance(e, PackagingError): raise raise PackagingError(f"Failed to create package: {e}", cause=e) def extract_package(self, package_path: Path, extract_dir: Path, restore_symlinks: bool = False, missing_asset_handling: str = "warn") -> Dict[str, Any]: """Extract .mdpkg package to directory. Args: package_path: Path to the package file. extract_dir: Directory to extract files to. restore_symlinks: Whether to create symlinks to stored assets. missing_asset_handling: How to handle missing assets ("warn", "error", "ignore"). Returns: Dictionary containing extraction results. Raises: PackagingError: If extraction fails. """ if not package_path.exists(): raise PackagingError(f"Package file does not exist: {package_path}") try: # Extract ZIP file with zipfile.ZipFile(package_path, 'r') as zf: # Read and validate manifest try: manifest_data = zf.read(self.manifest_filename) manifest = json.loads(manifest_data) except KeyError: raise PackagingError("Package missing manifest file") if not self.validate_manifest(manifest): raise PackagingError("Invalid manifest structure") # Create extraction directory extract_dir.mkdir(parents=True, exist_ok=True) # Extract all files zf.extractall(extract_dir) # Remove manifest from extracted files (extract_dir / self.manifest_filename).unlink(missing_ok=True) # Handle asset restoration if requested warnings = [] asset_links_created = 0 if restore_symlinks and "assets" in manifest: for asset in manifest["assets"]: asset_path = extract_dir / asset["path"] content_hash = asset["content_hash"] try: # Get stored asset path stored_path = self.deduplicator.get_asset_path(content_hash) # Create link to stored asset if asset_path.exists(): asset_path.unlink() # Remove extracted copy self.deduplicator.create_asset_link(stored_path, asset_path) asset_links_created += 1 except Exception as e: warning_msg = f"Could not restore asset {asset['path']}: {e}" warnings.append(warning_msg) if missing_asset_handling == "error": raise PackagingError(warning_msg) return { "extracted_files": len(manifest.get("files", [])), "asset_links_created": asset_links_created, "warnings": warnings, "manifest": manifest } except zipfile.BadZipFile: raise PackagingError(f"Invalid or corrupted package file: {package_path}") except Exception as e: if isinstance(e, PackagingError): raise raise PackagingError(f"Failed to extract package: {e}", cause=e) def _collect_files(self, source_dir: Path, exclude_patterns: List[str]) -> List[Path]: """Collect files to package, applying exclude patterns. Args: source_dir: Source directory to scan. exclude_patterns: Patterns to exclude. Returns: List of file paths to include in package. """ import fnmatch files = [] for file_path in source_dir.rglob("*"): if file_path.is_file(): relative_path = str(file_path.relative_to(source_dir)) # Check exclude patterns excluded = False for pattern in exclude_patterns: if fnmatch.fnmatch(relative_path, pattern) or fnmatch.fnmatch(file_path.name, pattern): excluded = True break if not excluded: files.append(file_path) return files def _is_text_file(self, file_path: Path) -> bool: """Check if file is likely a text file that might contain asset references. Args: file_path: Path to the file. Returns: True if file is likely text-based. """ text_extensions = {'.md', '.markdown', '.txt', '.html', '.htm', '.css', '.js', '.json', '.yaml', '.yml'} return file_path.suffix.lower() in text_extensions def resolve_asset_references(self, content: str, base_dir: Path) -> Set[str]: """Resolve asset references in text content. Args: content: Text content to scan for asset references. base_dir: Base directory for resolving relative paths. Returns: Set of relative asset paths found in content. """ asset_paths = set() # Markdown image references: ![alt](path) and ![](path) md_image_pattern = r'!\[.*?\]\(([^)]+)\)' for match in re.finditer(md_image_pattern, content): path = match.group(1) if not self._is_external_url(path): asset_paths.add(self._normalize_path(path)) # Markdown link references: [text](path) md_link_pattern = r'(?]+src=["\']([^"\']+)["\']' for match in re.finditer(html_img_pattern, content, re.IGNORECASE): path = match.group(1) if not self._is_external_url(path): asset_paths.add(self._normalize_path(path)) # HTML link href attributes (for stylesheets, scripts, etc.) html_link_pattern = r'<(?:link|script)[^>]+(?:href|src)=["\']([^"\']+)["\']' for match in re.finditer(html_link_pattern, content, re.IGNORECASE): path = match.group(1) if not self._is_external_url(path) and self._looks_like_file(path): asset_paths.add(self._normalize_path(path)) # HTML anchor href attributes (for downloadable files) html_anchor_pattern = r']+href=["\']([^"\']+)["\']' for match in re.finditer(html_anchor_pattern, content, re.IGNORECASE): path = match.group(1) if not self._is_external_url(path) and self._looks_like_file(path): asset_paths.add(self._normalize_path(path)) return asset_paths def _is_external_url(self, path: str) -> bool: """Check if path is an external URL. Args: path: Path string to check. Returns: True if path looks like an external URL. """ return path.startswith(('http://', 'https://', 'ftp://', 'mailto:', '#')) def _looks_like_file(self, path: str) -> bool: """Check if path looks like a file reference. Args: path: Path string to check. Returns: True if path looks like a file. """ # Skip anchors and query parameters if '#' in path or '?' in path: return False # Must have an extension or be a known file pattern return '.' in path or path.endswith(('/', 'README', 'LICENSE')) def _normalize_path(self, path: str) -> str: """Normalize path by removing leading ./ and ensuring forward slashes. Args: path: Path string to normalize. Returns: Normalized path string. """ # Remove leading ./ if path.startswith('./'): path = path[2:] # Convert backslashes to forward slashes path = path.replace('\\', '/') return path def generate_manifest(self, files: List[str], assets: List[Dict[str, Any]], description: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: """Generate package manifest. Args: files: List of files in the package. assets: List of asset information dictionaries. description: Optional package description. metadata: Optional additional metadata. Returns: Manifest dictionary. """ manifest = { "package_info": { "format_version": MANIFEST_FORMAT_VERSION, "created_at": datetime.now().isoformat(), "description": description, "metadata": metadata or {} }, "files": files, "assets": assets } return manifest def validate_manifest(self, manifest: Dict[str, Any]) -> bool: """Validate manifest structure. Args: manifest: Manifest dictionary to validate. Returns: True if manifest is valid, False otherwise. """ try: # Check required top-level keys required_keys = ["package_info", "files", "assets"] if not all(key in manifest for key in required_keys): return False # Check package_info structure package_info = manifest["package_info"] if "format_version" not in package_info: return False # Check that files is a list if not isinstance(manifest["files"], list): return False # Check that assets is a list if not isinstance(manifest["assets"], list): return False # Validate each asset has required fields for asset in manifest["assets"]: required_asset_keys = ["path", "content_hash", "mime_type"] if not all(key in asset for key in required_asset_keys): return False return True except Exception: return False