#!/usr/bin/env python3
"""
Implementation example for Issue #141 - Concept B: Package + Symlinks Asset Management

This is a working prototype demonstrating the core concepts for handling
images and file includes with automatic deduplication.
"""

import hashlib
import json
import zipfile
import shutil
import os
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple


class AssetRegistry:
    """Manages the shared asset registry for deduplication.

    The registry is a JSON document of the form
    ``{"assets": {<sha256 hex>: <metadata dict>}, "version": "1.0"}`` that is
    loaded on construction and rewritten after every registration.
    """

    # Extension (lower-case, with leading dot) -> MIME type.
    _MIME_MAP = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".gif": "image/gif",
        ".svg": "image/svg+xml",
        ".pdf": "application/pdf",
        ".txt": "text/plain",
        ".md": "text/markdown",
    }

    # Read size used when hashing, so large assets are never fully in memory.
    _HASH_CHUNK_SIZE = 1024 * 1024

    def __init__(self, registry_path: Path):
        """Create the registry's parent directory and load existing data.

        Args:
            registry_path: Location of the JSON registry file.
        """
        self.registry_path = registry_path
        self.registry_path.parent.mkdir(parents=True, exist_ok=True)
        self.registry = self._load_registry()

    @staticmethod
    def _empty_registry() -> Dict:
        """Return a fresh, empty registry structure."""
        return {"assets": {}, "version": "1.0"}

    def _load_registry(self) -> Dict:
        """Load existing registry or create empty one.

        An unreadable, undecodable, or structurally invalid file yields an
        empty registry instead of an exception, so a damaged registry never
        prevents the workspace from being opened.
        """
        if not self.registry_path.exists():
            return self._empty_registry()
        try:
            data = json.loads(self.registry_path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError, OSError):
            return self._empty_registry()

        # Valid JSON is not necessarily a valid registry (e.g. "[]" or a dict
        # without "assets"); normalise so later lookups cannot fail.
        if not isinstance(data, dict):
            return self._empty_registry()
        if not isinstance(data.get("assets"), dict):
            data["assets"] = {}
        data.setdefault("version", "1.0")
        return data

    def _save_registry(self):
        """Save registry to disk.

        The JSON is written to a sibling temporary file and then moved over
        the real file, so an interrupted write cannot leave a truncated
        registry behind.
        """
        payload = json.dumps(self.registry, indent=2)
        tmp_path = self.registry_path.with_name(self.registry_path.name + ".tmp")
        tmp_path.write_text(payload, encoding="utf-8")
        os.replace(tmp_path, self.registry_path)

    def get_content_hash(self, file_path: Path) -> str:
        """Calculate SHA-256 hash of file content.

        Args:
            file_path: File to hash; read in fixed-size chunks.

        Returns:
            The hexadecimal SHA-256 digest of the file's bytes.
        """
        digest = hashlib.sha256()
        with file_path.open("rb") as handle:
            for chunk in iter(lambda: handle.read(self._HASH_CHUNK_SIZE), b""):
                digest.update(chunk)
        return digest.hexdigest()

    def register_asset(self, file_path: Path, content_hash: str) -> Dict:
        """Register a new asset in the registry.

        Args:
            file_path: Source file the metadata (name, size, suffix) is
                taken from.
            content_hash: Key under which the asset is recorded.

        Returns:
            The metadata dictionary that was stored and persisted.
        """
        asset_info = {
            "original_name": file_path.name,
            "size": file_path.stat().st_size,
            "mime_type": self._guess_mime_type(file_path.suffix),
            "extension": file_path.suffix,
            "created": datetime.now().isoformat(),
            "stored_path": f"images/{content_hash}{file_path.suffix}",
        }
        self.registry["assets"][content_hash] = asset_info
        self._save_registry()
        return asset_info

    def find_asset(self, content_hash: str) -> Optional[Dict]:
        """Find asset by content hash; returns ``None`` when unknown."""
        return self.registry["assets"].get(content_hash)

    def _guess_mime_type(self, extension: str) -> str:
        """Simple MIME type guessing based on extension.

        The lookup is case-insensitive; unknown extensions map to
        ``application/octet-stream``.
        """
        return self._MIME_MAP.get(extension.lower(), "application/octet-stream")


class AssetDeduplicator:
    """Handles asset storage and deduplication using symlinks.

    Unique content is stored once under ``<workspace>/shared_assets/images``
    and each document's ``assets`` directory receives a relative symlink
    (or a copy when symlinks are unavailable) pointing at the stored file.
    """

    def __init__(self, workspace_path: Path):
        """Set up the shared storage directories and open the registry.

        Args:
            workspace_path: Root directory of the workspace.
        """
        self.workspace = workspace_path
        self.shared_assets = workspace_path / "shared_assets"
        self.shared_images = self.shared_assets / "images"
        self.registry = AssetRegistry(self.shared_assets / "registry.json")

        # Create directory structure
        self.shared_images.mkdir(parents=True, exist_ok=True)

    def add_asset(self, source_path: Path, document_dir: Path, virtual_name: str) -> Tuple[str, Path]:
        """Add asset with deduplication.

        Args:
            source_path: File to import.
            document_dir: Document directory whose ``assets`` folder gets
                the link.
            virtual_name: Name of the link inside ``assets``.

        Returns:
            ``(content_hash, stored_path)``.

        Raises:
            FileNotFoundError: If ``source_path`` does not exist.
        """
        if not source_path.exists():
            raise FileNotFoundError(f"Source asset not found: {source_path}")

        # Calculate content hash
        content_hash = self.registry.get_content_hash(source_path)

        # Check if we already have this content
        existing_asset = self.registry.find_asset(content_hash)
        if existing_asset:
            stored_path = self.shared_assets / existing_asset["stored_path"]
            if stored_path.exists():
                print(f"✓ Deduplication: Found existing asset for {virtual_name}")
            else:
                # The registry knows the hash but the stored file is gone.
                # Restore it from the source (identical content by hash) so
                # the link created below is not dangling.
                stored_path.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(source_path, stored_path)
                print(f"⚠ Restored missing stored asset for {virtual_name}")
        else:
            # Store new asset
            stored_path = self.shared_images / f"{content_hash}{source_path.suffix}"
            shutil.copy2(source_path, stored_path)
            self.registry.register_asset(source_path, content_hash)
            print(f"✓ Stored new asset: {virtual_name} -> {stored_path.name}")

        # Create symlink in document assets directory
        self._create_asset_symlink(stored_path, document_dir, virtual_name)

        return content_hash, stored_path

    def _create_asset_symlink(self, stored_path: Path, document_dir: Path, virtual_name: str):
        """Create symlink from document assets directory to shared storage.

        Any existing file or link named ``virtual_name`` is replaced. When a
        relative symlink cannot be created, the stored file is copied
        instead.
        """
        assets_dir = document_dir / "assets"
        assets_dir.mkdir(parents=True, exist_ok=True)

        link_path = assets_dir / virtual_name

        # Remove existing link/file if present. is_symlink() is checked as
        # well because exists() is False for a dangling symlink.
        if link_path.exists() or link_path.is_symlink():
            link_path.unlink()

        # Create relative symlink
        try:
            relative_target = os.path.relpath(stored_path, link_path.parent)
            link_path.symlink_to(relative_target)
            print(f"✓ Created symlink: {virtual_name} -> {relative_target}")
        except (OSError, ValueError) as e:
            # Fallback to hard copy if symlinks fail (e.g., on Windows).
            # ValueError covers relpath() across different Windows drives.
            shutil.copy2(stored_path, link_path)
            print(f"⚠ Symlink failed, copied file instead: {virtual_name} (reason: {e})")


class MarkdownPackager:
    """Handles creation and extraction of .mdpkg files."""

    def __init__(self, workspace_path: Path):
        """Create the ``packages`` output directory under the workspace.

        Args:
            workspace_path: Root directory of the workspace.
        """
        self.workspace = workspace_path
        self.packages_dir = workspace_path / "packages"
        self.packages_dir.mkdir(parents=True, exist_ok=True)

    def _collect_assets(self, assets_dir: Path) -> List[Tuple[str, Path, bool]]:
        """List packable assets as ``(name, real_path, is_symlink)`` tuples.

        Entries are visited in sorted order so the manifest and archive are
        deterministic. Anything that does not resolve to a regular file
        (directories, broken or looping symlinks) is skipped.
        """
        if not assets_dir.exists():
            return []

        collected = []
        for asset_path in sorted(assets_dir.iterdir()):
            is_link = asset_path.is_symlink()
            try:
                # Resolve symlink to get actual file info
                real_path = asset_path.resolve() if is_link else asset_path
                usable = real_path.is_file()
            except (OSError, RuntimeError):
                # resolve() can fail on symlink loops.
                usable = False
            if not usable:
                if is_link:
                    print(f"⚠ Skipping unresolvable asset link: {asset_path.name}")
                continue
            collected.append((asset_path.name, real_path, is_link))
        return collected

    def create_package(self, document_dir: Path, package_name: str) -> Path:
        """Create a .mdpkg ZIP package from a document directory.

        The archive contains ``manifest.json``, ``index.md`` (when present)
        and every asset under ``assets/`` with symlinks resolved to real
        file content.

        Args:
            document_dir: Directory holding ``index.md`` and ``assets``.
            package_name: Base name of the package (without extension).

        Returns:
            Path of the written ``.mdpkg`` file.
        """
        package_path = self.packages_dir / f"{package_name}.mdpkg"
        main_doc = document_dir / "index.md"

        # Scan once and reuse the result for both the manifest and the
        # archive so the two can never disagree.
        assets = self._collect_assets(document_dir / "assets")
        assets_info = [
            {
                "name": name,
                "size": real_path.stat().st_size,
                "is_symlink": is_link,
            }
            for name, real_path, is_link in assets
        ]

        # Create manifest
        manifest = {
            "name": package_name,
            "version": "1.0",
            "created": datetime.now().isoformat(),
            "format": "mdpkg",
            "assets": assets_info,
            "main_document": "index.md",
        }

        # Create ZIP package
        with zipfile.ZipFile(package_path, 'w', zipfile.ZIP_DEFLATED) as zf:
            # Add manifest
            zf.writestr("manifest.json", json.dumps(manifest, indent=2))

            # Add main document
            if main_doc.exists():
                zf.write(main_doc, "index.md")

            # Add assets (symlinks already resolved)
            for name, real_path, _ in assets:
                zf.write(real_path, f"assets/{name}")

        print(f"✓ Created package: {package_path}")
        print(f"  - Main document: {'✓' if main_doc.exists() else '✗'}")
        print(f"  - Assets: {len(assets_info)}")

        return package_path

    def extract_package(self, package_path: Path, extract_name: str) -> Path:
        """Extract a .mdpkg package to the workspace.

        Only ``index.md`` and members under ``assets/`` are extracted; the
        manifest is not needed for extraction and is left in the archive.

        Args:
            package_path: The ``.mdpkg`` file to read.
            extract_name: Directory name created under
                ``<workspace>/documents``.

        Returns:
            The directory the package was extracted into.

        Raises:
            FileNotFoundError: If ``package_path`` does not exist.
        """
        if not package_path.exists():
            raise FileNotFoundError(f"Package not found: {package_path}")

        extract_dir = self.workspace / "documents" / extract_name
        extract_dir.mkdir(parents=True, exist_ok=True)

        with zipfile.ZipFile(package_path, 'r') as zf:
            member_names = zf.namelist()

            # Extract main document
            if "index.md" in member_names:
                zf.extract("index.md", extract_dir)

            # Extract assets
            for member in member_names:
                if member.startswith("assets/"):
                    zf.extract(member, extract_dir)

        print(f"✓ Extracted package to: {extract_dir}")
        return extract_dir


def demo_asset_management():
    """Demonstrate the asset management system.

    Builds a throw-away workspace in ``./demo_workspace`` (deleting any
    previous one), imports three files of which two share content into two
    document projects, packages both projects and prints the resulting
    directory tree.
    """
    print("🎯 Asset Management Demo - Concept B")
    print("=" * 50)

    # Setup workspace
    demo_workspace = Path("./demo_workspace")
    if demo_workspace.exists():
        shutil.rmtree(demo_workspace)

    deduplicator = AssetDeduplicator(demo_workspace)
    packager = MarkdownPackager(demo_workspace)

    # Create demo assets (simulate duplicate images)
    demo_assets = demo_workspace / "demo_assets"
    demo_assets.mkdir(parents=True, exist_ok=True)

    # Create some test "images" (text files for demo)
    test_image1 = demo_assets / "logo.png"
    test_image2 = demo_assets / "company_logo.png"
    test_image3 = demo_assets / "diagram.png"
    source_files = [test_image1, test_image2, test_image3]

    test_image1.write_text("PNG_IMAGE_CONTENT_LOGO")  # Same content
    test_image2.write_text("PNG_IMAGE_CONTENT_LOGO")  # Same content, different name
    test_image3.write_text("PNG_IMAGE_CONTENT_DIAGRAM")  # Different content

    print(f"Created test assets: {len(list(demo_assets.iterdir()))} files")

    # Create two document projects
    doc1_dir = demo_workspace / "documents" / "project_a"
    doc2_dir = demo_workspace / "documents" / "project_b"

    for doc_dir in [doc1_dir, doc2_dir]:
        doc_dir.mkdir(parents=True, exist_ok=True)

    # Project A uses logo.png and diagram.png
    (doc1_dir / "index.md").write_text(
        "# Project A\n"
        "\n"
        "![Logo](assets/logo.png)\n"
        "![Diagram](assets/diagram.png)\n"
        "\n"
        "This is Project A documentation.\n"
    )

    print("\n📁 Processing Project A assets...")
    deduplicator.add_asset(test_image1, doc1_dir, "logo.png")
    deduplicator.add_asset(test_image3, doc1_dir, "diagram.png")

    # Project B uses the same logo (different filename) and same diagram
    (doc2_dir / "index.md").write_text(
        "# Project B\n"
        "\n"
        "![Company Logo](assets/company_logo.png)\n"
        "![System Diagram](assets/system_diagram.png)\n"
        "\n"
        "This is Project B documentation.\n"
    )

    print("\n📁 Processing Project B assets...")
    deduplicator.add_asset(test_image2, doc2_dir, "company_logo.png")  # Same content as logo.png
    deduplicator.add_asset(test_image3, doc2_dir, "system_diagram.png")  # Same content as diagram.png

    # Show deduplication results
    original_count = len(source_files)
    unique_count = len(deduplicator.registry.registry['assets'])
    print("\n📊 Deduplication Results:")
    print(f"  - Original files: {original_count}")
    print(f"  - Unique content hashes: {unique_count}")
    print(f"  - Storage efficiency: {original_count - unique_count} duplicates eliminated")

    # Create packages
    print("\n📦 Creating packages...")
    packager.create_package(doc1_dir, "project_a")
    packager.create_package(doc2_dir, "project_b")

    print("\n✅ Demo completed successfully!")
    print(f"  - Workspace: {demo_workspace}")
    print(f"  - Shared assets: {deduplicator.shared_assets}")
    print(f"  - Packages: {packager.packages_dir}")

    # Show final directory structure
    print("\n📂 Final directory structure:")
    for root, dirs, files in os.walk(demo_workspace):
        dirs.sort()  # walk sub-directories in a stable order
        # Depth below the workspace root determines the indentation.
        level = len(Path(root).relative_to(demo_workspace).parts)
        indent = ' ' * 2 * level
        print(f"{indent}{os.path.basename(root)}/")
        subindent = ' ' * 2 * (level + 1)
        for file in sorted(files):
            file_path = Path(root) / file
            if file_path.is_symlink():
                target = os.readlink(file_path)
                print(f"{subindent}{file} -> {target}")
            else:
                print(f"{subindent}{file}")


if __name__ == "__main__":
    demo_asset_management()