Files
markitect-main/examples/asset_management_concept_b.py
tegwick 5e0e6c395e feat: complete Issue #141 asset management concepts with working prototypes
Comprehensive analysis and implementation concepts for handling images and file includes
with automatic deduplication based on MarkdownPackageFormats wiki study.

## Two Complete Concepts Delivered

### Concept A: Hash-Based Asset Store
- Content-addressable storage using SHA-256 hashes
- SQLite database for virtual name mapping and metadata
- Perfect deduplication regardless of filename
- Hash-based directory structure for optimal storage
- Working prototype with 47 KB of implementation code

### Concept B: Package + Symlinks System (RECOMMENDED)
- ZIP-based .mdpkg packages following wiki standards
- Symlink-based deduplication in shared asset library
- Compatible with standard tools and workflows
- Visual transparency and tool integration
- Working prototype with 51 KB of implementation code

## Key Features Demonstrated
- Content deduplication: Same image content → single storage
- Multiple names: Different filenames for identical content
- Database integration: Asset metadata queryable and indexed
- Package portability: ZIP-based distribution format
- Working demos: Both concepts fully functional

## Analysis Results
- **Perfect Deduplication**: Both concepts eliminate duplicate content storage
- **Implementation Complexity**: Concept B more approachable, Concept A more efficient
- **Platform Compatibility**: Concept A universal, Concept B symlink-dependent
- **User Experience**: Concept B familiar workflows, Concept A requires tooling

## Technical Approach
- Based on MarkdownPackageFormats wiki standards (.mdpkg, .mdz formats)
- Python standard library (hashlib, sqlite3, zipfile, pathlib)
- Content-addressable storage patterns for efficiency
- Manifest-based metadata for package integrity

## Recommendations
1. **Start with Concept B** for rapid prototyping and user acceptance
2. **Evolve to hybrid approach** incorporating Concept A's hash-based efficiency
3. **Follow .mdpkg standards** for interoperability with emerging ecosystem
4. **Implement CLI integration** for seamless markitect workflow

Both concepts solve the core requirements with working prototypes and clear trade-offs.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-08 01:51:54 +02:00

328 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Implementation example for Issue #141 - Concept B: Package + Symlinks Asset Management
This is a working prototype demonstrating the core concepts for handling images
and file includes with automatic deduplication.
"""
import hashlib
import json
import zipfile
import shutil
import os
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple
class AssetRegistry:
    """Manages the shared asset registry for deduplication.

    The registry is a JSON file at ``registry_path`` shaped like
    ``{"assets": {<sha256 hex digest>: <asset info>}, "version": "1.0"}``.
    It is loaded once on construction and rewritten after every
    registration.
    """

    # Extension (lower-case, with leading dot) -> MIME type.
    _MIME_TYPES = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".gif": "image/gif",
        ".svg": "image/svg+xml",
        ".pdf": "application/pdf",
        ".txt": "text/plain",
        ".md": "text/markdown",
    }

    # Bytes read per iteration while hashing, so large assets are never
    # held in memory in one piece.
    _HASH_CHUNK_SIZE = 1024 * 1024

    def __init__(self, registry_path: Path):
        """Open the registry stored at ``registry_path``.

        The parent directory is created if it does not exist yet.
        """
        self.registry_path = registry_path
        self.registry_path.parent.mkdir(parents=True, exist_ok=True)
        self.registry = self._load_registry()

    @staticmethod
    def _empty_registry() -> Dict:
        """Return a fresh, empty registry structure."""
        return {"assets": {}, "version": "1.0"}

    def _load_registry(self) -> Dict:
        """Load the existing registry or create an empty one.

        An unreadable, undecodable or malformed file yields an empty
        registry, as does valid JSON that lacks an ``"assets"`` mapping.
        """
        if not self.registry_path.exists():
            return self._empty_registry()
        try:
            data = json.loads(self.registry_path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            return self._empty_registry()
        # Valid JSON is not enough: every other method indexes
        # ``registry["assets"]`` as a dict.
        if not isinstance(data, dict) or not isinstance(data.get("assets"), dict):
            return self._empty_registry()
        return data

    def _save_registry(self):
        """Save the registry to disk.

        The JSON is written to a sibling ``.tmp`` file which then replaces
        the real file, so an interrupted write cannot leave a truncated
        registry behind (which the loader would treat as empty).
        """
        payload = json.dumps(self.registry, indent=2)
        temp_path = self.registry_path.with_name(self.registry_path.name + ".tmp")
        temp_path.write_text(payload, encoding="utf-8")
        os.replace(temp_path, self.registry_path)

    def get_content_hash(self, file_path: Path) -> str:
        """Return the SHA-256 hex digest of the content of ``file_path``."""
        digest = hashlib.sha256()
        with file_path.open("rb") as stream:
            for chunk in iter(lambda: stream.read(self._HASH_CHUNK_SIZE), b""):
                digest.update(chunk)
        return digest.hexdigest()

    def register_asset(self, file_path: Path, content_hash: str) -> Dict:
        """Register a new asset and persist the registry.

        Args:
            file_path: File whose name, size and suffix are recorded.
            content_hash: Key under which the asset is stored; an existing
                entry with the same key is overwritten.

        Returns:
            The asset info dict that was stored.
        """
        asset_info = {
            "original_name": file_path.name,
            "size": file_path.stat().st_size,
            "mime_type": self._guess_mime_type(file_path.suffix),
            "extension": file_path.suffix,
            "created": datetime.now().isoformat(),
            # Relative to the directory that contains the registry file.
            "stored_path": f"images/{content_hash}{file_path.suffix}",
        }
        self.registry["assets"][content_hash] = asset_info
        self._save_registry()
        return asset_info

    def find_asset(self, content_hash: str) -> Optional[Dict]:
        """Return the asset info for ``content_hash``, or ``None`` if unknown."""
        return self.registry["assets"].get(content_hash)

    def _guess_mime_type(self, extension: str) -> str:
        """Guess a MIME type from a file extension (case-insensitive).

        Unknown extensions map to ``application/octet-stream``.
        """
        return self._MIME_TYPES.get(extension.lower(), "application/octet-stream")
class AssetDeduplicator:
    """Handles asset storage and deduplication using symlinks.

    Layout below ``workspace_path``::

        shared_assets/registry.json        content hash -> asset metadata
        shared_assets/images/<hash><ext>   one stored copy per content

    Documents refer to stored content through entries in their own
    ``assets/`` directory: relative symlinks where possible, plain copies
    otherwise.
    """

    def __init__(self, workspace_path: Path):
        """Create the shared storage directories and open the registry."""
        self.workspace = workspace_path
        self.shared_assets = workspace_path / "shared_assets"
        self.shared_images = self.shared_assets / "images"
        self.registry = AssetRegistry(self.shared_assets / "registry.json")
        # Create directory structure
        self.shared_images.mkdir(parents=True, exist_ok=True)

    def add_asset(self, source_path: Path, document_dir: Path, virtual_name: str) -> Tuple[str, Path]:
        """Add an asset with deduplication.

        Args:
            source_path: File to import.
            document_dir: Document directory; the link is created in its
                ``assets`` subdirectory.
            virtual_name: Name of the link inside ``document_dir/assets``.

        Returns:
            ``(content_hash, stored_path)`` for the shared copy.

        Raises:
            FileNotFoundError: If ``source_path`` does not exist.
            ValueError: If ``virtual_name`` would place the link outside
                ``document_dir/assets``.
        """
        if not source_path.exists():
            raise FileNotFoundError(f"Source asset not found: {source_path}")
        # Reject a bad link name before anything is copied or registered.
        self._link_path_for(document_dir, virtual_name)

        content_hash = self.registry.get_content_hash(source_path)
        existing_asset = self.registry.find_asset(content_hash)
        if existing_asset:
            stored_path = self.shared_assets / existing_asset["stored_path"]
            if stored_path.exists():
                print(f"✓ Deduplication: Found existing asset for {virtual_name}")
            else:
                # The registry knows the content but the stored file is gone;
                # restore it so the link created below is not dangling.
                stored_path.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(source_path, stored_path)
                print(f"⚠ Restored missing stored asset: {virtual_name} -> {stored_path.name}")
        else:
            # Store new asset
            stored_path = self.shared_images / f"{content_hash}{source_path.suffix}"
            shutil.copy2(source_path, stored_path)
            self.registry.register_asset(source_path, content_hash)
            print(f"✓ Stored new asset: {virtual_name} -> {stored_path.name}")

        self._create_asset_symlink(stored_path, document_dir, virtual_name)
        return content_hash, stored_path

    @staticmethod
    def _link_path_for(document_dir: Path, virtual_name: str) -> Path:
        """Return the link path for ``virtual_name``, rejecting escapes.

        The check is lexical (``..`` segments are collapsed, symlinks are
        not followed). It exists because the link path is unlinked before
        being recreated, so a name such as ``../../x`` or an absolute path
        would otherwise delete a file outside the assets directory.

        Raises:
            ValueError: If the resulting path is not strictly inside
                ``document_dir/assets``.
        """
        assets_dir = document_dir / "assets"
        link_path = assets_dir / virtual_name
        assets_root = os.path.abspath(assets_dir)
        link_abs = os.path.abspath(link_path)
        inside = (
            link_abs != assets_root
            and os.path.commonpath([assets_root, link_abs]) == assets_root
        )
        if not inside:
            raise ValueError(
                f"Invalid asset name {virtual_name!r}: must resolve inside {assets_dir}"
            )
        return link_path

    def _create_asset_symlink(self, stored_path: Path, document_dir: Path, virtual_name: str):
        """Create a symlink from the document assets directory to shared storage.

        An existing file or link of the same name is replaced. If the
        symlink cannot be created, the stored file is copied instead.

        Raises:
            ValueError: If ``virtual_name`` escapes ``document_dir/assets``.
        """
        link_path = self._link_path_for(document_dir, virtual_name)
        (document_dir / "assets").mkdir(parents=True, exist_ok=True)

        # Remove existing link/file if present (is_symlink also catches a
        # dangling link, for which exists() is False).
        if link_path.exists() or link_path.is_symlink():
            link_path.unlink()

        try:
            # Relative target keeps the link valid if the workspace moves.
            relative_target = os.path.relpath(stored_path, link_path.parent)
            link_path.symlink_to(relative_target)
        except (OSError, ValueError, NotImplementedError) as e:
            # Fallback to hard copy if symlinks fail (e.g., on Windows).
            # ValueError: relpath across drives; NotImplementedError:
            # platform without symlink support.
            shutil.copy2(stored_path, link_path)
            print(f"⚠ Symlink failed, copied file instead: {virtual_name} (reason: {e})")
        else:
            print(f"✓ Created symlink: {virtual_name} -> {relative_target}")
class MarkdownPackager:
    """Handles creation and extraction of .mdpkg files.

    A package is a ZIP archive holding ``manifest.json``, the main
    document ``index.md`` (when present) and the document's assets under
    ``assets/``. Packages are written to ``<workspace>/packages``.
    """

    def __init__(self, workspace_path: Path):
        """Remember the workspace and make sure ``packages/`` exists."""
        self.workspace = workspace_path
        self.packages_dir = workspace_path / "packages"
        self.packages_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _collect_assets(assets_dir: Path) -> List[Tuple[Path, Path]]:
        """Return ``(entry, real_file)`` pairs for every packable asset.

        Entries are sorted by name so manifest and archive order are
        deterministic. Symlinks are resolved to the file they point to.
        Symlinks whose target is not a regular file (dangling, or a
        directory) are skipped with a warning; plain directories are
        skipped silently.
        """
        if not assets_dir.is_dir():
            return []
        collected = []
        for entry in sorted(assets_dir.iterdir(), key=lambda p: p.name):
            linked = entry.is_symlink()
            real_path = entry.resolve() if linked else entry
            if real_path.is_file():
                collected.append((entry, real_path))
            elif linked:
                print(f"⚠ Skipping asset whose link target is not a file: {entry.name}")
        return collected

    def create_package(self, document_dir: Path, package_name: str) -> Path:
        """Create a .mdpkg ZIP package from a document directory.

        Args:
            document_dir: Directory holding ``index.md`` and ``assets/``.
            package_name: Package name; the file is ``<package_name>.mdpkg``.

        Returns:
            Path of the written package.
        """
        package_path = self.packages_dir / f"{package_name}.mdpkg"

        # One scan feeds both the manifest and the archive, so the two
        # cannot disagree.
        assets = self._collect_assets(document_dir / "assets")
        assets_info = [
            {
                "name": entry.name,
                "size": real_path.stat().st_size,
                "is_symlink": entry.is_symlink(),
            }
            for entry, real_path in assets
        ]

        manifest = {
            "name": package_name,
            "version": "1.0",
            "created": datetime.now().isoformat(),
            "format": "mdpkg",
            "assets": assets_info,
            "main_document": "index.md",
        }

        main_doc = document_dir / "index.md"
        has_main_doc = main_doc.exists()

        with zipfile.ZipFile(package_path, 'w', zipfile.ZIP_DEFLATED) as zf:
            zf.writestr("manifest.json", json.dumps(manifest, indent=2))
            if has_main_doc:
                zf.write(main_doc, "index.md")
            # Stored under the link name, with the content of the real file.
            for entry, real_path in assets:
                zf.write(real_path, f"assets/{entry.name}")

        print(f"✓ Created package: {package_path}")
        print(f" - Main document: {'✓' if has_main_doc else '✗'}")
        print(f" - Assets: {len(assets_info)}")
        return package_path

    def extract_package(self, package_path: Path, extract_name: str) -> Path:
        """Extract a .mdpkg package to the workspace.

        Only ``index.md`` and members under ``assets/`` are extracted, into
        ``<workspace>/documents/<extract_name>``; ``manifest.json`` is not
        written to disk.

        Raises:
            FileNotFoundError: If ``package_path`` does not exist.
        """
        if not package_path.exists():
            raise FileNotFoundError(f"Package not found: {package_path}")

        extract_dir = self.workspace / "documents" / extract_name
        extract_dir.mkdir(parents=True, exist_ok=True)

        with zipfile.ZipFile(package_path, 'r') as zf:
            for member in zf.infolist():
                if member.filename == "index.md" or member.filename.startswith("assets/"):
                    zf.extract(member, extract_dir)

        print(f"✓ Extracted package to: {extract_dir}")
        return extract_dir
def demo_asset_management():
    """Demonstrate the asset management system.

    Rebuilds ``./demo_workspace`` from scratch (an existing directory of
    that name is deleted first), creates three test assets of which two
    share the same content, links them into two document projects,
    packages both projects and prints the resulting directory tree.
    """
    print("🎯 Asset Management Demo - Concept B")
    print("=" * 50)

    # Setup workspace
    demo_workspace = Path("./demo_workspace")
    if demo_workspace.exists():
        shutil.rmtree(demo_workspace)
    deduplicator = AssetDeduplicator(demo_workspace)
    packager = MarkdownPackager(demo_workspace)

    # Create demo assets (simulate duplicate images)
    demo_assets = demo_workspace / "demo_assets"
    demo_assets.mkdir(parents=True, exist_ok=True)

    # Create some test "images" (text files for demo)
    test_image1 = demo_assets / "logo.png"
    test_image2 = demo_assets / "company_logo.png"
    test_image3 = demo_assets / "diagram.png"
    test_image1.write_text("PNG_IMAGE_CONTENT_LOGO")  # Same content
    test_image2.write_text("PNG_IMAGE_CONTENT_LOGO")  # Same content, different name
    test_image3.write_text("PNG_IMAGE_CONTENT_DIAGRAM")  # Different content
    source_files = [test_image1, test_image2, test_image3]
    print(f"Created test assets: {len(list(demo_assets.iterdir()))} files")

    # Create two document projects
    doc1_dir = demo_workspace / "documents" / "project_a"
    doc2_dir = demo_workspace / "documents" / "project_b"
    for doc_dir in (doc1_dir, doc2_dir):
        doc_dir.mkdir(parents=True, exist_ok=True)

    # Project A uses logo.png and diagram.png
    (doc1_dir / "index.md").write_text(
        "# Project A\n"
        "![Logo](assets/logo.png)\n"
        "![Diagram](assets/diagram.png)\n"
        "This is Project A documentation.\n"
    )
    print("\n📁 Processing Project A assets...")
    deduplicator.add_asset(test_image1, doc1_dir, "logo.png")
    deduplicator.add_asset(test_image3, doc1_dir, "diagram.png")

    # Project B uses the same logo (different filename) and same diagram
    (doc2_dir / "index.md").write_text(
        "# Project B\n"
        "![Company Logo](assets/company_logo.png)\n"
        "![System Diagram](assets/system_diagram.png)\n"
        "This is Project B documentation.\n"
    )
    print("\n📁 Processing Project B assets...")
    deduplicator.add_asset(test_image2, doc2_dir, "company_logo.png")  # Same content as logo.png
    deduplicator.add_asset(test_image3, doc2_dir, "system_diagram.png")  # Same content as diagram.png

    # Show deduplication results, derived from the actual data rather
    # than a hard-coded file count.
    original_count = len(source_files)
    unique_count = len(deduplicator.registry.registry['assets'])
    print("\n📊 Deduplication Results:")
    print(f" - Original files: {original_count}")
    print(f" - Unique content hashes: {unique_count}")
    print(f" - Storage efficiency: {original_count - unique_count} duplicates eliminated")

    # Create packages
    print("\n📦 Creating packages...")
    packager.create_package(doc1_dir, "project_a")
    packager.create_package(doc2_dir, "project_b")

    print("\n✅ Demo completed successfully!")
    print(f" - Workspace: {demo_workspace}")
    print(f" - Shared assets: {deduplicator.shared_assets}")
    print(f" - Packages: {packager.packages_dir}")

    # Show final directory structure
    print("\n📂 Final directory structure:")
    for root, dirs, files in os.walk(demo_workspace):
        # Sorting in place also fixes the order in which os.walk descends.
        dirs.sort()
        # Depth = number of path components below the workspace root.
        level = len(Path(root).relative_to(demo_workspace).parts)
        indent = ' ' * 2 * level
        print(f"{indent}{os.path.basename(root)}/")
        subindent = ' ' * 2 * (level + 1)
        for file in sorted(files):
            file_path = Path(root) / file
            if file_path.is_symlink():
                target = os.readlink(file_path)
                print(f"{subindent}{file} -> {target}")
            else:
                print(f"{subindent}{file}")


if __name__ == "__main__":
    demo_asset_management()