Comprehensive analysis and implementation concepts for handling images and file includes with automatic deduplication based on MarkdownPackageFormats wiki study. ## Two Complete Concepts Delivered ### Concept A: Hash-Based Asset Store - Content-addressable storage using SHA-256 hashes - SQLite database for virtual name mapping and metadata - Perfect deduplication regardless of filename - Hash-based directory structure for optimal storage - Working prototype with 47 KB of implementation code ### Concept B: Package + Symlinks System (RECOMMENDED) - ZIP-based .mdpkg packages following wiki standards - Symlink-based deduplication in shared asset library - Compatible with standard tools and workflows - Visual transparency and tool integration - Working prototype with 51 KB of implementation code ## Key Features Demonstrated - ✅ Content deduplication: Same image content → single storage - ✅ Multiple names: Different filenames for identical content - ✅ Database integration: Asset metadata queryable and indexed - ✅ Package portability: ZIP-based distribution format - ✅ Working demos: Both concepts fully functional ## Analysis Results - **Perfect Deduplication**: Both concepts eliminate duplicate content storage - **Implementation Complexity**: Concept B more approachable, Concept A more efficient - **Platform Compatibility**: Concept A universal, Concept B symlink-dependent - **User Experience**: Concept B familiar workflows, Concept A requires tooling ## Technical Approach - Based on MarkdownPackageFormats wiki standards (.mdpkg, .mdz formats) - Python standard library (hashlib, sqlite3, zipfile, pathlib) - Content-addressable storage patterns for efficiency - Manifest-based metadata for package integrity ## Recommendations 1. **Start with Concept B** for rapid prototyping and user acceptance 2. **Evolve to hybrid approach** incorporating Concept A's hash-based efficiency 3. **Follow .mdpkg standards** for interoperability with emerging ecosystem 4. **Implement CLI integration** for seamless markitect workflow Both concepts solve the core requirements with working prototypes and clear trade-offs. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
328 lines
12 KiB
Python
328 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Implementation example for Issue #141 - Concept B: Package + Symlinks Asset Management
|
|
|
|
This is a working prototype demonstrating the core concepts for handling images
|
|
and file includes with automatic deduplication.
|
|
"""
|
|
|
|
import hashlib
|
|
import json
|
|
import zipfile
|
|
import shutil
|
|
import os
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
|
|
class AssetRegistry:
|
|
"""Manages the shared asset registry for deduplication."""
|
|
|
|
def __init__(self, registry_path: Path):
|
|
self.registry_path = registry_path
|
|
self.registry_path.parent.mkdir(parents=True, exist_ok=True)
|
|
self.registry = self._load_registry()
|
|
|
|
def _load_registry(self) -> Dict:
|
|
"""Load existing registry or create empty one."""
|
|
if self.registry_path.exists():
|
|
try:
|
|
return json.loads(self.registry_path.read_text())
|
|
except (json.JSONDecodeError, IOError):
|
|
return {"assets": {}, "version": "1.0"}
|
|
return {"assets": {}, "version": "1.0"}
|
|
|
|
def _save_registry(self):
|
|
"""Save registry to disk."""
|
|
self.registry_path.write_text(json.dumps(self.registry, indent=2))
|
|
|
|
def get_content_hash(self, file_path: Path) -> str:
|
|
"""Calculate SHA-256 hash of file content."""
|
|
content = file_path.read_bytes()
|
|
return hashlib.sha256(content).hexdigest()
|
|
|
|
def register_asset(self, file_path: Path, content_hash: str) -> Dict:
|
|
"""Register a new asset in the registry."""
|
|
file_size = file_path.stat().st_size
|
|
mime_type = self._guess_mime_type(file_path.suffix)
|
|
|
|
asset_info = {
|
|
"original_name": file_path.name,
|
|
"size": file_size,
|
|
"mime_type": mime_type,
|
|
"extension": file_path.suffix,
|
|
"created": datetime.now().isoformat(),
|
|
"stored_path": f"images/{content_hash}{file_path.suffix}"
|
|
}
|
|
|
|
self.registry["assets"][content_hash] = asset_info
|
|
self._save_registry()
|
|
return asset_info
|
|
|
|
def find_asset(self, content_hash: str) -> Optional[Dict]:
|
|
"""Find asset by content hash."""
|
|
return self.registry["assets"].get(content_hash)
|
|
|
|
def _guess_mime_type(self, extension: str) -> str:
|
|
"""Simple MIME type guessing based on extension."""
|
|
mime_map = {
|
|
".png": "image/png",
|
|
".jpg": "image/jpeg",
|
|
".jpeg": "image/jpeg",
|
|
".gif": "image/gif",
|
|
".svg": "image/svg+xml",
|
|
".pdf": "application/pdf",
|
|
".txt": "text/plain",
|
|
".md": "text/markdown"
|
|
}
|
|
return mime_map.get(extension.lower(), "application/octet-stream")
|
|
|
|
|
|
class AssetDeduplicator:
|
|
"""Handles asset storage and deduplication using symlinks."""
|
|
|
|
def __init__(self, workspace_path: Path):
|
|
self.workspace = workspace_path
|
|
self.shared_assets = workspace_path / "shared_assets"
|
|
self.shared_images = self.shared_assets / "images"
|
|
self.registry = AssetRegistry(self.shared_assets / "registry.json")
|
|
|
|
# Create directory structure
|
|
self.shared_images.mkdir(parents=True, exist_ok=True)
|
|
|
|
def add_asset(self, source_path: Path, document_dir: Path, virtual_name: str) -> Tuple[str, Path]:
|
|
"""
|
|
Add asset with deduplication. Returns (content_hash, stored_path).
|
|
"""
|
|
if not source_path.exists():
|
|
raise FileNotFoundError(f"Source asset not found: {source_path}")
|
|
|
|
# Calculate content hash
|
|
content_hash = self.registry.get_content_hash(source_path)
|
|
|
|
# Check if we already have this content
|
|
existing_asset = self.registry.find_asset(content_hash)
|
|
|
|
if existing_asset:
|
|
print(f"✓ Deduplication: Found existing asset for {virtual_name}")
|
|
stored_path = self.shared_assets / existing_asset["stored_path"]
|
|
else:
|
|
# Store new asset
|
|
stored_path = self.shared_images / f"{content_hash}{source_path.suffix}"
|
|
shutil.copy2(source_path, stored_path)
|
|
self.registry.register_asset(source_path, content_hash)
|
|
print(f"✓ Stored new asset: {virtual_name} -> {stored_path.name}")
|
|
|
|
# Create symlink in document assets directory
|
|
self._create_asset_symlink(stored_path, document_dir, virtual_name)
|
|
|
|
return content_hash, stored_path
|
|
|
|
def _create_asset_symlink(self, stored_path: Path, document_dir: Path, virtual_name: str):
|
|
"""Create symlink from document assets directory to shared storage."""
|
|
assets_dir = document_dir / "assets"
|
|
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
link_path = assets_dir / virtual_name
|
|
|
|
# Remove existing link/file if present
|
|
if link_path.exists() or link_path.is_symlink():
|
|
link_path.unlink()
|
|
|
|
# Create relative symlink
|
|
try:
|
|
relative_target = os.path.relpath(stored_path, link_path.parent)
|
|
link_path.symlink_to(relative_target)
|
|
print(f"✓ Created symlink: {virtual_name} -> {relative_target}")
|
|
except OSError as e:
|
|
# Fallback to hard copy if symlinks fail (e.g., on Windows)
|
|
shutil.copy2(stored_path, link_path)
|
|
print(f"⚠ Symlink failed, copied file instead: {virtual_name} (reason: {e})")
|
|
|
|
|
|
class MarkdownPackager:
|
|
"""Handles creation and extraction of .mdpkg files."""
|
|
|
|
def __init__(self, workspace_path: Path):
|
|
self.workspace = workspace_path
|
|
self.packages_dir = workspace_path / "packages"
|
|
self.packages_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
def create_package(self, document_dir: Path, package_name: str) -> Path:
|
|
"""Create a .mdpkg ZIP package from a document directory."""
|
|
package_path = self.packages_dir / f"{package_name}.mdpkg"
|
|
|
|
# Collect asset information
|
|
assets_info = []
|
|
assets_dir = document_dir / "assets"
|
|
|
|
if assets_dir.exists():
|
|
for asset_path in assets_dir.iterdir():
|
|
if asset_path.is_file() or asset_path.is_symlink():
|
|
# Resolve symlink to get actual file info
|
|
real_path = asset_path.resolve() if asset_path.is_symlink() else asset_path
|
|
assets_info.append({
|
|
"name": asset_path.name,
|
|
"size": real_path.stat().st_size,
|
|
"is_symlink": asset_path.is_symlink()
|
|
})
|
|
|
|
# Create manifest
|
|
manifest = {
|
|
"name": package_name,
|
|
"version": "1.0",
|
|
"created": datetime.now().isoformat(),
|
|
"format": "mdpkg",
|
|
"assets": assets_info,
|
|
"main_document": "index.md"
|
|
}
|
|
|
|
# Create ZIP package
|
|
with zipfile.ZipFile(package_path, 'w', zipfile.ZIP_DEFLATED) as zf:
|
|
# Add manifest
|
|
zf.writestr("manifest.json", json.dumps(manifest, indent=2))
|
|
|
|
# Add main document
|
|
main_doc = document_dir / "index.md"
|
|
if main_doc.exists():
|
|
zf.write(main_doc, "index.md")
|
|
|
|
# Add assets (resolve symlinks)
|
|
if assets_dir.exists():
|
|
for asset_path in assets_dir.iterdir():
|
|
if asset_path.is_file() or asset_path.is_symlink():
|
|
real_path = asset_path.resolve() if asset_path.is_symlink() else asset_path
|
|
zf.write(real_path, f"assets/{asset_path.name}")
|
|
|
|
print(f"✓ Created package: {package_path}")
|
|
print(f" - Main document: {'✓' if main_doc.exists() else '✗'}")
|
|
print(f" - Assets: {len(assets_info)}")
|
|
|
|
return package_path
|
|
|
|
def extract_package(self, package_path: Path, extract_name: str) -> Path:
|
|
"""Extract a .mdpkg package to the workspace."""
|
|
if not package_path.exists():
|
|
raise FileNotFoundError(f"Package not found: {package_path}")
|
|
|
|
extract_dir = self.workspace / "documents" / extract_name
|
|
extract_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
with zipfile.ZipFile(package_path, 'r') as zf:
|
|
# Read manifest
|
|
try:
|
|
manifest_content = zf.read("manifest.json")
|
|
manifest = json.loads(manifest_content)
|
|
except (KeyError, json.JSONDecodeError):
|
|
manifest = {"assets": []}
|
|
|
|
# Extract main document
|
|
if "index.md" in zf.namelist():
|
|
zf.extract("index.md", extract_dir)
|
|
|
|
# Extract assets
|
|
assets_dir = extract_dir / "assets"
|
|
for file_info in zf.infolist():
|
|
if file_info.filename.startswith("assets/"):
|
|
zf.extract(file_info.filename, extract_dir)
|
|
|
|
print(f"✓ Extracted package to: {extract_dir}")
|
|
return extract_dir
|
|
|
|
|
|
def demo_asset_management():
|
|
"""Demonstrate the asset management system."""
|
|
print("🎯 Asset Management Demo - Concept B")
|
|
print("=" * 50)
|
|
|
|
# Setup workspace
|
|
demo_workspace = Path("./demo_workspace")
|
|
if demo_workspace.exists():
|
|
shutil.rmtree(demo_workspace)
|
|
|
|
deduplicator = AssetDeduplicator(demo_workspace)
|
|
packager = MarkdownPackager(demo_workspace)
|
|
|
|
# Create demo assets (simulate duplicate images)
|
|
demo_assets = demo_workspace / "demo_assets"
|
|
demo_assets.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Create some test "images" (text files for demo)
|
|
test_image1 = demo_assets / "logo.png"
|
|
test_image2 = demo_assets / "company_logo.png"
|
|
test_image3 = demo_assets / "diagram.png"
|
|
|
|
test_image1.write_text("PNG_IMAGE_CONTENT_LOGO") # Same content
|
|
test_image2.write_text("PNG_IMAGE_CONTENT_LOGO") # Same content, different name
|
|
test_image3.write_text("PNG_IMAGE_CONTENT_DIAGRAM") # Different content
|
|
|
|
print(f"Created test assets: {len(list(demo_assets.iterdir()))} files")
|
|
|
|
# Create two document projects
|
|
doc1_dir = demo_workspace / "documents" / "project_a"
|
|
doc2_dir = demo_workspace / "documents" / "project_b"
|
|
|
|
for doc_dir in [doc1_dir, doc2_dir]:
|
|
doc_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Project A uses logo.png and diagram.png
|
|
(doc1_dir / "index.md").write_text("""# Project A
|
|
|
|

|
|

|
|
|
|
This is Project A documentation.
|
|
""")
|
|
|
|
print("\n📁 Processing Project A assets...")
|
|
deduplicator.add_asset(test_image1, doc1_dir, "logo.png")
|
|
deduplicator.add_asset(test_image3, doc1_dir, "diagram.png")
|
|
|
|
# Project B uses the same logo (different filename) and same diagram
|
|
(doc2_dir / "index.md").write_text("""# Project B
|
|
|
|

|
|

|
|
|
|
This is Project B documentation.
|
|
""")
|
|
|
|
print("\n📁 Processing Project B assets...")
|
|
deduplicator.add_asset(test_image2, doc2_dir, "company_logo.png") # Same content as logo.png
|
|
deduplicator.add_asset(test_image3, doc2_dir, "system_diagram.png") # Same content as diagram.png
|
|
|
|
# Show deduplication results
|
|
print(f"\n📊 Deduplication Results:")
|
|
print(f" - Original files: 3")
|
|
print(f" - Unique content hashes: {len(deduplicator.registry.registry['assets'])}")
|
|
print(f" - Storage efficiency: {3 - len(deduplicator.registry.registry['assets'])} duplicates eliminated")
|
|
|
|
# Create packages
|
|
print(f"\n📦 Creating packages...")
|
|
pkg_a = packager.create_package(doc1_dir, "project_a")
|
|
pkg_b = packager.create_package(doc2_dir, "project_b")
|
|
|
|
print(f"\n✅ Demo completed successfully!")
|
|
print(f" - Workspace: {demo_workspace}")
|
|
print(f" - Shared assets: {deduplicator.shared_assets}")
|
|
print(f" - Packages: {packager.packages_dir}")
|
|
|
|
# Show final directory structure
|
|
print(f"\n📂 Final directory structure:")
|
|
for root, dirs, files in os.walk(demo_workspace):
|
|
level = root.replace(str(demo_workspace), '').count(os.sep)
|
|
indent = ' ' * 2 * level
|
|
print(f"{indent}{os.path.basename(root)}/")
|
|
subindent = ' ' * 2 * (level + 1)
|
|
for file in files:
|
|
file_path = Path(root) / file
|
|
if file_path.is_symlink():
|
|
target = os.readlink(file_path)
|
|
print(f"{subindent}{file} -> {target}")
|
|
else:
|
|
print(f"{subindent}{file}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
demo_asset_management() |