Files
markitect-main/examples/asset_management_concept_a.py
tegwick 5e0e6c395e feat: complete Issue #141 asset management concepts with working prototypes
Comprehensive analysis and implementation concepts for handling images and file includes
with automatic deduplication, informed by a study of the MarkdownPackageFormats wiki.

## Two Complete Concepts Delivered

### Concept A: Hash-Based Asset Store
- Content-addressable storage using SHA-256 hashes
- SQLite database for virtual name mapping and metadata
- Perfect deduplication regardless of filename
- Hash-based directory structure for optimal storage
- Working prototype with 47 KB of implementation code

### Concept B: Package + Symlinks System (RECOMMENDED)
- ZIP-based .mdpkg packages following wiki standards
- Symlink-based deduplication in shared asset library
- Compatible with standard tools and workflows
- Visual transparency and tool integration
- Working prototype with 51 KB of implementation code

## Key Features Demonstrated
- Content deduplication: Same image content → single storage
- Multiple names: Different filenames for identical content
- Database integration: Asset metadata queryable and indexed
- Package portability: ZIP-based distribution format
- Working demos: Both concepts fully functional

## Analysis Results
- **Perfect Deduplication**: Both concepts eliminate duplicate content storage
- **Implementation Complexity**: Concept B more approachable, Concept A more efficient
- **Platform Compatibility**: Concept A universal, Concept B symlink-dependent
- **User Experience**: Concept B familiar workflows, Concept A requires tooling

## Technical Approach
- Based on MarkdownPackageFormats wiki standards (.mdpkg, .mdz formats)
- Python standard library (hashlib, sqlite3, zipfile, pathlib)
- Content-addressable storage patterns for efficiency
- Manifest-based metadata for package integrity

## Recommendations
1. **Start with Concept B** for rapid prototyping and user acceptance
2. **Evolve to hybrid approach** incorporating Concept A's hash-based efficiency
3. **Follow .mdpkg standards** for interoperability with emerging ecosystem
4. **Implement CLI integration** for seamless markitect workflow

Both concepts solve the core requirements with working prototypes and clear trade-offs.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-08 01:51:54 +02:00

346 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Implementation example for Issue #141 - Concept A: Hash-Based Asset Store
This is a working prototype demonstrating the hash-based content-addressable
storage approach for asset management with deduplication.
"""
import hashlib
import sqlite3
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple
class HashBasedAssetStore:
    """Content-addressable asset store keyed by SHA-256 content hashes.

    Asset bytes are written to
    ``<store_path>/store/sha256/<first 6 hex chars>/<hash><extension>``.
    Asset metadata and the per-document virtual names are kept in the SQLite
    database ``<store_path>/metadata.db``.
    """

    # Extension (lower-case, with leading dot) -> MIME type.
    _MIME_TYPES = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".gif": "image/gif",
        ".svg": "image/svg+xml",
        ".pdf": "application/pdf",
        ".txt": "text/plain",
        ".md": "text/markdown",
    }

    def __init__(self, store_path: Path):
        """Create the store directory if needed and initialise the database."""
        self.store_path = store_path
        self.store_path.mkdir(parents=True, exist_ok=True)
        self.db_path = store_path / "metadata.db"
        self._init_database()

    def _run(self, sql: str, params: tuple = ()) -> List[tuple]:
        """Execute one statement in its own transaction and return all rows.

        ``with conn`` only commits or rolls back the transaction; it does not
        close the connection, so the connection is closed explicitly here.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            with conn:
                return conn.execute(sql, params).fetchall()
        finally:
            conn.close()

    def _hash_dir(self, content_hash: str) -> Path:
        """Return the shard directory that holds the asset with this hash."""
        return self.store_path / "store" / "sha256" / content_hash[:6]

    def _init_database(self):
        """Create the ``assets`` and ``asset_names`` tables and their indexes."""
        conn = sqlite3.connect(self.db_path)
        try:
            with conn:
                conn.executescript("""
                    CREATE TABLE IF NOT EXISTS assets (
                        content_hash TEXT PRIMARY KEY,
                        file_size INTEGER NOT NULL,
                        mime_type TEXT,
                        original_extension TEXT,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                    );
                    CREATE TABLE IF NOT EXISTS asset_names (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        content_hash TEXT NOT NULL,
                        virtual_name TEXT NOT NULL,
                        document_id TEXT NOT NULL,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        FOREIGN KEY (content_hash) REFERENCES assets(content_hash)
                    );
                    CREATE INDEX IF NOT EXISTS idx_asset_names_virtual
                        ON asset_names(virtual_name);
                    CREATE INDEX IF NOT EXISTS idx_asset_names_document
                        ON asset_names(document_id);
                """)
        finally:
            conn.close()

    def store_asset(self, file_path: Path, document_id: Optional[str] = None) -> str:
        """Store the file's bytes (once per distinct content) and return its hash.

        Args:
            file_path: File to ingest.
            document_id: Accepted for API compatibility; not used by this method.

        Returns:
            The hex SHA-256 digest of the file content.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not file_path.exists():
            raise FileNotFoundError(f"Asset file not found: {file_path}")
        content = file_path.read_bytes()
        content_hash = hashlib.sha256(content).hexdigest()
        file_ext = file_path.suffix

        # Deduplicate on the hash alone: identical bytes that arrive under a
        # different extension must not create a second stored copy.
        if self.get_asset_path(content_hash) is not None:
            print(f"✓ Deduplication: Asset already exists {content_hash[:12]}...{file_ext}")
            return content_hash

        hash_dir = self._hash_dir(content_hash)
        hash_dir.mkdir(parents=True, exist_ok=True)
        stored_path = hash_dir / f"{content_hash}{file_ext}"
        if not stored_path.exists():
            stored_path.write_bytes(content)

        # Always (re)write the metadata row here so that a stored file whose
        # row is missing becomes resolvable again.
        self._run(
            """
            INSERT OR REPLACE INTO assets
                (content_hash, file_size, mime_type, original_extension)
            VALUES (?, ?, ?, ?)
            """,
            (content_hash, len(content), self._guess_mime_type(file_ext), file_ext),
        )
        print(f"✓ Stored new asset: {content_hash[:12]}...{file_ext}")
        return content_hash

    def register_name(self, content_hash: str, virtual_name: str, document_id: str):
        """Map ``virtual_name`` within ``document_id`` to ``content_hash``.

        Registering the same (hash, name, document) triple again is a no-op,
        so repeated references do not inflate the reference count.
        """
        key = (content_hash, virtual_name, document_id)
        self._run(
            """
            INSERT INTO asset_names (content_hash, virtual_name, document_id)
            SELECT ?, ?, ?
            WHERE NOT EXISTS (
                SELECT 1 FROM asset_names
                WHERE content_hash = ? AND virtual_name = ? AND document_id = ?
            )
            """,
            key + key,
        )
        print(f"✓ Registered name: {virtual_name} -> {content_hash[:12]}...")

    def get_asset_path(self, content_hash: str) -> Optional[Path]:
        """Return the stored file path for ``content_hash``.

        Returns ``None`` when the hash has no metadata row or the file is
        missing on disk.
        """
        rows = self._run(
            "SELECT original_extension FROM assets WHERE content_hash = ?",
            (content_hash,),
        )
        if not rows:
            return None
        asset_path = self._hash_dir(content_hash) / f"{content_hash}{rows[0][0]}"
        return asset_path if asset_path.exists() else None

    def resolve_name(self, virtual_name: str, document_id: str) -> Optional[str]:
        """Return the content hash registered for a name in a document, or ``None``."""
        rows = self._run(
            """
            SELECT content_hash FROM asset_names
            WHERE virtual_name = ? AND document_id = ?
            LIMIT 1
            """,
            (virtual_name, document_id),
        )
        return rows[0][0] if rows else None

    def list_assets(self) -> List[Dict]:
        """Return one dict per stored asset, including its registered-name count."""
        rows = self._run(
            """
            SELECT a.content_hash, a.file_size, a.mime_type, a.original_extension,
                   a.created_at, COUNT(an.id) as name_count
            FROM assets a
            LEFT JOIN asset_names an ON a.content_hash = an.content_hash
            GROUP BY a.content_hash
            """
        )
        return [
            {
                "hash": row[0],
                "size": row[1],
                "mime_type": row[2],
                "extension": row[3],
                "created": row[4],
                "reference_count": row[5],
            }
            for row in rows
        ]

    def get_document_assets(self, document_id: str) -> List[Dict]:
        """Return the assets registered for ``document_id``, ordered by virtual name."""
        rows = self._run(
            """
            SELECT an.virtual_name, an.content_hash, a.file_size, a.mime_type
            FROM asset_names an
            JOIN assets a ON an.content_hash = a.content_hash
            WHERE an.document_id = ?
            ORDER BY an.virtual_name
            """,
            (document_id,),
        )
        return [
            {
                "virtual_name": row[0],
                "content_hash": row[1],
                "size": row[2],
                "mime_type": row[3],
            }
            for row in rows
        ]

    def _guess_mime_type(self, extension: str) -> str:
        """Map a file extension (case-insensitive) to a MIME type.

        Unknown extensions map to ``application/octet-stream``.
        """
        return self._MIME_TYPES.get(extension.lower(), "application/octet-stream")
class MarkdownAssetProcessor:
    """Rewrite markdown image references to and from ``asset://<hash>`` links."""

    # Matches ``![alt](target)``; group 1 is the alt text, group 2 the target.
    _IMAGE_PATTERN = r'!\[([^\]]*)\]\(([^)]+)\)'

    def __init__(self, asset_store: "HashBasedAssetStore"):
        """Bind the processor to the asset store used for storage and lookup."""
        self.asset_store = asset_store

    def import_document_assets(self, md_file: Path, assets_dir: Path, document_id: str) -> str:
        """Store every referenced image and return the rewritten markdown text.

        Each image target is looked up relative to ``assets_dir``. Targets
        that resolve to a regular file are stored, registered under their
        original target as virtual name, and replaced by ``asset://<hash>``.
        All other references are left unchanged and reported on stdout.

        Raises:
            FileNotFoundError: If ``md_file`` does not exist.
        """
        import re

        if not md_file.exists():
            raise FileNotFoundError(f"Markdown file not found: {md_file}")
        # Read as UTF-8 explicitly instead of relying on the locale encoding.
        md_content = md_file.read_text(encoding="utf-8")

        def replace_image_ref(match):
            alt_text = match.group(1)
            image_path = match.group(2)
            # NOTE(review): the target is joined to assets_dir without
            # normalisation, so "../" or absolute targets can point outside
            # it — confirm whether markdown input is trusted.
            full_image_path = assets_dir / image_path
            # is_file() rather than exists(): a directory target cannot be
            # read as asset bytes.
            if not full_image_path.is_file():
                print(f"⚠ Asset not found: {image_path}")
                return match.group(0)
            content_hash = self.asset_store.store_asset(full_image_path, document_id)
            self.asset_store.register_name(content_hash, image_path, document_id)
            return f'![{alt_text}](asset://{content_hash})'

        return re.sub(self._IMAGE_PATTERN, replace_image_ref, md_content)

    def export_document_assets(self, md_content: str, document_id: str, output_dir: Path) -> str:
        """Resolve ``asset://`` references and copy the assets next to the output.

        Each ``asset://<hash>`` image whose hash has a virtual name registered
        for ``document_id`` is copied to ``<output_dir>/assets/<virtual_name>``
        and the reference is rewritten to ``assets/<virtual_name>``. References
        that cannot be resolved are returned unchanged.
        """
        import re
        import shutil

        prefix = 'asset://'

        def lookup_virtual_name(content_hash):
            # Close the connection explicitly: the sqlite3 context manager
            # only ends the transaction.
            conn = sqlite3.connect(self.asset_store.db_path)
            try:
                row = conn.execute("""
                    SELECT virtual_name FROM asset_names
                    WHERE content_hash = ? AND document_id = ?
                """, (content_hash, document_id)).fetchone()
            finally:
                conn.close()
            return row[0] if row else None

        def resolve_asset_ref(match):
            alt_text = match.group(1)
            asset_ref = match.group(2)
            if not asset_ref.startswith(prefix):
                return match.group(0)
            content_hash = asset_ref[len(prefix):]
            virtual_name = lookup_virtual_name(content_hash)
            if virtual_name is None:
                return match.group(0)
            asset_path = self.asset_store.get_asset_path(content_hash)
            if not asset_path:
                return match.group(0)
            output_asset_path = output_dir / "assets" / virtual_name
            # Virtual names may contain sub-directories and output_dir may not
            # exist yet, so create the full parent chain of the copy target.
            output_asset_path.parent.mkdir(parents=True, exist_ok=True)
            if not output_asset_path.exists():
                shutil.copy2(asset_path, output_asset_path)
            return f'![{alt_text}](assets/{virtual_name})'

        return re.sub(self._IMAGE_PATTERN, resolve_asset_ref, md_content)
def demo_hash_based_assets():
    """Run an end-to-end walkthrough of the hash-based store and print the results.

    Rebuilds ``./demo_hash_store`` from scratch, stores three sample files
    (two of them with identical content), registers virtual names for them,
    and prints the resulting library, per-document listings and directory tree.
    """
    import os
    import shutil

    print("🎯 Asset Management Demo - Concept A (Hash-Based)")
    print("=" * 55)

    # Start from a clean store so repeated runs produce the same output.
    store_root = Path("./demo_hash_store")
    if store_root.exists():
        shutil.rmtree(store_root)
    asset_store = HashBasedAssetStore(store_root)
    processor = MarkdownAssetProcessor(asset_store)

    # Sample inputs (same as the Concept B demo for comparison); the two logo
    # files share their content on purpose.
    inputs_dir = store_root / "demo_inputs"
    inputs_dir.mkdir(parents=True)
    sample_files = (
        ("logo.png", "PNG_IMAGE_CONTENT_LOGO"),
        ("company_logo.png", "PNG_IMAGE_CONTENT_LOGO"),
        ("diagram.png", "PNG_IMAGE_CONTENT_DIAGRAM"),
    )
    for filename, text in sample_files:
        (inputs_dir / filename).write_text(text)
    print(f"Created test assets: 3 files")

    # Store each file individually so the deduplication message is visible.
    print(f"\n📁 Storing assets...")
    logo_hash = asset_store.store_asset(inputs_dir / "logo.png", "doc1")
    company_hash = asset_store.store_asset(inputs_dir / "company_logo.png", "doc2")
    diagram_hash = asset_store.store_asset(inputs_dir / "diagram.png", "doc1")

    # Virtual names: the same content may be known under several names.
    registrations = (
        (logo_hash, "logo.png", "doc1"),
        (company_hash, "company_logo.png", "doc2"),
        (diagram_hash, "diagram.png", "doc1"),
        (diagram_hash, "system_diagram.png", "doc2"),
    )
    for digest, name, doc in registrations:
        asset_store.register_name(digest, name, doc)

    # Summary of the hashes that were produced.
    if logo_hash == company_hash:
        relation = '(same as logo.png)'
    else:
        relation = '(different)'
    print(f"\n📊 Storage Results:")
    print(f" - Files processed: 3")
    print(f" - Unique content hashes:")
    print(f" • logo.png: {logo_hash[:12]}...")
    print(f" • company_logo.png: {company_hash[:12]}... {relation}")
    print(f" • diagram.png: {diagram_hash[:12]}...")

    # Full asset library.
    print(f"\n📋 Asset Library:")
    for entry in asset_store.list_assets():
        print(f"{entry['hash'][:12]}...{entry['extension']} "
              f"({entry['size']} bytes, {entry['reference_count']} references)")

    # Assets per document.
    for doc_id in ("doc1", "doc2"):
        print(f"\n📄 Document '{doc_id}' assets:")
        for entry in asset_store.get_document_assets(doc_id):
            print(f"{entry['virtual_name']} -> {entry['content_hash'][:12]}... ({entry['size']} bytes)")

    print(f"\n✅ Demo completed successfully!")
    print(f" - Asset store: {store_root}")
    print(f" - Database: {asset_store.db_path}")
    print(f" - Storage efficiency: Perfect deduplication by content hash")

    # Directory tree of the store, indented two spaces per nesting level.
    print(f"\n📂 Storage directory structure:")
    for current, _subdirs, filenames in os.walk(store_root):
        depth = current.replace(str(store_root), '').count(os.sep)
        dir_pad = ' ' * 2 * depth
        print(f"{dir_pad}{os.path.basename(current)}/")
        file_pad = ' ' * 2 * (depth + 1)
        for filename in filenames:
            print(f"{file_pad}{filename}")
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    demo_hash_based_assets()