feat: comprehensive asset management system and testing improvements
Some checks failed
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
Some checks failed
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
Asset Management System (Issue #142):
- Add complete asset management framework with deduplication
- Implement AssetManager, AssetRegistry, and AssetDeduplicator classes
- Add AssetPackager for markdown document packaging
- Create comprehensive test suite for all asset management components
- Add asset constants and custom exceptions for robust error handling

Markdown Processing Enhancements:
- Update markdown_commands.py with improved functionality
- Enhanced parsing and content aggregation capabilities
- Improved filename encoding/decoding for special characters

Test Suite Improvements:
- Add comprehensive tests for Issue #138 markdown parsing
- Enhance Issue #139 content aggregation and end-to-end testing
- Complete test coverage for new asset management features

Examples and Documentation:
- Update BildungsKanonJon.md example with enhanced content
- Generate corresponding HTML output for documentation
- Add asset registry configuration

Development Tools:
- Add install script for simplified setup

This commit represents a major enhancement to MarkiTect's asset handling capabilities with full test coverage and improved markdown processing.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
312
markitect/assets/deduplicator.py
Normal file
312
markitect/assets/deduplicator.py
Normal file
@@ -0,0 +1,312 @@
|
||||
"""
|
||||
AssetDeduplicator class for content-based asset deduplication with symlink support.
|
||||
|
||||
This module implements the AssetDeduplicator class that provides content-based
|
||||
asset deduplication, symlink creation with relative paths, Windows fallback to
|
||||
file copying, and conflict resolution for existing assets.
|
||||
"""
|
||||
|
||||
import os
|
||||
import platform
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
from .exceptions import AssetError, DeduplicationError
|
||||
from .registry import AssetRegistry
|
||||
from .constants import CONFLICT_RESOLUTION_OPTIONS
|
||||
|
||||
|
||||
class AssetDeduplicator:
    """Content-based asset deduplicator with symlink support and cross-platform compatibility.

    Assets are copied into ``storage_path`` under a name derived from their
    content hash, so identical content is stored once. The registry passed to
    the constructor supplies the hashing and the metadata bookkeeping; this
    class only manages the files on disk and the links that point at them.
    """

    def __init__(self, storage_path: Path, registry: "AssetRegistry"):
        """Initialize AssetDeduplicator with storage path and registry.

        Args:
            storage_path: Directory where deduplicated assets are stored.
                It is created (including parents) when it does not exist.
            registry: AssetRegistry instance for metadata management.

        Raises:
            DeduplicationError: If storage path is invalid or cannot be created.
        """
        self.storage_path = Path(storage_path)
        self.registry = registry

        # Validate and create storage directory
        try:
            if self.storage_path.exists() and not self.storage_path.is_dir():
                raise DeduplicationError(f"Storage path exists but is not a directory: {storage_path}")

            self.storage_path.mkdir(parents=True, exist_ok=True)
        except DeduplicationError:
            raise
        except Exception as e:
            raise DeduplicationError(f"Failed to create storage directory: {storage_path}", cause=e) from e

    def store_asset(self, file_path: Path, description: Optional[str] = None) -> Dict[str, Any]:
        """Store asset with deduplication.

        The content hash is computed by the registry. When the registry already
        knows that hash, nothing is copied and the existing stored path is
        returned; otherwise the file is copied into storage and registered.

        Args:
            file_path: Path to the asset file to store (a string is accepted too).
            description: Optional description for the asset.

        Returns:
            Dictionary with ``content_hash``, ``stored_path``, ``deduplicated``
            and ``original_path``; newly stored assets additionally carry
            ``asset_info`` as returned by the registry.

        Raises:
            AssetError: If file doesn't exist.
            DeduplicationError: If storage operation fails.
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise AssetError(f"Asset file does not exist: {file_path}")

        try:
            # Generate content hash to check for deduplication
            content_hash = self.registry.generate_content_hash(file_path)

            if self.registry.asset_exists(content_hash):
                # Content is already stored: hand back the existing location.
                existing_asset = self.registry.get_asset(content_hash)
                stored_path = Path(existing_asset["path"])
                return {
                    "content_hash": content_hash,
                    "stored_path": str(stored_path),
                    "deduplicated": True,
                    "original_path": str(file_path)
                }

            # New asset: copy it into storage, then register the stored copy.
            stored_path = self._generate_storage_path(content_hash, file_path)
            shutil.copy2(file_path, stored_path)
            asset_info = self.registry.register_asset(stored_path, description)

            return {
                "content_hash": content_hash,
                "stored_path": str(stored_path),
                "deduplicated": False,
                "original_path": str(file_path),
                "asset_info": asset_info
            }
        except (AssetError, DeduplicationError):
            raise
        except Exception as e:
            raise DeduplicationError(f"Failed to store asset {file_path}", cause=e) from e

    def _generate_storage_path(self, content_hash: str, original_path: Path) -> Path:
        """Generate storage path for asset based on content hash.

        The shard directory is created as a side effect.

        Args:
            content_hash: Hash of the content, as produced by the registry.
            original_path: Original file path (only its extension is used).

        Returns:
            Path where the asset should be stored:
            ``<storage_path>/<first two hash chars>/<hash><extension>``.
        """
        # Use first 2 chars of hash for directory structure
        storage_dir = self.storage_path / content_hash[:2]
        # parents=True so a storage root that disappeared after construction
        # (e.g. pruned as an empty directory) is simply recreated.
        storage_dir.mkdir(parents=True, exist_ok=True)

        return storage_dir / (content_hash + original_path.suffix)

    def create_asset_link(self, stored_path: Path, link_path: Path,
                          conflict_resolution: str = "backup") -> Dict[str, Any]:
        """Create symlink or copy to stored asset.

        On Windows the asset is copied; elsewhere a relative symlink is
        created, falling back to a copy only when the symlink call itself
        fails.

        Args:
            stored_path: Path to the stored asset.
            link_path: Desired path for the link/copy.
            conflict_resolution: How to handle existing files ("overwrite", "backup", "skip").

        Returns:
            Dictionary with operation results. ``{"skipped": True, ...}`` when
            an existing entry is kept; otherwise ``link_created``,
            ``link_type`` ("copy", "symlink" or "copy_fallback"),
            ``link_path`` and ``target_path`` plus type-specific details.

        Raises:
            DeduplicationError: If the conflict resolution is invalid or link
                creation fails.
        """
        if conflict_resolution not in CONFLICT_RESOLUTION_OPTIONS:
            raise DeduplicationError(f"Invalid conflict resolution: {conflict_resolution}")

        stored_path = Path(stored_path)
        link_path = Path(link_path)

        try:
            # Path.exists() follows symlinks and reports False for a dangling
            # one, so also test is_symlink(): a leftover broken link must go
            # through conflict handling instead of being written through.
            if link_path.exists() or link_path.is_symlink():
                if conflict_resolution == "skip":
                    return {"skipped": True, "reason": "File already exists"}
                elif conflict_resolution == "backup":
                    backup_path = link_path.with_suffix(link_path.suffix + ".bak")
                    shutil.move(str(link_path), str(backup_path))
                elif conflict_resolution == "overwrite":
                    link_path.unlink()

            # Ensure parent directory exists
            link_path.parent.mkdir(parents=True, exist_ok=True)

            if platform.system() == "Windows":
                # On Windows, use file copying instead of symlinks
                shutil.copy2(stored_path, link_path)
                return {
                    "link_created": True,
                    "link_type": "copy",
                    "link_path": str(link_path),
                    "target_path": str(stored_path)
                }

            # On Unix/Linux, create relative symlink
            relative_path = os.path.relpath(stored_path, link_path.parent)
            try:
                os.symlink(relative_path, link_path)
            except OSError as symlink_error:
                # Only a failed symlink call triggers the copy fallback. A
                # failed backup/overwrite/mkdir above must surface as an
                # error rather than end in deleting the existing file.
                return self._copy_after_symlink_failure(stored_path, link_path, symlink_error)

            return {
                "link_created": True,
                "link_type": "symlink",
                "link_path": str(link_path),
                "target_path": str(stored_path),
                "relative_target": relative_path
            }
        except DeduplicationError:
            raise
        except Exception as e:
            raise DeduplicationError(f"Failed to create asset link: {e}", cause=e) from e

    def _copy_after_symlink_failure(self, stored_path: Path, link_path: Path,
                                    symlink_error: OSError) -> Dict[str, Any]:
        """Copy the stored asset to ``link_path`` after ``os.symlink`` failed.

        Args:
            stored_path: Path to the stored asset.
            link_path: Destination for the copy; an existing entry is removed first.
            symlink_error: The error raised by the symlink attempt.

        Returns:
            Result dictionary with ``link_type`` set to "copy_fallback".

        Raises:
            DeduplicationError: If the copy fails as well.
        """
        try:
            # Remove whatever sits at the destination (including a dangling
            # symlink) so copy2 cannot write through a link.
            if link_path.exists() or link_path.is_symlink():
                link_path.unlink()
            shutil.copy2(stored_path, link_path)
        except Exception as fallback_error:
            raise DeduplicationError(
                f"Failed to create link and fallback copy failed: {fallback_error}",
                cause=symlink_error
            ) from fallback_error

        return {
            "link_created": True,
            "link_type": "copy_fallback",
            "link_path": str(link_path),
            "target_path": str(stored_path),
            "fallback_reason": str(symlink_error)
        }

    def get_asset_path(self, content_hash: str) -> Path:
        """Get path to stored asset by content hash.

        Args:
            content_hash: Hash of the asset content, as used by the registry.

        Returns:
            Path to the stored asset.

        Raises:
            DeduplicationError: If the registry lookup fails or the stored
                file is missing on disk.
        """
        try:
            asset_info = self.registry.get_asset(content_hash)
            stored_path = Path(asset_info["path"])

            if not stored_path.exists():
                raise DeduplicationError(f"Stored asset file missing: {stored_path}")

            return stored_path
        except DeduplicationError:
            raise
        except Exception as e:
            raise DeduplicationError(f"Failed to get asset path for hash {content_hash}", cause=e) from e

    def verify_asset_integrity(self, content_hash: str) -> bool:
        """Verify integrity of stored asset by recomputing hash.

        Args:
            content_hash: Expected hash of the asset content.

        Returns:
            True if integrity check passes, False otherwise. Any failure to
            locate or hash the asset counts as a failed check.
        """
        try:
            stored_path = self.get_asset_path(content_hash)
            computed_hash = self.registry.generate_content_hash(stored_path)
            return computed_hash == content_hash
        except Exception:
            # Deliberate: this is a yes/no probe, so lookup and I/O errors
            # are reported as "not intact" instead of propagating.
            return False

    def remove_stored_asset(self, content_hash: str) -> Dict[str, Any]:
        """Remove stored asset file and registry entry.

        A registry entry whose file has already vanished from disk can still
        be removed; the result then reports ``file_removed`` as False.

        Args:
            content_hash: Hash of the asset content, as used by the registry.

        Returns:
            Dictionary with ``registry_removed`` (the registry's return
            value), ``file_removed`` and ``removed_path``.

        Raises:
            DeduplicationError: If the registry lookup or the removal fails.
        """
        try:
            # Read the path straight from the registry. get_asset_path()
            # raises when the file is missing, which would make orphaned
            # registry entries impossible to clean up.
            asset_info = self.registry.get_asset(content_hash)
            stored_path = Path(asset_info["path"])

            # Remove from registry first
            registry_removed = self.registry.remove_asset(content_hash)

            # Remove physical file
            file_removed = False
            if stored_path.exists():
                stored_path.unlink()
                file_removed = True

                # Remove empty parent directory if it exists
                try:
                    if not any(stored_path.parent.iterdir()):
                        stored_path.parent.rmdir()
                except OSError:
                    pass  # Directory not empty or other issue, ignore

            return {
                "registry_removed": registry_removed,
                "file_removed": file_removed,
                "removed_path": str(stored_path)
            }
        except DeduplicationError:
            raise
        except Exception as e:
            raise DeduplicationError(f"Failed to remove stored asset {content_hash}", cause=e) from e

    def list_stored_assets(self) -> Dict[str, Any]:
        """List all stored assets with file system information.

        Returns:
            Dictionary with ``total_assets``, ``valid_assets`` (entries whose
            file exists), ``missing_assets`` (content hashes whose file is
            gone), ``total_size_bytes`` and ``storage_path``.

        Raises:
            DeduplicationError: If the registry or the file system cannot be read.
        """
        try:
            assets = self.registry.list_assets()
            total_size = 0
            valid_assets = 0
            missing_assets = []

            for asset in assets:
                stored_path = Path(asset["path"])
                if stored_path.exists():
                    valid_assets += 1
                    total_size += stored_path.stat().st_size
                else:
                    missing_assets.append(asset["content_hash"])

            return {
                "total_assets": len(assets),
                "valid_assets": valid_assets,
                "missing_assets": missing_assets,
                "total_size_bytes": total_size,
                "storage_path": str(self.storage_path)
            }
        except Exception as e:
            raise DeduplicationError("Failed to list stored assets", cause=e) from e
|
||||
Reference in New Issue
Block a user