Some checks failed
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
Asset Management System (Issue #142): - Add complete asset management framework with deduplication - Implement AssetManager, AssetRegistry, and AssetDeduplicator classes - Add AssetPackager for markdown document packaging - Create comprehensive test suite for all asset management components - Add asset constants and custom exceptions for robust error handling Markdown Processing Enhancements: - Update markdown_commands.py with improved functionality - Enhanced parsing and content aggregation capabilities - Improved filename encoding/decoding for special characters Test Suite Improvements: - Add comprehensive tests for Issue #138 markdown parsing - Enhance Issue #139 content aggregation and end-to-end testing - Complete test coverage for new asset management features Examples and Documentation: - Update BildungsKanonJon.md example with enhanced content - Generate corresponding HTML output for documentation - Add asset registry configuration Development Tools: - Add install script for simplified setup This commit represents a major enhancement to MarkiTect's asset handling capabilities with full test coverage and improved markdown processing. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
312 lines
11 KiB
Python
312 lines
11 KiB
Python
"""
|
|
AssetDeduplicator class for content-based asset deduplication with symlink support.
|
|
|
|
This module implements the AssetDeduplicator class that provides content-based
|
|
asset deduplication, symlink creation with relative paths, Windows fallback to
|
|
file copying, and conflict resolution for existing assets.
|
|
"""
|
|
|
|
import os
|
|
import platform
|
|
import shutil
|
|
from pathlib import Path
|
|
from typing import Dict, Any, Optional
|
|
|
|
from .exceptions import AssetError, DeduplicationError
|
|
from .registry import AssetRegistry
|
|
from .constants import CONFLICT_RESOLUTION_OPTIONS
|
|
|
|
|
|
class AssetDeduplicator:
    """Content-based asset deduplicator with symlink support and cross-platform compatibility.

    Assets are copied into ``storage_path`` under a name derived from their
    content hash; the hash itself and all metadata lookups are delegated to
    the ``registry`` object handed to the constructor.
    """

    def __init__(self, storage_path: Path, registry: "AssetRegistry"):
        """Initialize AssetDeduplicator with storage path and registry.

        Args:
            storage_path: Directory where deduplicated assets are stored.
                It is created (including parents) when it does not exist.
            registry: AssetRegistry instance for metadata management.

        Raises:
            DeduplicationError: If the storage path exists but is not a
                directory, or if it cannot be created.
        """
        self.storage_path = Path(storage_path)
        self.registry = registry

        # Validate and create storage directory
        try:
            if self.storage_path.exists() and not self.storage_path.is_dir():
                raise DeduplicationError(f"Storage path exists but is not a directory: {storage_path}")

            self.storage_path.mkdir(parents=True, exist_ok=True)
        except DeduplicationError:
            raise
        except Exception as e:
            raise DeduplicationError(f"Failed to create storage directory: {storage_path}", cause=e) from e

    def store_asset(self, file_path: Path, description: Optional[str] = None) -> Dict[str, Any]:
        """Store asset with deduplication.

        The content hash is computed by the registry. When the registry
        already knows that hash, nothing is copied and the existing stored
        path is reported; otherwise the file is copied into storage and
        registered.

        Args:
            file_path: Path to the asset file to store.
            description: Optional description for the asset.

        Returns:
            Dictionary with ``content_hash``, ``stored_path``,
            ``deduplicated`` and ``original_path``; newly stored assets
            additionally carry ``asset_info`` as returned by the registry.

        Raises:
            AssetError: If the file does not exist.
            DeduplicationError: If hashing, copying or registration fails.
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise AssetError(f"Asset file does not exist: {file_path}")

        try:
            # Generate content hash to check for deduplication
            content_hash = self.registry.generate_content_hash(file_path)

            if self.registry.asset_exists(content_hash):
                # Same content is already stored: report the existing copy.
                existing_asset = self.registry.get_asset(content_hash)
                stored_path = Path(existing_asset["path"])
                return {
                    "content_hash": content_hash,
                    "stored_path": str(stored_path),
                    "deduplicated": True,
                    "original_path": str(file_path)
                }

            # New asset: copy it into storage, then register the stored copy.
            stored_path = self._generate_storage_path(content_hash, file_path)
            shutil.copy2(file_path, stored_path)
            asset_info = self.registry.register_asset(stored_path, description)

            return {
                "content_hash": content_hash,
                "stored_path": str(stored_path),
                "deduplicated": False,
                "original_path": str(file_path),
                "asset_info": asset_info
            }
        except (AssetError, DeduplicationError):
            raise
        except Exception as e:
            raise DeduplicationError(f"Failed to store asset {file_path}", cause=e) from e

    def _generate_storage_path(self, content_hash: str, original_path: Path) -> Path:
        """Generate storage path for asset based on content hash.

        Args:
            content_hash: Content hash of the asset.
            original_path: Original file path (only its suffix is used).

        Returns:
            ``<storage_path>/<first two hash chars>/<hash><suffix>``. The
            sub-directory is created as a side effect.
        """
        # Use first 2 chars of hash for directory structure
        storage_dir = self.storage_path / content_hash[:2]
        # parents=True so this still works if the storage root was removed
        # after construction.
        storage_dir.mkdir(parents=True, exist_ok=True)

        return storage_dir / (content_hash + original_path.suffix)

    def create_asset_link(self, stored_path: Path, link_path: Path,
                          conflict_resolution: str = "backup") -> Dict[str, Any]:
        """Create symlink or copy to stored asset.

        Args:
            stored_path: Path to the stored asset.
            link_path: Desired path for the link/copy.
            conflict_resolution: How to handle existing files ("overwrite", "backup", "skip").

        Returns:
            Dictionary with operation results. ``link_type`` is "copy" on
            Windows, "symlink" otherwise, or "copy_fallback" when symlink
            creation failed and the asset was copied instead. With "skip"
            and an existing target the result is
            ``{"skipped": True, "reason": ...}``.

        Raises:
            DeduplicationError: If the conflict resolution is invalid, the
                existing file cannot be moved/removed, or link creation fails.
        """
        if conflict_resolution not in CONFLICT_RESOLUTION_OPTIONS:
            raise DeduplicationError(f"Invalid conflict resolution: {conflict_resolution}")

        link_path = Path(link_path)

        try:
            # exists() follows symlinks and is False for a dangling link, so
            # is_symlink() is checked as well to catch that case.
            if link_path.exists() or link_path.is_symlink():
                if conflict_resolution == "skip":
                    return {"skipped": True, "reason": "File already exists"}
                if conflict_resolution == "backup":
                    backup_path = link_path.with_suffix(link_path.suffix + ".bak")
                    shutil.move(str(link_path), str(backup_path))
                elif conflict_resolution == "overwrite":
                    link_path.unlink()

            # Ensure parent directory exists
            link_path.parent.mkdir(parents=True, exist_ok=True)

            if platform.system() == "Windows":
                # On Windows, use file copying instead of symlinks
                shutil.copy2(stored_path, link_path)
                return {
                    "link_created": True,
                    "link_type": "copy",
                    "link_path": str(link_path),
                    "target_path": str(stored_path)
                }

            # Elsewhere, create a symlink relative to the link's directory
            relative_path = os.path.relpath(stored_path, link_path.parent)
            try:
                os.symlink(relative_path, link_path)
            except OSError as symlink_error:
                # Only a failed symlink triggers the copy fallback. A failed
                # backup/unlink above must not, or the existing file would be
                # deleted without having been backed up.
                return self._copy_fallback(stored_path, link_path, symlink_error)

            return {
                "link_created": True,
                "link_type": "symlink",
                "link_path": str(link_path),
                "target_path": str(stored_path),
                "relative_target": relative_path
            }
        except DeduplicationError:
            raise
        except Exception as e:
            raise DeduplicationError(f"Failed to create asset link: {e}", cause=e) from e

    def _copy_fallback(self, stored_path: Path, link_path: Path,
                       symlink_error: OSError) -> Dict[str, Any]:
        """Copy the stored asset to ``link_path`` after symlink creation failed."""
        try:
            if link_path.exists() or link_path.is_symlink():
                link_path.unlink()
            shutil.copy2(stored_path, link_path)
        except Exception as fallback_error:
            raise DeduplicationError(
                f"Failed to create link and fallback copy failed: {fallback_error}",
                cause=symlink_error
            )

        return {
            "link_created": True,
            "link_type": "copy_fallback",
            "link_path": str(link_path),
            "target_path": str(stored_path),
            "fallback_reason": str(symlink_error)
        }

    def get_asset_path(self, content_hash: str) -> Path:
        """Get path to stored asset by content hash.

        Args:
            content_hash: Content hash of the asset.

        Returns:
            Path to the stored asset, as recorded in the registry.

        Raises:
            DeduplicationError: If the registry lookup fails or the recorded
                file no longer exists.
        """
        try:
            asset_info = self.registry.get_asset(content_hash)
            stored_path = Path(asset_info["path"])

            if not stored_path.exists():
                raise DeduplicationError(f"Stored asset file missing: {stored_path}")

            return stored_path
        except DeduplicationError:
            raise
        except Exception as e:
            raise DeduplicationError(f"Failed to get asset path for hash {content_hash}", cause=e) from e

    def verify_asset_integrity(self, content_hash: str) -> bool:
        """Verify integrity of stored asset by recomputing hash.

        Args:
            content_hash: Expected content hash of the asset.

        Returns:
            True if the recomputed hash equals ``content_hash``; False if it
            differs or if the asset cannot be located or hashed.
        """
        try:
            stored_path = self.get_asset_path(content_hash)
            return self.registry.generate_content_hash(stored_path) == content_hash
        except Exception:
            # Deliberate best-effort: any failure counts as "not intact".
            return False

    def remove_stored_asset(self, content_hash: str) -> Dict[str, Any]:
        """Remove stored asset file and registry entry.

        A registry entry whose file has already disappeared is still removed;
        in that case ``file_removed`` is False.

        Args:
            content_hash: Content hash of the asset.

        Returns:
            Dictionary with ``registry_removed`` (the registry's return
            value), ``file_removed`` and ``removed_path``.

        Raises:
            DeduplicationError: If the registry lookup or the removal fails.
        """
        try:
            # Read the path straight from the registry: get_asset_path()
            # raises for a missing file, which would make stale entries
            # impossible to clean up.
            asset_info = self.registry.get_asset(content_hash)
            stored_path = Path(asset_info["path"])

            # Remove from registry first
            registry_removed = self.registry.remove_asset(content_hash)

            # Remove physical file
            file_removed = False
            if stored_path.exists():
                stored_path.unlink()
                file_removed = True

            # Remove empty parent directory if it exists
            try:
                if not any(stored_path.parent.iterdir()):
                    stored_path.parent.rmdir()
            except OSError:
                pass  # Directory missing, not empty or other issue, ignore

            return {
                "registry_removed": registry_removed,
                "file_removed": file_removed,
                "removed_path": str(stored_path)
            }
        except DeduplicationError:
            raise
        except Exception as e:
            raise DeduplicationError(f"Failed to remove stored asset {content_hash}", cause=e) from e

    def list_stored_assets(self) -> Dict[str, Any]:
        """List all stored assets with file system information.

        Returns:
            Dictionary with ``total_assets``, ``valid_assets`` (entries whose
            file exists), ``missing_assets`` (content hashes whose file is
            gone), ``total_size_bytes`` and ``storage_path``.

        Raises:
            DeduplicationError: If the registry listing or a file stat fails.
        """
        try:
            assets = self.registry.list_assets()
            total_size = 0
            valid_assets = 0
            missing_assets = []

            for asset in assets:
                stored_path = Path(asset["path"])
                if stored_path.exists():
                    valid_assets += 1
                    total_size += stored_path.stat().st_size
                else:
                    missing_assets.append(asset["content_hash"])

            return {
                "total_assets": len(assets),
                "valid_assets": valid_assets,
                "missing_assets": missing_assets,
                "total_size_bytes": total_size,
                "storage_path": str(self.storage_path)
            }
        except Exception as e:
            raise DeduplicationError("Failed to list stored assets", cause=e) from e