Files
markitect-main/markitect/assets/deduplicator.py
tegwick 81d3da5fe7
Some checks failed
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
feat: comprehensive asset management system and testing improvements
Asset Management System (Issue #142):
- Add complete asset management framework with deduplication
- Implement AssetManager, AssetRegistry, and AssetDeduplicator classes
- Add AssetPackager for markdown document packaging
- Create comprehensive test suite for all asset management components
- Add asset constants and custom exceptions for robust error handling

Markdown Processing Enhancements:
- Update markdown_commands.py with improved functionality
- Enhanced parsing and content aggregation capabilities
- Improved filename encoding/decoding for special characters

Test Suite Improvements:
- Add comprehensive tests for Issue #138 markdown parsing
- Enhance Issue #139 content aggregation and end-to-end testing
- Complete test coverage for new asset management features

Examples and Documentation:
- Update BildungsKanonJon.md example with enhanced content
- Generate corresponding HTML output for documentation
- Add asset registry configuration

Development Tools:
- Add install script for simplified setup

This commit represents a major enhancement to MarkiTect's asset handling
capabilities with full test coverage and improved markdown processing.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-12 19:57:31 +02:00

312 lines
11 KiB
Python

"""
AssetDeduplicator class for content-based asset deduplication with symlink support.
This module implements the AssetDeduplicator class that provides content-based
asset deduplication, symlink creation with relative paths, Windows fallback to
file copying, and conflict resolution for existing assets.
"""
import os
import platform
import shutil
from pathlib import Path
from typing import Dict, Any, Optional
from .exceptions import AssetError, DeduplicationError
from .registry import AssetRegistry
from .constants import CONFLICT_RESOLUTION_OPTIONS
class AssetDeduplicator:
    """Content-based asset deduplicator with symlink support and cross-platform compatibility.

    Files are copied into ``storage_path`` under a name derived from their
    content hash, so identical content is stored only once.  Hashing and all
    metadata bookkeeping are delegated to the ``registry`` object; this class
    only relies on its ``generate_content_hash``, ``asset_exists``,
    ``get_asset``, ``register_asset``, ``remove_asset`` and ``list_assets``
    methods and on the ``"path"`` / ``"content_hash"`` keys of the asset
    records it returns.
    """

    def __init__(self, storage_path: Path, registry: "AssetRegistry"):
        """Initialize AssetDeduplicator with storage path and registry.

        Args:
            storage_path: Directory where deduplicated assets are stored.
                It is created (including parents) when it does not exist.
            registry: AssetRegistry instance for metadata management.

        Raises:
            DeduplicationError: If the storage path exists but is not a
                directory, or if the directory cannot be created.
        """
        self.storage_path = Path(storage_path)
        self.registry = registry

        # Validate and create storage directory
        try:
            if self.storage_path.exists() and not self.storage_path.is_dir():
                raise DeduplicationError(f"Storage path exists but is not a directory: {storage_path}")
            self.storage_path.mkdir(parents=True, exist_ok=True)
        except DeduplicationError:
            raise
        except Exception as e:
            raise DeduplicationError(f"Failed to create storage directory: {storage_path}", cause=e)

    def store_asset(self, file_path: Path, description: Optional[str] = None) -> Dict[str, Any]:
        """Store asset with deduplication.

        The content hash of ``file_path`` is looked up in the registry.  If
        the registry already knows the hash, nothing is copied and the path
        recorded in the registry is returned.  Otherwise the file is copied
        into storage and registered.

        Args:
            file_path: Path to the asset file to store (``str`` is accepted
                and converted to ``Path``).
            description: Optional description passed to the registry when a
                new asset is registered.

        Returns:
            Dictionary with ``content_hash``, ``stored_path``,
            ``deduplicated`` and ``original_path``; for newly stored assets
            it also carries ``asset_info`` as returned by the registry.

        Raises:
            AssetError: If file doesn't exist.
            DeduplicationError: If storage operation fails.
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise AssetError(f"Asset file does not exist: {file_path}")

        try:
            # Generate content hash to check for deduplication
            content_hash = self.registry.generate_content_hash(file_path)

            if self.registry.asset_exists(content_hash):
                # Content is already stored: report the existing copy.
                existing_asset = self.registry.get_asset(content_hash)
                stored_path = Path(existing_asset["path"])
                return {
                    "content_hash": content_hash,
                    "stored_path": str(stored_path),
                    "deduplicated": True,
                    "original_path": str(file_path)
                }

            # New asset: copy it into storage, then register it.
            stored_path = self._generate_storage_path(content_hash, file_path)
            already_present = stored_path.exists()
            shutil.copy2(file_path, stored_path)
            try:
                asset_info = self.registry.register_asset(stored_path, description)
            except Exception:
                # Registration failed: do not leave an unregistered copy
                # behind.  Only a file created by this call is removed, and
                # the cleanup is best-effort so the original error surfaces.
                if not already_present:
                    try:
                        stored_path.unlink()
                    except OSError:
                        pass
                raise

            return {
                "content_hash": content_hash,
                "stored_path": str(stored_path),
                "deduplicated": False,
                "original_path": str(file_path),
                "asset_info": asset_info
            }
        except (AssetError, DeduplicationError):
            raise
        except Exception as e:
            raise DeduplicationError(f"Failed to store asset {file_path}", cause=e)

    def _generate_storage_path(self, content_hash: str, original_path: Path) -> Path:
        """Generate storage path for asset based on content hash.

        The asset is placed in a subdirectory named after the first two
        characters of the hash; the subdirectory is created if necessary.

        Args:
            content_hash: Hash of the content, used as the file name.
            original_path: Original file path (only its suffix is used).

        Returns:
            Path where the asset should be stored.
        """
        # Use first 2 chars of hash for directory structure
        storage_dir = self.storage_path / content_hash[:2]
        # parents=True: recreate the storage root too if it was removed
        # after this object was constructed.
        storage_dir.mkdir(parents=True, exist_ok=True)
        return storage_dir / (content_hash + original_path.suffix)

    def create_asset_link(self, stored_path: Path, link_path: Path,
                          conflict_resolution: str = "backup") -> Dict[str, Any]:
        """Create symlink or copy to stored asset.

        On Windows the asset is copied.  Elsewhere a relative symlink is
        created, and if ``os.symlink`` fails with ``OSError`` the asset is
        copied instead.

        Anything already at ``link_path`` (including a dangling symlink) is
        handled according to ``conflict_resolution`` first.  A failure while
        handling the existing entry is reported as an error; it does not
        trigger the copy fallback, so an existing file is never deleted when
        its backup could not be made.

        Args:
            stored_path: Path to the stored asset.
            link_path: Desired path for the link/copy.
            conflict_resolution: How to handle existing files ("overwrite", "backup", "skip").

        Returns:
            Dictionary with operation results.  ``link_type`` is ``"copy"``,
            ``"symlink"`` or ``"copy_fallback"``; a skipped operation returns
            ``{"skipped": True, "reason": ...}`` instead.

        Raises:
            DeduplicationError: If ``conflict_resolution`` is invalid or
                link creation fails.
        """
        if conflict_resolution not in CONFLICT_RESOLUTION_OPTIONS:
            raise DeduplicationError(f"Invalid conflict resolution: {conflict_resolution}")

        link_path = Path(link_path)
        try:
            # exists() follows symlinks and is False for a dangling link, so
            # is_symlink() is checked as well; otherwise a later copy would
            # write through the dangling link to its target.
            if link_path.exists() or link_path.is_symlink():
                if conflict_resolution == "skip":
                    return {"skipped": True, "reason": "File already exists"}
                elif conflict_resolution == "backup":
                    backup_path = link_path.with_suffix(link_path.suffix + ".bak")
                    shutil.move(str(link_path), str(backup_path))
                elif conflict_resolution == "overwrite":
                    link_path.unlink()

            # Ensure parent directory exists
            link_path.parent.mkdir(parents=True, exist_ok=True)

            if platform.system() == "Windows":
                # On Windows, use file copying instead of symlinks
                shutil.copy2(stored_path, link_path)
                return {
                    "link_created": True,
                    "link_type": "copy",
                    "link_path": str(link_path),
                    "target_path": str(stored_path)
                }

            # Elsewhere, create a symlink relative to the link's directory.
            relative_path = os.path.relpath(stored_path, link_path.parent)
            try:
                os.symlink(relative_path, link_path)
            except OSError as symlink_error:
                # Symlink creation failed, fallback to copying
                return self._copy_as_fallback(stored_path, link_path, symlink_error)

            return {
                "link_created": True,
                "link_type": "symlink",
                "link_path": str(link_path),
                "target_path": str(stored_path),
                "relative_target": relative_path
            }
        except DeduplicationError:
            raise
        except Exception as e:
            raise DeduplicationError(f"Failed to create asset link: {e}", cause=e)

    def _copy_as_fallback(self, stored_path: Path, link_path: Path,
                          symlink_error: OSError) -> Dict[str, Any]:
        """Copy the stored asset to ``link_path`` after symlink creation failed.

        Args:
            stored_path: Path to the stored asset.
            link_path: Destination of the copy.
            symlink_error: The error raised by ``os.symlink``.

        Returns:
            Result dictionary with ``link_type`` set to ``"copy_fallback"``.

        Raises:
            DeduplicationError: If the copy fails as well.
        """
        try:
            # Remove whatever is in the way, dangling symlinks included.
            if link_path.exists() or link_path.is_symlink():
                link_path.unlink()
            shutil.copy2(stored_path, link_path)
        except Exception as fallback_error:
            raise DeduplicationError(
                f"Failed to create link and fallback copy failed: {fallback_error}",
                cause=symlink_error
            )
        return {
            "link_created": True,
            "link_type": "copy_fallback",
            "link_path": str(link_path),
            "target_path": str(stored_path),
            "fallback_reason": str(symlink_error)
        }

    def get_asset_path(self, content_hash: str) -> Path:
        """Get path to stored asset by content hash.

        Args:
            content_hash: Hash of the asset content, as known to the registry.

        Returns:
            Path to the stored asset.

        Raises:
            DeduplicationError: If the registry lookup fails or the file
                recorded in the registry does not exist.
        """
        try:
            asset_info = self.registry.get_asset(content_hash)
            stored_path = Path(asset_info["path"])
            if not stored_path.exists():
                raise DeduplicationError(f"Stored asset file missing: {stored_path}")
            return stored_path
        except DeduplicationError:
            raise
        except Exception as e:
            raise DeduplicationError(f"Failed to get asset path for hash {content_hash}", cause=e)

    def verify_asset_integrity(self, content_hash: str) -> bool:
        """Verify integrity of stored asset by recomputing hash.

        Args:
            content_hash: Expected hash of the asset content.

        Returns:
            True if the recomputed hash equals ``content_hash``; False if it
            differs or if the asset cannot be located or hashed.
        """
        try:
            stored_path = self.get_asset_path(content_hash)
            computed_hash = self.registry.generate_content_hash(stored_path)
            return computed_hash == content_hash
        except Exception:
            # Deliberate: any failure to locate or hash the asset counts as
            # a failed integrity check rather than an error.
            return False

    def remove_stored_asset(self, content_hash: str) -> Dict[str, Any]:
        """Remove stored asset file and registry entry.

        The registry entry is removed even when the stored file is already
        gone, so entries reported as missing can be cleaned up.

        Args:
            content_hash: Hash of the asset content.

        Returns:
            Dictionary with ``registry_removed`` (the registry's return
            value), ``file_removed`` (whether a file was deleted) and
            ``removed_path``.

        Raises:
            DeduplicationError: If the registry lookup or the removal fails.
        """
        try:
            # Read the path straight from the registry.  get_asset_path()
            # would raise for a missing file and make such entries
            # impossible to remove.
            asset_info = self.registry.get_asset(content_hash)
            stored_path = Path(asset_info["path"])

            # Remove from registry first
            registry_removed = self.registry.remove_asset(content_hash)

            # Remove physical file
            file_removed = False
            if stored_path.exists():
                stored_path.unlink()
                file_removed = True
                # Remove empty parent directory if it exists
                try:
                    if not any(stored_path.parent.iterdir()):
                        stored_path.parent.rmdir()
                except OSError:
                    pass  # Directory not empty or other issue, ignore

            return {
                "registry_removed": registry_removed,
                "file_removed": file_removed,
                "removed_path": str(stored_path)
            }
        except Exception as e:
            raise DeduplicationError(f"Failed to remove stored asset {content_hash}", cause=e)

    def list_stored_assets(self) -> Dict[str, Any]:
        """List all stored assets with file system information.

        Returns:
            Dictionary with ``total_assets`` (registry entries),
            ``valid_assets`` (entries whose file exists), ``missing_assets``
            (content hashes of entries whose file is gone),
            ``total_size_bytes`` (summed size of existing files) and
            ``storage_path``.

        Raises:
            DeduplicationError: If the registry or file system cannot be read.
        """
        try:
            assets = self.registry.list_assets()
            total_size = 0
            valid_assets = 0
            missing_assets = []

            for asset in assets:
                stored_path = Path(asset["path"])
                if not stored_path.exists():
                    missing_assets.append(asset["content_hash"])
                    continue
                valid_assets += 1
                total_size += stored_path.stat().st_size

            return {
                "total_assets": len(assets),
                "valid_assets": valid_assets,
                "missing_assets": missing_assets,
                "total_size_bytes": total_size,
                "storage_path": str(self.storage_path)
            }
        except Exception as e:
            raise DeduplicationError("Failed to list stored assets", cause=e)