This commit preserves work from a refactoring session that attempted to: ACHIEVEMENTS: - Implemented Robustness Principle with dual-mode error handling - Created sophisticated error detection for edit mode failures - Added comprehensive safety utilities in control-base.js - Successfully recovered JavaScript components from git history - Fixed template variable substitution and initialization flow - Added detailed documentation (REFACTORING_SESSION_REPORT.md) PROBLEMS: - Violated GUARDRAILS.md by embedding JavaScript in Python strings - Mixed old and new component systems without proper migration - Content rendering issues - no visible content despite initialization - Became overly complex trying to solve multiple problems simultaneously LESSONS LEARNED: - Focus is critical - solve one problem at a time - Respect architectural constraints (keep JS separate from Python) - Component migration requires explicit planning - Incremental testing prevents complexity accumulation RECOMMENDATION: Reset to working commit and take focused, incremental approach that respects GUARDRAILS.md while achieving core edit mode functionality. See REFACTORING_SESSION_REPORT.md for detailed analysis. 🤖 Generated with Claude Code Co-Authored-By: Claude <noreply@anthropic.com>
583 lines
22 KiB
Python
583 lines
22 KiB
Python
"""
|
|
Asset discovery and scanning functionality for Issue #144.
|
|
|
|
This module provides automatic asset discovery from markdown files,
|
|
broken link detection, and asset usage analytics.
|
|
"""
|
|
|
|
import re
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import List, Optional, Dict, Any, Set
|
|
from dataclasses import dataclass, field
|
|
from enum import Enum
|
|
|
|
from .manager import AssetManager
|
|
from .utils import (
|
|
PathUtils, TimedOperation, BaseResult,
|
|
FileValidator, MemoryCache
|
|
)
|
|
|
|
|
|
class ReferenceType(Enum):
    """Kinds of asset reference that can be attached to an AssetReference."""

    # Inline image syntax.
    IMAGE = "image"

    # Inline link syntax.
    LINK = "link"

    # Asset embedded through markup (used for <script>/<link> tags).
    EMBED = "embed"

    # Reference-style definition line.
    REFERENCE_STYLE = "reference_style"
|
|
|
|
|
|
@dataclass
class AssetReference:
    """One reference to an asset, together with where it was found."""

    # --- required: location and target of the reference ---
    source_file: Path
    asset_path: str  # target as captured from the source text
    reference_type: ReferenceType
    line_number: int

    # --- optional text captured next to the target ---
    alt_text: str = ""
    title: str = ""

    # --- resolution state; the defaults mean "nothing resolved yet" ---
    is_broken: bool = False
    resolved_path: Optional[Path] = None
    resolved_hash: Optional[str] = None
|
|
|
|
|
|
@dataclass
class ScanResult:
    """Outcome of scanning a directory for asset references."""

    scanned_files: List[Path] = field(default_factory=list)
    asset_references: List[AssetReference] = field(default_factory=list)
    broken_links: List[AssetReference] = field(default_factory=list)
    processing_time: float = 0.0
    success: bool = True
    error: Optional[Exception] = None

    def __post_init__(self):
        """Keep ``success`` consistent with ``error`` after construction."""
        # Nothing to reconcile when there is no error or success is already off.
        if self.error is None or not self.success:
            return
        self.success = False

    def get_broken_links(self) -> List[AssetReference]:
        """Return the entries of ``asset_references`` whose ``is_broken`` is set.

        The list is rebuilt from ``asset_references`` on every call; the
        ``broken_links`` field is not consulted.
        """
        return list(filter(lambda reference: reference.is_broken, self.asset_references))
|
|
|
|
|
|
@dataclass
class RegistrationResult:
    """Outcome of an automatic asset registration run."""

    # Counters
    registered_count: int = 0
    skipped_broken: int = 0
    skipped_existing: int = 0

    # Per-asset failures collected during the run
    errors: List[Exception] = field(default_factory=list)

    processing_time: float = 0.0
    success: bool = True

    # Failure that aborted the whole run, if any
    error: Optional[Exception] = None

    def __post_init__(self):
        """Turn ``success`` off when the instance was built with any failure.

        A failure is either a non-None ``error`` or a non-empty ``errors``
        list. This only runs at construction time.
        """
        has_failure = self.error is not None or bool(self.errors)
        if has_failure and self.success:
            self.success = False
|
|
|
|
|
|
@dataclass
class UsageAnalysis:
    """Summary of how assets are used across a project."""

    # Counters
    total_assets: int = 0
    used_assets: int = 0
    unused_assets: int = 0
    broken_references: int = 0

    processing_time: float = 0.0
    success: bool = True
    error: Optional[Exception] = None

    # Asset records that were found to be unused
    unused_asset_list: List[Dict[str, Any]] = field(default_factory=list)

    def __post_init__(self):
        """Keep ``success`` consistent with ``error`` after construction."""
        if self.error is None or not self.success:
            return
        self.success = False

    def get_unused_assets(self) -> List[Dict[str, Any]]:
        """Return ``unused_asset_list`` itself (the same list object, not a copy)."""
        unused = self.unused_asset_list
        return unused
|
|
|
|
|
|
class MarkdownScanner:
    """Scanner for asset references in markdown files.

    Three syntaxes are recognised: inline images ``![alt](path "title")``,
    inline links ``[text](path "title")`` and reference-style definitions
    ``[id]: path "title"``.
    """

    # Inline links whose target starts with one of these are not reported.
    _EXTERNAL_LINK_PREFIXES = ('http:', 'https:', 'mailto:', 'data:')

    # Splits the target of a reference definition into its destination and an
    # optional title written as "title", 'title' or (title).
    _REFERENCE_TARGET = re.compile(
        r'^(\S+)\s+(?:"([^"]*)"|\'([^\']*)\'|\(([^)]*)\))$'
    )

    def __init__(self, scan_patterns: Optional[List[str]] = None,
                 ignore_patterns: Optional[List[str]] = None,
                 enable_caching: bool = True):
        """Initialize markdown scanner.

        Args:
            scan_patterns: Glob patterns of interest; defaults to
                ``["*.md", "*.mdx"]``. Stored only; ``scan_file`` does not
                consult it.
            ignore_patterns: Glob patterns to ignore; defaults to ``[]``.
                Stored only; ``scan_file`` does not consult it.
            enable_caching: When true, scan results are kept in a
                ``MemoryCache`` created with ``default_ttl=300.0``.
        """
        self.scan_patterns = scan_patterns or ["*.md", "*.mdx"]
        self.ignore_patterns = ignore_patterns or []
        self.logger = logging.getLogger(f'{__name__}.{self.__class__.__name__}')

        # Optional caching for repeated scans
        self.cache = MemoryCache(default_ttl=300.0) if enable_caching else None

        # Regex patterns for finding asset references
        self.image_pattern = re.compile(
            r'!\[([^\]]*)\]\(([^)\s]+)(?:\s+"([^"]*)")?\)',
            re.MULTILINE
        )
        self.link_pattern = re.compile(
            r'(?<!!)\[([^\]]*)\]\(([^)\s]+)(?:\s+"([^"]*)")?\)',
            re.MULTILINE
        )
        self.reference_pattern = re.compile(
            r'^\[([^\]]+)\]:\s*(.+)$',
            re.MULTILINE
        )

    def scan_file(self, file_path: Path) -> List[AssetReference]:
        """Scan a single markdown file for asset references.

        Args:
            file_path: File to scan; normalised with ``PathUtils`` first.

        Returns:
            References in the order: all images, then all non-external inline
            links, then all reference-style definitions. An empty list is
            returned when the file is not readable or cannot be decoded as
            UTF-8. When caching is enabled, the cached list object itself is
            returned on a hit.
        """
        file_path = PathUtils.normalize_path(file_path)

        if not FileValidator.is_readable_file(file_path):
            self.logger.debug(f"Skipping unreadable file: {file_path}")
            return []

        # The key embeds the modification time so an edited file misses the
        # cache. It is None when caching is off or the file cannot be stat'ed.
        cache_key = self._cache_key(file_path)
        if cache_key is not None:
            cached_result = self.cache.get(cache_key)
            if cached_result is not None:
                self.logger.debug(f"Using cached scan result for {file_path}")
                return cached_result

        try:
            content = file_path.read_text(encoding='utf-8')
        except (OSError, UnicodeError) as e:
            self.logger.warning(f"Failed to read file {file_path}: {e}")
            return []

        references: List[AssetReference] = []
        lines = content.splitlines()

        # Inline images
        for match in self.image_pattern.finditer(content):
            alt_text, asset_path, title = match.groups()
            references.append(AssetReference(
                source_file=file_path,
                asset_path=asset_path,
                reference_type=ReferenceType.IMAGE,
                line_number=self._get_line_number(content, match.start(), lines),
                alt_text=alt_text or "",
                title=title or ""
            ))

        # Inline links (external targets are not assets and are skipped)
        for match in self.link_pattern.finditer(content):
            link_text, asset_path, title = match.groups()
            if asset_path.startswith(self._EXTERNAL_LINK_PREFIXES):
                continue
            references.append(AssetReference(
                source_file=file_path,
                asset_path=asset_path,
                reference_type=ReferenceType.LINK,
                line_number=self._get_line_number(content, match.start(), lines),
                alt_text=link_text or "",
                title=title or ""
            ))

        # Reference-style definitions
        for match in self.reference_pattern.finditer(content):
            ref_id, target = match.groups()
            # The pattern captures the rest of the line; separate the optional
            # title so it does not end up inside the path.
            asset_path, title = self._split_reference_target(target)
            references.append(AssetReference(
                source_file=file_path,
                asset_path=asset_path,
                reference_type=ReferenceType.REFERENCE_STYLE,
                line_number=self._get_line_number(content, match.start(), lines),
                alt_text=ref_id,
                title=title
            ))

        if cache_key is not None:
            self.cache.set(cache_key, references)

        return references

    def _cache_key(self, file_path: Path) -> Optional[str]:
        """Return the cache key for *file_path*, or None when it should not be cached."""
        # Compare against None: a cache object may be falsy while empty.
        if self.cache is None:
            return None
        try:
            return f"scan:{file_path}:{file_path.stat().st_mtime}"
        except OSError:
            # The file changed state after validation; fall back to an
            # uncached scan, which reports the read failure itself.
            return None

    @classmethod
    def _split_reference_target(cls, target: str) -> tuple:
        """Split a reference-definition target into ``(destination, title)``.

        Surrounding whitespace is dropped. When no quoted or parenthesised
        title follows the destination, the stripped target is returned
        unchanged with an empty title.
        """
        target = target.strip()
        match = cls._REFERENCE_TARGET.match(target)
        if match is None:
            return target, ""
        destination, *title_groups = match.groups()
        title = next((text for text in title_groups if text is not None), "")
        return destination, title

    def _get_line_number(self, content: str, position: int, lines: List[str]) -> int:
        """Get the 1-based line number for a character position in the content.

        Args:
            content: Full text (unused; the offsets are derived from *lines*).
            position: Character offset into the text.
            lines: The text split into lines; each line is assumed to be
                followed by a single separator character.

        Returns:
            The line containing *position*, or ``len(lines)`` when the
            position lies past the last line.
        """
        consumed = 0
        for number, line in enumerate(lines, start=1):
            consumed += len(line) + 1  # +1 for newline
            if position < consumed:
                return number
        return len(lines)
|
|
|
|
|
|
def discover_assets_from_markdown(markdown_content: str, base_path: Path) -> List[AssetReference]:
    """
    Simple function to discover assets from markdown content for md-render.

    The content is written to a temporary ``.md`` file so the existing
    ``MarkdownScanner.scan_file`` can be reused; the file is removed afterwards.

    Args:
        markdown_content: The markdown content to scan
        base_path: Base path for resolving relative asset paths

    Returns:
        List of AssetReference objects found in the markdown. Each has
        ``source_file`` set to ``base_path``. Local targets that exist under
        ``base_path`` get ``resolved_path``; local targets that do not are
        marked ``is_broken``. External URLs and in-page anchors are left
        untouched.
    """
    import tempfile

    # Targets with these prefixes do not name a file under base_path.
    non_file_prefixes = ('http:', 'https:', 'mailto:', 'data:', '//', '#')

    # A throwaway scanner gains nothing from a result cache.
    scanner = MarkdownScanner(enable_caching=False)

    temp_path = None
    try:
        # scan_file reads the file as UTF-8, so it must be written as UTF-8
        # rather than in the platform's locale encoding.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False,
                                         encoding='utf-8') as temp_file:
            temp_path = Path(temp_file.name)
            temp_file.write(markdown_content)

        references = scanner.scan_file(temp_path)
        for ref in references:
            # Report the caller's base path, not the temporary file
            ref.source_file = base_path

            if ref.asset_path.startswith(non_file_prefixes):
                continue

            # Drop leading "./" segments and leading slashes only.
            # str.lstrip('./') would strip a character set and mangle
            # "../x" and ".hidden/x".
            clean_path = ref.asset_path
            while clean_path.startswith('./'):
                clean_path = clean_path[2:]
            clean_path = clean_path.lstrip('/')

            resolved_path = base_path / clean_path
            if resolved_path.exists():
                ref.resolved_path = resolved_path
            else:
                ref.is_broken = True
        return references
    finally:
        # Clean up the temporary file, including when writing or scanning failed
        if temp_path is not None:
            temp_path.unlink(missing_ok=True)
|
|
|
|
|
|
def discover_assets_from_html(html_content: str, base_path: Path) -> List[AssetReference]:
    """
    Discover JavaScript and CSS assets from HTML content for md-render.

    This function scans the final HTML output to find <script> and <link> tags
    that reference local assets, enabling proper asset shipping to target directories.

    Args:
        html_content: The HTML content to scan
        base_path: Base path for resolving relative asset paths

    Returns:
        List of AssetReference objects found in the HTML: all script
        references first, then all stylesheet references. Every entry has
        type ``EMBED`` and ``alt_text`` of ``"JavaScript"`` or ``"CSS"``.
        ``resolved_path`` is set when the file exists under ``base_path``;
        otherwise ``is_broken`` is True.
    """
    import re

    flags = re.IGNORECASE | re.MULTILINE
    # (pattern, label) for each kind of tag, in reporting order
    tag_patterns = (
        # <script src="...">
        (re.compile(r'<script[^>]+src=["\']([^"\']+)["\'][^>]*>', flags), "JavaScript"),
        # <link href="....css">
        (re.compile(r'<link[^>]+href=["\']([^"\']+\.css)["\'][^>]*>', flags), "CSS"),
    )

    references = []
    lines = html_content.splitlines()

    for pattern, label in tag_patterns:
        for match in pattern.finditer(html_content):
            asset_path = match.group(1)

            # Skip external URLs and data URLs
            if asset_path.startswith(('http:', 'https:', '//', 'data:', 'mailto:')):
                continue

            line_num = _get_html_line_number(html_content, match.start(), lines)

            # Drop leading "./" segments and leading slashes only.
            # str.lstrip('./') would strip a character set and mangle
            # "../x" and ".hidden/x".
            clean_path = asset_path
            while clean_path.startswith('./'):
                clean_path = clean_path[2:]
            clean_path = clean_path.lstrip('/')

            resolved_path = base_path / clean_path
            # Check the filesystem once so both fields always agree
            exists = resolved_path.exists()

            references.append(AssetReference(
                source_file=base_path,
                asset_path=asset_path,
                reference_type=ReferenceType.EMBED,
                line_number=line_num,
                alt_text=label,
                title="",
                resolved_path=resolved_path if exists else None,
                is_broken=not exists
            ))

    return references
|
|
|
|
|
|
def _get_html_line_number(content: str, position: int, lines: list) -> int:
|
|
"""Get line number for a position in HTML content."""
|
|
line_start = 0
|
|
for i, line in enumerate(lines):
|
|
line_end = line_start + len(line) + 1 # +1 for newline
|
|
if position < line_end:
|
|
return i + 1
|
|
line_start = line_end
|
|
return len(lines)
|
|
|
|
|
|
class AssetDiscoveryEngine:
    """Main engine for asset discovery and analysis.

    Combines a ``MarkdownScanner`` with an ``AssetManager`` to scan
    directories, flag broken references, register discovered assets and
    report which registered assets are in use.
    """

    # Targets with these prefixes never name a local file, so they are never
    # reported as broken: external URLs, protocol-relative URLs, in-page anchors.
    _NON_FILE_PREFIXES = ('http:', 'https:', 'data:', 'mailto:', '//', '#')

    def __init__(self, asset_manager: AssetManager, enable_caching: bool = True):
        """Initialize discovery engine.

        Args:
            asset_manager: Manager used to register assets and list the registry.
            enable_caching: Passed through to the internal ``MarkdownScanner``.
        """
        self.asset_manager = asset_manager
        self.scanner = MarkdownScanner(enable_caching=enable_caching)
        self.logger = logging.getLogger(f'{__name__}.{self.__class__.__name__}')

    def scan_directory(self, directory: Path, recursive: bool = True,
                       file_patterns: Optional[List[str]] = None) -> ScanResult:
        """Scan directory for asset references.

        Args:
            directory: Directory to scan.
            recursive: Use ``rglob`` (True) or ``glob`` (False) for matching.
            file_patterns: Glob patterns; defaults to ``["*.md", "*.mdx"]``.

        Returns:
            A ``ScanResult``. A missing or non-directory path yields a failed
            result carrying a ``ValueError``; a failure during the scan is
            logged and recorded on the result rather than raised.
        """
        directory = PathUtils.normalize_path(directory)
        if not directory.exists() or not directory.is_dir():
            error = ValueError(f"Directory {directory} does not exist or is not a directory")
            return ScanResult(success=False, error=error)

        with TimedOperation(f"directory scan of {directory}") as timer:
            result = ScanResult()
            patterns = file_patterns or ["*.md", "*.mdx"]

            try:
                # Find markdown files. dict.fromkeys drops duplicates produced
                # by overlapping patterns while keeping discovery order.
                find = directory.rglob if recursive else directory.glob
                matched = dict.fromkeys(
                    path for pattern in patterns for path in find(pattern)
                )
                result.scanned_files.extend(matched)

                self.logger.info(f"Found {len(result.scanned_files)} markdown files to scan")

                # Scan each file; one bad file must not abort the whole scan
                for file_path in result.scanned_files:
                    try:
                        result.asset_references.extend(self.scanner.scan_file(file_path))
                    except Exception as e:
                        self.logger.warning(f"Failed to scan file {file_path}: {e}")

                # Check for broken links
                for ref in result.asset_references:
                    ref.is_broken = self._is_reference_broken(ref, directory)
                    if ref.is_broken:
                        result.broken_links.append(ref)
                broken_count = len(result.broken_links)

                result.processing_time = timer.elapsed_time

                self.logger.info(f"Scan completed: {len(result.asset_references)} references found, "
                                 f"{broken_count} broken links detected")

            except Exception as e:
                self.logger.error(f"Failed to scan directory {directory}: {e}")
                result.success = False
                result.error = e
                result.processing_time = timer.elapsed_time

            return result

    @staticmethod
    def _strip_current_dir_prefix(asset_path: str) -> str:
        """Drop leading ``./`` segments and leading slashes from *asset_path*.

        ``str.lstrip('./')`` is deliberately not used: it strips a character
        set and would turn ``../x`` into ``x`` and ``.hidden/x`` into
        ``hidden/x``.
        """
        while asset_path.startswith('./'):
            asset_path = asset_path[2:]
        return asset_path.lstrip('/')

    def _is_reference_broken(self, reference: AssetReference, scan_root: Optional[Path] = None) -> bool:
        """Check if an asset reference is broken.

        Targets that do not name a local file (see ``_NON_FILE_PREFIXES``) are
        never broken. Any other target is broken when ``_resolve_asset_path``
        cannot find it.
        """
        if reference.asset_path.startswith(self._NON_FILE_PREFIXES):
            return False
        return self._resolve_asset_path(reference, scan_root) is None

    def _resolve_asset_path(self, reference: AssetReference,
                            scan_root: Optional[Path]) -> Optional[Path]:
        """Resolve asset path using multiple strategies.

        Candidates are tried in order: relative to the directory of the
        referencing file, then (when *scan_root* is given) relative to the
        scan root with any leading ``./`` or ``/`` removed.

        Returns:
            The first resolved candidate that exists, or None when none does
            or the path cannot be resolved.
        """
        try:
            candidates = [reference.source_file.parent / reference.asset_path]
            if scan_root is not None:
                candidates.append(
                    scan_root / self._strip_current_dir_prefix(reference.asset_path)
                )

            for candidate in candidates:
                resolved_path = candidate.resolve()
                if resolved_path.exists():
                    return resolved_path
            return None
        except (OSError, RuntimeError, ValueError) as e:
            # Unresolvable paths (e.g. symlink loops, invalid characters) are
            # treated the same as missing files.
            self.logger.debug(f"Could not resolve asset path {reference.asset_path}: {e}")
            return None

    def auto_register_assets(self, directory: Path, register_existing: bool = True,
                             skip_broken: bool = True) -> RegistrationResult:
        """Automatically register discovered assets.

        Args:
            directory: Directory to scan (always recursively).
            register_existing: When true, every resolvable asset is passed to
                ``asset_manager.add_asset``; when false such assets are only
                counted in ``skipped_existing``.
            skip_broken: When true, references flagged broken by the scan are
                counted in ``skipped_broken`` without a resolution attempt.

        Returns:
            A ``RegistrationResult``. ``success`` is False when the scan
            failed or when registering any single asset raised.
        """
        with TimedOperation("asset auto-registration") as timer:
            scan_result = self.scan_directory(directory, recursive=True)

            if not scan_result.success:
                return RegistrationResult(
                    success=False,
                    error=scan_result.error,
                    processing_time=timer.elapsed_time
                )

            registration_result = RegistrationResult()
            self.logger.info(f"Starting auto-registration of {len(scan_result.asset_references)} discovered assets")

            for ref in scan_result.asset_references:
                if ref.is_broken and skip_broken:
                    registration_result.skipped_broken += 1
                    continue

                try:
                    # Resolve asset path using multiple strategies
                    abs_asset_path = self._resolve_asset_path(ref, directory)

                    if not (abs_asset_path and FileValidator.is_readable_file(abs_asset_path)):
                        # Asset file doesn't exist or isn't readable
                        registration_result.skipped_broken += 1
                        continue

                    # Check if already registered
                    # (simplified - would check content hash in reality)
                    if not register_existing:
                        registration_result.skipped_existing += 1
                        continue

                    self.asset_manager.add_asset(abs_asset_path)
                    registration_result.registered_count += 1
                    self.logger.debug(f"Registered asset: {abs_asset_path}")

                except Exception as e:
                    registration_result.errors.append(e)
                    self.logger.warning(f"Failed to register asset {ref.asset_path}: {e}")

            # Errors were appended after construction, so the dataclass's
            # __post_init__ never saw them; apply its rule here.
            if registration_result.errors:
                registration_result.success = False

            registration_result.processing_time = timer.elapsed_time
            self.logger.info(f"Auto-registration completed: {registration_result.registered_count} assets registered")

            return registration_result

    def analyze_asset_usage(self, directory: Path) -> UsageAnalysis:
        """Analyze asset usage patterns across the project.

        Registered assets are matched to referenced files by comparing each
        registry entry's ``'content_hash'`` with the SHA-256 hex digest of the
        referenced file's bytes.

        Args:
            directory: Directory to scan (always recursively).

        Returns:
            A ``UsageAnalysis`` in which ``used_assets + unused_assets ==
            total_assets``. Failures are logged and recorded on the result
            rather than raised.
        """
        import hashlib

        with TimedOperation("asset usage analysis") as timer:
            analysis = UsageAnalysis()

            try:
                # Get all registered assets
                all_assets = self.asset_manager.registry.list_assets()
                analysis.total_assets = len(all_assets)

                # Scan for references
                scan_result = self.scan_directory(directory, recursive=True)

                if not scan_result.success:
                    return UsageAnalysis(
                        success=False,
                        error=scan_result.error,
                        processing_time=timer.elapsed_time
                    )

                analysis.broken_references = len(scan_result.broken_links)

                # Hash every referenced file that can be found on disk
                used_asset_hashes = set()
                for ref in scan_result.asset_references:
                    if ref.is_broken:
                        continue
                    resolved_path = self._resolve_asset_path(ref, directory)
                    if resolved_path is None:
                        continue
                    try:
                        content_hash = hashlib.sha256(resolved_path.read_bytes()).hexdigest()
                    except OSError as e:
                        # Unreadable target (e.g. a directory): it cannot match an asset
                        self.logger.debug(f"Could not hash {resolved_path}: {e}")
                        continue
                    used_asset_hashes.add(content_hash)

                # Identify unused assets
                analysis.unused_asset_list = [
                    asset for asset in all_assets
                    if asset['content_hash'] not in used_asset_hashes
                ]
                analysis.unused_assets = len(analysis.unused_asset_list)
                # Count registered assets that are referenced. Referenced files
                # that were never registered must not inflate this figure.
                analysis.used_assets = analysis.total_assets - analysis.unused_assets
                analysis.processing_time = timer.elapsed_time

                self.logger.info(f"Usage analysis completed: {analysis.used_assets}/{analysis.total_assets} "
                                 f"assets in use, {analysis.broken_references} broken references")

            except Exception as e:
                self.logger.error(f"Failed to analyze asset usage: {e}")
                analysis.success = False
                analysis.error = e
                analysis.processing_time = timer.elapsed_time

            return analysis