chore: Issue closure 125 cleanup

2025-10-05 12:49:28 +02:00
parent 20e7f0f5bd
commit bce680e6cb
26 changed files with 2362 additions and 388 deletions
--- a/capabilities/markitect-utils/src/markitect_utils/init.py
+++ b/capabilities/markitect-utils/src/markitect_utils/init.py
@@ -0,0 +1,50 @@
+"""
+MarkiTect Utils - A collection of utility functions for the MarkiTect ecosystem.
+
+This capability provides commonly used utility functions that can be shared
+across different MarkiTect capabilities and projects.
+"""
+
+from .string_utils import (
+    slugify,
+    truncate,
+    camel_to_snake,
+    snake_to_camel,
+    strip_ansi_codes,
+)
+
+from .file_utils import (
+    safe_filename,
+    ensure_extension,
+    get_file_size,
+    is_text_file,
+    normalize_path,
+)
+
+from .validation_utils import (
+    is_valid_email,
+    is_valid_url,
+    is_valid_semver,
+    validate_required_fields,
+)
+
+# Version string of the markitect_utils package.
+__version__ = "0.1.0-dev"
+# Public API of the package: exactly the names imported from the
+# string_utils, file_utils and validation_utils submodules in this file.
+__all__ = [
+    # String utilities
+    "slugify",
+    "truncate",
+    "camel_to_snake",
+    "snake_to_camel",
+    "strip_ansi_codes",
+    # File utilities
+    "safe_filename",
+    "ensure_extension",
+    "get_file_size",
+    "is_text_file",
+    "normalize_path",
+    # Validation utilities
+    "is_valid_email",
+    "is_valid_url",
+    "is_valid_semver",
+    "validate_required_fields",
+]
--- a/capabilities/markitect-utils/src/markitect_utils/file_utils.py
+++ b/capabilities/markitect-utils/src/markitect_utils/file_utils.py
@@ -0,0 +1,168 @@
+"""
+File utility functions for MarkiTect ecosystem.
+
+Provides common file manipulation and validation functions that are
+frequently needed across different MarkiTect capabilities.
+"""
+
+import os
+import re
+from pathlib import Path
+from typing import Optional, Union
+
+
+def safe_filename(filename: str, replacement: str = "_") -> str:
+    """
+    Sanitize a string so that it can be used as a file name.
+
+    The characters ``< > : " / | ? *``, the backslash and the ASCII control
+    characters 0x00-0x1f are each swapped for ``replacement``. Dots and
+    spaces are then trimmed from both ends. Interior spaces are kept. If
+    nothing is left, or the part before the first dot is a Windows reserved
+    device name, the result is prefixed with ``file`` + ``replacement``.
+
+    Args:
+        filename: The input filename to sanitize
+        replacement: Text substituted for each unsafe character (default: "_")
+
+    Returns:
+        The sanitized filename, or "" when ``filename`` is empty
+
+    Examples:
+        >>> safe_filename("my file<>.txt")
+        'my file__.txt'
+        >>> safe_filename("file/with\\path.txt")
+        'file_with_path.txt'
+        >>> safe_filename("CON.txt")
+        'file_CON.txt'
+    """
+    if not filename:
+        return ""
+
+    # Substitute the unsafe characters, then trim dots/spaces at both ends.
+    cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', replacement, filename).strip('. ')
+
+    if cleaned:
+        # Windows reserves these device names regardless of extension or case,
+        # so only the text before the first dot is compared.
+        windows_devices = (
+            'CON', 'PRN', 'AUX', 'NUL',
+            'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9',
+            'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9',
+        )
+        stem = cleaned.partition('.')[0].upper()
+        if stem not in windows_devices:
+            return cleaned
+
+    # Reached for an empty result or a reserved device name.
+    return f"file{replacement}{cleaned}"
+
+
+def ensure_extension(filename: str, extension: str) -> str:
+    """
+    Append an extension to a filename unless it already ends with it.
+
+    An existing, different extension is not replaced; the requested one is
+    added after it. The comparison is case-sensitive.
+
+    Args:
+        filename: The input filename
+        extension: The desired extension (with or without leading dot)
+
+    Returns:
+        The filename ending in the requested extension; "" for an empty
+        filename, and the unchanged filename for an empty extension
+
+    Examples:
+        >>> ensure_extension("document", ".md")
+        'document.md'
+        >>> ensure_extension("document.txt", ".md")
+        'document.txt.md'
+        >>> ensure_extension("document.md", "md")
+        'document.md'
+    """
+    if not filename:
+        return ""
+    if not extension:
+        return filename
+
+    # Work with a dotted suffix whichever form the caller supplied.
+    suffix = extension if extension.startswith('.') else f".{extension}"
+    return filename if filename.endswith(suffix) else filename + suffix
+
+
+def get_file_size(file_path: Union[str, Path]) -> Optional[int]:
+    """
+    Return the size in bytes of the file at ``file_path``.
+
+    Args:
+        file_path: Path to the file
+
+    Returns:
+        The size in bytes, or None when the path cannot be stat'ed
+        (for example, it does not exist or access is denied)
+
+    Examples:
+        >>> get_file_size("document.txt")  # doctest: +SKIP
+        1024
+    """
+    try:
+        stat_result = os.stat(file_path)
+    except OSError:
+        # Missing file, permission problem, etc.
+        return None
+    return stat_result.st_size
+
+
+def is_text_file(file_path: Union[str, Path], sample_size: int = 512) -> bool:
+    """
+    Check if a file appears to be a text file by examining its content.
+
+    The first ``sample_size`` bytes are read. The file is treated as text
+    when the sample is empty, or when it contains no NUL byte and decodes
+    as UTF-8 (which includes plain ASCII). A multi-byte character cut in
+    half by the end of the sample does not count as a decoding failure.
+
+    Args:
+        file_path: Path to the file
+        sample_size: Number of bytes to sample from the file (default: 512)
+
+    Returns:
+        True if the file appears to be text, False otherwise (including
+        when the file cannot be opened or read)
+
+    Examples:
+        >>> is_text_file("document.txt")  # doctest: +SKIP
+        True
+    """
+    import codecs
+
+    try:
+        with open(file_path, 'rb') as f:
+            sample = f.read(sample_size)
+    except OSError:
+        return False
+
+    if not sample:
+        return True  # Empty file is considered text
+
+    # Check for null bytes (common in binary files)
+    if b'\x00' in sample:
+        return False
+
+    # When the sample is shorter than requested (or the whole file was read
+    # because sample_size is negative) we have seen the end of the file, so
+    # an incomplete trailing sequence is a real error. Otherwise the sample
+    # may simply have been cut in the middle of a multi-byte character.
+    reached_eof = sample_size < 0 or len(sample) < sample_size
+
+    decoder = codecs.getincrementaldecoder('utf-8')()
+    try:
+        decoder.decode(sample, final=reached_eof)
+    except UnicodeDecodeError:
+        return False
+    return True
+
+
+def normalize_path(path: Union[str, Path]) -> str:
+    """
+    Turn a path into an absolute one with relative components resolved.
+
+    Args:
+        path: The input path to normalize
+
+    Returns:
+        The resolved absolute path as a string, or "" for an empty input
+
+    Examples:
+        >>> normalize_path("./dir/../file.txt")  # doctest: +SKIP
+        '/current/working/directory/file.txt'
+    """
+    # Path.resolve() makes the path absolute and collapses "." / "..".
+    return str(Path(path).resolve()) if path else ""
--- a/capabilities/markitect-utils/src/markitect_utils/string_utils.py
+++ b/capabilities/markitect-utils/src/markitect_utils/string_utils.py
@@ -0,0 +1,162 @@
+"""
+String utility functions for MarkiTect ecosystem.
+
+Provides common string manipulation and formatting functions that are
+frequently needed across different MarkiTect capabilities.
+"""
+
+import re
+from typing import Optional
+
+
+def slugify(text: str, separator: str = "-") -> str:
+    """
+    Convert a string to a URL-friendly slug.
+
+    The text is lowercased and accented Latin letters are folded to their
+    ASCII base letter. Characters that are not word characters, whitespace
+    or dashes are dropped. Runs of whitespace/underscores become the
+    separator, repeated separators are collapsed, and separators are
+    trimmed from both ends.
+
+    Args:
+        text: The input string to convert
+        separator: Character to use for word separation (default: "-").
+            An empty separator simply joins the words together.
+
+    Returns:
+        A lowercase string with special characters removed and words separated
+
+    Examples:
+        >>> slugify("Hello World!")
+        'hello-world'
+        >>> slugify("My Great Article", "_")
+        'my_great_article'
+    """
+    import unicodedata
+
+    if not text:
+        return ""
+
+    folded = []
+    for char in text.lower():
+        if char >= '\x80':
+            # Fold only when the character is an ASCII letter plus combining
+            # marks (e.g. "é", "š"). Letters of other scripts are left alone
+            # so they are not altered by the decomposition.
+            decomposed = unicodedata.normalize('NFD', char)
+            base, marks = decomposed[0], decomposed[1:]
+            if marks and base < '\x80' and all(unicodedata.combining(m) for m in marks):
+                char = base
+        folded.append(char)
+    text = ''.join(folded)
+
+    # Drop everything that is not a word character, whitespace or a dash
+    text = re.sub(r'[^\w\s-]', '', text)
+    # Replace whitespace and underscores with separator
+    text = re.sub(r'[\s_]+', separator, text)
+
+    # An empty separator would produce the invalid pattern "[]+", and there
+    # is nothing to collapse or trim in that case anyway.
+    if separator:
+        # Replace multiple separators with single separator
+        text = re.sub(f'[{re.escape(separator)}]+', separator, text)
+        # Remove leading/trailing separators
+        text = text.strip(separator)
+
+    return text
+
+
+def truncate(text: str, max_length: int, suffix: str = "...") -> str:
+    """
+    Truncate a string to a maximum length, adding a suffix if truncated.
+
+    Args:
+        text: The input string to truncate
+        max_length: Maximum length of the result (including suffix).
+            Zero or a negative value yields an empty string for any
+            non-empty text.
+        suffix: String to append if truncation occurs (default: "...")
+
+    Returns:
+        The text unchanged when it already fits (or is empty); otherwise
+        the shortened text ending in ``suffix``. When ``max_length`` is too
+        small to hold the whole suffix, the suffix itself is cut down.
+
+    Examples:
+        >>> truncate("This is a long string", 10)
+        'This is...'
+        >>> truncate("Short", 10)
+        'Short'
+    """
+    if not text or len(text) <= max_length:
+        return text
+
+    # A negative limit must not be used as a slice bound: suffix[:-1] would
+    # return most of the suffix instead of nothing.
+    if max_length <= 0:
+        return ""
+
+    if max_length <= len(suffix):
+        return suffix[:max_length]
+
+    keep = max_length - len(suffix)
+    return text[:keep] + suffix
+
+
+def camel_to_snake(text: str) -> str:
+    """
+    Convert camelCase or PascalCase to snake_case.
+
+    Args:
+        text: The input string in camelCase or PascalCase
+
+    Returns:
+        The lowercased string with underscores inserted at word boundaries;
+        an empty input is returned unchanged
+
+    Examples:
+        >>> camel_to_snake("camelCase")
+        'camel_case'
+        >>> camel_to_snake("PascalCase")
+        'pascal_case'
+        >>> camel_to_snake("XMLHttpRequest")
+        'xml_http_request'
+    """
+    if not text:
+        return text
+
+    boundary_patterns = (
+        # Any character followed by a capitalised word ("LHttp" -> "L_Http")
+        '(.)([A-Z][a-z]+)',
+        # A lowercase letter or digit followed by a capital ("lC" -> "l_C")
+        '([a-z0-9])([A-Z])',
+    )
+    result = text
+    for boundary in boundary_patterns:
+        result = re.sub(boundary, r'\1_\2', result)
+
+    return result.lower()
+
+
+def snake_to_camel(text: str, pascal_case: bool = False) -> str:
+    """
+    Convert snake_case to camelCase or PascalCase.
+
+    The text is split on underscores. Every word after the first is
+    capitalised (first letter upper, rest lower); the first word is
+    capitalised only for PascalCase and otherwise left as it is.
+
+    Args:
+        text: The input string in snake_case
+        pascal_case: If True, return PascalCase; otherwise camelCase (default: False)
+
+    Returns:
+        String converted to camelCase or PascalCase; an empty input is
+        returned unchanged
+
+    Examples:
+        >>> snake_to_camel("snake_case")
+        'snakeCase'
+        >>> snake_to_camel("snake_case", pascal_case=True)
+        'SnakeCase'
+    """
+    if not text:
+        return text
+
+    # split() always yields at least one element, so the unpacking is safe.
+    head, *tail = text.split('_')
+    leading = head.capitalize() if pascal_case else head
+    return leading + ''.join(part.capitalize() for part in tail)
+
+
+def strip_ansi_codes(text: str) -> str:
+    """
+    Strip ANSI escape sequences (such as terminal colour codes) from text.
+
+    Args:
+        text: String that may contain ANSI escape sequences
+
+    Returns:
+        The string with every escape sequence removed; an empty input is
+        returned unchanged
+
+    Examples:
+        >>> strip_ansi_codes("\\033[31mRed text\\033[0m")
+        'Red text'
+    """
+    if not text:
+        return text
+
+    # ESC followed by either a single character in the @-Z, \, ], ^, _ range,
+    # or a "[" sequence made of parameter bytes, intermediate bytes and one
+    # final byte.
+    return re.sub(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])', '', text)
--- a/capabilities/markitect-utils/src/markitect_utils/validation_utils.py
+++ b/capabilities/markitect-utils/src/markitect_utils/validation_utils.py
@@ -0,0 +1,160 @@
+"""
+Validation utility functions for MarkiTect ecosystem.
+
+Provides common validation functions for various data types and formats
+that are frequently needed across different MarkiTect capabilities.
+"""
+
+import re
+from typing import Any, Dict, List, Optional, Union
+
+
+def is_valid_email(email: str) -> bool:
+    """
+    Check if a string is a valid email address format.
+
+    This is a pragmatic format check (local part, "@", dotted domain with
+    an alphabetic top-level label of at least two letters). It is not a
+    full RFC 5322 validation. The whole string must match, so surrounding
+    whitespace or a trailing newline makes the address invalid.
+
+    Args:
+        email: The email address to validate
+
+    Returns:
+        True if the email format is valid, False otherwise (including for
+        empty or non-string input)
+
+    Examples:
+        >>> is_valid_email("user@example.com")
+        True
+        >>> is_valid_email("invalid.email")
+        False
+    """
+    if not email or not isinstance(email, str):
+        return False
+
+    # fullmatch() instead of "^...$": "$" also matches just before a trailing
+    # newline, which would let "user@example.com\n" through.
+    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
+    return re.fullmatch(pattern, email) is not None
+
+
+def is_valid_url(url: str) -> bool:
+    """
+    Check if a string is a valid URL format.
+
+    Only ``http`` and ``https`` URLs are accepted. The host may be a dotted
+    domain name, ``localhost`` or a dotted-quad of 1-3 digit groups (the
+    numeric range of each group is not checked), optionally followed by a
+    port and a path/query without whitespace. The whole string must match.
+
+    Args:
+        url: The URL to validate
+
+    Returns:
+        True if the URL format is valid, False otherwise (including for
+        empty or non-string input)
+
+    Examples:
+        >>> is_valid_url("https://example.com")
+        True
+        >>> is_valid_url("not-a-url")
+        False
+    """
+    if not url or not isinstance(url, str):
+        return False
+
+    # URL regex pattern. It is applied with fullmatch() rather than "^...$"
+    # because "$" would also accept a trailing newline.
+    pattern = re.compile(
+        r'https?://'  # http:// or https://
+        # domain: DNS labels are at most 63 characters, and that includes the
+        # top-level label (e.g. ".technology"), hence {2,63}
+        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,63}\.?|'
+        r'localhost|'  # localhost...
+        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
+        r'(?::\d+)?'  # optional port
+        r'(?:/?|[/?]\S+)', re.IGNORECASE)
+
+    return pattern.fullmatch(url) is not None
+
+
+def is_valid_semver(version: str) -> bool:
+    """
+    Check if a string is a valid semantic version (semver) format.
+
+    Accepts ``MAJOR.MINOR.PATCH`` with optional ``-prerelease`` and
+    ``+build`` parts. Numeric identifiers must not have leading zeros. The
+    whole string must match, so a trailing newline or other surrounding
+    text makes the version invalid.
+
+    Args:
+        version: The version string to validate
+
+    Returns:
+        True if the version follows semver format, False otherwise
+        (including for empty or non-string input)
+
+    Examples:
+        >>> is_valid_semver("1.0.0")
+        True
+        >>> is_valid_semver("1.0.0-alpha.1")
+        True
+        >>> is_valid_semver("1.0")
+        False
+    """
+    if not version or not isinstance(version, str):
+        return False
+
+    # Semantic version regex pattern. It is applied with fullmatch() rather
+    # than "^...$" because "$" would also accept "1.0.0\n".
+    pattern = re.compile(
+        r'(?P<major>0|[1-9]\d*)\.'
+        r'(?P<minor>0|[1-9]\d*)\.'
+        r'(?P<patch>0|[1-9]\d*)'
+        r'(?:-(?P<prerelease>(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)'
+        r'(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?'
+        r'(?:\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?'
+    )
+
+    return pattern.fullmatch(version) is not None
+
+
+def validate_required_fields(data: Dict[str, Any], required_fields: List[str]) -> Dict[str, List[str]]:
+    """
+    Validate that required fields are present and not empty in a dictionary.
+
+    Args:
+        data: Dictionary to validate
+        required_fields: Field names that are required, as a list or tuple
+
+    Returns:
+        Dictionary with 'missing' and 'empty' keys containing lists of field
+        names, in the order they appear in ``required_fields``. Both lists
+        are empty when ``data`` is not a dict or ``required_fields`` is
+        neither a list nor a tuple.
+
+    Examples:
+        >>> validate_required_fields({"name": "John", "email": ""}, ["name", "email", "age"])
+        {'missing': ['age'], 'empty': ['email']}
+        >>> validate_required_fields({"name": "John", "email": "john@example.com"}, ["name", "email"])
+        {'missing': [], 'empty': []}
+    """
+    result: Dict[str, List[str]] = {
+        'missing': [],
+        'empty': []
+    }
+
+    # Tuples are accepted alongside lists: rejecting them made a call such as
+    # validate_required_fields(data, ("name", "email")) report no problems at
+    # all. A bare string is still rejected, since iterating it would treat
+    # every character as a field name.
+    if not isinstance(data, dict) or not isinstance(required_fields, (list, tuple)):
+        return result
+
+    for field in required_fields:
+        if field not in data:
+            result['missing'].append(field)
+        elif _is_empty_value(data[field]):
+            result['empty'].append(field)
+
+    return result
+
+
+def _is_empty_value(value: Any) -> bool:
+    """
+    Decide whether a value counts as "empty" for required-field validation.
+
+    None, blank or whitespace-only strings and zero-length list/tuple/dict/set
+    values are empty. Numbers and booleans are never empty, so 0 and False
+    count as real values. Anything else is empty when it is falsy.
+
+    Args:
+        value: The value to check
+
+    Returns:
+        True if the value is considered empty, False otherwise
+    """
+    if value is None:
+        return True
+
+    if isinstance(value, str):
+        # Whitespace-only text is treated the same as an empty string.
+        stripped = value.strip()
+        return not stripped
+
+    if isinstance(value, (list, tuple, dict, set)):
+        return not len(value)
+
+    # bool is a subclass of int, so this also covers True/False.
+    if isinstance(value, (int, float)):
+        return False
+
+    # Fall back to ordinary truthiness for every other type.
+    return not value