chore: Issue closure 125 cleanup

This commit is contained in:
2025-10-05 12:49:28 +02:00
parent 20e7f0f5bd
commit bce680e6cb
26 changed files with 2362 additions and 388 deletions

View File

@@ -0,0 +1,50 @@
"""
MarkiTect Utils - A collection of utility functions for the MarkiTect ecosystem.
This capability provides commonly used utility functions that can be shared
across different MarkiTect capabilities and projects.
"""
from .string_utils import (
slugify,
truncate,
camel_to_snake,
snake_to_camel,
strip_ansi_codes,
)
from .file_utils import (
safe_filename,
ensure_extension,
get_file_size,
is_text_file,
normalize_path,
)
from .validation_utils import (
is_valid_email,
is_valid_url,
is_valid_semver,
validate_required_fields,
)
# Package version string.
__version__ = "0.1.0-dev"
# Public API of the package: the names re-exported by the imports above,
# grouped by the submodule they come from.
__all__ = [
    # String utilities
    "slugify",
    "truncate",
    "camel_to_snake",
    "snake_to_camel",
    "strip_ansi_codes",
    # File utilities
    "safe_filename",
    "ensure_extension",
    "get_file_size",
    "is_text_file",
    "normalize_path",
    # Validation utilities
    "is_valid_email",
    "is_valid_url",
    "is_valid_semver",
    "validate_required_fields",
]

View File

@@ -0,0 +1,168 @@
"""
File utility functions for MarkiTect ecosystem.
Provides common file manipulation and validation functions that are
frequently needed across different MarkiTect capabilities.
"""
import codecs
import os
import re
from pathlib import Path
from typing import Optional, Union
def safe_filename(filename: str, replacement: str = "_") -> str:
    """
    Convert a string to a safe filename by replacing unsafe characters.

    The characters ``< > : " / \\ | ? *`` and ASCII control characters are
    replaced with ``replacement``. Leading/trailing dots and spaces are then
    stripped. Spaces inside the name are kept as-is. If the result is empty,
    or its base name (the part before the first dot) is a Windows reserved
    device name, the result is prefixed with ``file`` + ``replacement``.

    Args:
        filename: The input filename to sanitize
        replacement: String to replace unsafe characters with (default: "_")

    Returns:
        A safe filename string, or "" when ``filename`` is empty

    Examples:
        >>> safe_filename("my file<>.txt")
        'my file__.txt'
        >>> safe_filename("file/with\\path.txt")
        'file_with_path.txt'
        >>> safe_filename("CON.txt")
        'file_CON.txt'
    """
    if not filename:
        return ""

    # Replace characters that are invalid on common filesystems.
    unsafe_chars = r'[<>:"/\\|?*\x00-\x1f]'
    safe_name = re.sub(unsafe_chars, replacement, filename)

    # Remove leading/trailing dots and spaces.
    safe_name = safe_name.strip('. ')

    # Windows reserves these device names regardless of case or extension,
    # so compare the upper-cased part before the first dot.
    reserved_names = frozenset({
        'CON', 'PRN', 'AUX', 'NUL',
        'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9',
        'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9',
    })
    base_name = safe_name.split('.')[0].upper() if safe_name else ""

    # Prefix rather than reject, so the caller always gets a usable name.
    if not safe_name or base_name in reserved_names:
        safe_name = f"file{replacement}{safe_name}"

    return safe_name
def ensure_extension(filename: str, extension: str) -> str:
    """
    Return ``filename`` guaranteed to end with ``extension``.

    The extension is appended (never substituted) when the filename does not
    already end with it; the comparison is case-sensitive.

    Args:
        filename: The input filename
        extension: The desired extension (with or without leading dot)

    Returns:
        Filename ending with the extension; "" when ``filename`` is empty,
        and ``filename`` unchanged when ``extension`` is empty

    Examples:
        >>> ensure_extension("document", ".md")
        'document.md'
        >>> ensure_extension("document.txt", ".md")
        'document.txt.md'
        >>> ensure_extension("document.md", "md")
        'document.md'
    """
    if not filename:
        return ""
    if not extension:
        return filename

    # Accept both "md" and ".md" by normalizing to the dotted form.
    suffix = extension if extension.startswith('.') else f".{extension}"
    return filename if filename.endswith(suffix) else filename + suffix
def get_file_size(file_path: Union[str, Path]) -> Optional[int]:
    """
    Return the size in bytes of the file at ``file_path``.

    Args:
        file_path: Path to the file

    Returns:
        Size in bytes, or None when the path does not exist or cannot be
        accessed

    Examples:
        >>> get_file_size("document.txt")  # doctest: +SKIP
        1024
    """
    try:
        size = os.path.getsize(file_path)
    except OSError:  # IOError is an alias of OSError on Python 3
        return None
    return size
def is_text_file(file_path: Union[str, Path], sample_size: int = 512) -> bool:
    """
    Check if a file appears to be a text file by examining its content.

    The first ``sample_size`` bytes are read. The file is treated as text
    when that sample is empty, or when it contains no NUL byte and is valid
    UTF-8 (which includes plain ASCII). A multi-byte character cut off by the
    end of the sample is tolerated, because that is an artifact of sampling
    and not evidence of binary content.

    Args:
        file_path: Path to the file
        sample_size: Number of bytes to sample from the file (default: 512)

    Returns:
        True if the file appears to be text, False otherwise (including when
        the file cannot be opened or read)

    Examples:
        >>> is_text_file("document.txt")  # doctest: +SKIP
        True
    """
    try:
        with open(file_path, 'rb') as f:
            sample = f.read(sample_size)
    except OSError:
        return False

    if not sample:
        return True  # Empty file is considered text

    # NUL bytes are common in binary files and practically absent from text.
    if b'\x00' in sample:
        return False

    # Only when the sample holds the complete file is a dangling partial
    # character a real error. A None or negative size makes read() return
    # everything, so the sample is complete in that case too.
    read_whole_file = (
        sample_size is None or sample_size < 0 or len(sample) < sample_size
    )
    decoder = codecs.getincrementaldecoder('utf-8')()
    try:
        decoder.decode(sample, final=read_whole_file)
    except UnicodeDecodeError:
        return False
    return True
def normalize_path(path: Union[str, Path]) -> str:
    """
    Turn ``path`` into an absolute path with relative components resolved.

    Args:
        path: The input path to normalize

    Returns:
        The resolved absolute path as a string, or "" for a falsy input

    Examples:
        >>> normalize_path("./dir/../file.txt")  # doctest: +SKIP
        '/current/working/directory/file.txt'
    """
    if path:
        resolved = Path(path).resolve()
        return str(resolved)
    return ""

View File

@@ -0,0 +1,162 @@
"""
String utility functions for MarkiTect ecosystem.
Provides common string manipulation and formatting functions that are
frequently needed across different MarkiTect capabilities.
"""
import re
from typing import Optional
def slugify(text: str, separator: str = "-") -> str:
    """
    Convert a string to a URL-friendly slug.

    The text is lower-cased, common accented Latin letters are mapped to
    their ASCII base letter, and every character that is not a word
    character, whitespace or a dash is removed. Runs of whitespace,
    underscores and separator characters then act as word boundaries, and
    the remaining words are joined with ``separator``.

    Args:
        text: The input string to convert
        separator: String to use for word separation (default: "-"); may be
            empty, in which case the words are simply concatenated

    Returns:
        A lowercase string with special characters removed and words
        separated, or "" when ``text`` is empty

    Examples:
        >>> slugify("Hello World!")
        'hello-world'
        >>> slugify("My Great Article", "_")
        'my_great_article'
    """
    if not text:
        return ""

    # Map accented letters to their ASCII equivalents in a single pass.
    accent_groups = (
        ("àáâãäå", "a"),
        ("èéêë", "e"),
        ("ìíîï", "i"),
        ("òóôõö", "o"),
        ("ùúûü", "u"),
        ("ýÿ", "y"),
        ("ç", "c"),
        ("ñ", "n"),
    )
    accent_table = {
        ord(accented): plain
        for accented_chars, plain in accent_groups
        for accented in accented_chars
    }
    text = text.lower().translate(accent_table)

    # Drop (not replace) everything except word characters, whitespace, dashes.
    text = re.sub(r'[^\w\s-]', '', text)

    # Split on boundaries and re-join instead of substituting the separator
    # into a regex: this collapses repeated boundaries and trims the ends in
    # one step, and stays valid for an empty separator (which would otherwise
    # produce the invalid pattern "[]+") or one containing a backslash.
    boundary = re.compile(r'[\s_' + re.escape(separator) + r']+')
    words = [word for word in boundary.split(text) if word]
    return separator.join(words)
def truncate(text: str, max_length: int, suffix: str = "...") -> str:
    """
    Truncate a string to a maximum length, adding a suffix if truncated.

    Args:
        text: The input string to truncate
        max_length: Maximum length of the result (including suffix)
        suffix: String to append if truncation occurs (default: "...")

    Returns:
        ``text`` unchanged when it is empty or already fits; otherwise a
        string of at most ``max_length`` characters ending in ``suffix``
        (or in a prefix of ``suffix`` when there is no room for any text).
        A ``max_length`` of zero or less yields "".

    Examples:
        >>> truncate("This is a long string", 10)
        'This is...'
        >>> truncate("Short", 10)
        'Short'
    """
    if not text or len(text) <= max_length:
        return text

    # A negative length would otherwise be taken as a "from the end" slice
    # index below and return a result longer than requested.
    if max_length <= 0:
        return ""

    # No room for any text: return as much of the suffix as fits.
    if max_length <= len(suffix):
        return suffix[:max_length]

    return text[:max_length - len(suffix)] + suffix
def camel_to_snake(text: str) -> str:
    """
    Convert a camelCase or PascalCase identifier to snake_case.

    Args:
        text: The input string in camelCase or PascalCase

    Returns:
        The snake_case form; a falsy input is returned unchanged

    Examples:
        >>> camel_to_snake("camelCase")
        'camel_case'
        >>> camel_to_snake("PascalCase")
        'pascal_case'
        >>> camel_to_snake("XMLHttpRequest")
        'xml_http_request'
    """
    if not text:
        return text

    # Pass 1: break before a capitalized word (an uppercase letter followed
    # by lowercase letters) that follows any character, which splits an
    # acronym from the word after it.
    split_words = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', text)

    # Pass 2: break between a lowercase letter or digit and the uppercase
    # letter that follows it.
    split_all = re.sub('([a-z0-9])([A-Z])', r'\1_\2', split_words)

    return split_all.lower()
def snake_to_camel(text: str, pascal_case: bool = False) -> str:
    """
    Convert a snake_case identifier to camelCase or PascalCase.

    Every underscore-separated word after the first is capitalized (first
    letter upper-cased, the rest lower-cased). The first word is left
    untouched for camelCase and capitalized for PascalCase.

    Args:
        text: The input string in snake_case
        pascal_case: If True, return PascalCase; otherwise camelCase (default: False)

    Returns:
        The converted string; a falsy input is returned unchanged

    Examples:
        >>> snake_to_camel("snake_case")
        'snakeCase'
        >>> snake_to_camel("snake_case", pascal_case=True)
        'SnakeCase'
    """
    if not text:
        return text

    # str.split always yields at least one element, so the unpacking is safe.
    head, *tail = text.split('_')
    first = head.capitalize() if pascal_case else head
    return first + ''.join(part.capitalize() for part in tail)
def strip_ansi_codes(text: str) -> str:
    """
    Strip ANSI escape sequences out of a string.

    Args:
        text: String that may contain ANSI escape sequences

    Returns:
        The string with ANSI codes removed; a falsy input is returned
        unchanged

    Examples:
        >>> strip_ansi_codes("\\033[31mRed text\\033[0m")
        'Red text'
    """
    if text:
        # ESC followed by either a single character in the @-Z / \-_ ranges,
        # or "[" plus optional parameter and intermediate bytes and one
        # final byte.
        return re.sub(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])', '', text)
    return text

View File

@@ -0,0 +1,160 @@
"""
Validation utility functions for MarkiTect ecosystem.
Provides common validation functions for various data types and formats
that are frequently needed across different MarkiTect capabilities.
"""
import re
from typing import Any, Dict, List, Optional, Union
def is_valid_email(email: str) -> bool:
    """
    Check if a string is a valid email address format.

    This is a pragmatic format check (local part, "@", dotted domain ending
    in a TLD of at least two letters), not a full RFC 5322 validation.

    Args:
        email: The email address to validate

    Returns:
        True if the email format is valid, False otherwise (including for
        empty or non-string input)

    Examples:
        >>> is_valid_email("user@example.com")
        True
        >>> is_valid_email("invalid.email")
        False
    """
    if not email or not isinstance(email, str):
        return False

    # Basic email regex pattern
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

    # fullmatch, not match + "$": "$" also matches just before a trailing
    # newline, which would let "user@example.com\n" pass validation.
    return re.fullmatch(pattern, email) is not None
def is_valid_url(url: str) -> bool:
    """
    Check if a string is a valid URL format.

    Accepts ``http``/``https`` URLs whose host is a dotted domain name,
    ``localhost`` or a dotted-quad IPv4 address, with an optional port and
    an optional path/query. Matching is case-insensitive. IPv4 octets are
    not range-checked.

    Args:
        url: The URL to validate

    Returns:
        True if the URL format is valid, False otherwise (including for
        empty or non-string input)

    Examples:
        >>> is_valid_url("https://example.com")
        True
        >>> is_valid_url("not-a-url")
        False
    """
    if not url or not isinstance(url, str):
        return False

    # URL regex pattern
    pattern = re.compile(
        r'https?://'  # http:// or https://
        # domain; the TLD may be up to 63 letters (the DNS label limit) so
        # that long TLDs such as ".technology" are accepted
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,63}\.?|'
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)', re.IGNORECASE)

    # fullmatch, not match + "$": "$" also matches just before a trailing
    # newline, which would let "https://example.com\n" pass validation.
    return pattern.fullmatch(url) is not None
def is_valid_semver(version: str) -> bool:
    """
    Check if a string is a valid semantic version (semver) format.

    Requires ``MAJOR.MINOR.PATCH`` with no leading zeros in the numeric
    parts, optionally followed by ``-prerelease`` and/or ``+buildmetadata``
    made of dot-separated identifiers.

    Args:
        version: The version string to validate

    Returns:
        True if the version follows semver format, False otherwise
        (including for empty or non-string input)

    Examples:
        >>> is_valid_semver("1.0.0")
        True
        >>> is_valid_semver("1.0.0-alpha.1")
        True
        >>> is_valid_semver("1.0")
        False
    """
    if not version or not isinstance(version, str):
        return False

    # Semantic version regex pattern
    pattern = re.compile(
        r'(?P<major>0|[1-9]\d*)\.'
        r'(?P<minor>0|[1-9]\d*)\.'
        r'(?P<patch>0|[1-9]\d*)'
        r'(?:-(?P<prerelease>(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)'
        r'(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?'
        r'(?:\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?'
    )

    # fullmatch, not match + "$": "$" also matches just before a trailing
    # newline, which would let "1.0.0\n" pass validation.
    return pattern.fullmatch(version) is not None
def validate_required_fields(data: Dict[str, Any], required_fields: List[str]) -> Dict[str, List[str]]:
    """
    Report which required fields are absent from, or empty in, a dictionary.

    When ``data`` is not a dict or ``required_fields`` is not a list, no
    checking is performed and both result lists are empty.

    Args:
        data: Dictionary to validate
        required_fields: List of field names that are required

    Returns:
        Dictionary with 'missing' and 'empty' keys containing lists of field
        names, each in the order the fields appear in ``required_fields``

    Examples:
        >>> validate_required_fields({"name": "John", "email": ""}, ["name", "email", "age"])
        {'missing': ['age'], 'empty': ['email']}
        >>> validate_required_fields({"name": "John", "email": "john@example.com"}, ["name", "email"])
        {'missing': [], 'empty': []}
    """
    absent: List[str] = []
    blank: List[str] = []

    if isinstance(data, dict) and isinstance(required_fields, list):
        for name in required_fields:
            if name not in data:
                absent.append(name)
            elif _is_empty_value(data[name]):
                blank.append(name)

    return {'missing': absent, 'empty': blank}
def _is_empty_value(value: Any) -> bool:
    """
    Decide whether a value counts as empty for validation purposes.

    None, whitespace-only strings and zero-length list/tuple/dict/set values
    are empty. Numbers and booleans are never empty (0 and False are valid
    values). Any other type is empty when it is falsy.

    Args:
        value: The value to check

    Returns:
        True if the value is considered empty, False otherwise
    """
    if value is None:
        return True
    if isinstance(value, str):
        return value.strip() == ""
    if isinstance(value, (list, tuple, dict, set)):
        return len(value) == 0

    # bool is a subclass of int, so this covers True/False as well.
    is_number = isinstance(value, (int, float))
    return False if is_number else not value