162 lines
4.6 KiB
Python
162 lines
4.6 KiB
Python
"""
|
|
String utility functions for MarkiTect ecosystem.
|
|
|
|
Provides common string manipulation and formatting functions that are
|
|
frequently needed across different MarkiTect capabilities.
|
|
"""
|
|
|
|
import re
|
|
from typing import Optional
|
|
|
|
|
|
def slugify(text: str, separator: str = "-") -> str:
    """
    Convert a string to a URL-friendly slug.

    The text is lowercased, accented Latin letters are folded to their
    unaccented base letter, punctuation is dropped, and the remaining words
    are joined with ``separator``.

    Args:
        text: The input string to convert
        separator: String placed between words (default: "-"). It may be
            empty or longer than one character and is always used literally.

    Returns:
        A lowercase string with special characters removed and words
        separated by ``separator``. Empty input yields "".

    Notes:
        Whitespace, underscores and existing occurrences of ``separator``
        all act as word boundaries. Hyphens that are not part of the
        separator are kept inside words. Letters of non-Latin scripts and
        digits are left unchanged.

    Examples:
        >>> slugify("Hello World!")
        'hello-world'
        >>> slugify("My Great Article", "_")
        'my_great_article'
        >>> slugify("Dvořák's Café")
        'dvoraks-cafe'
    """
    if not text:
        return ""

    # Imported locally so this function does not depend on the module's
    # top-level import block.
    import unicodedata

    # Fold accented Latin letters (e.g. "é", "ñ", "ř") to their ASCII base
    # letter. Each character is decomposed canonically and replaced only
    # when its base is a-z, so letters of other scripts are left untouched.
    folded = []
    for char in text.lower():
        base = unicodedata.normalize("NFD", char)[0]
        folded.append(base if "a" <= base <= "z" else char)
    text = "".join(folded)

    # Drop everything that is not a word character, whitespace or hyphen.
    text = re.sub(r'[^\w\s-]', '', text)

    # Word boundaries: whitespace, underscores and the literal separator.
    # The separator is escaped and grouped as a whole so a multi-character
    # separator is not misread as a set of individual characters; an empty
    # separator is left out because it would produce an invalid pattern.
    boundary = r'[\s_]'
    if separator:
        boundary = rf'(?:[\s_]|{re.escape(separator)})'

    # Splitting and re-joining collapses runs of boundaries and trims
    # leading/trailing ones; str.join inserts the separator verbatim, so
    # backslashes in it are never interpreted as regex escapes.
    words = [word for word in re.split(f'{boundary}+', text) if word]
    return separator.join(words)
|
|
|
|
|
|
def truncate(text: str, max_length: int, suffix: str = "...") -> str:
    """
    Truncate a string to a maximum length, adding a suffix if truncated.

    Args:
        text: The input string to truncate
        max_length: Maximum length of the result (including suffix).
            Negative values are treated as 0.
        suffix: String to append if truncation occurs (default: "...")

    Returns:
        ``text`` unchanged when it is empty or already fits. Otherwise a
        string of at most ``max_length`` characters ending in ``suffix``.
        When ``max_length`` leaves no room for any text, the leading
        ``max_length`` characters of ``suffix`` are returned instead.

    Examples:
        >>> truncate("This is a long string", 10)
        'This is...'
        >>> truncate("Short", 10)
        'Short'
        >>> truncate("This is a long string", -1)
        ''
    """
    if not text:
        return text

    # A negative limit would otherwise act as a negative slice index on the
    # suffix below and return a result longer than the limit.
    limit = max(max_length, 0)

    if len(text) <= limit:
        return text

    # No room for any of the text: return as much of the suffix as fits.
    if limit <= len(suffix):
        return suffix[:limit]

    return text[:limit - len(suffix)] + suffix
|
|
|
|
|
|
def camel_to_snake(text: str) -> str:
    """
    Convert camelCase or PascalCase to snake_case.

    Args:
        text: The input string in camelCase or PascalCase

    Returns:
        String converted to snake_case. Empty input is returned unchanged.
        Underscores already present in the input are kept and never doubled.

    Examples:
        >>> camel_to_snake("camelCase")
        'camel_case'
        >>> camel_to_snake("PascalCase")
        'pascal_case'
        >>> camel_to_snake("XMLHttpRequest")
        'xml_http_request'
        >>> camel_to_snake("_PrivateName")
        '_private_name'
    """
    if not text:
        return text

    # Split before a capitalised word (an uppercase letter followed by
    # lowercase letters). This separates an acronym from the word after it:
    # "XMLHttp" -> "XML_Http". The preceding character must not be an
    # underscore, so existing separators are not doubled; newline is
    # excluded as well, matching what "." would have matched.
    text = re.sub(r'([^_\n])([A-Z][a-z]+)', r'\1_\2', text)
    # Split wherever a lowercase letter or digit is directly followed by an
    # uppercase letter: "HttpRequest" -> "Http_Request".
    text = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', text)

    return text.lower()
|
|
|
|
|
|
def snake_to_camel(text: str, pascal_case: bool = False) -> str:
    """
    Convert snake_case to camelCase or PascalCase.

    Args:
        text: The input string in snake_case
        pascal_case: If True, return PascalCase; otherwise camelCase (default: False)

    Returns:
        String converted to camelCase or PascalCase. Empty input is returned
        unchanged. Leading, trailing and repeated underscores are ignored.
        In camelCase mode the first word is kept exactly as given; every
        other word has its first letter uppercased and the rest lowercased.

    Examples:
        >>> snake_to_camel("snake_case")
        'snakeCase'
        >>> snake_to_camel("snake_case", pascal_case=True)
        'SnakeCase'
        >>> snake_to_camel("_private_var")
        'privateVar'
    """
    if not text:
        return text

    # Leading, trailing or doubled underscores produce empty pieces. Drop
    # them so the first real word is the one left uncapitalised in
    # camelCase mode.
    words = [word for word in text.split('_') if word]
    if not words:
        # The input consisted of underscores only.
        return ""

    if pascal_case:
        return ''.join(word.capitalize() for word in words)

    return words[0] + ''.join(word.capitalize() for word in words[1:])
|
|
|
|
|
|
def strip_ansi_codes(text: str) -> str:
    """
    Return ``text`` with ANSI escape sequences deleted.

    Two forms are matched, both introduced by the ESC character:
    a two-character escape (ESC followed by one character in the range
    ``@`` to ``Z``, or one of ``\\``, ``]``, ``^``, ``_``), and a CSI
    sequence (ESC ``[``, optional parameter and intermediate characters,
    then one final character in the range ``@`` to ``~``).

    Args:
        text: String that may contain ANSI escape sequences

    Returns:
        The input with every matched sequence removed. Falsy input (such as
        the empty string) is returned unchanged.

    Examples:
        >>> strip_ansi_codes("\\033[31mRed text\\033[0m")
        'Red text'
    """
    # Only non-empty input can contain anything to strip.
    if text:
        return re.sub(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])', '', text)
    return text