Files
markitect-main/capabilities/markitect-utils/src/markitect_utils/string_utils.py

162 lines
4.6 KiB
Python

"""
String utility functions for the MarkiTect ecosystem.
Provides common string manipulation and formatting functions that are
frequently needed across different MarkiTect capabilities.
"""
import re
from typing import Optional
# Maps each accented lowercase letter handled by slugify() to its ASCII base
# letter, so the whole substitution is a single str.translate() pass.
_SLUG_ACCENT_TABLE = str.maketrans({
    accented: plain
    for plain, accented_group in (
        ("a", "àáâãäå"),
        ("e", "èéêë"),
        ("i", "ìíîï"),
        ("o", "òóôõö"),
        ("u", "ùúûü"),
        ("y", "ýÿ"),
        ("c", "ç"),
        ("n", "ñ"),
    )
    for accented in accented_group
})


def slugify(text: str, separator: str = "-") -> str:
    """
    Convert a string to a URL-friendly slug.

    The text is lowercased, a fixed set of accented Latin letters is mapped
    to ASCII, characters other than word characters, whitespace and dashes
    are removed, and runs of whitespace/underscores become ``separator``.
    Dashes already present in the text are kept as they are.

    Args:
        text: The input string to convert
        separator: String to use for word separation (default: "-").
            May be empty (words are joined directly) or longer than one
            character; it is always treated as a literal unit.

    Returns:
        A lowercase string with special characters removed and words
        separated, or "" when ``text`` is empty.

    Examples:
        >>> slugify("Hello World!")
        'hello-world'
        >>> slugify("My Great Article", "_")
        'my_great_article'
        >>> slugify("Hello World", "")
        'helloworld'
    """
    if not text:
        return ""

    # Lowercase first so the accent table only needs lowercase entries.
    text = text.lower().translate(_SLUG_ACCENT_TABLE)

    # Drop everything that is not a word character, whitespace or a dash.
    # \w is Unicode-aware here, so letters outside the accent table survive.
    text = re.sub(r'[^\w\s-]', '', text)

    # A callable replacement inserts the separator literally; passing the
    # string itself would make re.sub interpret backslashes as escapes.
    def insert_separator(_match):
        return separator

    # Replace whitespace and underscores with the separator.
    text = re.sub(r'[\s_]+', insert_separator, text)

    if not separator:
        # Nothing to collapse or trim; an empty pattern group would be invalid.
        return text

    # Collapse repeated separators. The separator is matched as a whole unit
    # (not as a character set) so multi-character separators do not eat
    # ordinary text that merely shares characters with them.
    text = re.sub(f'(?:{re.escape(separator)})+', insert_separator, text)

    # Remove leading/trailing separators, again as whole units; str.strip()
    # would strip any of the separator's individual characters instead.
    width = len(separator)
    while text.startswith(separator):
        text = text[width:]
    while text.endswith(separator):
        text = text[:-width]
    return text
def truncate(text: str, max_length: int, suffix: str = "...") -> str:
    """
    Truncate a string to a maximum length, adding a suffix if truncated.

    Args:
        text: The input string to truncate
        max_length: Maximum length of the result (including suffix).
            Zero or negative values leave no room for any output.
        suffix: String to append if truncation occurs (default: "...")

    Returns:
        ``text`` unchanged when it is empty or already fits; otherwise the
        shortened text followed by ``suffix``. When ``max_length`` is too
        small to hold the suffix plus any text, a prefix of the suffix is
        returned; when ``max_length`` is not positive, "" is returned.

    Examples:
        >>> truncate("This is a long string", 10)
        'This is...'
        >>> truncate("Short", 10)
        'Short'
        >>> truncate("This is a long string", -1)
        ''
    """
    if not text or len(text) <= max_length:
        return text

    # A negative limit would otherwise reach suffix[:max_length], which
    # slices from the end and returns a non-empty, over-long result.
    if max_length <= 0:
        return ""

    # Not enough room for the suffix plus text: return as much of the
    # suffix as fits so the result never exceeds max_length.
    if max_length <= len(suffix):
        return suffix[:max_length]

    keep = max_length - len(suffix)
    return text[:keep] + suffix
def camel_to_snake(text: str) -> str:
    """
    Convert camelCase or PascalCase to snake_case.

    Underscores already present in the input are kept and never doubled,
    so mixed input such as "HTTP_Response" yields "http_response".

    Args:
        text: The input string in camelCase or PascalCase

    Returns:
        String converted to snake_case; empty input is returned unchanged.

    Examples:
        >>> camel_to_snake("camelCase")
        'camel_case'
        >>> camel_to_snake("PascalCase")
        'pascal_case'
        >>> camel_to_snake("XMLHttpRequest")
        'xml_http_request'
        >>> camel_to_snake("HTTP_Response")
        'http_response'
    """
    if not text:
        return text

    # Split before each capitalized word (uppercase letter followed by
    # lowercase letters) that is preceded by another character. This is what
    # separates an acronym from the word after it ("XMLHttp" -> "XML_Http").
    # The preceding character must not be an underscore, otherwise an
    # existing word boundary would be doubled.
    text = re.sub(r'([^_\n])([A-Z][a-z]+)', r'\1_\2', text)

    # Split wherever an uppercase letter directly follows a lowercase letter
    # or digit ("getHTTP" -> "get_HTTP").
    text = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', text)

    return text.lower()
def snake_to_camel(text: str, pascal_case: bool = False) -> str:
    """
    Convert snake_case to camelCase or PascalCase.

    Empty segments produced by leading, trailing or repeated underscores
    are ignored, so "_private_name" becomes "privateName" rather than
    starting with a capital letter in camelCase mode.

    Args:
        text: The input string in snake_case
        pascal_case: If True, return PascalCase; otherwise camelCase (default: False)

    Returns:
        String converted to camelCase or PascalCase. In camelCase mode the
        first word is kept exactly as written; every other word is
        capitalized (first letter upper, remainder lower). Empty input is
        returned unchanged.

    Examples:
        >>> snake_to_camel("snake_case")
        'snakeCase'
        >>> snake_to_camel("snake_case", pascal_case=True)
        'SnakeCase'
        >>> snake_to_camel("_private_name")
        'privateName'
    """
    if not text:
        return text

    # Discard empty segments so the "first word" is always a real word.
    words = [word for word in text.split('_') if word]
    if not words:
        # Input consisted only of underscores.
        return ""

    if pascal_case:
        head, rest = "", words
    else:
        head, rest = words[0], words[1:]

    return head + ''.join(word.capitalize() for word in rest)
def strip_ansi_codes(text: str) -> str:
    """
    Strip ANSI escape sequences out of a string.

    Both two-character escapes (ESC followed by one character in the
    ranges the pattern accepts) and CSI sequences (ESC followed by "[",
    parameters and a final character) are deleted.

    Args:
        text: String that may contain ANSI escape sequences

    Returns:
        The input with every matched escape sequence deleted; an empty
        input is handed back unchanged.

    Examples:
        >>> strip_ansi_codes("\\033[31mRed text\\033[0m")
        'Red text'
    """
    if text:
        # ESC, then either a single-character escape or a full CSI sequence.
        return re.sub(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])', '', text)
    return text