chore: Issue closure 125 cleanup
This commit is contained in:
162
capabilities/markitect-utils/src/markitect_utils/string_utils.py
Normal file
162
capabilities/markitect-utils/src/markitect_utils/string_utils.py
Normal file
@@ -0,0 +1,162 @@
|
||||
"""
|
||||
String utility functions for MarkiTect ecosystem.
|
||||
|
||||
Provides common string manipulation and formatting functions that are
|
||||
frequently needed across different MarkiTect capabilities.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# Accented Latin letters that slugify() folds to plain ASCII.  Built once at
# import time so every call performs a single translate() pass instead of a
# chain of regex substitutions.  Only lowercase forms are needed because the
# input is lowercased before the table is applied.
_SLUG_ACCENT_TABLE = str.maketrans({
    accented: plain
    for plain, group in (
        ("a", "àáâãäå"),
        ("e", "èéêë"),
        ("i", "ìíîï"),
        ("o", "òóôõö"),
        ("u", "ùúûü"),
        ("y", "ýÿ"),
        ("c", "ç"),
        ("n", "ñ"),
    )
    for accented in group
})


def slugify(text: str, separator: str = "-") -> str:
    """
    Convert a string to a URL-friendly slug.

    The text is lowercased, a fixed set of accented Latin letters is folded
    to ASCII, characters that are neither word characters, whitespace nor
    dashes are removed, and the remaining words are joined with ``separator``.

    Args:
        text: The input string to convert
        separator: String placed between words (default: "-"). It is
            inserted literally, so it may be empty or longer than one
            character.

    Returns:
        A lowercase string with special characters removed and words
        separated by ``separator``. Empty input yields "".

    Examples:
        >>> slugify("Hello World!")
        'hello-world'
        >>> slugify("My Great Article", "_")
        'my_great_article'
        >>> slugify("well-known fact", "_")
        'well_known_fact'
    """
    if not text:
        return ""

    # Lowercase, then fold the supported accented letters to ASCII.
    text = text.lower().translate(_SLUG_ACCENT_TABLE)

    # Delete (not replace) everything that is not a word character,
    # whitespace or a dash, e.g. "don't" -> "dont".
    text = re.sub(r"[^\w\s-]", "", text)

    # Whitespace, underscores and dashes all act as word boundaries.
    # Splitting and re-joining collapses runs of boundaries and drops
    # leading/trailing ones without ever building a regex from the
    # separator, so any separator string is handled safely.
    words = [word for word in re.split(r"[\s_-]+", text) if word]
    return separator.join(words)
|
||||
|
||||
|
||||
def truncate(text: str, max_length: int, suffix: str = "...") -> str:
    """
    Truncate a string to a maximum length, adding a suffix if truncated.

    Args:
        text: The input string to truncate
        max_length: Maximum length of the result (including suffix).
            Zero or negative values produce an empty string for any
            non-empty text.
        suffix: String to append if truncation occurs (default: "...")

    Returns:
        ``text`` unchanged when it is falsy or already fits; otherwise the
        shortened text followed by ``suffix``. When ``max_length`` leaves no
        room for any of the text, the suffix itself is cut to ``max_length``.

    Examples:
        >>> truncate("This is a long string", 10)
        'This is...'
        >>> truncate("Short", 10)
        'Short'
        >>> truncate("This is a long string", -1)
        ''
    """
    if not text or len(text) <= max_length:
        return text

    # A negative length would otherwise be used as a slice bound below and
    # count from the end of the suffix, returning a non-empty result.
    if max_length <= 0:
        return ""

    # No room for any of the text: return as much of the suffix as fits.
    if max_length <= len(suffix):
        return suffix[:max_length]

    keep = max_length - len(suffix)
    return text[:keep] + suffix
|
||||
|
||||
|
||||
def camel_to_snake(text: str) -> str:
    """
    Convert camelCase or PascalCase to snake_case.

    Existing underscores are kept as they are; no second underscore is
    inserted next to one that is already present.

    Args:
        text: The input string in camelCase or PascalCase

    Returns:
        String converted to snake_case. Falsy input is returned unchanged.

    Examples:
        >>> camel_to_snake("camelCase")
        'camel_case'
        >>> camel_to_snake("PascalCase")
        'pascal_case'
        >>> camel_to_snake("XMLHttpRequest")
        'xml_http_request'
        >>> camel_to_snake("some_Value")
        'some_value'
    """
    if not text:
        return text

    # Split in front of a capitalized word (uppercase letter followed by
    # lowercase letters), which separates an acronym from the word after it:
    # "XMLHttp" -> "XML_Http".  The preceding character must not be an
    # underscore, otherwise already-delimited input would get a doubled one;
    # newline is excluded as well so line breaks are never treated as the
    # character to split after.
    text = re.sub(r'([^_\n])([A-Z][a-z]+)', r'\1_\2', text)
    # Split between a lowercase letter or digit and the uppercase letter
    # that follows it: "getHTTP" -> "get_HTTP".
    text = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', text)

    return text.lower()
|
||||
|
||||
|
||||
def snake_to_camel(text: str, pascal_case: bool = False) -> str:
    """
    Convert snake_case to camelCase or PascalCase.

    Empty components produced by leading, trailing or repeated underscores
    are ignored, so the first real word is always the one that stays
    uncapitalized in camelCase mode.

    Args:
        text: The input string in snake_case
        pascal_case: If True, return PascalCase; otherwise camelCase (default: False)

    Returns:
        String converted to camelCase or PascalCase. Falsy input is returned
        unchanged; input consisting only of underscores yields "".

    Examples:
        >>> snake_to_camel("snake_case")
        'snakeCase'
        >>> snake_to_camel("snake_case", pascal_case=True)
        'SnakeCase'
        >>> snake_to_camel("_private_value")
        'privateValue'
    """
    if not text:
        return text

    # Drop empty pieces so "_foo_bar" is seen as ["foo", "bar"]; otherwise
    # the empty first piece would make "foo" a capitalized follow-up word.
    words = [word for word in text.split('_') if word]
    if not words:
        return ""

    # Only the first word differs between the two output styles.
    head = words[0].capitalize() if pascal_case else words[0]
    return head + ''.join(word.capitalize() for word in words[1:])
|
||||
|
||||
|
||||
def strip_ansi_codes(text: str) -> str:
    """
    Return ``text`` with ANSI escape sequences deleted.

    Two forms are recognised, both introduced by the ESC byte (0x1B):
    a two-character sequence whose second character lies in ``@``-``Z`` or
    ``\\``-``_``, and a ``[``-introduced sequence made of optional parameter
    characters, optional intermediate characters and one final character.

    Args:
        text: String that may contain ANSI escape sequences

    Returns:
        The input with every matched sequence removed. Falsy input is
        returned as-is.

    Examples:
        >>> strip_ansi_codes("\\033[31mRed text\\033[0m")
        'Red text'
    """
    if text:
        return re.sub(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])', '', text)

    # Nothing to scan: hand the falsy value back untouched.
    return text
|
||||
Reference in New Issue
Block a user