# markitect-main/capabilities/markitect-utils/src/markitect_utils/string_utils.py

"""
String utility functions for MarkiTect ecosystem.

Provides common string manipulation and formatting functions that are
frequently needed across different MarkiTect capabilities.
"""

import re
from typing import Optional


# Lookup table that folds common accented Latin letters to their ASCII base
# letter in a single pass (used by slugify after lowercasing).
_SLUG_ACCENT_TABLE = str.maketrans({
    accented: plain
    for plain, group in (
        ("a", "àáâãäå"),
        ("e", "èéêë"),
        ("i", "ìíîï"),
        ("o", "òóôõö"),
        ("u", "ùúûü"),
        ("y", "ýÿ"),
        ("c", "ç"),
        ("n", "ñ"),
    )
    for accented in group
})


def slugify(text: str, separator: str = "-") -> str:
    """
    Convert a string to a URL-friendly slug.

    The text is lowercased, a fixed set of accented Latin letters is folded
    to ASCII, punctuation is dropped, and runs of whitespace/underscores
    become a single separator. Leading and trailing separators are removed.

    Args:
        text: The input string to convert
        separator: String to use for word separation (default: "-"). It is
            always treated literally; it may be empty (words are joined
            directly) or longer than one character.

    Returns:
        A lowercase string with special characters removed and words
        separated; an empty string if ``text`` is empty.

    Examples:
        >>> slugify("Hello World!")
        'hello-world'
        >>> slugify("My Great Article", "_")
        'my_great_article'
        >>> slugify("Hello World", "")
        'helloworld'
    """
    if not text:
        return ""

    # Lowercase first so the accent table only needs lowercase entries.
    text = text.lower().translate(_SLUG_ACCENT_TABLE)

    # Drop everything that is not a word character, whitespace or a dash.
    text = re.sub(r'[^\w\s-]', '', text)

    # Turn whitespace/underscore runs into the separator. A callable
    # replacement inserts the separator verbatim, so a backslash in it is not
    # interpreted as a replacement-template escape.
    text = re.sub(r'[\s_]+', lambda _match: separator, text)

    if not separator:
        # Nothing to collapse or trim, and an empty pattern group would be
        # meaningless.
        return text

    # Match the separator as a whole string (not as a set of characters) so
    # multi-character separators do not eat letters from the text.
    sep_pattern = f'(?:{re.escape(separator)})'
    # Collapse repeated separators into one.
    text = re.sub(f'{sep_pattern}+', lambda _match: separator, text)
    # Remove leading/trailing separators.
    text = re.sub(f'^{sep_pattern}+|{sep_pattern}+\\Z', '', text)

    return text


def truncate(text: str, max_length: int, suffix: str = "...") -> str:
    """
    Truncate a string to a maximum length, adding a suffix if truncated.

    Args:
        text: The input string to truncate
        max_length: Maximum length of the result (including suffix). Zero or
            negative values yield an empty string for any non-empty text.
        suffix: String to append if truncation occurs (default: "...")

    Returns:
        ``text`` unchanged if it is empty or already fits; otherwise a string
        of at most ``max_length`` characters ending in ``suffix`` (or a
        prefix of ``suffix`` when the limit is too small to fit any text).

    Examples:
        >>> truncate("This is a long string", 10)
        'This is...'
        >>> truncate("Short", 10)
        'Short'
        >>> truncate("Anything", -1)
        ''
    """
    if not text or len(text) <= max_length:
        return text

    # A negative limit must not reach the slices below: slicing with a
    # negative bound counts from the end and would return a result longer
    # than the requested maximum.
    if max_length <= 0:
        return ""

    # Not enough room for the full suffix: return as much of it as fits.
    if max_length <= len(suffix):
        return suffix[:max_length]

    keep = max_length - len(suffix)
    return text[:keep] + suffix


def camel_to_snake(text: str) -> str:
    """
    Convert camelCase or PascalCase to snake_case.

    Underscores already present in the input are kept as-is and are not
    doubled, so mixed input such as ``my_VarName`` is handled.

    Args:
        text: The input string in camelCase or PascalCase

    Returns:
        String converted to snake_case (empty input is returned unchanged)

    Examples:
        >>> camel_to_snake("camelCase")
        'camel_case'
        >>> camel_to_snake("PascalCase")
        'pascal_case'
        >>> camel_to_snake("XMLHttpRequest")
        'xml_http_request'
        >>> camel_to_snake("my_VarName")
        'my_var_name'
    """
    if not text:
        return text

    # Split before a capitalized word (uppercase letter followed by lowercase
    # letters) that follows any character. This separates an acronym from the
    # word after it ("XMLHttp" -> "XML_Http"). An existing underscore is
    # excluded from the preceding character so it is not doubled.
    text = re.sub(r'([^_\n])([A-Z][a-z]+)', r'\1_\2', text)
    # Split between a lowercase letter or digit and a following uppercase
    # letter ("camelCase" -> "camel_Case", "v2Api" -> "v2_Api").
    text = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', text)

    return text.lower()


def snake_to_camel(text: str, pascal_case: bool = False) -> str:
    """
    Convert snake_case to camelCase or PascalCase.

    Underscores are removed. Empty segments produced by leading, trailing or
    repeated underscores are ignored, so the first real word stays lowercase
    in camelCase mode. Each capitalized word is produced with
    ``str.capitalize``, which lowercases the rest of the word
    (``"HTTP"`` becomes ``"Http"``). In camelCase mode the first word is
    left exactly as given.

    Args:
        text: The input string in snake_case
        pascal_case: If True, return PascalCase; otherwise camelCase (default: False)

    Returns:
        String converted to camelCase or PascalCase (empty input is returned
        unchanged; input consisting only of underscores yields "")

    Examples:
        >>> snake_to_camel("snake_case")
        'snakeCase'
        >>> snake_to_camel("snake_case", pascal_case=True)
        'SnakeCase'
        >>> snake_to_camel("_private_var")
        'privateVar'
    """
    if not text:
        return text

    # Skip empty segments so a leading "_" does not shift the "first word"
    # slot onto an empty string and capitalize the real first word.
    words = [word for word in text.split('_') if word]
    if not words:
        return ""

    first, *rest = words
    if pascal_case:
        first = first.capitalize()
    return first + ''.join(word.capitalize() for word in rest)


def strip_ansi_codes(text: str) -> str:
    """
    Return ``text`` with ANSI escape sequences deleted.

    Args:
        text: String that may contain ANSI escape sequences

    Returns:
        The input with every matched escape sequence removed; a falsy input
        (such as an empty string) is handed back unchanged.

    Examples:
        >>> strip_ansi_codes("\\033[31mRed text\\033[0m")
        'Red text'
    """
    if text:
        # ESC followed by either one character in the ranges "@"-"Z" or
        # "\"-"_", or a "["-introduced sequence made of optional "0"-"?"
        # characters, optional " "-"/" characters and one final "@"-"~"
        # character.
        return re.sub(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])', '', text)

    return text