""" File utility functions for MarkiTect ecosystem. Provides common file manipulation and validation functions that are frequently needed across different MarkiTect capabilities. """ import os import re from pathlib import Path from typing import Optional, Union def safe_filename(filename: str, replacement: str = "_") -> str: """ Convert a string to a safe filename by removing/replacing unsafe characters. Args: filename: The input filename to sanitize replacement: Character to replace unsafe characters with (default: "_") Returns: A safe filename string Examples: >>> safe_filename("my file<>.txt") 'my_file__.txt' >>> safe_filename("file/with\\path.txt") 'file_with_path.txt' """ if not filename: return "" # Replace unsafe characters unsafe_chars = r'[<>:"/\\|?*\x00-\x1f]' safe_name = re.sub(unsafe_chars, replacement, filename) # Remove leading/trailing dots and spaces safe_name = safe_name.strip('. ') # Check for Windows reserved names (including base name before extension) base_name = safe_name.split('.')[0].upper() if safe_name else "" reserved_names = { 'CON', 'PRN', 'AUX', 'NUL', 'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9', 'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9' } # Ensure not empty and not reserved names if not safe_name or base_name in reserved_names: safe_name = f"file{replacement}{safe_name}" return safe_name def ensure_extension(filename: str, extension: str) -> str: """ Ensure a filename has the specified extension. Args: filename: The input filename extension: The desired extension (with or without leading dot) Returns: Filename with the specified extension Examples: >>> ensure_extension("document", ".md") 'document.md' >>> ensure_extension("document.txt", ".md") 'document.txt.md' >>> ensure_extension("document.md", "md") 'document.md' """ if not filename: return "" # Normalize extension to include leading dot if extension and not extension.startswith('.'): extension = f".{extension}" if extension and not filename.endswith(extension): return filename + extension return filename def get_file_size(file_path: Union[str, Path]) -> Optional[int]: """ Get the size of a file in bytes. Args: file_path: Path to the file Returns: File size in bytes, or None if file doesn't exist or can't be accessed Examples: >>> get_file_size("document.txt") # doctest: +SKIP 1024 """ try: return os.path.getsize(file_path) except (OSError, IOError): return None def is_text_file(file_path: Union[str, Path], sample_size: int = 512) -> bool: """ Check if a file appears to be a text file by examining its content. Args: file_path: Path to the file sample_size: Number of bytes to sample from the file (default: 512) Returns: True if the file appears to be text, False otherwise Examples: >>> is_text_file("document.txt") # doctest: +SKIP True """ try: with open(file_path, 'rb') as f: sample = f.read(sample_size) if not sample: return True # Empty file is considered text # Check for null bytes (common in binary files) if b'\x00' in sample: return False # Check if most bytes are printable ASCII or common UTF-8 try: sample.decode('utf-8') return True except UnicodeDecodeError: pass try: sample.decode('ascii') return True except UnicodeDecodeError: return False except (OSError, IOError): return False def normalize_path(path: Union[str, Path]) -> str: """ Normalize a file path by resolving relative components and converting to absolute. Args: path: The input path to normalize Returns: Normalized absolute path as a string Examples: >>> normalize_path("./dir/../file.txt") # doctest: +SKIP '/current/working/directory/file.txt' """ if not path: return "" return str(Path(path).resolve())