# Files
# markitect-main/capabilities/markitect-utils/src/markitect_utils/file_utils.py
#
# 168 lines
# 4.5 KiB
# Python

"""
File utility functions for MarkiTect ecosystem.
Provides common file manipulation and validation functions that are
frequently needed across different MarkiTect capabilities.
"""
import os
import re
from pathlib import Path
from typing import Optional, Union
def safe_filename(filename: str, replacement: str = "_") -> str:
    r"""
    Convert a string to a safe filename by removing/replacing unsafe characters.

    Each of the characters ``< > : " / \ | ? *`` and each ASCII control
    character (0x00-0x1f) is replaced with ``replacement``. Interior spaces
    are kept as-is; only leading/trailing dots and spaces are stripped.
    If the result is empty, or its base name (the part before the first dot)
    is a Windows reserved device name, it is prefixed with
    ``"file" + replacement``.

    Args:
        filename: The input filename to sanitize
        replacement: String to replace unsafe characters with (default: "_")

    Returns:
        A safe filename string, or "" when ``filename`` is empty

    Examples:
        >>> safe_filename("my file<>.txt")
        'my file__.txt'
        >>> safe_filename("file/with\\path.txt")
        'file_with_path.txt'
        >>> safe_filename("con.txt")
        'file_con.txt'
    """
    if not filename:
        return ""

    # Replace unsafe characters
    unsafe_chars = r'[<>:"/\\|?*\x00-\x1f]'
    safe_name = re.sub(unsafe_chars, replacement, filename)

    # Remove leading/trailing dots and spaces
    safe_name = safe_name.strip('. ')

    # Windows reserved names are matched case-insensitively against the part
    # before the first dot, so "con.txt" is caught as well as "CON".
    reserved_names = {
        'CON', 'PRN', 'AUX', 'NUL',
        'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9',
        'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9',
    }
    base_name = safe_name.split('.')[0].upper()

    # Ensure not empty and not a reserved name
    if not safe_name or base_name in reserved_names:
        safe_name = f"file{replacement}{safe_name}"

    return safe_name
def ensure_extension(filename: str, extension: str) -> str:
    """
    Return ``filename`` with ``extension`` appended unless it already ends with it.

    The extension may be given with or without its leading dot. An existing,
    different extension is kept; the requested one is appended after it.
    The comparison is case-sensitive.

    Args:
        filename: The input filename
        extension: The desired extension (with or without leading dot)

    Returns:
        Filename ending with the specified extension; "" for an empty
        filename, and the filename unchanged for an empty extension

    Examples:
        >>> ensure_extension("document", ".md")
        'document.md'
        >>> ensure_extension("document.txt", ".md")
        'document.txt.md'
        >>> ensure_extension("document.md", "md")
        'document.md'
    """
    if not filename:
        return ""
    if not extension:
        # Nothing to enforce
        return filename

    # Normalize to the dotted form before comparing
    suffix = extension if extension.startswith('.') else f".{extension}"
    return filename if filename.endswith(suffix) else filename + suffix
def get_file_size(file_path: Union[str, Path]) -> Optional[int]:
    """
    Return the size in bytes of the file at ``file_path``.

    Args:
        file_path: Path to the file

    Returns:
        File size in bytes, or None if the path can't be stat'ed
        (for example because it doesn't exist or isn't accessible)

    Examples:
        >>> get_file_size("document.txt")  # doctest: +SKIP
        1024
    """
    try:
        info = os.stat(file_path)
    except OSError:
        # Missing file, permission problem, etc. -- report "unknown"
        return None
    return info.st_size
def is_text_file(file_path: Union[str, Path], sample_size: int = 512) -> bool:
    """
    Check if a file appears to be a text file by examining its content.

    Up to ``sample_size`` bytes are read from the start of the file. The file
    is considered text when the sample is empty, or when it contains no null
    bytes and is valid UTF-8 (which includes plain ASCII). A multi-byte
    character that is cut off by the end of the sample is not treated as an
    error, as long as more file content may follow.

    Args:
        file_path: Path to the file
        sample_size: Number of bytes to sample from the file (default: 512)

    Returns:
        True if the file appears to be text, False otherwise (including when
        the file can't be opened or read)

    Examples:
        >>> is_text_file("document.txt")  # doctest: +SKIP
        True
    """
    import codecs  # local import: only needed by this function

    try:
        with open(file_path, 'rb') as f:
            sample = f.read(sample_size)
    except (OSError, IOError):
        return False

    if not sample:
        return True  # Empty file is considered text

    # Check for null bytes (common in binary files)
    if b'\x00' in sample:
        return False

    # A short read (or an unbounded read) means the sample ends at EOF, so a
    # dangling partial character there is a real error. Otherwise the sample
    # may simply have sliced a multi-byte character in half.
    reached_eof = (
        sample_size is None
        or sample_size < 0
        or len(sample) < sample_size
    )

    # The incremental decoder buffers an incomplete trailing sequence when
    # final=False instead of raising, but still rejects invalid bytes.
    decoder = codecs.getincrementaldecoder('utf-8')()
    try:
        decoder.decode(sample, final=reached_eof)
    except UnicodeDecodeError:
        return False
    return True
def normalize_path(path: Union[str, Path]) -> str:
    """
    Turn a path into its resolved, absolute string form.

    Args:
        path: The input path to normalize

    Returns:
        Normalized absolute path as a string, or "" for an empty input

    Examples:
        >>> normalize_path("./dir/../file.txt")  # doctest: +SKIP
        '/current/working/directory/file.txt'
    """
    if path:
        resolved = Path(path).resolve()
        return str(resolved)
    return ""