168 lines
4.5 KiB
Python
168 lines
4.5 KiB
Python
"""
|
|
File utility functions for the MarkiTect ecosystem.

Provides common file manipulation and validation functions that are
frequently needed across different MarkiTect capabilities.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Optional, Union
|
|
|
|
|
|
def safe_filename(filename: str, replacement: str = "_") -> str:
    r"""
    Convert a string to a safe filename by removing/replacing unsafe characters.

    Each of the characters ``< > : " / \ | ? *`` and every ASCII control
    character (0x00-0x1f) is replaced with ``replacement``. Leading and
    trailing dots and spaces are then stripped; spaces inside the name are
    kept. If the result is empty, or the text before its first dot is a
    Windows reserved device name (compared case-insensitively), the result
    is prefixed with ``"file" + replacement``.

    Args:
        filename: The input filename to sanitize
        replacement: Text to replace unsafe characters with (default: "_").
            It is inserted literally, never interpreted as a regex template.

    Returns:
        A safe filename string, or "" when ``filename`` is empty

    Examples:
        >>> safe_filename("my file<>.txt")
        'my file__.txt'
        >>> safe_filename("file/with\\path.txt")
        'file_with_path.txt'
        >>> safe_filename("CON.txt")
        'file_CON.txt'
        >>> safe_filename("...")
        'file_'
    """
    if not filename:
        return ""

    # A callable is used as the substitution so that backslashes or group
    # references inside `replacement` are inserted verbatim instead of being
    # expanded (or rejected) by the regex template engine.
    cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', lambda _match: replacement, filename)

    # Remove leading/trailing dots and spaces
    cleaned = cleaned.strip('. ')

    # Windows reserves these device names regardless of any extension, so
    # only the part before the first dot is compared.
    reserved_names = {
        'CON', 'PRN', 'AUX', 'NUL',
        'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9',
        'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9',
    }
    stem = cleaned.partition('.')[0].upper()

    # Ensure the result is neither empty nor a reserved name
    if not cleaned or stem in reserved_names:
        cleaned = f"file{replacement}{cleaned}"

    return cleaned
|
|
|
|
|
|
def ensure_extension(filename: str, extension: str) -> str:
    """
    Append an extension to a filename unless it already ends with it.

    The check is a plain, case-sensitive suffix comparison; an existing
    different extension is kept and the requested one is appended after it.

    Args:
        filename: The input filename
        extension: The desired extension (with or without leading dot)

    Returns:
        Filename ending with the specified extension. An empty filename
        yields "", and an empty extension leaves the filename unchanged.

    Examples:
        >>> ensure_extension("document", ".md")
        'document.md'
        >>> ensure_extension("document.txt", ".md")
        'document.txt.md'
        >>> ensure_extension("document.md", "md")
        'document.md'
    """
    if not filename:
        return ""

    # Nothing to enforce when no extension was requested
    if not extension:
        return filename

    # Work with a dotted suffix regardless of how the caller spelled it
    suffix = extension if extension.startswith('.') else f".{extension}"

    return filename if filename.endswith(suffix) else filename + suffix
|
|
|
|
|
|
def get_file_size(file_path: Union[str, Path]) -> Optional[int]:
    """
    Return the size in bytes of the file at ``file_path``.

    Args:
        file_path: Path to the file

    Returns:
        File size in bytes, or None if the path doesn't exist or can't be
        accessed (any OSError from the size lookup is swallowed)

    Examples:
        >>> get_file_size("document.txt")  # doctest: +SKIP
        1024
    """
    try:
        size_in_bytes = os.path.getsize(file_path)
    except OSError:
        # Missing file, permission problem, etc. -> report "unknown"
        return None
    else:
        return size_in_bytes
|
|
|
|
|
|
def is_text_file(file_path: Union[str, Path], sample_size: int = 512) -> bool:
    """
    Check if a file appears to be a text file by examining its content.

    The first ``sample_size`` bytes are read in binary mode. The file is
    considered text when that sample is empty, or when it contains no NUL
    byte and is valid UTF-8 (which includes plain ASCII). A multi-byte
    UTF-8 character that is cut off only because the sample limit was
    reached does not count as invalid.

    Args:
        file_path: Path to the file
        sample_size: Number of bytes to sample from the file (default: 512)

    Returns:
        True if the file appears to be text, False otherwise (including
        when the file cannot be opened or read)

    Examples:
        >>> is_text_file("document.txt")  # doctest: +SKIP
        True
    """
    # Function-scope import so this block does not depend on the module's
    # import section.
    import codecs

    try:
        with open(file_path, 'rb') as f:
            sample = f.read(sample_size)
    except OSError:
        return False

    if not sample:
        return True  # Empty file is considered text

    # Null bytes are common in binary files and essentially absent from text
    if b'\x00' in sample:
        return False

    # If the read filled the whole sample, the last character may have been
    # split by the cut-off; an incremental decoder with final=False accepts
    # such an incomplete trailing sequence while still rejecting invalid
    # bytes. When the read stopped short of the limit, the data must be
    # complete, so a dangling sequence is treated as an error.
    sample_was_cut = len(sample) == sample_size
    decoder = codecs.getincrementaldecoder('utf-8')()
    try:
        decoder.decode(sample, final=not sample_was_cut)
    except UnicodeDecodeError:
        return False
    return True
|
|
|
|
|
|
def normalize_path(path: Union[str, Path]) -> str:
    """
    Turn a path into its absolute, resolved string form.

    Resolution is delegated to ``Path.resolve()``.

    Args:
        path: The input path to normalize

    Returns:
        Normalized absolute path as a string, or "" when ``path`` is empty

    Examples:
        >>> normalize_path("./dir/../file.txt")  # doctest: +SKIP
        '/current/working/directory/file.txt'
    """
    if path:
        resolved = Path(path).resolve()
        return str(resolved)

    # Empty input stays empty instead of resolving to the working directory
    return ""