"""
Markitdown extractor — uses Microsoft's markitdown package for broad
file-type support.

Handles PDF, HTML, DOCX, PPTX, XLSX, XLS, CSV, JSON, and XML files.

Supports both the official ``markitdown`` package and the lighter
``markitdown-no-magika`` fork (which avoids the heavy magika/onnxruntime
dependency chain). The no-magika variant is preferred as the default.
"""

from pathlib import Path

from markitect.proxy.extractors.base import BaseExtractor
from markitect.proxy.models import ExtractionResult
from markitect.proxy.exceptions import DependencyMissingError


def _import_markitdown():
    """Locate the ``MarkItDown`` class in whichever distribution is present.

    ``markitdown_no_magika`` (the lighter fork) is attempted first; the
    official ``markitdown`` package is the fallback.

    Returns:
        The ``MarkItDown`` class, or ``None`` when neither import succeeds.
    """
    try:
        from markitdown_no_magika import MarkItDown as converter_cls
    except ImportError:
        try:
            from markitdown import MarkItDown as converter_cls
        except ImportError:
            return None
    return converter_cls


class MarkitdownExtractor(BaseExtractor):
    """Extractor that turns many document formats into Markdown via markitdown."""

    name = "markitdown"
    version = "1.0"
    extensions = (
        ".pdf",
        ".html",
        ".htm",
        ".docx",
        ".pptx",
        ".xlsx",
        ".xls",
        ".csv",
        ".json",
        ".xml",
    )

    def check_dependencies(self) -> bool:
        """Return ``True`` when one of the markitdown packages is importable."""
        converter_cls = _import_markitdown()
        return converter_cls is not None

    def dependency_hint(self) -> str:
        """Return the install command shown when markitdown is missing."""
        hint = 'pip install "markitect[proxy-markitdown]" (or: pip install markitdown-no-magika)'
        return hint

    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the file at *source_path* to Markdown.

        Args:
            source_path: Location of the file to convert; it is handed to
                markitdown as a string.

        Returns:
            An ``ExtractionResult`` carrying the converted text plus this
            extractor's name and version.

        Raises:
            DependencyMissingError: If no markitdown package can be imported,
                or if the conversion error names a ``MissingDependency``
                (in its type name or its message).
            Exception: Any other conversion error is re-raised unchanged.
        """
        converter_cls = _import_markitdown()
        if converter_cls is None:
            raise DependencyMissingError(
                "markitdown is required to extract this file type.",
                package="markitdown-no-magika",
                install_hint=self.dependency_hint(),
            )

        converter = converter_cls()
        try:
            converted = converter.convert(str(source_path))
        except Exception as exc:
            # markitdown reports an absent optional dependency through its
            # FileConversionException (and the MissingDependencyException
            # sub-type); recognise it by type name or by message text so the
            # caller gets a clean install hint instead.
            detail = str(exc)
            exc_type_name = type(exc).__name__
            missing_extra = (
                "MissingDependency" in exc_type_name
                or "MissingDependency" in detail
            )
            if not missing_extra:
                # Not a dependency problem: propagate the original error.
                raise

            ext = source_path.suffix.lstrip(".")
            raise DependencyMissingError(
                f"markitdown needs an extra dependency for {ext} files.",
                package=f"markitdown-no-magika[{ext}]",
                install_hint=f'pip install "markitdown-no-magika[{ext}]"',
            ) from exc

        return ExtractionResult(
            content=converted.text_content,
            extractor=self.name,
            extractor_version=self.version,
        )