Files
markitect-main/markitect/proxy/extractors/markitdown_ext.py
tegwick 120ed89780 fix(proxy): catch markitdown missing-dependency errors with clean hint
When markitdown is installed but a format-specific sub-dependency is
missing (e.g. pdfminer-six for PDF), translate the raw traceback into
a DependencyMissingError with the correct install command.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 21:00:51 +01:00

89 lines
2.9 KiB
Python

"""
Markitdown extractor — uses Microsoft's markitdown package for broad file-type support.
Handles PDF, HTML, DOCX, PPTX, XLSX, XLS, CSV, JSON, and XML files.
Supports both the official ``markitdown`` package and the lighter
``markitdown-no-magika`` fork (which avoids the heavy magika/onnxruntime
dependency chain). The no-magika variant is preferred as the default.
"""
from pathlib import Path
from markitect.proxy.extractors.base import BaseExtractor
from markitect.proxy.models import ExtractionResult
from markitect.proxy.exceptions import DependencyMissingError
def _import_markitdown():
    """Locate the ``MarkItDown`` converter class from an installed package.

    The lighter ``markitdown_no_magika`` module is attempted first; when it
    cannot be imported, the official ``markitdown`` module is attempted.

    Returns:
        The ``MarkItDown`` class from the first module that imports
        successfully, or ``None`` when neither import succeeds.
    """
    # Preferred variant: avoids the magika/onnxruntime dependency chain.
    try:
        from markitdown_no_magika import MarkItDown as lite_cls
    except ImportError:
        pass
    else:
        return lite_cls

    # Fallback: the official package.
    try:
        from markitdown import MarkItDown as official_cls
    except ImportError:
        return None
    else:
        return official_cls
class MarkitdownExtractor(BaseExtractor):
    """Converts many file types to Markdown via Microsoft markitdown."""

    name = "markitdown"
    version = "1.0"
    # File suffixes this extractor declares for itself.
    extensions = (
        ".pdf",
        ".html",
        ".htm",
        ".docx",
        ".pptx",
        ".xlsx",
        ".xls",
        ".csv",
        ".json",
        ".xml",
    )

    def check_dependencies(self) -> bool:
        """Return True when either markitdown package variant can be imported."""
        return _import_markitdown() is not None

    def dependency_hint(self) -> str:
        """Return the install command to show when markitdown itself is missing."""
        return 'pip install "markitect[proxy-markitdown]" (or: pip install markitdown-no-magika)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert *source_path* with markitdown and wrap the resulting text.

        Args:
            source_path: Path of the file to convert; it is passed to
                ``MarkItDown.convert`` as a string.

        Returns:
            An ``ExtractionResult`` carrying the converted text together with
            this extractor's ``name`` and ``version``.

        Raises:
            DependencyMissingError: If no markitdown package can be imported,
                or if the conversion fails with an error whose type name or
                message contains ``"MissingDependency"``.
            Exception: Any other conversion error is re-raised unchanged.
        """
        MarkItDown = _import_markitdown()
        if MarkItDown is None:
            raise DependencyMissingError(
                "markitdown is required to extract this file type.",
                package="markitdown-no-magika",
                install_hint=self.dependency_hint(),
            )

        converter = MarkItDown()
        try:
            result = converter.convert(str(source_path))
        except Exception as exc:
            # Catch markitdown's FileConversionException (and sub-type
            # MissingDependencyException) and surface a clean install hint.
            # Matching is by name/message text, so anything else propagates.
            if (
                "MissingDependency" not in type(exc).__name__
                and "MissingDependency" not in str(exc)
            ):
                raise

            # Normalise the suffix so ".PDF" yields the "pdf" extra, and avoid
            # an empty "[]" extra when the path has no suffix at all.
            ext = source_path.suffix.lstrip(".").lower()
            if ext:
                message = f"markitdown needs an extra dependency for {ext} files."
                extra = ext
            else:
                message = "markitdown needs an extra dependency for this file type."
                # NOTE(review): upstream markitdown documents an "all" extra;
                # assumed to exist in the no-magika fork as well — confirm.
                extra = "all"

            package = f"markitdown-no-magika[{extra}]"
            raise DependencyMissingError(
                message,
                package=package,
                install_hint=f'pip install "{package}"',
            ) from exc

        return ExtractionResult(
            content=result.text_content,
            extractor=self.name,
            extractor_version=self.version,
        )