When markitdown is installed but a format-specific sub-dependency is missing (e.g. pdfminer-six for PDF), translate the raw traceback into a DependencyMissingError with the correct install command. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
89 lines
2.9 KiB
Python
89 lines
2.9 KiB
Python
"""
Markitdown extractor — uses Microsoft's markitdown package for broad file-type support.

Handles PDF, HTML, DOCX, PPTX, XLSX, XLS, CSV, JSON, and XML files.

Supports both the official ``markitdown`` package and the lighter
``markitdown-no-magika`` fork (which avoids the heavy magika/onnxruntime
dependency chain). The no-magika variant is preferred as the default.
"""
|
|
|
|
from pathlib import Path
|
|
|
|
from markitect.proxy.extractors.base import BaseExtractor
|
|
from markitect.proxy.models import ExtractionResult
|
|
from markitect.proxy.exceptions import DependencyMissingError
|
|
|
|
|
|
def _import_markitdown():
    """Locate the ``MarkItDown`` class in whichever distribution is importable.

    ``markitdown_no_magika`` is attempted first; ``markitdown`` is only
    consulted when that import fails.

    Returns:
        The ``MarkItDown`` class, or ``None`` when neither package can be
        imported.
    """
    try:
        from markitdown_no_magika import MarkItDown as converter_cls
    except ImportError:
        # Lighter fork unavailable: fall back to the official package.
        try:
            from markitdown import MarkItDown as converter_cls
        except ImportError:
            converter_cls = None
    return converter_cls
|
|
|
|
|
|
class MarkitdownExtractor(BaseExtractor):
    """Extractor that hands conversion off to a ``MarkItDown`` instance.

    The converter class is looked up through ``_import_markitdown`` each time
    it is needed, so this class can be defined even when no markitdown
    package is installed.
    """

    name = "markitdown"
    version = "1.0"
    # File suffixes this extractor declares for itself.
    extensions = (
        ".pdf", ".html", ".htm", ".docx", ".pptx",
        ".xlsx", ".xls", ".csv", ".json", ".xml",
    )

    def check_dependencies(self) -> bool:
        """Return True when a ``MarkItDown`` class can be imported."""
        converter_cls = _import_markitdown()
        return converter_cls is not None

    def dependency_hint(self) -> str:
        """Return the install command shown when markitdown is unavailable."""
        return 'pip install "markitect[proxy-markitdown]" (or: pip install markitdown-no-magika)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert *source_path* and wrap the resulting text.

        Args:
            source_path: Path of the file to convert; it is passed to
                ``MarkItDown.convert`` as a string.

        Returns:
            An ``ExtractionResult`` holding the converter's ``text_content``
            together with this extractor's ``name`` and ``version``.

        Raises:
            DependencyMissingError: If no markitdown package is importable,
                or if the conversion fails with an exception whose class
                name or message contains ``"MissingDependency"``.
            Exception: Any other conversion failure is re-raised unchanged.
        """
        converter_cls = _import_markitdown()
        if converter_cls is None:
            raise DependencyMissingError(
                "markitdown is required to extract this file type.",
                package="markitdown-no-magika",
                install_hint=self.dependency_hint(),
            )

        converter = converter_cls()
        try:
            converted = converter.convert(str(source_path))
        except Exception as exc:
            # Intended for markitdown's FileConversionException and its
            # MissingDependencyException sub-type; detection is by name and
            # message text because those classes are not imported here.
            detail = str(exc)
            marker = "MissingDependency"
            if marker not in type(exc).__name__ and marker not in detail:
                # Unrelated failure: let the original exception propagate.
                raise
            ext = source_path.suffix.lstrip(".")
            raise DependencyMissingError(
                f"markitdown needs an extra dependency for {ext} files.",
                package=f"markitdown-no-magika[{ext}]",
                install_hint=f'pip install "markitdown-no-magika[{ext}]"',
            ) from exc

        return ExtractionResult(
            content=converted.text_content,
            extractor=self.name,
            extractor_version=self.version,
        )
|