Files
markitect-main/markitect/proxy/extractors/markitdown_ext.py
tegwick e4fbba8a57 feat(proxy): add markitdown as default proxy backend
Uses markitdown-no-magika (lighter fork without magika/onnxruntime) to
handle PDF, HTML, DOCX, PPTX, XLSX, XLS, CSV, JSON, and XML files.
Specialized extractors (pymupdf4llm, markdownify) remain as fallbacks
when markitdown is not installed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 20:48:47 +01:00

76 lines
2.2 KiB
Python

"""
Markitdown extractor — uses Microsoft's markitdown package for broad file-type support.
Handles PDF, HTML, DOCX, PPTX, XLSX, XLS, CSV, JSON, and XML files.
Supports both the official ``markitdown`` package and the lighter
``markitdown-no-magika`` fork (which avoids the heavy magika/onnxruntime
dependency chain). The no-magika variant is preferred as the default.
"""
from pathlib import Path
from markitect.proxy.extractors.base import BaseExtractor
from markitect.proxy.models import ExtractionResult
from markitect.proxy.exceptions import DependencyMissingError
def _import_markitdown():
    """Locate the ``MarkItDown`` converter class from an installed backend.

    The lighter ``markitdown_no_magika`` module is attempted first; the
    ``markitdown`` module is attempted only when that first import raises
    ``ImportError``.

    Returns the ``MarkItDown`` class from the first module that imports
    successfully, or ``None`` when both imports fail.
    """
    try:
        from markitdown_no_magika import MarkItDown as lightweight_cls
    except ImportError:
        # Preferred backend is unavailable; fall through to the next one.
        pass
    else:
        return lightweight_cls

    try:
        from markitdown import MarkItDown as official_cls
    except ImportError:
        return None
    else:
        return official_cls
class MarkitdownExtractor(BaseExtractor):
    """Extractor that turns a range of document formats into Markdown.

    Conversion is delegated to the ``MarkItDown`` class resolved at call
    time by ``_import_markitdown``, so this class can be defined even when
    no markitdown backend is installed.
    """

    # Identifier and version recorded on every ExtractionResult produced here.
    name = "markitdown"
    version = "1.0"

    # File suffixes this extractor declares for itself.
    extensions = (
        ".pdf", ".html", ".htm",
        ".docx", ".pptx",
        ".xlsx", ".xls",
        ".csv", ".json", ".xml",
    )

    def check_dependencies(self) -> bool:
        """Return True when a ``MarkItDown`` class can be imported."""
        converter_cls = _import_markitdown()
        return converter_cls is not None

    def dependency_hint(self) -> str:
        """Return the install instruction shown when the backend is missing."""
        hint = 'pip install "markitect[proxy-markitdown]" (or: pip install markitdown-no-magika)'
        return hint

    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the file at *source_path* and wrap the text in a result.

        The path is passed to ``MarkItDown.convert`` as a string, and the
        ``text_content`` of the conversion becomes the result's content.

        Raises:
            DependencyMissingError: if no ``MarkItDown`` class is importable.
        """
        converter_cls = _import_markitdown()
        if converter_cls is None:
            raise DependencyMissingError(
                "markitdown is required to extract this file type.",
                package="markitdown-no-magika",
                install_hint=self.dependency_hint(),
            )

        # A fresh converter is built for each call; nothing is cached here.
        converted = converter_cls().convert(str(source_path))
        return ExtractionResult(
            content=converted.text_content,
            extractor=self.name,
            extractor_version=self.version,
        )