feat(proxy): add markitdown as default proxy backend

Uses markitdown-no-magika (a lighter fork without magika/onnxruntime) to handle PDF, HTML, DOCX, PPTX, XLSX, XLS, CSV, JSON, and XML files. Specialized extractors (pymupdf4llm, markdownify) remain as fallbacks when markitdown is not installed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 20:48:47 +01:00
parent ac334c679d
commit e4fbba8a57
3 changed files with 93 additions and 1 deletions
--- a/markitect/proxy/extractors/init.py
+++ b/markitect/proxy/extractors/init.py
@@ -2,6 +2,11 @@
 Built-in extractor registration.
 Importing this module registers all built-in extractors with the global registry.
 Registration order matters: specialized extractors are registered first, then
 markitdown (if available) overwrites the overlapping extensions so it becomes
 the default backend.  If markitdown is not installed, the specialized extractors
 remain active for their extensions.
 """
 from markitect.proxy.registry import registry
@@ -9,6 +14,17 @@ from markitect.proxy.extractors.pdf import PdfExtractor
 from markitect.proxy.extractors.html import HtmlExtractor
 from markitect.proxy.extractors.markdown import MarkdownNormalizer
 # 1. Specialized extractors (baseline)
 #    Registered first so that they stay in place for their extensions when
 #    the markitdown step below does not register anything.
 registry.register(PdfExtractor())
 registry.register(HtmlExtractor())
 registry.register(MarkdownNormalizer())
 # 2. Markitdown as default backend — overrides .pdf, .html, .htm and adds
 #    new types (.docx, .pptx, .xlsx, .xls, .csv, .json, .xml)
 #    The override relies on a later registry.register() call replacing an
 #    earlier one for the same extension, as the module docstring describes
 #    (registry behaviour itself is not visible here — confirm if in doubt).
 try:
    from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor
    _ext = MarkitdownExtractor()
    # Only register when one of the markitdown packages is importable;
    # otherwise the specialized extractors registered above are kept.
    if _ext.check_dependencies():
        registry.register(_ext)
 except ImportError:
    # Best-effort: skip the markitdown backend entirely on any ImportError.
    # NOTE(review): markitdown_ext already catches ImportError for the
    # third-party package inside _import_markitdown(), so an ImportError
    # reaching this handler presumably comes from a project-internal import
    # and is silently hidden — confirm this is intended.
    pass
--- a/markitect/proxy/extractors/markitdown_ext.py
+++ b/markitect/proxy/extractors/markitdown_ext.py
@@ -0,0 +1,75 @@
 """
 Markitdown extractor — uses Microsoft's markitdown package for broad file-type support.
 Handles PDF, HTML, DOCX, PPTX, XLSX, XLS, CSV, JSON, and XML files.
 Supports both the official ``markitdown`` package and the lighter
 ``markitdown-no-magika`` fork (which avoids the heavy magika/onnxruntime
 dependency chain).  The no-magika variant is preferred as the default.
 """
 from pathlib import Path
 from markitect.proxy.extractors.base import BaseExtractor
 from markitect.proxy.models import ExtractionResult
 from markitect.proxy.exceptions import DependencyMissingError
 def _import_markitdown():
    """Locate the ``MarkItDown`` converter class among the installed packages.

    The ``markitdown_no_magika`` package is attempted before ``markitdown``.

    Returns:
        The ``MarkItDown`` class from the first package that imports, or
        ``None`` when neither import succeeds.
    """
    try:
        from markitdown_no_magika import MarkItDown as converter_cls
    except ImportError:
        # Preferred package unavailable: fall back to the official one.
        try:
            from markitdown import MarkItDown as converter_cls
        except ImportError:
            converter_cls = None
    return converter_cls
 class MarkitdownExtractor(BaseExtractor):
    """Extractor that delegates Markdown conversion to the markitdown library.

    Handles the file suffixes listed in ``extensions``.  The library is looked
    up through ``_import_markitdown`` at call time rather than at import time.
    """

    name = "markitdown"
    version = "1.0"
    # NOTE(review): the packaging in this change requests only the ``[pdf]``
    # extra of markitdown-no-magika; confirm that the Office and spreadsheet
    # suffixes below convert successfully with that install.
    extensions = (
        ".pdf", ".html", ".htm",
        ".docx", ".pptx",
        ".xlsx", ".xls",
        ".csv", ".json", ".xml",
    )

    def check_dependencies(self) -> bool:
        """Return True when one of the markitdown packages can be imported."""
        converter_cls = _import_markitdown()
        return converter_cls is not None

    def dependency_hint(self) -> str:
        """Return the install instructions used when markitdown is missing."""
        return 'pip install "markitect[proxy-markitdown]"  (or: pip install markitdown-no-magika)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the file at ``source_path`` to Markdown.

        Args:
            source_path: Location of the file to convert; it is handed to
                markitdown as a string.

        Returns:
            An ``ExtractionResult`` holding the converted text together with
            this extractor's name and version.

        Raises:
            DependencyMissingError: If no markitdown package is importable.
        """
        converter_cls = _import_markitdown()
        if converter_cls is None:
            raise DependencyMissingError(
                "markitdown is required to extract this file type.",
                package="markitdown-no-magika",
                install_hint=self.dependency_hint(),
            )
        conversion = converter_cls().convert(str(source_path))
        return ExtractionResult(
            content=conversion.text_content,
            extractor=self.name,
            extractor_version=self.version,
        )
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,8 @@ development = [
 ]
 proxy-pdf = ["pymupdf4llm>=0.0.10"]
 proxy-html = ["markdownify>=0.13.1"]
-proxy = ["pymupdf4llm>=0.0.10", "markdownify>=0.13.1"]
+proxy-markitdown = ["markitdown-no-magika[pdf]"]
 proxy = ["markitdown-no-magika[pdf]"]
 [project.scripts]
 markitect = "markitect.cli:main"