feat(proxy): add markitdown as default proxy backend

Uses markitdown-no-magika (lighter fork without magika/onnxruntime) to handle PDF, HTML, DOCX, PPTX, XLSX, XLS, CSV, JSON, and XML files. Specialized extractors (pymupdf4llm, markdownify) remain as fallbacks when markitdown is not installed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 20:48:47 +01:00
parent ac334c679d
commit e4fbba8a57
3 changed files with 93 additions and 1 deletion
--- a/markitect/proxy/extractors/__init__.py
+++ b/markitect/proxy/extractors/__init__.py
@@ -2,6 +2,11 @@
 Built-in extractor registration.

 Importing this module registers all built-in extractors with the global registry.
+
+Registration order matters: specialized extractors are registered first, then
+markitdown (if available) overwrites the overlapping extensions so it becomes
+the default backend.  If markitdown is not installed, the specialized extractors
+remain active for their extensions.
 """

 from markitect.proxy.registry import registry
@@ -9,6 +14,17 @@ from markitect.proxy.extractors.pdf import PdfExtractor
 from markitect.proxy.extractors.html import HtmlExtractor
 from markitect.proxy.extractors.markdown import MarkdownNormalizer

+# 1. Specialized extractors (baseline) — registered first so step 2 can override them
 registry.register(PdfExtractor())
 registry.register(HtmlExtractor())
 registry.register(MarkdownNormalizer())
+
+# 2. Markitdown as default backend — registered last so that it overrides
+#    .pdf, .html, .htm and adds new types (.docx, .pptx, .xlsx, .xls, .csv, .json, .xml)
+try:
+    from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor
+    _ext = MarkitdownExtractor()
+    if _ext.check_dependencies():  # register only when a markitdown package is importable
+        registry.register(_ext)
+except ImportError:  # NOTE(review): also hides import errors raised inside markitdown_ext itself
+    pass
--- a/markitect/proxy/extractors/markitdown_ext.py
+++ b/markitect/proxy/extractors/markitdown_ext.py
@@ -0,0 +1,75 @@
+"""
+Markitdown extractor — uses Microsoft's markitdown package for broad file-type support.
+
+Handles PDF, HTML, DOCX, PPTX, XLSX, XLS, CSV, JSON, and XML files.
+
+Supports both the official ``markitdown`` package and the lighter
+``markitdown-no-magika`` fork (which avoids the heavy magika/onnxruntime
+dependency chain).  The no-magika variant is preferred as the default.
+"""
+
+from pathlib import Path
+
+from markitect.proxy.extractors.base import BaseExtractor
+from markitect.proxy.models import ExtractionResult
+from markitect.proxy.exceptions import DependencyMissingError
+
+
+def _import_markitdown():
+    """Return the ``MarkItDown`` class from whichever package is installed.
+
+    Tries ``markitdown_no_magika`` first (lighter), then falls back to
+    ``markitdown``.  Returns ``None`` when neither module can be imported.
+    """
+    try:
+        from markitdown_no_magika import MarkItDown
+        return MarkItDown
+    except ImportError:
+        pass  # preferred fork is unavailable; try the official package next
+    try:
+        from markitdown import MarkItDown
+        return MarkItDown
+    except ImportError:
+        return None  # neither module is importable; callers must handle None
+
+
+class MarkitdownExtractor(BaseExtractor):
+    """Extractor that converts the file types in ``extensions`` to Markdown via Microsoft markitdown."""
+
+    name = "markitdown"  # reported as ``extractor`` in the ExtractionResult built by extract()
+    version = "1.0"  # reported as ``extractor_version`` in the ExtractionResult built by extract()
+    extensions = (  # file suffixes claimed; presumably what the registry keys on — confirm
+        ".pdf",
+        ".html",
+        ".htm",
+        ".docx",
+        ".pptx",
+        ".xlsx",
+        ".xls",
+        ".csv",
+        ".json",
+        ".xml",
+    )
+
+    def check_dependencies(self) -> bool:  # True when either markitdown module can be imported
+        return _import_markitdown() is not None
+
+    def dependency_hint(self) -> str:  # install instructions; passed as install_hint in extract()
+        return 'pip install "markitect[proxy-markitdown]"  (or: pip install markitdown-no-magika)'
+
+    def extract(self, source_path: Path) -> ExtractionResult:  # convert one file to Markdown
+        MarkItDown = _import_markitdown()  # resolved on every call, not at module import time
+        if MarkItDown is None:  # no markitdown package installed: raise with an install hint
+            raise DependencyMissingError(
+                "markitdown is required to extract this file type.",
+                package="markitdown-no-magika",
+                install_hint=self.dependency_hint(),
+            )
+
+        md = MarkItDown()  # a new converter instance is created for each call
+        result = md.convert(str(source_path))  # the path is handed over as a plain string
+        return ExtractionResult(
+            content=result.text_content,
+            extractor=self.name,
+            extractor_version=self.version,
+        )
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,8 @@ development = [
 ]
 proxy-pdf = ["pymupdf4llm>=0.0.10"]
 proxy-html = ["markdownify>=0.13.1"]
-proxy = ["pymupdf4llm>=0.0.10", "markdownify>=0.13.1"]
+proxy-markitdown = ["markitdown-no-magika[pdf]"]
+proxy = ["markitdown-no-magika[pdf]"]

 [project.scripts]
 markitect = "markitect.cli:main"