feat(proxy): add markitdown as default proxy backend

Uses markitdown-no-magika (a lighter fork without magika/onnxruntime) to handle PDF, HTML, DOCX, PPTX, XLSX, XLS, CSV, JSON, and XML files. Specialized extractors (pymupdf4llm, markdownify) remain as fallbacks when markitdown is not installed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 20:48:47 +01:00
parent ac334c679d
commit e4fbba8a57
3 changed files with 93 additions and 1 deletions
--- a/markitect/proxy/extractors/__init__.py
+++ b/markitect/proxy/extractors/__init__.py
@@ -2,6 +2,11 @@
 Built-in extractor registration.

 Importing this module registers all built-in extractors with the global registry.
+
+Registration order matters: specialized extractors are registered first, then
+markitdown (if available) overwrites the overlapping extensions so it becomes
+the default backend.  If markitdown is not installed, the specialized extractors
+remain active for their extensions.
 """

 from markitect.proxy.registry import registry
@@ -9,6 +14,17 @@ from markitect.proxy.extractors.pdf import PdfExtractor
 from markitect.proxy.extractors.html import HtmlExtractor
 from markitect.proxy.extractors.markdown import MarkdownNormalizer

+# 1. Specialized extractors (baseline)
 registry.register(PdfExtractor())
 registry.register(HtmlExtractor())
 registry.register(MarkdownNormalizer())
+
+# 2. Markitdown as default backend — overrides .pdf, .html, .htm and adds
+#    new types (.docx, .pptx, .xlsx, .xls, .csv, .json, .xml)
+try:
+    from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor
+    _ext = MarkitdownExtractor()
+    if _ext.check_dependencies():
+        registry.register(_ext)
+except ImportError:
+    pass