fix(proxy): register markitdown extractor unconditionally

Always register MarkitdownExtractor so it overrides specialized extractors for all its extensions. When markitdown-no-magika is not installed, users now see the correct install hint instead of the old pymupdf4llm message.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 20:52:07 +01:00
parent e4fbba8a57
commit 9fa239c140
1 changed file with 7 additions and 12 deletions
--- a/markitect/proxy/extractors/__init__.py
+++ b/markitect/proxy/extractors/__init__.py
@@ -3,28 +3,23 @@ Built-in extractor registration.

 Importing this module registers all built-in extractors with the global registry.

-Registration order matters: specialized extractors are registered first, then
-markitdown (if available) overwrites the overlapping extensions so it becomes
-the default backend.  If markitdown is not installed, the specialized extractors
-remain active for their extensions.
+Markitdown is registered last and unconditionally — it overrides the specialized
+extractors for overlapping extensions (.pdf, .html, .htm) and adds new types.
+If markitdown is not installed, it gives a clear install hint at extraction time
+(same pattern as PdfExtractor / HtmlExtractor).
 """

 from markitect.proxy.registry import registry
 from markitect.proxy.extractors.pdf import PdfExtractor
 from markitect.proxy.extractors.html import HtmlExtractor
 from markitect.proxy.extractors.markdown import MarkdownNormalizer
+from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor

-# 1. Specialized extractors (baseline)
+# 1. Specialized extractors (baseline — available as explicit fallbacks)
 registry.register(PdfExtractor())
 registry.register(HtmlExtractor())
 registry.register(MarkdownNormalizer())

 # 2. Markitdown as default backend — overrides .pdf, .html, .htm and adds
 #    new types (.docx, .pptx, .xlsx, .xls, .csv, .json, .xml)
-try:
-    from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor
-    _ext = MarkitdownExtractor()
-    if _ext.check_dependencies():
-        registry.register(_ext)
-except ImportError:
-    pass
+registry.register(MarkitdownExtractor())