From 9fa239c140f3784639b4dff80252ce328503e1f8 Mon Sep 17 00:00:00 2001
From: tegwick <bernd.worsch@gmail.com>
Date: Fri, 13 Feb 2026 20:52:07 +0100
Subject: [PATCH] fix(proxy): register markitdown extractor unconditionally

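Always register MarkitdownExtractor so it overrides the specialized
extractors for every extension it handles (.pdf, .html, .htm included).
When markitdown-no-magika is not installed, users now see the markitdown
install hint instead of the old pymupdf4llm message. The specialized
extractors no longer take over automatically in that case.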

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 markitect/proxy/extractors/__init__.py | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/markitect/proxy/extractors/__init__.py b/markitect/proxy/extractors/__init__.py
index 3bc4af3d..cfccc3be 100644
--- a/markitect/proxy/extractors/__init__.py
+++ b/markitect/proxy/extractors/__init__.py
@@ -3,28 +3,23 @@ Built-in extractor registration.
 
 Importing this module registers all built-in extractors with the global registry.
 
-Registration order matters: specialized extractors are registered first, then
-markitdown (if available) overwrites the overlapping extensions so it becomes
-the default backend.  If markitdown is not installed, the specialized extractors
-remain active for their extensions.
+Markitdown is registered last and unconditionally, so it overrides the
+specialized extractors for overlapping extensions (.pdf, .html, .htm) and adds
+new types.  If markitdown-no-magika is not installed, it still registers and
+gives an install hint at extraction time (as PdfExtractor / HtmlExtractor do).
 """
 
 from markitect.proxy.registry import registry
 from markitect.proxy.extractors.pdf import PdfExtractor
 from markitect.proxy.extractors.html import HtmlExtractor
 from markitect.proxy.extractors.markdown import MarkdownNormalizer
+from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor
 
-# 1. Specialized extractors (baseline)
+# 1. Specialized extractors (baseline — available as explicit fallbacks)
 registry.register(PdfExtractor())
 registry.register(HtmlExtractor())
 registry.register(MarkdownNormalizer())
 
 # 2. Markitdown as default backend — overrides .pdf, .html, .htm and adds
 #    new types (.docx, .pptx, .xlsx, .xls, .csv, .json, .xml)
-try:
-    from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor
-    _ext = MarkitdownExtractor()
-    if _ext.check_dependencies():
-        registry.register(_ext)
-except ImportError:
-    pass
+registry.register(MarkitdownExtractor())