From 9fa239c140f3784639b4dff80252ce328503e1f8 Mon Sep 17 00:00:00 2001 From: tegwick Date: Fri, 13 Feb 2026 20:52:07 +0100 Subject: [PATCH] fix(proxy): register markitdown extractor unconditionally Always register MarkitdownExtractor so it overrides specialized extractors for all its extensions. When markitdown-no-magika is not installed, users now see the correct install hint instead of the old pymupdf4llm message. Co-Authored-By: Claude Opus 4.6 --- markitect/proxy/extractors/__init__.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/markitect/proxy/extractors/__init__.py b/markitect/proxy/extractors/__init__.py index 3bc4af3d..cfccc3be 100644 --- a/markitect/proxy/extractors/__init__.py +++ b/markitect/proxy/extractors/__init__.py @@ -3,28 +3,23 @@ Built-in extractor registration. Importing this module registers all built-in extractors with the global registry. -Registration order matters: specialized extractors are registered first, then -markitdown (if available) overwrites the overlapping extensions so it becomes -the default backend. If markitdown is not installed, the specialized extractors -remain active for their extensions. +Markitdown is registered last and unconditionally — it overrides the specialized +extractors for overlapping extensions (.pdf, .html, .htm) and adds new types. +If markitdown is not installed, it gives a clear install hint at extraction time +(same pattern as PdfExtractor / HtmlExtractor). """ from markitect.proxy.registry import registry from markitect.proxy.extractors.pdf import PdfExtractor from markitect.proxy.extractors.html import HtmlExtractor from markitect.proxy.extractors.markdown import MarkdownNormalizer +from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor -# 1. Specialized extractors (baseline) +# 1. Specialized extractors (baseline — available as explicit fallbacks) registry.register(PdfExtractor()) registry.register(HtmlExtractor()) registry.register(MarkdownNormalizer()) # 2. Markitdown as default backend — overrides .pdf, .html, .htm and adds # new types (.docx, .pptx, .xlsx, .xls, .csv, .json, .xml) -try: - from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor - _ext = MarkitdownExtractor() - if _ext.check_dependencies(): - registry.register(_ext) -except ImportError: - pass +registry.register(MarkitdownExtractor())