Patch summary: register MarkitdownExtractor unconditionally.

In markitect/proxy/extractors/__init__.py this hunk
  * rewrites the module docstring to describe the new registration behaviour,
  * moves the MarkitdownExtractor import to the top-level import group,
  * rewords the "# 1." comment, and
  * replaces the try / except ImportError block (which registered the extractor
    only when check_dependencies() returned true) with a single unconditional
    registry.register(MarkitdownExtractor()) call.

NOTE(review): the import of markitdown_ext is no longer guarded, so importing
this package now requires that module to import cleanly even when markitdown is
not installed. The new docstring says the install hint is raised at extraction
time; confirm against markitdown_ext, which this patch does not show.

NOTE(review): the patch arrived collapsed onto one line. Line breaks below were
restored from the diff markers and agree with the hunk header (28 old lines,
23 new lines); the position of blank context lines is inferred, so verify
against the target file before applying.

diff --git a/markitect/proxy/extractors/__init__.py b/markitect/proxy/extractors/__init__.py
index 3bc4af3d..cfccc3be 100644
--- a/markitect/proxy/extractors/__init__.py
+++ b/markitect/proxy/extractors/__init__.py
@@ -3,28 +3,23 @@ Built-in extractor registration.
 
 Importing this module registers all built-in extractors with the global registry.
 
-Registration order matters: specialized extractors are registered first, then
-markitdown (if available) overwrites the overlapping extensions so it becomes
-the default backend. If markitdown is not installed, the specialized extractors
-remain active for their extensions.
+Markitdown is registered last and unconditionally — it overrides the specialized
+extractors for overlapping extensions (.pdf, .html, .htm) and adds new types.
+If markitdown is not installed, it gives a clear install hint at extraction time
+(same pattern as PdfExtractor / HtmlExtractor).
 """
 
 from markitect.proxy.registry import registry
 from markitect.proxy.extractors.pdf import PdfExtractor
 from markitect.proxy.extractors.html import HtmlExtractor
 from markitect.proxy.extractors.markdown import MarkdownNormalizer
+from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor
 
-# 1. Specialized extractors (baseline)
+# 1. Specialized extractors (baseline — available as explicit fallbacks)
 registry.register(PdfExtractor())
 registry.register(HtmlExtractor())
 registry.register(MarkdownNormalizer())
 
 # 2. Markitdown as default backend — overrides .pdf, .html, .htm and adds
 # new types (.docx, .pptx, .xlsx, .xls, .csv, .json, .xml)
-try:
-    from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor
-    _ext = MarkitdownExtractor()
-    if _ext.check_dependencies():
-        registry.register(_ext)
-except ImportError:
-    pass
+registry.register(MarkitdownExtractor())