fix(proxy): register markitdown extractor unconditionally
Always register MarkitdownExtractor so it overrides the specialized extractors for every extension it handles. When markitdown-no-magika is not installed, users now see the correct install hint instead of the old pymupdf4llm message.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,28 +3,23 @@ Built-in extractor registration.
|
|||||||
|
|
||||||
Importing this module registers all built-in extractors with the global registry.
|
Importing this module registers all built-in extractors with the global registry.
|
||||||
|
|
||||||
Registration order matters: specialized extractors are registered first, then
|
Markitdown is registered last and unconditionally — it overrides the specialized
|
||||||
markitdown (if available) overwrites the overlapping extensions so it becomes
|
extractors for overlapping extensions (.pdf, .html, .htm) and adds new types.
|
||||||
the default backend. If markitdown is not installed, the specialized extractors
|
If markitdown is not installed, it gives a clear install hint at extraction time
|
||||||
remain active for their extensions.
|
(same pattern as PdfExtractor / HtmlExtractor).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from markitect.proxy.registry import registry
|
from markitect.proxy.registry import registry
|
||||||
from markitect.proxy.extractors.pdf import PdfExtractor
|
from markitect.proxy.extractors.pdf import PdfExtractor
|
||||||
from markitect.proxy.extractors.html import HtmlExtractor
|
from markitect.proxy.extractors.html import HtmlExtractor
|
||||||
from markitect.proxy.extractors.markdown import MarkdownNormalizer
|
from markitect.proxy.extractors.markdown import MarkdownNormalizer
|
||||||
|
from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor
|
||||||
|
|
||||||
# 1. Specialized extractors (baseline)
|
# 1. Specialized extractors (baseline — available as explicit fallbacks)
|
||||||
registry.register(PdfExtractor())
|
registry.register(PdfExtractor())
|
||||||
registry.register(HtmlExtractor())
|
registry.register(HtmlExtractor())
|
||||||
registry.register(MarkdownNormalizer())
|
registry.register(MarkdownNormalizer())
|
||||||
|
|
||||||
# 2. Markitdown as default backend — overrides .pdf, .html, .htm and adds
|
# 2. Markitdown as default backend — overrides .pdf, .html, .htm and adds
|
||||||
# new types (.docx, .pptx, .xlsx, .xls, .csv, .json, .xml)
|
# new types (.docx, .pptx, .xlsx, .xls, .csv, .json, .xml)
|
||||||
try:
|
registry.register(MarkitdownExtractor())
|
||||||
from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor
|
|
||||||
_ext = MarkitdownExtractor()
|
|
||||||
if _ext.check_dependencies():
|
|
||||||
registry.register(_ext)
|
|
||||||
except ImportError:
|
|
||||||
pass
|
|
||||||
|
|||||||
Reference in New Issue
Block a user