markitect-main/markitect/proxy/extractors/__init__.py

"""
Built-in extractor registration.

Importing this module registers all built-in extractors with the global registry.

Markitdown is registered last and unconditionally — it overrides the specialized
extractors for overlapping extensions (.pdf, .html, .htm) and adds new types.
If markitdown is not installed, it gives a clear install hint at extraction time
(same pattern as PdfExtractor / HtmlExtractor).
"""

from markitect.proxy.registry import registry
from markitect.proxy.extractors.pdf import PdfExtractor
from markitect.proxy.extractors.html import HtmlExtractor
from markitect.proxy.extractors.markdown import MarkdownNormalizer
from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor

# Registration order is significant and is encoded by the tuple below:
#   * the specialized extractors come first (baseline — they remain available
#     as explicit fallbacks);
#   * Markitdown comes last as the default backend, so it overrides
#     .pdf, .html, .htm and adds new types
#     (.docx, .pptx, .xlsx, .xls, .csv, .json, .xml).
for _extractor_cls in (
    PdfExtractor,
    HtmlExtractor,
    MarkdownNormalizer,
    MarkitdownExtractor,  # keep last
):
    # Construct each extractor right before registering it, one at a time.
    registry.register(_extractor_cls())

# Do not leave the loop variable behind in the package namespace.
del _extractor_cls