markitect-main/markitect/proxy/extractors/__init__.py

"""
Built-in extractor registration.

Importing this module registers all built-in extractors with the global registry.

Registration order matters: specialized extractors are registered first, then
markitdown (if available) overwrites the overlapping extensions so it becomes
the default backend.  If markitdown is not installed, the specialized extractors
remain active for their extensions.
"""

import logging

from markitect.proxy.registry import registry
from markitect.proxy.extractors.pdf import PdfExtractor
from markitect.proxy.extractors.html import HtmlExtractor
from markitect.proxy.extractors.markdown import MarkdownNormalizer

# 1. Specialized extractors (baseline).
#    Each class is instantiated and registered in turn, in the listed order.
for _extractor_cls in (PdfExtractor, HtmlExtractor, MarkdownNormalizer):
    registry.register(_extractor_cls())
# Drop the loop variable so it does not linger in the package namespace.
del _extractor_cls

# 2. Markitdown as default backend — overrides .pdf, .html, .htm and adds
#    new types (.docx, .pptx, .xlsx, .xls, .csv, .json, .xml).
#    This step is best-effort: an ImportError raised anywhere inside it, or a
#    falsy check_dependencies() result, leaves the registrations from step 1
#    untouched.  The reason is logged at DEBUG level rather than silently
#    discarded, so a missing or broken optional install can be diagnosed.
try:
    from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor
    _ext = MarkitdownExtractor()
    if _ext.check_dependencies():
        registry.register(_ext)
    else:
        logging.getLogger(__name__).debug(
            "MarkitdownExtractor.check_dependencies() returned a falsy "
            "value; extractor not registered"
        )
except ImportError as _exc:
    # The bound exception name is cleared automatically when this clause
    # exits, so it does not remain in the package namespace.
    logging.getLogger(__name__).debug(
        "MarkitdownExtractor not registered (ImportError): %s", _exc
    )