From e4fbba8a57a77bfc8419fe746140405780265506 Mon Sep 17 00:00:00 2001
From: tegwick
Date: Fri, 13 Feb 2026 20:48:47 +0100
Subject: [PATCH] feat(proxy): add markitdown as default proxy backend

Uses markitdown-no-magika (lighter fork without magika/onnxruntime) to
handle PDF, HTML, DOCX, PPTX, XLSX, XLS, CSV, JSON, and XML files.
Specialized extractors (pymupdf4llm, markdownify) remain as fallbacks
when markitdown is not installed.

Co-Authored-By: Claude Opus 4.6
---
Review notes (commentary only, not part of the commit):

* Line structure and Python indentation of this patch were restored from
  a copy whose newlines had been collapsed. The result was checked
  against the hunk headers and the diffstat (5 + 11 + 75 + 2 added
  lines, 1 removed line).
* The author and co-author lines carry no e-mail address in this copy.
  NOTE(review): git am presumably needs one -- confirm and restore
  before applying as a mail.
* extractors/__init__.py wraps a first-party import in
  "except ImportError: pass". markitdown_ext.py imports the third-party
  package lazily inside _import_markitdown(), so a missing markitdown is
  reported through check_dependencies(); the except clause therefore
  only fires when a markitect module itself fails to import, and that
  failure is silenced.
* The "proxy" extra now lists only markitdown-no-magika[pdf];
  pymupdf4llm and markdownify stay reachable through proxy-pdf and
  proxy-html.
* _import_markitdown() runs once in check_dependencies() and again on
  every extract() call.

 markitect/proxy/extractors/__init__.py       | 16 +++++
 markitect/proxy/extractors/markitdown_ext.py | 75 ++++++++++++++++++++
 pyproject.toml                               |  3 +-
 3 files changed, 93 insertions(+), 1 deletion(-)
 create mode 100644 markitect/proxy/extractors/markitdown_ext.py

diff --git a/markitect/proxy/extractors/__init__.py b/markitect/proxy/extractors/__init__.py
index c9e379af..3bc4af3d 100644
--- a/markitect/proxy/extractors/__init__.py
+++ b/markitect/proxy/extractors/__init__.py
@@ -2,6 +2,11 @@
 Built-in extractor registration.
 
 Importing this module registers all built-in extractors with the global registry.
+
+Registration order matters: specialized extractors are registered first, then
+markitdown (if available) overwrites the overlapping extensions so it becomes
+the default backend. If markitdown is not installed, the specialized extractors
+remain active for their extensions.
 """
 
 from markitect.proxy.registry import registry
@@ -9,6 +14,17 @@ from markitect.proxy.extractors.pdf import PdfExtractor
 from markitect.proxy.extractors.html import HtmlExtractor
 from markitect.proxy.extractors.markdown import MarkdownNormalizer
 
+# 1. Specialized extractors (baseline)
 registry.register(PdfExtractor())
 registry.register(HtmlExtractor())
 registry.register(MarkdownNormalizer())
+
+# 2. Markitdown as default backend — overrides .pdf, .html, .htm and adds
+# new types (.docx, .pptx, .xlsx, .xls, .csv, .json, .xml)
+try:
+    from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor
+    _ext = MarkitdownExtractor()
+    if _ext.check_dependencies():
+        registry.register(_ext)
+except ImportError:
+    pass
diff --git a/markitect/proxy/extractors/markitdown_ext.py b/markitect/proxy/extractors/markitdown_ext.py
new file mode 100644
index 00000000..f2e2de93
--- /dev/null
+++ b/markitect/proxy/extractors/markitdown_ext.py
@@ -0,0 +1,75 @@
+"""
+Markitdown extractor — uses Microsoft's markitdown package for broad file-type support.
+
+Handles PDF, HTML, DOCX, PPTX, XLSX, XLS, CSV, JSON, and XML files.
+
+Supports both the official ``markitdown`` package and the lighter
+``markitdown-no-magika`` fork (which avoids the heavy magika/onnxruntime
+dependency chain). The no-magika variant is preferred as the default.
+"""
+
+from pathlib import Path
+
+from markitect.proxy.extractors.base import BaseExtractor
+from markitect.proxy.models import ExtractionResult
+from markitect.proxy.exceptions import DependencyMissingError
+
+
+def _import_markitdown():
+    """Import MarkItDown from whichever package is installed.
+
+    Tries ``markitdown_no_magika`` first (lighter), then ``markitdown``.
+    Returns the MarkItDown class or None.
+    """
+    try:
+        from markitdown_no_magika import MarkItDown
+        return MarkItDown
+    except ImportError:
+        pass
+    try:
+        from markitdown import MarkItDown
+        return MarkItDown
+    except ImportError:
+        return None
+
+
+class MarkitdownExtractor(BaseExtractor):
+    """Converts many file types to Markdown via Microsoft markitdown."""
+
+    name = "markitdown"
+    version = "1.0"
+    extensions = (
+        ".pdf",
+        ".html",
+        ".htm",
+        ".docx",
+        ".pptx",
+        ".xlsx",
+        ".xls",
+        ".csv",
+        ".json",
+        ".xml",
+    )
+
+    def check_dependencies(self) -> bool:
+        return _import_markitdown() is not None
+
+    def dependency_hint(self) -> str:
+        return 'pip install "markitect[proxy-markitdown]" (or: pip install markitdown-no-magika)'
+
+    def extract(self, source_path: Path) -> ExtractionResult:
+        MarkItDown = _import_markitdown()
+        if MarkItDown is None:
+            raise DependencyMissingError(
+                "markitdown is required to extract this file type.",
+                package="markitdown-no-magika",
+                install_hint=self.dependency_hint(),
+            )
+
+        md = MarkItDown()
+        result = md.convert(str(source_path))
+        return ExtractionResult(
+            content=result.text_content,
+            extractor=self.name,
+            extractor_version=self.version,
+        )
diff --git a/pyproject.toml b/pyproject.toml
index 1bacd192..b13e989c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,8 @@ development = [
 ]
 proxy-pdf = ["pymupdf4llm>=0.0.10"]
 proxy-html = ["markdownify>=0.13.1"]
-proxy = ["pymupdf4llm>=0.0.10", "markdownify>=0.13.1"]
+proxy-markitdown = ["markitdown-no-magika[pdf]"]
+proxy = ["markitdown-no-magika[pdf]"]
 
 [project.scripts]
 markitect = "markitect.cli:main"