feat(proxy): add markitdown as default proxy backend

Uses markitdown-no-magika (lighter fork without magika/onnxruntime) to
handle PDF, HTML, DOCX, PPTX, XLSX, XLS, CSV, JSON, and XML files.
Specialized extractors (pymupdf4llm, markdownify) remain as fallbacks
when markitdown is not installed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-13 20:48:47 +01:00
parent ac334c679d
commit e4fbba8a57
3 changed files with 93 additions and 1 deletions

View File

@@ -2,6 +2,11 @@
Built-in extractor registration.
Importing this module registers all built-in extractors with the global registry.
Registration order matters: specialized extractors are registered first, then
markitdown (if available) overwrites the overlapping extensions so it becomes
the default backend. If markitdown is not installed, the specialized extractors
remain active for their extensions.
"""
from markitect.proxy.registry import registry
@@ -9,6 +14,17 @@ from markitect.proxy.extractors.pdf import PdfExtractor
from markitect.proxy.extractors.html import HtmlExtractor
from markitect.proxy.extractors.markdown import MarkdownNormalizer
# 1. Specialized extractors (baseline). They are registered first so they stay
#    in charge of their extensions whenever markitdown cannot be used.
registry.register(PdfExtractor())
registry.register(HtmlExtractor())
registry.register(MarkdownNormalizer())

# 2. Markitdown as default backend — registered last so that it overrides
#    .pdf, .html, .htm and adds new types (.docx, .pptx, .xlsx, .xls, .csv,
#    .json, .xml).
try:
    from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor
except ImportError:
    # Best-effort: the optional extractor module is not importable, so the
    # specialized extractors registered above remain active.
    pass
else:
    # Kept outside the ``try`` on purpose: only a failed import means
    # "markitdown backend unavailable". An ImportError raised while
    # instantiating or registering the extractor is a real bug and must not
    # be swallowed silently.
    _ext = MarkitdownExtractor()
    if _ext.check_dependencies():
        registry.register(_ext)

View File

@@ -0,0 +1,75 @@
"""
Markitdown extractor — uses Microsoft's markitdown package for broad file-type support.
Handles PDF, HTML, DOCX, PPTX, XLSX, XLS, CSV, JSON, and XML files.
Supports both the official ``markitdown`` package and the lighter
``markitdown-no-magika`` fork (which avoids the heavy magika/onnxruntime
dependency chain). The no-magika variant is preferred as the default.
"""
from pathlib import Path
from markitect.proxy.extractors.base import BaseExtractor
from markitect.proxy.models import ExtractionResult
from markitect.proxy.exceptions import DependencyMissingError
def _import_markitdown():
    """Locate the ``MarkItDown`` class in whichever distribution is installed.

    The lighter ``markitdown_no_magika`` package is tried first; the official
    ``markitdown`` package is the fallback.

    Returns:
        The ``MarkItDown`` class, or ``None`` when neither package can be
        imported.
    """
    try:
        from markitdown_no_magika import MarkItDown as converter_cls
    except ImportError:
        # Preferred fork is missing — fall back to the official package.
        try:
            from markitdown import MarkItDown as converter_cls
        except ImportError:
            return None
    return converter_cls
class MarkitdownExtractor(BaseExtractor):
    """Extractor that delegates conversion to Markdown to Microsoft markitdown.

    The ``MarkItDown`` class is looked up lazily through
    ``_import_markitdown()`` on every call, so this class can be defined and
    instantiated even when no markitdown distribution is installed.
    """

    # Identity reported in every ExtractionResult produced by this extractor.
    name = "markitdown"
    version = "1.0"

    # File suffixes this extractor declares.
    extensions = (
        ".pdf", ".html", ".htm",
        ".docx", ".pptx",
        ".xlsx", ".xls",
        ".csv", ".json", ".xml",
    )

    def check_dependencies(self) -> bool:
        """Return True when a ``MarkItDown`` implementation can be imported."""
        converter_cls = _import_markitdown()
        return converter_cls is not None

    def dependency_hint(self) -> str:
        """Return the install command shown when markitdown is missing."""
        return 'pip install "markitect[proxy-markitdown]" (or: pip install markitdown-no-magika)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the file at *source_path* to Markdown.

        Args:
            source_path: Location of the file; passed to markitdown as a
                string.

        Returns:
            An ExtractionResult whose content is the converter's
            ``text_content``, tagged with this extractor's name and version.

        Raises:
            DependencyMissingError: If neither markitdown package can be
                imported.
        """
        converter_cls = _import_markitdown()
        if converter_cls is not None:
            converted = converter_cls().convert(str(source_path))
            return ExtractionResult(
                content=converted.text_content,
                extractor=self.name,
                extractor_version=self.version,
            )

        # No backend importable: report how to install one.
        raise DependencyMissingError(
            "markitdown is required to extract this file type.",
            package="markitdown-no-magika",
            install_hint=self.dependency_hint(),
        )

View File

@@ -34,7 +34,8 @@ development = [
]
proxy-pdf = ["pymupdf4llm>=0.0.10"]
proxy-html = ["markdownify>=0.13.1"]
proxy = ["pymupdf4llm>=0.0.10", "markdownify>=0.13.1"]
proxy-markitdown = ["markitdown-no-magika[pdf]"]
proxy = ["markitdown-no-magika[pdf]"]
[project.scripts]
markitect = "markitect.cli:main"