feat(proxy): add markitdown as default proxy backend
Uses markitdown-no-magika (lighter fork without magika/onnxruntime) to handle PDF, HTML, DOCX, PPTX, XLSX, XLS, CSV, JSON, and XML files. Specialized extractors (pymupdf4llm, markdownify) remain as fallbacks when markitdown is not installed. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2,6 +2,11 @@
|
||||
Built-in extractor registration.
|
||||
|
||||
Importing this module registers all built-in extractors with the global registry.
|
||||
|
||||
Registration order matters: specialized extractors are registered first, then
|
||||
markitdown (if available) overwrites the overlapping extensions so it becomes
|
||||
the default backend. If markitdown is not installed, the specialized extractors
|
||||
remain active for their extensions.
|
||||
"""
|
||||
|
||||
from markitect.proxy.registry import registry
|
||||
@@ -9,6 +14,17 @@ from markitect.proxy.extractors.pdf import PdfExtractor
|
||||
from markitect.proxy.extractors.html import HtmlExtractor
|
||||
from markitect.proxy.extractors.markdown import MarkdownNormalizer
|
||||
|
||||
# 1. Specialized extractors (baseline).
#    Registered first so they stay active for their extensions whenever the
#    markitdown backend below is not registered.
registry.register(PdfExtractor())
registry.register(HtmlExtractor())
registry.register(MarkdownNormalizer())

# 2. Markitdown as default backend — overrides .pdf, .html, .htm and adds
#    new types (.docx, .pptx, .xlsx, .xls, .csv, .json, .xml).
#    Only the import itself is guarded: an ImportError raised later (while
#    constructing the extractor, probing its dependencies, or registering it)
#    would indicate a real defect and must not be silently discarded.
try:
    from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor
except ImportError:
    # Extractor module unavailable; keep the specialized extractors above.
    pass
else:
    _ext = MarkitdownExtractor()
    # Register only when a markitdown package is actually importable, so the
    # baseline extractors are not replaced by a backend that cannot run.
    if _ext.check_dependencies():
        registry.register(_ext)
|
||||
|
||||
75
markitect/proxy/extractors/markitdown_ext.py
Normal file
75
markitect/proxy/extractors/markitdown_ext.py
Normal file
@@ -0,0 +1,75 @@
|
||||
"""
|
||||
Markitdown extractor — uses Microsoft's markitdown package for broad file-type support.
|
||||
|
||||
Handles PDF, HTML, DOCX, PPTX, XLSX, XLS, CSV, JSON, and XML files.
|
||||
|
||||
Supports both the official ``markitdown`` package and the lighter
|
||||
``markitdown-no-magika`` fork (which avoids the heavy magika/onnxruntime
|
||||
dependency chain). The no-magika variant is preferred as the default.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from markitect.proxy.extractors.base import BaseExtractor
|
||||
from markitect.proxy.models import ExtractionResult
|
||||
from markitect.proxy.exceptions import DependencyMissingError
|
||||
|
||||
|
||||
def _import_markitdown():
|
||||
"""Import MarkItDown from whichever package is installed.
|
||||
|
||||
Tries ``markitdown_no_magika`` first (lighter), then ``markitdown``.
|
||||
Returns the MarkItDown class or None.
|
||||
"""
|
||||
try:
|
||||
from markitdown_no_magika import MarkItDown
|
||||
return MarkItDown
|
||||
except ImportError:
|
||||
pass
|
||||
try:
|
||||
from markitdown import MarkItDown
|
||||
return MarkItDown
|
||||
except ImportError:
|
||||
return None
|
||||
|
||||
|
||||
class MarkitdownExtractor(BaseExtractor):
    """Extractor that delegates Markdown conversion to Microsoft markitdown."""

    name = "markitdown"
    version = "1.0"
    # File suffixes this extractor declares support for.
    extensions = (
        ".pdf",
        ".html",
        ".htm",
        ".docx",
        ".pptx",
        ".xlsx",
        ".xls",
        ".csv",
        ".json",
        ".xml",
    )

    def check_dependencies(self) -> bool:
        """Return ``True`` when a ``MarkItDown`` class can be imported."""
        converter_cls = _import_markitdown()
        return converter_cls is not None

    def dependency_hint(self) -> str:
        """Return the install instructions reported for a missing dependency."""
        return 'pip install "markitect[proxy-markitdown]" (or: pip install markitdown-no-magika)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the file at ``source_path`` and wrap the resulting text.

        Args:
            source_path: Location of the file to convert; passed to the
                converter as a string.

        Returns:
            An ``ExtractionResult`` carrying the converted text along with
            this extractor's ``name`` and ``version``.

        Raises:
            DependencyMissingError: If no ``MarkItDown`` class is importable.
        """
        converter_cls = _import_markitdown()
        if converter_cls is None:
            raise DependencyMissingError(
                "markitdown is required to extract this file type.",
                package="markitdown-no-magika",
                install_hint=self.dependency_hint(),
            )

        # A fresh converter is built on every call, as in the original.
        converted = converter_cls().convert(str(source_path))
        return ExtractionResult(
            content=converted.text_content,
            extractor=self.name,
            extractor_version=self.version,
        )
|
||||
@@ -34,7 +34,8 @@ development = [
|
||||
]
|
||||
proxy-pdf = ["pymupdf4llm>=0.0.10"]
|
||||
proxy-html = ["markdownify>=0.13.1"]
|
||||
proxy = ["pymupdf4llm>=0.0.10", "markdownify>=0.13.1"]
|
||||
proxy-markitdown = ["markitdown-no-magika[pdf]"]
|
||||
proxy = ["markitdown-no-magika[pdf]"]
|
||||
|
||||
[project.scripts]
|
||||
markitect = "markitect.cli:main"
|
||||
|
||||
Reference in New Issue
Block a user