feat(proxy): add markitdown as default proxy backend
Uses markitdown-no-magika (lighter fork without magika/onnxruntime) to handle PDF, HTML, DOCX, PPTX, XLSX, XLS, CSV, JSON, and XML files. Specialized extractors (pymupdf4llm, markdownify) remain as fallbacks when markitdown is not installed. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2,6 +2,11 @@
|
|||||||
Built-in extractor registration.
|
Built-in extractor registration.
|
||||||
|
|
||||||
Importing this module registers all built-in extractors with the global registry.
|
Importing this module registers all built-in extractors with the global registry.
|
||||||
|
|
||||||
|
Registration order matters: specialized extractors are registered first, then
|
||||||
|
markitdown (if available) overwrites the overlapping extensions so it becomes
|
||||||
|
the default backend. If markitdown is not installed, the specialized extractors
|
||||||
|
remain active for their extensions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from markitect.proxy.registry import registry
|
from markitect.proxy.registry import registry
|
||||||
@@ -9,6 +14,17 @@ from markitect.proxy.extractors.pdf import PdfExtractor
|
|||||||
from markitect.proxy.extractors.html import HtmlExtractor
|
from markitect.proxy.extractors.html import HtmlExtractor
|
||||||
from markitect.proxy.extractors.markdown import MarkdownNormalizer
|
from markitect.proxy.extractors.markdown import MarkdownNormalizer
|
||||||
|
|
||||||
|
# 1. Specialized extractors (baseline)
|
||||||
registry.register(PdfExtractor())
|
registry.register(PdfExtractor())
|
||||||
registry.register(HtmlExtractor())
|
registry.register(HtmlExtractor())
|
||||||
registry.register(MarkdownNormalizer())
|
registry.register(MarkdownNormalizer())
|
||||||
|
|
||||||
|
# 2. Markitdown as default backend — overrides .pdf, .html, .htm and adds
|
||||||
|
# new types (.docx, .pptx, .xlsx, .xls, .csv, .json, .xml)
|
||||||
|
# Optional backend: only the import itself is guarded. Keeping instantiation
# and registration outside the ``try`` body means an ImportError raised by
# those steps is reported instead of being silently discarded.
try:
    from markitect.proxy.extractors.markitdown_ext import MarkitdownExtractor
except ImportError:
    # The extractor module could not be imported; whatever was registered
    # before this point stays in place.
    pass
else:
    _ext = MarkitdownExtractor()
    # Register only when the extractor reports its dependency as available.
    if _ext.check_dependencies():
        registry.register(_ext)
|
||||||
|
|||||||
75
markitect/proxy/extractors/markitdown_ext.py
Normal file
75
markitect/proxy/extractors/markitdown_ext.py
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
"""
|
||||||
|
Markitdown extractor — uses Microsoft's markitdown package for broad file-type support.
|
||||||
|
|
||||||
|
Handles PDF, HTML, DOCX, PPTX, XLSX, XLS, CSV, JSON, and XML files.
|
||||||
|
|
||||||
|
Supports both the official ``markitdown`` package and the lighter
|
||||||
|
``markitdown-no-magika`` fork (which avoids the heavy magika/onnxruntime
|
||||||
|
dependency chain). The no-magika variant is preferred as the default.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from markitect.proxy.extractors.base import BaseExtractor
|
||||||
|
from markitect.proxy.models import ExtractionResult
|
||||||
|
from markitect.proxy.exceptions import DependencyMissingError
|
||||||
|
|
||||||
|
|
||||||
|
def _import_markitdown():
    """Locate the ``MarkItDown`` class in whichever package is importable.

    The ``markitdown_no_magika`` module is tried first; ``markitdown`` is
    only tried when that first import fails.

    Returns:
        The ``MarkItDown`` class from the first module that imports
        successfully, or ``None`` when both imports raise ``ImportError``.
    """
    try:
        from markitdown_no_magika import MarkItDown as converter_cls
    except ImportError:
        # Preferred module unavailable: fall back to the other package.
        try:
            from markitdown import MarkItDown as converter_cls
        except ImportError:
            converter_cls = None
    return converter_cls
|
||||||
|
|
||||||
|
|
||||||
|
class MarkitdownExtractor(BaseExtractor):
    """Extractor that hands conversion to a ``MarkItDown`` converter.

    The converter class is resolved through ``_import_markitdown()`` each
    time it is needed, so this class can be defined even when no markitdown
    package is importable.
    """

    name = "markitdown"
    version = "1.0"
    # File suffixes this extractor declares.
    extensions = (
        ".pdf",
        ".html", ".htm",
        ".docx", ".pptx",
        ".xlsx", ".xls",
        ".csv", ".json", ".xml",
    )

    def check_dependencies(self) -> bool:
        """Return ``True`` when a ``MarkItDown`` class can be imported."""
        converter_cls = _import_markitdown()
        return converter_cls is not None

    def dependency_hint(self) -> str:
        """Return the install instruction used when the dependency is missing."""
        return 'pip install "markitect[proxy-markitdown]" (or: pip install markitdown-no-magika)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the file at ``source_path`` and wrap the resulting text.

        Args:
            source_path: Location of the file; it is passed to the converter
                as a string.

        Returns:
            An ``ExtractionResult`` whose content is the converter result's
            ``text_content``, tagged with this extractor's name and version.

        Raises:
            DependencyMissingError: If no ``MarkItDown`` class can be imported.
        """
        converter_cls = _import_markitdown()
        if converter_cls is not None:
            converted = converter_cls().convert(str(source_path))
            return ExtractionResult(
                content=converted.text_content,
                extractor=self.name,
                extractor_version=self.version,
            )

        raise DependencyMissingError(
            "markitdown is required to extract this file type.",
            package="markitdown-no-magika",
            install_hint=self.dependency_hint(),
        )
|
||||||
@@ -34,7 +34,8 @@ development = [
|
|||||||
]
|
]
|
||||||
proxy-pdf = ["pymupdf4llm>=0.0.10"]
|
proxy-pdf = ["pymupdf4llm>=0.0.10"]
|
||||||
proxy-html = ["markdownify>=0.13.1"]
|
proxy-html = ["markdownify>=0.13.1"]
|
||||||
proxy = ["pymupdf4llm>=0.0.10", "markdownify>=0.13.1"]
|
proxy-markitdown = ["markitdown-no-magika[pdf]"]
|
||||||
|
proxy = ["markitdown-no-magika[pdf]"]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
markitect = "markitect.cli:main"
|
markitect = "markitect.cli:main"
|
||||||
|
|||||||
Reference in New Issue
Block a user