"""PDF extractor using pymupdf4llm."""

from pathlib import Path

from markitect.proxy.extractors.base import BaseExtractor
from markitect.proxy.models import ExtractionResult
from markitect.proxy.exceptions import DependencyMissingError


class PdfExtractor(BaseExtractor):
    """Extractor that turns a PDF file into markdown using pymupdf4llm.

    ``pymupdf4llm`` is imported lazily inside the methods, so importing this
    module does not itself require the package to be installed.
    """

    # Identification metadata for this extractor; ``name`` and ``version``
    # are copied into every ExtractionResult produced by ``extract``.
    name = "pdf"
    version = "1.0"
    # Presumably the file suffixes this extractor is registered for --
    # confirm against BaseExtractor.
    extensions = (".pdf",)

    def check_dependencies(self) -> bool:
        """Report whether ``pymupdf4llm`` can be imported.

        Returns:
            ``True`` when the import succeeds, ``False`` when it raises
            ``ImportError``.
        """
        try:
            import pymupdf4llm  # noqa: F401
        except ImportError:
            return False
        return True

    def dependency_hint(self) -> str:
        """Return the install instructions shown when the dependency is missing."""
        return 'pip install "markitect[proxy-pdf]" (or: pip install pymupdf4llm)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the PDF at ``source_path`` to markdown.

        Args:
            source_path: Path of the PDF file; it is handed to
                ``pymupdf4llm.to_markdown`` as a string.

        Returns:
            An ``ExtractionResult`` whose ``content`` is the generated
            markdown, tagged with this extractor's ``name`` and ``version``.

        Raises:
            DependencyMissingError: If ``check_dependencies()`` reports that
                ``pymupdf4llm`` is unavailable.
        """
        dependency_ok = self.check_dependencies()
        if not dependency_ok:
            raise DependencyMissingError(
                "pymupdf4llm is required to extract PDF files.",
                package="pymupdf4llm",
                install_hint=self.dependency_hint(),
            )

        # Deferred import: only reached once the dependency check has passed.
        import pymupdf4llm

        pdf_location = str(source_path)
        markdown = pymupdf4llm.to_markdown(pdf_location)
        return ExtractionResult(
            content=markdown,
            extractor=self.name,
            extractor_version=self.version,
        )