""" HTML extractor using markdownify. """ from pathlib import Path from markitect.proxy.extractors.base import BaseExtractor from markitect.proxy.models import ExtractionResult from markitect.proxy.exceptions import DependencyMissingError class HtmlExtractor(BaseExtractor): """Converts HTML files to Markdown via markdownify.""" name = "html" version = "1.0" extensions = (".html", ".htm") def check_dependencies(self) -> bool: try: import markdownify # noqa: F401 return True except ImportError: return False def dependency_hint(self) -> str: return 'pip install "markitect[proxy-html]" (or: pip install markdownify)' def extract(self, source_path: Path) -> ExtractionResult: if not self.check_dependencies(): raise DependencyMissingError( "markdownify is required to extract HTML files.", package="markdownify", install_hint=self.dependency_hint(), ) import markdownify html_content = source_path.read_text(encoding="utf-8") md_text = markdownify.markdownify(html_content, heading_style="ATX") return ExtractionResult( content=md_text, extractor=self.name, extractor_version=self.version, )