Introduces a new `markitect/proxy/` module with pluggable extractors that convert non-markdown sources (PDF, HTML) into tracked markdown proxy files. Proxy files preserve origin metadata (path, checksum, timestamp) so they can be kept in sync when the original changes. CLI commands: `proxy create`, `proxy update`, `proxy status`, `proxy extractors`. Built-in extractors: PDF (pymupdf4llm), HTML (markdownify), Markdown (built-in). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
46 lines
1.4 KiB
Python
46 lines
1.4 KiB
Python
"""
|
|
HTML extractor using markdownify.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
|
|
from markitect.proxy.extractors.base import BaseExtractor
|
|
from markitect.proxy.models import ExtractionResult
|
|
from markitect.proxy.exceptions import DependencyMissingError
|
|
|
|
|
|
class HtmlExtractor(BaseExtractor):
    """Converts HTML files to Markdown via markdownify.

    ``markdownify`` is an optional dependency, so it is imported lazily
    inside the methods that need it instead of at module import time.
    """

    # Identifier copied into ``ExtractionResult.extractor`` by ``extract``.
    name = "html"
    # Copied into ``ExtractionResult.extractor_version`` by ``extract``.
    version = "1.0"
    # File suffixes declared by this extractor.
    extensions = (".html", ".htm")

    def check_dependencies(self) -> bool:
        """Return True if ``markdownify`` can be imported, False otherwise."""
        try:
            import markdownify  # noqa: F401
        except ImportError:
            return False
        return True

    def dependency_hint(self) -> str:
        """Return the install instructions used when ``markdownify`` is missing."""
        return 'pip install "markitect[proxy-html]" (or: pip install markdownify)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the HTML file at ``source_path`` to Markdown.

        Args:
            source_path: Path of the HTML file to read. The file is decoded
                as UTF-8; a leading byte-order mark, if present, is dropped.

        Returns:
            An ``ExtractionResult`` holding the Markdown text together with
            this extractor's ``name`` and ``version``.

        Raises:
            DependencyMissingError: If ``markdownify`` is not installed.
            OSError: If the file cannot be opened or read.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        if not self.check_dependencies():
            raise DependencyMissingError(
                "markdownify is required to extract HTML files.",
                package="markdownify",
                install_hint=self.dependency_hint(),
            )

        import markdownify

        # "utf-8-sig" decodes exactly like "utf-8" but strips a leading BOM,
        # which would otherwise end up as a stray U+FEFF at the start of the
        # generated Markdown.
        html_content = source_path.read_text(encoding="utf-8-sig")
        # ATX headings are the "# Title" form rather than underlined headings.
        md_text = markdownify.markdownify(html_content, heading_style="ATX")
        return ExtractionResult(
            content=md_text,
            extractor=self.name,
            extractor_version=self.version,
        )
|