Introduces a new `markitect/proxy/` module with pluggable extractors that convert non-markdown sources (PDF, HTML) into tracked markdown proxy files. Proxy files preserve origin metadata (path, checksum, timestamp) so they can be kept in sync when the original changes. CLI commands: `proxy create`, `proxy update`, `proxy status`, `proxy extractors`. Built-in extractors: PDF (pymupdf4llm), HTML (markdownify), Markdown (built-in). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
44 lines
1.1 KiB
Python
44 lines
1.1 KiB
Python
"""
|
|
Abstract base class for proxy file extractors.
|
|
"""
|
|
|
|
from abc import ABC, abstractmethod
|
|
from pathlib import Path
|
|
|
|
from markitect.proxy.models import ExtractionResult
|
|
|
|
|
|
class BaseExtractor(ABC):
    """Common interface that every proxy extractor has to provide.

    Subclasses must implement :meth:`extract` and
    :meth:`check_dependencies`. :meth:`dependency_hint` has a default
    implementation and may be overridden.
    """

    # Class-level metadata with default values.
    # NOTE(review): presumably each concrete extractor overrides these and
    # `extensions` lists the file suffixes it handles — confirm with callers.
    name: str = ""
    version: str = "1.0"
    extensions: tuple = ()

    @abstractmethod
    def extract(self, source_path: Path) -> ExtractionResult:
        """Turn the file at ``source_path`` into markdown.

        Args:
            source_path: Location of the source file to convert.

        Returns:
            An ExtractionResult carrying the extracted markdown content.
        """

    @abstractmethod
    def check_dependencies(self) -> bool:
        """Report whether everything this extractor needs is installed.

        Returns:
            True when all required dependencies are available,
            otherwise False.
        """

    def dependency_hint(self) -> str:
        """Return install instructions for missing dependencies.

        The base implementation returns an empty string.

        Returns:
            A human-readable string such as
            ``pip install markitect[proxy-pdf]``.
        """
        return ""
|