feat(proxy): add proxy file system for non-markdown source conversion
Introduces a new `markitect/proxy/` module with pluggable extractors that convert non-markdown sources (PDF, HTML) into tracked markdown proxy files. Proxy files preserve origin metadata (path, checksum, timestamp) so they can be kept in sync when the original changes. CLI commands: `proxy create`, `proxy update`, `proxy status`, `proxy extractors`. Built-in extractors: PDF (pymupdf4llm), HTML (markdownify), Markdown (built-in). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
14
markitect/proxy/extractors/__init__.py
Normal file
14
markitect/proxy/extractors/__init__.py
Normal file
@@ -0,0 +1,14 @@
|
||||
"""
|
||||
Built-in extractor registration.
|
||||
|
||||
Importing this module registers all built-in extractors with the global registry.
|
||||
"""
|
||||
|
||||
from markitect.proxy.registry import registry
|
||||
from markitect.proxy.extractors.pdf import PdfExtractor
|
||||
from markitect.proxy.extractors.html import HtmlExtractor
|
||||
from markitect.proxy.extractors.markdown import MarkdownNormalizer
|
||||
|
||||
# Register one instance of every built-in extractor with the global registry.
# The tuple order matches the registration order: PDF, HTML, then Markdown.
for _extractor_cls in (PdfExtractor, HtmlExtractor, MarkdownNormalizer):
    registry.register(_extractor_cls())
del _extractor_cls  # keep the loop variable out of the module namespace
|
||||
43
markitect/proxy/extractors/base.py
Normal file
43
markitect/proxy/extractors/base.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""
|
||||
Abstract base class for proxy file extractors.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
|
||||
from markitect.proxy.models import ExtractionResult
|
||||
|
||||
|
||||
class BaseExtractor(ABC):
    """Interface every proxy extractor has to provide.

    Concrete extractors override the class attributes below and implement
    :meth:`extract` and :meth:`check_dependencies`. :meth:`dependency_hint`
    is optional and defaults to an empty string.
    """

    # Short identifier of the extractor; empty on the abstract base.
    name: str = ""
    # Version string of the extractor implementation.
    version: str = "1.0"
    # File suffixes associated with the extractor (e.g. ".pdf").
    # NOTE(review): presumably consulted by the registry to pick an
    # extractor for a given source file -- confirm against the registry.
    extensions: tuple = ()

    @abstractmethod
    def extract(self, source_path: Path) -> ExtractionResult:
        """Produce markdown from the file at ``source_path``.

        Args:
            source_path: Location of the source file to convert.

        Returns:
            An ExtractionResult carrying the extracted markdown content.
        """

    @abstractmethod
    def check_dependencies(self) -> bool:
        """Report whether the extractor's required dependencies are present.

        Returns:
            True when every dependency is installed, otherwise False.
        """

    def dependency_hint(self) -> str:
        """Describe how to install any missing dependencies.

        Returns:
            Human-readable install instructions, for example
            ``pip install markitect[proxy-pdf]``. The base implementation
            returns an empty string.
        """
        return ""
|
||||
45
markitect/proxy/extractors/html.py
Normal file
45
markitect/proxy/extractors/html.py
Normal file
@@ -0,0 +1,45 @@
|
||||
"""
|
||||
HTML extractor using markdownify.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from markitect.proxy.extractors.base import BaseExtractor
|
||||
from markitect.proxy.models import ExtractionResult
|
||||
from markitect.proxy.exceptions import DependencyMissingError
|
||||
|
||||
|
||||
class HtmlExtractor(BaseExtractor):
    """Converts HTML files to Markdown via the optional ``markdownify`` package."""

    name = "html"
    version = "1.0"
    extensions = (".html", ".htm")

    def check_dependencies(self) -> bool:
        """Return True if ``markdownify`` can be imported, False otherwise."""
        try:
            import markdownify  # noqa: F401
        except ImportError:
            return False
        return True

    def dependency_hint(self) -> str:
        """Return the pip command(s) that install the HTML dependency."""
        return 'pip install "markitect[proxy-html]" (or: pip install markdownify)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Read an HTML file and convert it to ATX-style Markdown.

        Args:
            source_path: Path to the HTML file; it is decoded as UTF-8.

        Returns:
            ExtractionResult holding the converted Markdown together with
            this extractor's name and version.

        Raises:
            DependencyMissingError: If ``check_dependencies()`` reports that
                ``markdownify`` is not available.
        """
        if not self.check_dependencies():
            raise DependencyMissingError(
                "markdownify is required to extract HTML files.",
                package="markdownify",
                install_hint=self.dependency_hint(),
            )

        # Imported lazily so the module stays importable without the
        # optional dependency installed.
        import markdownify

        # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also drops
        # a leading byte-order mark. With plain "utf-8" the BOM would survive
        # as U+FEFF and leak into the generated Markdown.
        html_content = source_path.read_text(encoding="utf-8-sig")
        md_text = markdownify.markdownify(html_content, heading_style="ATX")
        return ExtractionResult(
            content=md_text,
            extractor=self.name,
            extractor_version=self.version,
        )
|
||||
29
markitect/proxy/extractors/markdown.py
Normal file
29
markitect/proxy/extractors/markdown.py
Normal file
@@ -0,0 +1,29 @@
|
||||
"""
|
||||
Markdown normalizer — passes through Markdown with minimal transformation.
|
||||
|
||||
No external dependencies required.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from markitect.proxy.extractors.base import BaseExtractor
|
||||
from markitect.proxy.models import ExtractionResult
|
||||
|
||||
|
||||
class MarkdownNormalizer(BaseExtractor):
    """Passes Markdown source files through as proxy content.

    Built in: it needs no optional dependencies.
    """

    name = "markdown"
    version = "1.0"
    extensions = (".md", ".markdown", ".mdown")

    def check_dependencies(self) -> bool:
        """Always return True; this extractor only uses the standard library."""
        return True

    def extract(self, source_path: Path) -> ExtractionResult:
        """Read a Markdown file and return its text as the extraction result.

        Args:
            source_path: Path to the Markdown file; it is decoded as UTF-8.

        Returns:
            ExtractionResult holding the file's text together with this
            extractor's name and version.
        """
        # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also drops
        # a leading byte-order mark. Left in place, the BOM would become a
        # stray U+FEFF in front of the first Markdown construct.
        content = source_path.read_text(encoding="utf-8-sig")
        return ExtractionResult(
            content=content,
            extractor=self.name,
            extractor_version=self.version,
        )
|
||||
44
markitect/proxy/extractors/pdf.py
Normal file
44
markitect/proxy/extractors/pdf.py
Normal file
@@ -0,0 +1,44 @@
|
||||
"""
|
||||
PDF extractor using pymupdf4llm.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from markitect.proxy.extractors.base import BaseExtractor
|
||||
from markitect.proxy.models import ExtractionResult
|
||||
from markitect.proxy.exceptions import DependencyMissingError
|
||||
|
||||
|
||||
class PdfExtractor(BaseExtractor):
    """Turns PDF files into Markdown using the optional ``pymupdf4llm`` package."""

    name = "pdf"
    version = "1.0"
    extensions = (".pdf",)

    def check_dependencies(self) -> bool:
        """Report whether ``pymupdf4llm`` can be imported."""
        try:
            import pymupdf4llm  # noqa: F401
        except ImportError:
            return False
        else:
            return True

    def dependency_hint(self) -> str:
        """Return the pip command(s) that install the PDF dependency."""
        return 'pip install "markitect[proxy-pdf]" (or: pip install pymupdf4llm)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the PDF at ``source_path`` to Markdown.

        Args:
            source_path: Path to the PDF file; handed to pymupdf4llm as a
                string.

        Returns:
            ExtractionResult holding the Markdown produced by pymupdf4llm
            together with this extractor's name and version.

        Raises:
            DependencyMissingError: If ``check_dependencies()`` reports that
                ``pymupdf4llm`` is not available.
        """
        dependencies_ok = self.check_dependencies()
        if not dependencies_ok:
            raise DependencyMissingError(
                "pymupdf4llm is required to extract PDF files.",
                package="pymupdf4llm",
                install_hint=self.dependency_hint(),
            )

        # Imported lazily so the module stays importable without the
        # optional dependency installed.
        import pymupdf4llm

        pdf_location = str(source_path)
        markdown = pymupdf4llm.to_markdown(pdf_location)
        return ExtractionResult(
            content=markdown,
            extractor=self.name,
            extractor_version=self.version,
        )
|
||||
Reference in New Issue
Block a user