feat(proxy): add proxy file system for non-markdown source conversion

Introduces a new `markitect/proxy/` module with pluggable extractors that
convert non-markdown sources (PDF, HTML) into tracked markdown proxy files.
Proxy files preserve origin metadata (path, checksum, timestamp) so they
can be kept in sync when the original changes.

CLI commands: `proxy create`, `proxy update`, `proxy status`, `proxy extractors`.
Built-in extractors: PDF (pymupdf4llm), HTML (markdownify), Markdown (pass-through, no extra dependency).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-13 19:06:09 +01:00
parent 69aea1ada7
commit ac334c679d
13 changed files with 781 additions and 0 deletions

View File

@@ -0,0 +1,14 @@
"""
Built-in extractor registration.
Importing this module registers all built-in extractors with the global registry.
"""
from markitect.proxy.registry import registry
from markitect.proxy.extractors.pdf import PdfExtractor
from markitect.proxy.extractors.html import HtmlExtractor
from markitect.proxy.extractors.markdown import MarkdownNormalizer
# Create one instance of each built-in extractor and hand it to the global
# registry. Registration order: PDF, HTML, Markdown.
for _extractor_cls in (PdfExtractor, HtmlExtractor, MarkdownNormalizer):
    registry.register(_extractor_cls())

View File

@@ -0,0 +1,43 @@
"""
Abstract base class for proxy file extractors.
"""
from abc import ABC, abstractmethod
from pathlib import Path
from markitect.proxy.models import ExtractionResult
class BaseExtractor(ABC):
    """Common interface for proxy extractors.

    A concrete extractor overrides the class attributes below and implements
    :meth:`extract` and :meth:`check_dependencies`. :meth:`dependency_hint`
    is optional and defaults to an empty string.
    """

    # Short identifier of the extractor (empty on the base class).
    name: str = ""
    # Version string of the extractor implementation.
    version: str = "1.0"
    # File extensions the extractor declares, e.g. ``(".pdf",)``.
    # NOTE(review): presumably used by the registry to pick an extractor
    # for a given source file -- confirm against the registry module.
    extensions: tuple = ()

    @abstractmethod
    def extract(self, source_path: Path) -> ExtractionResult:
        """Produce markdown from the given source file.

        Args:
            source_path: Location of the file to convert.

        Returns:
            An ExtractionResult holding the extracted markdown content.
        """

    @abstractmethod
    def check_dependencies(self) -> bool:
        """Tell whether everything this extractor needs is installed.

        Returns:
            True when all required dependencies are available, otherwise
            False.
        """

    def dependency_hint(self) -> str:
        """Describe how to install this extractor's missing dependencies.

        Returns:
            Human-readable instructions such as
            ``pip install markitect[proxy-pdf]``. The base implementation
            returns an empty string.
        """
        return ""

View File

@@ -0,0 +1,45 @@
"""
HTML extractor using markdownify.
"""
from pathlib import Path
from markitect.proxy.extractors.base import BaseExtractor
from markitect.proxy.models import ExtractionResult
from markitect.proxy.exceptions import DependencyMissingError
class HtmlExtractor(BaseExtractor):
    """Converts HTML files to Markdown via markdownify."""

    name = "html"
    version = "1.0"
    extensions = (".html", ".htm")

    def check_dependencies(self) -> bool:
        """Report whether ``markdownify`` can be imported.

        Returns:
            True if the import succeeds, False if it raises ImportError.
        """
        try:
            import markdownify  # noqa: F401
        except ImportError:
            return False
        return True

    def dependency_hint(self) -> str:
        """Return the install instructions shown when markdownify is missing."""
        return 'pip install "markitect[proxy-html]" (or: pip install markdownify)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the HTML file at ``source_path`` to Markdown.

        Args:
            source_path: Path to the HTML file. It is decoded as UTF-8; a
                leading byte-order mark, if present, is dropped.

        Returns:
            ExtractionResult carrying the converted Markdown together with
            this extractor's name and version.

        Raises:
            DependencyMissingError: If ``check_dependencies()`` reports that
                markdownify is unavailable.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        if not self.check_dependencies():
            raise DependencyMissingError(
                "markdownify is required to extract HTML files.",
                package="markdownify",
                install_hint=self.dependency_hint(),
            )

        # Imported here rather than at module level because markdownify is
        # an optional dependency.
        import markdownify

        # "utf-8-sig" decodes BOM-less UTF-8 exactly like "utf-8" but skips
        # a leading BOM, which would otherwise be handed to markdownify as
        # a stray U+FEFF character.
        html_content = source_path.read_text(encoding="utf-8-sig")
        md_text = markdownify.markdownify(html_content, heading_style="ATX")
        return ExtractionResult(
            content=md_text,
            extractor=self.name,
            extractor_version=self.version,
        )

View File

@@ -0,0 +1,29 @@
"""
Markdown normalizer — passes through Markdown with minimal transformation.
No external dependencies required.
"""
from pathlib import Path
from markitect.proxy.extractors.base import BaseExtractor
from markitect.proxy.models import ExtractionResult
class MarkdownNormalizer(BaseExtractor):
    """Passes existing Markdown files through as proxy content.

    Built-in: needs no optional dependencies.
    """

    name = "markdown"
    version = "1.0"
    extensions = (".md", ".markdown", ".mdown")

    def check_dependencies(self) -> bool:
        """Always return True; this extractor has no external dependencies."""
        return True

    def extract(self, source_path: Path) -> ExtractionResult:
        """Read the Markdown file at ``source_path`` and return its text.

        Args:
            source_path: Path to the Markdown file. It is decoded as UTF-8;
                a leading byte-order mark, if present, is dropped.

        Returns:
            ExtractionResult carrying the file's text together with this
            extractor's name and version.

        Raises:
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        # "utf-8-sig" decodes BOM-less UTF-8 exactly like "utf-8" but skips
        # a leading BOM, so the returned content never starts with a stray
        # U+FEFF character.
        content = source_path.read_text(encoding="utf-8-sig")
        return ExtractionResult(
            content=content,
            extractor=self.name,
            extractor_version=self.version,
        )

View File

@@ -0,0 +1,44 @@
"""
PDF extractor using pymupdf4llm.
"""
from pathlib import Path
from markitect.proxy.extractors.base import BaseExtractor
from markitect.proxy.models import ExtractionResult
from markitect.proxy.exceptions import DependencyMissingError
class PdfExtractor(BaseExtractor):
    """Turns PDF files into markdown with the help of pymupdf4llm."""

    name = "pdf"
    version = "1.0"
    extensions = (".pdf",)

    def check_dependencies(self) -> bool:
        """Report whether ``pymupdf4llm`` can be imported.

        Returns:
            True if the import succeeds, False if it raises ImportError.
        """
        try:
            import pymupdf4llm  # noqa: F401
        except ImportError:
            return False
        return True

    def dependency_hint(self) -> str:
        """Return the install instructions shown when pymupdf4llm is missing."""
        return 'pip install "markitect[proxy-pdf]" (or: pip install pymupdf4llm)'

    def extract(self, source_path: Path) -> ExtractionResult:
        """Convert the PDF at ``source_path`` to markdown.

        Args:
            source_path: Path to the PDF file; passed to pymupdf4llm as a
                string.

        Returns:
            ExtractionResult carrying the markdown produced by
            ``pymupdf4llm.to_markdown`` together with this extractor's name
            and version.

        Raises:
            DependencyMissingError: If ``check_dependencies()`` reports that
                pymupdf4llm is unavailable.
        """
        if self.check_dependencies():
            # Imported here rather than at module level because pymupdf4llm
            # is an optional dependency.
            import pymupdf4llm

            markdown = pymupdf4llm.to_markdown(str(source_path))
            return ExtractionResult(
                content=markdown,
                extractor=self.name,
                extractor_version=self.version,
            )

        raise DependencyMissingError(
            "pymupdf4llm is required to extract PDF files.",
            package="pymupdf4llm",
            install_hint=self.dependency_hint(),
        )