first ingestion/normalization slice

This commit is contained in:
2026-05-06 02:35:40 +02:00
parent 286ebc3cb6
commit 565a5643a3
19 changed files with 1231 additions and 10 deletions

View File

@@ -0,0 +1,5 @@
"""Local filesystem ingestion connector."""
# Re-export the connector class from the sibling ``connector`` module;
# ``__all__`` restricts star-imports of this package to that one name.
from .connector import LocalFileConnector
__all__ = ["LocalFileConnector"]

View File

@@ -0,0 +1,77 @@
"""Local file and directory source connector."""
from __future__ import annotations
import mimetypes
from pathlib import Path
from typing import Any
from kontextual_engine.core import ConnectorCapability, SourcePayload, SourceReference, content_digest
from kontextual_engine.errors import NotFoundError, ValidationError
class LocalFileConnector:
    """Source connector that reads files and lists directories on local disk."""

    name = "local_file"

    def capabilities(self) -> ConnectorCapability:
        """Describe what this connector supports.

        Returns:
            A ``ConnectorCapability`` carrying this connector's name, the
            ``file`` and ``directory`` source types, the directory-support
            flag, and a ``uri_schemes`` metadata entry.
        """
        return ConnectorCapability(
            connector_name=self.name,
            source_types=("file", "directory"),
            supports_directories=True,
            metadata={"uri_schemes": ["file", "path"]},
        )

    def fetch(self, source_uri: str) -> SourcePayload:
        """Read one local file and wrap its bytes in a ``SourcePayload``.

        Args:
            source_uri: Filesystem path of the file; a leading ``~`` is
                expanded. NOTE(review): the value is handed to ``Path``
                as-is, so a ``file://`` prefix is not stripped here even
                though ``capabilities`` lists ``file`` under
                ``uri_schemes`` -- confirm callers pass plain paths.

        Returns:
            A payload holding the raw file content, the guessed media type,
            the file stem as title, and size/mtime metadata plus the
            filename.

        Raises:
            NotFoundError: If the path does not exist.
            ValidationError: If the path exists but is not a regular file.
        """
        path = Path(source_uri).expanduser()
        if not path.exists():
            raise NotFoundError("Local source file not found", details={"path": str(path)})
        if not path.is_file():
            raise ValidationError("Local source is not a file", details={"path": str(path)})
        content = path.read_bytes()
        media_type = _guess_media_type(path)
        # Stat the file once so the reference and the payload always report
        # the same size/mtime, even if the file is modified concurrently.
        file_metadata = _file_metadata(path)
        source_ref = SourceReference(
            source_system=self.name,
            path=str(path),
            checksum=content_digest(content),
            connector_ref=f"{self.name}:{path.resolve()}",
            metadata=file_metadata,
        )
        return SourcePayload(
            connector_name=self.name,
            source_uri=str(path),
            source_ref=source_ref,
            media_type=media_type,
            content=content,
            title=path.stem,
            # Built as a new dict, so it never aliases source_ref.metadata.
            metadata={"filename": path.name, **file_metadata},
        )

    def iter_files(self, source_uri: str, *, recursive: bool = True) -> list[str]:
        """List the files reachable from a local path.

        Args:
            source_uri: Filesystem path of a file or directory; a leading
                ``~`` is expanded.
            recursive: When true (the default) descend into subdirectories;
                otherwise list only the directory's direct children.

        Returns:
            A single-element list when the path is itself a file, otherwise
            the sorted string paths of every file found under the directory.

        Raises:
            NotFoundError: If the path does not exist.
            ValidationError: If the path is neither a file nor a directory.
        """
        root = Path(source_uri).expanduser()
        if not root.exists():
            raise NotFoundError("Local source directory not found", details={"path": str(root)})
        if root.is_file():
            return [str(root)]
        if not root.is_dir():
            raise ValidationError("Local source is not a directory", details={"path": str(root)})
        pattern = "**/*" if recursive else "*"
        return sorted(str(path) for path in root.glob(pattern) if path.is_file())
def _guess_media_type(path: Path) -> str:
    """Return a media type for *path* based on its file extension.

    Markdown and plain-text extensions are mapped explicitly (matched
    case-insensitively); every other name is passed to
    ``mimetypes.guess_type``, with ``application/octet-stream`` as the
    fallback when no type is found.
    """
    extension = path.suffix.lower()
    # Explicit overrides are consulted before the mimetypes lookup.
    overrides = (
        ((".md", ".markdown", ".mkd"), "text/markdown"),
        ((".txt", ".text", ".log"), "text/plain"),
    )
    for extensions, media_type in overrides:
        if extension in extensions:
            return media_type
    media_type, _encoding = mimetypes.guess_type(path.name)
    if not media_type:
        return "application/octet-stream"
    return media_type
def _file_metadata(path: Path) -> dict[str, Any]:
    """Return size and modification time for *path* from one ``stat`` call.

    Args:
        path: Filesystem path to inspect.

    Returns:
        A dict with ``size_bytes`` (from ``st_size``) and ``mtime_ns``
        (from ``st_mtime_ns``), in that order.
    """
    info = path.stat()
    return dict(size_bytes=info.st_size, mtime_ns=info.st_mtime_ns)