generated from coulomb/repo-seed
first ingestion/normalization slice
This commit is contained in:
5
src/kontextual_engine/adapters/local_files/__init__.py
Normal file
5
src/kontextual_engine/adapters/local_files/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Local filesystem ingestion connector."""
|
||||
|
||||
from .connector import LocalFileConnector
|
||||
|
||||
__all__ = ["LocalFileConnector"]
|
||||
77
src/kontextual_engine/adapters/local_files/connector.py
Normal file
77
src/kontextual_engine/adapters/local_files/connector.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""Local file and directory source connector."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import mimetypes
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from kontextual_engine.core import ConnectorCapability, SourcePayload, SourceReference, content_digest
|
||||
from kontextual_engine.errors import NotFoundError, ValidationError
|
||||
|
||||
|
||||
class LocalFileConnector:
    """Source connector that reads files and directories from the local filesystem."""

    name = "local_file"

    def capabilities(self) -> ConnectorCapability:
        """Describe what this connector can ingest.

        Returns:
            A ``ConnectorCapability`` naming this connector, listing the
            ``"file"`` and ``"directory"`` source types, flagging directory
            support, and advertising the ``file`` and ``path`` URI schemes.
        """
        return ConnectorCapability(
            connector_name=self.name,
            source_types=("file", "directory"),
            supports_directories=True,
            metadata={"uri_schemes": ["file", "path"]},
        )

    def fetch(self, source_uri: str) -> SourcePayload:
        """Read one local file and wrap its bytes in a ``SourcePayload``.

        Args:
            source_uri: Filesystem path of the file; a leading ``~`` is
                expanded to the user's home directory.

        Returns:
            A payload carrying the raw bytes, the guessed media type, the
            file stem as title, and a ``SourceReference`` holding the
            content checksum and the resolved absolute path.

        Raises:
            NotFoundError: If nothing exists at the path.
            ValidationError: If the path exists but is not a file.
        """
        # NOTE(review): ``source_uri`` is handed to ``Path`` unchanged, so a
        # ``file://`` URL is not decoded here even though ``capabilities``
        # lists the ``file`` scheme -- confirm callers pass plain paths.
        path = Path(source_uri).expanduser()
        if not path.exists():
            raise NotFoundError("Local source file not found", details={"path": str(path)})
        if not path.is_file():
            raise ValidationError("Local source is not a file", details={"path": str(path)})

        content = path.read_bytes()
        # Stat exactly once and reuse the snapshot: two separate stat calls
        # could disagree if the file is modified in between, leaving the
        # reference metadata and the payload metadata inconsistent.
        file_meta = _file_metadata(path)
        media_type = _guess_media_type(path)

        source_ref = SourceReference(
            source_system=self.name,
            path=str(path),
            checksum=content_digest(content),
            connector_ref=f"{self.name}:{path.resolve()}",
            metadata=file_meta,
        )
        return SourcePayload(
            connector_name=self.name,
            source_uri=str(path),
            source_ref=source_ref,
            media_type=media_type,
            content=content,
            title=path.stem,
            # Built as a new dict, so the payload never aliases the
            # reference's metadata mapping.
            metadata={"filename": path.name, **file_meta},
        )

    def iter_files(self, source_uri: str, *, recursive: bool = True) -> list[str]:
        """List the files reachable from a local path.

        Args:
            source_uri: Filesystem path of a directory or a single file; a
                leading ``~`` is expanded. It is passed to ``Path``
                unchanged, exactly as in ``fetch``.
            recursive: When true (the default) descend into subdirectories;
                otherwise only direct children are considered.

        Returns:
            File paths as strings in sorted order. If ``source_uri`` is
            itself a file, a one-element list containing that path.

        Raises:
            NotFoundError: If nothing exists at the path.
            ValidationError: If the path is neither a file nor a directory.
        """
        root = Path(source_uri).expanduser()
        if not root.exists():
            raise NotFoundError("Local source directory not found", details={"path": str(root)})
        if root.is_file():
            return [str(root)]
        if not root.is_dir():
            raise ValidationError("Local source is not a directory", details={"path": str(root)})

        pattern = "**/*" if recursive else "*"
        # Directories also match the glob; keep files only.
        return sorted(str(entry) for entry in root.glob(pattern) if entry.is_file())
|
||||
|
||||
|
||||
def _guess_media_type(path: Path) -> str:
|
||||
suffix = path.suffix.lower()
|
||||
if suffix in {".md", ".markdown", ".mkd"}:
|
||||
return "text/markdown"
|
||||
if suffix in {".txt", ".text", ".log"}:
|
||||
return "text/plain"
|
||||
guessed, _ = mimetypes.guess_type(path.name)
|
||||
return guessed or "application/octet-stream"
|
||||
|
||||
|
||||
def _file_metadata(path: Path) -> dict[str, Any]:
|
||||
stat = path.stat()
|
||||
return {
|
||||
"size_bytes": stat.st_size,
|
||||
"mtime_ns": stat.st_mtime_ns,
|
||||
}
|
||||
Reference in New Issue
Block a user