generated from coulomb/repo-seed
first ingestion/normalization slice
This commit is contained in:
@@ -0,0 +1,5 @@
|
||||
"""Built-in baseline format extractors."""
|
||||
|
||||
from .text import PlainTextExtractor
|
||||
|
||||
__all__ = ["PlainTextExtractor"]
|
||||
42
src/kontextual_engine/adapters/builtin_extractors/text.py
Normal file
42
src/kontextual_engine/adapters/builtin_extractors/text.py
Normal file
@@ -0,0 +1,42 @@
|
||||
"""Plain text normalization extractor."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from kontextual_engine.core import ExtractionResult, ExtractorCapability, NormalizedDocument, SourcePayload
|
||||
|
||||
|
||||
class PlainTextExtractor:
    """Baseline extractor that passes plain-text payloads through unchanged.

    The normalized document carries the payload text verbatim together with a
    ``line_count`` field; no structural analysis is performed.
    """

    name = "plain-text"
    media_types = ("text/plain",)

    def capabilities(self) -> ExtractorCapability:
        """Describe this extractor: text-depth extraction, no structure."""
        return ExtractorCapability(
            extractor_name=self.name,
            media_types=self.media_types,
            extraction_depth="text",
            produces_structure=False,
        )

    def supports(self, media_type: str) -> bool:
        """Return whether *media_type* denotes plain text.

        Anything listed verbatim in ``media_types`` is accepted.  Otherwise
        the value is reduced to its ``type/subtype`` essence: parameters after
        ``;`` (for example ``; charset=utf-8``) and surrounding whitespace are
        dropped and the comparison is case-insensitive.  An unrelated subtype
        that merely begins with ``text/plain`` is rejected.
        """
        if media_type in self.media_types:
            return True
        essence = media_type.partition(";")[0].strip().lower()
        return essence == "text/plain"

    def extract(self, payload: SourcePayload) -> ExtractionResult:
        """Wrap the payload text in a normalized document.

        Args:
            payload: Source payload; its ``read_text()``, ``title``,
                ``media_type``, ``content_digest`` and ``size_bytes`` are read.

        Returns:
            An ``ExtractionResult`` whose normalized document holds the text
            as read, a ``line_count`` field and a fixed confidence of 1.0.
        """
        text = payload.read_text()
        normalized = NormalizedDocument(
            title=payload.title,
            text=text,
            fields={"line_count": len(text.splitlines())},
            confidence=1.0,
            extractor_metadata={
                "extractor": self.name,
                "source_media_type": payload.media_type,
            },
        )
        return ExtractionResult(
            normalized=normalized,
            metadata={
                "extractor": self.name,
                "source_digest": payload.content_digest,
                "source_size_bytes": payload.size_bytes,
            },
        )
|
||||
5
src/kontextual_engine/adapters/local_files/__init__.py
Normal file
5
src/kontextual_engine/adapters/local_files/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Local filesystem ingestion connector."""
|
||||
|
||||
from .connector import LocalFileConnector
|
||||
|
||||
__all__ = ["LocalFileConnector"]
|
||||
77
src/kontextual_engine/adapters/local_files/connector.py
Normal file
77
src/kontextual_engine/adapters/local_files/connector.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""Local file and directory source connector."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import mimetypes
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from kontextual_engine.core import ConnectorCapability, SourcePayload, SourceReference, content_digest
|
||||
from kontextual_engine.errors import NotFoundError, ValidationError
|
||||
|
||||
|
||||
class LocalFileConnector:
    """Source connector that reads files and directory listings from local disk."""

    name = "local_file"

    def capabilities(self) -> ConnectorCapability:
        """Describe this connector: file and directory sources are handled."""
        return ConnectorCapability(
            connector_name=self.name,
            source_types=("file", "directory"),
            supports_directories=True,
            metadata={"uri_schemes": ["file", "path"]},
        )

    def fetch(self, source_uri: str) -> SourcePayload:
        """Read one local file and wrap it as a ``SourcePayload``.

        Args:
            source_uri: Filesystem path of the file; a leading ``~`` is
                expanded.

        Returns:
            A payload carrying the raw bytes, the guessed media type, the
            file stem as title and a ``SourceReference`` with the content
            checksum.

        Raises:
            NotFoundError: If the path does not exist.
            ValidationError: If the path exists but is not a regular file.
        """
        path = Path(source_uri).expanduser()
        if not path.exists():
            raise NotFoundError("Local source file not found", details={"path": str(path)})
        if not path.is_file():
            raise ValidationError("Local source is not a file", details={"path": str(path)})

        content = path.read_bytes()
        media_type = _guess_media_type(path)
        # Stat the file once so the reference and the payload report the same
        # size/mtime even if the file changes while we build the result.
        file_meta = _file_metadata(path)
        source_ref = SourceReference(
            source_system=self.name,
            path=str(path),
            checksum=content_digest(content),
            connector_ref=f"{self.name}:{path.resolve()}",
            metadata=dict(file_meta),
        )
        return SourcePayload(
            connector_name=self.name,
            source_uri=str(path),
            source_ref=source_ref,
            media_type=media_type,
            content=content,
            title=path.stem,
            metadata={"filename": path.name, **file_meta},
        )

    def iter_files(self, source_uri: str, *, recursive: bool = True) -> list[str]:
        """List the files found at *source_uri* as sorted path strings.

        Args:
            source_uri: Filesystem path of a directory or a single file; a
                leading ``~`` is expanded.
            recursive: When true (the default) descend into subdirectories;
                otherwise list only the directory's direct children.

        Returns:
            A one-element list if the path is a file, otherwise the sorted
            paths of all regular files matched under the directory.

        Raises:
            NotFoundError: If the path does not exist.
            ValidationError: If the path is neither a file nor a directory.
        """
        root = Path(source_uri).expanduser()
        if not root.exists():
            raise NotFoundError("Local source directory not found", details={"path": str(root)})
        if root.is_file():
            return [str(root)]
        if not root.is_dir():
            raise ValidationError("Local source is not a directory", details={"path": str(root)})
        pattern = "**/*" if recursive else "*"
        return sorted(str(path) for path in root.glob(pattern) if path.is_file())
|
||||
|
||||
|
||||
def _guess_media_type(path: Path) -> str:
|
||||
suffix = path.suffix.lower()
|
||||
if suffix in {".md", ".markdown", ".mkd"}:
|
||||
return "text/markdown"
|
||||
if suffix in {".txt", ".text", ".log"}:
|
||||
return "text/plain"
|
||||
guessed, _ = mimetypes.guess_type(path.name)
|
||||
return guessed or "application/octet-stream"
|
||||
|
||||
|
||||
def _file_metadata(path: Path) -> dict[str, Any]:
|
||||
stat = path.stat()
|
||||
return {
|
||||
"size_bytes": stat.st_size,
|
||||
"mtime_ns": stat.st_mtime_ns,
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
"""markitect-tool ingestion adapter boundary."""
|
||||
|
||||
from .markdown import MarkitectMarkdownExtractor
|
||||
|
||||
__all__ = ["MarkitectMarkdownExtractor"]
|
||||
86
src/kontextual_engine/adapters/markitect_tool/markdown.py
Normal file
86
src/kontextual_engine/adapters/markitect_tool/markdown.py
Normal file
@@ -0,0 +1,86 @@
|
||||
"""Markdown normalization through markitect-tool."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from kontextual_engine.core import ExtractionResult, ExtractorCapability, NormalizedDocument, SourcePayload
|
||||
from kontextual_engine.errors import AdapterUnavailableError
|
||||
|
||||
|
||||
class MarkitectMarkdownExtractor:
    """Adapter boundary to markitect-tool; Markdown syntax logic stays external."""

    name = "markitect-tool"
    media_types = ("text/markdown", "text/x-markdown")

    def capabilities(self) -> ExtractorCapability:
        """Describe this extractor: structure-depth, needs markitect-tool."""
        return ExtractorCapability(
            extractor_name=self.name,
            media_types=self.media_types,
            extraction_depth="structure",
            produces_structure=True,
            optional_dependency="markitect-tool",
            metadata={"delegates_markdown_syntax": True},
        )

    def supports(self, media_type: str) -> bool:
        """Return whether *media_type* is exactly one of ``media_types``."""
        return media_type in self.media_types

    def extract(self, payload: SourcePayload) -> ExtractionResult:
        """Normalize a Markdown payload by delegating parsing to markitect-tool.

        Args:
            payload: Source payload; its ``source_ref.path``, ``read_text()``,
                ``title``, ``media_type``, ``content_digest`` and
                ``size_bytes`` are read.

        Returns:
            An ``ExtractionResult`` whose normalized document holds the
            payload text plus the frontmatter, headings and sections reported
            by the parsed document's ``to_dict()`` (empty when the document
            has no ``to_dict``).

        Raises:
            AdapterUnavailableError: If ``markitect_tool`` cannot be imported.
        """
        try:
            import markitect_tool as mkt
        except Exception as exc:  # pragma: no cover - depends on optional environment
            # Deliberately broad: any failure while importing the optional
            # dependency is reported as "adapter unavailable".
            raise AdapterUnavailableError(
                "markitect-tool is required for markdown normalization",
                details={"adapter": self.name, "media_type": payload.media_type},
            ) from exc

        source_path = payload.source_ref.path
        text = payload.read_text()
        document = self._parse_document(mkt, text, source_path)
        serialized = document.to_dict() if hasattr(document, "to_dict") else {}
        snapshot = self._snapshot(mkt, source_path)
        structure = self._structure_from(serialized)
        normalized = NormalizedDocument(
            title=payload.title,
            text=text,
            structure=structure,
            fields={
                # Separate copy so ``fields`` and ``structure`` do not share
                # one mutable frontmatter dict.
                "frontmatter": dict(structure["frontmatter"]),
                "heading_count": len(structure["headings"]),
                "section_count": len(structure["sections"]),
            },
            confidence=1.0,
            extractor_metadata={
                "extractor": self.name,
                "source_media_type": payload.media_type,
                "snapshot": snapshot,
            },
        )
        return ExtractionResult(
            normalized=normalized,
            metadata={
                "extractor": self.name,
                "frontmatter": structure["frontmatter"],
                "headings": structure["headings"],
                "snapshot": snapshot,
                "source_digest": payload.content_digest,
                "source_size_bytes": payload.size_bytes,
            },
        )

    @staticmethod
    def _structure_from(serialized: dict[str, Any]) -> dict[str, Any]:
        """Pick frontmatter, headings and sections out of a serialized document.

        Missing keys and keys explicitly set to ``None`` both yield an empty
        container instead of raising ``TypeError``.
        """
        return {
            "frontmatter": dict(serialized.get("frontmatter") or {}),
            "headings": list(serialized.get("headings") or []),
            "sections": list(serialized.get("sections") or []),
        }

    def _parse_document(self, mkt: Any, text: str, source_path: str | None) -> Any:
        """Parse from the file on disk when possible, else from *text*.

        The file-based parser is used only if *source_path* is set, exists,
        and *mkt* exposes ``parse_markdown_file``.
        """
        if source_path and Path(source_path).exists() and hasattr(mkt, "parse_markdown_file"):
            return mkt.parse_markdown_file(Path(source_path))
        return mkt.parse_markdown(text, source_path=source_path)

    def _snapshot(self, mkt: Any, source_path: str | None) -> dict[str, Any]:
        """Return the snapshot identity for the source file as a dict.

        Yields ``{}`` when there is no existing file at *source_path* or *mkt*
        lacks ``snapshot_identity_for_file``.
        """
        if not source_path or not Path(source_path).exists() or not hasattr(mkt, "snapshot_identity_for_file"):
            return {}
        return mkt.snapshot_identity_for_file(Path(source_path), parse_options={"profile": "default"}).to_dict()
|
||||
@@ -13,6 +13,8 @@ from kontextual_engine.core import (
|
||||
ContextEntity,
|
||||
CoreRelationship,
|
||||
IdempotencyRecord,
|
||||
IngestionJob,
|
||||
IngestionJobStatus,
|
||||
KnowledgeAsset,
|
||||
LifecycleState,
|
||||
MetadataRecord,
|
||||
@@ -32,6 +34,7 @@ class InMemoryAssetRegistryRepository:
|
||||
versions: dict[str, list[AssetVersion]] = field(default_factory=dict)
|
||||
audit_events: dict[str, AuditEvent] = field(default_factory=dict)
|
||||
idempotency_records: dict[str, IdempotencyRecord] = field(default_factory=dict)
|
||||
ingestion_jobs: dict[str, IngestionJob] = field(default_factory=dict)
|
||||
|
||||
def save_actor(self, actor: Actor) -> Actor:
|
||||
self.actors[actor.id] = actor
|
||||
@@ -190,3 +193,23 @@ class InMemoryAssetRegistryRepository:
|
||||
|
||||
def get_idempotency_record(self, key: str) -> IdempotencyRecord | None:
    """Return the idempotency record stored under *key*, or ``None`` if absent."""
    record = self.idempotency_records.get(key)
    return record
|
||||
|
||||
def save_ingestion_job(self, job: IngestionJob) -> IngestionJob:
    """Store *job* under its ``job_id``, replacing any previous entry, and return it."""
    job_key = job.job_id
    self.ingestion_jobs[job_key] = job
    return job
|
||||
|
||||
def get_ingestion_job(self, job_id: str) -> IngestionJob:
    """Return the stored ingestion job with the given id.

    Raises:
        NotFoundError: If no job is stored under *job_id*; the underlying
            ``KeyError`` is chained as the cause.
    """
    try:
        found = self.ingestion_jobs[job_id]
    except KeyError as missing:
        raise NotFoundError("Ingestion job not found", details={"job_id": job_id}) from missing
    else:
        return found
|
||||
|
||||
def list_ingestion_jobs(
    self,
    *,
    status: IngestionJobStatus | None = None,
) -> list[IngestionJob]:
    """Return stored ingestion jobs ordered by ``(created_at, job_id)``.

    Args:
        status: When given, only jobs whose ``status`` equals it are returned;
            ``None`` returns every job.
    """
    selected = [
        candidate
        for candidate in self.ingestion_jobs.values()
        if status is None or candidate.status == status
    ]
    selected.sort(key=lambda candidate: (candidate.created_at, candidate.job_id))
    return selected
|
||||
|
||||
@@ -15,6 +15,8 @@ from kontextual_engine.core import (
|
||||
ContextEntity,
|
||||
CoreRelationship,
|
||||
IdempotencyRecord,
|
||||
IngestionJob,
|
||||
IngestionJobStatus,
|
||||
KnowledgeAsset,
|
||||
LifecycleState,
|
||||
MetadataRecord,
|
||||
@@ -381,6 +383,51 @@ class SQLiteAssetRegistryRepository:
|
||||
return None
|
||||
return IdempotencyRecord.from_dict(_loads(row["payload"]))
|
||||
|
||||
def save_ingestion_job(self, job: IngestionJob) -> IngestionJob:
    """Insert or update the ``ingestion_jobs`` row for *job* and return *job*.

    On an id conflict every column except ``id`` and ``created_at`` is
    overwritten with the new values; the ``payload`` column receives
    ``_json(job.to_dict())``.
    """
    upsert_sql = """
        insert into ingestion_jobs (id, status, actor_id, correlation_id, created_at, updated_at, payload)
        values (?, ?, ?, ?, ?, ?, ?)
        on conflict(id) do update set
            status=excluded.status,
            actor_id=excluded.actor_id,
            correlation_id=excluded.correlation_id,
            updated_at=excluded.updated_at,
            payload=excluded.payload
        """
    with self._connect() as conn:
        # Parameter order must match the column list of the insert above.
        row_values = (
            job.job_id,
            job.status.value,
            job.actor_id,
            job.correlation_id,
            job.created_at,
            job.updated_at,
            _json(job.to_dict()),
        )
        conn.execute(upsert_sql, row_values)
    return job
|
||||
|
||||
def get_ingestion_job(self, job_id: str) -> IngestionJob:
    """Load the ingestion job with the given id from its stored payload.

    Raises:
        NotFoundError: If no ``ingestion_jobs`` row has this id.
    """
    record = self._one("select payload from ingestion_jobs where id = ?", (job_id,))
    if record is not None:
        return IngestionJob.from_dict(_loads(record["payload"]))
    raise NotFoundError("Ingestion job not found", details={"job_id": job_id})
|
||||
|
||||
def list_ingestion_jobs(
    self,
    *,
    status: IngestionJobStatus | None = None,
) -> list[IngestionJob]:
    """Return ingestion jobs ordered by ``created_at`` then ``id``.

    Args:
        status: When given, only rows whose ``status`` column equals
            ``status.value`` are returned; ``None`` returns every job.
    """
    if status is None:
        query = "select payload from ingestion_jobs order by created_at, id"
        params = ()
    else:
        query = "select payload from ingestion_jobs where status = ? order by created_at, id"
        params = (status.value,)
    return [IngestionJob.from_dict(_loads(record["payload"])) for record in self._all(query, params)]
|
||||
|
||||
def _initialize(self) -> None:
|
||||
with self._connect() as conn:
|
||||
conn.executescript(
|
||||
@@ -449,6 +496,15 @@ class SQLiteAssetRegistryRepository:
|
||||
status text not null,
|
||||
payload text not null
|
||||
);
|
||||
create table if not exists ingestion_jobs (
|
||||
id text primary key,
|
||||
status text not null,
|
||||
actor_id text not null,
|
||||
correlation_id text not null,
|
||||
created_at text not null,
|
||||
updated_at text not null,
|
||||
payload text not null
|
||||
);
|
||||
create index if not exists idx_assets_lifecycle on assets(lifecycle);
|
||||
create index if not exists idx_representations_asset on representations(asset_id);
|
||||
create index if not exists idx_metadata_asset on metadata_records(asset_id);
|
||||
@@ -458,6 +514,8 @@ class SQLiteAssetRegistryRepository:
|
||||
create index if not exists idx_versions_asset on asset_versions(asset_id);
|
||||
create index if not exists idx_audit_target on audit_events(target);
|
||||
create index if not exists idx_audit_correlation on audit_events(correlation_id);
|
||||
create index if not exists idx_ingestion_jobs_status on ingestion_jobs(status);
|
||||
create index if not exists idx_ingestion_jobs_correlation on ingestion_jobs(correlation_id);
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user