generated from coulomb/repo-seed
richer normalized structure, permission context preservation
This commit is contained in:
@@ -4,6 +4,7 @@ from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import io
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from kontextual_engine.core import ExtractionResult, ExtractorCapability, NormalizedDocument, SourcePayload
|
||||
@@ -31,9 +32,10 @@ class CsvDatasetExtractor:
|
||||
reader = csv.DictReader(io.StringIO(text), delimiter=delimiter)
|
||||
columns = list(reader.fieldnames or [])
|
||||
rows = [dict(row) for row in reader]
|
||||
links = _links_from_rows(rows)
|
||||
table = {
|
||||
"name": payload.title,
|
||||
"columns": columns,
|
||||
"columns": [{"name": column, "index": index} for index, column in enumerate(columns)],
|
||||
"rows": rows,
|
||||
"row_count": len(rows),
|
||||
}
|
||||
@@ -44,6 +46,9 @@ class CsvDatasetExtractor:
|
||||
"column_count": len(columns),
|
||||
"row_count": len(rows),
|
||||
"table_count": 1,
|
||||
"link_count": len(links),
|
||||
"links": links,
|
||||
"sample_rows": rows[:5],
|
||||
"source_digest": payload.content_digest,
|
||||
"source_size_bytes": payload.size_bytes,
|
||||
}
|
||||
@@ -53,15 +58,19 @@ class CsvDatasetExtractor:
|
||||
structure={
|
||||
"kind": "dataset",
|
||||
"format": metadata["dataset_format"],
|
||||
"columns": columns,
|
||||
"columns": [{"name": column, "index": index} for index, column in enumerate(columns)],
|
||||
"row_count": len(rows),
|
||||
"table_count": 1,
|
||||
"sample_rows": rows[:5],
|
||||
},
|
||||
tables=[table],
|
||||
links=links,
|
||||
fields={
|
||||
"columns": columns,
|
||||
"column_count": len(columns),
|
||||
"row_count": len(rows),
|
||||
"dataset_format": metadata["dataset_format"],
|
||||
"link_count": len(links),
|
||||
},
|
||||
confidence=0.95,
|
||||
extractor_metadata={
|
||||
@@ -77,3 +86,21 @@ def _delimiter_for(payload: SourcePayload) -> str:
|
||||
if payload.media_type == "text/tab-separated-values" or filename.endswith(".tsv"):
|
||||
return "\t"
|
||||
return ","
|
||||
|
||||
|
||||
def _links_from_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
||||
links: list[dict[str, Any]] = []
|
||||
for row_index, row in enumerate(rows):
|
||||
for column, value in row.items():
|
||||
if not isinstance(value, str):
|
||||
continue
|
||||
for match in re.finditer(r"https?://[^\s<>)]+", value):
|
||||
links.append(
|
||||
{
|
||||
"url": match.group(0),
|
||||
"table": 0,
|
||||
"row": row_index,
|
||||
"column": column,
|
||||
}
|
||||
)
|
||||
return links
|
||||
|
||||
@@ -63,6 +63,9 @@ class DocumentPlaceholderExtractor:
|
||||
"document_kind": document_kind,
|
||||
"extraction_depth": "metadata_only",
|
||||
"unsupported_elements": [unsupported],
|
||||
"unsupported_count": 1,
|
||||
"link_count": 0,
|
||||
"table_count": 0,
|
||||
"source_digest": payload.content_digest,
|
||||
"source_size_bytes": payload.size_bytes,
|
||||
}
|
||||
@@ -72,11 +75,15 @@ class DocumentPlaceholderExtractor:
|
||||
structure={
|
||||
"kind": document_kind,
|
||||
"extraction_depth": "metadata_only",
|
||||
"unsupported_elements": [unsupported],
|
||||
},
|
||||
fields={
|
||||
"document_kind": document_kind,
|
||||
"source_media_type": payload.media_type,
|
||||
"source_size_bytes": payload.size_bytes,
|
||||
"unsupported_count": 1,
|
||||
"link_count": 0,
|
||||
"table_count": 0,
|
||||
},
|
||||
confidence=0.0,
|
||||
unsupported_elements=[unsupported],
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from kontextual_engine.core import ExtractionResult, ExtractorCapability, NormalizedDocument, SourcePayload
|
||||
|
||||
|
||||
@@ -13,8 +15,8 @@ class PlainTextExtractor:
|
||||
return ExtractorCapability(
|
||||
extractor_name=self.name,
|
||||
media_types=self.media_types,
|
||||
extraction_depth="text",
|
||||
produces_structure=False,
|
||||
extraction_depth="text_structure",
|
||||
produces_structure=True,
|
||||
)
|
||||
|
||||
def supports(self, media_type: str) -> bool:
|
||||
@@ -22,10 +24,23 @@ class PlainTextExtractor:
|
||||
|
||||
def extract(self, payload: SourcePayload) -> ExtractionResult:
|
||||
text = payload.read_text()
|
||||
lines = _lines(text)
|
||||
paragraphs = _paragraphs(text)
|
||||
links = _links(text)
|
||||
normalized = NormalizedDocument(
|
||||
title=payload.title,
|
||||
text=text,
|
||||
fields={"line_count": len(text.splitlines())},
|
||||
structure={
|
||||
"kind": "plain_text",
|
||||
"lines": lines,
|
||||
"paragraphs": paragraphs,
|
||||
},
|
||||
links=links,
|
||||
fields={
|
||||
"line_count": len(lines),
|
||||
"paragraph_count": len(paragraphs),
|
||||
"link_count": len(links),
|
||||
},
|
||||
confidence=1.0,
|
||||
extractor_metadata={
|
||||
"extractor": self.name,
|
||||
@@ -36,7 +51,70 @@ class PlainTextExtractor:
|
||||
normalized=normalized,
|
||||
metadata={
|
||||
"extractor": self.name,
|
||||
"line_count": len(lines),
|
||||
"paragraph_count": len(paragraphs),
|
||||
"link_count": len(links),
|
||||
"links": links,
|
||||
"source_digest": payload.content_digest,
|
||||
"source_size_bytes": payload.size_bytes,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def _lines(text: str) -> list[dict[str, int | str]]:
|
||||
return [
|
||||
{
|
||||
"index": index,
|
||||
"line_number": index + 1,
|
||||
"text": line,
|
||||
}
|
||||
for index, line in enumerate(text.splitlines())
|
||||
]
|
||||
|
||||
|
||||
def _paragraphs(text: str) -> list[dict[str, int | str]]:
|
||||
paragraphs: list[dict[str, int | str]] = []
|
||||
current: list[str] = []
|
||||
start_line: int | None = None
|
||||
for index, line in enumerate(text.splitlines(), start=1):
|
||||
if line.strip():
|
||||
if start_line is None:
|
||||
start_line = index
|
||||
current.append(line)
|
||||
continue
|
||||
if current and start_line is not None:
|
||||
paragraphs.append(
|
||||
{
|
||||
"index": len(paragraphs),
|
||||
"line_start": start_line,
|
||||
"line_end": index - 1,
|
||||
"text": "\n".join(current),
|
||||
}
|
||||
)
|
||||
current = []
|
||||
start_line = None
|
||||
if current and start_line is not None:
|
||||
paragraphs.append(
|
||||
{
|
||||
"index": len(paragraphs),
|
||||
"line_start": start_line,
|
||||
"line_end": start_line + len(current) - 1,
|
||||
"text": "\n".join(current),
|
||||
}
|
||||
)
|
||||
return paragraphs
|
||||
|
||||
|
||||
def _links(text: str) -> list[dict[str, int | str]]:
|
||||
links: list[dict[str, int | str]] = []
|
||||
for line_index, line in enumerate(text.splitlines(), start=1):
|
||||
for match in re.finditer(r"https?://[^\s<>)]+", line):
|
||||
links.append(
|
||||
{
|
||||
"url": match.group(0),
|
||||
"line": line_index,
|
||||
"start": match.start(),
|
||||
"end": match.end(),
|
||||
}
|
||||
)
|
||||
return links
|
||||
|
||||
@@ -42,8 +42,11 @@ class MarkitectMarkdownExtractor:
|
||||
document = self._parse_document(mkt, text, source_path)
|
||||
serialized = document.to_dict() if hasattr(document, "to_dict") else {}
|
||||
snapshot = self._snapshot(mkt, source_path)
|
||||
links = _links_from_tokens(list(serialized.get("tokens", [])))
|
||||
tables = _tables_from_blocks(list(serialized.get("blocks", [])))
|
||||
structure = {
|
||||
"frontmatter": dict(serialized.get("frontmatter", {})),
|
||||
"blocks": list(serialized.get("blocks", [])),
|
||||
"headings": list(serialized.get("headings", [])),
|
||||
"sections": list(serialized.get("sections", [])),
|
||||
}
|
||||
@@ -51,10 +54,15 @@ class MarkitectMarkdownExtractor:
|
||||
title=payload.title,
|
||||
text=text,
|
||||
structure=structure,
|
||||
tables=tables,
|
||||
links=links,
|
||||
fields={
|
||||
"frontmatter": dict(serialized.get("frontmatter", {})),
|
||||
"block_count": len(structure["blocks"]),
|
||||
"heading_count": len(structure["headings"]),
|
||||
"section_count": len(structure["sections"]),
|
||||
"table_count": len(tables),
|
||||
"link_count": len(links),
|
||||
},
|
||||
confidence=1.0,
|
||||
extractor_metadata={
|
||||
@@ -68,7 +76,12 @@ class MarkitectMarkdownExtractor:
|
||||
metadata={
|
||||
"extractor": self.name,
|
||||
"frontmatter": structure["frontmatter"],
|
||||
"blocks": structure["blocks"],
|
||||
"headings": structure["headings"],
|
||||
"sections": structure["sections"],
|
||||
"links": links,
|
||||
"link_count": len(links),
|
||||
"table_count": len(tables),
|
||||
"snapshot": snapshot,
|
||||
"source_digest": payload.content_digest,
|
||||
"source_size_bytes": payload.size_bytes,
|
||||
@@ -84,3 +97,52 @@ class MarkitectMarkdownExtractor:
|
||||
if not source_path or not Path(source_path).exists() or not hasattr(mkt, "snapshot_identity_for_file"):
|
||||
return {}
|
||||
return mkt.snapshot_identity_for_file(Path(source_path), parse_options={"profile": "default"}).to_dict()
|
||||
|
||||
|
||||
def _links_from_tokens(tokens: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Return one link record per ``link_open`` token that has an ``href``.

    The token tree is flattened with ``_walk_tokens``; for each token whose
    ``type`` is ``"link_open"`` the ``href`` is looked up in its ``attrs``
    via ``_attr_value``. Tokens with a missing or empty href are skipped.
    Every record has the form ``{"url": href, "kind": "markdown_link"}``.
    """
    hrefs = (
        _attr_value(node.get("attrs"), "href")
        for node in _walk_tokens(tokens)
        if node.get("type") == "link_open"
    )
    return [{"url": target, "kind": "markdown_link"} for target in hrefs if target]
|
||||
|
||||
|
||||
def _walk_tokens(tokens: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
||||
walked: list[dict[str, Any]] = []
|
||||
for token in tokens:
|
||||
walked.append(token)
|
||||
children = token.get("children")
|
||||
if isinstance(children, list):
|
||||
walked.extend(_walk_tokens([child for child in children if isinstance(child, dict)]))
|
||||
return walked
|
||||
|
||||
|
||||
def _attr_value(attrs: Any, name: str) -> str | None:
|
||||
if isinstance(attrs, dict):
|
||||
value = attrs.get(name)
|
||||
return str(value) if value is not None else None
|
||||
if isinstance(attrs, list):
|
||||
for item in attrs:
|
||||
if isinstance(item, (list, tuple)) and len(item) == 2 and item[0] == name:
|
||||
return str(item[1])
|
||||
return None
|
||||
|
||||
|
||||
def _tables_from_blocks(blocks: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
||||
tables: list[dict[str, Any]] = []
|
||||
for index, block in enumerate(blocks):
|
||||
if block.get("type") != "table":
|
||||
continue
|
||||
tables.append(
|
||||
{
|
||||
"index": len(tables),
|
||||
"source_block_index": index,
|
||||
"text": block.get("text", ""),
|
||||
"line_start": block.get("line_start"),
|
||||
"line_end": block.get("line_end"),
|
||||
}
|
||||
)
|
||||
return tables
|
||||
|
||||
Reference in New Issue
Block a user