generated from coulomb/repo-seed
Workplan dependencies and prio for text research lab workplans
This commit is contained in:
@@ -21,6 +21,12 @@ from markitect_tool.contract import (
|
||||
validate_contract_file,
|
||||
)
|
||||
from markitect_tool.diagnostics import Diagnostic, SourceLocation
|
||||
from markitect_tool.query import (
|
||||
InvalidQueryError,
|
||||
QueryMatch,
|
||||
extract_document,
|
||||
query_document,
|
||||
)
|
||||
from markitect_tool.schema import (
|
||||
MarkdownSchema,
|
||||
SchemaValidationResult,
|
||||
@@ -55,4 +61,8 @@ __all__ = [
|
||||
"validate_contract_file",
|
||||
"Diagnostic",
|
||||
"SourceLocation",
|
||||
"InvalidQueryError",
|
||||
"QueryMatch",
|
||||
"extract_document",
|
||||
"query_document",
|
||||
]
|
||||
|
||||
@@ -16,6 +16,7 @@ from markitect_tool.contract import (
|
||||
load_contract_file,
|
||||
validate_contract,
|
||||
)
|
||||
from markitect_tool.query import InvalidQueryError, extract_document, query_document
|
||||
from markitect_tool.schema import load_schema_file, validate_markdown_file, validate_schema
|
||||
|
||||
|
||||
@@ -65,6 +66,60 @@ def metrics(file: Path, output_format: str) -> None:
|
||||
_emit_metrics(data, output_format)
|
||||
|
||||
|
||||
@main.command()
@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.argument("selector")
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["json", "yaml", "text"], case_sensitive=False),
    default="json",
    show_default=True,
)
def query(file: Path, selector: str, output_format: str) -> None:
    """Query structured Markdown content with a small selector."""

    # NOTE: the docstring above doubles as the command's --help text.
    # Parsing stays outside the ``try`` so that only selector problems are
    # re-raised as ``click.ClickException``.
    document = parse_markdown_file(file)
    try:
        found = query_document(document, selector)
    except InvalidQueryError as exc:
        raise click.ClickException(str(exc)) from exc

    # Key order is significant: the YAML emitter is called with
    # ``sort_keys=False`` and JSON preserves insertion order.
    _emit_query(
        {
            "selector": selector,
            "document_path": str(file),
            "count": len(found),
            "matches": [hit.to_dict() for hit in found],
        },
        output_format,
    )
|
||||
|
||||
|
||||
@main.command()
@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.argument("selector")
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["text", "json", "yaml"], case_sensitive=False),
    default="text",
    show_default=True,
)
def extract(file: Path, selector: str, output_format: str) -> None:
    """Extract text or Markdown content from structured Markdown."""

    # NOTE: the docstring above doubles as the command's --help text.
    # Only selector errors are re-raised as ``click.ClickException``;
    # the parse call is deliberately left outside the ``try``.
    document = parse_markdown_file(file)
    try:
        pieces = extract_document(document, selector)
    except InvalidQueryError as exc:
        raise click.ClickException(str(exc)) from exc

    # Insertion order of the keys is the order they are emitted in.
    _emit_extract(
        {
            "selector": selector,
            "document_path": str(file),
            "count": len(pieces),
            "items": pieces,
        },
        output_format,
    )
|
||||
|
||||
|
||||
@main.command()
|
||||
@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
|
||||
@click.option(
|
||||
@@ -214,5 +269,28 @@ def _emit_metrics(data: dict, output_format: str) -> None:
|
||||
)
|
||||
|
||||
|
||||
def _emit_query(data: dict, output_format: str) -> None:
    """Print a query result in the requested output format.

    ``"json"`` and ``"yaml"`` dump ``data`` unchanged. Any other value
    produces a plain-text listing: a count line, then one
    ``- <kind> <path>[:<line>]`` entry per match, followed by the first
    line of the match text when the match has a non-empty ``text``.
    """
    if output_format == "json":
        click.echo(json.dumps(data, indent=2, ensure_ascii=False))
        return
    if output_format == "yaml":
        click.echo(yaml.safe_dump(data, sort_keys=False))
        return

    click.echo(f"{data['count']} match(es)")
    for entry in data["matches"]:
        line_no = entry.get("line")
        # A missing (or falsy) line number simply omits the ``:line`` suffix.
        suffix = f":{line_no}" if line_no else ""
        click.echo(f"- {entry['kind']} {entry['path']}{suffix}")
        preview = entry.get("text")
        if preview:
            click.echo(f" {preview.splitlines()[0]}")
|
||||
|
||||
|
||||
def _emit_extract(data: dict, output_format: str) -> None:
    """Print an extraction result in the requested output format.

    ``"json"`` and ``"yaml"`` dump the whole ``data`` mapping. Any other
    value prints only ``data["items"]``, separated by blank lines.
    """
    if output_format == "json":
        rendered = json.dumps(data, indent=2, ensure_ascii=False)
    elif output_format == "yaml":
        rendered = yaml.safe_dump(data, sort_keys=False)
    else:
        rendered = "\n\n".join(data["items"])
    click.echo(rendered)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
15
src/markitect_tool/query/__init__.py
Normal file
15
src/markitect_tool/query/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
"""Query and extraction helpers for parsed Markdown documents."""
|
||||
|
||||
from markitect_tool.query.engine import (
|
||||
InvalidQueryError,
|
||||
QueryMatch,
|
||||
extract_document,
|
||||
query_document,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"InvalidQueryError",
|
||||
"QueryMatch",
|
||||
"extract_document",
|
||||
"query_document",
|
||||
]
|
||||
242
src/markitect_tool/query/engine.py
Normal file
242
src/markitect_tool/query/engine.py
Normal file
@@ -0,0 +1,242 @@
|
||||
"""Small selector engine for structured Markdown documents."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from markitect_tool.contract import collect_metrics
|
||||
from markitect_tool.core import ContentBlock, Document, Heading, Section
|
||||
|
||||
|
||||
class InvalidQueryError(ValueError):
    """Signal that a selector string could not be parsed or evaluated.

    Subclasses ``ValueError`` so callers that already catch ``ValueError``
    keep working.
    """
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class QueryMatch:
    """A single result produced by evaluating a selector."""

    # Category of the match, e.g. "document", "heading", "section", "block".
    kind: str
    # Dotted/indexed location string built by the query helpers.
    path: str
    # The matched data itself (mapping, scalar, ...).
    value: Any
    # Optional textual rendering of the match.
    text: str | None = None
    # Optional source line associated with the match.
    line: int | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the match as a plain dict, omitting fields that are ``None``.

        Keys appear in declaration order: kind, path, value, text, line.
        """
        pairs = (
            ("kind", self.kind),
            ("path", self.path),
            ("value", self.value),
            ("text", self.text),
            ("line", self.line),
        )
        return {name: item for name, item in pairs if item is not None}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class _Selector:
|
||||
target: str
|
||||
path: list[str]
|
||||
filters: dict[str, str]
|
||||
|
||||
|
||||
def query_document(document: Document, selector: str) -> list[QueryMatch]:
|
||||
"""Query a parsed document with a small Markitect selector."""
|
||||
|
||||
parsed = _parse_selector(selector)
|
||||
if parsed.target in {"document", "$", "."}:
|
||||
return [QueryMatch(kind="document", path="$", value=document.to_dict())]
|
||||
if parsed.target == "frontmatter":
|
||||
return _query_mapping(document.frontmatter, parsed.path, "frontmatter", "$.frontmatter")
|
||||
if parsed.target == "headings":
|
||||
return _query_headings(document.headings, parsed.filters)
|
||||
if parsed.target == "sections":
|
||||
return _query_sections(document.sections, parsed.filters)
|
||||
if parsed.target == "blocks":
|
||||
return _query_blocks(document.blocks, parsed.filters)
|
||||
if parsed.target == "metrics":
|
||||
return _query_mapping(collect_metrics(document).to_dict(), parsed.path, "metrics", "$.metrics")
|
||||
raise InvalidQueryError(f"Unsupported selector target `{parsed.target}`")
|
||||
|
||||
|
||||
def extract_document(document: Document, selector: str) -> list[str]:
    """Return the textual content of every match for ``selector``.

    A match contributes its ``text`` when set; otherwise a string value is
    used as-is and an int/float/bool value is stringified. Matches with
    any other kind of value contribute nothing.
    """
    pieces: list[str] = []
    for hit in query_document(document, selector):
        if hit.text is not None:
            piece = hit.text
        elif isinstance(hit.value, str):
            piece = hit.value
        elif isinstance(hit.value, int | float | bool):
            piece = str(hit.value)
        else:
            # Structured values (mappings, lists, None) have no text form.
            continue
        pieces.append(piece)
    return pieces
|
||||
|
||||
|
||||
def _parse_selector(selector: str) -> _Selector:
    """Split a selector string into target, dotted path and filters.

    Accepted shape is ``target.path.parts[filters]`` where the bracketed
    filter suffix is optional and, when present, must be the single
    trailing ``[...]`` group.

    Raises:
        InvalidQueryError: if the selector is blank or its brackets are
            unbalanced, repeated, or not at the end.
    """
    text = selector.strip()
    if not text:
        raise InvalidQueryError("Selector cannot be empty")

    if "[" in text or "]" in text:
        well_formed = (
            text.endswith("]") and text.count("[") == 1 and text.count("]") == 1
        )
        if not well_formed:
            raise InvalidQueryError(f"Invalid selector filter syntax `{selector}`")
        # Drop the closing bracket, then cut at the (only) opening one.
        head, _, filter_body = text[:-1].partition("[")
        conditions = _parse_filters(filter_body)
    else:
        head = text
        conditions = {}

    # Empty segments (leading/trailing/double dots) are ignored.
    segments = [segment for segment in head.split(".") if segment]
    if segments:
        return _Selector(target=segments[0], path=segments[1:], filters=conditions)
    return _Selector(target="document", path=[], filters=conditions)
|
||||
|
||||
|
||||
def _parse_filters(raw_filter: str) -> dict[str, str]:
    """Parse the inside of a ``[...]`` filter group into a mapping.

    Filters are comma-separated ``key=value`` (exact) or ``key~=value``
    (fuzzy) pairs. Fuzzy filters are stored under ``"<key>~"``. Values are
    stripped of surrounding whitespace and of one pair of matching quotes.
    Empty parts (e.g. a trailing comma) are skipped.

    Raises:
        InvalidQueryError: if a part has no ``=`` or has an empty key.
    """
    filters: dict[str, str] = {}
    for raw_part in raw_filter.split(","):
        part = raw_part.strip()
        if not part:
            continue

        # Split on the FIRST "=" so that a "~=" occurring inside the value
        # (``title=a~=b``) is not mistaken for the operator.
        key, separator, value = part.partition("=")
        if not separator:
            raise InvalidQueryError(f"Invalid filter `{part}`")

        # "~=" is the operator only when "~" directly precedes that "=".
        is_fuzzy = key.endswith("~")
        if is_fuzzy:
            key = key[:-1]
        key = key.strip()

        # Validate the bare key BEFORE re-appending the fuzzy marker;
        # otherwise ``~=value`` would slip through with the key "~".
        if not key:
            raise InvalidQueryError(f"Invalid filter `{part}`")
        if is_fuzzy:
            key = f"{key}~"

        filters[key] = _strip_quotes(value.strip())
    return filters
|
||||
|
||||
|
||||
def _query_mapping(
    mapping: dict[str, Any],
    path: list[str],
    kind: str,
    root_path: str,
) -> list[QueryMatch]:
    """Look up a dotted ``path`` inside ``mapping``.

    With an empty path the whole mapping is returned as one match (without
    a text rendering). Otherwise each path part must be a key of a nested
    dict; a missing key or a non-dict intermediate yields no matches.
    """
    if not path:
        return [QueryMatch(kind=kind, path=root_path, value=mapping)]

    node: Any = mapping
    for key in path:
        if not (isinstance(node, dict) and key in node):
            return []
        node = node[key]

    # The reported path is the root followed by every traversed key.
    full_path = ".".join([root_path, *path])
    return [QueryMatch(kind=kind, path=full_path, value=node, text=_text_value(node))]
|
||||
|
||||
|
||||
def _query_headings(headings: list[Heading], filters: dict[str, str]) -> list[QueryMatch]:
    """Return one match per heading that satisfies ``filters``.

    The match path uses the heading's index in ``headings``; the text is
    the heading rendered with its ``#`` prefix.
    """
    return [
        QueryMatch(
            kind="heading",
            path=f"$.headings[{position}]",
            value=item.to_dict(),
            text=f"{'#' * item.level} {item.text}",
            line=item.line,
        )
        for position, item in enumerate(headings)
        if _match_heading(item, filters)
    ]
|
||||
|
||||
|
||||
def _query_sections(sections: list[Section], filters: dict[str, str]) -> list[QueryMatch]:
    """Return one match per section that satisfies ``filters``.

    The match path uses the section's index in ``sections``; the text is
    the section rendered as Markdown and the line is its heading's line.
    """
    return [
        QueryMatch(
            kind="section",
            path=f"$.sections[{position}]",
            value=item.to_dict(),
            text=_section_markdown(item),
            line=item.heading.line,
        )
        for position, item in enumerate(sections)
        if _match_section(item, filters)
    ]
|
||||
|
||||
|
||||
def _query_blocks(blocks: list[ContentBlock], filters: dict[str, str]) -> list[QueryMatch]:
    """Return one match per content block that satisfies ``filters``.

    The match path uses the block's index in ``blocks``; the text is the
    block's own text and the line is its ``line_start``.
    """
    return [
        QueryMatch(
            kind="block",
            path=f"$.blocks[{position}]",
            value=item.to_dict(),
            text=item.text,
            line=item.line_start,
        )
        for position, item in enumerate(blocks)
        if _match_block(item, filters)
    ]
|
||||
|
||||
|
||||
def _match_heading(heading: Heading, filters: dict[str, str]) -> bool:
|
||||
for key, expected in filters.items():
|
||||
if key == "level" and str(heading.level) != expected:
|
||||
return False
|
||||
if key in {"text", "heading", "title"} and heading.text != expected:
|
||||
return False
|
||||
if key in {"text~", "heading~", "title~"} and expected.lower() not in heading.text.lower():
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _match_section(section: Section, filters: dict[str, str]) -> bool:
|
||||
section_text = "\n".join(block.text for block in section.blocks if block.text)
|
||||
for key, expected in filters.items():
|
||||
if key == "level" and str(section.heading.level) != expected:
|
||||
return False
|
||||
if key in {"heading", "title", "text"} and section.heading.text != expected:
|
||||
return False
|
||||
if key in {"heading~", "title~", "text~"} and expected.lower() not in section.heading.text.lower():
|
||||
return False
|
||||
if key == "contains" and expected not in section_text:
|
||||
return False
|
||||
if key == "contains~" and expected.lower() not in section_text.lower():
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _match_block(block: ContentBlock, filters: dict[str, str]) -> bool:
|
||||
for key, expected in filters.items():
|
||||
if key == "type" and block.type != expected:
|
||||
return False
|
||||
if key == "contains" and expected not in block.text:
|
||||
return False
|
||||
if key == "contains~" and expected.lower() not in block.text.lower():
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _section_markdown(section: Section) -> str:
|
||||
lines = [f"{'#' * section.heading.level} {section.heading.text}"]
|
||||
for block in section.blocks:
|
||||
if block.text:
|
||||
lines.extend(["", block.text])
|
||||
return "\n".join(lines).strip()
|
||||
|
||||
|
||||
def _strip_quotes(value: str) -> str:
|
||||
if len(value) >= 2 and value[0] == value[-1] and value[0] in {'"', "'"}:
|
||||
return value[1:-1]
|
||||
return value
|
||||
|
||||
|
||||
def _text_value(value: Any) -> str | None:
|
||||
if isinstance(value, str):
|
||||
return value
|
||||
if isinstance(value, int | float | bool):
|
||||
return str(value)
|
||||
return None
|
||||
Reference in New Issue
Block a user