generated from coulomb/repo-seed
Optional JSONPath query/extract support, FTS5 section/block search, and mkt cache query and search. The local SQLite backend now supports parsed snapshot persistence, incremental refresh, cached querying, and ranked full-text search.
This commit is contained in:
@@ -36,6 +36,7 @@ from markitect_tool.backend.local_store import (
|
||||
DEFAULT_LOCAL_INDEX_PATH,
|
||||
LOCAL_INDEX_SCHEMA_VERSION,
|
||||
LocalIndexBuildResult,
|
||||
LocalSearchResult,
|
||||
LocalSnapshotStore,
|
||||
local_index_path_for,
|
||||
)
|
||||
@@ -70,6 +71,7 @@ __all__ = [
|
||||
"DEFAULT_LOCAL_INDEX_PATH",
|
||||
"LOCAL_INDEX_SCHEMA_VERSION",
|
||||
"LocalIndexBuildResult",
|
||||
"LocalSearchResult",
|
||||
"LocalSnapshotStore",
|
||||
"local_index_path_for",
|
||||
]
|
||||
|
||||
@@ -56,6 +56,24 @@ class LocalIndexBuildResult:
|
||||
return data
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class LocalSearchResult:
|
||||
"""One FTS search match from the local index."""
|
||||
|
||||
path: str
|
||||
snapshot_id: str
|
||||
unit_kind: str
|
||||
unit_index: int
|
||||
heading: str | None
|
||||
text: str
|
||||
rank: float
|
||||
line_start: int | None = None
|
||||
line_end: int | None = None
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
return {key: value for key, value in asdict(self).items() if value is not None}
|
||||
|
||||
|
||||
class LocalSnapshotStore:
|
||||
"""SQLite-backed local snapshot store for parsed Markdown documents."""
|
||||
|
||||
@@ -217,6 +235,7 @@ class LocalSnapshotStore:
|
||||
return
|
||||
with self._connect() as conn:
|
||||
_create_schema(conn)
|
||||
conn.execute("delete from search_units where path = ?", (path,))
|
||||
conn.execute("delete from blocks where path = ?", (path,))
|
||||
conn.execute("delete from sections where path = ?", (path,))
|
||||
conn.execute("delete from headings where path = ?", (path,))
|
||||
@@ -236,6 +255,45 @@ class LocalSnapshotStore:
|
||||
raise KeyError(f"No indexed document `{path}`")
|
||||
return json.loads(row["document_json"])
|
||||
|
||||
def search(self, query: str, *, limit: int = 20) -> list[LocalSearchResult]:
    """Run a full-text query over the indexed section and block text.

    Matching uses SQLite FTS5 ``MATCH`` on the ``search_units`` table, and
    rows are ordered by their ``bm25`` value.

    Args:
        query: FTS5 match expression; must contain non-whitespace text.
        limit: Value bound to the SQL ``LIMIT`` clause.

    Returns:
        One ``LocalSearchResult`` per matching row, in rank order, or an
        empty list when the store's path does not exist.

    Raises:
        ValueError: If ``query`` is blank, or if SQLite raises
            ``OperationalError`` while running the query.
    """
    if query.strip() == "":
        raise ValueError("Search query cannot be empty")
    if not self.path.exists():
        # The store's path is missing, so there is nothing to search.
        return []

    statement = """
        select s.path, s.snapshot_id, s.unit_kind, s.unit_index,
        s.heading, s.text, s.line_start, s.line_end,
        bm25(search_units) as rank
        from search_units s
        where search_units match ?
        order by rank
        limit ?
        """
    with self._connect() as conn:
        _create_schema(conn)
        try:
            # Only the query itself is guarded, so schema errors are not
            # misreported as an invalid FTS expression.
            cursor = conn.execute(statement, (query, limit))
            matches = cursor.fetchall()
        except sqlite3.OperationalError as exc:
            raise ValueError(f"Invalid FTS query `{query}`: {exc}") from exc

        hits: list[LocalSearchResult] = []
        for match in matches:
            hits.append(
                LocalSearchResult(
                    path=match["path"],
                    snapshot_id=match["snapshot_id"],
                    unit_kind=match["unit_kind"],
                    unit_index=match["unit_index"],
                    heading=match["heading"],
                    text=match["text"],
                    line_start=match["line_start"],
                    line_end=match["line_end"],
                    rank=float(match["rank"]),
                )
            )
        return hits
|
||||
|
||||
def build(
|
||||
self,
|
||||
paths: list[str | Path],
|
||||
@@ -382,6 +440,16 @@ def _create_schema(conn: sqlite3.Connection) -> None:
|
||||
target_snapshot_id text,
|
||||
metadata_json text not null default '{}'
|
||||
);
|
||||
create virtual table if not exists search_units using fts5(
|
||||
path unindexed,
|
||||
snapshot_id unindexed,
|
||||
unit_kind unindexed,
|
||||
unit_index unindexed,
|
||||
heading,
|
||||
text,
|
||||
line_start unindexed,
|
||||
line_end unindexed
|
||||
);
|
||||
create index if not exists idx_sources_content_hash on sources(content_hash);
|
||||
create index if not exists idx_sources_snapshot_id on sources(snapshot_id);
|
||||
create index if not exists idx_sources_parser on sources(parser, parser_version);
|
||||
@@ -402,6 +470,7 @@ def _replace_document_units(
|
||||
conn.execute("delete from blocks where path = ?", (path,))
|
||||
conn.execute("delete from sections where path = ?", (path,))
|
||||
conn.execute("delete from headings where path = ?", (path,))
|
||||
conn.execute("delete from search_units where path = ?", (path,))
|
||||
for idx, heading in enumerate(document.get("headings", [])):
|
||||
conn.execute(
|
||||
"""
|
||||
@@ -441,6 +510,22 @@ def _replace_document_units(
|
||||
line_end,
|
||||
),
|
||||
)
|
||||
conn.execute(
|
||||
"""
|
||||
insert into search_units(
|
||||
path, snapshot_id, unit_kind, unit_index, heading, text, line_start, line_end
|
||||
) values (?, ?, 'section', ?, ?, ?, ?, ?)
|
||||
""",
|
||||
(
|
||||
path,
|
||||
snapshot_id,
|
||||
idx,
|
||||
str(heading["text"]),
|
||||
text,
|
||||
line_start,
|
||||
line_end,
|
||||
),
|
||||
)
|
||||
for idx, block in enumerate(document.get("blocks", [])):
|
||||
conn.execute(
|
||||
"""
|
||||
@@ -459,6 +544,22 @@ def _replace_document_units(
|
||||
block.get("heading_level"),
|
||||
),
|
||||
)
|
||||
conn.execute(
|
||||
"""
|
||||
insert into search_units(
|
||||
path, snapshot_id, unit_kind, unit_index, heading, text, line_start, line_end
|
||||
) values (?, ?, 'block', ?, ?, ?, ?, ?)
|
||||
""",
|
||||
(
|
||||
path,
|
||||
snapshot_id,
|
||||
idx,
|
||||
None,
|
||||
str(block.get("text", "")),
|
||||
block.get("line_start"),
|
||||
block.get("line_end"),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def _load_dependencies(conn: sqlite3.Connection) -> dict[str, list[DependencyEdge]]:
|
||||
|
||||
Reference in New Issue
Block a user