Add canon reset and reingest guardrails

This commit is contained in:
2026-05-23 14:52:57 +02:00
parent 653411ffb8
commit 9c22d3e0df
12 changed files with 634 additions and 5 deletions

View File

@@ -90,6 +90,7 @@ NODE_KIND_CANON_MAP: dict[str, CanonNodeMapping] = {
"DeploymentService": CanonNodeMapping("deployment", "model/devsecops", "direct"),
"DomainName": CanonNodeMapping("endpoint", "model/network", "partial"),
"ExternalLibrary": CanonNodeMapping("software-system", "model/landscape", "partial"),
"FabricRegistryEntry": CanonNodeMapping("source-repository", "model/devsecops", "partial"),
"InterfaceDeclaration": CanonNodeMapping("endpoint", "model/network", "partial"),
"Library": CanonNodeMapping("software-system", "model/landscape", "partial"),
"Lockfile": CanonNodeMapping("evidence", "model/observability", "partial"),

View File

@@ -20,6 +20,7 @@ from .graph import FabricGraph, build_graph
from .graph_explorer import fabric_graph_explorer_payload
from .llm_extraction import LLMExtractionConfig
from .reconciliation import reconcile_discovery_snapshots
from .registry import RESET_CONFIRMATION_TOKEN
from .scanner import EXTRACTOR_VERSION, ScanOptions, scan_repo
from .validation import validate_roots
@@ -240,6 +241,26 @@ def build_parser() -> argparse.ArgumentParser:
accept_discovery.add_argument("--accept-review-state", action="append", default=None)
accept_discovery.add_argument("--commit", default=None)
accept_discovery.add_argument("--json", action="store_true", help="Print the raw accept response.")
export_archive = registry_sub.add_parser(
"export-reset-archive",
help="Export registry graph/discovery data before a guarded reset.",
)
export_archive.add_argument("output", type=Path)
export_archive.add_argument("--registry-url", default="http://127.0.0.1:8765")
export_archive.add_argument("--overwrite", action="store_true", help="Overwrite an existing archive file.")
export_archive.add_argument("--json", action="store_true", help="Print archive metadata.")
reset_graph = registry_sub.add_parser(
"reset-graph-data",
help="Export an archive, then reset registry graph/discovery data with an explicit confirmation token.",
)
reset_graph.add_argument("--registry-url", default="http://127.0.0.1:8765")
reset_graph.add_argument("--archive", type=Path, required=True, help="Archive JSON file to write before reset.")
reset_graph.add_argument("--overwrite-archive", action="store_true", help="Overwrite an existing archive file.")
reset_graph.add_argument("--confirm", required=True, help=f"Must equal {RESET_CONFIRMATION_TOKEN}.")
reset_graph.add_argument("--reason", required=True)
reset_graph.add_argument("--json", action="store_true", help="Print the raw reset response.")
return parser
@@ -311,6 +332,10 @@ def main(argv: list[str] | None = None) -> int:
return _registry_ingest_discovery(args)
if args.registry_command == "accept-discovery":
return _registry_accept_discovery(args)
if args.registry_command == "export-reset-archive":
return _registry_export_reset_archive(args)
if args.registry_command == "reset-graph-data":
return _registry_reset_graph_data(args)
parser.error(f"unknown command {args.command!r}")
return 2
@@ -1011,7 +1036,12 @@ def _scan_manifest_exit_code(summary: dict[str, Any], args: argparse.Namespace)
def _manifest_discovery_snapshot_path(base_dir: Path, slug: str, profile: str) -> Path:
return base_dir.resolve() / f"{_slugify(slug)}-{_slugify(profile)}.discovery.json"
slug_part = _slugify(slug)
raw_slug_part = slug.strip().lower()
if slug_part != raw_slug_part:
fingerprint = hashlib.sha256(slug.encode("utf-8")).hexdigest()[:8]
slug_part = f"{slug_part}-{fingerprint}"
return base_dir.resolve() / f"{slug_part}-{_slugify(profile)}.discovery.json"
def _candidate_counts(snapshot: dict[str, Any]) -> dict[str, int]:
@@ -1285,6 +1315,57 @@ def _registry_accept_discovery(args: argparse.Namespace) -> int:
return 0
def _registry_export_reset_archive(args: argparse.Namespace) -> int:
try:
archive = _registry_get_checked(args.registry_url, "/exports/reset-archive")
archive_sha256 = _write_json_archive(args.output, archive, overwrite=args.overwrite)
except (RegistryRequestError, OSError) as exc:
print(f"ERROR {exc}", file=sys.stderr)
return 1
metadata = {
"archive": str(args.output),
"archive_sha256": archive_sha256,
"counts": archive.get("counts", {}),
}
if args.json:
print(json.dumps(metadata, indent=2, sort_keys=True))
else:
print(f"wrote reset archive {args.output}")
print(f"archive sha256 {archive_sha256}")
return 0
def _registry_reset_graph_data(args: argparse.Namespace) -> int:
if args.confirm != RESET_CONFIRMATION_TOKEN:
print(f"ERROR --confirm must equal {RESET_CONFIRMATION_TOKEN}", file=sys.stderr)
return 1
try:
archive = _registry_get_checked(args.registry_url, "/exports/reset-archive")
archive_sha256 = _write_json_archive(args.archive, archive, overwrite=args.overwrite_archive)
result = _registry_post_checked(
args.registry_url,
"/admin/reset-graph-data",
{
"confirm": args.confirm,
"reason": args.reason,
"archive_path": str(args.archive),
"archive_sha256": archive_sha256,
},
)
except (RegistryRequestError, OSError) as exc:
print(f"ERROR {exc}", file=sys.stderr)
return 1
if args.json:
print(json.dumps(result, indent=2, sort_keys=True))
else:
print(f"wrote reset archive {args.archive}")
print(f"archive sha256 {archive_sha256}")
print(f"reset event {result['id']} recorded")
print(f"dropped {json.dumps(result.get('dropped_counts', {}), sort_keys=True)}")
print(f"preserved {result.get('repositories_preserved', 0)} repository registration(s)")
return 0
def _scan_repo(args: argparse.Namespace) -> int:
snapshot = scan_repo(
ScanOptions(
@@ -1369,6 +1450,15 @@ class RegistryRequestError(Exception):
self.status_code = status_code
def _write_json_archive(path: Path, archive: dict[str, object], *, overwrite: bool) -> str:
if path.exists() and not overwrite:
raise OSError(f"archive already exists: {path} (use --overwrite or --overwrite-archive)")
path.parent.mkdir(parents=True, exist_ok=True)
data = json.dumps(archive, indent=2, sort_keys=True).encode("utf-8")
path.write_bytes(data)
return hashlib.sha256(data).hexdigest()
def _registry_post(registry_url: str, path: str, payload: dict[str, object]) -> dict[str, object]:
try:
return _registry_post_checked(registry_url, path, payload)

View File

@@ -6,6 +6,7 @@ from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Protocol
from .canon import edge_canon_mapping, evidence_state_for, node_canon_mapping
from .discovery import (
attribute_stable_key,
discovery_stable_key,
@@ -183,12 +184,22 @@ class LocalFabricRegistryConnector:
}
repo_key = _repository_key(context.snapshot, context.repo_slug)
entry_key = discovery_stable_key(context.repo_slug, "FabricRegistryEntry", context.repo_slug)
node_mapping = node_canon_mapping("FabricRegistryEntry")
node = {
"stable_key": entry_key,
"kind": "FabricRegistryEntry",
"label": str(match.get("name") or context.repo_slug),
"repo": context.repo_slug,
"domain": str(match.get("domain") or ""),
"canon_category": node_mapping.category,
"canon_anchor": node_mapping.canon_anchor,
"mapping_fit": node_mapping.fit,
"evidence_state": evidence_state_for(
origin="registry",
source_kind="fabric_registry",
review_state="candidate",
confidence=0.9,
),
"aliases": _unique_strings([context.repo_slug, match.get("name")]),
"attributes": {
"registry_slug": context.repo_slug,
@@ -208,9 +219,20 @@ class LocalFabricRegistryConnector:
"provenance": [provenance],
"source_anchors": [anchor],
}
edge_mapping = edge_canon_mapping("cataloged_as")
edge = {
"stable_key": relationship_stable_key(repo_key, "cataloged_as", entry_key, evidence_scope=scope["id"]),
"edge_type": "cataloged_as",
"canonical_type": edge_mapping.canonical_type,
"canon_anchor": edge_mapping.canon_anchor,
"mapping_fit": edge_mapping.fit,
"display_only": edge_mapping.display_only,
"evidence_state": evidence_state_for(
origin="registry",
source_kind="fabric_registry",
review_state="candidate",
confidence=0.9,
),
"source_key": repo_key,
"target_key": entry_key,
"origin": "registry",

View File

@@ -9,6 +9,7 @@ from typing import Any, Iterable
from jsonschema import ValidationError
from .canon import edge_canon_mapping, evidence_state_for, node_canon_mapping
from .discovery import (
attribute_stable_key,
discovery_stable_key,
@@ -270,12 +271,22 @@ def project_llm_output(
rationale = str(raw_node.get("rationale") or "").strip()
if rationale:
provenance["rationale"] = rationale
canon_mapping = node_canon_mapping(kind)
candidates["nodes"].append(
{
"stable_key": stable_key,
"kind": kind,
"label": label,
"repo": repo_slug,
"canon_category": canon_mapping.category,
"canon_anchor": canon_mapping.canon_anchor,
"mapping_fit": canon_mapping.fit,
"evidence_state": evidence_state_for(
origin="llm",
source_kind="llm",
review_state="needs_review",
confidence=confidence,
),
"aliases": _strings(raw_node.get("aliases")) + [label],
"attributes": _json_object(raw_node.get("attributes")) if isinstance(raw_node.get("attributes"), dict) else {},
"origin": "llm",
@@ -304,10 +315,21 @@ def project_llm_output(
rationale = str(raw_edge.get("rationale") or "").strip()
if rationale:
provenance["rationale"] = rationale
canon_mapping = edge_canon_mapping(edge_type)
candidates["edges"].append(
{
"stable_key": relationship_stable_key(source_key, edge_type, target_key, evidence_scope=scope["id"]),
"edge_type": edge_type,
"canonical_type": canon_mapping.canonical_type,
"canon_anchor": canon_mapping.canon_anchor,
"mapping_fit": canon_mapping.fit,
"display_only": canon_mapping.display_only,
"evidence_state": evidence_state_for(
origin="llm",
source_kind="llm",
review_state="needs_review",
confidence=confidence,
),
"source_key": source_key,
"target_key": target_key,
"attributes": _json_object(raw_edge.get("attributes")) if isinstance(raw_edge.get("attributes"), dict) else {},

View File

@@ -9,10 +9,12 @@ from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from .canon import edge_canon_mapping, node_canon_mapping
from .canon import DISPLAY_ONLY_EDGE_TYPES, edge_canon_mapping, node_canon_mapping
from .loader import repo_root
from .schema_validation import draft202012_validator
RESET_CONFIRMATION_TOKEN = "RESET-RAILIANCE-FABRIC-GRAPH-DATA"
class RegistryError(Exception):
def __init__(self, message: str, status_code: int = 400) -> None:
@@ -108,6 +110,15 @@ class RegistryStore:
create index if not exists idx_libraries_purl
on libraries(purl);
create table if not exists registry_reset_events (
id integer primary key autoincrement,
created_at text not null,
reason text not null,
archive_path text,
archive_sha256 text not null,
dropped_counts_json text not null
);
"""
)
@@ -726,6 +737,7 @@ class RegistryStore:
"discovery_snapshots": db.execute("select count(*) from discovery_snapshots").fetchone()[0],
"artifacts": db.execute("select count(*) from artifacts").fetchone()[0],
"libraries": db.execute("select count(*) from libraries").fetchone()[0],
"registry_reset_events": db.execute("select count(*) from registry_reset_events").fetchone()[0],
}
latest = [
{
@@ -748,6 +760,120 @@ class RegistryStore:
"latest_discovery_snapshots": latest_discovery,
}
def reset_archive(self) -> dict[str, Any]:
with self._connect() as db:
snapshot_rows = db.execute(
"""
select id, repo_slug, commit_sha, generated_at, graph_json, created_at
from snapshots
order by repo_slug, id
"""
).fetchall()
discovery_rows = db.execute(
"""
select id, repo_slug, commit_sha, profile, generated_at,
snapshot_json, accepted_graph_snapshot_id, created_at
from discovery_snapshots
order by repo_slug, profile, id
"""
).fetchall()
artifact_rows = db.execute(
"""
select id, repo_slug, target_id, target_kind, artifact_type, name, uri,
media_type, digest, version, metadata_json, created_at
from artifacts
order by repo_slug, id
"""
).fetchall()
library_rows = db.execute(
"""
select id, repo_slug, bom_ref, component_type, name, version, purl, scope,
licenses_json, hashes_json, metadata_json, created_at
from libraries
order by repo_slug, id
"""
).fetchall()
reset_rows = db.execute(
"""
select id, created_at, reason, archive_path, archive_sha256, dropped_counts_json
from registry_reset_events
order by id
"""
).fetchall()
return {
"apiVersion": "railiance.fabric/v1alpha1",
"kind": "RegistryResetArchive",
"generated_at": _utc_now(),
"source": {"database": str(self.path)},
"counts": self.status()["counts"],
"combined_graph": self.combined_graph(),
"repositories": self.list_repositories(),
"snapshots": [_snapshot_dict(row) for row in snapshot_rows],
"discovery_snapshots": [_discovery_snapshot_dict(row) for row in discovery_rows],
"artifacts": [_artifact_dict(row) for row in artifact_rows],
"libraries": [_library_dict(row) for row in library_rows],
"reset_events": [_reset_event_dict(row) for row in reset_rows],
"rollback": {
"limits": (
"This archive is a JSON evidence bundle, not an automatic SQLite restore. "
"Use it to inspect and manually reinsert prior registry graph data if needed."
),
"post_reset_source_of_truth": (
"Repository registrations remain in the registry. Graph snapshots, discovery "
"snapshots, artifacts, and library inventory must be recreated by reingesting "
"registered/local repositories with the canon-aligned scanner and graph model."
),
},
}
def reset_graph_data(self, payload: dict[str, Any]) -> dict[str, Any]:
confirm = _required_text(payload, "confirm")
if confirm != RESET_CONFIRMATION_TOKEN:
raise RegistryError(
f"reset requires confirm={RESET_CONFIRMATION_TOKEN!r}",
400,
)
reason = _required_text(payload, "reason")
archive_sha256 = _required_text(payload, "archive_sha256")
archive_path = _optional_text(payload, "archive_path")
now = _utc_now()
with self._connect() as db:
counts = _resettable_counts(db)
cursor = db.execute(
"""
insert into registry_reset_events (
created_at, reason, archive_path, archive_sha256, dropped_counts_json
)
values (?, ?, ?, ?, ?)
""",
(now, reason, archive_path, archive_sha256, json.dumps(counts, sort_keys=True)),
)
event_id = int(cursor.lastrowid)
db.execute("delete from discovery_snapshots")
db.execute("delete from snapshots")
db.execute("delete from artifacts")
db.execute("delete from libraries")
event = self.get_reset_event(event_id)
return {
**event,
"confirm": confirm,
"repositories_preserved": len(self.list_repositories()),
}
def get_reset_event(self, event_id: int) -> dict[str, Any]:
with self._connect() as db:
row = db.execute(
"""
select id, created_at, reason, archive_path, archive_sha256, dropped_counts_json
from registry_reset_events
where id = ?
""",
(event_id,),
).fetchone()
if row is None:
raise RegistryError(f"reset event not found: {event_id}", 404)
return _reset_event_dict(row)
def latest_discovery_snapshots(self, profile: str | None = None) -> list[dict[str, Any]]:
params: list[Any] = []
where = ""
@@ -794,6 +920,9 @@ def validate_graph_export(graph: dict[str, Any]) -> None:
error = errors[0]
location = ".".join(str(part) for part in error.path) or "<root>"
raise RegistryError(f"invalid FabricGraphExport at {location}: {error.message}")
canon_errors = _graph_canon_metadata_errors(graph)
if canon_errors:
raise RegistryError(f"invalid FabricGraphExport canon metadata: {canon_errors[0]}")
def validate_discovery_snapshot(snapshot: dict[str, Any]) -> None:
@@ -804,6 +933,88 @@ def validate_discovery_snapshot(snapshot: dict[str, Any]) -> None:
error = errors[0]
location = ".".join(str(part) for part in error.path) or "<root>"
raise RegistryError(f"invalid FabricDiscoverySnapshot at {location}: {error.message}")
canon_errors = _discovery_canon_metadata_errors(snapshot)
if canon_errors:
raise RegistryError(f"invalid FabricDiscoverySnapshot canon metadata: {canon_errors[0]}")
def _graph_canon_metadata_errors(graph: dict[str, Any]) -> list[str]:
errors: list[str] = []
for index, node in enumerate(graph.get("nodes", [])):
if not isinstance(node, dict):
continue
if _has_any(node, ("canon_category", "canon_anchor", "mapping_fit", "evidence_state")):
_require_fields(
errors,
f"nodes[{index}]",
node,
("canon_category", "mapping_fit", "evidence_state"),
)
for index, edge in enumerate(graph.get("edges", [])):
if not isinstance(edge, dict):
continue
_validate_edge_canon_metadata(errors, f"edges[{index}]", edge, type_field="type")
return errors
def _discovery_canon_metadata_errors(snapshot: dict[str, Any]) -> list[str]:
errors: list[str] = []
candidates = snapshot.get("candidates") if isinstance(snapshot.get("candidates"), dict) else {}
for index, node in enumerate(candidates.get("nodes", [])):
if not isinstance(node, dict):
continue
_require_fields(
errors,
f"candidates.nodes[{index}]",
node,
("canon_category", "mapping_fit", "evidence_state"),
)
for index, edge in enumerate(candidates.get("edges", [])):
if not isinstance(edge, dict):
continue
_require_fields(
errors,
f"candidates.edges[{index}]",
edge,
("mapping_fit", "display_only", "evidence_state"),
)
_validate_edge_canon_metadata(errors, f"candidates.edges[{index}]", edge, type_field="edge_type")
return errors
def _validate_edge_canon_metadata(
errors: list[str],
path: str,
edge: dict[str, Any],
*,
type_field: str,
) -> None:
edge_type = str(edge.get(type_field) or "")
has_canon_fields = _has_any(
edge,
("canonical_type", "canon_anchor", "mapping_fit", "display_only", "evidence_state"),
)
if has_canon_fields:
_require_fields(errors, path, edge, ("mapping_fit", "display_only", "evidence_state"))
if edge_type in DISPLAY_ONLY_EDGE_TYPES and edge.get("display_only") is not True:
errors.append(f"{path} uses display-only edge type {edge_type!r} without display_only=true")
if edge.get("display_only") is True and edge_type and not has_canon_fields:
errors.append(f"{path} is display-only but lacks canon metadata")
def _has_any(item: dict[str, Any], fields: tuple[str, ...]) -> bool:
return any(field in item for field in fields)
def _require_fields(
errors: list[str],
path: str,
item: dict[str, Any],
fields: tuple[str, ...],
) -> None:
for field in fields:
if field not in item or item.get(field) in (None, ""):
errors.append(f"{path} missing required canon metadata field {field!r}")
def providers(graph: dict[str, Any], capability: str) -> list[dict[str, Any]]:
@@ -1269,6 +1480,26 @@ def _row_dict(row: sqlite3.Row) -> dict[str, Any]:
return {key: row[key] for key in row.keys()}
def _resettable_counts(db: sqlite3.Connection) -> dict[str, int]:
return {
"snapshots": int(db.execute("select count(*) from snapshots").fetchone()[0]),
"discovery_snapshots": int(db.execute("select count(*) from discovery_snapshots").fetchone()[0]),
"artifacts": int(db.execute("select count(*) from artifacts").fetchone()[0]),
"libraries": int(db.execute("select count(*) from libraries").fetchone()[0]),
}
def _reset_event_dict(row: sqlite3.Row) -> dict[str, Any]:
return {
"id": row["id"],
"created_at": row["created_at"],
"reason": row["reason"],
"archive_path": row["archive_path"],
"archive_sha256": row["archive_sha256"],
"dropped_counts": json.loads(row["dropped_counts_json"]),
}
def _artifact_dict(row: sqlite3.Row) -> dict[str, Any]:
return {
"id": row["id"],

View File

@@ -107,6 +107,8 @@ class RegistryHandler(BaseHTTPRequestHandler):
return HTTPStatus.OK, {"lines": dependency_path_lines(self.store.combined_graph(), _query_one(query, "service_id"))}
if parts == ["exports", "state-hub"]:
return HTTPStatus.OK, self.store.combined_graph()
if parts == ["exports", "reset-archive"]:
return HTTPStatus.OK, self.store.reset_archive()
if parts == ["exports", "backstage"]:
return HTTPStatus.OK, backstage_projection(self.store.combined_graph())
if parts == ["exports", "xregistry"]:
@@ -159,6 +161,8 @@ class RegistryHandler(BaseHTTPRequestHandler):
)
if len(parts) == 4 and parts[0] == "repositories" and parts[2] == "libraries" and parts[3] == "cyclonedx":
return HTTPStatus.CREATED, self.store.ingest_cyclonedx(parts[1], body)
if parts == ["admin", "reset-graph-data"]:
return HTTPStatus.CREATED, self.store.reset_graph_data(body)
if parts == ["artifacts"]:
return HTTPStatus.CREATED, self.store.add_artifact(body)
raise RegistryError(f"route not found: {path}", 404)