Improved datamodel and deterministic generation

This commit is contained in:
2026-04-30 01:29:29 +02:00
parent 973d4bbe7c
commit 26e87ab52c
14 changed files with 848 additions and 39 deletions

View File

@@ -51,6 +51,7 @@ class RegistryStore:
self._ensure_repository_scopes_table(connection)
self._ensure_approved_source_ref_columns(connection)
self._ensure_evidence_relationship_columns(connection)
self._ensure_characteristic_classification_columns(connection)
self._ensure_expectation_gaps_table(connection)
def connect(self) -> sqlite3.Connection:
@@ -108,6 +109,60 @@ class RegistryStore:
"""
)
def _ensure_characteristic_classification_columns(
self,
connection: sqlite3.Connection,
) -> None:
defaults = {
"candidate_abilities": "ability",
"approved_abilities": "ability",
"candidate_capabilities": "capability",
"approved_capabilities": "capability",
"candidate_features": "",
"approved_features": "",
}
for table, default_class in defaults.items():
columns = {
row["name"]
for row in connection.execute(f"PRAGMA table_info({table})").fetchall()
}
if "primary_class" not in columns:
connection.execute(
f"ALTER TABLE {table} ADD COLUMN primary_class TEXT NOT NULL DEFAULT '{default_class}'"
)
if "attributes" not in columns:
connection.execute(
f"ALTER TABLE {table} ADD COLUMN attributes TEXT NOT NULL DEFAULT '[]'"
)
for table in ("candidate_abilities", "approved_abilities"):
connection.execute(
f"""
UPDATE {table}
SET primary_class = COALESCE(NULLIF(primary_class, ''), 'ability'),
attributes = COALESCE(NULLIF(attributes, ''), '[]')
WHERE primary_class = '' OR attributes = ''
"""
)
for table in ("candidate_capabilities", "approved_capabilities"):
connection.execute(
f"""
UPDATE {table}
SET primary_class = COALESCE(NULLIF(primary_class, ''), 'capability'),
attributes = COALESCE(NULLIF(attributes, ''), '[]')
WHERE primary_class = '' OR attributes = ''
"""
)
for table in ("candidate_features", "approved_features"):
connection.execute(
f"""
UPDATE {table}
SET primary_class = COALESCE(NULLIF(primary_class, ''), type),
attributes = COALESCE(NULLIF(attributes, ''), json_array(type))
WHERE primary_class = '' OR attributes = ''
"""
)
def _ensure_content_chunks_table(self, connection: sqlite3.Connection) -> None:
connection.execute(
"""
@@ -361,14 +416,17 @@ class RegistryStore:
ability_cursor = connection.execute(
"""
INSERT INTO candidate_abilities
(repository_id, analysis_run_id, name, description, confidence, source_refs)
VALUES (?, ?, ?, ?, ?, ?)
(repository_id, analysis_run_id, name, description, primary_class,
attributes, confidence, source_refs)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""",
(
repository_id,
analysis_run_id,
ability.name,
ability.description,
ability.primary_class or "ability",
self._attributes_to_json(ability.attributes),
ability.confidence,
self._source_refs_to_json(ability.source_refs),
),
@@ -379,8 +437,8 @@ class RegistryStore:
"""
INSERT INTO candidate_capabilities
(repository_id, analysis_run_id, ability_id, name, description,
inputs, outputs, confidence, source_refs)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
inputs, outputs, primary_class, attributes, confidence, source_refs)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
(
repository_id,
@@ -390,6 +448,8 @@ class RegistryStore:
capability.description,
json.dumps(capability.inputs),
json.dumps(capability.outputs),
capability.primary_class or "capability",
self._attributes_to_json(capability.attributes),
capability.confidence,
self._source_refs_to_json(capability.source_refs),
),
@@ -400,8 +460,8 @@ class RegistryStore:
"""
INSERT INTO candidate_features
(repository_id, analysis_run_id, capability_id, name, type,
location, confidence, source_refs)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
primary_class, attributes, location, confidence, source_refs)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
(
repository_id,
@@ -409,6 +469,10 @@ class RegistryStore:
capability_id,
feature.name,
feature.type,
feature.primary_class or feature.type,
self._attributes_to_json(
feature.attributes or [feature.type]
),
feature.location,
feature.confidence,
self._source_refs_to_json(feature.source_refs),
@@ -448,7 +512,8 @@ class RegistryStore:
with self.connect() as connection:
ability_rows = connection.execute(
"""
SELECT id, name, description, confidence, status, source_refs
SELECT id, name, description, primary_class, attributes, confidence,
status, source_refs
FROM candidate_abilities
WHERE repository_id = ? AND analysis_run_id = ?
ORDER BY id
@@ -458,7 +523,7 @@ class RegistryStore:
capability_rows = connection.execute(
"""
SELECT id, ability_id, name, description, inputs, outputs,
confidence, status, source_refs
primary_class, attributes, confidence, status, source_refs
FROM candidate_capabilities
WHERE repository_id = ? AND analysis_run_id = ?
ORDER BY id
@@ -467,8 +532,8 @@ class RegistryStore:
).fetchall()
feature_rows = connection.execute(
"""
SELECT id, capability_id, name, type, location, confidence,
status, source_refs
SELECT id, capability_id, name, type, primary_class, attributes,
location, confidence, status, source_refs
FROM candidate_features
WHERE repository_id = ? AND analysis_run_id = ?
ORDER BY id
@@ -498,6 +563,8 @@ class RegistryStore:
status=row["status"],
source_refs=self._source_refs_from_json(row["source_refs"]),
confidence_label=confidence_label(row["confidence"]),
primary_class=row["primary_class"] or row["type"],
attributes=self._attributes_from_json(row["attributes"]),
)
)
@@ -531,6 +598,8 @@ class RegistryStore:
status=row["status"],
source_refs=self._source_refs_from_json(row["source_refs"]),
confidence_label=confidence_label(row["confidence"]),
primary_class=row["primary_class"] or "capability",
attributes=self._attributes_from_json(row["attributes"]),
features=features_by_capability.get(row["id"], []),
evidence=evidence_by_capability.get(row["id"], []),
)
@@ -545,6 +614,8 @@ class RegistryStore:
status=row["status"],
source_refs=self._source_refs_from_json(row["source_refs"]),
confidence_label=confidence_label(row["confidence"]),
primary_class=row["primary_class"] or "ability",
attributes=self._attributes_from_json(row["attributes"]),
capabilities=capabilities_by_ability.get(row["id"], []),
)
for row in ability_rows
@@ -861,17 +932,22 @@ class RegistryStore:
name: str,
description: str,
confidence: float,
primary_class: str = "ability",
attributes: list[str] | None = None,
) -> None:
with self.connect() as connection:
cursor = connection.execute(
"""
UPDATE candidate_abilities
SET name = ?, description = ?, confidence = ?
SET name = ?, description = ?, primary_class = ?, attributes = ?,
confidence = ?
WHERE id = ? AND repository_id = ? AND analysis_run_id = ?
""",
(
name,
description,
primary_class or "ability",
self._attributes_to_json(attributes or []),
confidence,
candidate_ability_id,
repository_id,
@@ -894,17 +970,22 @@ class RegistryStore:
name: str,
description: str,
confidence: float,
primary_class: str = "capability",
attributes: list[str] | None = None,
) -> None:
with self.connect() as connection:
cursor = connection.execute(
"""
UPDATE candidate_capabilities
SET name = ?, description = ?, confidence = ?
SET name = ?, description = ?, primary_class = ?, attributes = ?,
confidence = ?
WHERE id = ? AND repository_id = ? AND analysis_run_id = ?
""",
(
name,
description,
primary_class or "capability",
self._attributes_to_json(attributes or []),
confidence,
candidate_capability_id,
repository_id,
@@ -918,6 +999,46 @@ class RegistryStore:
f"{repository_id} analysis run {analysis_run_id}"
)
def update_candidate_feature(
    self,
    repository_id: int,
    analysis_run_id: int,
    candidate_feature_id: int,
    *,
    name: str,
    type: str,
    location: str,
    confidence: float,
    primary_class: str | None = None,
    attributes: list[str] | None = None,
) -> None:
    """Overwrite the editable fields of a single candidate feature.

    The row is matched on its id together with the repository and
    analysis run it belongs to.

    Args:
        repository_id: Repository the feature must belong to.
        analysis_run_id: Analysis run the feature must belong to.
        candidate_feature_id: Primary key of the row to update.
        name: New feature name.
        type: New feature type.
        location: New feature location.
        confidence: New confidence value.
        primary_class: New primary class; a falsy value falls back to
            ``type``.
        attributes: New attribute list; a falsy value falls back to
            ``[type]``.

    Raises:
        NotFoundError: If no row matched the three identifiers.
    """
    statement = """
        UPDATE candidate_features
        SET name = ?, type = ?, primary_class = ?, attributes = ?,
            location = ?, confidence = ?
        WHERE id = ? AND repository_id = ? AND analysis_run_id = ?
    """
    with self.connect() as connection:
        new_values = (
            name,
            type,
            primary_class or type,
            self._attributes_to_json(attributes or [type]),
            location,
            confidence,
        )
        row_key = (candidate_feature_id, repository_id, analysis_run_id)
        result = connection.execute(statement, new_values + row_key)
        if result.rowcount != 0:
            return
        # Zero affected rows means the id/repository/run triple matched nothing.
        raise NotFoundError(
            "candidate feature "
            f"{candidate_feature_id} was not found for repository "
            f"{repository_id} analysis run {analysis_run_id}"
        )
def relink_candidate_capability(
self,
repository_id: int,
@@ -1604,15 +1725,24 @@ class RegistryStore:
name: str,
description: str,
confidence: float,
primary_class: str = "ability",
attributes: list[str] | None = None,
) -> int:
with self.connect() as connection:
cursor = connection.execute(
"""
INSERT INTO approved_abilities
(repository_id, name, description, confidence)
VALUES (?, ?, ?, ?)
(repository_id, name, description, primary_class, attributes, confidence)
VALUES (?, ?, ?, ?, ?, ?)
""",
(repository_id, name, description, confidence),
(
repository_id,
name,
description,
primary_class or "ability",
self._attributes_to_json(attributes or []),
confidence,
),
)
return int(cursor.lastrowid)
@@ -1638,6 +1768,8 @@ class RegistryStore:
name: str | None = None,
description: str | None = None,
confidence: float | None = None,
primary_class: str | None = None,
attributes: list[str] | None = None,
) -> None:
self._update_approved_row(
table="approved_abilities",
@@ -1648,6 +1780,12 @@ class RegistryStore:
"name": name,
"description": description,
"confidence": confidence,
"primary_class": primary_class,
"attributes": (
self._attributes_to_json(attributes)
if attributes is not None
else None
),
},
)
@@ -1669,13 +1807,16 @@ class RegistryStore:
inputs: list[str],
outputs: list[str],
confidence: float,
primary_class: str = "capability",
attributes: list[str] | None = None,
) -> int:
with self.connect() as connection:
cursor = connection.execute(
"""
INSERT INTO approved_capabilities
(repository_id, ability_id, name, description, inputs, outputs, confidence)
VALUES (?, ?, ?, ?, ?, ?, ?)
(repository_id, ability_id, name, description, inputs, outputs,
primary_class, attributes, confidence)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
(
repository_id,
@@ -1684,6 +1825,8 @@ class RegistryStore:
description,
json.dumps(inputs),
json.dumps(outputs),
primary_class or "capability",
self._attributes_to_json(attributes or []),
confidence,
),
)
@@ -1713,6 +1856,8 @@ class RegistryStore:
inputs: list[str] | None = None,
outputs: list[str] | None = None,
confidence: float | None = None,
primary_class: str | None = None,
attributes: list[str] | None = None,
) -> None:
self._update_approved_row(
table="approved_capabilities",
@@ -1725,6 +1870,12 @@ class RegistryStore:
"inputs": json.dumps(inputs) if inputs is not None else None,
"outputs": json.dumps(outputs) if outputs is not None else None,
"confidence": confidence,
"primary_class": primary_class,
"attributes": (
self._attributes_to_json(attributes)
if attributes is not None
else None
),
},
)
@@ -1746,19 +1897,24 @@ class RegistryStore:
location: str,
confidence: float,
source_refs: list[SourceReference] | None = None,
primary_class: str | None = None,
attributes: list[str] | None = None,
) -> int:
with self.connect() as connection:
cursor = connection.execute(
"""
INSERT INTO approved_features
(repository_id, capability_id, name, type, location, confidence, source_refs)
VALUES (?, ?, ?, ?, ?, ?, ?)
(repository_id, capability_id, name, type, primary_class, attributes,
location, confidence, source_refs)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
(
repository_id,
capability_id,
name,
type,
primary_class or type,
self._attributes_to_json(attributes or [type]),
location,
confidence,
self._source_refs_to_json(source_refs or []),
@@ -1775,6 +1931,8 @@ class RegistryStore:
type: str | None = None,
location: str | None = None,
confidence: float | None = None,
primary_class: str | None = None,
attributes: list[str] | None = None,
) -> None:
self._update_approved_row(
table="approved_features",
@@ -1784,6 +1942,12 @@ class RegistryStore:
values={
"name": name,
"type": type,
"primary_class": primary_class,
"attributes": (
self._attributes_to_json(attributes)
if attributes is not None
else None
),
"location": location,
"confidence": confidence,
},
@@ -1968,13 +2132,15 @@ class RegistryStore:
ability_cursor = connection.execute(
"""
INSERT INTO approved_abilities
(repository_id, name, description, confidence)
VALUES (?, ?, ?, ?)
(repository_id, name, description, primary_class, attributes, confidence)
VALUES (?, ?, ?, ?, ?, ?)
""",
(
repository_id,
ability.name,
ability.description,
ability.primary_class or "ability",
self._attributes_to_json(ability.attributes),
ability.confidence,
),
)
@@ -1986,8 +2152,8 @@ class RegistryStore:
"""
INSERT INTO approved_capabilities
(repository_id, ability_id, name, description, inputs, outputs,
confidence)
VALUES (?, ?, ?, ?, ?, ?, ?)
primary_class, attributes, confidence)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
(
repository_id,
@@ -1996,6 +2162,8 @@ class RegistryStore:
capability.description,
json.dumps(capability.inputs),
json.dumps(capability.outputs),
capability.primary_class or "capability",
self._attributes_to_json(capability.attributes),
capability.confidence,
),
)
@@ -2006,15 +2174,19 @@ class RegistryStore:
connection.execute(
"""
INSERT INTO approved_features
(repository_id, capability_id, name, type, location,
confidence, source_refs)
VALUES (?, ?, ?, ?, ?, ?, ?)
(repository_id, capability_id, name, type, primary_class,
attributes, location, confidence, source_refs)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
(
repository_id,
approved_capability_id,
feature.name,
feature.type,
feature.primary_class or feature.type,
self._attributes_to_json(
feature.attributes or [feature.type]
),
feature.location,
feature.confidence,
self._source_refs_to_json(feature.source_refs),
@@ -2051,7 +2223,7 @@ class RegistryStore:
with self.connect() as connection:
ability_rows = connection.execute(
"""
SELECT id, name, description, confidence
SELECT id, name, description, primary_class, attributes, confidence
FROM approved_abilities
WHERE repository_id = ?
ORDER BY id
@@ -2060,7 +2232,8 @@ class RegistryStore:
).fetchall()
capability_rows = connection.execute(
"""
SELECT id, ability_id, name, description, inputs, outputs, confidence
SELECT id, ability_id, name, description, inputs, outputs,
primary_class, attributes, confidence
FROM approved_capabilities
WHERE repository_id = ?
ORDER BY id
@@ -2069,7 +2242,8 @@ class RegistryStore:
).fetchall()
feature_rows = connection.execute(
"""
SELECT id, capability_id, name, type, location, confidence, source_refs
SELECT id, capability_id, name, type, primary_class, attributes,
location, confidence, source_refs
FROM approved_features
WHERE repository_id = ?
ORDER BY id
@@ -2098,6 +2272,8 @@ class RegistryStore:
confidence=row["confidence"],
confidence_label=confidence_label(row["confidence"]),
source_refs=self._source_refs_from_json(row["source_refs"]),
primary_class=row["primary_class"] or row["type"],
attributes=self._attributes_from_json(row["attributes"]),
)
)
@@ -2128,6 +2304,8 @@ class RegistryStore:
outputs=json.loads(row["outputs"]),
confidence=row["confidence"],
confidence_label=confidence_label(row["confidence"]),
primary_class=row["primary_class"] or "capability",
attributes=self._attributes_from_json(row["attributes"]),
features=features_by_capability.get(row["id"], []),
evidence=evidence_by_capability.get(row["id"], []),
)
@@ -2140,6 +2318,8 @@ class RegistryStore:
description=row["description"],
confidence=row["confidence"],
confidence_label=confidence_label(row["confidence"]),
primary_class=row["primary_class"] or "ability",
attributes=self._attributes_from_json(row["attributes"]),
capabilities=capabilities_by_ability.get(row["id"], []),
)
for row in ability_rows
@@ -2578,6 +2758,17 @@ class RegistryStore:
]
)
def _attributes_to_json(self, attributes: list[str]) -> str:
return json.dumps([item.strip() for item in attributes if item.strip()])
def _attributes_from_json(self, value: str) -> list[str]:
if not value:
return []
parsed = json.loads(value)
if not isinstance(parsed, list):
return []
return [str(item) for item in parsed if str(item).strip()]
def _source_refs_from_json(self, value: str) -> list[SourceReference]:
return [
SourceReference(