diff --git a/pyproject.toml b/pyproject.toml index e4c4078..6a8e987 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,9 @@ dev = [ "pytest>=7.4", ] +[project.scripts] +repo-scoping = "repo_registry.cli:main" + [tool.setuptools.packages.find] where = ["src"] diff --git a/src/repo_registry/candidate_graph/generator.py b/src/repo_registry/candidate_graph/generator.py index 07a4a19..e824b55 100644 --- a/src/repo_registry/candidate_graph/generator.py +++ b/src/repo_registry/candidate_graph/generator.py @@ -188,7 +188,7 @@ class CandidateGraphGenerator: ), source_refs=self._source_refs(interfaces), primary_class="interface", - attributes=self._interface_attributes(interfaces), + attributes=self._interface_attributes(interfaces, docs, chunks), features=features, evidence=self._evidence(tests, examples, docs), ) @@ -523,10 +523,33 @@ class CandidateGraphGenerator: attributes.append("interface") return "developer-tooling", self._unique(attributes) - def _interface_attributes(self, interfaces: list[ObservedFact]) -> list[str]: + def _interface_attributes( + self, + interfaces: list[ObservedFact], + docs: list[ObservedFact] | None = None, + chunks: list[ContentChunk] | None = None, + ) -> list[str]: feature_types = {self._feature_type(fact) for fact in interfaces} attributes = ["api" if item == "API" else "cli" if item == "CLI" else "callable" for item in feature_types] - return self._unique(["surface", *attributes, "utility-owned"]) + utility = self._interface_utility_relationship(docs or [], chunks or []) + return self._unique(["surface", *attributes, f"utility-{utility}"]) + + def _interface_utility_relationship( + self, + docs: list[ObservedFact], + chunks: list[ContentChunk], + ) -> str: + doc_paths = {fact.path for fact in docs} + text = " ".join( + chunk.text.lower() + for chunk in chunks + if chunk.path in doc_paths + and chunk.kind in {"intent", "documentation"} + and chunk.metadata.get("source_role") != "derived_scope" + ) + if any(token in text for 
token in ("facade", "proxy", "wrapper", "wraps ")): + return "facade" + return "owned" def _feature_attributes( self, diff --git a/src/repo_registry/cli.py b/src/repo_registry/cli.py new file mode 100644 index 0000000..abb443b --- /dev/null +++ b/src/repo_registry/cli.py @@ -0,0 +1,157 @@ +from __future__ import annotations + +import argparse +from pathlib import Path +from typing import Sequence + +from repo_registry.core.models import CharacteristicRebuildResult, Repository +from repo_registry.core.service import RegistryService +from repo_registry.llm_extraction import LLMCandidateExtractor, create_llm_connect_adapter +from repo_registry.repo_ingestion.git import GitIngestionService +from repo_registry.storage.sqlite import NotFoundError, RegistryStore +from repo_registry.web_api.app import Settings + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="repo-scoping", + description="Repository Scoping maintenance commands.", + ) + subparsers = parser.add_subparsers(dest="command", required=True) + rebuild = subparsers.add_parser( + "rebuild-characteristics", + help="Rebuild candidate characteristics for one or more repositories.", + ) + target = rebuild.add_mutually_exclusive_group(required=True) + target.add_argument("--repo", help="Repository id or exact repository name.") + target.add_argument("--all", action="store_true", help="Rebuild every repository.") + rebuild.add_argument("--dry-run", action="store_true", help="Preview without clearing approved characteristics.") + rebuild.add_argument("--no-llm", action="store_true", help="Disable configured LLM assistance.") + rebuild.add_argument( + "--trusted-auto-approve", + action="store_true", + help="Run trusted auto-approval after a confirmed rebuild.", + ) + rebuild.add_argument( + "--confirm", + action="store_true", + help="Confirm a destructive rebuild for selected repositories.", + ) + rebuild.add_argument( + "--confirm-all", + action="store_true", + help="Confirm a 
destructive all-repository rebuild.", + ) + rebuild.add_argument("--database-path", help="Override REPO_REGISTRY_DATABASE_PATH.") + rebuild.add_argument("--checkout-root", help="Override REPO_REGISTRY_CHECKOUT_ROOT.") + return parser + + +def main(argv: Sequence[str] | None = None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + if args.command == "rebuild-characteristics": + return rebuild_characteristics_command(args, parser) + parser.error(f"unknown command: {args.command}") + return 2 + + +def rebuild_characteristics_command( + args: argparse.Namespace, + parser: argparse.ArgumentParser, +) -> int: + dry_run = bool(args.dry_run) + if not dry_run and args.all and not args.confirm_all: + parser.error("--all destructive rebuilds require --confirm-all") + if not dry_run and not (args.confirm or args.confirm_all): + parser.error("destructive rebuilds require --confirm or --confirm-all") + + service = service_from_args(args) + repositories = selected_repositories(service, args) + if not repositories: + parser.error("no repositories matched the requested target") + + for repository in repositories: + result = service.rebuild_characteristics_from_scratch( + repository.id, + dry_run=dry_run, + confirm=not dry_run, + use_llm_assistance=not args.no_llm, + ) + if args.trusted_auto_approve and not dry_run and result.analysis_run.status == "completed": + service.trusted_auto_approve_candidate_graph( + repository.id, + result.analysis_run.id, + notes="CLI trusted auto-approve after rebuild.", + ) + print(rebuild_summary_line(service, result, args)) + return 0 + + +def service_from_args(args: argparse.Namespace) -> RegistryService: + settings = Settings() + database_path = Path(args.database_path or settings.database_path) + checkout_root = args.checkout_root or settings.checkout_root + database_path.parent.mkdir(parents=True, exist_ok=True) + store = RegistryStore(database_path) + store.initialize() + llm_extractor = None + if not args.no_llm and 
settings.llm_enabled and settings.llm_provider: + adapter = create_llm_connect_adapter(settings.llm_provider, model=settings.llm_model) + llm_extractor = LLMCandidateExtractor(adapter) + return RegistryService( + store, + ingestion=GitIngestionService(checkout_root), + llm_extractor=llm_extractor, + ) + + +def selected_repositories( + service: RegistryService, + args: argparse.Namespace, +) -> list[Repository]: + repositories = service.list_repositories() + if args.all: + return repositories + repo = str(args.repo) + if repo.isdecimal(): # isdigit() also accepts superscript digits, which int() rejects with ValueError + try: + return [service.get_repository(int(repo))] + except NotFoundError: + return [] + return [repository for repository in repositories if repository.name == repo] + + +def rebuild_summary_line( + service: RegistryService, + result: CharacteristicRebuildResult, + args: argparse.Namespace, +) -> str: + graph = ( + service.candidate_graph(result.repository.id, result.analysis_run.id) + if result.analysis_run.status == "completed" + else None + ) + remaining_review = 0 + if graph is not None: + remaining_review = sum( + 1 + for ability in graph.abilities + for capability in ability.capabilities + if capability.status == "candidate" + ) + candidate_source = "deterministic" if args.no_llm else "configured" + return ( + f"repo={result.repository.id}:{result.repository.name} " + f"latest_analysis_run={result.analysis_run.id} " + f"candidate_source={candidate_source} " + f"dry_run={result.dry_run} " + f"cleared_approved={result.cleared_approved} " + f"approved_superseded={result.previous_counts} " + f"candidates={result.candidate_counts} " + f"remaining_review_queue={remaining_review}" + ) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/repo_registry/core/service.py b/src/repo_registry/core/service.py index a40ebe8..1604f30 100644 --- a/src/repo_registry/core/service.py +++ b/src/repo_registry/core/service.py @@ -215,7 +215,7 @@ class RegistryService: candidate_source = "deterministic" candidates = 
normalize_candidate_drafts(candidates) self.store.replace_candidate_graph(repository_id, completed_run.id, candidates) - if candidate_source == "llm": + if "llm" in candidate_source: log_operation( "llm_extraction_used", repository_id=repository_id, @@ -226,7 +226,10 @@ class RegistryService: repository_id, completed_run.id, action="llm_extraction_used", - notes=f"Generated {len(candidates)} candidate ability draft(s).", + notes=( + f"Generated {len(candidates)} candidate ability draft(s) " + f"from {candidate_source} candidate generation." + ), ) if trusted_auto_approve: self.trusted_auto_approve_candidate_graph( @@ -260,11 +263,13 @@ class RegistryService: *, use_llm_assistance: bool = True, ): + deterministic = self.candidate_generator.generate(repository, facts, chunks) if use_llm_assistance and self.llm_extractor is not None: extracted = self.llm_extractor.extract(repository, chunks) if extracted: - return self.llm_mapper.map(extracted, facts, chunks), "llm" - return self.candidate_generator.generate(repository, facts, chunks), "deterministic" + llm_candidates = self.llm_mapper.map(extracted, facts, chunks) + return llm_candidates + deterministic, "llm+deterministic" + return deterministic, "deterministic" def list_analysis_runs(self, repository_id: int) -> list[AnalysisRun]: return self.store.list_analysis_runs(repository_id) diff --git a/src/repo_registry/llm_extraction/extractor.py b/src/repo_registry/llm_extraction/extractor.py index 260724c..d5f5a73 100644 --- a/src/repo_registry/llm_extraction/extractor.py +++ b/src/repo_registry/llm_extraction/extractor.py @@ -75,12 +75,20 @@ class LLMCandidateExtractor: chunk_text = "\n\n".join( ( f"Source: {chunk.path}:{chunk.start_line}-{chunk.end_line} " - f"({chunk.kind})\n{chunk.text}" + f"({chunk.kind}; source_role={self._source_role(chunk)})\n{chunk.text}" ) - for chunk in chunks[:12] + for chunk in self._prompt_chunks(chunks) ) return ( "Extract a conservative, source-linked repository ability map.\n" + "Use 
original repository utility only: capabilities the repository " + "owns, intentionally exposes as a facade, or implements as an adapter.\n" + "Prefer source_role=intent_summary, product_documentation, " + "implementation_source, and test_evidence. Do not use SCOPE.md or " + "source_role=derived_scope as primary evidence; it is a derived prior " + "registry view and may be stale. Ignore agent guidance, CI/tooling, " + "dependency-only, and mention-only context unless owned product " + "evidence supports the same claim.\n" "Return strict JSON only with this shape:\n" "{\n" ' "abilities": [\n' @@ -108,6 +116,46 @@ class LLMCandidateExtractor: f"{chunk_text}\n" ) + def _prompt_chunks(self, chunks: list[ContentChunk]) -> list[ContentChunk]: + promptable = [ + chunk + for chunk in chunks + if self._source_role(chunk) not in {"agent_guidance", "derived_scope"} + ] + return sorted( + promptable, + key=lambda chunk: ( + self._source_role_priority(self._source_role(chunk)), + chunk.path, + chunk.start_line, + ), + )[:12] + + def _source_role(self, chunk: ContentChunk) -> str: + role = chunk.metadata.get("source_role") + if isinstance(role, str) and role: + return role + path = "/" + chunk.path.lower().lstrip("/") # leading "/" anchors the checks below to whole path segments + if path.endswith("/intent.md"): + return "intent_summary" + if path.endswith("/scope.md"): + return "derived_scope" + if path.endswith(("/agents.md", "/claude.md")) or "/.claude/" in path: + return "agent_guidance" + return "" + + def _source_role_priority(self, source_role: str) -> int: + priorities = { + "intent_summary": 0, + "product_documentation": 1, + "implementation_source": 2, + "test_evidence": 3, + "configuration": 4, + "dependency_declaration": 5, + "ci_tooling": 6, + } + return priorities.get(source_role, 7) + def parse_response(self, content: str) -> list[ExtractedAbility]: try: payload = json.loads(self._json_text(content)) diff --git a/tests/fixtures.py b/tests/fixtures.py index 2ac152b..64eec64 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -73,3 +73,93 @@ 
def write_empty_repo(root: Path) -> Path: repo = root / "empty-repo" repo.mkdir() return repo + + +def write_key_cape_like_repo(root: Path) -> Path: + repo = root / "key-cape-like" + repo.mkdir() + (repo / "INTENT.md").write_text( + "# INTENT\n\n" + "Provide lightweight IAM profile enforcement for small deployments.\n\n" + "## Intended Capabilities\n\n" + "- Enforce OIDC PKCE profiles.\n" + "- Validate LDAP schema migrations.\n" + "- Run migration tooling for identity data.\n", + encoding="utf-8", + ) + (repo / "SCOPE.md").write_text( + "# SCOPE\n\n" + "Old polluted scope mentions routing LLM provider requests.\n", + encoding="utf-8", + ) + (repo / "README.md").write_text( + "# KeyCape\n\n" + "Lightweight IAM service with OIDC profile enforcement and LDAP schema " + "validation. Backend adapters live under src/internal/adapters.\n" + "See CLAUDE.md for agent workflow.\n", + encoding="utf-8", + ) + (repo / "CLAUDE.md").write_text( + "# CLAUDE.md\n\n" + "Guidance for Claude Code when working in this repository.\n", + encoding="utf-8", + ) + (repo / "src" / "internal" / "adapters").mkdir(parents=True) + (repo / "src" / "internal" / "adapters" / "oidc.py").write_text( + "def enforce_pkce_profile(client):\n" + " return client.require_pkce\n", + encoding="utf-8", + ) + return repo + + +def write_llm_connect_like_repo(root: Path) -> Path: + repo = root / "llm-connect-like" + repo.mkdir() + (repo / "README.md").write_text( + "# LLM Connect\n\nSupports OpenRouter and Claude fallback for prompts.\n", + encoding="utf-8", + ) + (repo / ".env.example").write_text( + "OPENROUTER_API_KEY=\nANTHROPIC_API_KEY=\n", + encoding="utf-8", + ) + (repo / "providers.py").write_text( + "provider_registry = {'openrouter': OpenRouterAdapter, 'anthropic': ClaudeAdapter}\n" + "fallback_provider = 'claude'\n", + encoding="utf-8", + ) + return repo + + +def write_facade_repo(root: Path) -> Path: + repo = root / "facade-repo" + repo.mkdir() + (repo / "README.md").write_text( + "# Mail Facade\n\n" 
+ "Provides a public HTTP facade that wraps the upstream mail classifier.\n", + encoding="utf-8", + ) + (repo / "app.py").write_text( + "from fastapi import FastAPI\n" + "app = FastAPI()\n" + '@app.post("/classify")\n' + "def classify():\n" + " return {}\n", + encoding="utf-8", + ) + return repo + + +def write_dependency_only_repo(root: Path) -> Path: + repo = root / "dependency-only" + repo.mkdir() + (repo / "README.md").write_text( + "# Dependency Only\n\nUses OpenRouter during experiments but exposes no API.\n", + encoding="utf-8", + ) + (repo / "requirements.txt").write_text( + "openai\nanthropic\n", + encoding="utf-8", + ) + return repo diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..d4eab43 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,100 @@ +import pytest + +from repo_registry.cli import main +from repo_registry.core.service import RegistryService +from repo_registry.repo_ingestion.git import GitIngestionService +from repo_registry.storage.sqlite import RegistryStore + + +def make_service(tmp_path): + store = RegistryStore(tmp_path / "registry.sqlite3") + store.initialize() + return RegistryService(store, ingestion=GitIngestionService(tmp_path / "checkouts")) + + +def write_repo(tmp_path): + source = tmp_path / "repo" + source.mkdir() + (source / "README.md").write_text("# CLI Rebuild\nReports health.\n", encoding="utf-8") + (source / "app.py").write_text('@app.get("/health")\ndef health():\n return {}\n', encoding="utf-8") + return source + + +def approved_repository(tmp_path): + service = make_service(tmp_path) + source = write_repo(tmp_path) + repository = service.register_repository(name="CLI Rebuild", url=str(source)) + summary = service.analyze_repository(repository.id, use_llm_assistance=False) + service.approve_candidate_graph(repository.id, summary.analysis_run.id) + return service, repository + + +def test_rebuild_cli_dry_run_preserves_approved_characteristics(tmp_path, capsys): + service, repository = 
approved_repository(tmp_path) + + exit_code = main( + [ + "rebuild-characteristics", + "--repo", + str(repository.id), + "--dry-run", + "--no-llm", + "--database-path", + str(tmp_path / "registry.sqlite3"), + "--checkout-root", + str(tmp_path / "checkouts"), + ] + ) + + output = capsys.readouterr().out + assert exit_code == 0 + assert "repo=1:CLI Rebuild" in output + assert "latest_analysis_run=2" in output + assert "candidate_source=deterministic" in output + assert "dry_run=True" in output + assert "cleared_approved=False" in output + assert service.ability_map(repository.id).abilities + + +def test_rebuild_cli_confirmed_single_repo_clears_approved_characteristics(tmp_path, capsys): + _service, repository = approved_repository(tmp_path) + + exit_code = main( + [ + "rebuild-characteristics", + "--repo", + str(repository.id), + "--no-llm", + "--confirm", + "--database-path", + str(tmp_path / "registry.sqlite3"), + "--checkout-root", + str(tmp_path / "checkouts"), + ] + ) + + service = make_service(tmp_path) + output = capsys.readouterr().out + assert exit_code == 0 + assert "dry_run=False" in output + assert "cleared_approved=True" in output + assert service.ability_map(repository.id).abilities == [] + + +def test_rebuild_cli_refuses_destructive_all_without_confirm_all(tmp_path): + approved_repository(tmp_path) + + with pytest.raises(SystemExit) as exc: + main( + [ + "rebuild-characteristics", + "--all", + "--confirm", + "--database-path", + str(tmp_path / "registry.sqlite3"), + "--checkout-root", + str(tmp_path / "checkouts"), + ] + ) + + assert exc.value.code == 2 diff --git a/tests/test_llm_extraction.py b/tests/test_llm_extraction.py index f0ca36d..ab177b7 100644 --- a/tests/test_llm_extraction.py +++ b/tests/test_llm_extraction.py @@ -50,6 +50,58 @@ def chunk(): ) +def test_llm_prompt_filters_derived_scope_and_labels_source_roles(): + adapter = FakeAdapter('{"abilities": []}') + extractor = LLMCandidateExtractor(adapter) + chunks = [ + ContentChunk( + id=1, + 
repository_id=1, + analysis_run_id=1, + snapshot_id=1, + path="SCOPE.md", + kind="scope", + start_line=1, + end_line=3, + text="# SCOPE\n\nOld approved LLM routing entry.", + metadata={"source_role": "derived_scope"}, + ), + ContentChunk( + id=2, + repository_id=1, + analysis_run_id=1, + snapshot_id=1, + path="INTENT.md", + kind="intent", + start_line=1, + end_line=3, + text="# INTENT\n\nProvide lightweight IAM.", + metadata={"source_role": "intent_summary"}, + ), + ContentChunk( + id=3, + repository_id=1, + analysis_run_id=1, + snapshot_id=1, + path="CLAUDE.md", + kind="documentation", + start_line=1, + end_line=2, + text="# CLAUDE\n\nAgent guidance.", + metadata={"source_role": "agent_guidance"}, + ), + ] + + extractor.extract(repository(), chunks) + + assert "Source: INTENT.md" in adapter.last_prompt + assert "source_role=intent_summary" in adapter.last_prompt + assert "Source: SCOPE.md" not in adapter.last_prompt + assert "Old approved LLM routing entry" not in adapter.last_prompt + assert "Source: CLAUDE.md" not in adapter.last_prompt + assert "Do not use SCOPE.md" in adapter.last_prompt + + def test_llm_candidate_extractor_parses_structured_response(): adapter = FakeAdapter( """ diff --git a/tests/test_registry_service.py b/tests/test_registry_service.py index bcc9872..82567bd 100644 --- a/tests/test_registry_service.py +++ b/tests/test_registry_service.py @@ -9,8 +9,12 @@ from repo_registry.repo_ingestion.git import GitIngestionService from repo_registry.semantic import HashingEmbeddingProvider from repo_registry.storage.sqlite import NotFoundError, RegistryStore from tests.fixtures import ( + write_dependency_only_repo, write_empty_repo, + write_facade_repo, write_javascript_typescript_package_repo, + write_key_cape_like_repo, + write_llm_connect_like_repo, write_misleading_docs_repo, write_python_cli_repo, write_readme_only_repo, @@ -396,6 +400,80 @@ def test_fixture_breadth_misleading_docs_do_not_become_approved_truth(tmp_path): assert 
ability_map.abilities == [] +def test_regression_key_cape_like_repo_centers_iam_not_llm_provider_routing(tmp_path): + source = write_key_cape_like_repo(tmp_path) + service = make_service(tmp_path) + repository = service.register_repository(name="KeyCape Like", url=str(source)) + + summary = service.analyze_repository(repository.id, use_llm_assistance=False) + graph = service.candidate_graph(repository.id, summary.analysis_run.id) + + capability_names = { + capability.name + for ability in graph.abilities + for capability in ability.capabilities + } + assert "Enforce OIDC PKCE Profiles" in capability_names + assert "Validate LDAP Schema Migrations" in capability_names + assert "Run Migration Tooling For Identity Data" in capability_names + assert "Route LLM Requests Across Providers" not in capability_names + facts = {(fact.kind, fact.name, fact.path) for fact in summary.facts} + assert ("llm_provider", "Claude", "CLAUDE.md") not in facts + + +def test_regression_llm_connect_like_repo_still_promotes_provider_routing(tmp_path): + source = write_llm_connect_like_repo(tmp_path) + service = make_service(tmp_path) + repository = service.register_repository(name="LLM Connect Like", url=str(source)) + + summary = service.analyze_repository(repository.id, use_llm_assistance=False) + graph = service.candidate_graph(repository.id, summary.analysis_run.id) + + capability = next( + capability + for ability in graph.abilities + for capability in ability.capabilities + if capability.name == "Route LLM Requests Across Providers" + ) + assert {"utility-adapter", "llm-provider", "openrouter", "claude"} <= set( + capability.attributes + ) + + +def test_regression_facade_repo_promotes_public_wrapper_as_facade(tmp_path): + source = write_facade_repo(tmp_path) + service = make_service(tmp_path) + repository = service.register_repository(name="Mail Facade", url=str(source)) + + summary = service.analyze_repository(repository.id, use_llm_assistance=False) + graph = 
service.candidate_graph(repository.id, summary.analysis_run.id) + + capability = graph.abilities[0].capabilities[0] + assert capability.name == "Expose Repository Interface" + assert "utility-facade" in capability.attributes + assert "POST /classify" in {feature.name for feature in capability.features} + + +def test_regression_dependency_only_repo_keeps_libraries_as_context(tmp_path): + source = write_dependency_only_repo(tmp_path) + service = make_service(tmp_path) + repository = service.register_repository(name="Dependency Only", url=str(source)) + + summary = service.analyze_repository(repository.id, use_llm_assistance=False) + graph = service.candidate_graph(repository.id, summary.analysis_run.id) + + capability_names = { + capability.name + for ability in graph.abilities + for capability in ability.capabilities + } + assert "Route LLM Requests Across Providers" not in capability_names + assert capability_names == {"Describe Repository Structure"} + structure = graph.abilities[0].capabilities[0] + assert "utility-dependency" in structure.attributes + assert "review-required-structural-context" in structure.attributes + + def test_fixture_breadth_empty_repo_produces_no_candidate_claims(tmp_path): source = write_empty_repo(tmp_path) service = make_service(tmp_path) @@ -622,7 +700,110 @@ def test_analyze_repository_can_use_optional_llm_extractor(tmp_path): assert graph.abilities[0].capabilities[0].name == "Classify Incoming Email" assert graph.abilities[0].source_refs[0].path == "README.md" assert decisions[0].action == "llm_extraction_used" - assert "1 candidate ability" in decisions[0].notes + assert "llm+deterministic candidate generation" in decisions[0].notes + assert {ability.name for ability in graph.abilities} >= { + "Business Email Routing", + "Route Incoming Customer Email", + } + + +def test_analyze_repository_keeps_deterministic_candidates_when_llm_returns_stale_entries(tmp_path): + source = tmp_path / "repo" + source.mkdir() + (source / 
"INTENT.md").write_text( + "# INTENT\n\n" + "Provide lightweight IAM.\n\n" + "## Intended Capabilities\n\n" + "- Enforce OIDC PKCE profiles.\n", + encoding="utf-8", + ) + (source / "SCOPE.md").write_text( + "# SCOPE\n\nOld approved entry: route LLM provider requests.\n", + encoding="utf-8", + ) + store = RegistryStore(tmp_path / "registry.sqlite3") + store.initialize() + extractor = FakeLLMExtractor( + [ + ExtractedAbility( + name="Old LLM Routing", + description="Stale prior scope claim.", + source_paths=["SCOPE.md"], + capabilities=[ + ExtractedCapability( + name="Route LLM Provider Requests", + description="Old scope reuse.", + source_paths=["SCOPE.md"], + ) + ], + ) + ] + ) + service = RegistryService( + store, + ingestion=GitIngestionService(tmp_path / "checkouts"), + llm_extractor=extractor, + ) + repository = service.register_repository(name="KeyCape Like", url=str(source)) + + summary = service.analyze_repository(repository.id) + graph = service.candidate_graph(repository.id, summary.analysis_run.id) + decisions = service.list_review_decisions(repository.id, summary.analysis_run.id) + + capability_names = { + capability.name + for ability in graph.abilities + for capability in ability.capabilities + } + assert "Route LLM Provider Requests" in capability_names + assert "Enforce OIDC PKCE Profiles" in capability_names + assert decisions[0].action == "llm_extraction_used" + assert "llm+deterministic candidate generation" in decisions[0].notes + + +def test_analysis_isolation_between_repositories_with_stale_approved_data(tmp_path): + poisoned_source = write_llm_connect_like_repo(tmp_path) + target_source = write_key_cape_like_repo(tmp_path) + service = make_service(tmp_path) + poisoned = service.register_repository( + name="Poisoned LLM Connect", + url=str(poisoned_source), + ) + target = service.register_repository( + name="Isolated KeyCape", + url=str(target_source), + ) + + poisoned_summary = service.analyze_repository( + poisoned.id, + 
use_llm_assistance=False, + ) + service.approve_candidate_graph(poisoned.id, poisoned_summary.analysis_run.id) + assert any( + capability.name == "Route LLM Requests Across Providers" + for ability in service.ability_map(poisoned.id).abilities + for capability in ability.capabilities + ) + + target_summary = service.analyze_repository( + target.id, + use_llm_assistance=False, + ) + target_graph = service.candidate_graph(target.id, target_summary.analysis_run.id) + target_facts = service.list_observed_facts(target.id, target_summary.analysis_run.id) + target_chunks = service.list_content_chunks(target.id, target_summary.analysis_run.id) + + target_capability_names = { + capability.name + for ability in target_graph.abilities + for capability in ability.capabilities + } + assert "Enforce OIDC PKCE Profiles" in target_capability_names + assert "Route LLM Requests Across Providers" not in target_capability_names + assert all(fact.repository_id == target.id for fact in target_facts) + assert all(chunk.repository_id == target.id for chunk in target_chunks) + assert all(ref.path != "providers.py" for ability in target_graph.abilities for ref in ability.source_refs) + assert service.ability_map(target.id).abilities == [] def test_analyze_repository_can_disable_optional_llm_extractor(tmp_path): @@ -695,8 +876,9 @@ def test_analyze_repository_normalizes_duplicate_llm_candidates(tmp_path): summary = service.analyze_repository(repository.id) graph = service.candidate_graph(repository.id, summary.analysis_run.id) - assert len(graph.abilities) == 1 + assert len(graph.abilities) == 2 assert graph.abilities[0].name == "LLM Provider Integrations" + assert graph.abilities[1].name == "Support OpenRouter Providers" def test_analyze_repository_falls_back_when_optional_llm_extractor_returns_no_candidates(tmp_path): diff --git a/workplans/RREG-WP-0009-provenance-aware-characteristic-rebuild.md b/workplans/RREG-WP-0009-provenance-aware-characteristic-rebuild.md index b9ac74d..bd7af49 
100644 --- a/workplans/RREG-WP-0009-provenance-aware-characteristic-rebuild.md +++ b/workplans/RREG-WP-0009-provenance-aware-characteristic-rebuild.md @@ -4,7 +4,7 @@ type: workplan title: "Provenance-Aware Characteristic Rebuild" domain: capabilities repo: repo-scoping -status: active +status: done owner: codex topic_slug: foerster-capabilities created: "2026-05-01" @@ -195,7 +195,7 @@ Acceptance criteria: ```task id: RREG-WP-0009-T07 -status: todo +status: done priority: medium state_hub_task_id: "7afd6550-e4a4-4a8a-94bf-d974b0ccb8d2" ``` @@ -216,7 +216,7 @@ Acceptance criteria: ```task id: RREG-WP-0009-T08 -status: todo +status: done priority: high state_hub_task_id: "05077f3d-d40d-45fd-865c-0924407beb4f" ``` @@ -256,3 +256,24 @@ Acceptance criteria: analysis while preserving approved characteristics. - key-cape is documented as the motivating failure mode without hard-coding product-specific behavior into the scanner. + +## Cross-Repository Analysis Isolation + +```task +id: RREG-WP-0009-T10 +status: done +priority: high +``` + +Validate that analyzing one repository never depends on approved maps, +candidate graphs, facts, chunks, or derived scope data from any other +repository in the registry database. + +Acceptance criteria: +- A repository with stale approved characteristics cannot influence fresh + candidate generation for another repository. +- Candidate graph, observed fact, and content chunk lookups remain scoped by + repository and analysis run. +- Tests cover a poisoned-repo scenario where repo A contains old LLM/provider + characteristics and repo B still generates only its own repository-owned + candidates.