from __future__ import annotations import re from dataclasses import dataclass, field, replace from repo_scoping.core.models import ContentChunk, ObservedFact, Repository, SourceReference @dataclass(frozen=True) class CandidateEvidenceDraft: type: str reference: str strength: str source_refs: list[SourceReference] @dataclass(frozen=True) class CandidateFeatureDraft: name: str type: str location: str confidence: float source_refs: list[SourceReference] primary_class: str = "" attributes: list[str] = field(default_factory=list) @dataclass(frozen=True) class CandidateCapabilityDraft: name: str description: str inputs: list[str] outputs: list[str] confidence: float source_refs: list[SourceReference] primary_class: str = "capability" attributes: list[str] = field(default_factory=list) features: list[CandidateFeatureDraft] = field(default_factory=list) evidence: list[CandidateEvidenceDraft] = field(default_factory=list) @dataclass(frozen=True) class CandidateAbilityDraft: name: str description: str confidence: float source_refs: list[SourceReference] primary_class: str = "ability" attributes: list[str] = field(default_factory=list) capabilities: list[CandidateCapabilityDraft] = field(default_factory=list) REPO_SCOPING_NATIVE_CAPABILITY_SEEDS = [ { "name": "Register And Track Repositories", "primary_class": "ingestion", "attributes": ["metadata", "git", "analysis-run"], "features": [ ( "Create and update repository records", "api", ["src/repo_scoping/core/service.py", "src/repo_scoping/web_api/app.py"], ), ( "Resolve local or remote Git checkouts", "backend", ["src/repo_scoping/repo_ingestion/git.py", "tests/test_git_ingestion.py"], ), ( "Import repository metadata", "backend", [ "src/repo_scoping/repo_ingestion/metadata.py", "tests/test_repository_metadata.py", ], ), ], }, { "name": "Scan Repositories Into Observed Facts", "primary_class": "analysis", "attributes": ["deterministic", "facts", "provenance"], "features": [ ( "Detect source languages, manifests, docs, tests, config, and interfaces", "backend", ["src/repo_scoping/repo_scanning/scanner.py", "tests/test_repository_scanner.py"], ), ( "Classify source roles for facts", "backend", ["src/repo_scoping/repo_scanning/scanner.py", "docs/characteristic-evidence-model.md"], ), ( "Preserve analysis snapshots and fact records", "storage", ["src/repo_scoping/storage/sqlite.py", "migrations/0001_initial.sql"], ), ], }, { "name": "Index Source Content With Provenance", "primary_class": "analysis", "attributes": ["content-chunks", "source-role"], "features": [ ( "Create source-linked content chunks from observed facts", "backend", ["src/repo_scoping/content_indexing/extractor.py", "tests/test_content_indexing.py"], ), ( "Carry source-role metadata into downstream generation", "backend", [ "src/repo_scoping/content_indexing/extractor.py", "src/repo_scoping/llm_extraction/extractor.py", ], ), ], }, { "name": "Generate Reviewable Candidate Characteristics", "primary_class": "analysis", "attributes": ["candidate-graph", "review-required"], "features": [ ( "Build candidate abilities, capabilities, features, and evidence", "backend", [ "src/repo_scoping/candidate_graph/generator.py", "src/repo_scoping/candidate_graph/normalization.py", "tests/test_candidate_graph.py", ], ), ( "Optionally map structured LLM extraction into candidates", "integration", [ "src/repo_scoping/llm_extraction/extractor.py", "src/repo_scoping/llm_extraction/mapper.py", "tests/test_llm_extraction.py", ], ), ], }, { "name": "Review And Approve Candidate Characteristics", "primary_class": "review", "attributes": ["curation", "approval", "audit"], "features": [ ( "Edit, reject, merge, and relink candidate graph entries", "api", [ "src/repo_scoping/core/service.py", "src/repo_scoping/web_api/app.py", "tests/test_registry_service.py", ], ), ( "Publish approved characteristic maps after review", "storage", ["src/repo_scoping/core/service.py", "src/repo_scoping/storage/sqlite.py"], ), ( "Record review decisions and expectation gaps", "audit", ["src/repo_scoping/core/service.py", "src/repo_scoping/web_api/schemas.py"], ), ], }, { "name": "Search Compare And Export Approved Profiles", "primary_class": "discovery", "attributes": ["search", "comparison", "export"], "features": [ ( "Search approved abilities, capabilities, features, and evidence", "api", ["src/repo_scoping/core/service.py", "tests/test_registry_service.py"], ), ( "Compare repositories and identify capability gaps", "api", ["src/repo_scoping/core/service.py", "src/repo_scoping/web_api/app.py"], ), ( "Export repository profiles", "api", ["src/repo_scoping/web_api/app.py", "docs/api-contract.md"], ), ], }, { "name": "Generate And Maintain SCOPE.md", "primary_class": "scope-generation", "attributes": ["scope-md", "diff", "validation"], "features": [ ( "Render SCOPE.md from approved characteristics", "backend", [ "src/repo_scoping/scope/generator.py", "tests/test_scope_generator.py", "docs/scope-md-spec.md", ], ), ( "Diff, validate, and write scope files", "api", [ "src/repo_scoping/scope/validator.py", "src/repo_scoping/web_api/app.py", ], ), ], }, { "name": "Explore Dependency And Impact Graphs", "primary_class": "dependency-analysis", "attributes": ["graph", "impact", "visualization"], "features": [ ( "Model dependencies between facts, evidence, features, capabilities, abilities, and scope", "backend", [ "src/repo_scoping/core/service.py", "docs/dependency-aware-scope-propagation.md", "docs/dependency-visualization-exploration.md", ], ), ( "Render dependency graph views and profiles", "ui", ["src/repo_scoping/web_ui/views.py", "tests/test_web_api.py"], ), ], }, { "name": "Provide Scope Context To Downstream Agents", "primary_class": "coordination", "attributes": ["activity-core", "api-contract"], "features": [ ( "Return compact JSON scope context by repository slug", "api", [ "src/repo_scoping/web_api/app.py", "docs/schemas/repo-scope-context-response.json", "tests/test_scope_context_api.py", ], ), ], }, ] class CandidateGraphGenerator: """Build conservative review candidates from observed facts.""" def generate( self, repository: Repository, facts: list[ObservedFact], chunks: list[ContentChunk] | None = None, ) -> list[CandidateAbilityDraft]: if not facts: return [] chunks = chunks or [] docs = self._facts(facts, "intent") + self._facts(facts, "documentation") tests = self._facts(facts, "test") examples = self._facts(facts, "example") interfaces = self._facts(facts, "interface") manifests = self._facts(facts, "manifest") frameworks = self._facts(facts, "framework") languages = self._facts(facts, "language") configs = self._facts(facts, "config") scope_facts = self._facts(facts, "scope") llm_providers = self._facts(facts, "llm_provider") credential_configs = self._facts(facts, "credential_config") provider_registries = self._facts(facts, "provider_registry") fallback_policies = self._facts(facts, "fallback_policy") intent_facts = self._facts(facts, "intent") ability_primary_class, ability_attributes = self._ability_classification( repository, facts, chunks, ) ability_sources = docs or scope_facts or manifests or languages or configs ability = CandidateAbilityDraft( name=self._ability_name(repository, chunks), description=self._ability_description(chunks), confidence=self._ability_confidence( docs=docs, interfaces=interfaces, tests=tests, examples=examples, frameworks=frameworks, languages=languages, ), source_refs=self._source_refs(ability_sources), primary_class=ability_primary_class, attributes=ability_attributes, capabilities=[], ) capabilities: list[CandidateCapabilityDraft] = [] capabilities.extend( self._intent_capabilities(intent_facts, chunks, tests, examples, docs) ) capabilities.extend( self._scope_capabilities( scope_facts, chunks, tests, examples, allow_summary_fallback=not intent_facts, ) ) capabilities.extend( self._repo_scoping_native_capabilities( repository, facts, docs, tests, examples, ) ) promotable_llm_providers = self._promotable_llm_facts(llm_providers) promotable_provider_registries = self._promotable_llm_facts(provider_registries) promotable_fallback_policies = self._promotable_llm_facts(fallback_policies) promotable_llm_facts = ( promotable_llm_providers + promotable_provider_registries + promotable_fallback_policies ) if promotable_llm_facts: capabilities.append( self._llm_provider_capability( promotable_llm_providers, credential_configs, promotable_provider_registries, promotable_fallback_policies, tests, examples, docs, ) ) if interfaces and capabilities: capabilities = self._attach_interface_features( capabilities, interfaces, chunks, ) elif interfaces: capabilities.append( self._interface_capability(interfaces, tests, examples, docs, chunks) ) if not capabilities: capabilities.extend( self._fact_derived_capabilities( configs=configs, manifests=manifests, frameworks=frameworks, languages=languages, docs=docs, tests=tests, chunks=chunks, ) ) return [ CandidateAbilityDraft( name=ability.name, description=ability.description, confidence=ability.confidence, source_refs=ability.source_refs, primary_class=ability.primary_class, attributes=ability.attributes, capabilities=capabilities, ) ] def _interface_capability( self, interfaces: list[ObservedFact], tests: list[ObservedFact], examples: list[ObservedFact], docs: list[ObservedFact], chunks: list[ContentChunk], ) -> CandidateCapabilityDraft: features = self._interface_features(interfaces, chunks) return CandidateCapabilityDraft( name="Expose Repository Interface", description=self._interface_description(chunks), inputs=self._interface_inputs(interfaces), outputs=self._interface_outputs(interfaces), confidence=self._interface_confidence( interfaces=interfaces, tests=tests, examples=examples, docs=docs, ), source_refs=self._source_refs(interfaces), primary_class="interface", attributes=self._interface_attributes(interfaces, docs, chunks), features=features, evidence=self._evidence(tests, examples, docs), ) def _llm_provider_capability( self, providers: list[ObservedFact], credentials: list[ObservedFact], registries: list[ObservedFact], fallback_policies: list[ObservedFact], tests: list[ObservedFact], examples: list[ObservedFact], docs: list[ObservedFact], ) -> CandidateCapabilityDraft: provider_names = sorted({fact.name for fact in providers}) provider_summary = ", ".join(provider_names) if provider_names else "LLM providers" features = [ CandidateFeatureDraft( name=f"Use {provider} Models", type="integration", location=self._grouped_location( [fact for fact in providers if fact.name == provider] ), confidence=0.75, source_refs=self._source_refs( [fact for fact in providers if fact.name == provider] ), primary_class="integration", attributes=["llm-provider", provider.lower()], ) for provider in provider_names ] if credentials: features.append( CandidateFeatureDraft( name="Configure LLM Provider Credentials", type="configuration", location=self._grouped_location(credentials), confidence=0.7, source_refs=self._source_refs(credentials), primary_class="configuration", attributes=["credential", "llm-provider"], ) ) if registries: features.append( CandidateFeatureDraft( name="Maintain LLM Provider Registry", type="backend", location=self._grouped_location(registries), confidence=0.65, source_refs=self._source_refs(registries), primary_class="backend", attributes=["provider-registry", "llm-provider"], ) ) if fallback_policies: features.append( CandidateFeatureDraft( name="Apply LLM Provider Fallback Policy", type="backend", location=self._grouped_location(fallback_policies), confidence=0.6, source_refs=self._source_refs(fallback_policies), primary_class="backend", attributes=["fallback-policy", "llm-provider"], ) ) return CandidateCapabilityDraft( name="Route LLM Requests Across Providers", description=( "Expose or configure model-provider integrations detected from " f"source-linked provider hints: {provider_summary}." ), inputs=["LLM request", "provider configuration"], outputs=["provider-specific model response"], confidence=self._llm_provider_confidence( providers=providers, credentials=credentials, registries=registries, fallback_policies=fallback_policies, docs=docs, ), source_refs=self._source_refs( providers + credentials + registries + fallback_policies ), primary_class="llm-integration", attributes=self._llm_provider_attributes( providers, credentials, registries, fallback_policies, ) + self._utility_relationship_attributes( providers + credentials + registries + fallback_policies ), features=features, evidence=self._evidence(tests, examples, docs), ) def _intent_capabilities( self, intent_facts: list[ObservedFact], chunks: list[ContentChunk], tests: list[ObservedFact], examples: list[ObservedFact], docs: list[ObservedFact], ) -> list[CandidateCapabilityDraft]: intent_chunks = [ chunk for chunk in chunks if chunk.kind == "intent" and ( chunk.metadata.get("source_role") == "intent_summary" or chunk.path.lower().endswith("intent.md") ) ] if not intent_chunks: return [] source_refs = self._source_refs(intent_facts) capabilities: list[CandidateCapabilityDraft] = [] seen: set[str] = set() for item in self._intent_capability_items(intent_chunks): name = self._intent_capability_name(item) key = name.lower() if not name or key in seen: continue seen.add(key) capabilities.append( CandidateCapabilityDraft( name=name, description=( "Reviewable intended capability extracted from repository " f"intent: {item}" ), inputs=[], outputs=[name], confidence=self._confidence( 0.45, [ (0.15, bool(source_refs)), (0.10, bool(tests)), (0.05, bool(examples)), (0.05, bool(docs)), ], ), source_refs=source_refs, primary_class="intent-capability", attributes=[ "intent-derived", "utility-owned", "review-required-intent", ], evidence=self._evidence(tests, examples, docs), ) ) return capabilities def _intent_capability_items(self, chunks: list[ContentChunk]) -> list[str]: items: list[str] = [] in_capability_section = False for chunk in sorted(chunks, key=lambda item: (item.path, item.start_line)): for raw_line in chunk.text.splitlines(): line = raw_line.strip() if not line: continue if line.startswith("#"): heading = line.lstrip("#").strip().lower() in_capability_section = ( "capabilit" in heading or heading in {"primary utility", "core utility"} ) continue if not in_capability_section: continue item = re.sub(r"^(?:[-*]|\d+[.)])\s+", "", line).strip() item = re.sub(r"^(?:capability|intended capability)\s*:\s*", "", item, flags=re.I) if item and item != line or raw_line.lstrip().startswith(("-", "*")): items.append(item) return items def _intent_capability_name(self, text: str) -> str: lowered = re.sub(r"[*_`]", "", text.lower()) if "continuous connectivity" in lowered and "remote systems" in lowered: return "Maintain Continuous Connectivity Between Remote Systems And Central Hub" if "observable" in lowered and "auditable" in lowered and "controllable" in lowered: return "Make Connectivity Observable Auditable And Controllable" if "cli tool" in lowered and "mcp" in lowered: return "Expose CLI And MCP Accessible Service" candidate = re.split(r"\s+-\s+|\s*:\s*|[.!?]\s+", text.strip(), maxsplit=1)[0] candidate = candidate.strip(" .:-") if not candidate: return "" words = candidate.split() if words: words[0] = self._imperative_verb(words[0]) while words and words[-1].lower().strip(",;:") in {"a", "an", "the", "and", "or", "as", "both"}: words.pop() return self._title_from_words(words[:10]) def _scope_capabilities( self, scope_facts: list[ObservedFact], chunks: list[ContentChunk], tests: list[ObservedFact], examples: list[ObservedFact], *, allow_summary_fallback: bool = True, ) -> list[CandidateCapabilityDraft]: scope_chunks = [ chunk for chunk in chunks if chunk.kind == "scope" or chunk.metadata.get("source_role") == "derived_scope" or chunk.path.lower().endswith("scope.md") ] if not scope_chunks: return [] source_refs = self._source_refs(scope_facts) capabilities: list[CandidateCapabilityDraft] = [] seen: set[str] = set() for block in self._scope_capability_blocks(scope_chunks): title = block.get("title", "").strip() if not title: continue key = title.lower() if key in seen: continue seen.add(key) capability_type = block.get("type", "scope-derived").strip() or "scope-derived" description = block.get("description", "").strip() keywords = self._scope_keywords(block.get("keywords", "")) attributes = self._unique( [ capability_type, *keywords, "scope-derived", "current-state", "review-required-scope", ] ) feature = CandidateFeatureDraft( name=title, type=capability_type, location="SCOPE.md", confidence=0.55, source_refs=source_refs, primary_class=capability_type, attributes=self._unique( [capability_type, "scope-defined", "review-required-scope"] ), ) capabilities.append( CandidateCapabilityDraft( name=title, description=( "Reviewable current-state capability extracted from " f"SCOPE.md: {description or title}" ), inputs=[], outputs=[title], confidence=self._confidence( 0.45, [ (0.10, bool(description)), (0.05, bool(keywords)), (0.05, bool(tests)), (0.05, bool(examples)), ], ), source_refs=source_refs, primary_class=capability_type, attributes=attributes, features=[feature], evidence=[ CandidateEvidenceDraft( type="scope-current-state", reference="SCOPE.md", strength="medium", source_refs=source_refs, ) ], ) ) if capabilities or not allow_summary_fallback: return capabilities fallback_name = self._scope_summary_capability_name(scope_chunks) if not fallback_name: return [] return [ CandidateCapabilityDraft( name=fallback_name, description=( "Reviewable current-state capability inferred from SCOPE.md " "summary text. A curator should split this into more precise " "capabilities when reviewing." ), inputs=[], outputs=[fallback_name], confidence=0.45, source_refs=source_refs, primary_class="scope-derived", attributes=[ "scope-derived", "current-state", "review-required-scope", ], evidence=[ CandidateEvidenceDraft( type="scope-current-state", reference="SCOPE.md", strength="weak", source_refs=source_refs, ) ], ) ] def _scope_capability_blocks( self, chunks: list[ContentChunk], ) -> list[dict[str, str]]: blocks: list[dict[str, str]] = [] in_block = False current: dict[str, str] = {} current_key = "" for chunk in sorted(chunks, key=lambda item: (item.path, item.start_line)): for raw_line in chunk.text.splitlines(): line = raw_line.rstrip() stripped = line.strip() if stripped.startswith("```capability"): in_block = True current = {} current_key = "" continue if in_block and stripped.startswith("```"): if current: blocks.append(current) in_block = False current = {} current_key = "" continue if not in_block: continue key, separator, value = stripped.partition(":") if separator and re.match(r"^[A-Za-z_][A-Za-z0-9_-]*$", key): current_key = key.lower() current[current_key] = value.strip().strip('"') elif current_key and stripped: current[current_key] = ( f"{current[current_key]} {stripped.strip()}" ).strip() return blocks def _scope_keywords(self, value: str) -> list[str]: cleaned = value.strip() if cleaned.startswith("[") and cleaned.endswith("]"): cleaned = cleaned[1:-1] return [ item.strip(" `\"'") for item in cleaned.split(",") if item.strip(" `\"'") ][:8] def _scope_summary_capability_name(self, chunks: list[ContentChunk]) -> str: one_liner = self._scope_one_liner(chunks) if one_liner: return self._imperative_purpose(one_liner) return "" def _fact_derived_capabilities( self, *, configs: list[ObservedFact], manifests: list[ObservedFact], frameworks: list[ObservedFact], languages: list[ObservedFact], docs: list[ObservedFact], tests: list[ObservedFact], chunks: list[ContentChunk], ) -> list[CandidateCapabilityDraft]: if not configs: return [] capability_facts = configs + manifests + frameworks + languages if not capability_facts: return [] features: list[CandidateFeatureDraft] = [] for label, kind, facts in ( ("Manage Repository Configuration", "configuration", configs), ("Declare Runtime And Package Manifests", "manifest", manifests), ("Use Detected Frameworks", "framework", frameworks), ("Provide Implementation In Detected Languages", "implementation", languages), ): if not facts: continue features.append( CandidateFeatureDraft( name=label, type=kind, location=self._grouped_location(facts), confidence=0.45, source_refs=self._source_refs(facts), primary_class=kind, attributes=[kind, "fact-derived", "review-required"], ) ) if not features: return [] name = self._fact_derived_capability_name(chunks, features) return [ CandidateCapabilityDraft( name=name, description=( "Reviewable capability inferred from deterministic facts. " "This fills the hierarchy when no stronger intent, scope " "capability, or interface candidate exists." ), inputs=self._feature_inputs(features), outputs=self._feature_outputs(features), confidence=self._confidence( 0.35, [ (0.10, bool(configs)), (0.10, bool(manifests)), (0.05, bool(frameworks)), (0.05, bool(tests)), (0.05, bool(docs)), ], ), source_refs=self._source_refs(capability_facts), primary_class="fact-derived", attributes=["fact-derived", "review-required", "partial-hierarchy"], features=features, evidence=self._evidence(tests, [], docs), ) ] def _fact_derived_capability_name( self, chunks: list[ContentChunk], features: list[CandidateFeatureDraft], ) -> str: scope_name = self._scope_summary_capability_name(chunks) if scope_name: return scope_name if any(feature.type == "configuration" for feature in features): return "Manage Repository Configuration" if any(feature.type == "manifest" for feature in features): return "Declare Repository Runtime" return "Describe Repository Implementation" def _repo_scoping_native_capabilities( self, repository: Repository, facts: list[ObservedFact], docs: list[ObservedFact], tests: list[ObservedFact], examples: list[ObservedFact], ) -> list[CandidateCapabilityDraft]: if not self._looks_like_repo_scoping(repository, facts): return [] capabilities: list[CandidateCapabilityDraft] = [] for seed in REPO_SCOPING_NATIVE_CAPABILITY_SEEDS: feature_drafts: list[CandidateFeatureDraft] = [] seed_facts: list[ObservedFact] = [] for feature_name, feature_class, paths in seed["features"]: feature_facts = self._facts_for_paths(facts, paths) if not feature_facts: continue seed_facts.extend(feature_facts) feature_drafts.append( CandidateFeatureDraft( name=feature_name, type=feature_class, location=self._grouped_location(feature_facts), confidence=0.7, source_refs=self._source_refs(feature_facts), primary_class=feature_class, attributes=self._unique( [feature_class, "source-linked", "repo-owned"] ), ) ) seed_facts = self._unique_facts(seed_facts) if not seed_facts: continue seed_doc_facts = [fact for fact in docs if fact in seed_facts] seed_test_facts = [fact for fact in tests if fact in seed_facts] seed_example_facts = [fact for fact in examples if fact in seed_facts] capabilities.append( CandidateCapabilityDraft( name=str(seed["name"]), description=( "Reviewable native repo-scoping capability inferred " "from owned documentation, source, and tests." ), inputs=[], outputs=[str(seed["name"])], confidence=self._confidence( 0.45, [ (0.10, bool(seed_doc_facts)), (0.10, bool(seed_test_facts)), (0.05, bool(seed_example_facts)), (0.05, len(feature_drafts) > 1), ], ), source_refs=self._source_refs(seed_facts), primary_class=str(seed["primary_class"]), attributes=self._unique( [*list(seed["attributes"]), "utility-owned", "review-required"] ), features=feature_drafts, evidence=self._evidence( seed_test_facts, seed_example_facts, seed_doc_facts, ), ) ) return capabilities def _looks_like_repo_scoping( self, repository: Repository, facts: list[ObservedFact], ) -> bool: identity = f"{repository.name} {repository.url} {repository.description or ''}".lower() if "repo-scoping" in identity or "repository scoping" in identity: return True return any(fact.path.startswith("src/repo_scoping/") for fact in facts) def _facts_for_paths( self, facts: list[ObservedFact], paths: list[str], ) -> list[ObservedFact]: matched: list[ObservedFact] = [] for fact in facts: if any(fact.path == path or fact.path.startswith(f"{path}/") for path in paths): matched.append(fact) return self._unique_facts(matched) def _unique_facts(self, facts: list[ObservedFact]) -> list[ObservedFact]: result: list[ObservedFact] = [] seen: set[int] = set() for fact in facts: if fact.id in seen: continue seen.add(fact.id) result.append(fact) return result def _attach_interface_features( self, capabilities: list[CandidateCapabilityDraft], interfaces: list[ObservedFact], chunks: list[ContentChunk], ) -> list[CandidateCapabilityDraft]: features = self._interface_features(interfaces, chunks) if not features: return capabilities capability_features: dict[int, list[CandidateFeatureDraft]] = { index: [] for index, _ in enumerate(capabilities) } for feature in features: index = self._best_feature_capability_index(feature, capabilities) capability_features[index].append(feature) updated: list[CandidateCapabilityDraft] = [] for index, capability in enumerate(capabilities): attached = capability_features[index] if not attached: updated.append(capability) continue updated.append( replace( capability, inputs=capability.inputs or self._feature_inputs(attached), outputs=capability.outputs or self._feature_outputs(attached), features=[*capability.features, *attached], ) ) return updated def _best_feature_capability_index( self, feature: CandidateFeatureDraft, capabilities: list[CandidateCapabilityDraft], ) -> int: feature_text = f"{feature.name} {feature.type} {feature.location}".lower() feature_terms = self._significant_terms(feature_text) best_index = 0 best_score = -1 for index, capability in enumerate(capabilities): capability_text = " ".join( [ capability.name, capability.description, " ".join(capability.outputs), " ".join(capability.attributes), ] ).lower() capability_terms = self._significant_terms(capability_text) score = len(feature_terms & capability_terms) if feature.type == "CLI" and any( token in capability_text for token in ("cli", "command", "mcp") ): score += 3 if feature.type == "API" and any( token in capability_text for token in ("api", "http", "service") ): score += 3 if score > best_score: best_index = index best_score = score return best_index def _interface_features( self, interfaces: list[ObservedFact], chunks: list[ContentChunk], ) -> list[CandidateFeatureDraft]: by_type: dict[str, list[ObservedFact]] = {} for fact in interfaces: by_type.setdefault(self._feature_type(fact), []).append(fact) features: list[CandidateFeatureDraft] = [] for feature_type, facts in sorted(by_type.items()): if len(facts) == 1: fact = facts[0] features.append( CandidateFeatureDraft( name=self._feature_name(fact, chunks), type=feature_type, location=fact.path, confidence=0.65 if fact.value else 0.45, source_refs=self._source_refs([fact]), primary_class=feature_type, attributes=self._feature_attributes(feature_type, [fact]), ) ) continue features.append( CandidateFeatureDraft( name=self._grouped_interface_feature_name( feature_type, facts, chunks, ), type=feature_type, location=self._grouped_location(facts), confidence=self._grouped_interface_confidence(facts), source_refs=self._source_refs(facts), primary_class=feature_type, attributes=self._feature_attributes(feature_type, facts), ) ) return features def _grouped_interface_feature_name( self, feature_type: str, facts: list[ObservedFact], chunks: list[ContentChunk], ) -> str: summary = self._grouped_interface_summary(facts, chunks) if feature_type == "API": return f"HTTP API surface: {summary}" if feature_type == "CLI": return f"CLI command surface: {summary}" return f"Callable interface surface: {summary}" def _grouped_interface_summary( self, facts: list[ObservedFact], chunks: list[ContentChunk], ) -> str: names = [self._feature_name(fact, chunks) for fact in facts] compact_names = self._unique([name for name in names if name]) if not compact_names: return f"{len(facts)} entry points" visible = compact_names[:3] suffix = f", +{len(compact_names) - 3} more" if len(compact_names) > 3 else "" return f"{', '.join(visible)}{suffix}" def _grouped_location(self, facts: list[ObservedFact]) -> str: paths = sorted({fact.path for fact in facts if fact.path}) if not paths: return "" if len(paths) == 1: return paths[0] return "multiple files" def _grouped_interface_confidence(self, facts: list[ObservedFact]) -> float: valued = sum(1 for fact in facts if fact.value) return 0.7 if valued == len(facts) else 0.55 def _evidence( self, tests: list[ObservedFact], examples: list[ObservedFact], docs: list[ObservedFact], ) -> list[CandidateEvidenceDraft]: evidence: list[CandidateEvidenceDraft] = [] for fact in tests: evidence.append( CandidateEvidenceDraft( type="test", reference=fact.path, strength="strong", source_refs=self._source_refs([fact]), ) ) for fact in examples: evidence.append( CandidateEvidenceDraft( type="example", reference=fact.path, strength="strong", source_refs=self._source_refs([fact]), ) ) for fact in docs: evidence.append( CandidateEvidenceDraft( type="documentation", reference=fact.path, strength="medium", source_refs=self._source_refs([fact]), ) ) return evidence def _feature_type(self, fact: ObservedFact) -> str: lower = f"{fact.name} {fact.path} {fact.value}".lower() if "cli" in lower or "command" in lower: return "CLI" if "api" in lower or "route" in lower or "@app." in lower or "@router." in lower: return "API" return "interface" def _ability_classification( self, repository: Repository, facts: list[ObservedFact], chunks: list[ContentChunk], ) -> tuple[str, list[str]]: text = " ".join( [ repository.name, repository.description or "", " ".join( chunk.text[:600] for chunk in chunks if chunk.kind in {"intent", "documentation"} and chunk.metadata.get("source_role") != "agent_guidance" ), " ".join( f"{fact.kind} {fact.name} {fact.value}" for fact in facts if not ( fact.kind == "llm_provider" and self._utility_relationship(fact) not in {"facade", "adapter"} ) ), ] ).lower() attributes: list[str] = [] if any(token in text for token in ("ssh", "tunnel", "reverse tunnel", "remote access", "connectivity")): attributes.extend(["remote-access", "connectivity"]) if any(token in text for token in ("audit", "health check", "lifecycle", "ops", "operator")): attributes.append("operations") return "it-operations", self._unique(attributes) if any(token in text for token in ("ability", "capability", "feature")): return "repository-intelligence", self._unique(attributes + ["capability-mapping"]) promotable_llm = any( fact.kind == "llm_provider" and self._utility_relationship(fact) in {"owned", "facade", "adapter"} for fact in facts ) if promotable_llm: return "ai-integration", self._unique(attributes + ["llm-provider"]) if any(fact.kind == "interface" for fact in facts): attributes.append("interface") return "developer-tooling", self._unique(attributes) def _interface_attributes( self, interfaces: list[ObservedFact], docs: list[ObservedFact] | None = None, chunks: list[ContentChunk] | None = None, ) -> list[str]: feature_types = {self._feature_type(fact) for fact in interfaces} attributes = ["api" if item == "API" else "cli" if item == "CLI" else "callable" for item in feature_types] utility = self._interface_utility_relationship(docs or [], chunks or []) return self._unique(["surface", *attributes, f"utility-{utility}"]) def _interface_utility_relationship( self, docs: list[ObservedFact], chunks: list[ContentChunk], ) -> str: doc_paths = {fact.path for fact in docs} text = " ".join( chunk.text.lower() for chunk in chunks if chunk.path in doc_paths and chunk.kind in {"intent", "documentation"} and chunk.metadata.get("source_role") != "derived_scope" ) if any(token in text for token in ("facade", "proxy", "wrapper", "wraps ")): return "facade" return "owned" def _feature_attributes( self, feature_type: str, facts: list[ObservedFact], ) -> list[str]: attributes = [feature_type] if feature_type == "API": attributes.extend(["surface", "http"]) elif feature_type == "CLI": attributes.extend(["surface", "command"]) else: attributes.append("surface") paths = " ".join(fact.path.lower() for fact in facts) if "test" in paths: attributes.append("test-linked") return self._unique(attributes) def _structure_attributes( self, manifests: list[ObservedFact], frameworks: list[ObservedFact], languages: list[ObservedFact], ) -> list[str]: return self._unique( [ "manifest" if manifests else "", *[fact.name for fact in frameworks], *[fact.name for fact in languages], "utility-dependency" if manifests or frameworks else "", "utility-tooling" if languages and not (manifests or frameworks) else "", "review-required-structural-context", ] ) def _llm_provider_attributes( self, providers: list[ObservedFact], credentials: list[ObservedFact], registries: list[ObservedFact], fallback_policies: list[ObservedFact], ) -> list[str]: return self._unique( [ "llm-provider", *[fact.name.lower() for fact in providers], "credential" if credentials else "", "provider-registry" if registries else "", "fallback-policy" if fallback_policies else "", ] ) def _unique(self, values: list[str]) -> list[str]: result: list[str] = [] seen: set[str] = set() for value in values: item = value.strip() key = item.lower() if not item or key in seen: continue seen.add(key) result.append(item) return result def _significant_terms(self, text: str) -> set[str]: stop_words = { "and", "the", "this", "that", "with", "from", "into", "for", "capability", "repository", "service", } return { term for term in re.findall(r"[a-z0-9]+", text.lower()) if len(term) > 2 and term not in stop_words } def _interface_inputs(self, interfaces: list[ObservedFact]) -> list[str]: feature_types = {self._feature_type(fact) for fact in interfaces} inputs: list[str] = [] if "API" in feature_types: inputs.append("HTTP request") if "CLI" in feature_types: inputs.append("CLI arguments") if not inputs: inputs.append("caller input") return inputs def _interface_outputs(self, interfaces: list[ObservedFact]) -> list[str]: feature_types = {self._feature_type(fact) for fact in interfaces} outputs: list[str] = [] if "API" in feature_types: outputs.append("HTTP response") if "CLI" in feature_types: outputs.append("command output") if not outputs: outputs.append("callable interface result") return outputs def _feature_inputs(self, features: list[CandidateFeatureDraft]) -> list[str]: feature_types = {feature.type for feature in features} inputs: list[str] = [] if "API" in feature_types: inputs.append("HTTP request") if "CLI" in feature_types: inputs.append("CLI arguments") if not inputs: inputs.append("caller input") return inputs def _feature_outputs(self, features: list[CandidateFeatureDraft]) -> list[str]: feature_types = {feature.type for feature in features} outputs: list[str] = [] if "API" in feature_types: outputs.append("HTTP response") if "CLI" in feature_types: outputs.append("command output") if not outputs: outputs.append("callable interface result") return outputs def _feature_name(self, fact: ObservedFact, chunks: list[ContentChunk]) -> str: route_name = self._route_feature_name(fact.value) if route_name: return route_name if self._feature_type(fact) == "CLI": function_name = self._function_name_near_fact(fact, chunks) if function_name: return f"CLI command {function_name}" return fact.value or fact.name def _route_feature_name(self, value: str) -> str: match = re.search(r"@(?:app|router)\.(get|post|put|patch|delete)\((['\"])(.*?)\2", value) if match is None: return "" method = match.group(1).upper() path = match.group(3) return f"{method} {path}" def _function_name_near_fact( self, fact: ObservedFact, chunks: list[ContentChunk], ) -> str: line = fact.metadata.get("line") for chunk in chunks: if chunk.path != fact.path or chunk.kind != "interface": continue if isinstance(line, int) and not (chunk.start_line <= line <= chunk.end_line): continue match = re.search(r"^\s*def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(", chunk.text, re.MULTILINE) if match is not None: return match.group(1) return "" def _ability_confidence( self, *, docs: list[ObservedFact], interfaces: list[ObservedFact], tests: list[ObservedFact], examples: list[ObservedFact], frameworks: list[ObservedFact], languages: list[ObservedFact], ) -> float: return self._confidence( 0.25, [ (0.20, bool(docs)), (0.15, bool(interfaces)), (0.15, bool(tests)), (0.10, bool(examples)), (0.10, bool(frameworks)), (0.05, bool(languages)), ], ) def _interface_confidence( self, *, interfaces: list[ObservedFact], tests: list[ObservedFact], examples: list[ObservedFact], docs: list[ObservedFact], ) -> float: return self._confidence( 0.30, [ (0.20, bool(interfaces)), (0.15, bool(tests)), (0.10, bool(examples)), (0.10, bool(docs)), (0.05, len(interfaces) > 1), ], ) def _structure_confidence( self, *, manifests: list[ObservedFact], frameworks: list[ObservedFact], languages: list[ObservedFact], docs: list[ObservedFact], ) -> float: return self._confidence( 0.25, [ (0.20, bool(manifests)), (0.15, bool(frameworks)), (0.10, bool(languages)), (0.05, bool(docs)), ], ) def _llm_provider_confidence( self, *, providers: list[ObservedFact], credentials: list[ObservedFact], registries: list[ObservedFact], fallback_policies: list[ObservedFact], docs: list[ObservedFact], ) -> float: return self._confidence( 0.35, [ (0.20, bool(providers)), (0.10, len({fact.name for fact in providers}) > 1), (0.10, bool(credentials)), (0.10, bool(registries)), (0.10, bool(fallback_policies)), (0.05, bool(docs)), ], ) def _confidence( self, base: float, factors: list[tuple[float, bool]], ) -> float: score = base + sum(weight for weight, applies in factors if applies) return min(1.0, round(score, 2)) def _ability_description(self, chunks: list[ContentChunk]) -> str: doc_summary = self._document_summary(chunks) if doc_summary: return ( "Candidate repository purpose inferred from repository content: " f"{doc_summary} Review is required before treating this as an " "approved domain ability." ) return ( "Candidate repository purpose inferred from observed repository " "documentation, manifests, languages, and interfaces. Review is " "required before treating this as an approved domain ability." ) def _ability_name( self, repository: Repository, chunks: list[ContentChunk], ) -> str: ops_name = self._operations_ability_name(chunks) if ops_name: return ops_name purpose_text = ( self._intent_purpose_sentence(chunks) or self._scope_one_liner(chunks) or self._documentation_purpose_sentence(chunks) or repository.description ) if purpose_text: normalized = self._imperative_purpose(purpose_text) if normalized: return normalized return f"Support {self._humanize_identifier(repository.name)}" def _intent_purpose_sentence(self, chunks: list[ContentChunk]) -> str: return self._purpose_sentence_for_chunks( [ chunk for chunk in self._purpose_chunks(chunks) if chunk.kind == "intent" or chunk.metadata.get("source_role") == "intent_summary" or chunk.path.lower().endswith("intent.md") ] ) def _documentation_purpose_sentence(self, chunks: list[ContentChunk]) -> str: return self._purpose_sentence_for_chunks( [ chunk for chunk in self._purpose_chunks(chunks) if chunk.kind == "documentation" and chunk.metadata.get("source_role") != "derived_scope" and not chunk.path.lower().endswith("scope.md") ] ) def _purpose_sentence_for_chunks(self, chunks: list[ContentChunk]) -> str: for chunk in chunks: if chunk.kind not in {"intent", "documentation"}: continue lines = [line.strip() for line in chunk.text.splitlines() if line.strip()] paragraph = next((line for line in lines if not line.startswith("#")), "") if paragraph and not self._is_template_boilerplate(paragraph): return paragraph return "" def _scope_one_liner(self, chunks: list[ContentChunk]) -> str: for chunk in sorted(chunks, key=lambda item: (item.path, item.start_line)): if not ( chunk.kind == "scope" or chunk.metadata.get("source_role") == "derived_scope" or chunk.path.lower().endswith("scope.md") ): continue lines = chunk.text.splitlines() for index, raw_line in enumerate(lines): if raw_line.strip().lower() == "## one-liner": for following in lines[index + 1 :]: candidate = following.strip() if not candidate or candidate.startswith("---"): continue if candidate.startswith(">"): continue return candidate.strip(" .") before_first_section: list[str] = [] for raw_line in lines: candidate = raw_line.strip() if candidate.startswith("## "): break before_first_section.append(candidate) for candidate in before_first_section: if ( candidate and not candidate.startswith("#") and not candidate.startswith(">") and not candidate.startswith("---") and not self._is_template_boilerplate(candidate) ): return candidate.strip(" .") return "" def _is_template_boilerplate(self, text: str) -> bool: lowered = text.lower() return ( "git repository template to bootstrap" in lowered or "this file helps you quickly understand" in lowered or "intentionally lightweight and may be incomplete" in lowered ) def _purpose_chunks(self, chunks: list[ContentChunk]) -> list[ContentChunk]: def priority(chunk: ContentChunk) -> tuple[int, str, int]: role = chunk.metadata.get("source_role") path = chunk.path.lower() if role == "intent_summary" or path.endswith("intent.md"): return (0, path, chunk.start_line) if role == "derived_scope" or path.endswith("scope.md"): return (1, path, chunk.start_line) if role == "product_documentation" or path.startswith("readme"): return (2, path, chunk.start_line) return (3, path, chunk.start_line) return sorted( [ chunk for chunk in chunks if chunk.kind in {"intent", "documentation", "scope"} and chunk.metadata.get("source_role") != "agent_guidance" ], key=priority, ) def _operations_ability_name(self, chunks: list[ContentChunk]) -> str: text = " ".join( chunk.text for chunk in self._documentation_chunks(chunks) if chunk.kind == "intent" ).lower() if "ssh reverse tunnel" in text or "ssh reverse tunneling" in text: return "Manage SSH Reverse Tunnel Connectivity" return "" def _imperative_purpose(self, text: str) -> str: cleaned = re.sub(r"\s+", " ", text.strip()) cleaned = re.split(r"[.!?]\s+", cleaned, maxsplit=1)[0] cleaned = re.sub( r"(?i)^this\s+repository\s+exists\s+to\s+provide\s+(?:an?\s+)?", "Provide ", cleaned, ) cleaned = re.sub(r"^[A-Z][A-Za-z0-9_-]*\s+(?:is|provides|offers)\s+", "", cleaned) cleaned = cleaned.strip(" .:-") if not cleaned: return "" words = cleaned.split() if not words: return "" words[0] = self._imperative_verb(words[0]) return self._title_from_words(words[:10]) def _imperative_verb(self, word: str) -> str: if word.isupper(): return word lower = word.lower().strip(",;:") irregular = { "does": "do", "has": "have", "is": "be", } if lower in irregular: return irregular[lower] if lower in {"this"}: return lower if lower.endswith("ies") and len(lower) > 4: return f"{lower[:-3]}y" if lower.endswith(("des", "ses", "tes", "ves", "zes")) and len(lower) > 4: return lower[:-1] if lower.endswith("es") and len(lower) > 3: return lower[:-2] if lower.endswith("s") and len(lower) > 3: return lower[:-1] return lower def _title_from_words(self, words: list[str]) -> str: cleaned_words = [ re.sub(r"[^A-Za-z0-9_/{}-]", "", word) for word in words ] return " ".join( word if word.isupper() else word[:1].upper() + word[1:] for word in cleaned_words if word ) def _humanize_identifier(self, value: str) -> str: spaced = re.sub(r"[_-]+", " ", value) spaced = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", " ", spaced) return self._title_from_words(spaced.split()) def _interface_description(self, chunks: list[ContentChunk]) -> str: interface_summary = self._interface_summary(chunks) if interface_summary: return ( "Expose one or more likely user-facing API or CLI entry points. " f"Source context: {interface_summary} Review is required to name " "the concrete domain behavior." ) return ( "Expose one or more likely user-facing API or CLI entry points. " "Review is required to name the concrete domain behavior." ) def _document_summary(self, chunks: list[ContentChunk]) -> str: for chunk in self._documentation_chunks(chunks): lines = [line.strip() for line in chunk.text.splitlines() if line.strip()] if not lines: continue if chunk.kind == "scope" or chunk.metadata.get("source_role") == "derived_scope": one_liner = self._scope_one_liner([chunk]) if one_liner: return f"SCOPE. {one_liner}" heading = next((line.lstrip("#").strip() for line in lines if line.startswith("#")), "") paragraph = next((line for line in lines if not line.startswith("#")), "") if self._is_template_boilerplate(paragraph): paragraph = "" if heading and paragraph: return f"{heading}. {paragraph}" return heading or paragraph return "" def _documentation_chunks(self, chunks: list[ContentChunk]) -> list[ContentChunk]: def priority(chunk: ContentChunk) -> tuple[int, str, int]: role = chunk.metadata.get("source_role") path = chunk.path.lower() if chunk.kind == "intent" or role == "intent_summary" or path.endswith("intent.md"): return (0, path, chunk.start_line) if chunk.kind == "scope" or role == "derived_scope" or path.endswith("scope.md"): return (1, path, chunk.start_line) return (2, path, chunk.start_line) return sorted( [ chunk for chunk in chunks if chunk.kind in {"intent", "documentation", "scope"} and chunk.metadata.get("source_role") != "agent_guidance" ], key=priority, ) def _interface_summary(self, chunks: list[ContentChunk]) -> str: for chunk in chunks: if chunk.kind != "interface": continue lines = [line.strip() for line in chunk.text.splitlines() if line.strip()] if not lines: continue return " ".join(lines[:3]) return "" def _facts(self, facts: list[ObservedFact], kind: str) -> list[ObservedFact]: return [fact for fact in facts if fact.kind == kind] def _promotable_llm_facts(self, facts: list[ObservedFact]) -> list[ObservedFact]: return [ fact for fact in facts if self._utility_relationship(fact) in {"facade", "adapter"} ] def _utility_relationship(self, fact: ObservedFact) -> str: relationship = fact.metadata.get("utility_relationship") if isinstance(relationship, str) and relationship: return relationship source_role = fact.metadata.get("source_role") if source_role == "implementation_source": lower_path = fact.path.lower() if "adapter" in lower_path or "provider" in lower_path: return "adapter" return "owned" if source_role == "configuration": return "configure" if source_role == "dependency_declaration": return "dependency" if source_role in {"agent_guidance", "ci_tooling"}: return "tooling" if not source_role and fact.path.lower().endswith((".py", ".ts", ".js")): return "owned" return "mention" def _utility_relationship_attributes(self, facts: list[ObservedFact]) -> list[str]: relationships = sorted({self._utility_relationship(fact) for fact in facts}) return [f"utility-{relationship}" for relationship in relationships] def _source_refs(self, facts: list[ObservedFact]) -> list[SourceReference]: return [ SourceReference( fact_id=fact.id, path=fact.path, kind=fact.kind, name=fact.name, line=fact.metadata.get("line"), ) for fact in facts ]