diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..c40214a --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,90 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Commands + +```bash +# Install +pip install -e ".[dev]" + +# Run dev server (port 8001) +uvicorn repo_registry.web_api.app:app --reload --port 8001 + +# Run tests +pytest +pytest -k "test_scanner" # filter by keyword +pytest tests/test_web_api.py # single file + +# Health check +curl http://127.0.0.1:8001/health +``` + +Note: `AGENTS.md` shows `src.repo_registry.app:app` but the correct module path is `repo_registry.web_api.app:app` (as installed via `src/`). + +## Architecture + +The service maps Git repositories to reviewable scope maps using a fixed hierarchy: + +``` +Scope → Ability → Capability → Feature → Evidence → ObservedFact +``` + +**Data flow for an analysis run:** + +1. `POST /repos/{id}/analysis-runs` triggers the pipeline in `RegistryService.run_analysis()` +2. `GitIngestionService` clones or resolves the repo path +3. `RepositoryMetadataExtractor` reads pyproject.toml / package.json / README +4. `DeterministicScanner` produces `ObservedFact` objects (files, languages, manifests, APIs, etc.) +5. `ContentExtractor` chunks files into searchable segments +6. `CandidateGraphGenerator` builds a draft ability→capability→feature→evidence tree from facts +7. Optionally, `LLMCandidateExtractor` proposes additional candidates (requires `REPO_REGISTRY_LLM_ENABLED=true`) +8. Candidates are stored; humans or agents review them via `POST .../candidate-graph/approve` +9. 
Approved characteristics feed `ScopeGenerator` to produce `SCOPE.md` + +**Key source locations:** + +| Component | Path | +|-----------|------| +| FastAPI routes + DI | `src/repo_registry/web_api/app.py` | +| Orchestration | `src/repo_registry/core/service.py` | +| Frozen dataclasses | `src/repo_registry/core/models.py` | +| Deterministic scanner | `src/repo_registry/repo_scanning/scanner.py` | +| Candidate graph builder | `src/repo_registry/candidate_graph/generator.py` | +| SQLite store | `src/repo_registry/storage/sqlite.py` | +| Schema migration | `migrations/0001_initial.sql` | + +**Storage:** SQLite at `var/repo-registry.sqlite3` (auto-created). Schema migrations run at startup. Dynamic columns are added to support evidence relationships, classification, and expectation gaps. + +**LLM extraction** is optional and disabled by default. Enable with `REPO_REGISTRY_LLM_ENABLED=true` plus `REPO_REGISTRY_LLM_PROVIDER` and `REPO_REGISTRY_LLM_MODEL`. The `llm-connect` sibling package provides the adapter abstraction. + +**Semantic search** uses `HashingEmbeddingProvider` by default — deterministic, no external service required. + +## Environment Variables + +| Variable | Default | Purpose | +|----------|---------|---------| +| `REPO_REGISTRY_DATABASE_PATH` | `var/repo-registry.sqlite3` | SQLite file | +| `REPO_REGISTRY_CHECKOUT_ROOT` | `var/checkouts` | Git clone cache | +| `REPO_REGISTRY_LLM_ENABLED` | `false` | Enable LLM extraction | +| `REPO_REGISTRY_LLM_PROVIDER` | — | e.g. `gemini`, `anthropic` | +| `REPO_REGISTRY_LLM_MODEL` | — | e.g. `gemini-2.5-flash` | +| `REPO_REGISTRY_STATE_HUB_BASE_URL` | `http://127.0.0.1:8000` | State Hub for coordination | + +## State Hub & Workplans + +Active work is tracked in `workplans/RREG-WP-*.md` — these files are the source of truth (ADR-001). The Custodian State Hub caches this state; workplan files take precedence.
+
+Session protocol (see `AGENTS.md` for full curl examples):
+- **Start:** check `workplans/` status headers and State Hub inbox
+- **Close:** update task statuses in workplan files, then `POST /progress/` and sync via `POST /repos/repo-scoping/sync`
+
+Workplan sync warns on C-17 (unpushed commits) — that's normal. A `"result": "fail"` needs investigation.
+
+## Docs
+
+Design decisions and terminology live in `docs/`:
+- `docs/terminology.md` — characteristic model definitions
+- `docs/scope-md-spec.md` — SCOPE.md format
+- `docs/characteristic-evidence-model.md` — evidence target kinds
+- `docs/classification-strategy.md` — how characteristics are classified
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..b88f41b
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,120 @@
+# Developer targets for running the repo-registry API server in the background.
+#
+# Every target is a command rather than a file, hence the .PHONY declaration.
+# Knobs can be overridden per invocation, e.g. `make start PORT=9000`.
+
+PORT         := 8002
+PIDFILE      := var/api.pid
+# Named after PORT so that an overridden port gets a matching log file.
+LOGFILE      := var/api-$(PORT).log
+UVICORN      := .venv/bin/uvicorn
+# Seconds `stop` waits for a signalled process to exit before giving up.
+STOP_TIMEOUT := 10
+
+# Shell pipeline that prints the PID of each process listening on
+# 127.0.0.1:$(PORT). It sticks to POSIX awk (match + RSTART/RLENGTH) because
+# the three-argument match() is a gawk extension and a syntax error under mawk
+# or BusyBox awk. The trailing [ \t] keeps PORT=8002 from matching :80021.
+# Recursively expanded (=) on purpose: the $$ escapes must survive until a
+# recipe expands them.
+LISTENER_PIDS = ss -tlnp 2>/dev/null | \
+  awk '/127\.0\.0\.1:$(PORT)[ \t]/ && match($$0, /pid=[0-9]+/) { print substr($$0, RSTART + 4, RLENGTH - 4) }'
+
+.DEFAULT_GOAL := help
+.PHONY: help start stop restart status showlogs
+
+# Prints every target that carries a `## description` suffix.
+help: ## List available targets
+	@awk 'BEGIN {FS = ":.*## "} /^[a-zA-Z_-]+:.*## / {printf " %-12s %s\n", $$1, $$2}' $(MAKEFILE_LIST)
+
+# No-op when the PID file names a live process. Refuses to launch when
+# $(UVICORN) cannot be resolved, so a failed launch is never reported as
+# "Started" with a dead PID on record.
+start: ## Start the API server in the background (PORT, default 8002)
+	@mkdir -p $(sort $(dir $(PIDFILE) $(LOGFILE)))
+	@if [ -f $(PIDFILE) ] && kill -0 "$$(cat $(PIDFILE))" 2>/dev/null; then \
+		echo "Already running (PID $$(cat $(PIDFILE)))"; \
+	elif ! command -v $(UVICORN) >/dev/null 2>&1; then \
+		echo "Cannot start: $(UVICORN) is not an executable command" >&2; \
+		exit 1; \
+	else \
+		nohup $(UVICORN) repo_registry.web_api.app:app --port $(PORT) \
+			>> $(LOGFILE) 2>&1 & \
+		echo $$! > $(PIDFILE); \
+		echo "Started (PID $$!) — http://127.0.0.1:$(PORT)/ui"; \
+	fi
+
+# Signals the PID from the PID file plus any listener found on the port, then
+# waits up to STOP_TIMEOUT seconds per process so that a following `start` does
+# not race the old process for the port.
+stop: ## Stop the API server and wait for it to exit
+	@PIDS=""; \
+	if [ -f $(PIDFILE) ]; then \
+		FILE_PID=$$(cat $(PIDFILE)); \
+		if [ -n "$$FILE_PID" ] && kill -0 "$$FILE_PID" 2>/dev/null; then \
+			PIDS="$$FILE_PID"; \
+		fi; \
+		rm -f $(PIDFILE); \
+	fi; \
+	for p in $$($(LISTENER_PIDS)); do \
+		case " $$PIDS " in *" $$p "*) ;; *) PIDS="$$PIDS $$p" ;; esac; \
+	done; \
+	if [ -z "$$PIDS" ]; then \
+		echo "Not running"; \
+		exit 0; \
+	fi; \
+	for p in $$PIDS; do \
+		kill "$$p" 2>/dev/null || continue; \
+		waited=0; \
+		while kill -0 "$$p" 2>/dev/null; do \
+			if [ "$$waited" -ge $(STOP_TIMEOUT) ]; then \
+				echo "PID $$p still running after $(STOP_TIMEOUT)s" >&2; \
+				exit 1; \
+			fi; \
+			sleep 1; \
+			waited=$$((waited + 1)); \
+		done; \
+		echo "Stopped PID $$p"; \
+	done
+
+# Runs stop and start as sequential sub-makes; listing them as prerequisites
+# would let `make -j` run them in parallel.
+restart: ## Restart the API server
+	@$(MAKE) --no-print-directory stop
+	@$(MAKE) --no-print-directory start
+
+# Resolves the PID from the PID file, falling back to the port listener, then
+# reports start time, CPU, memory and RSS via ps.
+status: ## Show whether the API server is running and its resource usage
+	@PID=""; \
+	if [ -f $(PIDFILE) ]; then \
+		CANDIDATE=$$(cat $(PIDFILE)); \
+		if [ -n "$$CANDIDATE" ] && kill -0 "$$CANDIDATE" 2>/dev/null; then \
+			PID=$$CANDIDATE; \
+		fi; \
+	fi; \
+	if [ -z "$$PID" ]; then \
+		PID=$$($(LISTENER_PIDS) | head -n 1); \
+	fi; \
+	if [ -z "$$PID" ]; then \
+		echo "Status: stopped"; \
+		echo "URL: http://127.0.0.1:$(PORT)/ui"; \
+		echo "Log: $(LOGFILE)"; \
+	else \
+		START=$$(ps -o lstart= -p "$$PID" 2>/dev/null | sed 's/^ *//'); \
+		CPU=$$(ps -o %cpu= -p "$$PID" 2>/dev/null | tr -d ' '); \
+		MEM=$$(ps -o %mem= -p "$$PID" 2>/dev/null | tr -d ' '); \
+		RSS=$$(ps -o rss= -p "$$PID" 2>/dev/null | tr -d ' '); \
+		RSS_MB=$$(( $${RSS:-0} / 1024 )); \
+		echo "Status: running"; \
+		echo "PID: $$PID"; \
+		echo "Since: $$START"; \
+		echo "CPU: $${CPU}%"; \
+		echo "Memory: $${MEM}% ($$RSS_MB MB RSS)"; \
+		echo "URL: http://127.0.0.1:$(PORT)/ui"; \
+		echo "Log: $(LOGFILE)"; \
+	fi
+
+# Follows the log file until interrupted.
+showlogs: ## Tail the API server log
+	@tail -f $(LOGFILE)