diff --git a/Makefile b/Makefile index 1bf0772..a378dc4 100644 --- a/Makefile +++ b/Makefile @@ -73,10 +73,16 @@ list-repos: @test -n "$(DOMAIN)" || (echo "ERROR: DOMAIN is required."; exit 1) curl -sf "http://127.0.0.1:8000/repos/?domain=$(DOMAIN)" | python3 -m json.tool -## Ingest a repo's lockfile into the SBOM store: make ingest-sbom REPO=the-custodian [LOCKFILE=uv.lock] +## Ingest SBOM data for a repo. +## Single lockfile (explicit): make ingest-sbom REPO=the-custodian LOCKFILE=/path/to/uv.lock +## Scan all lockfiles in tree: make ingest-sbom REPO=the-custodian SCAN=1 REPO_PATH=/home/worsch/the-custodian +## Auto-detect at repo root: make ingest-sbom REPO=the-custodian REPO_PATH=/home/worsch/the-custodian ingest-sbom: - @test -n "$(REPO)" || (echo "ERROR: REPO is required. Usage: make ingest-sbom REPO= [LOCKFILE=]"; exit 1) - uv run python scripts/ingest_sbom.py --repo "$(REPO)" $(if $(LOCKFILE),--lockfile "$(LOCKFILE)",) + @test -n "$(REPO)" || (echo "ERROR: REPO is required."; exit 1) + uv run python scripts/ingest_sbom.py --repo "$(REPO)" \ + $(if $(LOCKFILE),--lockfile "$(LOCKFILE)") \ + $(if $(SCAN),--scan) \ + $(if $(REPO_PATH),--repo-path "$(REPO_PATH)") ## Check a repo for ADR-001 compliance: make validate-adr REPO=/path/to/repo [DOMAIN=custodian] validate-adr: diff --git a/dashboard/src/sbom.md b/dashboard/src/sbom.md index 76abff6..fc6b024 100644 --- a/dashboard/src/sbom.md +++ b/dashboard/src/sbom.md @@ -8,12 +8,13 @@ const API = "http://127.0.0.1:8000"; ```js // Fetch SBOM data on load -let _entries = [], _report = {groups: [], copyleft_direct_count: 0}, _repos = []; +let _entries = [], _report = {groups: [], copyleft_direct_count: 0}, _repos = [], _domains = []; try { - [_entries, _report, _repos] = await Promise.all([ + [_entries, _report, _repos, _domains] = await Promise.all([ fetch(`${API}/sbom/`).then(r => r.ok ? r.json() : []), fetch(`${API}/sbom/report/licences/`).then(r => r.ok ? r.json() : {groups:[], copyleft_direct_count: 0}), fetch(`${API}/repos/`).then(r => r.ok ? r.json() : []), + fetch(`${API}/domains/`).then(r => r.ok ? r.json() : []), ]); } catch {} ``` @@ -22,13 +23,24 @@ try { const entries = _entries ?? []; const report = _report ?? {groups: [], copyleft_direct_count: 0}; const repos = _repos ?? []; +const domains = _domains ?? []; const groups = report.groups ?? []; const riskCount = report.copyleft_direct_count ?? 0; + +// Domain + repo lookups +const domainById = Object.fromEntries(domains.map(d => [d.id, d])); +const repoById = Object.fromEntries(repos.map(r => [r.id, r])); +const repoDomain = Object.fromEntries(repos.map(r => [r.id, domainById[r.domain_id]?.slug ?? "—"])); +const domainSlugs = [...new Set(repos.map(r => repoDomain[r.id]).filter(s => s !== "—"))].sort(); + +// Copyleft detector (mirrors server-side logic) +const COPYLEFT_KW = ["GPL", "AGPL", "LGPL", "EUPL", "CDDL", "MPL"]; +const isCopyleft = spdx => spdx && COPYLEFT_KW.some(k => spdx.toUpperCase().includes(k)); ``` # SBOM -## Licence Risk +## Overview ```js const riskBadge = riskCount === 0 @@ -43,6 +55,10 @@ display(html`

Repos Scanned

${new Set(entries.map(e => e.repo_id)).size}

+
+

Domains Covered

+

${domainSlugs.length || new Set(Object.values(repoDomain).filter(s => s !== "—")).size}

+

Licence Risk

${riskCount}

@@ -55,13 +71,50 @@ display(html`
`); ``` +## By Domain + +```js +if (entries.length === 0) { + display(html`

No SBOM data ingested yet. Run make ingest-sbom REPO=<slug> SCAN=1 REPO_PATH=<path>.

`); +} else { + // Group entries by domain + const byDomain = {}; + for (const e of entries) { + const slug = repoDomain[e.repo_id] ?? "—"; + (byDomain[slug] = byDomain[slug] ?? []).push(e); + } + + const domainTableRows = Object.entries(byDomain).map(([slug, es]) => { + const dom = domains.find(d => d.slug === slug); + const repoCount = new Set(es.map(e => e.repo_id)).size; + const directProd = es.filter(e => e.is_direct && !e.is_dev); + const copyleftRisk = directProd.filter(e => isCopyleft(e.license_spdx)).length; + const ecosystems = [...new Set(es.map(e => e.ecosystem))].sort().join(", "); + return { + domain: dom?.name ?? slug, + repos: repoCount, + packages: es.length, + direct: directProd.length, + copyleft: copyleftRisk, + ecosystems, + }; + }).sort((a, b) => a.domain.localeCompare(b.domain)); + + display(Inputs.table(domainTableRows, { + columns: ["domain", "repos", "packages", "direct", "copyleft", "ecosystems"], + header: {domain: "Domain", repos: "Repos", packages: "All Pkgs", direct: "Direct Prod", copyleft: "Copyleft ⚠", ecosystems: "Ecosystems"}, + maxWidth: 900, + })); +} +``` + ## Licence Distribution ```js import * as Plot from "npm:@observablehq/plot"; if (groups.length === 0) { - display(html`

No SBOM data ingested yet. Run make ingest-sbom REPO=<slug>.

`); + display(html`

No SBOM data ingested yet.

`); } else { const plotData = groups.slice(0, 15).map(g => ({ licence: g.license_spdx ?? "(unknown)", @@ -98,6 +151,57 @@ if (copyleftGroups.length === 0) { ${g.repos.join(", ")}
`)} + +

Note: dual-licensed packages (e.g. "MIT OR GPL-3.0") are flagged conservatively. Review if the non-copyleft variant is used.

`); +} +``` + +## By Repo + +```js +// Group entries by repo, sorted by domain then repo name +const byRepo = {}; +for (const e of entries) { + (byRepo[e.repo_id] = byRepo[e.repo_id] ?? []).push(e); +} + +const repoSections = Object.entries(byRepo) + .map(([repoId, es]) => { + const repo = repoById[repoId]; + const domSlug = repoDomain[repoId] ?? "—"; + const dom = domains.find(d => d.slug === domSlug); + const directProd = es.filter(e => e.is_direct && !e.is_dev); + const copyleftRisk = directProd.filter(e => isCopyleft(e.license_spdx)).length; + const ecosystems = [...new Set(es.map(e => e.ecosystem))].sort(); + return { repoId, repo, dom, domSlug, es, directProd, copyleftRisk, ecosystems }; + }) + .sort((a, b) => (a.domSlug + a.repo?.slug).localeCompare(b.domSlug + b.repo?.slug)); + +if (repoSections.length === 0) { + display(html`

No repo data.

`); +} else { + display(html`
+ ${repoSections.map(({repoId, repo, dom, domSlug, es, directProd, copyleftRisk, ecosystems}) => html` +
+ + ${dom?.name ?? domSlug} + ${repo?.slug ?? repoId.slice(0,8)} + ${es.length} pkgs · ${ecosystems.join(" + ")} · ${directProd.length} direct + ${copyleftRisk > 0 ? html`⚠ ${copyleftRisk} copyleft` : ""} + +
+ ${Inputs.table(es.slice(0, 200).map(e => ({ + Package: e.package_name, + Version: e.package_version ?? "—", + Ecosystem: e.ecosystem, + Licence: e.license_spdx ?? "—", + Direct: e.is_direct ? "✓" : "", + Dev: e.is_dev ? "✓" : "", + })), {maxWidth: 860})} + ${es.length > 200 ? html`

Showing first 200 of ${es.length}

` : ""} +
+
+ `)}
`); } ``` @@ -106,19 +210,19 @@ if (copyleftGroups.length === 0) { ```js // Filters +const domainOpts = ["all", ...domainSlugs]; +const domainFilter = Inputs.select(domainOpts, {label: "Domain", value: "all"}); const ecoFilter = Inputs.select(["all", "python", "node", "rust", "go", "java", "other"], {label: "Ecosystem", value: "all"}); const directOnly = Inputs.toggle({label: "Direct deps only", value: false}); const prodOnly = Inputs.toggle({label: "Prod deps only (no dev)", value: false}); display(html`
- ${ecoFilter}${directOnly}${prodOnly} + ${domainFilter}${ecoFilter}${directOnly}${prodOnly}
`); ``` ```js -// Build repo_id → slug lookup -const repoById = Object.fromEntries(_repos.map(r => [r.id, r.slug])); - const filteredEntries = entries.filter(e => + (domainFilter.value === "all" || repoDomain[e.repo_id] === domainFilter.value) && (ecoFilter.value === "all" || e.ecosystem === ecoFilter.value) && (!directOnly.value || e.is_direct) && (!prodOnly.value || !e.is_dev) @@ -129,22 +233,37 @@ display(Inputs.table(filteredEntries.map(e => ({ Version: e.package_version ?? "—", Ecosystem: e.ecosystem, Licence: e.license_spdx ?? "—", - Repo: repoById[e.repo_id] ?? e.repo_id?.slice(0, 8) ?? "—", + Domain: repoDomain[e.repo_id] ?? "—", + Repo: repoById[e.repo_id]?.slug ?? e.repo_id?.slice(0, 8) ?? "—", Direct: e.is_direct ? "✓" : "", Dev: e.is_dev ? "✓" : "", -})), {maxWidth: 900})); +})), {maxWidth: 960})); ``` diff --git a/scripts/ingest_sbom.py b/scripts/ingest_sbom.py index c21b7d8..59ce949 100644 --- a/scripts/ingest_sbom.py +++ b/scripts/ingest_sbom.py @@ -188,9 +188,19 @@ _LOCKFILE_PARSERS = { "Cargo.lock": _parse_cargo_lock, } +# Directories that never contain project-level lockfiles +_SKIP_DIRS = { + ".git", ".hg", ".svn", + ".venv", "venv", ".env", + "node_modules", + "__pycache__", ".mypy_cache", ".pytest_cache", ".ruff_cache", + "dist", "build", ".build", "target", + ".tox", ".nox", +} + def detect_lockfile(repo_path: Path) -> tuple[Path, str] | None: - """Return (lockfile_path, ecosystem) for the first recognised lockfile found.""" + """Return (lockfile_path, filename) for the first recognised lockfile at repo root.""" for name in _LOCKFILE_PARSERS: candidate = repo_path / name if candidate.exists(): @@ -198,6 +208,17 @@ def detect_lockfile(repo_path: Path) -> tuple[Path, str] | None: return None +def detect_lockfiles_recursive(repo_path: Path) -> list[Path]: + """Walk repo_path and return all recognised lockfiles, skipping non-dep dirs.""" + found: list[Path] = [] + for dirpath, dirnames, filenames in os.walk(repo_path): + dirnames[:] = sorted(d for d in dirnames if d not in _SKIP_DIRS) + for name in _LOCKFILE_PARSERS: + if name in filenames: + found.append(Path(dirpath) / name) + return found + + def parse_lockfile(lockfile_path: Path) -> list[dict]: filename = lockfile_path.name parser = _LOCKFILE_PARSERS.get(filename) @@ -236,38 +257,60 @@ def post_ingest(api_base: str, repo_slug: str, entries: list[dict]) -> dict: # --------------------------------------------------------------------------- def main() -> None: - parser = argparse.ArgumentParser(description="Ingest a lockfile into the State Hub SBOM store.") + parser = argparse.ArgumentParser(description="Ingest a repo's lockfiles into the State Hub SBOM store.") parser.add_argument("--repo", required=True, help="Managed-repo slug (e.g. 'the-custodian')") - parser.add_argument("--lockfile", help="Path to lockfile (auto-detected if omitted)") - parser.add_argument("--repo-path", default=".", help="Repo root for auto-detection (default: cwd)") + parser.add_argument("--lockfile", action="append", dest="lockfiles", + metavar="PATH", help="Path to a specific lockfile (repeatable)") + parser.add_argument("--repo-path", default=".", help="Repo root for auto-detection/scan (default: cwd)") + parser.add_argument("--scan", action="store_true", + help="Recursively find ALL lockfiles under --repo-path (handles multi-ecosystem repos)") parser.add_argument("--api-base", default=API_BASE, help="State Hub API base URL") parser.add_argument("--dry-run", action="store_true", help="Parse only — do not submit") args = parser.parse_args() - if args.lockfile: - lockfile_path = Path(args.lockfile).resolve() + repo_root = Path(args.repo_path).resolve() + lockfile_paths: list[Path] = [] + + if args.lockfiles: + lockfile_paths = [Path(lf).resolve() for lf in args.lockfiles] + elif args.scan: + lockfile_paths = detect_lockfiles_recursive(repo_root) + if not lockfile_paths: + print(f"No lockfiles found under '{repo_root}'.", file=sys.stderr) + sys.exit(1) + print(f"Scan found {len(lockfile_paths)} lockfile(s):") + for lf in lockfile_paths: + print(f" {lf.relative_to(repo_root) if lf.is_relative_to(repo_root) else lf}") else: - found = detect_lockfile(Path(args.repo_path).resolve()) + found = detect_lockfile(repo_root) if not found: print( - f"No recognised lockfile found in '{args.repo_path}'. " - "Supported: " + ", ".join(_LOCKFILE_PARSERS), + f"No recognised lockfile found in '{repo_root}'. " + f"Supported: {', '.join(_LOCKFILE_PARSERS)}. " + "Use --scan to search subdirectories.", file=sys.stderr, ) sys.exit(1) lockfile_path, _ = found print(f"Auto-detected: {lockfile_path}") + lockfile_paths = [lockfile_path] - entries = parse_lockfile(lockfile_path) - print(f"Parsed {len(entries)} packages from {lockfile_path.name}") + all_entries: list[dict] = [] + for lf in lockfile_paths: + parsed = parse_lockfile(lf) + rel = lf.relative_to(repo_root) if lf.is_relative_to(repo_root) else lf + print(f" {rel}: {len(parsed)} packages") + all_entries.extend(parsed) + + print(f"Total: {len(all_entries)} packages across {len(lockfile_paths)} lockfile(s)") if args.dry_run: - print(json.dumps(entries[:5], indent=2)) - if len(entries) > 5: - print(f" … and {len(entries) - 5} more") + print(json.dumps(all_entries[:5], indent=2)) + if len(all_entries) > 5: + print(f" … and {len(all_entries) - 5} more") return - result = post_ingest(args.api_base, args.repo, entries) + result = post_ingest(args.api_base, args.repo, all_entries) print(f"Ingested {result.get('ingested', '?')} entries for repo '{args.repo}'") print(f"Snapshot at: {result.get('snapshot_at', '?')}")