diff --git a/ansible/inventories/local.ini b/ansible/inventories/local.ini
new file mode 100644
index 0000000..df8b5f6
--- /dev/null
+++ b/ansible/inventories/local.ini
@@ -0,0 +1,2 @@
+[all]
+localhost ansible_connection=local
diff --git a/ansible/playbook.yml b/ansible/playbook.yml
new file mode 100644
index 0000000..21d1400
--- /dev/null
+++ b/ansible/playbook.yml
@@ -0,0 +1,8 @@
+- name: TeleMcp bootstrap
+  hosts: all
+  become: yes
+  vars:
+    values_dir: /opt/telemcp-values
+  roles:
+    - k8s_host
+    - telemetry_stack
diff --git a/ansible/roles/k8s_host/tasks/main.yml b/ansible/roles/k8s_host/tasks/main.yml
new file mode 100644
index 0000000..0c345a9
--- /dev/null
+++ b/ansible/roles/k8s_host/tasks/main.yml
@@ -0,0 +1,22 @@
+- name: Ensure base packages
+  apt:
+    name: [curl, jq, gnupg, ca-certificates]
+    state: present
+    update_cache: yes
+
+- name: Install Helm if missing
+  # pipefail makes a failed download fail the task; without it the pipeline
+  # reports bash's exit status (0 on empty input) and the error is hidden.
+  shell: |
+    set -o pipefail
+    if ! command -v helm >/dev/null 2>&1; then
+      curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+    fi
+  args:
+    executable: /bin/bash
+
+- name: Ensure kube config dir
+  file:
+    path: /root/.kube
+    state: directory
+    mode: "0700"
diff --git a/ansible/roles/telemetry_stack/defaults/main.yml b/ansible/roles/telemetry_stack/defaults/main.yml
new file mode 100644
index 0000000..a0447cd
--- /dev/null
+++ b/ansible/roles/telemetry_stack/defaults/main.yml
@@ -0,0 +1,4 @@
+telemcp_namespace_monitoring: monitoring
+telemcp_namespace_logging: logging
+telemcp_namespace_mcp: mcp
+values_dir: /opt/telemcp-values
diff --git a/ansible/roles/telemetry_stack/tasks/main.yml b/ansible/roles/telemetry_stack/tasks/main.yml
new file mode 100644
index 0000000..e521a54
--- /dev/null
+++ b/ansible/roles/telemetry_stack/tasks/main.yml
@@ -0,0 +1,50 @@
+- name: Create values directory
+  file:
+    path: "{{ values_dir }}"
+    state: directory
+    mode: "0755"
+
+- name: Install helm repos
+  shell: |
+    helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+    helm repo add grafana https://grafana.github.io/helm-charts
+    helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts
+    helm repo update
+  args:
+    executable: /bin/bash
+
+- name: Copy values files
+  copy:
+    src: "{{ item.src }}"
+    dest: "{{ values_dir }}/{{ item.dest }}"
+    mode: "0644"
+  with_items:
+    - { src: "../../../helm/values/kube-prometheus-stack.values.yaml", dest: "kube-prometheus-stack.values.yaml" }
+    - { src: "../../../helm/values/loki.values.yaml", dest: "loki.values.yaml" }
+    - { src: "../../../helm/values/otel-collector.values.yaml", dest: "otel-collector.values.yaml" }
+    - { src: "../../../helm/mcp-telemetry-bridge/values.yaml", dest: "mcp-telemetry-bridge.values.yaml" }
+
+- name: Deploy kube-prometheus-stack
+  shell: |
+    helm upgrade --install monitoring prometheus-community/kube-prometheus-stack -n {{ telemcp_namespace_monitoring }} --create-namespace -f {{ values_dir }}/kube-prometheus-stack.values.yaml
+  args: { executable: /bin/bash }
+
+- name: Deploy Loki
+  shell: |
+    helm upgrade --install loki grafana/loki -n {{ telemcp_namespace_logging }} --create-namespace -f {{ values_dir }}/loki.values.yaml
+  args: { executable: /bin/bash }
+
+- name: Deploy Promtail
+  shell: |
+    helm upgrade --install promtail grafana/promtail -n {{ telemcp_namespace_logging }} --create-namespace
+  args: { executable: /bin/bash }
+
+- name: (Optional) Deploy OpenTelemetry Collector
+  shell: |
+    helm upgrade --install otel-collector open-telemetry/opentelemetry-collector -n observability --create-namespace -f {{ values_dir }}/otel-collector.values.yaml
+  args: { executable: /bin/bash }
+
+- name: Deploy MCP Telemetry Bridge
+  shell: |
+    helm upgrade --install mcp-telemetry {{ playbook_dir }}/../helm/mcp-telemetry-bridge -n {{ telemcp_namespace_mcp }} --create-namespace -f {{ values_dir }}/mcp-telemetry-bridge.values.yaml
+  args: { executable: /bin/bash }
diff --git
a/assets/TeleMcpLogo.png b/assets/TeleMcpLogo.png
new file mode 100644
index 0000000..ca2742e
Binary files /dev/null and b/assets/TeleMcpLogo.png differ
diff --git a/environments/dev/README.md b/environments/dev/README.md
new file mode 100644
index 0000000..3007476
--- /dev/null
+++ b/environments/dev/README.md
@@ -0,0 +1 @@
+You can deploy charts directly with Ansible or manage them with Helmfile/Argo CD later.
diff --git a/helm/mcp-telemetry-bridge/Chart.yaml b/helm/mcp-telemetry-bridge/Chart.yaml
new file mode 100644
index 0000000..803844a
--- /dev/null
+++ b/helm/mcp-telemetry-bridge/Chart.yaml
@@ -0,0 +1,6 @@
+apiVersion: v2
+name: mcp-telemetry-bridge
+description: MCP Telemetry Bridge for TeleMcp
+type: application
+version: 0.1.0
+appVersion: "0.1.0"
diff --git a/helm/mcp-telemetry-bridge/templates/deployment.yaml b/helm/mcp-telemetry-bridge/templates/deployment.yaml
new file mode 100644
index 0000000..702cef2
--- /dev/null
+++ b/helm/mcp-telemetry-bridge/templates/deployment.yaml
@@ -0,0 +1,37 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: mcp-telemetry-bridge
+spec:
+  replicas: {{ .Values.replicaCount }}
+  selector:
+    matchLabels: { app: mcp-telemetry-bridge }
+  template:
+    metadata:
+      labels: { app: mcp-telemetry-bridge }
+    spec:
+      serviceAccountName: {{ .Values.serviceAccount.name }}
+      containers:
+        - name: bridge
+          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          ports:
+            - name: http
+              containerPort: 8000
+          # `quote` escapes embedded quotes/backslashes that a hand-written
+          # "..." wrapper would emit verbatim, producing invalid YAML.
+          env:
+            {{- range .Values.env }}
+            - name: {{ .name }}
+              value: {{ .value | quote }}
+            {{- end }}
+          readinessProbe:
+            httpGet: { path: /healthz, port: http }
+            initialDelaySeconds: 3
+            periodSeconds: 5
+          livenessProbe:
+            httpGet: { path: /healthz, port: http }
+            initialDelaySeconds: 10
+            periodSeconds: 10
+          resources:
+{{ toYaml .Values.resources | indent 12 }}
diff --git a/helm/mcp-telemetry-bridge/templates/networkpolicy.yaml
b/helm/mcp-telemetry-bridge/templates/networkpolicy.yaml
new file mode 100644
index 0000000..841494d
--- /dev/null
+++ b/helm/mcp-telemetry-bridge/templates/networkpolicy.yaml
@@ -0,0 +1,34 @@
+{{- if .Values.networkPolicy.enabled }}
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: mcp-telemetry-bridge-deny-all
+spec:
+  podSelector:
+    matchLabels:
+      app: mcp-telemetry-bridge
+  policyTypes: ["Ingress","Egress"]
+  ingress:
+    - {} # empty rule: admits ingress from every source; tighten as needed
+  egress:
+    - to:
+        - namespaceSelector: {}
+      ports:
+        - protocol: TCP
+          port: 9090 # Prometheus
+        - protocol: TCP
+          port: 3100 # Loki
+    # DNS and the Kubernetes API server: the bridge reaches Prometheus, Loki
+    # and the API by service name. No `to` clause, because these endpoints are
+    # not necessarily pods a namespaceSelector can match.
+    # NOTE(review): 6443 assumes policy is enforced after service DNAT - confirm.
+    - ports:
+        - protocol: UDP
+          port: 53
+        - protocol: TCP
+          port: 53
+        - protocol: TCP
+          port: 443
+        - protocol: TCP
+          port: 6443
+{{- end }}
diff --git a/helm/mcp-telemetry-bridge/templates/rbac.yaml b/helm/mcp-telemetry-bridge/templates/rbac.yaml
new file mode 100644
index 0000000..adc37c9
--- /dev/null
+++ b/helm/mcp-telemetry-bridge/templates/rbac.yaml
@@ -0,0 +1,26 @@
+{{- if .Values.rbac.create }}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: mcp-telemetry-readonly
+rules:
+  - apiGroups: [""]
+    resources: ["pods","pods/log","nodes","events","namespaces","services","endpoints"]
+    verbs: ["get","list","watch"]
+  - apiGroups: ["apps"]
+    resources: ["deployments","daemonsets","statefulsets","replicasets"]
+    verbs: ["get","list","watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: mcp-telemetry-readonly-binding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: mcp-telemetry-readonly
+subjects:
+  - kind: ServiceAccount
+    name: {{ .Values.serviceAccount.name }}
+    namespace: {{ .Release.Namespace }}
+{{- end }}
diff --git a/helm/mcp-telemetry-bridge/templates/service.yaml b/helm/mcp-telemetry-bridge/templates/service.yaml
new file mode 100644
index 0000000..ef0cdc1
--- /dev/null
+++ b/helm/mcp-telemetry-bridge/templates/service.yaml
@@ -0,0 +1,11 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: mcp-telemetry-bridge
+spec:
+  selector: { app: mcp-telemetry-bridge }
+  ports:
+    - name: http
+      port: {{ .Values.service.port }}
+      targetPort: 8000
+  type: {{ .Values.service.type }}
diff --git a/helm/mcp-telemetry-bridge/templates/serviceaccount.yaml b/helm/mcp-telemetry-bridge/templates/serviceaccount.yaml
new file mode 100644
index 0000000..07e6523
--- /dev/null
+++ b/helm/mcp-telemetry-bridge/templates/serviceaccount.yaml
@@ -0,0 +1,6 @@
+{{- if .Values.serviceAccount.create }}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ .Values.serviceAccount.name }}
+{{- end }}
diff --git a/helm/mcp-telemetry-bridge/values.yaml b/helm/mcp-telemetry-bridge/values.yaml
new file mode 100644
index 0000000..97ad9cd
--- /dev/null
+++ b/helm/mcp-telemetry-bridge/values.yaml
@@ -0,0 +1,37 @@
+image:
+  repository: ghcr.io/example/telemcp-bridge
+  tag: "0.1.0"
+  pullPolicy: IfNotPresent
+
+replicaCount: 1
+
+service:
+  type: ClusterIP
+  port: 80
+
+env:
+  - name: PROM_URL
+    value: "http://monitoring-kube-prometheus-prometheus.monitoring:9090"
+  - name: LOKI_URL
+    value: "http://loki.logging:3100"
+  - name: K8S_API
+    value: "https://kubernetes.default.svc"
+
+resources:
+  requests:
+    cpu: 50m
+    memory: 128Mi
+  limits:
+    cpu: 500m
+    memory: 512Mi
+
+rbac:
+  create: true
+
+serviceAccount:
+  create: true
+  name: mcp-telemetry
+
+networkPolicy:
+  enabled: true
+  allowFromNamespaces: [] # add namespace selectors if needed
diff --git a/helm/values/kube-prometheus-stack.values.yaml b/helm/values/kube-prometheus-stack.values.yaml
new file mode 100644
index 0000000..3a62898
--- /dev/null
+++ b/helm/values/kube-prometheus-stack.values.yaml
@@ -0,0 +1,28 @@
+kube-state-metrics:
+  enabled: true
+
+prometheus:
+  prometheusSpec:
+    retention: 5d
+    scrapeInterval: 15s
+    enableAdminAPI: false
+    # Accept remote-write pushes on /api/v1/write (the OpenTelemetry Collector's
+    # prometheusremotewrite exporter in otel-collector.values.yaml targets it).
+    enableRemoteWriteReceiver: true
+    resources:
+      requests:
+        cpu: 100m
+        memory: 512Mi
+
+alertmanager:
+  alertmanagerSpec:
+    replicas: 1
+
+defaultRules:
+  create: true
+  rules:
+    kubeApiserverError: true
+    kubeNodeNotReady: true
+    kubePodCrashLooping: true
+    kubeJobFailed: true
+    etcdHighNumberOfFailedGRPCRequests: true
diff --git a/helm/values/loki.values.yaml b/helm/values/loki.values.yaml
new file mode 100644
index 0000000..1669db9
--- /dev/null
+++ b/helm/values/loki.values.yaml
@@ -0,0 +1,28 @@
+# Single-binary deployment to match the local filesystem store below.
+# NOTE(review): key names follow the grafana/loki chart's monolithic install
+# guide - confirm against the chart version actually deployed.
+deploymentMode: SingleBinary
+loki:
+  auth_enabled: false
+  commonConfig:
+    replication_factor: 1
+  storage:
+    type: filesystem
+  schemaConfig:
+    configs:
+      - from: "2024-01-01"
+        # tsdb is the index type Loki pairs with schema v13.
+        store: tsdb
+        object_store: filesystem
+        schema: v13
+        index:
+          prefix: index_
+          period: 24h
+singleBinary:
+  replicas: 1
+read:
+  replicas: 0
+write:
+  replicas: 0
+backend:
+  replicas: 0
diff --git a/helm/values/otel-collector.values.yaml b/helm/values/otel-collector.values.yaml
new file mode 100644
index 0000000..24027a0
--- /dev/null
+++ b/helm/values/otel-collector.values.yaml
@@ -0,0 +1,22 @@
+mode: deployment
+config:
+  receivers:
+    otlp:
+      protocols: { http: {}, grpc: {} }
+  processors:
+    batch: {}
+  exporters:
+    prometheusremotewrite:
+      endpoint: "http://monitoring-kube-prometheus-prometheus.monitoring:9090/api/v1/write"
+    loki:
+      endpoint: "http://loki.logging:3100/loki/api/v1/push"
+  service:
+    pipelines:
+      metrics:
+        receivers: [otlp]
+        processors: [batch]
+        exporters: [prometheusremotewrite]
+      logs:
+        receivers: [otlp]
+        processors: [batch]
+        exporters: [loki]
diff --git a/mcp-telemetry-bridge/Dockerfile b/mcp-telemetry-bridge/Dockerfile
new file mode 100644
index 0000000..43aeafb
--- /dev/null
+++ b/mcp-telemetry-bridge/Dockerfile
@@ -0,0 +1,12 @@
+FROM python:3.11-slim
+
+ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1
+
+WORKDIR /app
+COPY requirements.txt /app/
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY app /app/app
+
+EXPOSE 8000
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/mcp-telemetry-bridge/app/main.py b/mcp-telemetry-bridge/app/main.py
new file mode 100644
index 0000000..d1698ae
--- /dev/null
+++ b/mcp-telemetry-bridge/app/main.py
@@ -0,0 +1,177 @@
+import os, re, time
+from typing import Any, Dict, List, Optional
+from fastapi import FastAPI, Body
+import httpx
+
+PROM = os.getenv("PROM_URL", "http://monitoring-kube-prometheus-prometheus.monitoring:9090")
+LOKI = os.getenv("LOKI_URL", "http://loki.logging:3100")
+K8S = os.getenv("K8S_API", "https://kubernetes.default.svc")
+SERVICE_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+CA_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
+NAMESPACE_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
+
+# Namespace and object names from request payloads are spliced into API paths,
+# so only lowercase alphanumerics, '-' and '.' are accepted ('/' is rejected).
+_K8S_NAME_RE = re.compile(r"[a-z0-9]([-a-z0-9.]*[a-z0-9])?")
+
+def _bad_k8s_name(value: Any) -> bool:
+    """Return True when value is non-empty but unsafe to put in an API path."""
+    if not value:
+        return False
+    return not (isinstance(value, str) and _K8S_NAME_RE.fullmatch(value) is not None)
+
+def _sa_headers() -> Dict[str, str]:
+    """Build a Bearer Authorization header from the token file, or {} if it is missing or empty."""
+    token = ""
+    try:
+        with open(SERVICE_TOKEN_PATH, "r") as f:
+            token = f.read().strip()
+    except FileNotFoundError:
+        pass
+    return {"Authorization": f"Bearer {token}"} if token else {}
+
+def _ssl_params() -> Dict[str, Any]:
+    """Return httpx client kwargs: verify against CA_PATH when that file exists, else none."""
+    return {"verify": CA_PATH} if os.path.exists(CA_PATH) else {}
+
+app = FastAPI(title="MCP Telemetry Bridge", version="0.1.0")
+
+RESOURCES = [
+    {"uri":"res://dashboards/top-pods-by-cpu.promql","mimeType":"text/plain","content":
+     "topk(10, sum by (pod, namespace) (rate(container_cpu_usage_seconds_total{container!=\"\",image!=\"\"}[1m])))"},
+    {"uri":"res://dashboards/pod-restarts.promql","mimeType":"text/plain","content":
+     "sum by (pod, namespace) (increase(kube_pod_container_status_restarts_total[10m])) > 0"},
+    {"uri":"res://dashboards/warn-events.logql","mimeType":"text/plain","content":
+     "{app=\"kube-apiserver\"} |= \"Warning\""},
+]
+
+TOOLS = [
+    {"name":"promql.query","inputSchema":{"type":"object","properties":{"expr":{"type":"string"},"range":{"type":"string"}}}},
+    {"name":"loki.query","inputSchema":{"type":"object","properties":{"logql":{"type":"string"},"limit":{"type":"integer"},"since":{"type":"string"}}}},
+    {"name":"k8s.get","inputSchema":{"type":"object","properties":{"kind":{"type":"string"},"namespace":{"type":"string"},"name":{"type":"string"}}}},
+    {"name":"k8s.events","inputSchema":{"type":"object","properties":{"namespace":{"type":"string"},"since":{"type":"string"}}}},
+    {"name":"inventory.snapshot","inputSchema":{"type":"object","properties":{}}},
+]
+
+PROMPTS = [
+    {"name":"Triage-Now","description":"Summarize current alerts, top offenders and recent warnings."}
+]
+
+@app.get("/healthz")
+def healthz():
+    """Probe endpoint: a static ok status plus the current Unix time."""
+    return {"status":"ok","ts": int(time.time())}
+
+@app.get("/mcp/schema")
+def mcp_schema():
+    """Return the resource, tool and prompt catalogues defined above."""
+    return {"resources": RESOURCES, "tools": TOOLS, "prompts": PROMPTS}
+
+@app.get("/mcp/resource")
+def mcp_resource(uri: str):
+    """Return the resource whose uri matches, or an error payload when none does."""
+    for r in RESOURCES:
+        if r["uri"] == uri:
+            return {"uri": uri, "mimeType": r["mimeType"], "content": r["content"]}
+    return {"error": "not found", "uri": uri}
+
+@app.post("/tools/promql.query")
+async def promql_query(payload: Dict[str, Any] = Body(...)):
+    """Run an instant PromQL query; with `range`, wrap expr in sum_over_time over that window."""
+    expr = payload.get("expr")
+    rng = payload.get("range")
+    if not expr:
+        return {"error": "expr is required"}
+    if rng:
+        # A range applied to an arbitrary expression has to be a subquery,
+        # i.e. [<range>:<resolution>]; append ':' unless the caller gave one.
+        window = str(rng) if ":" in str(rng) else f"{rng}:"
+        params = {"query": f"sum_over_time(({expr})[{window}])"}
+    else:
+        params = {"query": expr}
+    async with httpx.AsyncClient() as c:
+        r = await c.get(f"{PROM}/api/v1/query", params=params, timeout=30.0)
+        return r.json()
+
+@app.post("/tools/loki.query")
+async def loki_query(payload: Dict[str, Any] = Body(...)):
+    """Run a LogQL query; retried as a range query when the instant endpoint answers 400."""
+    logql = payload.get("logql")
+    limit = payload.get("limit", 100)
+    since = payload.get("since")
+    params = {"query": logql, "limit": str(limit)}
+    async with httpx.AsyncClient() as c:
+        r = await c.get(f"{LOKI}/loki/api/v1/query", params=params, timeout=30.0)
+        if r.status_code == 400:
+            # NOTE(review): Loki documents a 400 for log (stream) queries on the
+            # instant endpoint; those belong on query_range, which takes `since`.
+            if since:
+                params["since"] = str(since)
+            r = await c.get(f"{LOKI}/loki/api/v1/query_range", params=params, timeout=30.0)
+        return r.json()
+
+@app.post("/tools/k8s.get")
+async def k8s_get(payload: Dict[str, Any] = Body(...)):
+    """GET one object (when `name` is set) or a list of the given kind from the Kubernetes API."""
+    kind = str(payload.get("kind") or "").lower()
+    ns = payload.get("namespace")
+    name = payload.get("name")
+    # Map a few common kinds
+    mapping = {
+        "pods": ("/api/v1", "pods"),
+        "pod": ("/api/v1", "pods"),
+        "namespaces": ("/api/v1", "namespaces"),
+        "nodes": ("/api/v1", "nodes"),
+        "services": ("/api/v1", "services"),
+        "endpoints": ("/api/v1", "endpoints"),
+        "events": ("/api/v1", "events"),
+        "deployments": ("/apis/apps/v1", "deployments"),
+        "daemonsets": ("/apis/apps/v1", "daemonsets"),
+        "statefulsets": ("/apis/apps/v1", "statefulsets"),
+        "replicasets": ("/apis/apps/v1", "replicasets"),
+    }
+    if kind not in mapping:
+        return {"error":"unsupported kind", "kind": kind}
+    if _bad_k8s_name(ns) or _bad_k8s_name(name):
+        return {"error": "invalid namespace or name"}
+    base, res = mapping[kind]
+    # Nodes and namespaces are cluster-scoped: there is no /namespaces/<ns>/ path for them.
+    if res in ("nodes", "namespaces"):
+        ns = None
+    url = f"{K8S}{base}"
+    if ns:
+        url += f"/namespaces/{ns}/{res}"
+    else:
+        url += f"/{res}"
+    if name:
+        url += f"/{name}"
+    async with httpx.AsyncClient(**_ssl_params(), headers=_sa_headers()) as c:
+        r = await c.get(url, timeout=30.0)
+        return r.json()
+
+@app.post("/tools/k8s.events")
+async def k8s_events(payload: Dict[str, Any] = Body(...)):
+    """List events for one namespace, or cluster-wide when `namespace` is absent."""
+    ns = payload.get("namespace")
+    if _bad_k8s_name(ns):
+        return {"error": "invalid namespace"}
+    url = f"{K8S}/api/v1"
+    if ns:
+        url += f"/namespaces/{ns}/events"
+    else:
+        url += "/events"
+    async with httpx.AsyncClient(**_ssl_params(), headers=_sa_headers()) as c:
+        r = await c.get(url, timeout=30.0)
+        return r.json()
+
+@app.post("/tools/inventory.snapshot")
+async def inventory_snapshot():
+    """Fetch cluster-wide lists of nodes, namespaces, deployments, daemonsets and statefulsets."""
+    # Minimal cluster inventory
+    async with httpx.AsyncClient(**_ssl_params(), headers=_sa_headers()) as c:
+        nodes = (await c.get(f"{K8S}/api/v1/nodes", timeout=30.0)).json()
+        ns = (await c.get(f"{K8S}/api/v1/namespaces", timeout=30.0)).json()
+        dpls = (await c.get(f"{K8S}/apis/apps/v1/deployments", timeout=30.0)).json()
+        ds = (await c.get(f"{K8S}/apis/apps/v1/daemonsets", timeout=30.0)).json()
+        sts = (await c.get(f"{K8S}/apis/apps/v1/statefulsets", timeout=30.0)).json()
+    return {"nodes": nodes, "namespaces": ns, "deployments": dpls, "daemonsets": ds, "statefulsets": sts}
diff --git a/mcp-telemetry-bridge/requirements.txt b/mcp-telemetry-bridge/requirements.txt
new file mode 100644
index 0000000..ad43e5b
--- /dev/null
+++ b/mcp-telemetry-bridge/requirements.txt
@@ -0,0 +1,3 @@
+fastapi==0.112.2
+uvicorn[standard]==0.30.6
+httpx==0.27.2