Bootstrap initial repo state

This commit is contained in:
2025-09-07 23:39:44 +02:00
parent 0ed92847f8
commit ed68e97829
20 changed files with 437 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
# Single-host inventory: the "all" group holds only localhost, which is
# reached through Ansible's local connection plugin.
[all]
localhost ansible_connection=local

8
ansible/playbook.yml Normal file
View File

@@ -0,0 +1,8 @@
---
# Applies the k8s_host and telemetry_stack roles, with privilege escalation,
# to every host in the inventory.
- name: TeleMcp bootstrap
  hosts: all
  # Canonical boolean; `yes` is only a boolean under YAML 1.1 rules.
  become: true
  vars:
    # Directory on the target host that receives the Helm values files.
    values_dir: /opt/telemcp-values
  roles:
    - k8s_host
    - telemetry_stack

View File

@@ -0,0 +1,19 @@
---
# Host preparation: base packages, the Helm CLI and root's kube config dir.
- name: Ensure base packages
  apt:
    name:
      - curl
      - jq
      - gnupg
      - ca-certificates
    state: present
    update_cache: true

- name: Install Helm if missing
  # errexit + pipefail make a failed download fail the task; without them a
  # failing curl pipes an empty script into bash and the task exits 0.
  # The marker line lets Ansible report "changed" only when the installer
  # actually ran, instead of on every play.
  shell: |
    set -euo pipefail
    if ! command -v helm >/dev/null 2>&1; then
      curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
      echo "helm-installed"
    fi
  args:
    executable: /bin/bash
  register: helm_install
  changed_when: "'helm-installed' in helm_install.stdout"

- name: Ensure kube config dir
  file:
    path: /root/.kube
    state: directory
    # Quoted so the mode is not parsed as an octal integer.
    mode: "0700"

View File

@@ -0,0 +1,4 @@
---
# Namespaces passed to `helm upgrade --install -n ...` by the deploy tasks.
telemcp_namespace_monitoring: "monitoring"
telemcp_namespace_logging: "logging"
telemcp_namespace_mcp: "mcp"

# Directory on the target host that the Helm values files are copied into.
values_dir: "/opt/telemcp-values"

View File

@@ -0,0 +1,50 @@
---
# Deploys the telemetry charts with Helm: registers the chart repositories,
# copies the values files to the host, then installs/upgrades each release.
- name: Create values directory
  file:
    path: "{{ values_dir }}"
    state: directory
    mode: "0755"

- name: Install helm repos
  # errexit is required here: without it only the exit status of the final
  # `helm repo update` reaches Ansible and a failed `helm repo add` is missed.
  shell: |
    set -euo pipefail
    helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
    helm repo add grafana https://grafana.github.io/helm-charts
    helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts
    helm repo update
  args:
    executable: /bin/bash

- name: Copy values files
  copy:
    src: "{{ item.src }}"
    dest: "{{ values_dir }}/{{ item.dest }}"
    mode: "0644"
  loop:
    - src: ../../../helm/values/kube-prometheus-stack.values.yaml
      dest: kube-prometheus-stack.values.yaml
    - src: ../../../helm/values/loki.values.yaml
      dest: loki.values.yaml
    - src: ../../../helm/values/otel-collector.values.yaml
      dest: otel-collector.values.yaml
    - src: ../../../helm/mcp-telemetry-bridge/values.yaml
      dest: mcp-telemetry-bridge.values.yaml

- name: Deploy kube-prometheus-stack
  shell: >-
    helm upgrade --install monitoring prometheus-community/kube-prometheus-stack
    -n {{ telemcp_namespace_monitoring }} --create-namespace
    -f {{ values_dir }}/kube-prometheus-stack.values.yaml
  args:
    executable: /bin/bash

- name: Deploy Loki
  shell: >-
    helm upgrade --install loki grafana/loki
    -n {{ telemcp_namespace_logging }} --create-namespace
    -f {{ values_dir }}/loki.values.yaml
  args:
    executable: /bin/bash

- name: Deploy Promtail
  shell: >-
    helm upgrade --install promtail grafana/promtail
    -n {{ telemcp_namespace_logging }} --create-namespace
  args:
    executable: /bin/bash

- name: (Optional) Deploy OpenTelemetry Collector
  # Runs by default, as before; set `telemcp_deploy_otel: false` to skip it.
  # The namespace falls back to the previously hard-coded "observability".
  shell: >-
    helm upgrade --install otel-collector open-telemetry/opentelemetry-collector
    -n {{ telemcp_namespace_otel | default('observability') }} --create-namespace
    -f {{ values_dir }}/otel-collector.values.yaml
  args:
    executable: /bin/bash
  when: telemcp_deploy_otel | default(true) | bool

- name: Deploy MCP Telemetry Bridge
  # The chart path is resolved relative to the playbook directory, so the
  # chart must exist at that path on the host where helm runs.
  shell: >-
    helm upgrade --install mcp-telemetry
    {{ playbook_dir }}/../helm/mcp-telemetry-bridge
    -n {{ telemcp_namespace_mcp }} --create-namespace
    -f {{ values_dir }}/mcp-telemetry-bridge.values.yaml
  args:
    executable: /bin/bash

BIN
assets/TeleMcpLogo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.8 MiB

View File

@@ -0,0 +1 @@
You can deploy charts directly with Ansible or manage them with Helmfile/Argo CD later.

View File

@@ -0,0 +1,6 @@
# Helm chart metadata for the MCP Telemetry Bridge.
apiVersion: v2
type: application
name: mcp-telemetry-bridge
description: MCP Telemetry Bridge for TeleMcp
# Version of the chart itself.
version: "0.1.0"
# Version of the application packaged by the chart.
appVersion: "0.1.0"

View File

@@ -0,0 +1,35 @@
{{- /*
Deployment for the MCP Telemetry Bridge.
Replica count, image, environment variables and resources come from values;
the container listens on port 8000 and is probed on /healthz.
*/ -}}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mcp-telemetry-bridge
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      app: mcp-telemetry-bridge
  template:
    metadata:
      labels:
        app: mcp-telemetry-bridge
    spec:
      serviceAccountName: {{ .Values.serviceAccount.name }}
      containers:
        - name: bridge
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          ports:
            - name: http
              containerPort: 8000
          {{- /* Omit `env:` entirely when no variables are configured. */}}
          {{- with .Values.env }}
          env:
            {{- range . }}
            - name: {{ .name }}
              {{- /* `quote` escapes embedded quotes and stringifies numbers/bools. */}}
              value: {{ .value | quote }}
            {{- end }}
          {{- end }}
          readinessProbe:
            httpGet:
              path: /healthz
              port: http
            initialDelaySeconds: 3
            periodSeconds: 5
          livenessProbe:
            httpGet:
              path: /healthz
              port: http
            initialDelaySeconds: 10
            periodSeconds: 10
          {{- /* nindent keeps the block aligned wherever the action is placed. */}}
          {{- with .Values.resources }}
          resources:
            {{- toYaml . | nindent 12 }}
          {{- end }}

View File

@@ -0,0 +1,21 @@
{{- if .Values.networkPolicy.enabled }}
# Restricts traffic for the bridge pods (label app=mcp-telemetry-bridge).
# Ingress: pods in the release namespace, plus any namespaces selected by
#          .Values.networkPolicy.allowFromNamespaces (a list of label
#          selectors, e.g. `- matchLabels: {team: sre}`).
# Egress:  Prometheus (9090) and Loki (3100) in any namespace, DNS, and the
#          Kubernetes API.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: mcp-telemetry-bridge-deny-all
spec:
  podSelector:
    matchLabels:
      app: mcp-telemetry-bridge
  policyTypes:
    - Ingress
    - Egress
  ingress:
    # An empty rule (`- {}`) would admit every source; restrict to the
    # same namespace and the explicitly allowed namespaces instead.
    - from:
        - podSelector: {}
        {{- range .Values.networkPolicy.allowFromNamespaces }}
        - namespaceSelector:
            {{- toYaml . | nindent 12 }}
        {{- end }}
  egress:
    - to:
        - namespaceSelector: {}
      ports:
        - protocol: TCP
          port: 9090  # Prometheus
        - protocol: TCP
          port: 3100  # Loki
    # DNS: the bridge addresses Prometheus, Loki and the API server by
    # service name, so name resolution must be allowed.
    - ports:
        - protocol: UDP
          port: 53
        - protocol: TCP
          port: 53
    # Kubernetes API server. 443 is the service port; 6443 is a common
    # backend port seen after service translation (adjust to your cluster).
    - ports:
        - protocol: TCP
          port: 443
        - protocol: TCP
          port: 6443
{{- end }}

View File

@@ -0,0 +1,26 @@
{{- if .Values.rbac.create }}
# Read-only (get/list/watch) cluster-wide access for the bridge's
# ServiceAccount, bound through a ClusterRoleBinding.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: mcp-telemetry-readonly
rules:
  # Core API group.
  - apiGroups:
      - ""
    resources:
      - pods
      - pods/log
      - nodes
      - events
      - namespaces
      - services
      - endpoints
    verbs:
      - get
      - list
      - watch
  # Workload controllers.
  - apiGroups:
      - apps
    resources:
      - deployments
      - daemonsets
      - statefulsets
      - replicasets
    verbs:
      - get
      - list
      - watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: mcp-telemetry-readonly-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: mcp-telemetry-readonly
subjects:
  - kind: ServiceAccount
    name: {{ .Values.serviceAccount.name }}
    namespace: {{ .Release.Namespace }}
{{- end }}

View File

@@ -0,0 +1,11 @@
{{- /*
Service in front of the bridge pods: exposes .Values.service.port and
forwards to container port 8000.
*/ -}}
apiVersion: v1
kind: Service
metadata:
  name: mcp-telemetry-bridge
spec:
  type: {{ .Values.service.type }}
  selector:
    app: mcp-telemetry-bridge
  ports:
    - name: http
      port: {{ .Values.service.port }}
      targetPort: 8000

View File

@@ -0,0 +1,6 @@
{{- /* Rendered only when .Values.serviceAccount.create is truthy. */ -}}
{{- $sa := .Values.serviceAccount -}}
{{- if $sa.create }}
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ $sa.name }}
{{- end }}

View File

@@ -0,0 +1,37 @@
# Default values for the mcp-telemetry-bridge chart.

# Container image reference and pull policy.
image:
  repository: 'ghcr.io/example/telemcp-bridge'
  tag: '0.1.0'
  pullPolicy: IfNotPresent

replicaCount: 1

# Service type and the port it exposes.
service:
  type: ClusterIP
  port: 80

# Environment variables injected into the bridge container.
env:
  - name: PROM_URL
    value: 'http://monitoring-kube-prometheus-prometheus.monitoring:9090'
  - name: LOKI_URL
    value: 'http://loki.logging:3100'
  - name: K8S_API
    value: 'https://kubernetes.default.svc'

# Container resource requests and limits.
resources:
  requests:
    cpu: 50m
    memory: 128Mi
  limits:
    cpu: 500m
    memory: 512Mi

# Toggles rendering of the ClusterRole and ClusterRoleBinding.
rbac:
  create: true

# ServiceAccount used by the pod; `create` toggles rendering of the object.
serviceAccount:
  create: true
  name: mcp-telemetry

# Toggles rendering of the NetworkPolicy.
networkPolicy:
  enabled: true
  allowFromNamespaces: []  # add namespace selectors if needed

View File

@@ -0,0 +1,25 @@
# Values for the prometheus-community/kube-prometheus-stack chart.

# NOTE(review): upstream appears to toggle the bundled exporter with
# `kubeStateMetrics.enabled`; `kube-state-metrics` is the sub-chart's own
# values key, so this flag may be a no-op. Confirm against the chart.
kube-state-metrics:
  enabled: true

prometheus:
  prometheusSpec:
    retention: 5d
    scrapeInterval: 15s
    enableAdminAPI: false
    # The OpenTelemetry Collector values in this repository push metrics to
    # this Prometheus at /api/v1/write; Prometheus only accepts those pushes
    # when its remote-write receiver is enabled.
    enableRemoteWriteReceiver: true
    resources:
      requests:
        cpu: 100m
        memory: 512Mi

alertmanager:
  alertmanagerSpec:
    replicas: 1

defaultRules:
  create: true
  # NOTE(review): these look like individual alert names, whereas the chart's
  # `defaultRules.rules` is keyed by rule group (e.g. `etcd`,
  # `kubernetesApps`). Verify that they have any effect.
  rules:
    kubeApiserverError: true
    kubeNodeNotReady: true
    kubePodCrashLooping: true
    kubeJobFailed: true
    etcdHighNumberOfFailedGRPCRequests: true

View File

@@ -0,0 +1,15 @@
# Values for the grafana/loki chart: one monolithic Loki instance that keeps
# chunks and index on the local filesystem.

# Filesystem storage cannot be shared between the read/write/backend targets
# of the scalable modes, so run the single-binary target only.
deploymentMode: SingleBinary
singleBinary:
  replicas: 1
read:
  replicas: 0
write:
  replicas: 0
backend:
  replicas: 0

loki:
  auth_enabled: false
  commonConfig:
    replication_factor: 1
  storage:
    type: filesystem
  schemaConfig:
    configs:
      - from: "2024-01-01"
        # Schema v13 is meant to be paired with the TSDB index; the previous
        # `boltdb-shipper` store is rejected by Loki 3.x config validation
        # unless structured metadata is disabled.
        store: tsdb
        object_store: filesystem
        schema: v13
        index:
          prefix: index_
          period: 24h

View File

@@ -0,0 +1,22 @@
# Values for the open-telemetry/opentelemetry-collector chart: receive OTLP,
# batch, then forward metrics to Prometheus and logs to Loki.
mode: deployment

# The chart requires an explicit image repository. The contrib distribution
# is used because `prometheusremotewrite` and `loki` are contrib exporters.
image:
  repository: otel/opentelemetry-collector-contrib

config:
  receivers:
    otlp:
      protocols:
        http: {}
        grpc: {}
  processors:
    batch: {}
  exporters:
    # Needs the Prometheus remote-write receiver enabled on the target.
    prometheusremotewrite:
      endpoint: "http://monitoring-kube-prometheus-prometheus.monitoring:9090/api/v1/write"
    # NOTE(review): recent contrib releases deprecate the `loki` exporter in
    # favour of OTLP/HTTP to Loki's native endpoint; confirm that the image
    # version in use still ships it.
    loki:
      endpoint: "http://loki.logging:3100/loki/api/v1/push"
  service:
    pipelines:
      metrics:
        receivers: [otlp]
        processors: [batch]
        exporters: [prometheusremotewrite]
      logs:
        receivers: [otlp]
        processors: [batch]
        exporters: [loki]

View File

@@ -0,0 +1,12 @@
# Image for the MCP Telemetry Bridge: installs the pinned requirements and
# serves app.main:app with uvicorn on port 8000.
FROM python:3.11-slim

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

WORKDIR /app

# Install dependencies before copying the source so this layer stays cached
# when only application code changes.
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

COPY app /app/app

# Run as an unprivileged user instead of root; port 8000 is unprivileged, so
# no extra capabilities are needed. A numeric UID lets Kubernetes verify
# `runAsNonRoot`.
RUN useradd --uid 10001 --no-create-home --shell /usr/sbin/nologin bridge
USER 10001

EXPOSE 8000
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -0,0 +1,134 @@
import os, time
from typing import Any, Dict, List, Optional
from fastapi import FastAPI, Body
import httpx
# Backend base URLs; each one can be overridden through the environment.
PROM = os.environ.get(
    "PROM_URL", "http://monitoring-kube-prometheus-prometheus.monitoring:9090"
)
LOKI = os.environ.get("LOKI_URL", "http://loki.logging:3100")
K8S = os.environ.get("K8S_API", "https://kubernetes.default.svc")

# Service-account files consulted when talking to the Kubernetes API.
SERVICE_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
CA_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
NAMESPACE_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
def _sa_headers() -> Dict[str, str]:
    """Build the Authorization header from the service-account token file.

    Returns:
        ``{"Authorization": "Bearer <token>"}`` when the token file exists
        and is non-empty after stripping whitespace, otherwise ``{}``.
    """
    try:
        with open(SERVICE_TOKEN_PATH, "r") as token_file:
            bearer = token_file.read().strip()
    except FileNotFoundError:
        # No token file available: send no credentials.
        return {}
    if not bearer:
        return {}
    return {"Authorization": f"Bearer {bearer}"}
def _ssl_params() -> Dict[str, Any]:
    """Return httpx client kwargs for TLS verification.

    Returns:
        ``{"verify": CA_PATH}`` when the CA bundle file exists, otherwise an
        empty dict (the client then uses its default verification settings).
    """
    if os.path.exists(CA_PATH):
        return {"verify": CA_PATH}
    return {}
# ASGI application object; the route decorators in this module register on it.
app = FastAPI(
    title="MCP Telemetry Bridge",
    version="0.1.0",
)
RESOURCES = [
{"uri":"res://dashboards/top-pods-by-cpu.promql","mimeType":"text/plain","content":
"topk(10, sum by (pod, namespace) (rate(container_cpu_usage_seconds_total{container!=\"\",image!=\"\"}[1m])))"},
{"uri":"res://dashboards/pod-restarts.promql","mimeType":"text/plain","content":
"sum by (pod, namespace) (increase(kube_pod_container_status_restarts_total[10m])) > 0"},
{"uri":"res://dashboards/warn-events.logql","mimeType":"text/plain","content":
"{app=\"kube-apiserver\"} |= \"Warning\""},
]
TOOLS = [
{"name":"promql.query","inputSchema":{"type":"object","properties":{"expr":{"type":"string"},"range":{"type":"string"}}}},
{"name":"loki.query","inputSchema":{"type":"object","properties":{"logql":{"type":"string"},"limit":{"type":"integer"},"since":{"type":"string"}}}},
{"name":"k8s.get","inputSchema":{"type":"object","properties":{"kind":{"type":"string"},"namespace":{"type":"string"},"name":{"type":"string"}}}},
{"name":"k8s.events","inputSchema":{"type":"object","properties":{"namespace":{"type":"string"},"since":{"type":"string"}}}},
{"name":"inventory.snapshot","inputSchema":{"type":"object","properties":{}}},
]
PROMPTS = [
{"name":"Triage-Now","description":"Summarize current alerts, top offenders and recent warnings."}
]
@app.get("/healthz")
def healthz():
    """Health endpoint: reports ``"ok"`` plus the current Unix time in seconds."""
    now = int(time.time())
    return {"status": "ok", "ts": now}
@app.get("/mcp/schema")
def mcp_schema():
    """Return the module's resource, tool and prompt descriptors in one document."""
    schema = {
        "resources": RESOURCES,
        "tools": TOOLS,
        "prompts": PROMPTS,
    }
    return schema
@app.get("/mcp/resource")
def mcp_resource(uri: str):
    """Look up one entry of RESOURCES by its exact ``uri``.

    Returns:
        The entry's uri, mimeType and content, or an ``{"error": "not found"}``
        document echoing the requested uri when nothing matches.
    """
    match = next((entry for entry in RESOURCES if entry["uri"] == uri), None)
    if match is None:
        return {"error": "not found", "uri": uri}
    return {"uri": uri, "mimeType": match["mimeType"], "content": match["content"]}
def _promql_params(expr: str, rng: Optional[str]) -> Dict[str, str]:
    """Build the query parameters for Prometheus' instant-query endpoint.

    Without a range the expression is sent unchanged. With a range it is
    wrapped in ``sum_over_time`` over a subquery.
    """
    if not rng:
        return {"query": expr}
    # A range may only follow a bare selector; for an arbitrary parenthesised
    # expression PromQL requires subquery syntax "[<range>:<resolution>]".
    # Append the ":" (default resolution) unless the caller supplied one.
    window = rng if ":" in rng else f"{rng}:"
    return {"query": f"sum_over_time(({expr})[{window}])"}


@app.post("/tools/promql.query")
async def promql_query(payload: Dict[str, Any] = Body(...)):
    """Run a PromQL instant query against the configured Prometheus.

    Payload keys:
        expr: PromQL expression (required).
        range: optional duration such as ``"5m"`` (or ``"5m:30s"``); when set
            the expression is summed over that window.

    Returns:
        Prometheus' JSON response, or ``{"error": "missing expr"}`` when no
        expression was supplied.
    """
    expr = payload.get("expr")
    if not expr:
        # Fail fast instead of forwarding an empty query to Prometheus.
        return {"error": "missing expr"}
    params = _promql_params(str(expr), payload.get("range"))
    async with httpx.AsyncClient() as c:
        r = await c.get(f"{PROM}/api/v1/query", params=params, timeout=30.0)
        return r.json()
@app.post("/tools/loki.query")
async def loki_query(payload: Dict[str, Any] = Body(...)):
    """Run a LogQL query against the configured Loki.

    Payload keys:
        logql: LogQL query (required).
        limit: maximum number of entries to return (default 100).
        since: optional look-back duration such as ``"30m"``; when omitted,
            Loki applies its own default window.

    Returns:
        Loki's JSON response, or ``{"error": "missing logql"}`` when no query
        was supplied.
    """
    logql = payload.get("logql")
    if not logql:
        return {"error": "missing logql"}
    limit = payload.get("limit", 100)
    params = {"query": logql, "limit": str(limit)}
    since = payload.get("since")
    if since:
        # Advertised in the tool schema; previously accepted but ignored.
        params["since"] = str(since)
    async with httpx.AsyncClient() as c:
        # Loki's instant-query endpoint rejects log (stream-selector) queries,
        # so use the range endpoint, which handles log and metric queries.
        r = await c.get(f"{LOKI}/loki/api/v1/query_range", params=params, timeout=30.0)
        return r.json()
@app.post("/tools/k8s.get")
async def k8s_get(payload: Dict[str, Any] = Body(...)):
    """Fetch or list Kubernetes objects of a supported kind.

    Payload keys:
        kind: one of the keys of ``mapping`` below (case-insensitive).
        namespace: optional namespace; ignored for cluster-scoped kinds.
        name: optional object name; namespaced kinds also need ``namespace``.

    Returns:
        The API server's JSON response, or an ``{"error": ...}`` document for
        an unsupported kind or a named lookup without a namespace.
    """
    from urllib.parse import quote

    # `or ""` also covers an explicit null, which would otherwise crash .lower().
    kind = str(payload.get("kind") or "").lower()
    ns = payload.get("namespace")
    name = payload.get("name")
    # Map a few common kinds
    mapping = {
        "pods": ("/api/v1", "pods"),
        "pod": ("/api/v1", "pods"),
        "namespaces": ("/api/v1", "namespaces"),
        "nodes": ("/api/v1", "nodes"),
        "services": ("/api/v1", "services"),
        "events": ("/api/v1", "events"),
        "deployments": ("/apis/apps/v1", "deployments"),
        "daemonsets": ("/apis/apps/v1", "daemonsets"),
        "statefulsets": ("/apis/apps/v1", "statefulsets"),
        "replicasets": ("/apis/apps/v1", "replicasets"),
    }
    # These resources have no /namespaces/<ns>/ URL form.
    cluster_scoped = {"namespaces", "nodes"}
    if kind not in mapping:
        return {"error":"unsupported kind", "kind": kind}
    base, res = mapping[kind]
    namespaced = res not in cluster_scoped
    if name and namespaced and not ns:
        # /<res>/<name> without a namespace is not a valid path for these kinds.
        return {"error": "namespace required when name is given", "kind": kind}
    url = f"{K8S}{base}"
    # Percent-encode caller-supplied segments so "/", "?" or "#" cannot alter
    # the request path.
    if ns and namespaced:
        url += f"/namespaces/{quote(str(ns), safe='')}/{res}"
    else:
        url += f"/{res}"
    if name:
        url += f"/{quote(str(name), safe='')}"
    async with httpx.AsyncClient(**_ssl_params(), headers=_sa_headers()) as c:
        r = await c.get(url, timeout=30.0)
        return r.json()
@app.post("/tools/k8s.events")
async def k8s_events(payload: Dict[str, Any] = Body(...)):
    """List Kubernetes events, cluster-wide or for a single namespace.

    Payload keys:
        namespace: optional namespace to restrict the listing to.

    Returns:
        The API server's JSON response.

    NOTE(review): the tool schema also advertises ``since``, but this handler
    does not apply it.
    """
    from urllib.parse import quote

    ns = payload.get("namespace")
    url = f"{K8S}/api/v1"
    if ns:
        # Percent-encode the caller-supplied segment so "/", "?" or "#"
        # cannot alter the request path.
        url += f"/namespaces/{quote(str(ns), safe='')}/events"
    else:
        url += "/events"
    async with httpx.AsyncClient(**_ssl_params(), headers=_sa_headers()) as c:
        r = await c.get(url, timeout=30.0)
        return r.json()
@app.post("/tools/inventory.snapshot")
async def inventory_snapshot():
    """Collect a minimal cluster inventory.

    Issues one list request per resource type, in order, over a single
    client, and returns the raw JSON of each under its own key.
    """
    endpoints = (
        ("nodes", "/api/v1/nodes"),
        ("namespaces", "/api/v1/namespaces"),
        ("deployments", "/apis/apps/v1/deployments"),
        ("daemonsets", "/apis/apps/v1/daemonsets"),
        ("statefulsets", "/apis/apps/v1/statefulsets"),
    )
    snapshot = {}
    async with httpx.AsyncClient(**_ssl_params(), headers=_sa_headers()) as client:
        for key, path in endpoints:
            response = await client.get(f"{K8S}{path}", timeout=30.0)
            snapshot[key] = response.json()
    return snapshot

View File

@@ -0,0 +1,3 @@
# Runtime dependencies, pinned to exact versions.
fastapi==0.112.2
uvicorn[standard]==0.30.6
httpx==0.27.2