
Chaos Mesh 自带 Dashboard 功能有限:
我们的平台要解决这些问题。
1# api/topology.py
2from kubernetes import client
3
def get_service_topology(namespace: str) -> dict:
    """Build a simplified service-topology graph for one namespace.

    Every Deployment in ``namespace`` becomes a node. For every Service
    that has a selector, the first Deployment whose pod-template labels
    satisfy that selector receives an edge from the hard-coded
    ``"gateway"`` id. The edges are a placeholder; real call relations
    should come from tracing or service-mesh data.

    Args:
        namespace: Namespace whose Deployments and Services are listed.

    Returns:
        A dict of the form::

            {
                "nodes": [{"id": "order", "label": "order",
                           "kind": "Deployment",
                           "labels": {"app": "order"}}],
                "edges": [{"from": "gateway", "to": "order"}],
            }
    """
    apps_v1 = client.AppsV1Api()
    core_v1 = client.CoreV1Api()

    deployments = apps_v1.list_namespaced_deployment(namespace=namespace)
    services = core_v1.list_namespaced_service(namespace=namespace)

    nodes = []
    for dep in deployments.items:
        name = dep.metadata.name
        template_meta = dep.spec.template.metadata
        # The pod template may carry no metadata or no labels at all.
        labels = (template_meta.labels if template_meta else None) or {}
        nodes.append({
            "id": name,
            "label": name,
            "kind": "Deployment",
            # Kept on the node so the UI can pre-fill the chaos target selector.
            "labels": labels,
        })

    edges = []
    for svc in services.items:
        selector = svc.spec.selector
        if not selector:
            continue
        # A selector matches when every selector pair is present in the pod
        # labels; the pods may carry additional labels (e.g. "version").
        target = next(
            (
                node for node in nodes
                if all(node["labels"].get(k) == v for k, v in selector.items())
            ),
            None,
        )
        # Simplification: the gateway is assumed to call every service.
        # Skip the meaningless gateway -> gateway self-loop.
        if target is None or target["id"] == "gateway":
            continue
        edges.append({"from": "gateway", "to": target["id"]})

    return {"nodes": nodes, "edges": edges}
💡 进阶建议:在生产环境中,应对接服务网格(如 Istio)或 APM(如 SkyWalking)来获取真实调用链,而不是使用上面的简化假设。
安装依赖:
1npm install @vue-flow/core @vue-flow/controls @vue-flow/minimap
组件实现(ServiceTopology.vue):
<template>
  <!-- Service topology graph; clicking a node emits that node's labels. -->
  <VueFlow
    :nodes="nodes"
    :edges="edges"
    :default-edge-options="{ type: 'smoothstep' }"
    fit-view-on-init
    @node-click="onNodeClick"
  >
    <MiniMap />
    <Controls />
  </VueFlow>
</template>

<script setup lang="ts">
import { onMounted, ref } from 'vue'
import { VueFlow } from '@vue-flow/core'
import type { Edge, Node, NodeMouseEvent } from '@vue-flow/core'
// MiniMap and Controls ship as separate packages, not from '@vue-flow'.
import { MiniMap } from '@vue-flow/minimap'
import { Controls } from '@vue-flow/controls'
// NOTE(review): Vue Flow also needs its stylesheets (e.g.
// '@vue-flow/core/dist/style.css'); confirm they are imported globally.

// One node as returned by the topology API.
interface NodeData {
  id: string
  label: string
  kind: string
  labels: Record<string, string>
}

// One edge as returned by the topology API.
interface EdgeData {
  from: string
  to: string
}

const props = defineProps<{
  namespace: string
}>()

const emit = defineEmits<{
  (e: 'select', labels: Record<string, string>): void
}>()

// Bound to <VueFlow> through props; calling addNodes/addEdges as well would
// feed the same elements into the graph a second time.
const nodes = ref<Node[]>([])
const edges = ref<Edge[]>([])

// Load the topology from the API once the component is mounted.
onMounted(async () => {
  // NOTE(review): `api` is not declared in this snippet; it is assumed to be
  // an HTTP client provided elsewhere (global or auto-import) - confirm.
  const res = await api.get(`/topology/${props.namespace}`)

  nodes.value = res.data.nodes.map((n: NodeData): Node => ({
    id: n.id,
    label: n.label,
    // Custom fields must live under `data` so the click handler can read them.
    data: { kind: n.kind, labels: n.labels },
    // Random placement only; no layout algorithm is applied.
    position: { x: Math.random() * 400, y: Math.random() * 300 },
    selectable: true,
    draggable: false,
  }))

  // Vue Flow edges need a unique `id` plus `source`/`target`; the API sends
  // `from`/`to`. The index keeps ids unique if the API repeats an edge.
  edges.value = res.data.edges.map((e: EdgeData, i: number): Edge => ({
    id: `e${i}-${e.from}-${e.to}`,
    source: e.from,
    target: e.to,
  }))
})

// The node-click payload is `{ event, node }`, not the node itself.
function onNodeClick({ node }: NodeMouseEvent) {
  // Hand the node's labels to the parent (used to fill the chaos target).
  emit('select', node.data.labels)
}
</script>
✅ 效果:用户点击“order-service”节点后,该节点的标签 `{"app": "order"}` 会自动填充到实验表单。
1# models/experiment_run.py
class ExperimentRun(Base):
    """ORM model for one chaos-experiment submission and its approval state."""

    __tablename__ = "experiment_runs"

    id = Column(Integer, primary_key=True)
    name = Column(String, nullable=False)
    # Chaos custom resource plus the validation rules.
    config = Column(JSON, nullable=False)
    # Approval workflow state. The Enum is named because PostgreSQL requires a
    # name for the native type; a run always starts as "draft" and can never
    # be NULL, so it cannot sit outside the workflow.
    status = Column(
        Enum(
            "draft",
            "pending_approval",
            "approved",
            "running",
            "completed",
            "rejected",
            name="experiment_run_status",
        ),
        nullable=False,
        default="draft",
    )
    # Submitter (WeCom / DingTalk ID).
    created_by = Column(String, nullable=False)
    # Approver; empty until someone approves or rejects the run.
    approved_by = Column(String, nullable=True)
    # dev / staging / prod
    environment = Column(String, nullable=False)
    namespace = Column(String, nullable=False)
    # NOTE(review): datetime.utcnow is deprecated since Python 3.12 and yields
    # naive timestamps; consider a timezone-aware default.
    created_at = Column(DateTime, default=datetime.utcnow)

1# middleware/rbac.py
async def require_chaos_permission(
    request: Request,
    current_user: User = Depends(get_current_user)
):
    """Authorize the current user for the environment named in the request.

    The environment is read from the ``env`` path parameter, falling back to
    the ``env`` query parameter, and is compared case-insensitively.

    Rules:
        * ``prod``: only users whose roles include ``"sre"`` pass.
        * ``staging``: no role check is performed here.
        * ``dev``: unrestricted.

    Returns:
        The authenticated user, so routes can use this as a dependency.

    Raises:
        HTTPException: 400 when ``env`` is missing or not one of
            dev/staging/prod; 403 when a non-SRE user targets ``prod``.
    """
    raw_env = request.path_params.get("env") or request.query_params.get("env")
    # Normalize so "PROD" or " prod " cannot slip past the production check.
    env = raw_env.strip().lower() if isinstance(raw_env, str) else None

    # Fail closed: a missing or unknown environment must not fall through to
    # the unrestricted path.
    if env not in ("dev", "staging", "prod"):
        raise HTTPException(status_code=400, detail="未知或缺失的环境参数 env")

    if env == "prod" and "sre" not in (current_user.roles or ()):
        raise HTTPException(status_code=403, detail="生产环境仅 SRE 可操作")

    # staging and dev need no role check at this point.
    return current_user
<template>
  <!-- Approve / reject buttons, rendered only for SRE users while the run
       is waiting for approval. -->
  <div v-if="run.status === 'pending_approval' && isSRE">
    <el-button type="success" @click="approve">批准</el-button>
    <el-button type="danger" @click="reject">拒绝</el-button>
  </div>
</template>

<script setup>
import { computed } from 'vue'

// NOTE(review): `run`, `userStore` and `api` are not declared in this
// snippet; they are assumed to be provided by the surrounding component
// (props / store / HTTP client) - confirm.
const isSRE = computed(() => userStore.roles.includes('sre'))

async function approve() {
  await api.post(`/experiments/${run.id}/approve`)
  // TODO: refresh the run status.
}

// The template binds the reject button to `reject`, which was missing.
async function reject() {
  // NOTE(review): endpoint assumed to mirror `/approve` - confirm with the API.
  await api.post(`/experiments/${run.id}/reject`)
  // TODO: refresh the run status.
}
</script>
| 平台环境 | Kubernetes Namespace | Chaos Mesh 权限 |
|---|---|---|
| dev | chaos-dev | 开发可读写 |
| staging | chaos-staging | 开发提交,SRE 执行 |
| prod | chaos-prod | 仅 SRE 可操作 |
# rbac-prod.yaml
# ServiceAccount used by the platform backend. It lives in the namespace where
# the platform Pods run, because a Pod can only use a ServiceAccount from its
# own namespace and the RoleBinding subject below refers to that namespace.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: chaos-platform-prod
  namespace: platform

---
# Namespaced permissions inside chaos-prod: manage Chaos Mesh resources and
# read pods/services.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: chaos-prod
  name: chaos-executor
rules:
- apiGroups: ["chaos-mesh.org"]
  resources: ["*"]
  verbs: ["create", "get", "list", "delete"]
- apiGroups: [""]
  resources: ["pods", "services"]
  verbs: ["get", "list"]

---
# Grants the Role above to the platform ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: platform-to-chaos
  namespace: chaos-prod
subjects:
- kind: ServiceAccount
  name: chaos-platform-prod
  namespace: platform  # namespace where the platform Pods run
roleRef:
  kind: Role
  name: chaos-executor
  apiGroup: rbac.authorization.k8s.io
🔒 关键:平台后端 Pod 使用 `chaos-platform-prod` ServiceAccount 访问生产 Chaos Mesh,绝不使用 cluster-admin。
# Only allow ingress to chaos-daemon pods from pods inside the chaos-prod
# namespace.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: chaos-daemon-isolation
  namespace: chaos-prod
spec:
  # Applies to the chaos-daemon pods in this namespace.
  podSelector:
    matchLabels:
      app.kubernetes.io/component: chaos-daemon
  policyTypes:
  - Ingress
  ingress:
  - from:
    # Matches the namespace by its kubernetes.io/metadata.name label.
    - namespaceSelector:
        matchLabels:
          kubernetes.io/metadata.name: chaos-prod
禁止在 default / kube-system / monitoring 等核心命名空间执行实验
# Refuse to run chaos experiments in the listed core namespaces.
if namespace in ("default", "kube-system", "monitoring"):
    raise ValueError("禁止在核心命名空间执行混沌实验")
实验前强制预览 YAML
所有操作留痕
紧急熔断开关
Chaos Mesh 提供了强大的“武器”,但没有瞄准镜和扳机保险的枪很危险。 我们的平台做了三件事:
下一期预告:《从零落地 Chaos Mesh(四)—— 生产环境灰度实验与无人值守演练实战》