这里分类和汇总了欣宸的全部原创(含配套源码):https://github.com/zq2599/blog_demos
# Ansible inventory (INI format). The group below declares three host
# aliases; all of them point at the same address (192.168.50.134) and use the
# same SSH port, login user and password.
# NOTE(review): the SSH password is stored in plain text here - presumably
# acceptable only for a lab machine; consider Ansible Vault or key-based auth.
[prometheus-group]
# Alias presumably targeted by the Prometheus server playbook - TODO confirm.
prometheus ansible_host=192.168.50.134 ansible_port=22 ansible_user=prometheus ansible_password=888888
# Alias presumably targeted by the node_exporter playbook - TODO confirm.
working001 ansible_host=192.168.50.134 ansible_port=22 ansible_user=prometheus ansible_password=888888
# Alias targeted by the play that declares `hosts: alertmanager`.
alertmanager ansible_host=192.168.50.134 ansible_port=22 ansible_user=prometheus ansible_password=888888
# Shared variables for the Prometheus / node_exporter / Alertmanager playbooks.
# Every path below is derived from this root directory.
prometheus_user_home: "/home/prometheus"

# Prometheus server: install root, release download base URL, version, and
# the versioned directory built from those values.
prometheus_base_path: "{{ prometheus_user_home }}/prometheus"
prometheus_url: "https://github.com/prometheus/prometheus/releases/download"
prometheus_version: "2.37.7"
prometheus_deploy_path: "{{ prometheus_base_path }}/prometheus-{{ prometheus_version }}.linux-amd64"

# node_exporter: same layout as above.
node_exporter_base_path: "{{ prometheus_user_home }}/node_exporter"
node_exporter_url: "https://github.com/prometheus/node_exporter/releases/download"
node_exporter_version: "1.5.0"
node_exporter_deploy_path: "{{ node_exporter_base_path }}/node_exporter-{{ node_exporter_version }}.linux-amd64"

# Alertmanager: same layout as above.
alertmanager_base_path: "{{ prometheus_user_home }}/alertmanager"
alertmanager_url: "https://github.com/prometheus/alertmanager/releases/download"
alertmanager_version: "0.25.0"
alertmanager_deploy_path: "{{ alertmanager_base_path }}/alertmanager-{{ alertmanager_version }}.linux-amd64"
# Play: (re)install Alertmanager on the `alertmanager` host and run it as a
# systemd service. Any earlier installation is removed first, so each run
# starts from an empty install directory and a freshly written unit file.
- name: 部署alertmanager
  hosts: alertmanager
  gather_facts: true
  vars_files:
    # Expected to define the alertmanager_* variables used below.
    - vars.yml
  tasks:
    # Best effort: stopping fails when no alertmanager unit exists yet (first
    # install), so the error is deliberately ignored.
    - name: 停止残留的alertmanager
      ansible.builtin.systemd:
        name: alertmanager
        state: stopped
      become: true
      ignore_errors: true

    # Remove a unit file left behind by a previous run.
    - name: 清理可能的alertmanager service残留文件
      ansible.builtin.file:
        path: /etc/systemd/system/alertmanager.service
        state: absent
      become: true

    # Remove the whole install directory, including any earlier release.
    - name: 清理可能的alertmanager残留文件夹
      ansible.builtin.file:
        path: '{{ alertmanager_base_path }}'
        state: absent

    # Recreate the install directory that was just removed.
    - name: 新建部署文件夹
      ansible.builtin.file:
        path: '{{ alertmanager_base_path }}'
        state: directory
        mode: '0755'

    # With remote_src enabled and a URL as src, the archive is fetched by the
    # managed host itself and unpacked under alertmanager_base_path.
    - name: 下载并解压文件alertmanager-{{alertmanager_version}}.linux-amd64.tar.gz
      ansible.builtin.unarchive:
        src: '{{ alertmanager_url }}/v{{ alertmanager_version }}/alertmanager-{{ alertmanager_version }}.linux-amd64.tar.gz'
        dest: '{{ alertmanager_base_path }}'
        remote_src: true

    # Write the unit file with the copy module instead of `shell: tee <<EOF`:
    # the file content is the same, but the task is idempotent and reports
    # "changed" only when the rendered unit actually differs.
    - name: 生成systemd的service文件
      ansible.builtin.copy:
        dest: /etc/systemd/system/alertmanager.service
        owner: root
        group: root
        mode: '0644'
        content: |
          [Unit]
          Description=Alert manager Server
          Documentation=https://prometheus.io/docs/introduction/overview/
          After=network-online.target
          [Service]
          User=prometheus
          Restart=on-failure
          ExecStart={{ alertmanager_deploy_path }}/alertmanager --config.file={{ alertmanager_deploy_path }}/alertmanager.yml --storage.path={{ alertmanager_base_path }}/data
          [Install]
          WantedBy=multi-user.target
      become: true

    # Make systemd re-read unit files so it sees the one written above.
    - name: 刷新服务配置
      ansible.builtin.systemd:
        daemon_reload: true
      become: true

    # Start the service at boot; also make sure the unit is not masked.
    - name: 将alertmanager服务设置为自启动
      ansible.builtin.systemd:
        name: alertmanager
        enabled: true
        masked: false
      become: true

    - name: 启动alertmanager
      ansible.builtin.systemd:
        name: alertmanager
        state: started
      become: true
# Run the install_alertmanager.yml playbook.
# NOTE(review): no -i option is passed, so the inventory is presumably
# configured elsewhere (e.g. ansible.cfg) - confirm.
ansible-playbook install_alertmanager.yml
# Alertmanager configuration: one top-level route that sends every alert to a
# single webhook receiver, plus one inhibition rule.
global:
  # Default time after which an alert that has not been updated is declared
  # resolved. It applies only to alerts that do not carry their own end time.
  resolve_timeout: 10m

route:
  # Alerts with the same value for these labels are batched into a single
  # notification; here, alerts sharing the same alertname.
  group_by:
    - alertname
  # How long to wait before sending the first notification for a new group,
  # so that alerts arriving within this window are sent together.
  group_wait: 30s
  # After a group has been notified, how often the group is evaluated again
  # to decide whether another notification is needed.
  group_interval: 1m
  # Minimum time before a notification that was already sent is repeated.
  # The group is only re-evaluated every group_interval (1m), so the repeat
  # goes out on the first evaluation after 5m have elapsed.
  repeat_interval: 5m
  # Receiver used for notifications. For simplicity only this top-level route
  # is configured; there are no child routes matching on alert labels.
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    webhook_configs:
      # Endpoint that Alertmanager sends its HTTP POST requests to.
      - url: 'http://192.168.50.134:8888/webhook'

# Inhibition rules; more than one may be listed.
inhibit_rules:
  # While a critical alert is firing, warning alerts are not notified,
  # provided both alerts carry the same value for the `node` label, i.e. they
  # come from the same source.
  # source_matchers/target_matchers replace source_match/target_match, which
  # are deprecated since Alertmanager 0.22.
  # NOTE(review): alerts that both lack a `node` label also count as equal,
  # so confirm the alert rules really set this label.
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    equal:
      - node
May 13 10:04:40 deskmini alertmanager[767]: ts=2023-05-13T02:04:40.869Z caller=notify.go:732 level=warn component=dispatcher receiver=web.hook integration=webhook[0] msg="Notify attempt failed, will retry later" attempts=1 err="Post \"http://192.168.50.134:8888/webhook\": dial tcp http://192.168.50.134:8888/webhook: connect: connection refused"
May 13 10:09:40 deskmini alertmanager[767]: ts=2023-05-13T02:09:40.869Z caller=dispatch.go:352 level=error component=dispatcher msg="Notify for alerts failed" num_alerts=1 err="web.hook/webhook[0]: notify retry canceled after 16 attempts: Post \"http://192.168.50.134:8888/webhook\": dial tcp http://192.168.50.134:8888/webhook: connect: connection refused"
May 13 10:09:40 deskmini alertmanager[767]: ts=2023-05-13T02:09:40.869Z caller=notify.go:732 level=warn component=dispatcher receiver=web.hook integration=webhook[0] msg="Notify attempt failed, will retry later" attempts=1 err="Post \"http://192.168.50.134:8888/webhook\": dial tcp http://192.168.50.134:8888/webhook: connect: connection refused"