赞
踩
部署alertmanager
从 Prometheus 官网下载二进制文件:https://prometheus.io/download/
下载并安装 alertmanager
- wget https://github.com/prometheus/alertmanager/releases/download/v0.26.0/alertmanager-0.26.0.linux-amd64.tar.gz
-
- tar -xzvf alertmanager-0.26.0.linux-amd64.tar.gz -C /usr/local
更改文件名称,并设置systemctl管理alertmanager
- cd /usr/local/ && mv alertmanager-0.26.0.linux-amd64 alertmanager
-
- vim /usr/lib/systemd/system/alertmanager.service
- [Unit]
- Description=Alertmanager
- Documentation=https://github.com/prometheus/alertmanager/releases/
- After=network.target
-
- [Service]
- # alertmanager工作目录(systemd 不支持行尾注释,注释必须单独成行)
- WorkingDirectory=/usr/local/alertmanager/
- # alertmanager启动二进制文件
- ExecStart=/usr/local/alertmanager/alertmanager
- ExecReload=/bin/kill -HUP $MAINPID
- ExecStop=/bin/kill -KILL $MAINPID
- Type=simple
- KillMode=control-group
- # 智能重启:进程异常退出时自动重启
- Restart=on-failure
- RestartSec=15s
-
- [Install]
- WantedBy=multi-user.target

# 修改alertmanager配置文件
- vim /usr/local/alertmanager/alertmanager.yml
-
- global:
- resolve_timeout: 5m
- smtp_smarthost: 'smtp.163.com:25'
- smtp_from: 'xxxx' # 发送告警的邮箱
- smtp_auth_username: 'xxxx' #发送告警的邮箱
- smtp_auth_password: 'xxxx' #邮箱授权密码
- smtp_require_tls: false
-
- templates: #添加模板
- - '/usr/local/alertmanager/template/email.tmpl' #指定路径
-
- route:
- receiver: mail1
- group_by: ['alertname']
- group_wait: 1m # 分组等待的时间
- group_interval: 2m # 上下两组发送告警的间隔时间
- repeat_interval: 1h # 重复发送告警时间
- routes:
- - receiver: mail2 # 接收者
- match_re: # 条件匹配 与prometheus rules中设置的标签匹配
- db: sql
- repeat_interval: 1h # 分条件匹配重复发送告警时间
- receivers:
- - name: mail1
- email_configs:
- - send_resolved: true
- to: xxxx
- - name: "mail2"
- email_configs:
- - send_resolved: true
- to: xxxx
- inhibit_rules: # 抑制规则
- - source_match: # 如果告警信息中包含Disaster 就取消发送 warning信息
- severity: Disaster
- target_match:
- severity: warning
- equal:
- - instance # equal 中应填写标签名:仅当两条告警的 instance 标签相同时才抑制

# 报警模板
- cat /usr/local/alertmanager/template/email.tmpl
- {{ define "wechat.default.message" }}
- {{- if gt (len .Alerts.Firing) 0 -}}
- {{- range $index, $alert := .Alerts -}}
- {{- if eq $index 0 }}
- =========xxx环境监控报警 =========
- 告警状态:{{ .Status }}
- 告警级别:{{ .Labels.severity }}
- 告警类型:{{ $alert.Labels.alertname }}
- 故障主机: {{ $alert.Labels.instance }} {{ $alert.Labels.pod }}
- 告警主题: {{ $alert.Annotations.summary }}
- 告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
- 触发阀值:{{ .Annotations.value }}
- 故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
- ========= = end = =========
- {{- end }}
- {{- end }}
- {{- end }}
- {{- if gt (len .Alerts.Resolved) 0 -}}
- {{- range $index, $alert := .Alerts -}}
- {{- if eq $index 0 }}
- =========xxx环境异常恢复 =========
- 告警类型:{{ .Labels.alertname }}
- 告警状态:{{ .Status }}
- 告警主题: {{ $alert.Annotations.summary }}
- 告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
- 故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
- 恢复时间: {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
- {{- if gt (len $alert.Labels.instance) 0 }}
- 实例信息: {{ $alert.Labels.instance }}
- {{- end }}
- ========= = end = =========
- {{- end }}
- {{- end }}
- {{- end }}
- {{- end }}

# 启动alertmanager
- systemctl daemon-reload
- systemctl start alertmanager
# 在prometheus中配置
prometheus.yml
- alerting:
- alertmanagers:
- - static_configs:
- - targets:
- - 192.168.178.129:9093
-
- # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
- rule_files: # 报警规则目录
- - "rules/*_rules.yml"
- - "rules/*_alerts.yml"
rules/node_alerts.yml # 其中for是持续时间 expr是判断语句
- node_alerts.yml
- groups:
- - name: 主机状态-监控告警
- rules:
- - alert: 主机状态
- expr: up == 0
- for: 1m
- labels:
- severity: Disaster
- annotations:
- summary: "{{$labels.instance}}:服务器宕机"
- description: "{{$labels.instance}}:服务器已宕机超过1分钟"
-
- - name: 实例存活告警规则
- rules:
- - alert: 实例存活告警
- expr: up{job="prometheus"} == 0 or up{job="Linux-host"} == 0
- for: 1m
- labels:
- user: prometheus
- severity: Disaster
- annotations:
- summary: "Instance {{ $labels.instance }} is down"
- description: "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
- value: "{{ $value }}"
-
- - name: 内存告警规则
- rules:
- - alert: "内存使用率告警"
- expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 80
- for: 1m
- labels:
- user: prometheus
- severity: warning
- db: sql
- annotations:
- summary: "服务器: {{$labels.instance}} 内存报警"
- description: "{{ $labels.instance }} 内存资源利用率大于80%!(当前值: {{ $value }}%)"
- value: "{{ $value }}"
-
- - name: CPU报警规则
- rules:
- - alert: CPU使用率告警
- expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 80
- for: 1m
- labels:
- user: prometheus
- severity: warning
- annotations:
- summary: "服务器: {{$labels.instance}} CPU报警"
- description: "服务器: {{ $labels.instance }} CPU使用超过80%!(当前值: {{ $value }}%)"
- value: "{{ $value }}"
-
- - name: 磁盘报警规则
- rules:
- - alert: 磁盘使用率告警
- expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 40
- for: 1m
- labels:
- user: prometheus
- severity: warning
- annotations:
- summary: "服务器: {{$labels.instance}} 磁盘报警"
- description: "服务器:{{$labels.instance}},磁盘设备: {{ $labels.device }} 使用超过40%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"
- value: "{{ $value }}"

修改完 prometheus 配置后,重新加载 prometheus(前提是 prometheus 启动时添加了 --web.enable-lifecycle 参数以开启热加载):
curl -XPOST http://localhost:9090/-/reload
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。