当前位置:   article > 正文

Prometheus 笔记 -- alertmanager 邮件报警_prometheus 告警配置

prometheus 告警配置

一、alertmanager

部署alertmanager

从 Prometheus 官网下载二进制文件:Download | Prometheus(https://prometheus.io/download/)


下载并安装 alertmanager

  1. wget https://github.com/prometheus/alertmanager/releases/download/v0.26.0/alertmanager-0.26.0.linux-amd64.tar.gz
  2. tar -xzvf alertmanager-0.26.0.linux-amd64.tar.gz -C /usr/local

更改文件名称,并设置systemctl管理alertmanager

  1. cd /usr/local/ && mv alertmanager-0.26.0.linux-amd64 alertmanager
  2. vim /usr/lib/systemd/system/alertmanager.service
  3. [Unit]
  4. Description=Alertmanager
  5. Documentation=https://github.com/prometheus/alertmanager/releases/
  6. After=network.target
  7. [Service]
  8. # alertmanager工作目录(systemd 单元文件不支持行尾注释,注释必须独占一行)
  9. WorkingDirectory=/usr/local/alertmanager/
  10. # alertmanager启动二进制文件
  11. ExecStart=/usr/local/alertmanager/alertmanager
  12. ExecReload=/bin/kill -HUP $MAINPID
  13. ExecStop=/bin/kill -KILL $MAINPID
  14. Type=simple
  15. KillMode=control-group
  16. # 智能重启
  17. Restart=on-failure
  18. RestartSec=15s
  19. [Install]
  20. WantedBy=multi-user.target

# 修改alertmanager配置文件

  1. vim alertmanager.yml
  2. global:
  3. resolve_timeout: 5m
  4. smtp_smarthost: 'smtp.163.com:25'
  5. smtp_from: 'xxxx' # 发送告警的邮箱
  6. smtp_auth_username: 'xxxx' #发送告警的邮箱
  7. smtp_auth_password: 'xxxx' #邮箱授权密码
  8. smtp_require_tls: false
  9. templates: #添加模板
  10. - '/usr/local/alertmanager/template/email.tmpl' #指定路径
  11. route:
  12. receiver: mail1
  13. group_by: ['alertname']
  14. group_wait: 1m # 分组等待的时间
  15. group_interval: 2m # 上下两组发送告警的间隔时间
  16. repeat_interval: 1h # 重复发送告警时间
  17. routes:
  18. - receiver: mail2 # 接收者
  19. match_re: # 条件匹配 与prometheus rules中设置的标签匹配
  20. db: sql
  21. repeat_interval: 1h # 分条件匹配重复发送告警时间
  22. receivers:
  23. - name: mail1
  24. email_configs:
  25. - send_resolved: true
  26. to: xxxx
  27. - name: "mail2"
  28. email_configs:
  29. - send_resolved: true
  30. to: xxxx
  31. inhibit_rules: # 抑制规则
  32. - source_match: # 当存在 severity 为 Disaster 的告警时,抑制 severity 为 warning 的告警
  33. severity: Disaster
  34. target_match:
  35. severity: warning
  36. equal:
  37. - alertmanager # 注意:告警中通常没有名为 alertmanager 的标签,缺失的标签被视为相等,因此该规则会抑制所有 warning 告警;如需按主机抑制,请改为 instance

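# 报警模板(注意:模板名称为 wechat.default.message,需要在 email_configs 中添加 html: '{{ template "wechat.default.message" . }}' 引用后才会生效,否则邮件仍使用默认模板)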

  1. cat /usr/local/alertmanager/template/email.tmpl
  2. {{ define "wechat.default.message" }}
  3. {{- if gt (len .Alerts.Firing) 0 -}}
  4. {{- range $index, $alert := .Alerts -}}
  5. {{- if eq $index 0 }}
  6. =========xxx环境监控报警 =========
  7. 告警状态:{{ .Status }}
  8. 告警级别:{{ .Labels.severity }}
  9. 告警类型:{{ $alert.Labels.alertname }}
  10. 故障主机: {{ $alert.Labels.instance }} {{ $alert.Labels.pod }}
  11. 告警主题: {{ $alert.Annotations.summary }}
  12. 告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
  13. 触发阀值:{{ .Annotations.value }}
  14. 故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
  15. ========= = end = =========
  16. {{- end }}
  17. {{- end }}
  18. {{- end }}
  19. {{- if gt (len .Alerts.Resolved) 0 -}}
  20. {{- range $index, $alert := .Alerts -}}
  21. {{- if eq $index 0 }}
  22. =========xxx环境异常恢复 =========
  23. 告警类型:{{ .Labels.alertname }}
  24. 告警状态:{{ .Status }}
  25. 告警主题: {{ $alert.Annotations.summary }}
  26. 告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
  27. 故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
  28. 恢复时间: {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
  29. {{- if gt (len $alert.Labels.instance) 0 }}
  30. 实例信息: {{ $alert.Labels.instance }}
  31. {{- end }}
  32. ========= = end = =========
  33. {{- end }}
  34. {{- end }}
  35. {{- end }}
  36. {{- end }}

# 启动alertmanager

  1. systemctl daemon-reload
  2. systemctl start alertmanager

二、prometheus

# 在prometheus中配置

prometheus.yml

  1. alerting:
  2. alertmanagers:
  3. - static_configs:
  4. - targets:
  5. - 192.168.178.129:9093
  6. # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
  7. rule_files: # 报警规则目录
  8. - "rules/*_rules.yml"
  9. - "rules/*_alerts.yml"

rules/node_alerts.yml # 其中for是持续时间 expr是判断语句

  1. node_alerts.yml
  2. groups:
  3. - name: 主机状态-监控告警
  4. rules:
  5. - alert: 主机状态
  6. expr: up == 0
  7. for: 1m
  8. labels:
  9. severity: Disaster
  10. annotations:
  11. summary: "{{$labels.instance}}:服务器宕机"
  12. description: "{{$labels.instance}}:服务器宕机超过1分钟"
  13. - name: 实例存活告警规则
  14. rules:
  15. - alert: 实例存活告警
  16. expr: up{job="prometheus"} == 0 or up{job="Linux-host"} == 0
  17. for: 1m
  18. labels:
  19. user: prometheus
  20. severity: Disaster
  21. annotations:
  22. summary: "Instance {{ $labels.instance }} is down"
  23. description: "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
  24. value: "{{ $value }}"
  25. - name: 内存告警规则
  26. rules:
  27. - alert: "内存使用率告警"
  28. expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 80
  29. for: 1m
  30. labels:
  31. user: prometheus
  32. severity: warning
  33. db: sql
  34. annotations:
  35. summary: "服务器: {{$labels.instance}} 内存报警"
  36. description: "{{ $labels.instance }} 内存资源利用率大于80%!(当前值: {{ $value }}%)"
  37. value: "{{ $value }}"
  38. - name: CPU报警规则
  39. rules:
  40. - alert: CPU使用率告警
  41. expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 80
  42. for: 1m
  43. labels:
  44. user: prometheus
  45. severity: warning
  46. annotations:
  47. summary: "服务器: {{$labels.instance}} CPU报警"
  48. description: "服务器: CPU使用超过80%!(当前值: {{ $value }}%)"
  49. value: "{{ $value }}"
  50. - name: 磁盘报警规则
  51. rules:
  52. - alert: 磁盘使用率告警
  53. expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 40
  54. for: 1m
  55. labels:
  56. user: prometheus
  57. severity: warning
  58. annotations:
  59. summary: "服务器: {{$labels.instance}} 磁盘报警"
  60. description: "服务器:{{$labels.instance}},磁盘设备: 使用超过40%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"
  61. value: "{{ $value }}"

修改完 prometheus 配置后,重新加载 prometheus 配置(前提是 prometheus 启动时添加了 --web.enable-lifecycle 参数以开启热加载)

curl -XPOST http://localhost:9090/-/reload

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小惠珠哦/article/detail/946078
推荐阅读
  

闽ICP备14008679号